Diffstat (limited to 'fs'): 51 files changed, 668 insertions, 237 deletions
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index b21cb72835cc..4eafe3817e11 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1621,7 +1621,7 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode, struct btrfs_fs_info *fs_info = inode->root->fs_info; unsigned long range_bitmap = 0; bool submitted_io = false; - bool error = false; + int found_error = 0; const u64 folio_start = folio_pos(folio); const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio); u64 cur; @@ -1685,7 +1685,8 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode, */ btrfs_mark_ordered_io_finished(inode, folio, cur, fs_info->sectorsize, false); - error = true; + if (!found_error) + found_error = ret; continue; } submitted_io = true; @@ -1702,11 +1703,11 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode, * If we hit any error, the corresponding sector will have its dirty * flag cleared and writeback finished, thus no need to handle the error case. */ - if (!submitted_io && !error) { + if (!submitted_io && !found_error) { btrfs_folio_set_writeback(fs_info, folio, start, len); btrfs_folio_clear_writeback(fs_info, folio, start, len); } - return ret; + return found_error; } /* diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 18db1053cdf0..cd8a09e3d1dc 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6479,6 +6479,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, if (!args->subvol) btrfs_inherit_iflags(BTRFS_I(inode), BTRFS_I(dir)); + btrfs_set_inode_mapping_order(BTRFS_I(inode)); if (S_ISREG(inode->i_mode)) { if (btrfs_test_opt(fs_info, NODATASUM)) BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; @@ -6486,7 +6487,6 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW | BTRFS_INODE_NODATASUM; btrfs_update_inode_mapping_flags(BTRFS_I(inode)); - btrfs_set_inode_mapping_order(BTRFS_I(inode)); } ret = btrfs_insert_inode_locked(inode); diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index b002e9b734f9..56c8005b24a3 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -412,7 +412,7 @@ static int cramfs_physmem_mmap(struct file *file, struct vm_area_struct *vma) vm_fault_t vmf; unsigned long off = i * PAGE_SIZE; vmf = vmf_insert_mixed(vma, vma->vm_start + off, - address + off); + PHYS_PFN(address + off)); if (vmf & VM_FAULT_ERROR) ret = vm_fault_to_errno(vmf, 0); } diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 2d73297003d2..625b8ae8f67f 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -1835,7 +1835,7 @@ static void z_erofs_pcluster_readmore(struct z_erofs_frontend *f, map->m_la = end; err = z_erofs_map_blocks_iter(inode, map, EROFS_GET_BLOCKS_READMORE); - if (err) + if (err || !(map->m_flags & EROFS_MAP_ENCODED)) return; /* expand ra for the trailing edge if readahead */ @@ -1847,7 +1847,7 @@ static void z_erofs_pcluster_readmore(struct z_erofs_frontend *f, end = round_up(end, PAGE_SIZE); } else { end = round_up(map->m_la, PAGE_SIZE); - if (!map->m_llen) + if (!(map->m_flags & EROFS_MAP_ENCODED) || !map->m_llen) return; } diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 01a6e2de7fc3..72e02df72c4c 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1982,6 +1982,16 @@ static inline bool ext4_verity_in_progress(struct inode *inode) #define NEXT_ORPHAN(inode) EXT4_I(inode)->i_dtime /* + * Check whether the inode is tracked as orphan (either in orphan file or + * orphan list). 
+ */ +static inline bool ext4_inode_orphan_tracked(struct inode *inode) +{ + return ext4_test_inode_state(inode, EXT4_STATE_ORPHAN_FILE) || + !list_empty(&EXT4_I(inode)->i_orphan); +} + +/* * Codes for operating systems */ #define EXT4_OS_LINUX 0 diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 93240e35ee36..7a8b30932189 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -354,7 +354,7 @@ static void ext4_inode_extension_cleanup(struct inode *inode, bool need_trunc) * to cleanup the orphan list in ext4_handle_inode_extension(). Do it * now. */ - if (!list_empty(&EXT4_I(inode)->i_orphan) && inode->i_nlink) { + if (ext4_inode_orphan_tracked(inode) && inode->i_nlink) { handle_t *handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); if (IS_ERR(handle)) { diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 5b7a15db4953..5230452e29dd 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4748,7 +4748,7 @@ static int ext4_fill_raw_inode(struct inode *inode, struct ext4_inode *raw_inode * old inodes get re-used with the upper 16 bits of the * uid/gid intact. */ - if (ei->i_dtime && list_empty(&ei->i_orphan)) { + if (ei->i_dtime && !ext4_inode_orphan_tracked(inode)) { raw_inode->i_uid_high = 0; raw_inode->i_gid_high = 0; } else { diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 5898d92ba19f..6070d3c86678 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3655,16 +3655,26 @@ static void ext4_discard_work(struct work_struct *work) static inline void ext4_mb_avg_fragment_size_destroy(struct ext4_sb_info *sbi) { + if (!sbi->s_mb_avg_fragment_size) + return; + for (int i = 0; i < MB_NUM_ORDERS(sbi->s_sb); i++) xa_destroy(&sbi->s_mb_avg_fragment_size[i]); + kfree(sbi->s_mb_avg_fragment_size); + sbi->s_mb_avg_fragment_size = NULL; } static inline void ext4_mb_largest_free_orders_destroy(struct ext4_sb_info *sbi) { + if (!sbi->s_mb_largest_free_orders) + return; + for (int i = 0; i < MB_NUM_ORDERS(sbi->s_sb); i++) xa_destroy(&sbi->s_mb_largest_free_orders[i]); + kfree(sbi->s_mb_largest_free_orders); + sbi->s_mb_largest_free_orders = NULL; } int ext4_mb_init(struct super_block *sb) diff --git a/fs/ext4/orphan.c b/fs/ext4/orphan.c index 524d4658fa40..0fbcce67ffd4 100644 --- a/fs/ext4/orphan.c +++ b/fs/ext4/orphan.c @@ -109,11 +109,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode) WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) && !inode_is_locked(inode)); - /* - * Inode orphaned in orphan file or in orphan list? 
- */ - if (ext4_test_inode_state(inode, EXT4_STATE_ORPHAN_FILE) || - !list_empty(&EXT4_I(inode)->i_orphan)) + if (ext4_inode_orphan_tracked(inode)) return 0; /* diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 699c15db28a8..ba497387b9c8 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1438,9 +1438,9 @@ static void ext4_free_in_core_inode(struct inode *inode) static void ext4_destroy_inode(struct inode *inode) { - if (!list_empty(&(EXT4_I(inode)->i_orphan))) { + if (ext4_inode_orphan_tracked(inode)) { ext4_msg(inode->i_sb, KERN_ERR, - "Inode %lu (%p): orphan list check failed!", + "Inode %lu (%p): inode tracked as orphan!", inode->i_ino, EXT4_I(inode)); print_hex_dump(KERN_INFO, "", DUMP_PREFIX_ADDRESS, 16, 4, EXT4_I(inode), sizeof(struct ext4_inode_info), diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 5c1f47e45dab..72bc05b913af 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -1245,20 +1245,29 @@ int f2fs_truncate_partial_cluster(struct inode *inode, u64 from, bool lock) for (i = cluster_size - 1; i >= 0; i--) { struct folio *folio = page_folio(rpages[i]); - loff_t start = folio->index << PAGE_SHIFT; + loff_t start = (loff_t)folio->index << PAGE_SHIFT; + loff_t offset = from > start ? from - start : 0; - if (from <= start) { - folio_zero_segment(folio, 0, folio_size(folio)); - } else { - folio_zero_segment(folio, from - start, - folio_size(folio)); + folio_zero_segment(folio, offset, folio_size(folio)); + + if (from >= start) break; - } } f2fs_compress_write_end(inode, fsdata, start_idx, true); + + err = filemap_write_and_wait_range(inode->i_mapping, + round_down(from, cluster_size << PAGE_SHIFT), + LLONG_MAX); + if (err) + return err; + + truncate_pagecache(inode, from); + + err = f2fs_do_truncate_blocks(inode, + round_up(from, PAGE_SIZE), lock); } - return 0; + return err; } static int f2fs_write_compressed_pages(struct compress_ctx *cc, diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 7961e0ddfca3..50c90bd03923 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -911,7 +911,7 @@ alloc_new: if (fio->io_wbc) wbc_account_cgroup_owner(fio->io_wbc, folio, folio_size(folio)); - inc_page_count(fio->sbi, WB_DATA_TYPE(data_folio, false)); + inc_page_count(fio->sbi, WB_DATA_TYPE(folio, false)); *fio->last_block = fio->new_blkaddr; *fio->bio = bio; @@ -1778,12 +1778,13 @@ sync_out: if (map->m_flags & F2FS_MAP_MAPPED) { unsigned int ofs = start_pgofs - map->m_lblk; - f2fs_update_read_extent_cache_range(&dn, - start_pgofs, map->m_pblk + ofs, - map->m_len - ofs); + if (map->m_len > ofs) + f2fs_update_read_extent_cache_range(&dn, + start_pgofs, map->m_pblk + ofs, + map->m_len - ofs); } if (map->m_next_extent) - *map->m_next_extent = pgofs + 1; + *map->m_next_extent = is_hole ? 
pgofs + 1 : pgofs; } f2fs_put_dnode(&dn); unlock_out: diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index b4b62ac46bc6..dac7d44885e4 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2361,8 +2361,6 @@ static inline bool __allow_reserved_blocks(struct f2fs_sb_info *sbi, { if (!inode) return true; - if (!test_opt(sbi, RESERVE_ROOT)) - return false; if (IS_NOQUOTA(inode)) return true; if (uid_eq(F2FS_OPTION(sbi).s_resuid, current_fsuid())) @@ -2383,7 +2381,7 @@ static inline unsigned int get_available_block_count(struct f2fs_sb_info *sbi, avail_user_block_count = sbi->user_block_count - sbi->current_reserved_blocks; - if (!__allow_reserved_blocks(sbi, inode, cap)) + if (test_opt(sbi, RESERVE_ROOT) && !__allow_reserved_blocks(sbi, inode, cap)) avail_user_block_count -= F2FS_OPTION(sbi).root_reserved_blocks; if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) { diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 42faaed6a02d..ffa045b39c01 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -35,15 +35,23 @@ #include <trace/events/f2fs.h> #include <uapi/linux/f2fs.h> -static void f2fs_zero_post_eof_page(struct inode *inode, loff_t new_size) +static void f2fs_zero_post_eof_page(struct inode *inode, + loff_t new_size, bool lock) { loff_t old_size = i_size_read(inode); if (old_size >= new_size) return; + if (mapping_empty(inode->i_mapping)) + return; + + if (lock) + filemap_invalidate_lock(inode->i_mapping); /* zero or drop pages only in range of [old_size, new_size] */ - truncate_pagecache(inode, old_size); + truncate_inode_pages_range(inode->i_mapping, old_size, new_size); + if (lock) + filemap_invalidate_unlock(inode->i_mapping); } static vm_fault_t f2fs_filemap_fault(struct vm_fault *vmf) @@ -114,9 +122,7 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf) f2fs_bug_on(sbi, f2fs_has_inline_data(inode)); - filemap_invalidate_lock(inode->i_mapping); - f2fs_zero_post_eof_page(inode, (folio->index + 1) << PAGE_SHIFT); - filemap_invalidate_unlock(inode->i_mapping); + f2fs_zero_post_eof_page(inode, (folio->index + 1) << PAGE_SHIFT, true); file_update_time(vmf->vma->vm_file); filemap_invalidate_lock_shared(inode->i_mapping); @@ -904,8 +910,16 @@ int f2fs_truncate(struct inode *inode) /* we should check inline_data size */ if (!f2fs_may_inline_data(inode)) { err = f2fs_convert_inline_inode(inode); - if (err) + if (err) { + /* + * Always truncate page #0 to avoid page cache + * leak in evict() path. 
+ */ + truncate_inode_pages_range(inode->i_mapping, + F2FS_BLK_TO_BYTES(0), + F2FS_BLK_END_BYTES(0)); return err; + } } err = f2fs_truncate_blocks(inode, i_size_read(inode), true); @@ -1141,7 +1155,7 @@ int f2fs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, filemap_invalidate_lock(inode->i_mapping); if (attr->ia_size > old_size) - f2fs_zero_post_eof_page(inode, attr->ia_size); + f2fs_zero_post_eof_page(inode, attr->ia_size, false); truncate_setsize(inode, attr->ia_size); if (attr->ia_size <= old_size) @@ -1260,9 +1274,7 @@ static int f2fs_punch_hole(struct inode *inode, loff_t offset, loff_t len) if (ret) return ret; - filemap_invalidate_lock(inode->i_mapping); - f2fs_zero_post_eof_page(inode, offset + len); - filemap_invalidate_unlock(inode->i_mapping); + f2fs_zero_post_eof_page(inode, offset + len, true); pg_start = ((unsigned long long) offset) >> PAGE_SHIFT; pg_end = ((unsigned long long) offset + len) >> PAGE_SHIFT; @@ -1547,7 +1559,7 @@ static int f2fs_do_collapse(struct inode *inode, loff_t offset, loff_t len) f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); filemap_invalidate_lock(inode->i_mapping); - f2fs_zero_post_eof_page(inode, offset + len); + f2fs_zero_post_eof_page(inode, offset + len, false); f2fs_lock_op(sbi); f2fs_drop_extent_tree(inode); @@ -1670,9 +1682,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, if (ret) return ret; - filemap_invalidate_lock(mapping); - f2fs_zero_post_eof_page(inode, offset + len); - filemap_invalidate_unlock(mapping); + f2fs_zero_post_eof_page(inode, offset + len, true); pg_start = ((unsigned long long) offset) >> PAGE_SHIFT; pg_end = ((unsigned long long) offset + len) >> PAGE_SHIFT; @@ -1806,7 +1816,7 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); filemap_invalidate_lock(mapping); - f2fs_zero_post_eof_page(inode, offset + len); + f2fs_zero_post_eof_page(inode, offset + len, false); truncate_pagecache(inode, offset); while (!ret && idx > pg_start) { @@ -1864,9 +1874,7 @@ static int f2fs_expand_inode_data(struct inode *inode, loff_t offset, if (err) return err; - filemap_invalidate_lock(inode->i_mapping); - f2fs_zero_post_eof_page(inode, offset + len); - filemap_invalidate_unlock(inode->i_mapping); + f2fs_zero_post_eof_page(inode, offset + len, true); f2fs_balance_fs(sbi, true); @@ -4914,9 +4922,8 @@ static ssize_t f2fs_write_checks(struct kiocb *iocb, struct iov_iter *from) if (err) return err; - filemap_invalidate_lock(inode->i_mapping); - f2fs_zero_post_eof_page(inode, iocb->ki_pos + iov_iter_count(from)); - filemap_invalidate_unlock(inode->i_mapping); + f2fs_zero_post_eof_page(inode, + iocb->ki_pos + iov_iter_count(from), true); return count; } diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index c0f209f74688..5734e0386468 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1794,6 +1794,13 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, struct folio *sum_folio = filemap_get_folio(META_MAPPING(sbi), GET_SUM_BLOCK(sbi, segno)); + if (is_cursec(sbi, GET_SEC_FROM_SEG(sbi, segno))) { + f2fs_err(sbi, "%s: segment %u is used by log", + __func__, segno); + f2fs_bug_on(sbi, 1); + goto skip; + } + if (get_valid_blocks(sbi, segno, false) == 0) goto freed; if (gc_type == BG_GC && __is_large_section(sbi) && @@ -1805,7 +1812,7 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, sum = folio_address(sum_folio); if (type != GET_SUM_TYPE((&sum->footer))) { - f2fs_err(sbi, "Inconsistent segment (%u) type [%d, %d] in SSA and 
SIT", + f2fs_err(sbi, "Inconsistent segment (%u) type [%d, %d] in SIT and SSA", segno, type, GET_SUM_TYPE((&sum->footer))); f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_CORRUPTED_SUMMARY); @@ -2068,6 +2075,13 @@ int f2fs_gc_range(struct f2fs_sb_info *sbi, .iroot = RADIX_TREE_INIT(gc_list.iroot, GFP_NOFS), }; + /* + * avoid migrating empty section, as it can be allocated by + * log in parallel. + */ + if (!get_valid_blocks(sbi, segno, true)) + continue; + if (is_cursec(sbi, GET_SEC_FROM_SEG(sbi, segno))) continue; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index e16c4e2830c2..8086a3456e4d 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -988,6 +988,10 @@ static int f2fs_parse_param(struct fs_context *fc, struct fs_parameter *param) ctx_set_opt(ctx, F2FS_MOUNT_DISABLE_CHECKPOINT); break; case Opt_checkpoint_enable: + F2FS_CTX_INFO(ctx).unusable_cap_perc = 0; + ctx->spec_mask |= F2FS_SPEC_checkpoint_disable_cap_perc; + F2FS_CTX_INFO(ctx).unusable_cap = 0; + ctx->spec_mask |= F2FS_SPEC_checkpoint_disable_cap; ctx_clear_opt(ctx, F2FS_MOUNT_DISABLE_CHECKPOINT); break; default: @@ -1185,7 +1189,11 @@ static int f2fs_check_quota_consistency(struct fs_context *fc, goto err_jquota_change; if (old_qname) { - if (strcmp(old_qname, new_qname) == 0) { + if (!new_qname) { + f2fs_info(sbi, "remove qf_name %s", + old_qname); + continue; + } else if (strcmp(old_qname, new_qname) == 0) { ctx->qname_mask &= ~(1 << i); continue; } diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 4adcf09d4b01..c7351ca07065 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1175,7 +1175,6 @@ static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia, num = min(iov_iter_count(ii), fc->max_write); ap->args.in_pages = true; - ap->descs[0].offset = offset; while (num && ap->num_folios < max_folios) { size_t tmp; diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 72d95185a39f..bc67fa058c84 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -1442,6 +1442,7 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl) struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); struct gfs2_sbd *sdp = GFS2_SB(file->f_mapping->host); struct lm_lockstruct *ls = &sdp->sd_lockstruct; + int ret; if (!(fl->c.flc_flags & FL_POSIX)) return -ENOLCK; @@ -1450,14 +1451,20 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl) locks_lock_file_wait(file, fl); return -EIO; } - if (cmd == F_CANCELLK) - return dlm_posix_cancel(ls->ls_dlm, ip->i_no_addr, file, fl); - else if (IS_GETLK(cmd)) - return dlm_posix_get(ls->ls_dlm, ip->i_no_addr, file, fl); - else if (lock_is_unlock(fl)) - return dlm_posix_unlock(ls->ls_dlm, ip->i_no_addr, file, fl); - else - return dlm_posix_lock(ls->ls_dlm, ip->i_no_addr, file, cmd, fl); + down_read(&ls->ls_sem); + ret = -ENODEV; + if (likely(ls->ls_dlm != NULL)) { + if (cmd == F_CANCELLK) + ret = dlm_posix_cancel(ls->ls_dlm, ip->i_no_addr, file, fl); + else if (IS_GETLK(cmd)) + ret = dlm_posix_get(ls->ls_dlm, ip->i_no_addr, file, fl); + else if (lock_is_unlock(fl)) + ret = dlm_posix_unlock(ls->ls_dlm, ip->i_no_addr, file, fl); + else + ret = dlm_posix_lock(ls->ls_dlm, ip->i_no_addr, file, cmd, fl); + } + up_read(&ls->ls_sem); + return ret; } static void __flock_holder_uninit(struct file *file, struct gfs2_holder *fl_gh) diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index b6fd1cb17de7..a6535413a0b4 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -502,7 +502,7 @@ static bool do_promote(struct gfs2_glock *gl) */ if (list_is_first(&gh->gh_list, &gl->gl_holders)) return false; - 
do_error(gl, 0); + do_error(gl, 0); /* Fail queued try locks */ break; } set_bit(HIF_HOLDER, &gh->gh_iflags); @@ -703,44 +703,25 @@ __acquires(&gl->gl_lockref.lock) lck_flags &= (LM_FLAG_TRY | LM_FLAG_TRY_1CB | LM_FLAG_NOEXP); GLOCK_BUG_ON(gl, gl->gl_state == target); GLOCK_BUG_ON(gl, gl->gl_state == gl->gl_target); - if ((target == LM_ST_UNLOCKED || target == LM_ST_DEFERRED) && - glops->go_inval) { - /* - * If another process is already doing the invalidate, let that - * finish first. The glock state machine will get back to this - * holder again later. - */ - if (test_and_set_bit(GLF_INVALIDATE_IN_PROGRESS, - &gl->gl_flags)) - return; - do_error(gl, 0); /* Fail queued try locks */ - } - gl->gl_req = target; - set_bit(GLF_BLOCKING, &gl->gl_flags); - if ((gl->gl_req == LM_ST_UNLOCKED) || - (gl->gl_state == LM_ST_EXCLUSIVE) || - (lck_flags & (LM_FLAG_TRY|LM_FLAG_TRY_1CB))) - clear_bit(GLF_BLOCKING, &gl->gl_flags); - if (!glops->go_inval && !glops->go_sync) + if (!glops->go_inval || !glops->go_sync) goto skip_inval; spin_unlock(&gl->gl_lockref.lock); - if (glops->go_sync) { - ret = glops->go_sync(gl); - /* If we had a problem syncing (due to io errors or whatever, - * we should not invalidate the metadata or tell dlm to - * release the glock to other nodes. - */ - if (ret) { - if (cmpxchg(&sdp->sd_log_error, 0, ret)) { - fs_err(sdp, "Error %d syncing glock \n", ret); - gfs2_dump_glock(NULL, gl, true); - } - spin_lock(&gl->gl_lockref.lock); - goto skip_inval; + ret = glops->go_sync(gl); + /* If we had a problem syncing (due to io errors or whatever, + * we should not invalidate the metadata or tell dlm to + * release the glock to other nodes. + */ + if (ret) { + if (cmpxchg(&sdp->sd_log_error, 0, ret)) { + fs_err(sdp, "Error %d syncing glock\n", ret); + gfs2_dump_glock(NULL, gl, true); } + spin_lock(&gl->gl_lockref.lock); + goto skip_inval; } - if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags)) { + + if (target == LM_ST_UNLOCKED || target == LM_ST_DEFERRED) { /* * The call to go_sync should have cleared out the ail list. * If there are still items, we have a problem. We ought to @@ -755,7 +736,6 @@ __acquires(&gl->gl_lockref.lock) gfs2_dump_glock(NULL, gl, true); } glops->go_inval(gl, target == LM_ST_DEFERRED ? 0 : DIO_METADATA); - clear_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags); } spin_lock(&gl->gl_lockref.lock); @@ -805,8 +785,6 @@ skip_inval: clear_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags); gfs2_glock_queue_work(gl, GL_GLOCK_DFT_HOLD); return; - } else { - clear_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags); } } @@ -816,21 +794,22 @@ skip_inval: ret = ls->ls_ops->lm_lock(gl, target, lck_flags); spin_lock(&gl->gl_lockref.lock); - if (ret == -EINVAL && gl->gl_target == LM_ST_UNLOCKED && - target == LM_ST_UNLOCKED && - test_bit(DFL_UNMOUNT, &ls->ls_recover_flags)) { + if (!ret) { + /* The operation will be completed asynchronously. */ + return; + } + clear_bit(GLF_PENDING_REPLY, &gl->gl_flags); + + if (ret == -ENODEV && gl->gl_target == LM_ST_UNLOCKED && + target == LM_ST_UNLOCKED) { /* * The lockspace has been released and the lock has * been unlocked implicitly. */ - } else if (ret) { + } else { fs_err(sdp, "lm_lock ret %d\n", ret); target = gl->gl_state | LM_OUT_ERROR; - } else { - /* The operation will be completed asynchronously. */ - return; } - clear_bit(GLF_PENDING_REPLY, &gl->gl_flags); } /* Complete the operation now. */ @@ -1462,6 +1441,24 @@ void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...) 
va_end(args); } +static bool gfs2_should_queue_trylock(struct gfs2_glock *gl, + struct gfs2_holder *gh) +{ + struct gfs2_holder *current_gh, *gh2; + + current_gh = find_first_holder(gl); + if (current_gh && !may_grant(gl, current_gh, gh)) + return false; + + list_for_each_entry(gh2, &gl->gl_holders, gh_list) { + if (test_bit(HIF_HOLDER, &gh2->gh_iflags)) + continue; + if (!(gh2->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) + return false; + } + return true; +} + static inline bool pid_is_meaningful(const struct gfs2_holder *gh) { if (!(gh->gh_flags & GL_NOPID)) @@ -1480,27 +1477,20 @@ static inline bool pid_is_meaningful(const struct gfs2_holder *gh) */ static inline void add_to_queue(struct gfs2_holder *gh) -__releases(&gl->gl_lockref.lock) -__acquires(&gl->gl_lockref.lock) { struct gfs2_glock *gl = gh->gh_gl; struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; struct gfs2_holder *gh2; - int try_futile = 0; GLOCK_BUG_ON(gl, gh->gh_owner_pid == NULL); if (test_and_set_bit(HIF_WAIT, &gh->gh_iflags)) GLOCK_BUG_ON(gl, true); - if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) { - if (test_bit(GLF_LOCK, &gl->gl_flags)) { - struct gfs2_holder *current_gh; - - current_gh = find_first_holder(gl); - try_futile = !may_grant(gl, current_gh, gh); - } - if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags)) - goto fail; + if ((gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) && + !gfs2_should_queue_trylock(gl, gh)) { + gh->gh_error = GLR_TRYFAILED; + gfs2_holder_wake(gh); + return; } list_for_each_entry(gh2, &gl->gl_holders, gh_list) { @@ -1512,15 +1502,6 @@ __acquires(&gl->gl_lockref.lock) continue; goto trap_recursive; } - list_for_each_entry(gh2, &gl->gl_holders, gh_list) { - if (try_futile && - !(gh2->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) { -fail: - gh->gh_error = GLR_TRYFAILED; - gfs2_holder_wake(gh); - return; - } - } trace_gfs2_glock_queue(gh, 1); gfs2_glstats_inc(gl, GFS2_LKS_QCOUNT); gfs2_sbstats_inc(gl, GFS2_LKS_QCOUNT); @@ -2321,8 +2302,6 @@ static const char *gflags2str(char *buf, const struct gfs2_glock *gl) *p++ = 'y'; if (test_bit(GLF_LFLUSH, gflags)) *p++ = 'f'; - if (test_bit(GLF_INVALIDATE_IN_PROGRESS, gflags)) - *p++ = 'i'; if (test_bit(GLF_PENDING_REPLY, gflags)) *p++ = 'R'; if (test_bit(GLF_HAVE_REPLY, gflags)) diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index 9339a3bff6ee..d041b922b45e 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -68,6 +68,10 @@ enum { * also be granted in SHARED. The preferred state is whichever is compatible * with other granted locks, or the specified state if no other locks exist. * + * In addition, when a lock is already held in EX mode locally, a SHARED or + * DEFERRED mode request with the LM_FLAG_ANY flag set will be granted. + * (The LM_FLAG_ANY flag is only use for SHARED mode requests currently.) + * * LM_FLAG_NODE_SCOPE * This holder agrees to share the lock within this node. 
In other words, * the glock is held in EX mode according to DLM, but local holders on the diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index d4ad82f47eee..3fcb7ab198d4 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -319,7 +319,6 @@ enum { GLF_DEMOTE_IN_PROGRESS = 5, GLF_DIRTY = 6, GLF_LFLUSH = 7, - GLF_INVALIDATE_IN_PROGRESS = 8, GLF_HAVE_REPLY = 9, GLF_INITIAL = 10, GLF_HAVE_FROZEN_REPLY = 11, @@ -658,6 +657,8 @@ struct lm_lockstruct { struct completion ls_sync_wait; /* {control,mounted}_{lock,unlock} */ char *ls_lvb_bits; + struct rw_semaphore ls_sem; + spinlock_t ls_recover_spin; /* protects following fields */ unsigned long ls_recover_flags; /* DFL_ */ uint32_t ls_recover_mount; /* gen in first recover_done cb */ diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index cee5d199d2d8..6db37c20587d 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -58,6 +58,7 @@ static inline void gfs2_update_stats(struct gfs2_lkstats *s, unsigned index, /** * gfs2_update_reply_times - Update locking statistics * @gl: The glock to update + * @blocking: The operation may have been blocking * * This assumes that gl->gl_dstamp has been set earlier. * @@ -72,12 +73,12 @@ static inline void gfs2_update_stats(struct gfs2_lkstats *s, unsigned index, * TRY_1CB flags are set are classified as non-blocking. All * other DLM requests are counted as (potentially) blocking. */ -static inline void gfs2_update_reply_times(struct gfs2_glock *gl) +static inline void gfs2_update_reply_times(struct gfs2_glock *gl, + bool blocking) { struct gfs2_pcpu_lkstats *lks; const unsigned gltype = gl->gl_name.ln_type; - unsigned index = test_bit(GLF_BLOCKING, &gl->gl_flags) ? - GFS2_LKS_SRTTB : GFS2_LKS_SRTT; + unsigned index = blocking ? GFS2_LKS_SRTTB : GFS2_LKS_SRTT; s64 rtt; preempt_disable(); @@ -119,14 +120,18 @@ static inline void gfs2_update_request_times(struct gfs2_glock *gl) static void gdlm_ast(void *arg) { struct gfs2_glock *gl = arg; + bool blocking; unsigned ret; + blocking = test_bit(GLF_BLOCKING, &gl->gl_flags); + gfs2_update_reply_times(gl, blocking); + clear_bit(GLF_BLOCKING, &gl->gl_flags); + /* If the glock is dead, we only react to a dlm_unlock() reply. */ if (__lockref_is_dead(&gl->gl_lockref) && gl->gl_lksb.sb_status != -DLM_EUNLOCK) return; - gfs2_update_reply_times(gl); BUG_ON(gl->gl_lksb.sb_flags & DLM_SBF_DEMOTED); if ((gl->gl_lksb.sb_flags & DLM_SBF_VALNOTVALID) && gl->gl_lksb.sb_lvbptr) @@ -241,7 +246,7 @@ static bool down_conversion(int cur, int req) } static u32 make_flags(struct gfs2_glock *gl, const unsigned int gfs_flags, - const int cur, const int req) + const int req, bool blocking) { u32 lkf = 0; @@ -274,7 +279,7 @@ static u32 make_flags(struct gfs2_glock *gl, const unsigned int gfs_flags, * "upward" lock conversions or else DLM will reject the * request as invalid. 
*/ - if (!down_conversion(cur, req)) + if (blocking) lkf |= DLM_LKF_QUECVT; } @@ -294,14 +299,20 @@ static int gdlm_lock(struct gfs2_glock *gl, unsigned int req_state, unsigned int flags) { struct lm_lockstruct *ls = &gl->gl_name.ln_sbd->sd_lockstruct; + bool blocking; int cur, req; u32 lkf; char strname[GDLM_STRNAME_BYTES] = ""; int error; + gl->gl_req = req_state; cur = make_mode(gl->gl_name.ln_sbd, gl->gl_state); req = make_mode(gl->gl_name.ln_sbd, req_state); - lkf = make_flags(gl, flags, cur, req); + blocking = !down_conversion(cur, req) && + !(flags & (LM_FLAG_TRY|LM_FLAG_TRY_1CB)); + lkf = make_flags(gl, flags, req, blocking); + if (blocking) + set_bit(GLF_BLOCKING, &gl->gl_flags); gfs2_glstats_inc(gl, GFS2_LKS_DCOUNT); gfs2_sbstats_inc(gl, GFS2_LKS_DCOUNT); if (test_bit(GLF_INITIAL, &gl->gl_flags)) { @@ -318,8 +329,13 @@ static int gdlm_lock(struct gfs2_glock *gl, unsigned int req_state, */ again: - error = dlm_lock(ls->ls_dlm, req, &gl->gl_lksb, lkf, strname, - GDLM_STRNAME_BYTES - 1, 0, gdlm_ast, gl, gdlm_bast); + down_read(&ls->ls_sem); + error = -ENODEV; + if (likely(ls->ls_dlm != NULL)) { + error = dlm_lock(ls->ls_dlm, req, &gl->gl_lksb, lkf, strname, + GDLM_STRNAME_BYTES - 1, 0, gdlm_ast, gl, gdlm_bast); + } + up_read(&ls->ls_sem); if (error == -EBUSY) { msleep(20); goto again; @@ -341,7 +357,6 @@ static void gdlm_put_lock(struct gfs2_glock *gl) return; } - clear_bit(GLF_BLOCKING, &gl->gl_flags); gfs2_glstats_inc(gl, GFS2_LKS_DCOUNT); gfs2_sbstats_inc(gl, GFS2_LKS_DCOUNT); gfs2_update_request_times(gl); @@ -369,8 +384,13 @@ static void gdlm_put_lock(struct gfs2_glock *gl) flags |= DLM_LKF_VALBLK; again: - error = dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, flags, - NULL, gl); + down_read(&ls->ls_sem); + error = -ENODEV; + if (likely(ls->ls_dlm != NULL)) { + error = dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, flags, + NULL, gl); + } + up_read(&ls->ls_sem); if (error == -EBUSY) { msleep(20); goto again; @@ -386,7 +406,12 @@ again: static void gdlm_cancel(struct gfs2_glock *gl) { struct lm_lockstruct *ls = &gl->gl_name.ln_sbd->sd_lockstruct; - dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, DLM_LKF_CANCEL, NULL, gl); + + down_read(&ls->ls_sem); + if (likely(ls->ls_dlm != NULL)) { + dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, DLM_LKF_CANCEL, NULL, gl); + } + up_read(&ls->ls_sem); } /* @@ -567,7 +592,11 @@ static int sync_unlock(struct gfs2_sbd *sdp, struct dlm_lksb *lksb, char *name) struct lm_lockstruct *ls = &sdp->sd_lockstruct; int error; - error = dlm_unlock(ls->ls_dlm, lksb->sb_lkid, 0, lksb, ls); + down_read(&ls->ls_sem); + error = -ENODEV; + if (likely(ls->ls_dlm != NULL)) + error = dlm_unlock(ls->ls_dlm, lksb->sb_lkid, 0, lksb, ls); + up_read(&ls->ls_sem); if (error) { fs_err(sdp, "%s lkid %x error %d\n", name, lksb->sb_lkid, error); @@ -594,9 +623,14 @@ static int sync_lock(struct gfs2_sbd *sdp, int mode, uint32_t flags, memset(strname, 0, GDLM_STRNAME_BYTES); snprintf(strname, GDLM_STRNAME_BYTES, "%8x%16x", LM_TYPE_NONDISK, num); - error = dlm_lock(ls->ls_dlm, mode, lksb, flags, - strname, GDLM_STRNAME_BYTES - 1, - 0, sync_wait_cb, ls, NULL); + down_read(&ls->ls_sem); + error = -ENODEV; + if (likely(ls->ls_dlm != NULL)) { + error = dlm_lock(ls->ls_dlm, mode, lksb, flags, + strname, GDLM_STRNAME_BYTES - 1, + 0, sync_wait_cb, ls, NULL); + } + up_read(&ls->ls_sem); if (error) { fs_err(sdp, "%s lkid %x flags %x mode %d error %d\n", name, lksb->sb_lkid, flags, mode, error); @@ -1323,6 +1357,7 @@ static int gdlm_mount(struct gfs2_sbd *sdp, const char *table) */ 
INIT_DELAYED_WORK(&sdp->sd_control_work, gfs2_control_func); + ls->ls_dlm = NULL; spin_lock_init(&ls->ls_recover_spin); ls->ls_recover_flags = 0; ls->ls_recover_mount = 0; @@ -1357,6 +1392,7 @@ static int gdlm_mount(struct gfs2_sbd *sdp, const char *table) * create/join lockspace */ + init_rwsem(&ls->ls_sem); error = dlm_new_lockspace(fsname, cluster, flags, GDLM_LVB_SIZE, &gdlm_lockspace_ops, sdp, &ops_result, &ls->ls_dlm); @@ -1436,10 +1472,12 @@ static void gdlm_unmount(struct gfs2_sbd *sdp) /* mounted_lock and control_lock will be purged in dlm recovery */ release: + down_write(&ls->ls_sem); if (ls->ls_dlm) { dlm_release_lockspace(ls->ls_dlm, 2); ls->ls_dlm = NULL; } + up_write(&ls->ls_sem); free_recover_size(ls); } diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h index 26036ffc3f33..1c2507a27318 100644 --- a/fs/gfs2/trace_gfs2.h +++ b/fs/gfs2/trace_gfs2.h @@ -52,7 +52,6 @@ {(1UL << GLF_DEMOTE_IN_PROGRESS), "p" }, \ {(1UL << GLF_DIRTY), "y" }, \ {(1UL << GLF_LFLUSH), "f" }, \ - {(1UL << GLF_INVALIDATE_IN_PROGRESS), "i" }, \ {(1UL << GLF_PENDING_REPLY), "R" }, \ {(1UL << GLF_HAVE_REPLY), "r" }, \ {(1UL << GLF_INITIAL), "a" }, \ diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c index 876bbb80fb4d..1b3e27a0d5e0 100644 --- a/fs/hfsplus/dir.c +++ b/fs/hfsplus/dir.c @@ -204,7 +204,7 @@ static int hfsplus_readdir(struct file *file, struct dir_context *ctx) fd.entrylength); type = be16_to_cpu(entry.type); len = NLS_MAX_CHARSET_SIZE * HFSPLUS_MAX_STRLEN; - err = hfsplus_uni2asc(sb, &fd.key->cat.name, strbuf, &len); + err = hfsplus_uni2asc_str(sb, &fd.key->cat.name, strbuf, &len); if (err) goto out; if (type == HFSPLUS_FOLDER) { diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index 96a5c24813dd..2311e4be4e86 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -521,8 +521,12 @@ int hfsplus_strcasecmp(const struct hfsplus_unistr *s1, const struct hfsplus_unistr *s2); int hfsplus_strcmp(const struct hfsplus_unistr *s1, const struct hfsplus_unistr *s2); -int hfsplus_uni2asc(struct super_block *sb, const struct hfsplus_unistr *ustr, - char *astr, int *len_p); +int hfsplus_uni2asc_str(struct super_block *sb, + const struct hfsplus_unistr *ustr, char *astr, + int *len_p); +int hfsplus_uni2asc_xattr_str(struct super_block *sb, + const struct hfsplus_attr_unistr *ustr, + char *astr, int *len_p); int hfsplus_asc2uni(struct super_block *sb, struct hfsplus_unistr *ustr, int max_unistr_len, const char *astr, int len); int hfsplus_hash_dentry(const struct dentry *dentry, struct qstr *str); diff --git a/fs/hfsplus/unicode.c b/fs/hfsplus/unicode.c index 36b6cf2a3abb..862ba27f1628 100644 --- a/fs/hfsplus/unicode.c +++ b/fs/hfsplus/unicode.c @@ -119,9 +119,8 @@ static u16 *hfsplus_compose_lookup(u16 *p, u16 cc) return NULL; } -int hfsplus_uni2asc(struct super_block *sb, - const struct hfsplus_unistr *ustr, - char *astr, int *len_p) +static int hfsplus_uni2asc(struct super_block *sb, const struct hfsplus_unistr *ustr, + int max_len, char *astr, int *len_p) { const hfsplus_unichr *ip; struct nls_table *nls = HFSPLUS_SB(sb)->nls; @@ -134,8 +133,8 @@ int hfsplus_uni2asc(struct super_block *sb, ip = ustr->unicode; ustrlen = be16_to_cpu(ustr->length); - if (ustrlen > HFSPLUS_MAX_STRLEN) { - ustrlen = HFSPLUS_MAX_STRLEN; + if (ustrlen > max_len) { + ustrlen = max_len; pr_err("invalid length %u has been corrected to %d\n", be16_to_cpu(ustr->length), ustrlen); } @@ -256,6 +255,21 @@ out: return res; } +inline int hfsplus_uni2asc_str(struct super_block *sb, + const struct 
hfsplus_unistr *ustr, char *astr, + int *len_p) +{ + return hfsplus_uni2asc(sb, ustr, HFSPLUS_MAX_STRLEN, astr, len_p); +} + +inline int hfsplus_uni2asc_xattr_str(struct super_block *sb, + const struct hfsplus_attr_unistr *ustr, + char *astr, int *len_p) +{ + return hfsplus_uni2asc(sb, (const struct hfsplus_unistr *)ustr, + HFSPLUS_ATTR_MAX_STRLEN, astr, len_p); +} + /* * Convert one or more ASCII characters into a single unicode character. * Returns the number of ASCII characters corresponding to the unicode char. diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c index 18dc3d254d21..c951fa9835aa 100644 --- a/fs/hfsplus/xattr.c +++ b/fs/hfsplus/xattr.c @@ -735,9 +735,9 @@ ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size) goto end_listxattr; xattr_name_len = NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN; - if (hfsplus_uni2asc(inode->i_sb, - (const struct hfsplus_unistr *)&fd.key->attr.key_name, - strbuf, &xattr_name_len)) { + if (hfsplus_uni2asc_xattr_str(inode->i_sb, + &fd.key->attr.key_name, strbuf, + &xattr_name_len)) { pr_err("unicode conversion failed\n"); res = -EIO; goto end_listxattr; diff --git a/fs/nfs/localio.c b/fs/nfs/localio.c index 97abf62f109d..31ce210f032a 100644 --- a/fs/nfs/localio.c +++ b/fs/nfs/localio.c @@ -49,11 +49,6 @@ struct nfs_local_fsync_ctx { static bool localio_enabled __read_mostly = true; module_param(localio_enabled, bool, 0644); -static bool localio_O_DIRECT_semantics __read_mostly = false; -module_param(localio_O_DIRECT_semantics, bool, 0644); -MODULE_PARM_DESC(localio_O_DIRECT_semantics, - "LOCALIO will use O_DIRECT semantics to filesystem."); - static inline bool nfs_client_is_local(const struct nfs_client *clp) { return !!rcu_access_pointer(clp->cl_uuid.net); @@ -321,12 +316,9 @@ nfs_local_iocb_alloc(struct nfs_pgio_header *hdr, return NULL; } - if (localio_O_DIRECT_semantics && - test_bit(NFS_IOHDR_ODIRECT, &hdr->flags)) { - iocb->kiocb.ki_filp = file; + init_sync_kiocb(&iocb->kiocb, file); + if (test_bit(NFS_IOHDR_ODIRECT, &hdr->flags)) iocb->kiocb.ki_flags = IOCB_DIRECT; - } else - init_sync_kiocb(&iocb->kiocb, file); iocb->kiocb.ki_pos = hdr->args.offset; iocb->hdr = hdr; @@ -336,6 +328,30 @@ nfs_local_iocb_alloc(struct nfs_pgio_header *hdr, return iocb; } +static bool nfs_iov_iter_aligned_bvec(const struct iov_iter *i, + loff_t offset, unsigned int addr_mask, unsigned int len_mask) +{ + const struct bio_vec *bvec = i->bvec; + size_t skip = i->iov_offset; + size_t size = i->count; + + if ((offset | size) & len_mask) + return false; + do { + size_t len = bvec->bv_len; + + if (len > size) + len = size; + if ((unsigned long)(bvec->bv_offset + skip) & addr_mask) + return false; + bvec++; + size -= len; + skip = 0; + } while (size); + + return true; +} + static void nfs_local_iter_init(struct iov_iter *i, struct nfs_local_kiocb *iocb, int dir) { @@ -345,6 +361,25 @@ nfs_local_iter_init(struct iov_iter *i, struct nfs_local_kiocb *iocb, int dir) hdr->args.count + hdr->args.pgbase); if (hdr->args.pgbase != 0) iov_iter_advance(i, hdr->args.pgbase); + + if (iocb->kiocb.ki_flags & IOCB_DIRECT) { + u32 nf_dio_mem_align, nf_dio_offset_align, nf_dio_read_offset_align; + /* Verify the IO is DIO-aligned as required */ + nfs_to->nfsd_file_dio_alignment(iocb->localio, &nf_dio_mem_align, + &nf_dio_offset_align, + &nf_dio_read_offset_align); + if (dir == READ) + nf_dio_offset_align = nf_dio_read_offset_align; + + if (nf_dio_mem_align && nf_dio_offset_align && + nfs_iov_iter_aligned_bvec(i, hdr->args.offset, + nf_dio_mem_align - 1, + 
nf_dio_offset_align - 1)) + return; /* is DIO-aligned */ + + /* Fallback to using buffered for this misaligned IO */ + iocb->kiocb.ki_flags &= ~IOCB_DIRECT; + } } static void @@ -405,6 +440,11 @@ nfs_local_read_done(struct nfs_local_kiocb *iocb, long status) struct nfs_pgio_header *hdr = iocb->hdr; struct file *filp = iocb->kiocb.ki_filp; + if ((iocb->kiocb.ki_flags & IOCB_DIRECT) && status == -EINVAL) { + /* Underlying FS will return -EINVAL if misaligned DIO is attempted. */ + pr_info_ratelimited("nfs: Unexpected direct I/O read alignment failure\n"); + } + nfs_local_pgio_done(hdr, status); /* @@ -597,6 +637,11 @@ nfs_local_write_done(struct nfs_local_kiocb *iocb, long status) dprintk("%s: wrote %ld bytes.\n", __func__, status > 0 ? status : 0); + if ((iocb->kiocb.ki_flags & IOCB_DIRECT) && status == -EINVAL) { + /* Underlying FS will return -EINVAL if misaligned DIO is attempted. */ + pr_info_ratelimited("nfs: Unexpected direct I/O write alignment failure\n"); + } + /* Handle short writes as if they are ENOSPC */ if (status > 0 && status < hdr->args.count) { hdr->mds_offset += status; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index ce61253efd45..611e6283c194 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -9442,7 +9442,7 @@ static int nfs4_verify_back_channel_attrs(struct nfs41_create_session_args *args goto out; if (rcvd->max_rqst_sz > sent->max_rqst_sz) return -EINVAL; - if (rcvd->max_resp_sz < sent->max_resp_sz) + if (rcvd->max_resp_sz > sent->max_resp_sz) return -EINVAL; if (rcvd->max_resp_sz_cached > sent->max_resp_sz_cached) return -EINVAL; diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 732abf6b92a5..7ca1dedf4e04 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -231,6 +231,9 @@ nfsd_file_alloc(struct net *net, struct inode *inode, unsigned char need, refcount_set(&nf->nf_ref, 1); nf->nf_may = need; nf->nf_mark = NULL; + nf->nf_dio_mem_align = 0; + nf->nf_dio_offset_align = 0; + nf->nf_dio_read_offset_align = 0; return nf; } @@ -1070,6 +1073,35 @@ nfsd_file_is_cached(struct inode *inode) } static __be32 +nfsd_file_get_dio_attrs(const struct svc_fh *fhp, struct nfsd_file *nf) +{ + struct inode *inode = file_inode(nf->nf_file); + struct kstat stat; + __be32 status; + + /* Currently only need to get DIO alignment info for regular files */ + if (!S_ISREG(inode->i_mode)) + return nfs_ok; + + status = fh_getattr(fhp, &stat); + if (status != nfs_ok) + return status; + + trace_nfsd_file_get_dio_attrs(inode, &stat); + + if (stat.result_mask & STATX_DIOALIGN) { + nf->nf_dio_mem_align = stat.dio_mem_align; + nf->nf_dio_offset_align = stat.dio_offset_align; + } + if (stat.result_mask & STATX_DIO_READ_ALIGN) + nf->nf_dio_read_offset_align = stat.dio_read_offset_align; + else + nf->nf_dio_read_offset_align = nf->nf_dio_offset_align; + + return nfs_ok; +} + +static __be32 nfsd_file_do_acquire(struct svc_rqst *rqstp, struct net *net, struct svc_cred *cred, struct auth_domain *client, @@ -1187,6 +1219,8 @@ open_file: } status = nfserrno(ret); trace_nfsd_file_open(nf, status); + if (status == nfs_ok) + status = nfsd_file_get_dio_attrs(fhp, nf); } } else status = nfserr_jukebox; diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h index 722b26c71e45..237a05c74211 100644 --- a/fs/nfsd/filecache.h +++ b/fs/nfsd/filecache.h @@ -54,6 +54,10 @@ struct nfsd_file { struct list_head nf_gc; struct rcu_head nf_rcu; ktime_t nf_birthtime; + + u32 nf_dio_mem_align; + u32 nf_dio_offset_align; + u32 nf_dio_read_offset_align; }; int nfsd_file_cache_init(void); diff 
--git a/fs/nfsd/localio.c b/fs/nfsd/localio.c index cb237f1b902a..9e0a37cd29d8 100644 --- a/fs/nfsd/localio.c +++ b/fs/nfsd/localio.c @@ -117,6 +117,16 @@ nfsd_open_local_fh(struct net *net, struct auth_domain *dom, return localio; } +static void nfsd_file_dio_alignment(struct nfsd_file *nf, + u32 *nf_dio_mem_align, + u32 *nf_dio_offset_align, + u32 *nf_dio_read_offset_align) +{ + *nf_dio_mem_align = nf->nf_dio_mem_align; + *nf_dio_offset_align = nf->nf_dio_offset_align; + *nf_dio_read_offset_align = nf->nf_dio_read_offset_align; +} + static const struct nfsd_localio_operations nfsd_localio_ops = { .nfsd_net_try_get = nfsd_net_try_get, .nfsd_net_put = nfsd_net_put, @@ -124,6 +134,7 @@ static const struct nfsd_localio_operations nfsd_localio_ops = { .nfsd_file_put_local = nfsd_file_put_local, .nfsd_file_get_local = nfsd_file_get_local, .nfsd_file_file = nfsd_file_file, + .nfsd_file_dio_alignment = nfsd_file_dio_alignment, }; void nfsd_localio_ops_init(void) diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index a664fdf1161e..6e2c8e2aab10 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -1133,6 +1133,33 @@ TRACE_EVENT(nfsd_file_alloc, ) ); +TRACE_EVENT(nfsd_file_get_dio_attrs, + TP_PROTO( + const struct inode *inode, + const struct kstat *stat + ), + TP_ARGS(inode, stat), + TP_STRUCT__entry( + __field(const void *, inode) + __field(unsigned long, mask) + __field(u32, mem_align) + __field(u32, offset_align) + __field(u32, read_offset_align) + ), + TP_fast_assign( + __entry->inode = inode; + __entry->mask = stat->result_mask; + __entry->mem_align = stat->dio_mem_align; + __entry->offset_align = stat->dio_offset_align; + __entry->read_offset_align = stat->dio_read_offset_align; + ), + TP_printk("inode=%p flags=%s mem_align=%u offset_align=%u read_offset_align=%u", + __entry->inode, show_statx_mask(__entry->mask), + __entry->mem_align, __entry->offset_align, + __entry->read_offset_align + ) +); + TRACE_EVENT(nfsd_file_acquire, TP_PROTO( const struct svc_rqst *rqstp, diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index eff04959606f..fde3e0c11dba 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -185,6 +185,10 @@ static inline __be32 fh_getattr(const struct svc_fh *fh, struct kstat *stat) u32 request_mask = STATX_BASIC_STATS; struct path p = {.mnt = fh->fh_export->ex_path.mnt, .dentry = fh->fh_dentry}; + struct inode *inode = d_inode(p.dentry); + + if (S_ISREG(inode->i_mode)) + request_mask |= (STATX_DIOALIGN | STATX_DIO_READ_ALIGN); if (fh->fh_maxsize == NFS4_FHSIZE) request_mask |= (STATX_BTIME | STATX_CHANGE_COOKIE); diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index b192ee068a7a..561339b4cf75 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -1999,7 +1999,10 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, user_ns = path.mnt->mnt_sb->s_user_ns; obj = path.mnt->mnt_sb; } else if (obj_type == FSNOTIFY_OBJ_TYPE_MNTNS) { + ret = -EINVAL; mntns = mnt_ns_from_dentry(path.dentry); + if (!mntns) + goto path_put_and_out; user_ns = mntns->user_ns; obj = mntns; } diff --git a/fs/ntfs3/index.c b/fs/ntfs3/index.c index 1bf2a6593dec..6d1bf890929d 100644 --- a/fs/ntfs3/index.c +++ b/fs/ntfs3/index.c @@ -1508,6 +1508,16 @@ static int indx_add_allocate(struct ntfs_index *indx, struct ntfs_inode *ni, bmp_size = bmp_size_v = le32_to_cpu(bmp->res.data_size); } + /* + * Index blocks exist, but $BITMAP has zero valid bits. + * This implies an on-disk corruption and must be rejected. 
+ */ + if (in->name == I30_NAME && + unlikely(bmp_size_v == 0 && indx->alloc_run.count)) { + err = -EINVAL; + goto out1; + } + bit = bmp_size << 3; } diff --git a/fs/ntfs3/run.c b/fs/ntfs3/run.c index 6e86d66197ef..88550085f745 100644 --- a/fs/ntfs3/run.c +++ b/fs/ntfs3/run.c @@ -9,6 +9,7 @@ #include <linux/blkdev.h> #include <linux/fs.h> #include <linux/log2.h> +#include <linux/overflow.h> #include "debug.h" #include "ntfs.h" @@ -982,14 +983,18 @@ int run_unpack(struct runs_tree *run, struct ntfs_sb_info *sbi, CLST ino, if (!dlcn) return -EINVAL; - lcn = prev_lcn + dlcn; + + if (check_add_overflow(prev_lcn, dlcn, &lcn)) + return -EINVAL; prev_lcn = lcn; } else { /* The size of 'dlcn' can't be > 8. */ return -EINVAL; } - next_vcn = vcn64 + len; + if (check_add_overflow(vcn64, len, &next_vcn)) + return -EINVAL; + /* Check boundary. */ if (next_vcn > evcn + 1) return -EINVAL; @@ -1153,7 +1158,8 @@ int run_get_highest_vcn(CLST vcn, const u8 *run_buf, u64 *highest_vcn) return -EINVAL; run_buf += size_size + offset_size; - vcn64 += len; + if (check_add_overflow(vcn64, len, &vcn64)) + return -EINVAL; #ifndef CONFIG_NTFS3_64BIT_CLUSTER if (vcn64 > 0x100000000ull) diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c index 0f045e45fa0c..439742cec3c2 100644 --- a/fs/ocfs2/stack_user.c +++ b/fs/ocfs2/stack_user.c @@ -1011,6 +1011,7 @@ static int user_cluster_connect(struct ocfs2_cluster_connection *conn) printk(KERN_ERR "ocfs2: Could not determine" " locking version\n"); user_cluster_disconnect(conn); + lc = NULL; goto out; } wait_event(lc->oc_wait, (atomic_read(&lc->oc_this_node) > 0)); diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index e586f3f4b5c9..68286673afc9 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -4219,7 +4219,7 @@ fill_transform_hdr(struct smb2_transform_hdr *tr_hdr, unsigned int orig_len, static void *smb2_aead_req_alloc(struct crypto_aead *tfm, const struct smb_rqst *rqst, int num_rqst, const u8 *sig, u8 **iv, struct aead_request **req, struct sg_table *sgt, - unsigned int *num_sgs, size_t *sensitive_size) + unsigned int *num_sgs) { unsigned int req_size = sizeof(**req) + crypto_aead_reqsize(tfm); unsigned int iv_size = crypto_aead_ivsize(tfm); @@ -4236,9 +4236,8 @@ static void *smb2_aead_req_alloc(struct crypto_aead *tfm, const struct smb_rqst len += req_size; len = ALIGN(len, __alignof__(struct scatterlist)); len += array_size(*num_sgs, sizeof(struct scatterlist)); - *sensitive_size = len; - p = kvzalloc(len, GFP_NOFS); + p = kzalloc(len, GFP_NOFS); if (!p) return ERR_PTR(-ENOMEM); @@ -4252,16 +4251,14 @@ static void *smb2_aead_req_alloc(struct crypto_aead *tfm, const struct smb_rqst static void *smb2_get_aead_req(struct crypto_aead *tfm, struct smb_rqst *rqst, int num_rqst, const u8 *sig, u8 **iv, - struct aead_request **req, struct scatterlist **sgl, - size_t *sensitive_size) + struct aead_request **req, struct scatterlist **sgl) { struct sg_table sgtable = {}; unsigned int skip, num_sgs, i, j; ssize_t rc; void *p; - p = smb2_aead_req_alloc(tfm, rqst, num_rqst, sig, iv, req, &sgtable, - &num_sgs, sensitive_size); + p = smb2_aead_req_alloc(tfm, rqst, num_rqst, sig, iv, req, &sgtable, &num_sgs); if (IS_ERR(p)) return ERR_CAST(p); @@ -4350,7 +4347,6 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst, DECLARE_CRYPTO_WAIT(wait); unsigned int crypt_len = le32_to_cpu(tr_hdr->OriginalMessageSize); void *creq; - size_t sensitive_size; rc = smb2_get_enc_key(server, le64_to_cpu(tr_hdr->SessionId), enc, key); if (rc) { @@ -4376,8 
+4372,7 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst, return rc; } - creq = smb2_get_aead_req(tfm, rqst, num_rqst, sign, &iv, &req, &sg, - &sensitive_size); + creq = smb2_get_aead_req(tfm, rqst, num_rqst, sign, &iv, &req, &sg); if (IS_ERR(creq)) return PTR_ERR(creq); @@ -4407,7 +4402,7 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst, if (!rc && enc) memcpy(&tr_hdr->Signature, sign, SMB2_SIGNATURE_SIZE); - kvfree_sensitive(creq, sensitive_size); + kfree_sensitive(creq); return rc; } diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index e0fce5033004..6480945c2459 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -179,6 +179,8 @@ static int smbd_conn_upcall( struct smbd_connection *info = id->context; struct smbdirect_socket *sc = &info->socket; const char *event_name = rdma_event_msg(event->event); + u8 peer_initiator_depth; + u8 peer_responder_resources; log_rdma_event(INFO, "event=%s status=%d\n", event_name, event->status); @@ -204,6 +206,85 @@ static int smbd_conn_upcall( case RDMA_CM_EVENT_ESTABLISHED: log_rdma_event(INFO, "connected event=%s\n", event_name); + + /* + * Here we work around an inconsistency between + * iWarp and other devices (at least rxe and irdma using RoCEv2) + */ + if (rdma_protocol_iwarp(id->device, id->port_num)) { + /* + * iWarp devices report the peer's values + * with the perspective of the peer here. + * Tested with siw and irdma (in iwarp mode) + * We need to change to our perspective here, + * so we need to switch the values. + */ + peer_initiator_depth = event->param.conn.responder_resources; + peer_responder_resources = event->param.conn.initiator_depth; + } else { + /* + * Non iWarp devices report the peer's values + * already changed to our perspective here. + * Tested with rxe and irdma (in roce mode). + */ + peer_initiator_depth = event->param.conn.initiator_depth; + peer_responder_resources = event->param.conn.responder_resources; + } + if (rdma_protocol_iwarp(id->device, id->port_num) && + event->param.conn.private_data_len == 8) { + /* + * Legacy clients with only iWarp MPA v1 support + * need a private blob in order to negotiate + * the IRD/ORD values. + */ + const __be32 *ird_ord_hdr = event->param.conn.private_data; + u32 ird32 = be32_to_cpu(ird_ord_hdr[0]); + u32 ord32 = be32_to_cpu(ird_ord_hdr[1]); + + /* + * cifs.ko sends the legacy IRD/ORD negotiation + * event if iWarp MPA v2 was used. + * + * Here we check that the values match and only + * mark the client as legacy if they don't match. + */ + if ((u32)event->param.conn.initiator_depth != ird32 || + (u32)event->param.conn.responder_resources != ord32) { + /* + * There are broken clients (old cifs.ko) + * using little endian and also + * struct rdma_conn_param only uses u8 + * for initiator_depth and responder_resources, + * so we truncate the value to U8_MAX. + * + * smb_direct_accept_client() will then + * do the real negotiation in order to + * select the minimum between client and + * server. + */ + ird32 = min_t(u32, ird32, U8_MAX); + ord32 = min_t(u32, ord32, U8_MAX); + + info->legacy_iwarp = true; + peer_initiator_depth = (u8)ird32; + peer_responder_resources = (u8)ord32; + } + } + + /* + * negotiate the value by using the minimum + * between client and server if the client provided + * non 0 values. 
+ */ + if (peer_initiator_depth != 0) + info->initiator_depth = + min_t(u8, info->initiator_depth, + peer_initiator_depth); + if (peer_responder_resources != 0) + info->responder_resources = + min_t(u8, info->responder_resources, + peer_responder_resources); + sc->status = SMBDIRECT_SOCKET_CONNECTED; wake_up_interruptible(&info->status_wait); break; @@ -1551,7 +1632,7 @@ static struct smbd_connection *_smbd_get_connection( struct ib_qp_init_attr qp_attr; struct sockaddr_in *addr_in = (struct sockaddr_in *) dstaddr; struct ib_port_immutable port_immutable; - u32 ird_ord_hdr[2]; + __be32 ird_ord_hdr[2]; info = kzalloc(sizeof(struct smbd_connection), GFP_KERNEL); if (!info) @@ -1559,6 +1640,9 @@ static struct smbd_connection *_smbd_get_connection( sc = &info->socket; sp = &sc->parameters; + info->initiator_depth = 1; + info->responder_resources = SMBD_CM_RESPONDER_RESOURCES; + sc->status = SMBDIRECT_SOCKET_CONNECTING; rc = smbd_ia_open(info, dstaddr, port); if (rc) { @@ -1639,22 +1723,22 @@ static struct smbd_connection *_smbd_get_connection( } sc->ib.qp = sc->rdma.cm_id->qp; - memset(&conn_param, 0, sizeof(conn_param)); - conn_param.initiator_depth = 0; - - conn_param.responder_resources = - min(sc->ib.dev->attrs.max_qp_rd_atom, - SMBD_CM_RESPONDER_RESOURCES); - info->responder_resources = conn_param.responder_resources; + info->responder_resources = + min_t(u8, info->responder_resources, + sc->ib.dev->attrs.max_qp_rd_atom); log_rdma_mr(INFO, "responder_resources=%d\n", info->responder_resources); + memset(&conn_param, 0, sizeof(conn_param)); + conn_param.initiator_depth = info->initiator_depth; + conn_param.responder_resources = info->responder_resources; + /* Need to send IRD/ORD in private data for iWARP */ sc->ib.dev->ops.get_port_immutable( sc->ib.dev, sc->rdma.cm_id->port_num, &port_immutable); if (port_immutable.core_cap_flags & RDMA_CORE_PORT_IWARP) { - ird_ord_hdr[0] = info->responder_resources; - ird_ord_hdr[1] = 1; + ird_ord_hdr[0] = cpu_to_be32(conn_param.responder_resources); + ird_ord_hdr[1] = cpu_to_be32(conn_param.initiator_depth); conn_param.private_data = ird_ord_hdr; conn_param.private_data_len = sizeof(ird_ord_hdr); } else { @@ -2121,6 +2205,12 @@ static int allocate_mr_list(struct smbd_connection *info) atomic_set(&info->mr_used_count, 0); init_waitqueue_head(&info->wait_for_mr_cleanup); INIT_WORK(&info->mr_recovery_work, smbd_mr_recovery_work); + + if (info->responder_resources == 0) { + log_rdma_mr(ERR, "responder_resources negotiated as 0\n"); + return -EINVAL; + } + /* Allocate more MRs (2x) than hardware responder_resources */ for (i = 0; i < info->responder_resources * 2; i++) { smbdirect_mr = kzalloc(sizeof(*smbdirect_mr), GFP_KERNEL); diff --git a/fs/smb/client/smbdirect.h b/fs/smb/client/smbdirect.h index e45aa9ddd71d..4ca9b2b2c57f 100644 --- a/fs/smb/client/smbdirect.h +++ b/fs/smb/client/smbdirect.h @@ -67,7 +67,9 @@ struct smbd_connection { /* Memory registrations */ /* Maximum number of RDMA read/write outstanding on this connection */ - int responder_resources; + bool legacy_iwarp; + u8 initiator_depth; + u8 responder_resources; /* Maximum number of pages in a single RDMA write/read on this connection */ int max_frmr_depth; /* diff --git a/fs/smb/server/ksmbd_netlink.h b/fs/smb/server/ksmbd_netlink.h index 3f07a612c05b..8ccd57fd904b 100644 --- a/fs/smb/server/ksmbd_netlink.h +++ b/fs/smb/server/ksmbd_netlink.h @@ -112,10 +112,11 @@ struct ksmbd_startup_request { __u32 smbd_max_io_size; /* smbd read write size */ __u32 max_connections; /* Number of maximum 
simultaneous connections */ __s8 bind_interfaces_only; - __s8 reserved[503]; /* Reserved room */ + __u32 max_ip_connections; /* Number of maximum connection per ip address */ + __s8 reserved[499]; /* Reserved room */ __u32 ifc_list_sz; /* interfaces list size */ __s8 ____payload[]; -}; +} __packed; #define KSMBD_STARTUP_CONFIG_INTERFACES(s) ((s)->____payload) diff --git a/fs/smb/server/mgmt/user_session.c b/fs/smb/server/mgmt/user_session.c index 9dec4c2940bc..b36d0676dbe5 100644 --- a/fs/smb/server/mgmt/user_session.c +++ b/fs/smb/server/mgmt/user_session.c @@ -104,29 +104,32 @@ int ksmbd_session_rpc_open(struct ksmbd_session *sess, char *rpc_name) if (!entry) return -ENOMEM; - down_read(&sess->rpc_lock); entry->method = method; entry->id = id = ksmbd_ipc_id_alloc(); if (id < 0) goto free_entry; + + down_write(&sess->rpc_lock); old = xa_store(&sess->rpc_handle_list, id, entry, KSMBD_DEFAULT_GFP); - if (xa_is_err(old)) + if (xa_is_err(old)) { + up_write(&sess->rpc_lock); goto free_id; + } resp = ksmbd_rpc_open(sess, id); - if (!resp) - goto erase_xa; + if (!resp) { + xa_erase(&sess->rpc_handle_list, entry->id); + up_write(&sess->rpc_lock); + goto free_id; + } - up_read(&sess->rpc_lock); + up_write(&sess->rpc_lock); kvfree(resp); return id; -erase_xa: - xa_erase(&sess->rpc_handle_list, entry->id); free_id: ksmbd_rpc_id_free(entry->id); free_entry: kfree(entry); - up_read(&sess->rpc_lock); return -EINVAL; } @@ -144,9 +147,14 @@ void ksmbd_session_rpc_close(struct ksmbd_session *sess, int id) int ksmbd_session_rpc_method(struct ksmbd_session *sess, int id) { struct ksmbd_session_rpc *entry; + int method; + down_read(&sess->rpc_lock); entry = xa_load(&sess->rpc_handle_list, id); - return entry ? entry->method : 0; + method = entry ? entry->method : 0; + up_read(&sess->rpc_lock); + + return method; } void ksmbd_session_destroy(struct ksmbd_session *sess) diff --git a/fs/smb/server/server.h b/fs/smb/server/server.h index 995555febe7d..b8a7317be86b 100644 --- a/fs/smb/server/server.h +++ b/fs/smb/server/server.h @@ -43,6 +43,7 @@ struct ksmbd_server_config { unsigned int auth_mechs; unsigned int max_connections; unsigned int max_inflight_req; + unsigned int max_ip_connections; char *conf[SERVER_CONF_WORK_GROUP + 1]; struct task_struct *dh_task; diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index a565fc36cee6..a1db006ab6e9 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -5628,7 +5628,8 @@ static int smb2_get_info_filesystem(struct ksmbd_work *work, if (!work->tcon->posix_extensions) { pr_err("client doesn't negotiate with SMB3.1.1 POSIX Extensions\n"); - rc = -EOPNOTSUPP; + path_put(&path); + return -EOPNOTSUPP; } else { info = (struct filesystem_posix_info *)(rsp->Buffer); info->OptimalTransferSize = cpu_to_le32(stfs.f_bsize); diff --git a/fs/smb/server/transport_ipc.c b/fs/smb/server/transport_ipc.c index 2a3e2b0ce557..2aa1b29bea08 100644 --- a/fs/smb/server/transport_ipc.c +++ b/fs/smb/server/transport_ipc.c @@ -335,6 +335,9 @@ static int ipc_server_config_on_startup(struct ksmbd_startup_request *req) if (req->max_connections) server_conf.max_connections = req->max_connections; + if (req->max_ip_connections) + server_conf.max_ip_connections = req->max_ip_connections; + ret = ksmbd_set_netbios_name(req->netbios_name); ret |= ksmbd_set_server_string(req->server_string); ret |= ksmbd_set_work_group(req->work_group); diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index 74dfb6496095..e1f659d3b4cf 100644 --- 
diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c
index 74dfb6496095..e1f659d3b4cf 100644
--- a/fs/smb/server/transport_rdma.c
+++ b/fs/smb/server/transport_rdma.c
@@ -153,6 +153,10 @@ struct smb_direct_transport {
 	struct work_struct	disconnect_work;
 
 	bool			negotiation_requested;
+
+	bool			legacy_iwarp;
+	u8			initiator_depth;
+	u8			responder_resources;
 };
 
 #define KSMBD_TRANS(t)	((struct ksmbd_transport *)&((t)->transport))
@@ -347,6 +351,9 @@ static struct smb_direct_transport *alloc_transport(struct rdma_cm_id *cm_id)
 	t->cm_id = cm_id;
 	cm_id->context = t;
 
+	t->initiator_depth = SMB_DIRECT_CM_INITIATOR_DEPTH;
+	t->responder_resources = 1;
+
 	t->status = SMB_DIRECT_CS_NEW;
 	init_waitqueue_head(&t->wait_status);
@@ -1676,21 +1683,21 @@ static int smb_direct_send_negotiate_response(struct smb_direct_transport *t,
 static int smb_direct_accept_client(struct smb_direct_transport *t)
 {
 	struct rdma_conn_param conn_param;
-	struct ib_port_immutable port_immutable;
-	u32 ird_ord_hdr[2];
+	__be32 ird_ord_hdr[2];
 	int ret;
 
+	/*
+	 * smb_direct_handle_connect_request()
+	 * already negotiated t->initiator_depth
+	 * and t->responder_resources
+	 */
 	memset(&conn_param, 0, sizeof(conn_param));
-	conn_param.initiator_depth = min_t(u8, t->cm_id->device->attrs.max_qp_rd_atom,
-					   SMB_DIRECT_CM_INITIATOR_DEPTH);
-	conn_param.responder_resources = 0;
-
-	t->cm_id->device->ops.get_port_immutable(t->cm_id->device,
-						 t->cm_id->port_num,
-						 &port_immutable);
-	if (port_immutable.core_cap_flags & RDMA_CORE_PORT_IWARP) {
-		ird_ord_hdr[0] = conn_param.responder_resources;
-		ird_ord_hdr[1] = 1;
+	conn_param.initiator_depth = t->initiator_depth;
+	conn_param.responder_resources = t->responder_resources;
+
+	if (t->legacy_iwarp) {
+		ird_ord_hdr[0] = cpu_to_be32(conn_param.responder_resources);
+		ird_ord_hdr[1] = cpu_to_be32(conn_param.initiator_depth);
 		conn_param.private_data = ird_ord_hdr;
 		conn_param.private_data_len = sizeof(ird_ord_hdr);
 	} else {
@@ -2081,10 +2088,13 @@ static bool rdma_frwr_is_supported(struct ib_device_attr *attrs)
 	return true;
 }
 
-static int smb_direct_handle_connect_request(struct rdma_cm_id *new_cm_id)
+static int smb_direct_handle_connect_request(struct rdma_cm_id *new_cm_id,
+					     struct rdma_cm_event *event)
 {
 	struct smb_direct_transport *t;
 	struct task_struct *handler;
+	u8 peer_initiator_depth;
+	u8 peer_responder_resources;
 	int ret;
 
 	if (!rdma_frwr_is_supported(&new_cm_id->device->attrs)) {
@@ -2098,6 +2108,67 @@ static int smb_direct_handle_connect_request(struct rdma_cm_id *new_cm_id)
 	if (!t)
 		return -ENOMEM;
 
+	peer_initiator_depth = event->param.conn.initiator_depth;
+	peer_responder_resources = event->param.conn.responder_resources;
+	if (rdma_protocol_iwarp(new_cm_id->device, new_cm_id->port_num) &&
+	    event->param.conn.private_data_len == 8) {
+		/*
+		 * Legacy clients with only iWarp MPA v1 support
+		 * need a private blob in order to negotiate
+		 * the IRD/ORD values.
+		 */
+		const __be32 *ird_ord_hdr = event->param.conn.private_data;
+		u32 ird32 = be32_to_cpu(ird_ord_hdr[0]);
+		u32 ord32 = be32_to_cpu(ird_ord_hdr[1]);
+
+		/*
+		 * cifs.ko sends the legacy IRD/ORD negotiation
+		 * even if iWarp MPA v2 was used.
+		 *
+		 * Here we check that the values match and only
+		 * mark the client as legacy if they don't match.
+		 */
+		if ((u32)event->param.conn.initiator_depth != ird32 ||
+		    (u32)event->param.conn.responder_resources != ord32) {
+			/*
+			 * There are broken clients (old cifs.ko)
+			 * using little endian and also
+			 * struct rdma_conn_param only uses u8
+			 * for initiator_depth and responder_resources,
+			 * so we truncate the value to U8_MAX.
+			 *
+			 * smb_direct_accept_client() will then
+			 * do the real negotiation in order to
+			 * select the minimum between client and
+			 * server.
+			 */
+			ird32 = min_t(u32, ird32, U8_MAX);
+			ord32 = min_t(u32, ord32, U8_MAX);
+
+			t->legacy_iwarp = true;
+			peer_initiator_depth = (u8)ird32;
+			peer_responder_resources = (u8)ord32;
+		}
+	}
+
+	/*
+	 * First set what we as the server are able to support
+	 */
+	t->initiator_depth = min_t(u8, t->initiator_depth,
+				   new_cm_id->device->attrs.max_qp_rd_atom);
+
+	/*
+	 * Negotiate the value by using the minimum
+	 * between client and server if the client provided
+	 * non-zero values.
+	 */
+	if (peer_initiator_depth != 0)
+		t->initiator_depth = min_t(u8, t->initiator_depth,
+					   peer_initiator_depth);
+	if (peer_responder_resources != 0)
+		t->responder_resources = min_t(u8, t->responder_resources,
+					       peer_responder_resources);
+
 	ret = smb_direct_connect(t);
 	if (ret)
 		goto out_err;
@@ -2122,7 +2193,7 @@ static int smb_direct_listen_handler(struct rdma_cm_id *cm_id,
 {
 	switch (event->event) {
 	case RDMA_CM_EVENT_CONNECT_REQUEST: {
-		int ret = smb_direct_handle_connect_request(cm_id);
+		int ret = smb_direct_handle_connect_request(cm_id, event);
 
 		if (ret) {
 			pr_err("Can't create transport: %d\n", ret);
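The subtle part of the connect-request hunk is telling true MPA v1 peers apart from cifs.ko, which sends the legacy 8-byte blob even when MPA v2 already negotiated the values: only a mismatch between the blob and what rdma_cm decoded marks the client as legacy, and blob values are truncated to the u8 range of struct rdma_conn_param. A userspace model of that detection, with hypothetical stand-in types; ntohl() stands in for be32_to_cpu():

/* legacy_iwarp.c - hypothetical model of the detection logic */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>		/* ntohl() stands in for be32_to_cpu() */

#define U8_MAX 255u

struct conn_req {		/* stand-in for the rdma_cm event data */
	uint8_t initiator_depth;
	uint8_t responder_resources;
	const void *private_data;
	size_t private_data_len;
};

static bool detect_legacy_iwarp(const struct conn_req *req,
				uint8_t *peer_ird, uint8_t *peer_ord)
{
	uint32_t blob[2];

	*peer_ird = req->initiator_depth;
	*peer_ord = req->responder_resources;

	if (req->private_data_len != sizeof(blob))
		return false;

	memcpy(blob, req->private_data, sizeof(blob));
	blob[0] = ntohl(blob[0]);	/* IRD */
	blob[1] = ntohl(blob[1]);	/* ORD */

	/* MPA v2 peers echo matching values; only a mismatch is legacy. */
	if (req->initiator_depth == blob[0] &&
	    req->responder_resources == blob[1])
		return false;

	/* struct rdma_conn_param only has u8 fields: clamp to U8_MAX */
	*peer_ird = blob[0] > U8_MAX ? U8_MAX : (uint8_t)blob[0];
	*peer_ord = blob[1] > U8_MAX ? U8_MAX : (uint8_t)blob[1];
	return true;
}

int main(void)
{
	const uint32_t blob[2] = { htonl(16), htonl(16) };
	const struct conn_req req = {
		.private_data = blob,
		.private_data_len = sizeof(blob),
	};
	uint8_t ird, ord;
	bool legacy = detect_legacy_iwarp(&req, &ird, &ord);

	/* rdma_cm decoded 0/0 but the blob says 16/16: legacy client */
	printf("legacy=%d ird=%u ord=%u\n", legacy, ird, ord);
	return 0;
}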
diff --git a/fs/smb/server/transport_tcp.c b/fs/smb/server/transport_tcp.c
index 4337df97987d..1009cb324fd5 100644
--- a/fs/smb/server/transport_tcp.c
+++ b/fs/smb/server/transport_tcp.c
@@ -238,6 +238,7 @@ static int ksmbd_kthread_fn(void *p)
 	struct interface *iface = (struct interface *)p;
 	struct ksmbd_conn *conn;
 	int ret;
+	unsigned int max_ip_conns;
 
 	while (!kthread_should_stop()) {
 		mutex_lock(&iface->sock_release_lock);
@@ -255,34 +256,38 @@ static int ksmbd_kthread_fn(void *p)
 			continue;
 		}
 
+		if (!server_conf.max_ip_connections)
+			goto skip_max_ip_conns_limit;
+
 		/*
 		 * Limits repeated connections from clients with the same IP.
 		 */
+		max_ip_conns = 0;
 		down_read(&conn_list_lock);
-		list_for_each_entry(conn, &conn_list, conns_list)
+		list_for_each_entry(conn, &conn_list, conns_list) {
 #if IS_ENABLED(CONFIG_IPV6)
 			if (client_sk->sk->sk_family == AF_INET6) {
 				if (memcmp(&client_sk->sk->sk_v6_daddr,
-					   &conn->inet6_addr, 16) == 0) {
-					ret = -EAGAIN;
-					break;
-				}
+					   &conn->inet6_addr, 16) == 0)
+					max_ip_conns++;
 			} else if (inet_sk(client_sk->sk)->inet_daddr ==
-				   conn->inet_addr) {
-				ret = -EAGAIN;
-				break;
-			}
+				   conn->inet_addr)
+				max_ip_conns++;
 #else
 			if (inet_sk(client_sk->sk)->inet_daddr ==
-			    conn->inet_addr) {
+			    conn->inet_addr)
+				max_ip_conns++;
+#endif
+			if (server_conf.max_ip_connections <= max_ip_conns) {
 				ret = -EAGAIN;
 				break;
 			}
-#endif
+		}
 		up_read(&conn_list_lock);
 		if (ret == -EAGAIN)
 			continue;
 
+skip_max_ip_conns_limit:
 		if (server_conf.max_connections &&
 		    atomic_inc_return(&active_num_conn) >= server_conf.max_connections) {
 			pr_info_ratelimited("Limit the maximum number of connections(%u)\n",
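The transport_tcp.c hunk turns the old "any repeated connection from the same IP fails" check into a counted, configurable limit: live connections from the client's address are tallied under the list lock, the accept loop bails out with -EAGAIN once the ceiling is reached, and a value of 0 disables the check entirely. A simplified IPv4-only sketch of that gate (hypothetical names, plain array instead of the kernel's locked list):

/* ip_limit.c - hypothetical sketch of the per-IP accept gate */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

static bool allow_connection(const uint32_t *live_addrs, size_t n_live,
			     uint32_t new_addr, unsigned int max_ip_conns)
{
	unsigned int count = 0;
	size_t i;

	if (max_ip_conns == 0)	/* 0 means the limit is disabled */
		return true;

	for (i = 0; i < n_live; i++) {
		if (live_addrs[i] == new_addr)
			count++;
		if (count >= max_ip_conns)
			return false;	/* the kernel returns -EAGAIN here */
	}
	return true;
}

int main(void)
{
	const uint32_t live[] = { 0x0a000001, 0x0a000001, 0x0a000002 };

	/* a third connection from 10.0.0.1 is refused with a limit of 2 */
	printf("%d\n", allow_connection(live, 3, 0x0a000001, 2));
	return 0;
}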
diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c
index d5918eba27e3..53104f25de51 100644
--- a/fs/squashfs/inode.c
+++ b/fs/squashfs/inode.c
@@ -165,6 +165,7 @@ int squashfs_read_inode(struct inode *inode, long long ino)
 		squashfs_i(inode)->start = le32_to_cpu(sqsh_ino->start_block);
 		squashfs_i(inode)->block_list_start = block;
 		squashfs_i(inode)->offset = offset;
+		squashfs_i(inode)->parent = 0;
 		inode->i_data.a_ops = &squashfs_aops;
 
 		TRACE("File inode %x:%x, start_block %llx, block_list_start "
@@ -212,6 +213,7 @@ int squashfs_read_inode(struct inode *inode, long long ino)
 		squashfs_i(inode)->start = le64_to_cpu(sqsh_ino->start_block);
 		squashfs_i(inode)->block_list_start = block;
 		squashfs_i(inode)->offset = offset;
+		squashfs_i(inode)->parent = 0;
 		inode->i_data.a_ops = &squashfs_aops;
 
 		TRACE("File inode %x:%x, start_block %llx, block_list_start "
@@ -292,6 +294,7 @@ int squashfs_read_inode(struct inode *inode, long long ino)
 		inode->i_mode |= S_IFLNK;
 		squashfs_i(inode)->start = block;
 		squashfs_i(inode)->offset = offset;
+		squashfs_i(inode)->parent = 0;
 
 		if (type == SQUASHFS_LSYMLINK_TYPE) {
 			__le32 xattr;
@@ -329,6 +332,7 @@ int squashfs_read_inode(struct inode *inode, long long ino)
 		set_nlink(inode, le32_to_cpu(sqsh_ino->nlink));
 		rdev = le32_to_cpu(sqsh_ino->rdev);
 		init_special_inode(inode, inode->i_mode, new_decode_dev(rdev));
+		squashfs_i(inode)->parent = 0;
 
 		TRACE("Device inode %x:%x, rdev %x\n",
 		      SQUASHFS_INODE_BLK(ino), offset, rdev);
@@ -353,6 +357,7 @@ int squashfs_read_inode(struct inode *inode, long long ino)
 		set_nlink(inode, le32_to_cpu(sqsh_ino->nlink));
 		rdev = le32_to_cpu(sqsh_ino->rdev);
 		init_special_inode(inode, inode->i_mode, new_decode_dev(rdev));
+		squashfs_i(inode)->parent = 0;
 
 		TRACE("Device inode %x:%x, rdev %x\n",
 		      SQUASHFS_INODE_BLK(ino), offset, rdev);
@@ -373,6 +378,7 @@ int squashfs_read_inode(struct inode *inode, long long ino)
 		inode->i_mode |= S_IFSOCK;
 		set_nlink(inode, le32_to_cpu(sqsh_ino->nlink));
 		init_special_inode(inode, inode->i_mode, 0);
+		squashfs_i(inode)->parent = 0;
 		break;
 	}
 	case SQUASHFS_LFIFO_TYPE:
@@ -392,6 +398,7 @@ int squashfs_read_inode(struct inode *inode, long long ino)
 		inode->i_op = &squashfs_inode_ops;
 		set_nlink(inode, le32_to_cpu(sqsh_ino->nlink));
 		init_special_inode(inode, inode->i_mode, 0);
+		squashfs_i(inode)->parent = 0;
 		break;
 	}
 	default:
diff --git a/fs/squashfs/squashfs_fs_i.h b/fs/squashfs/squashfs_fs_i.h
index 2c82d6f2a456..8e497ac07b9a 100644
--- a/fs/squashfs/squashfs_fs_i.h
+++ b/fs/squashfs/squashfs_fs_i.h
@@ -16,6 +16,7 @@ struct squashfs_inode_info {
 	u64		xattr;
 	unsigned int	xattr_size;
 	int		xattr_count;
+	int		parent;
 	union {
 		struct {
 			u64	fragment_block;
@@ -27,7 +28,6 @@ struct squashfs_inode_info {
 			u64	dir_idx_start;
 			int	dir_idx_offset;
 			int	dir_idx_cnt;
-			int	parent;
 		};
 	};
 	struct inode	vfs_inode;
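The squashfs hunks all address one hazard: `parent` used to live inside the union, overlapping the file-inode members, so reading it on a non-directory inode returned whatever bytes the other union member had written there. Hoisting it out of the union and zeroing it in every non-directory branch makes the field valid for all inode types. A miniature stand-in demonstrating the aliasing and the fix (hypothetical struct, far smaller than the real one):

/* union_alias.c - hypothetical miniature of the squashfs layout change */
#include <stdint.h>
#include <stdio.h>

struct old_info {			/* `parent` lives inside the union */
	union {
		struct {		/* regular-file members */
			uint64_t fragment_block;
			uint64_t block_list_start;
		};
		struct {		/* directory members */
			uint64_t dir_idx_start;
			int	 parent;	/* aliases block_list_start */
		};
	};
};

struct new_info {			/* `parent` hoisted out of the union */
	int parent;			/* explicitly set to 0 for non-dirs */
	union {
		struct {
			uint64_t fragment_block;
			uint64_t block_list_start;
		};
		struct {
			uint64_t dir_idx_start;
		};
	};
};

int main(void)
{
	struct old_info o = { .block_list_start = 0xdeadbeefULL };
	struct new_info n = { .parent = 0 };

	n.block_list_start = 0xdeadbeefULL;

	/* old layout: parent reads block_list_start's bytes -> garbage */
	printf("old parent=%d new parent=%d\n", o.parent, n.parent);
	return 0;
}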
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index f24aa98e6869..a79d73f28aa7 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -2272,6 +2272,9 @@ int udf_current_aext(struct inode *inode, struct extent_position *epos,
 		if (check_add_overflow(sizeof(struct allocExtDesc),
 				       le32_to_cpu(header->lengthAllocDescs), &alen))
 			return -1;
+
+		if (alen > epos->bh->b_size)
+			return -1;
 	}
 
 	switch (iinfo->i_alloc_type) {
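The udf hunk adds a second check behind check_add_overflow(): even when the addition does not wrap, the computed length must still fit inside the buffer head that was actually read, or a crafted lengthAllocDescs could send the extent walker past the mapped block. A userspace sketch of the two-step validation; __builtin_add_overflow() is the GCC/Clang primitive that backs the kernel's check_add_overflow(), and 24 stands in for sizeof(struct allocExtDesc):

/* alen_check.c - hypothetical sketch of the double validation */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

static bool alen_valid(size_t header_size, uint32_t length_alloc_descs,
		       size_t buffer_size, size_t *alen)
{
	/* step 1: the on-disk value must not overflow the addition */
	if (__builtin_add_overflow(header_size, length_alloc_descs, alen))
		return false;

	/* step 2: even without overflow, alen must fit the block read */
	if (*alen > buffer_size)
		return false;

	return true;
}

int main(void)
{
	size_t alen;

	/* a lengthAllocDescs that runs past a 4096-byte block is rejected */
	printf("%d\n", alen_valid(24, 5000, 4096, &alen));
	return 0;
}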