author     Paolo Bonzini <pbonzini@redhat.com>  2019-08-09 17:53:39 +0300
committer  Paolo Bonzini <pbonzini@redhat.com>  2019-08-09 17:53:39 +0300
commit     0e1c438c44dd9cde56effb44c5f1cfeda72e108d (patch)
tree       fa3492d4d7d8b7444e5d8ebe6c78210826333e4b /fs
parent     c096397c78f766db972f923433031f2dec01cae0 (diff)
parent     cdb2d3ee0436d74fa9092f2df46aaa6f9e03c969 (diff)
download   linux-0e1c438c44dd9cde56effb44c5f1cfeda72e108d.tar.xz
Merge tag 'kvmarm-fixes-for-5.3' of git://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm into HEAD
KVM/arm fixes for 5.3
- A bunch of switch/case fall-through annotations, fixing one actual bug
- Fix PMU reset bug
- Add missing exception class debug strings
Diffstat (limited to 'fs')
220 files changed, 10017 insertions, 7355 deletions
diff --git a/fs/Makefile b/fs/Makefile
index c9aea23aba56..d60089fd689b 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -52,7 +52,7 @@ obj-$(CONFIG_COREDUMP)	+= coredump.o
 obj-$(CONFIG_SYSCTL)		+= drop_caches.o
 obj-$(CONFIG_FHANDLE)	+= fhandle.o
-obj-$(CONFIG_FS_IOMAP)	+= iomap.o
+obj-y				+= iomap/
 obj-y				+= quota/
diff --git a/fs/adfs/adfs.h b/fs/adfs/adfs.h
index 804c6a77c5db..b7e844d2f321 100644
--- a/fs/adfs/adfs.h
+++ b/fs/adfs/adfs.h
@@ -1,4 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/buffer_head.h>
 #include <linux/fs.h>
 #include <linux/adfs_fs.h>
@@ -8,6 +9,15 @@
 #define ADFS_BAD_FRAG		 1
 #define ADFS_ROOT_FRAG		 2
+#define ADFS_FILETYPE_NONE	((u16)~0)
+
+/* RISC OS 12-bit filetype is stored in load_address[19:8] */
+static inline u16 adfs_filetype(u32 loadaddr)
+{
+	return (loadaddr & 0xfff00000) == 0xfff00000 ?
+	       (loadaddr >> 8) & 0xfff : ADFS_FILETYPE_NONE;
+}
+
 #define ADFS_NDA_OWNER_READ	(1 << 0)
 #define ADFS_NDA_OWNER_WRITE	(1 << 1)
 #define ADFS_NDA_LOCKED		(1 << 2)
@@ -18,22 +28,28 @@
 #include "dir_f.h"
 
-struct buffer_head;
-
 /*
  * adfs file system inode data in memory
  */
 struct adfs_inode_info {
	loff_t		mmu_private;
-	unsigned long	parent_id;	/* object id of parent		*/
+	__u32		parent_id;	/* parent indirect disc address	*/
	__u32		loadaddr;	/* RISC OS load address		*/
	__u32		execaddr;	/* RISC OS exec address		*/
-	unsigned int	filetype;	/* RISC OS file type		*/
	unsigned int	attr;		/* RISC OS permissions		*/
-	unsigned int	stamped:1;	/* RISC OS file has date/time	*/
	struct inode	vfs_inode;
 };
+
+static inline struct adfs_inode_info *ADFS_I(struct inode *inode)
+{
+	return container_of(inode, struct adfs_inode_info, vfs_inode);
+}
+
+static inline bool adfs_inode_is_stamped(struct inode *inode)
+{
+	return (ADFS_I(inode)->loadaddr & 0xfff00000) == 0xfff00000;
+}
+
 /*
  * Forward-declare this
  */
@@ -59,10 +75,8 @@ struct adfs_sb_info {
	__u32		s_ids_per_zone;	/* max. no ids in one zone */
	__u32		s_idlen;	/* length of ID in map */
	__u32		s_map_size;	/* sector size of a map	*/
-	unsigned long	s_size;		/* total size (in blocks) of this fs */
	signed int	s_map2blk;	/* shift left by this for map->sector*/
	unsigned int	s_log2sharesize;/* log2 share size */
-	__le32		s_version;	/* disc format version */
	unsigned int	s_namelen;	/* maximum number of characters in name	*/
 };
@@ -71,11 +85,6 @@ static inline struct adfs_sb_info *ADFS_SB(struct super_block *sb)
	return sb->s_fs_info;
 }
 
-static inline struct adfs_inode_info *ADFS_I(struct inode *inode)
-{
-	return container_of(inode, struct adfs_inode_info, vfs_inode);
-}
-
 /*
  * Directory handling
  */
@@ -89,7 +98,7 @@ struct adfs_dir {
	struct buffer_head **bh_fplus;
 
	unsigned int		pos;
-	unsigned int		parent_id;
+	__u32			parent_id;
 
	struct adfs_dirheader	dirhead;
	union  adfs_dirtail	dirtail;
@@ -101,20 +110,18 @@ struct adfs_dir {
 #define ADFS_MAX_NAME_LEN	(256 + 4) /* +4 for ,xyz hex filetype suffix */
 
 struct object_info {
	__u32		parent_id;		/* parent object id	*/
-	__u32		file_id;		/* object id		*/
+	__u32		indaddr;		/* indirect disc addr	*/
	__u32		loadaddr;		/* load address		*/
	__u32		execaddr;		/* execution address	*/
	__u32		size;			/* size			*/
	__u8		attr;			/* RISC OS attributes	*/
	unsigned int	name_len;		/* name length		*/
	char		name[ADFS_MAX_NAME_LEN];/* file name		*/
-
-	/* RISC OS file type (12-bit: derived from loadaddr) */
-	__u16		filetype;
 };
 
 struct adfs_dir_ops {
-	int	(*read)(struct super_block *sb, unsigned int id, unsigned int sz, struct adfs_dir *dir);
+	int	(*read)(struct super_block *sb, unsigned int indaddr,
+			unsigned int size, struct adfs_dir *dir);
	int	(*setpos)(struct adfs_dir *dir, unsigned int fpos);
	int	(*getnext)(struct adfs_dir *dir, struct object_info *obj);
	int	(*update)(struct adfs_dir *dir, struct object_info *obj);
@@ -137,7 +144,7 @@ int adfs_write_inode(struct inode *inode, struct writeback_control *wbc);
 int adfs_notify_change(struct dentry *dentry, struct iattr *attr);
 
 /* map.c */
-extern int adfs_map_lookup(struct super_block *sb, unsigned int frag_id, unsigned int offset);
+int adfs_map_lookup(struct super_block *sb, u32 frag_id, unsigned int offset);
 extern unsigned int adfs_map_free(struct super_block *sb);
 
 /* Misc */
@@ -145,6 +152,7 @@ __printf(3, 4)
 void __adfs_error(struct super_block *sb, const char *function,
		  const char *fmt, ...);
 #define adfs_error(sb, fmt...) __adfs_error(sb, __func__, fmt)
+void adfs_msg(struct super_block *sb, const char *pfx, const char *fmt, ...);
 
 /* super.c */
@@ -182,16 +190,28 @@ static inline __u32 signed_asl(__u32 val, signed int shift)
  *
  * The root directory ID should always be looked up in the map [3.4]
  */
-static inline int
-__adfs_block_map(struct super_block *sb, unsigned int object_id,
-		 unsigned int block)
+static inline int __adfs_block_map(struct super_block *sb, u32 indaddr,
+				   unsigned int block)
 {
-	if (object_id & 255) {
+	if (indaddr & 255) {
		unsigned int off;
 
-		off = (object_id & 255) - 1;
+		off = (indaddr & 255) - 1;
		block += off << ADFS_SB(sb)->s_log2sharesize;
	}
 
-	return adfs_map_lookup(sb, object_id >> 8, block);
+	return adfs_map_lookup(sb, indaddr >> 8, block);
+}
+
+/* Return the disc record from the map */
+static inline
+struct adfs_discrecord *adfs_map_discrecord(struct adfs_discmap *dm)
+{
+	return (void *)(dm[0].dm_bh->b_data + 4);
+}
+
+static inline u64 adfs_disc_size(const struct adfs_discrecord *dr)
+{
+	return (u64)le32_to_cpu(dr->disc_size_high) << 32 |
+		    le32_to_cpu(dr->disc_size);
 }
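The new adfs_filetype() helper derives both the 12-bit RISC OS filetype and the "is stamped" condition from the load address, replacing the cached filetype and stamped fields. A minimal userspace sketch of that decode (illustrative only, not kernel code):

#include <stdint.h>
#include <stdio.h>

#define ADFS_FILETYPE_NONE ((uint16_t)~0)

/* Mirrors the kernel's adfs_filetype(): top 12 bits all set means the
 * object is filetyped/timestamped; the filetype lives in bits 19:8. */
static uint16_t adfs_filetype(uint32_t loadaddr)
{
	return (loadaddr & 0xfff00000) == 0xfff00000 ?
	       (loadaddr >> 8) & 0xfff : ADFS_FILETYPE_NONE;
}

int main(void)
{
	uint32_t loadaddr = 0xfff0003f;	/* the root object's load address */

	printf("stamped: %d\n", (loadaddr & 0xfff00000) == 0xfff00000);
	printf("filetype: %03x\n", (unsigned)adfs_filetype(loadaddr));
	return 0;
}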
diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c
index 35a4d9f4c3ae..a54c53244992 100644
--- a/fs/adfs/dir.c
+++ b/fs/adfs/dir.c
@@ -35,20 +35,14 @@ void adfs_object_fixup(struct adfs_dir *dir, struct object_info *obj)
	if (obj->name_len <= 2 && dots == obj->name_len)
		obj->name[0] = '^';
 
-	obj->filetype = -1;
-
	/*
-	 * object is a file and is filetyped and timestamped?
-	 * RISC OS 12-bit filetype is stored in load_address[19:8]
+	 * If the object is a file, and the user requested the ,xyz hex
+	 * filetype suffix to the name, check the filetype and append.
	 */
-	if ((0 == (obj->attr & ADFS_NDA_DIRECTORY)) &&
-	    (0xfff00000 == (0xfff00000 & obj->loadaddr))) {
-		obj->filetype = (__u16) ((0x000fff00 & obj->loadaddr) >> 8);
-
-		/* optionally append the ,xyz hex filetype suffix */
-		if (ADFS_SB(dir->sb)->s_ftsuffix) {
-			__u16 filetype = obj->filetype;
+	if (!(obj->attr & ADFS_NDA_DIRECTORY) && ADFS_SB(dir->sb)->s_ftsuffix) {
+		u16 filetype = adfs_filetype(obj->loadaddr);
 
+		if (filetype != ADFS_FILETYPE_NONE) {
			obj->name[obj->name_len++] = ',';
			obj->name[obj->name_len++] = hex_asc_lo(filetype >> 8);
			obj->name[obj->name_len++] = hex_asc_lo(filetype >> 4);
@@ -92,7 +86,7 @@ adfs_readdir(struct file *file, struct dir_context *ctx)
		goto unlock_out;
	while (ops->getnext(&dir, &obj) == 0) {
		if (!dir_emit(ctx, obj.name, obj.name_len,
-			      obj.file_id, DT_UNKNOWN))
+			      obj.indaddr, DT_UNKNOWN))
			break;
		ctx->pos++;
	}
@@ -113,8 +107,8 @@ adfs_dir_update(struct super_block *sb, struct object_info *obj, int wait)
	const struct adfs_dir_ops *ops = ADFS_SB(sb)->s_dir;
	struct adfs_dir dir;
 
-	printk(KERN_INFO "adfs_dir_update: object %06X in dir %06X\n",
-		obj->file_id, obj->parent_id);
+	printk(KERN_INFO "adfs_dir_update: object %06x in dir %06x\n",
+	       obj->indaddr, obj->parent_id);
 
	if (!ops->update) {
		ret = -EINVAL;
@@ -178,7 +172,8 @@ static int adfs_dir_lookup_byname(struct inode *inode, const struct qstr *qstr,
		goto out;
 
	if (ADFS_I(inode)->parent_id != dir.parent_id) {
-		adfs_error(sb, "parent directory changed under me! (%lx but got %x)\n",
+		adfs_error(sb,
+			   "parent directory changed under me! (%06x but got %06x)\n",
			   ADFS_I(inode)->parent_id, dir.parent_id);
		ret = -EIO;
		goto free_out;
diff --git a/fs/adfs/dir_f.c b/fs/adfs/dir_f.c
index 7557378e58b3..c1a950c7400a 100644
--- a/fs/adfs/dir_f.c
+++ b/fs/adfs/dir_f.c
@@ -6,7 +6,6 @@
  *
  *  E and F format directory handling
  */
-#include <linux/buffer_head.h>
 #include "adfs.h"
 #include "dir_f.h"
@@ -124,12 +123,9 @@ adfs_dir_checkbyte(const struct adfs_dir *dir)
	return (dircheck ^ (dircheck >> 8) ^ (dircheck >> 16) ^ (dircheck >> 24)) & 0xff;
 }
 
-/*
- * Read and check that a directory is valid
- */
-static int
-adfs_dir_read(struct super_block *sb, unsigned long object_id,
-	      unsigned int size, struct adfs_dir *dir)
+/* Read and check that a directory is valid */
+static int adfs_dir_read(struct super_block *sb, u32 indaddr,
+			 unsigned int size, struct adfs_dir *dir)
 {
	const unsigned int blocksize_bits = sb->s_blocksize_bits;
	int blk = 0;
@@ -149,10 +145,10 @@ adfs_dir_read(struct super_block *sb, unsigned long object_id,
	for (blk = 0; blk < size; blk++) {
		int phys;
 
-		phys = __adfs_block_map(sb, object_id, blk);
+		phys = __adfs_block_map(sb, indaddr, blk);
		if (!phys) {
-			adfs_error(sb, "dir object %lX has a hole at offset %d",
-				   object_id, blk);
+			adfs_error(sb, "dir %06x has a hole at offset %d",
+				   indaddr, blk);
			goto release_buffers;
		}
@@ -180,8 +176,7 @@ adfs_dir_read(struct super_block *sb, unsigned long object_id,
	return 0;
 
 bad_dir:
-	adfs_error(sb, "corrupted directory fragment %lX",
-		   object_id);
+	adfs_error(sb, "dir %06x is corrupted", indaddr);
 release_buffers:
	for (blk -= 1; blk >= 0; blk -= 1)
		brelse(dir->bh[blk]);
@@ -208,7 +203,7 @@ adfs_dir2obj(struct adfs_dir *dir, struct object_info *obj,
	}
 
	obj->name_len =	name_len;
-	obj->file_id  = adfs_readval(de->dirinddiscadd, 3);
+	obj->indaddr  = adfs_readval(de->dirinddiscadd, 3);
	obj->loadaddr = adfs_readval(de->dirload, 4);
	obj->execaddr = adfs_readval(de->direxec, 4);
	obj->size     = adfs_readval(de->dirlen,  4);
@@ -223,7 +218,7 @@ adfs_dir2obj(struct adfs_dir *dir, struct object_info *obj,
 static inline void
 adfs_obj2dir(struct adfs_direntry *de, struct object_info *obj)
 {
-	adfs_writeval(de->dirinddiscadd, 3, obj->file_id);
+	adfs_writeval(de->dirinddiscadd, 3, obj->indaddr);
	adfs_writeval(de->dirload, 4, obj->loadaddr);
	adfs_writeval(de->direxec, 4, obj->execaddr);
	adfs_writeval(de->dirlen,  4, obj->size);
@@ -309,8 +304,7 @@ __adfs_dir_put(struct adfs_dir *dir, int pos, struct object_info *obj)
 * the caller is responsible for holding the necessary
 * locks.
 */
-static int
-adfs_dir_find_entry(struct adfs_dir *dir, unsigned long object_id)
+static int adfs_dir_find_entry(struct adfs_dir *dir, u32 indaddr)
 {
	int pos, ret;
@@ -322,7 +316,7 @@ adfs_dir_find_entry(struct adfs_dir *dir, unsigned long object_id)
		if (!__adfs_dir_get(dir, pos, &obj))
			break;
 
-		if (obj.file_id == object_id) {
+		if (obj.indaddr == indaddr) {
			ret = pos;
			break;
		}
@@ -331,15 +325,15 @@ adfs_dir_find_entry(struct adfs_dir *dir, unsigned long object_id)
	return ret;
 }
 
-static int
-adfs_f_read(struct super_block *sb, unsigned int id, unsigned int sz, struct adfs_dir *dir)
+static int adfs_f_read(struct super_block *sb, u32 indaddr, unsigned int size,
+		       struct adfs_dir *dir)
 {
	int ret;
 
-	if (sz != ADFS_NEWDIR_SIZE)
+	if (size != ADFS_NEWDIR_SIZE)
		return -EIO;
 
-	ret = adfs_dir_read(sb, id, sz, dir);
+	ret = adfs_dir_read(sb, indaddr, size, dir);
	if (ret)
		adfs_error(sb, "unable to read directory");
	else
@@ -376,7 +370,7 @@ adfs_f_update(struct adfs_dir *dir, struct object_info *obj)
	struct super_block *sb = dir->sb;
	int ret, i;
 
-	ret = adfs_dir_find_entry(dir, obj->file_id);
+	ret = adfs_dir_find_entry(dir, obj->indaddr);
	if (ret < 0) {
		adfs_error(dir->sb, "unable to locate entry to update");
		goto out;
diff --git a/fs/adfs/dir_fplus.c b/fs/adfs/dir_fplus.c
index 6c5fbb0259c9..d56924c11b17 100644
--- a/fs/adfs/dir_fplus.c
+++ b/fs/adfs/dir_fplus.c
@@ -4,7 +4,6 @@
  *
  *  Copyright (C) 1997-1999 Russell King
  */
-#include <linux/buffer_head.h>
 #include <linux/slab.h>
 #include "adfs.h"
 #include "dir_fplus.h"
@@ -37,17 +36,15 @@ adfs_fplus_read(struct super_block *sb, unsigned int id, unsigned int sz, struct
	h = (struct adfs_bigdirheader *)dir->bh_fplus[0]->b_data;
	size = le32_to_cpu(h->bigdirsize);
	if (size != sz) {
-		printk(KERN_WARNING "adfs: adfs_fplus_read:"
-		       " directory header size %X\n"
-		       " does not match directory size %X\n",
-		       size, sz);
+		adfs_msg(sb, KERN_WARNING,
+			 "directory header size %X does not match directory size %X",
+			 size, sz);
	}
 
	if (h->bigdirversion[0] != 0 || h->bigdirversion[1] != 0 ||
	    h->bigdirversion[2] != 0 || size & 2047 ||
	    h->bigdirstartname != cpu_to_le32(BIGDIRSTARTNAME)) {
-		printk(KERN_WARNING "adfs: dir object %X has"
-		       " malformed dir header\n", id);
+		adfs_error(sb, "dir %06x has malformed header", id);
		goto out;
	}
@@ -58,9 +55,10 @@ adfs_fplus_read(struct super_block *sb, unsigned int id, unsigned int sz, struct
		kcalloc(size, sizeof(struct buffer_head *), GFP_KERNEL);
	if (!bh_fplus) {
+		adfs_msg(sb, KERN_ERR,
+			 "not enough memory for dir object %X (%d blocks)",
+			 id, size);
		ret = -ENOMEM;
-		adfs_error(sb, "not enough memory for"
-			   " dir object %X (%d blocks)", id, size);
		goto out;
	}
	dir->bh_fplus = bh_fplus;
@@ -91,8 +89,7 @@ adfs_fplus_read(struct super_block *sb, unsigned int id, unsigned int sz, struct
	if (t->bigdirendname != cpu_to_le32(BIGDIRENDNAME) ||
	    t->bigdirendmasseq != h->startmasseq ||
	    t->reserved[0] != 0 || t->reserved[1] != 0) {
-		printk(KERN_WARNING "adfs: dir object %X has "
-		       "malformed dir end\n", id);
+		adfs_error(sb, "dir %06x has malformed tail", id);
		goto out;
	}
@@ -180,7 +177,7 @@ adfs_fplus_getnext(struct adfs_dir *dir, struct object_info *obj)
	obj->loadaddr = le32_to_cpu(bde.bigdirload);
	obj->execaddr = le32_to_cpu(bde.bigdirexec);
	obj->size     = le32_to_cpu(bde.bigdirlen);
-	obj->file_id  = le32_to_cpu(bde.bigdirindaddr);
+	obj->indaddr  = le32_to_cpu(bde.bigdirindaddr);
	obj->attr     = le32_to_cpu(bde.bigdirattr);
	obj->name_len = le32_to_cpu(bde.bigdirobnamelen);
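The directory code above resolves blocks through __adfs_block_map(), which unpacks the indirect disc address it is given: bits 31:8 carry the map fragment id and bits 7:0 an optional 1-based sharing offset. A small userspace sketch of that split, with made-up values for illustration:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t indaddr = 0x000204;	/* hypothetical: frag 0x2, share slot 4 */
	unsigned int log2sharesize = 7;	/* hypothetical 128-block share size */
	unsigned int block = 0;

	/* Same arithmetic as __adfs_block_map() */
	if (indaddr & 255)
		block += ((indaddr & 255) - 1) << log2sharesize;

	printf("frag %06x, block offset %u\n", indaddr >> 8, block);
	return 0;
}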
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index 904d624541ad..124de75413a5 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -94,7 +94,7 @@ adfs_atts2mode(struct super_block *sb, struct inode *inode)
		return S_IFDIR | S_IXUGO | mode;
	}
 
-	switch (ADFS_I(inode)->filetype) {
+	switch (adfs_filetype(ADFS_I(inode)->loadaddr)) {
	case 0xfc0:	/* LinkFS */
		return S_IFLNK|S_IRWXUGO;
@@ -174,7 +174,7 @@ adfs_adfs2unix_time(struct timespec64 *tv, struct inode *inode)
			2208988800000000000LL;
	s64 nsec;
 
-	if (ADFS_I(inode)->stamped == 0)
+	if (!adfs_inode_is_stamped(inode))
		goto cur_time;
 
	high = ADFS_I(inode)->loadaddr & 0xFF; /* top 8 bits of timestamp */
@@ -213,7 +213,7 @@ adfs_unix2adfs_time(struct inode *inode, unsigned int secs)
 {
	unsigned int high, low;
 
-	if (ADFS_I(inode)->stamped) {
+	if (adfs_inode_is_stamped(inode)) {
		/* convert 32-bit seconds to 40-bit centi-seconds */
		low  = (secs & 255) * 100;
		high = (secs / 256) * 100 + (low >> 8) + 0x336e996a;
@@ -247,7 +247,7 @@ adfs_iget(struct super_block *sb, struct object_info *obj)
	inode->i_uid	 = ADFS_SB(sb)->s_uid;
	inode->i_gid	 = ADFS_SB(sb)->s_gid;
-	inode->i_ino	 = obj->file_id;
+	inode->i_ino	 = obj->indaddr;
	inode->i_size	 = obj->size;
	set_nlink(inode, 2);
	inode->i_blocks	 = (inode->i_size + sb->s_blocksize - 1) >>
@@ -263,8 +263,6 @@ adfs_iget(struct super_block *sb, struct object_info *obj)
	ADFS_I(inode)->loadaddr  = obj->loadaddr;
	ADFS_I(inode)->execaddr  = obj->execaddr;
	ADFS_I(inode)->attr      = obj->attr;
-	ADFS_I(inode)->filetype  = obj->filetype;
-	ADFS_I(inode)->stamped   = ((obj->loadaddr & 0xfff00000) == 0xfff00000);
 
	inode->i_mode	 = adfs_atts2mode(sb, inode);
	adfs_adfs2unix_time(&inode->i_mtime, inode);
@@ -355,7 +353,7 @@ int adfs_write_inode(struct inode *inode, struct writeback_control *wbc)
	struct object_info obj;
	int ret;
 
-	obj.file_id	= inode->i_ino;
+	obj.indaddr	= inode->i_ino;
	obj.name_len	= 0;
	obj.parent_id	= ADFS_I(inode)->parent_id;
	obj.loadaddr	= ADFS_I(inode)->loadaddr;
diff --git a/fs/adfs/map.c b/fs/adfs/map.c
index 4d34338c6176..f44d12cef5be 100644
--- a/fs/adfs/map.c
+++ b/fs/adfs/map.c
@@ -4,7 +4,6 @@
  *
  *  Copyright (C) 1997-2002 Russell King
  */
-#include <linux/buffer_head.h>
 #include <asm/unaligned.h>
 #include "adfs.h"
@@ -64,9 +63,8 @@ static DEFINE_RWLOCK(adfs_map_lock);
 * output of:
 *  gcc -D__KERNEL__ -O2 -I../../include -o - -S map.c
 */
-static int
-lookup_zone(const struct adfs_discmap *dm, const unsigned int idlen,
-	    const unsigned int frag_id, unsigned int *offset)
+static int lookup_zone(const struct adfs_discmap *dm, const unsigned int idlen,
+		       const u32 frag_id, unsigned int *offset)
 {
	const unsigned int mapsize = dm->dm_endbit;
	const u32 idmask = (1 << idlen) - 1;
@@ -185,9 +183,8 @@ error:
	return 0;
 }
 
-static int
-scan_map(struct adfs_sb_info *asb, unsigned int zone,
-	 const unsigned int frag_id, unsigned int mapoff)
+static int scan_map(struct adfs_sb_info *asb, unsigned int zone,
+		    const u32 frag_id, unsigned int mapoff)
 {
	const unsigned int idlen = asb->s_idlen;
	struct adfs_discmap *dm, *dm_end;
@@ -241,9 +238,7 @@ adfs_map_free(struct super_block *sb)
	return signed_asl(total, asb->s_map2blk);
 }
 
-int
-adfs_map_lookup(struct super_block *sb, unsigned int frag_id,
-		unsigned int offset)
+int adfs_map_lookup(struct super_block *sb, u32 frag_id, unsigned int offset)
 {
	struct adfs_sb_info *asb = ADFS_SB(sb);
	unsigned int zone, mapoff;
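The timestamp hunks above convert between 32-bit Unix seconds and the 40-bit RISC OS centisecond clock (epoch 1900-01-01); the magic constant 0x336e996a is (2208988800 * 100) >> 8, the 1900-to-1970 offset in centiseconds pre-shifted into the high word. A userspace sketch of adfs_unix2adfs_time()'s arithmetic, checking the Unix epoch itself:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t secs = 0;	/* 1970-01-01T00:00:00Z */
	uint32_t low  = (secs & 255) * 100;
	uint32_t high = (secs / 256) * 100 + (low >> 8) + 0x336e996a;
	uint64_t cs   = ((uint64_t)high << 8) | (low & 255);

	/* Prints 220898880000 = 2208988800 * 100 centiseconds since 1900 */
	printf("%llu\n", (unsigned long long)cs);
	return 0;
}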
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index ffb669f9bba7..65b04ebb51c3 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -6,7 +6,6 @@
 */
 #include <linux/module.h>
 #include <linux/init.h>
-#include <linux/buffer_head.h>
 #include <linux/parser.h>
 #include <linux/mount.h>
 #include <linux/seq_file.h>
@@ -17,25 +16,42 @@
 #include "dir_f.h"
 #include "dir_fplus.h"
 
+#define ADFS_SB_FLAGS SB_NOATIME
+
 #define ADFS_DEFAULT_OWNER_MASK S_IRWXU
 #define ADFS_DEFAULT_OTHER_MASK (S_IRWXG | S_IRWXO)
 
 void __adfs_error(struct super_block *sb, const char *function, const char *fmt, ...)
 {
-	char error_buf[128];
+	struct va_format vaf;
	va_list args;
 
	va_start(args, fmt);
-	vsnprintf(error_buf, sizeof(error_buf), fmt, args);
-	va_end(args);
+	vaf.fmt = fmt;
+	vaf.va = &args;
 
-	printk(KERN_CRIT "ADFS-fs error (device %s)%s%s: %s\n",
+	printk(KERN_CRIT "ADFS-fs error (device %s)%s%s: %pV\n",
		sb->s_id, function ? ": " : "",
-		function ? function : "", error_buf);
+		function ? function : "", &vaf);
+
+	va_end(args);
+}
+
+void adfs_msg(struct super_block *sb, const char *pfx, const char *fmt, ...)
+{
+	struct va_format vaf;
+	va_list args;
+
+	va_start(args, fmt);
+	vaf.fmt = fmt;
+	vaf.va = &args;
+	printk("%sADFS-fs (%s): %pV\n", pfx, sb->s_id, &vaf);
+	va_end(args);
 }
 
 static int adfs_checkdiscrecord(struct adfs_discrecord *dr)
 {
+	unsigned int max_idlen;
	int i;
 
	/* sector size must be 256, 512 or 1024 bytes */
@@ -55,8 +71,13 @@ static int adfs_checkdiscrecord(struct adfs_discrecord *dr)
	if (le32_to_cpu(dr->disc_size_high) >> dr->log2secsize)
		return 1;
 
-	/* idlen must be no greater than 19 v2 [1.0] */
-	if (dr->idlen > 19)
+	/*
+	 * Maximum idlen is limited to 16 bits for new directories by
+	 * the three-byte storage of an indirect disc address.  For
+	 * big directories, idlen must be no greater than 19 v2 [1.0]
+	 */
+	max_idlen = dr->format_version ? 19 : 16;
+	if (dr->idlen > max_idlen)
		return 1;
 
	/* reserved bytes should be zero */
@@ -152,10 +173,10 @@ static const match_table_t tokens = {
	{Opt_err, NULL}
 };
 
-static int parse_options(struct super_block *sb, char *options)
+static int parse_options(struct super_block *sb, struct adfs_sb_info *asb,
+			 char *options)
 {
	char *p;
-	struct adfs_sb_info *asb = ADFS_SB(sb);
	int option;
 
	if (!options)
@@ -199,8 +220,9 @@ static int parse_options(struct super_block *sb, char *options)
			asb->s_ftsuffix = option;
			break;
		default:
-			printk("ADFS-fs: unrecognised mount option \"%s\" "
-					"or missing value\n", p);
+			adfs_msg(sb, KERN_ERR,
+				 "unrecognised mount option \"%s\" or missing value",
+				 p);
			return -EINVAL;
		}
	}
@@ -209,21 +231,31 @@ static int parse_options(struct super_block *sb, char *options)
 
 static int adfs_remount(struct super_block *sb, int *flags, char *data)
 {
+	struct adfs_sb_info temp_asb;
+	int ret;
+
	sync_filesystem(sb);
-	*flags |= SB_NODIRATIME;
-	return parse_options(sb, data);
+	*flags |= ADFS_SB_FLAGS;
+
+	temp_asb = *ADFS_SB(sb);
+	ret = parse_options(sb, &temp_asb, data);
+	if (ret == 0)
+		*ADFS_SB(sb) = temp_asb;
+
+	return ret;
 }
 
 static int adfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
	struct super_block *sb = dentry->d_sb;
	struct adfs_sb_info *sbi = ADFS_SB(sb);
+	struct adfs_discrecord *dr = adfs_map_discrecord(sbi->s_map);
	u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
 
	buf->f_type    = ADFS_SUPER_MAGIC;
	buf->f_namelen = sbi->s_namelen;
	buf->f_bsize   = sb->s_blocksize;
-	buf->f_blocks  = sbi->s_size;
+	buf->f_blocks  = adfs_disc_size(dr) >> sb->s_blocksize_bits;
	buf->f_files   = sbi->s_ids_per_zone * sbi->s_map_size;
	buf->f_bavail  = buf->f_bfree = adfs_map_free(sb);
@@ -327,8 +359,7 @@ static struct adfs_discmap *adfs_read_map(struct super_block *sb, struct adfs_di
	i = zone - 1;
	dm[0].dm_startblk = 0;
	dm[0].dm_startbit = ADFS_DR_SIZE_BITS;
-	dm[i].dm_endbit   = (le32_to_cpu(dr->disc_size_high) << (32 - dr->log2bpmb)) +
-			    (le32_to_cpu(dr->disc_size) >> dr->log2bpmb) +
+	dm[i].dm_endbit   = (adfs_disc_size(dr) >> dr->log2bpmb) +
			    (ADFS_DR_SIZE_BITS - i * zone_size);
 
	if (adfs_checkmap(sb, dm))
@@ -344,27 +375,18 @@ error_free:
	return ERR_PTR(-EIO);
 }
 
-static inline unsigned long adfs_discsize(struct adfs_discrecord *dr, int block_bits)
-{
-	unsigned long discsize;
-
-	discsize  = le32_to_cpu(dr->disc_size_high) << (32 - block_bits);
-	discsize |= le32_to_cpu(dr->disc_size) >> block_bits;
-
-	return discsize;
-}
-
 static int adfs_fill_super(struct super_block *sb, void *data, int silent)
 {
	struct adfs_discrecord *dr;
	struct buffer_head *bh;
	struct object_info root_obj;
	unsigned char *b_data;
+	unsigned int blocksize;
	struct adfs_sb_info *asb;
	struct inode *root;
	int ret = -EINVAL;
 
-	sb->s_flags |= SB_NODIRATIME;
+	sb->s_flags |= ADFS_SB_FLAGS;
 
	asb = kzalloc(sizeof(*asb), GFP_KERNEL);
	if (!asb)
@@ -378,12 +400,12 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)
	asb->s_other_mask = ADFS_DEFAULT_OTHER_MASK;
	asb->s_ftsuffix = 0;
 
-	if (parse_options(sb, data))
+	if (parse_options(sb, asb, data))
		goto error;
 
	sb_set_blocksize(sb, BLOCK_SIZE);
	if (!(bh = sb_bread(sb, ADFS_DISCRECORD / BLOCK_SIZE))) {
-		adfs_error(sb, "unable to read superblock");
+		adfs_msg(sb, KERN_ERR, "error: unable to read superblock");
		ret = -EIO;
		goto error;
	}
@@ -391,11 +413,8 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)
	b_data = bh->b_data + (ADFS_DISCRECORD % BLOCK_SIZE);
 
	if (adfs_checkbblk(b_data)) {
-		if (!silent)
-			printk("VFS: Can't find an adfs filesystem on dev "
-				"%s.\n", sb->s_id);
		ret = -EINVAL;
-		goto error_free_bh;
+		goto error_badfs;
	}
 
	dr = (struct adfs_discrecord *)(b_data + ADFS_DR_OFFSET);
@@ -404,33 +423,33 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)
	 * Do some sanity checks on the ADFS disc record
	 */
	if (adfs_checkdiscrecord(dr)) {
-		if (!silent)
-			printk("VPS: Can't find an adfs filesystem on dev "
-				"%s.\n", sb->s_id);
		ret = -EINVAL;
-		goto error_free_bh;
+		goto error_badfs;
	}
 
+	blocksize = 1 << dr->log2secsize;
	brelse(bh);
-	if (sb_set_blocksize(sb, 1 << dr->log2secsize)) {
+
+	if (sb_set_blocksize(sb, blocksize)) {
		bh = sb_bread(sb, ADFS_DISCRECORD / sb->s_blocksize);
		if (!bh) {
-			adfs_error(sb, "couldn't read superblock on "
-				"2nd try.");
+			adfs_msg(sb, KERN_ERR,
+				 "error: couldn't read superblock on 2nd try.");
			ret = -EIO;
			goto error;
		}
		b_data = bh->b_data + (ADFS_DISCRECORD % sb->s_blocksize);
		if (adfs_checkbblk(b_data)) {
-			adfs_error(sb, "disc record mismatch, very weird!");
+			adfs_msg(sb, KERN_ERR,
+				 "error: disc record mismatch, very weird!");
			ret = -EINVAL;
			goto error_free_bh;
		}
		dr = (struct adfs_discrecord *)(b_data + ADFS_DR_OFFSET);
	} else {
		if (!silent)
-			printk(KERN_ERR "VFS: Unsupported blocksize on dev "
-				"%s.\n", sb->s_id);
+			adfs_msg(sb, KERN_ERR,
+				 "error: unsupported blocksize");
		ret = -EINVAL;
		goto error;
	}
@@ -443,8 +462,6 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)
	asb->s_idlen		= dr->idlen;
	asb->s_map_size		= dr->nzones | (dr->nzones_high << 8);
	asb->s_map2blk		= dr->log2bpmb - dr->log2secsize;
-	asb->s_size		= adfs_discsize(dr, sb->s_blocksize_bits);
-	asb->s_version		= dr->format_version;
	asb->s_log2sharesize	= dr->log2sharesize;
 
	asb->s_map = adfs_read_map(sb, dr);
@@ -460,9 +477,9 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)
	 */
	sb->s_op = &adfs_sops;
 
-	dr = (struct adfs_discrecord *)(asb->s_map[0].dm_bh->b_data + 4);
+	dr = adfs_map_discrecord(asb->s_map);
 
-	root_obj.parent_id = root_obj.file_id = le32_to_cpu(dr->root);
+	root_obj.parent_id = root_obj.indaddr = le32_to_cpu(dr->root);
	root_obj.name_len  = 0;
	/* Set root object date as 01 Jan 1987 00:00:00 */
	root_obj.loadaddr  = 0xfff0003f;
@@ -470,13 +487,12 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)
	root_obj.size	   = ADFS_NEWDIR_SIZE;
	root_obj.attr	   = ADFS_NDA_DIRECTORY   | ADFS_NDA_OWNER_READ |
			     ADFS_NDA_OWNER_WRITE | ADFS_NDA_PUBLIC_READ;
-	root_obj.filetype  = -1;
 
	/*
	 * If this is a F+ disk with variable length directories,
	 * get the root_size from the disc record.
	 */
-	if (asb->s_version) {
+	if (dr->format_version) {
		root_obj.size = le32_to_cpu(dr->root_size);
		asb->s_dir     = &adfs_fplus_dir_ops;
		asb->s_namelen = ADFS_FPLUS_NAME_LEN;
@@ -505,6 +521,11 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)
	}
	return 0;
 
+error_badfs:
+	if (!silent)
+		adfs_msg(sb, KERN_ERR,
+			 "error: can't find an ADFS filesystem on dev %s.",
+			 sb->s_id);
 error_free_bh:
	brelse(bh);
 error:
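The new adfs_msg() helper (and the reworked __adfs_error()) hand the caller's format string straight to printk via struct va_format and the %pV extension, instead of pre-formatting into a fixed 128-byte buffer that could silently truncate. A userspace analogue of the same idea, assuming plain stdio rather than printk:

#include <stdarg.h>
#include <stdio.h>

/* Forward the caller's format and arguments; no intermediate buffer. */
static void adfs_msg(const char *dev, const char *pfx, const char *fmt, ...)
{
	va_list args;

	va_start(args, fmt);
	printf("%sADFS-fs (%s): ", pfx, dev);
	vprintf(fmt, args);
	putchar('\n');
	va_end(args);
}

int main(void)
{
	adfs_msg("sda1", "<4>",
		 "directory header size %X does not match directory size %X",
		 0x800, 0x400);
	return 0;
}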
diff --git a/fs/aio.c b/fs/aio.c
@@ -42,6 +42,7 @@
 #include <linux/ramfs.h>
 #include <linux/percpu-refcount.h>
 #include <linux/mount.h>
+#include <linux/pseudo_fs.h>
 
 #include <asm/kmap_types.h>
 #include <linux/uaccess.h>
@@ -249,15 +250,12 @@ static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages)
	return file;
 }
 
-static struct dentry *aio_mount(struct file_system_type *fs_type,
-				int flags, const char *dev_name, void *data)
+static int aio_init_fs_context(struct fs_context *fc)
 {
-	struct dentry *root = mount_pseudo(fs_type, "aio:", NULL, NULL,
-					   AIO_RING_MAGIC);
-
-	if (!IS_ERR(root))
-		root->d_sb->s_iflags |= SB_I_NOEXEC;
-	return root;
+	if (!init_pseudo(fc, AIO_RING_MAGIC))
+		return -ENOMEM;
+	fc->s_iflags |= SB_I_NOEXEC;
+	return 0;
 }
 
 /* aio_setup
@@ -268,7 +266,7 @@ static int __init aio_setup(void)
 {
	static struct file_system_type aio_fs = {
		.name		= "aio",
-		.mount		= aio_mount,
+		.init_fs_context = aio_init_fs_context,
		.kill_sb	= kill_anon_super,
	};
	aio_mnt = kern_mount(&aio_fs);
@@ -425,7 +423,7 @@ static int aio_migratepage(struct address_space *mapping, struct page *new,
	BUG_ON(PageWriteback(old));
	get_page(new);
 
-	rc = migrate_page_move_mapping(mapping, new, old, mode, 1);
+	rc = migrate_page_move_mapping(mapping, new, old, 1);
	if (rc != MIGRATEPAGE_SUCCESS) {
		put_page(new);
		goto out_unlock;
@@ -1479,8 +1477,9 @@ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb)
	return 0;
 }
 
-static int aio_setup_rw(int rw, const struct iocb *iocb, struct iovec **iovec,
-		bool vectored, bool compat, struct iov_iter *iter)
+static ssize_t aio_setup_rw(int rw, const struct iocb *iocb,
+		struct iovec **iovec, bool vectored, bool compat,
+		struct iov_iter *iter)
 {
	void __user *buf = (void __user *)(uintptr_t)iocb->aio_buf;
	size_t len = iocb->aio_nbytes;
@@ -1537,7 +1536,7 @@ static int aio_read(struct kiocb *req, const struct iocb *iocb,
		return -EINVAL;
 
	ret = aio_setup_rw(READ, iocb, &iovec, vectored, compat, &iter);
-	if (ret)
+	if (ret < 0)
		return ret;
	ret = rw_verify_area(READ, file, &req->ki_pos, iov_iter_count(&iter));
	if (!ret)
@@ -1565,7 +1564,7 @@ static int aio_write(struct kiocb *req, const struct iocb *iocb,
		return -EINVAL;
 
	ret = aio_setup_rw(WRITE, iocb, &iovec, vectored, compat, &iter);
-	if (ret)
+	if (ret < 0)
		return ret;
	ret = rw_verify_area(WRITE, file, &req->ki_pos, iov_iter_count(&iter));
	if (!ret) {
@@ -2093,7 +2092,6 @@ SYSCALL_DEFINE6(io_pgetevents,
		const struct __aio_sigset __user *, usig)
 {
	struct __aio_sigset	ksig = { NULL, };
-	sigset_t		ksigmask, sigsaved;
	struct timespec64	ts;
	bool interrupted;
	int ret;
@@ -2104,14 +2102,14 @@ SYSCALL_DEFINE6(io_pgetevents,
	if (usig && copy_from_user(&ksig, usig, sizeof(ksig)))
		return -EFAULT;
 
-	ret = set_user_sigmask(ksig.sigmask, &ksigmask, &sigsaved, ksig.sigsetsize);
+	ret = set_user_sigmask(ksig.sigmask, ksig.sigsetsize);
	if (ret)
		return ret;
 
	ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL);
 
	interrupted = signal_pending(current);
-	restore_user_sigmask(ksig.sigmask, &sigsaved, interrupted);
+	restore_saved_sigmask_unless(interrupted);
	if (interrupted && !ret)
		ret = -ERESTARTNOHAND;
@@ -2129,7 +2127,6 @@ SYSCALL_DEFINE6(io_pgetevents_time32,
		const struct __aio_sigset __user *, usig)
 {
	struct __aio_sigset	ksig = { NULL, };
-	sigset_t		ksigmask, sigsaved;
	struct timespec64	ts;
	bool interrupted;
	int ret;
@@ -2141,14 +2138,14 @@ SYSCALL_DEFINE6(io_pgetevents_time32,
		return -EFAULT;
 
-	ret = set_user_sigmask(ksig.sigmask, &ksigmask, &sigsaved, ksig.sigsetsize);
+	ret = set_user_sigmask(ksig.sigmask, ksig.sigsetsize);
	if (ret)
		return ret;
 
	ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL);
 
	interrupted = signal_pending(current);
-	restore_user_sigmask(ksig.sigmask, &sigsaved, interrupted);
+	restore_saved_sigmask_unless(interrupted);
	if (interrupted && !ret)
		ret = -ERESTARTNOHAND;
@@ -2197,7 +2194,6 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents,
		const struct __compat_aio_sigset __user *, usig)
 {
	struct __compat_aio_sigset ksig = { NULL, };
-	sigset_t ksigmask, sigsaved;
	struct timespec64 t;
	bool interrupted;
	int ret;
@@ -2208,14 +2204,14 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents,
	if (usig && copy_from_user(&ksig, usig, sizeof(ksig)))
		return -EFAULT;
 
-	ret = set_compat_user_sigmask(ksig.sigmask, &ksigmask, &sigsaved, ksig.sigsetsize);
+	ret = set_compat_user_sigmask(ksig.sigmask, ksig.sigsetsize);
	if (ret)
		return ret;
 
	ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL);
 
	interrupted = signal_pending(current);
-	restore_user_sigmask(ksig.sigmask, &sigsaved, interrupted);
+	restore_saved_sigmask_unless(interrupted);
	if (interrupted && !ret)
		ret = -ERESTARTNOHAND;
@@ -2233,7 +2229,6 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents_time64,
		const struct __compat_aio_sigset __user *, usig)
 {
	struct __compat_aio_sigset ksig = { NULL, };
-	sigset_t ksigmask, sigsaved;
	struct timespec64 t;
	bool interrupted;
	int ret;
@@ -2244,14 +2239,14 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents_time64,
	if (usig && copy_from_user(&ksig, usig, sizeof(ksig)))
		return -EFAULT;
 
-	ret = set_compat_user_sigmask(ksig.sigmask, &ksigmask, &sigsaved, ksig.sigsetsize);
+	ret = set_compat_user_sigmask(ksig.sigmask, ksig.sigsetsize);
	if (ret)
		return ret;
 
	ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL);
 
	interrupted = signal_pending(current);
-	restore_user_sigmask(ksig.sigmask, &sigsaved, interrupted);
+	restore_saved_sigmask_unless(interrupted);
	if (interrupted && !ret)
		ret = -ERESTARTNOHAND;
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index c2b8663f5b00..89714308c25b 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -20,6 +20,7 @@
 #include <linux/kernel.h>
 #include <linux/magic.h>
 #include <linux/anon_inodes.h>
+#include <linux/pseudo_fs.h>
 
 #include <linux/uaccess.h>
@@ -39,16 +40,18 @@ static const struct dentry_operations anon_inodefs_dentry_operations = {
	.d_dname	= anon_inodefs_dname,
 };
 
-static struct dentry *anon_inodefs_mount(struct file_system_type *fs_type,
-				int flags, const char *dev_name, void *data)
+static int anon_inodefs_init_fs_context(struct fs_context *fc)
 {
-	return mount_pseudo(fs_type, "anon_inode:", NULL,
-			&anon_inodefs_dentry_operations, ANON_INODE_FS_MAGIC);
+	struct pseudo_fs_context *ctx = init_pseudo(fc, ANON_INODE_FS_MAGIC);
+	if (!ctx)
+		return -ENOMEM;
+	ctx->dops = &anon_inodefs_dentry_operations;
+	return 0;
 }
 
 static struct file_system_type anon_inode_fs_type = {
	.name		= "anon_inodefs",
-	.mount		= anon_inodefs_mount,
+	.init_fs_context = anon_inodefs_init_fs_context,
	.kill_sb	= kill_anon_super,
 };
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 8264b468f283..d4e11b2e04f6 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1127,7 +1127,6 @@ out_free_interp:
			    load_addr, interp_load_addr);
	if (retval < 0)
		goto out;
 
-	/* N.B. passed_fileno might not be initialized? */
	current->mm->end_code = end_code;
	current->mm->start_code = start_code;
	current->mm->start_data = start_data;
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index 8c6b50f34466..831a2b25ba79 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -431,7 +431,6 @@ static int load_flat_file(struct linux_binprm *bprm,
	unsigned long len, memp, memp_size, extra, rlim;
	__be32 __user *reloc;
	u32 __user *rp;
-	struct inode *inode;
	int i, rev, relocs;
	loff_t fpos;
	unsigned long start_code, end_code;
@@ -439,7 +438,6 @@ static int load_flat_file(struct linux_binprm *bprm,
	int ret;
 
	hdr = ((struct flat_hdr *) bprm->buf);	/* exec-header */
-	inode = file_inode(bprm->file);
 
	text_len  = ntohl(hdr->data_start);
	data_len  = ntohl(hdr->data_end) - ntohl(hdr->data_start);
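The aio, anon_inodes, and bdev hunks all follow the same 5.3 conversion pattern: replace a .mount callback built on mount_pseudo() with .init_fs_context and the new init_pseudo() helper, tweaking the returned context for per-filesystem options. A kernel-only sketch of that pattern (not a standalone program; "examplefs" and EXAMPLEFS_MAGIC are placeholders, not part of this patch):

#include <linux/fs.h>
#include <linux/pseudo_fs.h>

#define EXAMPLEFS_MAGIC 0x45584d50	/* hypothetical magic number */

static int examplefs_init_fs_context(struct fs_context *fc)
{
	struct pseudo_fs_context *ctx = init_pseudo(fc, EXAMPLEFS_MAGIC);

	if (!ctx)
		return -ENOMEM;
	/* Optional hooks, as the hunks above set:
	 *   ctx->ops  = &examplefs_sops;              (bdev)
	 *   ctx->dops = &examplefs_dentry_operations; (anon_inodes)
	 *   fc->s_iflags |= SB_I_NOEXEC;              (aio)
	 */
	return 0;
}

static struct file_system_type examplefs_type = {
	.name		 = "examplefs",
	.init_fs_context = examplefs_init_fs_context,
	.kill_sb	 = kill_anon_super,
};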
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index b8e145552ec7..cdb45829354d 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -23,6 +23,7 @@
 #include <linux/pagemap.h>
 #include <linux/namei.h>
 #include <linux/mount.h>
+#include <linux/fs_context.h>
 #include <linux/syscalls.h>
 #include <linux/fs.h>
 #include <linux/uaccess.h>
@@ -821,7 +822,7 @@ static const struct super_operations s_ops = {
	.evict_inode	= bm_evict_inode,
 };
 
-static int bm_fill_super(struct super_block *sb, void *data, int silent)
+static int bm_fill_super(struct super_block *sb, struct fs_context *fc)
 {
	int err;
	static const struct tree_descr bm_files[] = {
@@ -836,10 +837,19 @@ static int bm_fill_super(struct super_block *sb, void *data, int silent)
	return err;
 }
 
-static struct dentry *bm_mount(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int bm_get_tree(struct fs_context *fc)
 {
-	return mount_single(fs_type, flags, data, bm_fill_super);
+	return get_tree_single(fc, bm_fill_super);
+}
+
+static const struct fs_context_operations bm_context_ops = {
+	.get_tree	= bm_get_tree,
+};
+
+static int bm_init_fs_context(struct fs_context *fc)
+{
+	fc->ops = &bm_context_ops;
+	return 0;
 }
 
 static struct linux_binfmt misc_format = {
@@ -850,7 +860,7 @@ static struct linux_binfmt misc_format = {
 static struct file_system_type bm_fs_type = {
	.owner		= THIS_MODULE,
	.name		= "binfmt_misc",
-	.mount		= bm_mount,
+	.init_fs_context = bm_init_fs_context,
	.kill_sb	= kill_litter_super,
 };
 MODULE_ALIAS_FS("binfmt_misc");
diff --git a/fs/block_dev.c b/fs/block_dev.c
index f00b569a9f89..4707dfff991b 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -26,6 +26,7 @@
 #include <linux/writeback.h>
 #include <linux/mpage.h>
 #include <linux/mount.h>
+#include <linux/pseudo_fs.h>
 #include <linux/uio.h>
 #include <linux/namei.h>
 #include <linux/log2.h>
@@ -821,19 +822,19 @@ static const struct super_operations bdev_sops = {
	.evict_inode = bdev_evict_inode,
 };
 
-static struct dentry *bd_mount(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int bd_init_fs_context(struct fs_context *fc)
 {
-	struct dentry *dent;
-	dent = mount_pseudo(fs_type, "bdev:", &bdev_sops, NULL, BDEVFS_MAGIC);
-	if (!IS_ERR(dent))
-		dent->d_sb->s_iflags |= SB_I_CGROUPWB;
-	return dent;
+	struct pseudo_fs_context *ctx = init_pseudo(fc, BDEVFS_MAGIC);
+	if (!ctx)
+		return -ENOMEM;
+	fc->s_iflags |= SB_I_CGROUPWB;
+	ctx->ops = &bdev_sops;
+	return 0;
 }
 
 static struct file_system_type bd_type = {
	.name		= "bdev",
-	.mount		= bd_mount,
+	.init_fs_context = bd_init_fs_context,
	.kill_sb	= kill_anon_super,
 };
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index 23537bc8c827..212b4a854f2c 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -2,7 +2,8 @@
 config BTRFS_FS
	tristate "Btrfs filesystem support"
-	select LIBCRC32C
+	select CRYPTO
+	select CRYPTO_CRC32C
	select ZLIB_INFLATE
	select ZLIB_DEFLATE
	select LZO_COMPRESS
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index ca693dd554e9..76a843198bcb 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -10,7 +10,8 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
	   export.o tree-log.o free-space-cache.o zlib.o lzo.o zstd.o \
	   compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
	   reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \
-	   uuid-tree.o props.o free-space-tree.o tree-checker.o
+	   uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \
+	   block-rsv.o delalloc-space.o
 
 btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
 btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 982152d3f920..89116afda7a2 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1465,12 +1465,11 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
 *
 * Return: 0 if extent is not shared, 1 if it is shared, < 0 on error.
 */
-int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr)
+int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr,
+		       struct ulist *roots, struct ulist *tmp)
 {
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_trans_handle *trans;
-	struct ulist *tmp = NULL;
-	struct ulist *roots = NULL;
	struct ulist_iterator uiter;
	struct ulist_node *node;
	struct seq_list elem = SEQ_LIST_INIT(elem);
@@ -1481,12 +1480,8 @@ int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr)
		.share_count = 0,
	};
 
-	tmp = ulist_alloc(GFP_NOFS);
-	roots = ulist_alloc(GFP_NOFS);
-	if (!tmp || !roots) {
-		ret = -ENOMEM;
-		goto out;
-	}
+	ulist_init(roots);
+	ulist_init(tmp);
 
	trans = btrfs_attach_transaction(root);
	if (IS_ERR(trans)) {
@@ -1527,8 +1522,8 @@ int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr)
		up_read(&fs_info->commit_root_sem);
	}
 out:
-	ulist_free(tmp);
-	ulist_free(roots);
+	ulist_release(roots);
+	ulist_release(tmp);
	return ret;
 }
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index 54d58988483a..777f61dc081e 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -57,7 +57,8 @@ int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,
			  u64 start_off, struct btrfs_path *path,
			  struct btrfs_inode_extref **ret_extref,
			  u64 *found_off);
-int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr);
+int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr,
+		       struct ulist *roots, struct ulist *tmp_ulist);
 
 int __init btrfs_prelim_ref_init(void);
 void __cold btrfs_prelim_ref_exit(void);
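With the new signature, btrfs_check_shared() no longer allocates its two scratch ulists per call; the caller owns them, so a loop over many extents can reuse the same storage. A kernel-only sketch of a hypothetical caller (the function name and loop are illustrative; error handling is trimmed):

static int example_check_extents(struct btrfs_root *root, u64 inum,
				 const u64 *bytenrs, int nr)
{
	struct ulist roots, tmp;
	int i, ret = 0;

	ulist_init(&roots);
	ulist_init(&tmp);
	for (i = 0; i < nr; i++) {
		/* 0: not shared, 1: shared, <0: error */
		ret = btrfs_check_shared(root, inum, bytenrs[i], &roots, &tmp);
		if (ret < 0)
			break;
	}
	/* ulist_release() frees internal nodes but keeps the structs usable */
	ulist_release(&roots);
	ulist_release(&tmp);
	return ret;
}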
diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c
new file mode 100644
index 000000000000..698470b9f32d
--- /dev/null
+++ b/fs/btrfs/block-rsv.c
@@ -0,0 +1,425 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "ctree.h"
+#include "block-rsv.h"
+#include "space-info.h"
+#include "math.h"
+#include "transaction.h"
+
+static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
+				   struct btrfs_block_rsv *block_rsv,
+				   struct btrfs_block_rsv *dest, u64 num_bytes,
+				   u64 *qgroup_to_release_ret)
+{
+	struct btrfs_space_info *space_info = block_rsv->space_info;
+	u64 qgroup_to_release = 0;
+	u64 ret;
+
+	spin_lock(&block_rsv->lock);
+	if (num_bytes == (u64)-1) {
+		num_bytes = block_rsv->size;
+		qgroup_to_release = block_rsv->qgroup_rsv_size;
+	}
+	block_rsv->size -= num_bytes;
+	if (block_rsv->reserved >= block_rsv->size) {
+		num_bytes = block_rsv->reserved - block_rsv->size;
+		block_rsv->reserved = block_rsv->size;
+		block_rsv->full = 1;
+	} else {
+		num_bytes = 0;
+	}
+	if (block_rsv->qgroup_rsv_reserved >= block_rsv->qgroup_rsv_size) {
+		qgroup_to_release = block_rsv->qgroup_rsv_reserved -
+				    block_rsv->qgroup_rsv_size;
+		block_rsv->qgroup_rsv_reserved = block_rsv->qgroup_rsv_size;
+	} else {
+		qgroup_to_release = 0;
+	}
+	spin_unlock(&block_rsv->lock);
+
+	ret = num_bytes;
+	if (num_bytes > 0) {
+		if (dest) {
+			spin_lock(&dest->lock);
+			if (!dest->full) {
+				u64 bytes_to_add;
+
+				bytes_to_add = dest->size - dest->reserved;
+				bytes_to_add = min(num_bytes, bytes_to_add);
+				dest->reserved += bytes_to_add;
+				if (dest->reserved >= dest->size)
+					dest->full = 1;
+				num_bytes -= bytes_to_add;
+			}
+			spin_unlock(&dest->lock);
+		}
+		if (num_bytes)
+			btrfs_space_info_add_old_bytes(fs_info, space_info,
+						       num_bytes);
+	}
+	if (qgroup_to_release_ret)
+		*qgroup_to_release_ret = qgroup_to_release;
+	return ret;
+}
+
+int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src,
+			    struct btrfs_block_rsv *dst, u64 num_bytes,
+			    bool update_size)
+{
+	int ret;
+
+	ret = btrfs_block_rsv_use_bytes(src, num_bytes);
+	if (ret)
+		return ret;
+
+	btrfs_block_rsv_add_bytes(dst, num_bytes, update_size);
+	return 0;
+}
+
+void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type)
+{
+	memset(rsv, 0, sizeof(*rsv));
+	spin_lock_init(&rsv->lock);
+	rsv->type = type;
+}
+
+void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info,
+				   struct btrfs_block_rsv *rsv,
+				   unsigned short type)
+{
+	btrfs_init_block_rsv(rsv, type);
+	rsv->space_info = btrfs_find_space_info(fs_info,
+					    BTRFS_BLOCK_GROUP_METADATA);
+}
+
+struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
+					      unsigned short type)
+{
+	struct btrfs_block_rsv *block_rsv;
+
+	block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
+	if (!block_rsv)
+		return NULL;
+
+	btrfs_init_metadata_block_rsv(fs_info, block_rsv, type);
+	return block_rsv;
+}
+
+void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info,
+			  struct btrfs_block_rsv *rsv)
+{
+	if (!rsv)
+		return;
+	btrfs_block_rsv_release(fs_info, rsv, (u64)-1);
+	kfree(rsv);
+}
+
+int btrfs_block_rsv_add(struct btrfs_root *root,
+			struct btrfs_block_rsv *block_rsv, u64 num_bytes,
+			enum btrfs_reserve_flush_enum flush)
+{
+	int ret;
+
+	if (num_bytes == 0)
+		return 0;
+
+	ret = btrfs_reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
+	if (!ret)
+		btrfs_block_rsv_add_bytes(block_rsv, num_bytes, true);
+
+	return ret;
+}
+
+int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor)
+{
+	u64 num_bytes = 0;
+	int ret = -ENOSPC;
+
+	if (!block_rsv)
+		return 0;
+
+	spin_lock(&block_rsv->lock);
+	num_bytes = div_factor(block_rsv->size, min_factor);
+	if (block_rsv->reserved >= num_bytes)
+		ret = 0;
+	spin_unlock(&block_rsv->lock);
+
+	return ret;
+}
+
+int btrfs_block_rsv_refill(struct btrfs_root *root,
+			   struct btrfs_block_rsv *block_rsv, u64 min_reserved,
+			   enum btrfs_reserve_flush_enum flush)
+{
+	u64 num_bytes = 0;
+	int ret = -ENOSPC;
+
+	if (!block_rsv)
+		return 0;
+
+	spin_lock(&block_rsv->lock);
+	num_bytes = min_reserved;
+	if (block_rsv->reserved >= num_bytes)
+		ret = 0;
+	else
+		num_bytes -= block_rsv->reserved;
+	spin_unlock(&block_rsv->lock);
+
+	if (!ret)
+		return 0;
+
+	ret = btrfs_reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
+	if (!ret) {
+		btrfs_block_rsv_add_bytes(block_rsv, num_bytes, false);
+		return 0;
+	}
+
+	return ret;
+}
+
+u64 __btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
+			      struct btrfs_block_rsv *block_rsv,
+			      u64 num_bytes, u64 *qgroup_to_release)
+{
+	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
+	struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;
+	struct btrfs_block_rsv *target = NULL;
+
+	/*
+	 * If we are the delayed_rsv then push to the global rsv, otherwise dump
+	 * into the delayed rsv if it is not full.
+	 */
+	if (block_rsv == delayed_rsv)
+		target = global_rsv;
+	else if (block_rsv != global_rsv && !delayed_rsv->full)
+		target = delayed_rsv;
+
+	if (target && block_rsv->space_info != target->space_info)
+		target = NULL;
+
+	return block_rsv_release_bytes(fs_info, block_rsv, target, num_bytes,
+				       qgroup_to_release);
+}
+
+int btrfs_block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, u64 num_bytes)
+{
+	int ret = -ENOSPC;
+
+	spin_lock(&block_rsv->lock);
+	if (block_rsv->reserved >= num_bytes) {
+		block_rsv->reserved -= num_bytes;
+		if (block_rsv->reserved < block_rsv->size)
+			block_rsv->full = 0;
+		ret = 0;
+	}
+	spin_unlock(&block_rsv->lock);
+	return ret;
+}
+
+void btrfs_block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
+			       u64 num_bytes, bool update_size)
+{
+	spin_lock(&block_rsv->lock);
+	block_rsv->reserved += num_bytes;
+	if (update_size)
+		block_rsv->size += num_bytes;
+	else if (block_rsv->reserved >= block_rsv->size)
+		block_rsv->full = 1;
+	spin_unlock(&block_rsv->lock);
+}
+
+int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
+			     struct btrfs_block_rsv *dest, u64 num_bytes,
+			     int min_factor)
+{
+	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
+	u64 min_bytes;
+
+	if (global_rsv->space_info != dest->space_info)
+		return -ENOSPC;
+
+	spin_lock(&global_rsv->lock);
+	min_bytes = div_factor(global_rsv->size, min_factor);
+	if (global_rsv->reserved < min_bytes + num_bytes) {
+		spin_unlock(&global_rsv->lock);
+		return -ENOSPC;
+	}
+	global_rsv->reserved -= num_bytes;
+	if (global_rsv->reserved < global_rsv->size)
+		global_rsv->full = 0;
+	spin_unlock(&global_rsv->lock);
+
+	btrfs_block_rsv_add_bytes(dest, num_bytes, true);
+	return 0;
+}
+
+void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
+	struct btrfs_space_info *sinfo = block_rsv->space_info;
+	u64 num_bytes;
+
+	/*
+	 * The global block rsv is based on the size of the extent tree, the
+	 * checksum tree and the root tree.  If the fs is empty we want to set
+	 * it to a minimal amount for safety.
+	 */
+	num_bytes = btrfs_root_used(&fs_info->extent_root->root_item) +
+		btrfs_root_used(&fs_info->csum_root->root_item) +
+		btrfs_root_used(&fs_info->tree_root->root_item);
+	num_bytes = max_t(u64, num_bytes, SZ_16M);
+
+	spin_lock(&sinfo->lock);
+	spin_lock(&block_rsv->lock);
+
+	block_rsv->size = min_t(u64, num_bytes, SZ_512M);
+
+	if (block_rsv->reserved < block_rsv->size) {
+		num_bytes = btrfs_space_info_used(sinfo, true);
+		if (sinfo->total_bytes > num_bytes) {
+			num_bytes = sinfo->total_bytes - num_bytes;
+			num_bytes = min(num_bytes,
+					block_rsv->size - block_rsv->reserved);
+			block_rsv->reserved += num_bytes;
+			btrfs_space_info_update_bytes_may_use(fs_info, sinfo,
+							      num_bytes);
+			trace_btrfs_space_reservation(fs_info, "space_info",
+						      sinfo->flags, num_bytes,
+						      1);
+		}
+	} else if (block_rsv->reserved > block_rsv->size) {
+		num_bytes = block_rsv->reserved - block_rsv->size;
+		btrfs_space_info_update_bytes_may_use(fs_info, sinfo,
+						      -num_bytes);
+		trace_btrfs_space_reservation(fs_info, "space_info",
+				      sinfo->flags, num_bytes, 0);
+		block_rsv->reserved = block_rsv->size;
+	}
+
+	if (block_rsv->reserved == block_rsv->size)
+		block_rsv->full = 1;
+	else
+		block_rsv->full = 0;
+
+	spin_unlock(&block_rsv->lock);
+	spin_unlock(&sinfo->lock);
+}
+
+void btrfs_init_global_block_rsv(struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_space_info *space_info;
+
+	space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
+	fs_info->chunk_block_rsv.space_info = space_info;
+
+	space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
+	fs_info->global_block_rsv.space_info = space_info;
+	fs_info->trans_block_rsv.space_info = space_info;
+	fs_info->empty_block_rsv.space_info = space_info;
+	fs_info->delayed_block_rsv.space_info = space_info;
+	fs_info->delayed_refs_rsv.space_info = space_info;
+
+	fs_info->extent_root->block_rsv = &fs_info->delayed_refs_rsv;
+	fs_info->csum_root->block_rsv = &fs_info->delayed_refs_rsv;
+	fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
+	fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
+	if (fs_info->quota_root)
+		fs_info->quota_root->block_rsv = &fs_info->global_block_rsv;
+	fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
+
+	btrfs_update_global_block_rsv(fs_info);
+}
+
+void btrfs_release_global_block_rsv(struct btrfs_fs_info *fs_info)
+{
+	btrfs_block_rsv_release(fs_info, &fs_info->global_block_rsv, (u64)-1);
+	WARN_ON(fs_info->trans_block_rsv.size > 0);
+	WARN_ON(fs_info->trans_block_rsv.reserved > 0);
+	WARN_ON(fs_info->chunk_block_rsv.size > 0);
+	WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
+	WARN_ON(fs_info->delayed_block_rsv.size > 0);
+	WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
+	WARN_ON(fs_info->delayed_refs_rsv.reserved > 0);
+	WARN_ON(fs_info->delayed_refs_rsv.size > 0);
+}
+
+static struct btrfs_block_rsv *get_block_rsv(
+					const struct btrfs_trans_handle *trans,
+					const struct btrfs_root *root)
+{
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct btrfs_block_rsv *block_rsv = NULL;
+
+	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
+	    (root == fs_info->csum_root && trans->adding_csums) ||
+	    (root == fs_info->uuid_root))
+		block_rsv = trans->block_rsv;
+
+	if (!block_rsv)
+		block_rsv = root->block_rsv;
+
+	if (!block_rsv)
+		block_rsv = &fs_info->empty_block_rsv;
+
+	return block_rsv;
+}
+
+struct btrfs_block_rsv *btrfs_use_block_rsv(struct btrfs_trans_handle *trans,
+					    struct btrfs_root *root,
+					    u32 blocksize)
+{
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct btrfs_block_rsv *block_rsv;
+	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
+	int ret;
+	bool global_updated = false;
+
+	block_rsv = get_block_rsv(trans, root);
+
+	if (unlikely(block_rsv->size == 0))
+		goto try_reserve;
+again:
+	ret = btrfs_block_rsv_use_bytes(block_rsv, blocksize);
+	if (!ret)
+		return block_rsv;
+
+	if (block_rsv->failfast)
+		return ERR_PTR(ret);
+
+	if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) {
+		global_updated = true;
+		btrfs_update_global_block_rsv(fs_info);
+		goto again;
+	}
+
+	/*
+	 * The global reserve still exists to save us from ourselves, so don't
+	 * warn_on if we are short on our delayed refs reserve.
+	 */
+	if (block_rsv->type != BTRFS_BLOCK_RSV_DELREFS &&
+	    btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
+		static DEFINE_RATELIMIT_STATE(_rs,
+					      DEFAULT_RATELIMIT_INTERVAL * 10,
+					      /*DEFAULT_RATELIMIT_BURST*/ 1);
+		if (__ratelimit(&_rs))
+			WARN(1, KERN_DEBUG
+				"BTRFS: block rsv returned %d\n", ret);
+	}
+try_reserve:
+	ret = btrfs_reserve_metadata_bytes(root, block_rsv, blocksize,
+					   BTRFS_RESERVE_NO_FLUSH);
+	if (!ret)
+		return block_rsv;
+	/*
+	 * If we couldn't reserve metadata bytes try and use some from
+	 * the global reserve if its space type is the same as the global
+	 * reservation.
+	 */
+	if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL &&
+	    block_rsv->space_info == global_rsv->space_info) {
+		ret = btrfs_block_rsv_use_bytes(global_rsv, blocksize);
+		if (!ret)
+			return global_rsv;
+	}
+	return ERR_PTR(ret);
+}
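The new file collects the block-reserve helpers that were previously in extent-tree.c. A kernel-only sketch of the lifecycle those helpers imply, using only functions from the hunks above (the caller and sizes are illustrative, not from the patch):

static int example_use_temp_rsv(struct btrfs_fs_info *fs_info,
				struct btrfs_root *root)
{
	struct btrfs_block_rsv *rsv;
	int ret;

	/* Create a temporary reserve and fill it, flushing if needed */
	rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
	if (!rsv)
		return -ENOMEM;

	ret = btrfs_block_rsv_add(root, rsv, SZ_1M, BTRFS_RESERVE_FLUSH_ALL);
	if (!ret) {
		/* Consume space for one tree block from the reserve */
		ret = btrfs_block_rsv_use_bytes(rsv, fs_info->nodesize);
		/* ... allocate and use the block ... */
	}

	/* Releases anything left (back to the global/delayed rsv) */
	btrfs_free_block_rsv(fs_info, rsv);
	return ret;
}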
diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h
new file mode 100644
index 000000000000..d1428bb73fc5
--- /dev/null
+++ b/fs/btrfs/block-rsv.h
@@ -0,0 +1,101 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef BTRFS_BLOCK_RSV_H
+#define BTRFS_BLOCK_RSV_H
+
+struct btrfs_trans_handle;
+enum btrfs_reserve_flush_enum;
+
+/*
+ * Types of block reserves
+ */
+enum {
+	BTRFS_BLOCK_RSV_GLOBAL,
+	BTRFS_BLOCK_RSV_DELALLOC,
+	BTRFS_BLOCK_RSV_TRANS,
+	BTRFS_BLOCK_RSV_CHUNK,
+	BTRFS_BLOCK_RSV_DELOPS,
+	BTRFS_BLOCK_RSV_DELREFS,
+	BTRFS_BLOCK_RSV_EMPTY,
+	BTRFS_BLOCK_RSV_TEMP,
+};
+
+struct btrfs_block_rsv {
+	u64 size;
+	u64 reserved;
+	struct btrfs_space_info *space_info;
+	spinlock_t lock;
+	unsigned short full;
+	unsigned short type;
+	unsigned short failfast;
+
+	/*
+	 * Qgroup equivalent for @size @reserved
+	 *
+	 * Unlike normal @size/@reserved for inode rsv, qgroup doesn't care
+	 * about things like csum size nor how many tree blocks it will need to
+	 * reserve.
+	 *
+	 * Qgroup cares more about net change of the extent usage.
+	 *
+	 * So for one newly inserted file extent, in worst case it will cause
+	 * leaf split and level increase, nodesize for each file extent is
+	 * already too much.
+	 *
+	 * In short, qgroup_size/reserved is the upper limit of possible needed
+	 * qgroup metadata reservation.
+	 */
+	u64 qgroup_rsv_size;
+	u64 qgroup_rsv_reserved;
+};
+
+void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type);
+struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
+					      unsigned short type);
+void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info,
+				   struct btrfs_block_rsv *rsv,
+				   unsigned short type);
+void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info,
+			  struct btrfs_block_rsv *rsv);
+int btrfs_block_rsv_add(struct btrfs_root *root,
+			struct btrfs_block_rsv *block_rsv, u64 num_bytes,
+			enum btrfs_reserve_flush_enum flush);
+int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor);
+int btrfs_block_rsv_refill(struct btrfs_root *root,
+			   struct btrfs_block_rsv *block_rsv, u64 min_reserved,
+			   enum btrfs_reserve_flush_enum flush);
+int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
+			    struct btrfs_block_rsv *dst_rsv, u64 num_bytes,
+			    bool update_size);
+int btrfs_block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, u64 num_bytes);
+int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
+			     struct btrfs_block_rsv *dest, u64 num_bytes,
+			     int min_factor);
+void btrfs_block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
+			       u64 num_bytes, bool update_size);
+u64 __btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
+			      struct btrfs_block_rsv *block_rsv,
+			      u64 num_bytes, u64 *qgroup_to_release);
+void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info);
+void btrfs_init_global_block_rsv(struct btrfs_fs_info *fs_info);
+void btrfs_release_global_block_rsv(struct btrfs_fs_info *fs_info);
+struct btrfs_block_rsv *btrfs_use_block_rsv(struct btrfs_trans_handle *trans,
+					    struct btrfs_root *root,
+					    u32 blocksize);
+
+static inline void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
+					   struct btrfs_block_rsv *block_rsv,
+					   u64 num_bytes)
+{
+	__btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL);
+}
+
+static inline void btrfs_unuse_block_rsv(struct btrfs_fs_info *fs_info,
+					 struct btrfs_block_rsv *block_rsv,
+					 u32 blocksize)
+{
+	btrfs_block_rsv_add_bytes(block_rsv, blocksize, false);
+	btrfs_block_rsv_release(fs_info, block_rsv, 0);
+}
+
+#endif /* BTRFS_BLOCK_RSV_H */
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index d5b438706b77..f853835c409c 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -337,22 +337,34 @@ static inline void btrfs_inode_resume_unlocked_dio(struct btrfs_inode *inode)
		  &inode->runtime_flags);
 }
 
+/* Array of bytes with variable length, hexadecimal format 0x1234 */
+#define CSUM_FMT				"0x%*phN"
+#define CSUM_FMT_VALUE(size, bytes)		size, bytes
+
 static inline void btrfs_print_data_csum_error(struct btrfs_inode *inode,
-		u64 logical_start, u32 csum, u32 csum_expected, int mirror_num)
+		u64 logical_start, u8 *csum, u8 *csum_expected, int mirror_num)
 {
	struct btrfs_root *root = inode->root;
+	struct btrfs_super_block *sb = root->fs_info->super_copy;
+	const u16 csum_size = btrfs_super_csum_size(sb);
 
	/* Output minus objectid, which is more meaningful */
	if (root->root_key.objectid >= BTRFS_LAST_FREE_OBJECTID)
		btrfs_warn_rl(root->fs_info,
-	"csum failed root %lld ino %lld off %llu csum 0x%08x expected csum 0x%08x mirror %d",
+"csum failed root %lld ino %lld off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
			root->root_key.objectid, btrfs_ino(inode),
-			logical_start, csum, csum_expected, mirror_num);
+			logical_start,
+			CSUM_FMT_VALUE(csum_size, csum),
+			CSUM_FMT_VALUE(csum_size, csum_expected),
+			mirror_num);
	else
		btrfs_warn_rl(root->fs_info,
-	"csum failed root %llu ino %llu off %llu csum 0x%08x expected csum 0x%08x mirror %d",
+"csum failed root %llu ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
			root->root_key.objectid, btrfs_ino(inode),
-			logical_start, csum, csum_expected, mirror_num);
+			logical_start,
+			CSUM_FMT_VALUE(csum_size, csum),
+			CSUM_FMT_VALUE(csum_size, csum_expected),
+			mirror_num);
 }
 
 #endif
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index b0c8094528d1..81a9731959a9 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -83,7 +83,7 @@
 #include <linux/blkdev.h>
 #include <linux/mm.h>
 #include <linux/string.h>
-#include <linux/crc32c.h>
+#include <crypto/hash.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
@@ -1710,9 +1710,9 @@ static int btrfsic_test_for_metadata(struct btrfsic_state *state,
				     char **datav, unsigned int num_pages)
 {
	struct btrfs_fs_info *fs_info = state->fs_info;
+	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
	struct btrfs_header *h;
	u8 csum[BTRFS_CSUM_SIZE];
-	u32 crc = ~(u32)0;
	unsigned int i;
 
	if (num_pages * PAGE_SIZE < state->metablock_size)
@@ -1723,14 +1723,17 @@ static int btrfsic_test_for_metadata(struct btrfsic_state *state,
	if (memcmp(h->fsid, fs_info->fs_devices->fsid, BTRFS_FSID_SIZE))
		return 1;
 
+	shash->tfm = fs_info->csum_shash;
+	crypto_shash_init(shash);
+
	for (i = 0; i < num_pages; i++) {
		u8 *data = i ? datav[i] : (datav[i] + BTRFS_CSUM_SIZE);
		size_t sublen = i ? PAGE_SIZE :
				    (PAGE_SIZE - BTRFS_CSUM_SIZE);
 
-		crc = crc32c(crc, data, sublen);
+		crypto_shash_update(shash, data, sublen);
	}
-	btrfs_csum_final(crc, csum);
+	crypto_shash_final(shash, csum);
	if (memcmp(csum, h->csum, state->csum_size))
		return 1;
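These hunks replace open-coded crc32c calls with the crypto_shash API so the checksum algorithm becomes pluggable (the CSUM_FMT macro likewise prints a variable-length checksum via the %*phN printk extension). A kernel-only sketch of the pattern, assuming a pre-allocated tfm like fs_info->csum_shash:

#include <crypto/hash.h>

/* One tfm is allocated once; each computation gets a stack descriptor. */
static int example_csum(struct crypto_shash *tfm, const u8 *data,
			unsigned int len, u8 *out)
{
	SHASH_DESC_ON_STACK(shash, tfm);

	shash->tfm = tfm;
	crypto_shash_init(shash);
	crypto_shash_update(shash, data, len);	/* may be called repeatedly */
	return crypto_shash_final(shash, out);
}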
crypto_shash_final(shash, (u8 *)&csum); - if (csum != *cb_sum) { - btrfs_print_data_csum_error(inode, disk_start, csum, - *cb_sum, cb->mirror_num); + if (memcmp(&csum, cb_sum, csum_size)) { + btrfs_print_data_csum_error(inode, disk_start, + csum, cb_sum, cb->mirror_num); ret = -EIO; goto fail; } - cb_sum++; + cb_sum += csum_size; } ret = 0; @@ -318,7 +340,8 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, bdev = fs_info->fs_devices->latest_bdev; - bio = btrfs_bio_alloc(bdev, first_byte); + bio = btrfs_bio_alloc(first_byte); + bio_set_dev(bio, bdev); bio->bi_opf = REQ_OP_WRITE | write_flags; bio->bi_private = cb; bio->bi_end_io = end_compressed_bio_write; @@ -360,7 +383,8 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, bio_endio(bio); } - bio = btrfs_bio_alloc(bdev, first_byte); + bio = btrfs_bio_alloc(first_byte); + bio_set_dev(bio, bdev); bio->bi_opf = REQ_OP_WRITE | write_flags; bio->bi_private = cb; bio->bi_end_io = end_compressed_bio_write; @@ -536,7 +560,8 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, struct extent_map *em; blk_status_t ret = BLK_STS_RESOURCE; int faili = 0; - u32 *sums; + const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); + u8 *sums; em_tree = &BTRFS_I(inode)->extent_tree; @@ -558,7 +583,7 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, cb->errors = 0; cb->inode = inode; cb->mirror_num = mirror_num; - sums = &cb->sums; + sums = cb->sums; cb->start = em->orig_start; em_len = em->len; @@ -597,7 +622,8 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, /* include any pages we added in add_ra-bio_pages */ cb->len = bio->bi_iter.bi_size; - comp_bio = btrfs_bio_alloc(bdev, cur_disk_byte); + comp_bio = btrfs_bio_alloc(cur_disk_byte); + bio_set_dev(comp_bio, bdev); comp_bio->bi_opf = REQ_OP_READ; comp_bio->bi_private = cb; comp_bio->bi_end_io = end_compressed_bio_read; @@ -617,6 +643,8 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, page->mapping = NULL; if (submit || bio_add_page(comp_bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { + unsigned int nr_sectors; + ret = btrfs_bio_wq_end_io(fs_info, comp_bio, BTRFS_WQ_ENDIO_DATA); BUG_ON(ret); /* -ENOMEM */ @@ -634,8 +662,10 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, sums); BUG_ON(ret); /* -ENOMEM */ } - sums += DIV_ROUND_UP(comp_bio->bi_iter.bi_size, - fs_info->sectorsize); + + nr_sectors = DIV_ROUND_UP(comp_bio->bi_iter.bi_size, + fs_info->sectorsize); + sums += csum_size * nr_sectors; ret = btrfs_map_bio(fs_info, comp_bio, mirror_num, 0); if (ret) { @@ -643,7 +673,8 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, bio_endio(comp_bio); } - comp_bio = btrfs_bio_alloc(bdev, cur_disk_byte); + comp_bio = btrfs_bio_alloc(cur_disk_byte); + bio_set_dev(comp_bio, bdev); comp_bio->bi_opf = REQ_OP_READ; comp_bio->bi_private = cb; comp_bio->bi_end_io = end_compressed_bio_read; diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 9976fe0f7526..2035b8eb1290 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -61,7 +61,7 @@ struct compressed_bio { * the start of a variable length array of checksums only * used by reads */ - u32 sums; + u8 sums[]; }; static inline unsigned int btrfs_compress_type(unsigned int type_level) @@ -173,6 +173,7 @@ extern const struct btrfs_compress_op btrfs_lzo_compress; extern const struct btrfs_compress_op 
btrfs_zstd_compress; const char* btrfs_compress_type2str(enum btrfs_compression_type type); +bool btrfs_compress_is_valid_type(const char *str, size_t len); int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end); diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 0a61dff27f57..299e11e6c554 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -19,6 +19,7 @@ #include <linux/kobject.h> #include <trace/events/btrfs.h> #include <asm/kmap_types.h> +#include <asm/unaligned.h> #include <linux/pagemap.h> #include <linux/btrfs.h> #include <linux/btrfs_tree.h> @@ -31,11 +32,13 @@ #include "extent_io.h" #include "extent_map.h" #include "async-thread.h" +#include "block-rsv.h" struct btrfs_trans_handle; struct btrfs_transaction; struct btrfs_pending_snapshot; struct btrfs_delayed_ref_root; +struct btrfs_space_info; extern struct kmem_cache *btrfs_trans_handle_cachep; extern struct kmem_cache *btrfs_bit_radix_cachep; extern struct kmem_cache *btrfs_path_cachep; @@ -45,7 +48,16 @@ struct btrfs_ref; #define BTRFS_MAGIC 0x4D5F53665248425FULL /* ascii _BHRfS_M, no null */ -#define BTRFS_MAX_MIRRORS 3 +/* + * Maximum number of mirrors that can be available for all profiles counting + * the target device of dev-replace as one. During an active device replace + * procedure, the target device of the copy operation is a mirror for the + * filesystem data as well that can be used to read data in order to repair + * read errors on other disks. + * + * Current value is derived from RAID1 with 2 copies. + */ +#define BTRFS_MAX_MIRRORS (2 + 1) #define BTRFS_MAX_LEVEL 8 @@ -72,6 +84,7 @@ struct btrfs_ref; /* four bytes for CRC32 */ static const int btrfs_csum_sizes[] = { 4 }; +static const char *btrfs_csum_names[] = { "crc32c" }; #define BTRFS_EMPTY_DIR_SIZE 0 @@ -99,10 +112,6 @@ static inline u32 count_max_extents(u64 size) return div_u64(size + BTRFS_MAX_EXTENT_SIZE - 1, BTRFS_MAX_EXTENT_SIZE); } -struct btrfs_mapping_tree { - struct extent_map_tree map_tree; -}; - static inline unsigned long btrfs_chunk_item_size(int num_stripes) { BUG_ON(num_stripes == 0); @@ -395,115 +404,6 @@ struct raid_kobject { struct list_head list; }; -struct btrfs_space_info { - spinlock_t lock; - - u64 total_bytes; /* total bytes in the space, - this doesn't take mirrors into account */ - u64 bytes_used; /* total bytes used, - this doesn't take mirrors into account */ - u64 bytes_pinned; /* total bytes pinned, will be freed when the - transaction finishes */ - u64 bytes_reserved; /* total bytes the allocator has reserved for - current allocations */ - u64 bytes_may_use; /* number of bytes that may be used for - delalloc/allocations */ - u64 bytes_readonly; /* total bytes that are read only */ - - u64 max_extent_size; /* This will hold the maximum extent size of - the space info if we had an ENOSPC in the - allocator. */ - - unsigned int full:1; /* indicates that we cannot allocate any more - chunks for this space */ - unsigned int chunk_alloc:1; /* set if we are allocating a chunk */ - - unsigned int flush:1; /* set if we are trying to make space */ - - unsigned int force_alloc; /* set if we need to force a chunk - alloc for this space */ - - u64 disk_used; /* total bytes used on disk */ - u64 disk_total; /* total bytes on disk, takes mirrors into - account */ - - u64 flags; - - /* - * bytes_pinned is kept in line with what is actually pinned, as in - * we've called update_block_group and dropped the bytes_used counter - * and increased the bytes_pinned counter. 
However this means that - * bytes_pinned does not reflect the bytes that will be pinned once the - * delayed refs are flushed, so this counter is inc'ed every time we - * call btrfs_free_extent so it is a realtime count of what will be - * freed once the transaction is committed. It will be zeroed every - * time the transaction commits. - */ - struct percpu_counter total_bytes_pinned; - - struct list_head list; - /* Protected by the spinlock 'lock'. */ - struct list_head ro_bgs; - struct list_head priority_tickets; - struct list_head tickets; - /* - * tickets_id just indicates the next ticket will be handled, so note - * it's not stored per ticket. - */ - u64 tickets_id; - - struct rw_semaphore groups_sem; - /* for block groups in our same type */ - struct list_head block_groups[BTRFS_NR_RAID_TYPES]; - wait_queue_head_t wait; - - struct kobject kobj; - struct kobject *block_group_kobjs[BTRFS_NR_RAID_TYPES]; -}; - -/* - * Types of block reserves - */ -enum { - BTRFS_BLOCK_RSV_GLOBAL, - BTRFS_BLOCK_RSV_DELALLOC, - BTRFS_BLOCK_RSV_TRANS, - BTRFS_BLOCK_RSV_CHUNK, - BTRFS_BLOCK_RSV_DELOPS, - BTRFS_BLOCK_RSV_DELREFS, - BTRFS_BLOCK_RSV_EMPTY, - BTRFS_BLOCK_RSV_TEMP, -}; - -struct btrfs_block_rsv { - u64 size; - u64 reserved; - struct btrfs_space_info *space_info; - spinlock_t lock; - unsigned short full; - unsigned short type; - unsigned short failfast; - - /* - * Qgroup equivalent for @size @reserved - * - * Unlike normal @size/@reserved for inode rsv, qgroup doesn't care - * about things like csum size nor how many tree blocks it will need to - * reserve. - * - * Qgroup cares more about net change of the extent usage. - * - * So for one newly inserted file extent, in worst case it will cause - * leaf split and level increase, nodesize for each file extent is - * already too much. - * - * In short, qgroup_size/reserved is the upper limit of possible needed - * qgroup metadata reservation. - */ - u64 qgroup_rsv_size; - u64 qgroup_rsv_reserved; -}; - /* * free clusters are used to claim free space in relatively large chunks, * allowing us to do less seeky writes. They are used for all metadata @@ -786,11 +686,18 @@ enum { /* * Indicate that balance has been set up from the ioctl and is in the * main phase. The fs_info::balance_ctl is initialized. + * Set and cleared while holding fs_info::balance_mutex. */ BTRFS_FS_BALANCE_RUNNING, /* Indicate that the cleaner thread is awake and doing something. */ BTRFS_FS_CLEANER_RUNNING, + + /* + * The checksumming has an optimized version and is considered fast, + * so we don't need to offload checksums to workqueues. + */ + BTRFS_FS_CSUM_IMPL_FAST, }; struct btrfs_fs_info { @@ -824,7 +731,7 @@ struct btrfs_fs_info { struct extent_io_tree *pinned_extents; /* logical->physical extent mapping */ - struct btrfs_mapping_tree mapping_tree; + struct extent_map_tree mapping_tree; /* * block reservation for extent, checksum, root tree and @@ -1160,6 +1067,14 @@ struct btrfs_fs_info { spinlock_t swapfile_pins_lock; struct rb_root swapfile_pins; + struct crypto_shash *csum_shash; + + /* + * Number of send operations in progress. + * Updated while holding fs_info::balance_mutex. 
+ */ + int send_in_progress; + #ifdef CONFIG_BTRFS_FS_REF_VERIFY spinlock_t ref_verify_lock; struct rb_root block_tree; @@ -2451,6 +2366,11 @@ static inline int btrfs_super_csum_size(const struct btrfs_super_block *s) return btrfs_csum_sizes[t]; } +static inline const char *btrfs_super_csum_name(u16 csum_type) +{ + /* csum type is validated at mount time */ + return btrfs_csum_names[csum_type]; +} /* * The leaf data grows from end-to-front in the node. @@ -2642,6 +2562,16 @@ BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_right, ((unsigned long)(BTRFS_LEAF_DATA_OFFSET + \ btrfs_item_offset_nr(leaf, slot))) +static inline u32 btrfs_crc32c(u32 crc, const void *address, unsigned length) +{ + return crc32c(crc, address, length); +} + +static inline void btrfs_crc32c_final(u32 crc, u8 *result) +{ + put_unaligned_le32(~crc, result); +} + static inline u64 btrfs_name_hash(const char *name, int len) { return crc32c((u32)~1, name, len); @@ -2656,12 +2586,6 @@ static inline u64 btrfs_extref_hash(u64 parent_objectid, const char *name, return (u64) crc32c(parent_objectid, name, len); } -static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info) -{ - return ((space_info->flags & BTRFS_BLOCK_GROUP_METADATA) && - (space_info->flags & BTRFS_BLOCK_GROUP_DATA)); -} - static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping) { return mapping_gfp_constraint(mapping, ~__GFP_FS); @@ -2698,8 +2622,6 @@ static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_fs_info *fs_info, return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * num_items; } -int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans); -bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info); void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info, const u64 start); void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg); @@ -2814,17 +2736,28 @@ enum btrfs_flush_state { COMMIT_TRANS = 9, }; -int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes); -int btrfs_check_data_free_space(struct inode *inode, - struct extent_changeset **reserved, u64 start, u64 len); -void btrfs_free_reserved_data_space(struct inode *inode, - struct extent_changeset *reserved, u64 start, u64 len); -void btrfs_delalloc_release_space(struct inode *inode, - struct extent_changeset *reserved, - u64 start, u64 len, bool qgroup_free); -void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start, - u64 len); -void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans); +/* + * control flags for do_chunk_alloc's force field + * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk + * if we really need one. + * + * CHUNK_ALLOC_LIMITED means to only try and allocate one + * if we have very few chunks already allocated. 
This is + * used as part of the clustering code to help make sure + * we have a good pool of storage to cluster in, without + * filling the FS with empty chunks + * + * CHUNK_ALLOC_FORCE means it must try to allocate one + * + */ +enum btrfs_chunk_alloc_enum { + CHUNK_ALLOC_NO_FORCE, + CHUNK_ALLOC_LIMITED, + CHUNK_ALLOC_FORCE, +}; + +int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, + enum btrfs_chunk_alloc_enum force); int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, struct btrfs_block_rsv *rsv, int nitems, bool use_global_rsv); @@ -2834,41 +2767,6 @@ void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes, bool qgroup_free); int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes); -void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes, - bool qgroup_free); -int btrfs_delalloc_reserve_space(struct inode *inode, - struct extent_changeset **reserved, u64 start, u64 len); -void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type); -struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info, - unsigned short type); -void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *rsv, - unsigned short type); -void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *rsv); -int btrfs_block_rsv_add(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, u64 num_bytes, - enum btrfs_reserve_flush_enum flush); -int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor); -int btrfs_block_rsv_refill(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, u64 min_reserved, - enum btrfs_reserve_flush_enum flush); -int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, - struct btrfs_block_rsv *dst_rsv, u64 num_bytes, - bool update_size); -int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *dest, u64 num_bytes, - int min_factor); -void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *block_rsv, - u64 num_bytes); -void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr); -void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans); -int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, - enum btrfs_reserve_flush_enum flush); -void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *src, - u64 num_bytes); int btrfs_inc_block_group_ro(struct btrfs_block_group_cache *cache); void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache); void btrfs_put_block_group_cache(struct btrfs_fs_info *info); @@ -3186,7 +3084,8 @@ int btrfs_find_name_in_ext_backref(struct extent_buffer *leaf, int slot, struct btrfs_dio_private; int btrfs_del_csums(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, u64 len); -blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u32 *dst); +blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, + u8 *dst); blk_status_t btrfs_lookup_bio_sums_dio(struct inode *inode, struct bio *bio, u64 logical_offset); int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, @@ -3514,8 +3413,7 @@ __cold static inline void assfail(const char *expr, const char *file, int line) { if (IS_ENABLED(CONFIG_BTRFS_ASSERT)) { - pr_err("assertion failed: %s, file: %s, line: %d\n", - expr, file, line); + pr_err("assertion failed: %s, in %s:%d\n", expr, file, line); BUG(); } } @@ 
-3599,10 +3497,11 @@ do { \ /* compatibility and incompatibility defines */ #define btrfs_set_fs_incompat(__fs_info, opt) \ - __btrfs_set_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt) + __btrfs_set_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt, \ + #opt) static inline void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, - u64 flag) + u64 flag, const char* name) { struct btrfs_super_block *disk_super; u64 features; @@ -3615,18 +3514,20 @@ static inline void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, if (!(features & flag)) { features |= flag; btrfs_set_super_incompat_flags(disk_super, features); - btrfs_info(fs_info, "setting %llu feature flag", - flag); + btrfs_info(fs_info, + "setting incompat feature flag for %s (0x%llx)", + name, flag); } spin_unlock(&fs_info->super_lock); } } #define btrfs_clear_fs_incompat(__fs_info, opt) \ - __btrfs_clear_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt) + __btrfs_clear_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt, \ + #opt) static inline void __btrfs_clear_fs_incompat(struct btrfs_fs_info *fs_info, - u64 flag) + u64 flag, const char* name) { struct btrfs_super_block *disk_super; u64 features; @@ -3639,8 +3540,9 @@ static inline void __btrfs_clear_fs_incompat(struct btrfs_fs_info *fs_info, if (features & flag) { features &= ~flag; btrfs_set_super_incompat_flags(disk_super, features); - btrfs_info(fs_info, "clearing %llu feature flag", - flag); + btrfs_info(fs_info, + "clearing incompat feature flag for %s (0x%llx)", + name, flag); } spin_unlock(&fs_info->super_lock); } @@ -3657,10 +3559,11 @@ static inline bool __btrfs_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag) } #define btrfs_set_fs_compat_ro(__fs_info, opt) \ - __btrfs_set_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt) + __btrfs_set_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt, \ + #opt) static inline void __btrfs_set_fs_compat_ro(struct btrfs_fs_info *fs_info, - u64 flag) + u64 flag, const char *name) { struct btrfs_super_block *disk_super; u64 features; @@ -3673,18 +3576,20 @@ static inline void __btrfs_set_fs_compat_ro(struct btrfs_fs_info *fs_info, if (!(features & flag)) { features |= flag; btrfs_set_super_compat_ro_flags(disk_super, features); - btrfs_info(fs_info, "setting %llu ro feature flag", - flag); + btrfs_info(fs_info, + "setting compat-ro feature flag for %s (0x%llx)", + name, flag); } spin_unlock(&fs_info->super_lock); } } #define btrfs_clear_fs_compat_ro(__fs_info, opt) \ - __btrfs_clear_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt) + __btrfs_clear_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt, \ + #opt) static inline void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info, - u64 flag) + u64 flag, const char *name) { struct btrfs_super_block *disk_super; u64 features; @@ -3697,8 +3602,9 @@ static inline void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info, if (features & flag) { features &= ~flag; btrfs_set_super_compat_ro_flags(disk_super, features); - btrfs_info(fs_info, "clearing %llu ro feature flag", - flag); + btrfs_info(fs_info, + "clearing compat-ro feature flag for %s (0x%llx)", + name, flag); } spin_unlock(&fs_info->super_lock); } diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c new file mode 100644 index 000000000000..17f7c0d38768 --- /dev/null +++ b/fs/btrfs/delalloc-space.c @@ -0,0 +1,494 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "ctree.h" +#include "delalloc-space.h" +#include "block-rsv.h" +#include "btrfs_inode.h" +#include 
"space-info.h" +#include "transaction.h" +#include "qgroup.h" + +int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes) +{ + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_space_info *data_sinfo = fs_info->data_sinfo; + u64 used; + int ret = 0; + int need_commit = 2; + int have_pinned_space; + + /* Make sure bytes are sectorsize aligned */ + bytes = ALIGN(bytes, fs_info->sectorsize); + + if (btrfs_is_free_space_inode(inode)) { + need_commit = 0; + ASSERT(current->journal_info); + } + +again: + /* Make sure we have enough space to handle the data first */ + spin_lock(&data_sinfo->lock); + used = btrfs_space_info_used(data_sinfo, true); + + if (used + bytes > data_sinfo->total_bytes) { + struct btrfs_trans_handle *trans; + + /* + * If we don't have enough free bytes in this space then we need + * to alloc a new chunk. + */ + if (!data_sinfo->full) { + u64 alloc_target; + + data_sinfo->force_alloc = CHUNK_ALLOC_FORCE; + spin_unlock(&data_sinfo->lock); + + alloc_target = btrfs_data_alloc_profile(fs_info); + /* + * It is ugly that we don't call nolock join + * transaction for the free space inode case here. + * But it is safe because we only do the data space + * reservation for the free space cache in the + * transaction context, the common join transaction + * just increase the counter of the current transaction + * handler, doesn't try to acquire the trans_lock of + * the fs. + */ + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + ret = btrfs_chunk_alloc(trans, alloc_target, + CHUNK_ALLOC_NO_FORCE); + btrfs_end_transaction(trans); + if (ret < 0) { + if (ret != -ENOSPC) + return ret; + else { + have_pinned_space = 1; + goto commit_trans; + } + } + + goto again; + } + + /* + * If we don't have enough pinned space to deal with this + * allocation, and no removed chunk in current transaction, + * don't bother committing the transaction. + */ + have_pinned_space = __percpu_counter_compare( + &data_sinfo->total_bytes_pinned, + used + bytes - data_sinfo->total_bytes, + BTRFS_TOTAL_BYTES_PINNED_BATCH); + spin_unlock(&data_sinfo->lock); + + /* Commit the current transaction and try again */ +commit_trans: + if (need_commit) { + need_commit--; + + if (need_commit > 0) { + btrfs_start_delalloc_roots(fs_info, -1); + btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, + (u64)-1); + } + + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) + return PTR_ERR(trans); + if (have_pinned_space >= 0 || + test_bit(BTRFS_TRANS_HAVE_FREE_BGS, + &trans->transaction->flags) || + need_commit > 0) { + ret = btrfs_commit_transaction(trans); + if (ret) + return ret; + /* + * The cleaner kthread might still be doing iput + * operations. Wait for it to finish so that + * more space is released. We don't need to + * explicitly run the delayed iputs here because + * the commit_transaction would have woken up + * the cleaner. 
+ */ + ret = btrfs_wait_on_delayed_iputs(fs_info); + if (ret) + return ret; + goto again; + } else { + btrfs_end_transaction(trans); + } + } + + trace_btrfs_space_reservation(fs_info, + "space_info:enospc", + data_sinfo->flags, bytes, 1); + return -ENOSPC; + } + btrfs_space_info_update_bytes_may_use(fs_info, data_sinfo, bytes); + trace_btrfs_space_reservation(fs_info, "space_info", + data_sinfo->flags, bytes, 1); + spin_unlock(&data_sinfo->lock); + + return 0; +} + +int btrfs_check_data_free_space(struct inode *inode, + struct extent_changeset **reserved, u64 start, u64 len) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + int ret; + + /* align the range */ + len = round_up(start + len, fs_info->sectorsize) - + round_down(start, fs_info->sectorsize); + start = round_down(start, fs_info->sectorsize); + + ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), len); + if (ret < 0) + return ret; + + /* Use new btrfs_qgroup_reserve_data to reserve precious data space. */ + ret = btrfs_qgroup_reserve_data(inode, reserved, start, len); + if (ret < 0) + btrfs_free_reserved_data_space_noquota(inode, start, len); + else + ret = 0; + return ret; +} + +/* + * Called if we need to clear a data reservation for this inode, + * normally in an error case. + * + * This one will *NOT* use the accurate qgroup reserved space API, just for + * cases where we can't sleep and are sure it won't affect qgroup reserved + * space, like clear_bit_hook(). + */ +void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start, + u64 len) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_space_info *data_sinfo; + + /* Make sure the range is aligned to sectorsize */ + len = round_up(start + len, fs_info->sectorsize) - + round_down(start, fs_info->sectorsize); + start = round_down(start, fs_info->sectorsize); + + data_sinfo = fs_info->data_sinfo; + spin_lock(&data_sinfo->lock); + btrfs_space_info_update_bytes_may_use(fs_info, data_sinfo, -len); + trace_btrfs_space_reservation(fs_info, "space_info", + data_sinfo->flags, len, 0); + spin_unlock(&data_sinfo->lock); +} + +/* + * Called if we need to clear a data reservation for this inode, + * normally in an error case. + * + * This one will handle the per-inode data rsv map for the accurate reserved + * space framework. + */ +void btrfs_free_reserved_data_space(struct inode *inode, + struct extent_changeset *reserved, u64 start, u64 len) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + + /* Make sure the range is aligned to sectorsize */ + len = round_up(start + len, root->fs_info->sectorsize) - + round_down(start, root->fs_info->sectorsize); + start = round_down(start, root->fs_info->sectorsize); + + btrfs_free_reserved_data_space_noquota(inode, start, len); + btrfs_qgroup_free_data(inode, reserved, start, len); +} + +/** + * btrfs_inode_rsv_release - release any excessive reservation. + * @inode - the inode we need to release from. + * @qgroup_free - free or convert qgroup meta. + * Unlike normal operation, qgroup meta reservation needs to know if we are + * freeing the qgroup reservation or just converting it into per-trans. + * Normally @qgroup_free is true for error handling, and false for normal + * release. + * + * This is the same as btrfs_block_rsv_release, except that it handles the + * tracepoint for the reservation. 
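+ * + * For illustration, the error path of btrfs_delalloc_reserve_metadata() + * below releases with @qgroup_free set to true, so the preallocated qgroup + * meta reservation is freed rather than converted: + * + * btrfs_inode_rsv_release(inode, true);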
+ */ +static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct btrfs_block_rsv *block_rsv = &inode->block_rsv; + u64 released = 0; + u64 qgroup_to_release = 0; + + /* + * Since we statically set the block_rsv->size we just want to say we + * are releasing 0 bytes, and then we'll just get the reservation over + * the size free'd. + */ + released = __btrfs_block_rsv_release(fs_info, block_rsv, 0, + &qgroup_to_release); + if (released > 0) + trace_btrfs_space_reservation(fs_info, "delalloc", + btrfs_ino(inode), released, 0); + if (qgroup_free) + btrfs_qgroup_free_meta_prealloc(inode->root, qgroup_to_release); + else + btrfs_qgroup_convert_reserved_meta(inode->root, + qgroup_to_release); +} + +static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info, + struct btrfs_inode *inode) +{ + struct btrfs_block_rsv *block_rsv = &inode->block_rsv; + u64 reserve_size = 0; + u64 qgroup_rsv_size = 0; + u64 csum_leaves; + unsigned outstanding_extents; + + lockdep_assert_held(&inode->lock); + outstanding_extents = inode->outstanding_extents; + if (outstanding_extents) + reserve_size = btrfs_calc_trans_metadata_size(fs_info, + outstanding_extents + 1); + csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, + inode->csum_bytes); + reserve_size += btrfs_calc_trans_metadata_size(fs_info, + csum_leaves); + /* + * For qgroup rsv, the calculation is very simple: + * account one nodesize for each outstanding extent + * + * This is overestimating in most cases. + */ + qgroup_rsv_size = (u64)outstanding_extents * fs_info->nodesize; + + spin_lock(&block_rsv->lock); + block_rsv->size = reserve_size; + block_rsv->qgroup_rsv_size = qgroup_rsv_size; + spin_unlock(&block_rsv->lock); +} + +static void calc_inode_reservations(struct btrfs_fs_info *fs_info, + u64 num_bytes, u64 *meta_reserve, + u64 *qgroup_reserve) +{ + u64 nr_extents = count_max_extents(num_bytes); + u64 csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, num_bytes); + + /* We add one for the inode update at finish ordered time */ + *meta_reserve = btrfs_calc_trans_metadata_size(fs_info, + nr_extents + csum_leaves + 1); + *qgroup_reserve = nr_extents * fs_info->nodesize; +} + +int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) +{ + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_block_rsv *block_rsv = &inode->block_rsv; + u64 meta_reserve, qgroup_reserve; + unsigned nr_extents; + enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL; + int ret = 0; + bool delalloc_lock = true; + + /* + * If we are a free space inode we need to not flush since we will be in + * the middle of a transaction commit. We also don't need the delalloc + * mutex since we won't race with anybody. We need this mostly to make + * lockdep shut its filthy mouth. + * + * If we have a transaction open (can happen if we call truncate_block + * from truncate), then we need FLUSH_LIMIT so we don't deadlock. + */ + if (btrfs_is_free_space_inode(inode)) { + flush = BTRFS_RESERVE_NO_FLUSH; + delalloc_lock = false; + } else { + if (current->journal_info) + flush = BTRFS_RESERVE_FLUSH_LIMIT; + + if (btrfs_transaction_in_commit(fs_info)) + schedule_timeout(1); + } + + if (delalloc_lock) + mutex_lock(&inode->delalloc_mutex); + + num_bytes = ALIGN(num_bytes, fs_info->sectorsize); + + /* + * We always want to do it this way, every other way is wrong and ends + * in tears. 
Pre-reserving the amount we are going to add will always + * be the right way, because otherwise if we have enough parallelism we + * could end up with thousands of inodes all holding little bits of + * reservations they were able to make previously and the only way to + * reclaim that space is to ENOSPC out the operations and clear + * everything out and try again, which is bad. This way we just + * over-reserve slightly, and clean up the mess when we are done. + */ + calc_inode_reservations(fs_info, num_bytes, &meta_reserve, + &qgroup_reserve); + ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true); + if (ret) + goto out_fail; + ret = btrfs_reserve_metadata_bytes(root, block_rsv, meta_reserve, flush); + if (ret) + goto out_qgroup; + + /* + * Now we need to update our outstanding extents and csum bytes _first_ + * and then add the reservation to the block_rsv. This keeps us from + * racing with an ordered completion or some such that would think it + * needs to free the reservation we just made. + */ + spin_lock(&inode->lock); + nr_extents = count_max_extents(num_bytes); + btrfs_mod_outstanding_extents(inode, nr_extents); + inode->csum_bytes += num_bytes; + btrfs_calculate_inode_block_rsv_size(fs_info, inode); + spin_unlock(&inode->lock); + + /* Now we can safely add our space to our block rsv */ + btrfs_block_rsv_add_bytes(block_rsv, meta_reserve, false); + trace_btrfs_space_reservation(root->fs_info, "delalloc", + btrfs_ino(inode), meta_reserve, 1); + + spin_lock(&block_rsv->lock); + block_rsv->qgroup_rsv_reserved += qgroup_reserve; + spin_unlock(&block_rsv->lock); + + if (delalloc_lock) + mutex_unlock(&inode->delalloc_mutex); + return 0; +out_qgroup: + btrfs_qgroup_free_meta_prealloc(root, qgroup_reserve); +out_fail: + btrfs_inode_rsv_release(inode, true); + if (delalloc_lock) + mutex_unlock(&inode->delalloc_mutex); + return ret; +} + +/** + * btrfs_delalloc_release_metadata - release a metadata reservation for an inode + * @inode: the inode to release the reservation for. + * @num_bytes: the number of bytes we are releasing. + * @qgroup_free: free qgroup reservation or convert it to per-trans reservation + * + * This will release the metadata reservation for an inode. This can be called + * once we complete IO for a given set of bytes to release their metadata + * reservations, or on error for the same reason. + */ +void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes, + bool qgroup_free) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + + num_bytes = ALIGN(num_bytes, fs_info->sectorsize); + spin_lock(&inode->lock); + inode->csum_bytes -= num_bytes; + btrfs_calculate_inode_block_rsv_size(fs_info, inode); + spin_unlock(&inode->lock); + + if (btrfs_is_testing(fs_info)) + return; + + btrfs_inode_rsv_release(inode, qgroup_free); +} + +/** + * btrfs_delalloc_release_extents - release our outstanding_extents + * @inode: the inode to balance the reservation for. + * @num_bytes: the number of bytes we originally reserved with + * @qgroup_free: do we need to free qgroup meta reservation or convert them. + * + * When we reserve space we increase outstanding_extents for the extents we may + * add. Once we've set the range as delalloc or created our ordered extents we + * have outstanding_extents to track the real usage, so we use this to free our + * temporarily tracked outstanding_extents. This _must_ be used in conjunction + * with btrfs_delalloc_reserve_metadata. 
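+ * + * A hypothetical write path would pair the two roughly as: + * + * btrfs_delalloc_reserve_metadata(inode, len); + * ... mark the range delalloc / create the ordered extent ... + * btrfs_delalloc_release_extents(inode, len, false);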
+ */ +void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes, + bool qgroup_free) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + unsigned num_extents; + + spin_lock(&inode->lock); + num_extents = count_max_extents(num_bytes); + btrfs_mod_outstanding_extents(inode, -num_extents); + btrfs_calculate_inode_block_rsv_size(fs_info, inode); + spin_unlock(&inode->lock); + + if (btrfs_is_testing(fs_info)) + return; + + btrfs_inode_rsv_release(inode, qgroup_free); +} + +/** + * btrfs_delalloc_reserve_space - reserve data and metadata space for + * delalloc + * @inode: inode we're writing to + * @start: start of the range we are writing to + * @len: length of the range we are writing to + * @reserved: mandatory parameter, records the actually reserved qgroup ranges + * of the current reservation. + * + * This will do the following things: + * + * - reserve space in the data space info for num bytes + * and reserve the corresponding precious qgroup space + * (Done in check_data_free_space) + * + * - reserve space for metadata space, based on the number of outstanding + * extents and how many csums will be needed; + * also reserve metadata space in a per root over-reserve method. + * - add to the inode's delalloc_bytes + * - add it to the fs_info's delalloc inodes list. + * (Above 3 all done in delalloc_reserve_metadata) + * + * Return 0 for success + * Return <0 for error (-ENOSPC or -EDQUOT) + */ +int btrfs_delalloc_reserve_space(struct inode *inode, + struct extent_changeset **reserved, u64 start, u64 len) +{ + int ret; + + ret = btrfs_check_data_free_space(inode, reserved, start, len); + if (ret < 0) + return ret; + ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len); + if (ret < 0) + btrfs_free_reserved_data_space(inode, *reserved, start, len); + return ret; +} + +/** + * btrfs_delalloc_release_space - release data and metadata space for delalloc + * @inode: inode we're releasing space for + * @reserved: the qgroup ranges recorded by the matching reserve call + * @start: start position of the space already reserved + * @len: the length of the space already reserved + * @qgroup_free: free qgroup reservation or convert it to per-trans reservation + * + * This function will release the metadata space that was not used and will + * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes + * list if there are no delalloc bytes left. + * Also it will handle the qgroup reserved space. 
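+ * + * For illustration, a hypothetical error path undoing a successful + * btrfs_delalloc_reserve_space() call could use: + * + * btrfs_delalloc_release_space(inode, *reserved, start, len, true);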
+ */ +void btrfs_delalloc_release_space(struct inode *inode, + struct extent_changeset *reserved, + u64 start, u64 len, bool qgroup_free) +{ + btrfs_delalloc_release_metadata(BTRFS_I(inode), len, qgroup_free); + btrfs_free_reserved_data_space(inode, reserved, start, len); +} diff --git a/fs/btrfs/delalloc-space.h b/fs/btrfs/delalloc-space.h new file mode 100644 index 000000000000..54466fbd7075 --- /dev/null +++ b/fs/btrfs/delalloc-space.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_DELALLOC_SPACE_H +#define BTRFS_DELALLOC_SPACE_H + +struct extent_changeset; + +int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes); +int btrfs_check_data_free_space(struct inode *inode, + struct extent_changeset **reserved, u64 start, u64 len); +void btrfs_free_reserved_data_space(struct inode *inode, + struct extent_changeset *reserved, u64 start, u64 len); +void btrfs_delalloc_release_space(struct inode *inode, + struct extent_changeset *reserved, + u64 start, u64 len, bool qgroup_free); +void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start, + u64 len); +void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes, + bool qgroup_free); +int btrfs_delalloc_reserve_space(struct inode *inode, + struct extent_changeset **reserved, u64 start, u64 len); + +#endif /* BTRFS_DELALLOC_SPACE_H */ diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index a73fc23e2961..9a91d1eb0af4 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -10,6 +10,7 @@ #include "delayed-ref.h" #include "transaction.h" #include "qgroup.h" +#include "space-info.h" struct kmem_cache *btrfs_delayed_ref_head_cachep; struct kmem_cache *btrfs_delayed_tree_ref_cachep; @@ -24,6 +25,179 @@ struct kmem_cache *btrfs_delayed_extent_op_cachep; * of hammering updates on the extent allocation tree. */ +bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info) +{ + struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; + struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; + bool ret = false; + u64 reserved; + + spin_lock(&global_rsv->lock); + reserved = global_rsv->reserved; + spin_unlock(&global_rsv->lock); + + /* + * Since the global reserve is just kind of magic we don't really want + * to rely on it to save our bacon, so if our size is more than the + * delayed_refs_rsv and the global rsv then it's time to think about + * bailing. + */ + spin_lock(&delayed_refs_rsv->lock); + reserved += delayed_refs_rsv->reserved; + if (delayed_refs_rsv->size >= reserved) + ret = true; + spin_unlock(&delayed_refs_rsv->lock); + return ret; +} + +int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans) +{ + u64 num_entries = + atomic_read(&trans->transaction->delayed_refs.num_entries); + u64 avg_runtime; + u64 val; + + smp_mb(); + avg_runtime = trans->fs_info->avg_delayed_ref_runtime; + val = num_entries * avg_runtime; + if (val >= NSEC_PER_SEC) + return 1; + if (val >= NSEC_PER_SEC / 2) + return 2; + + return btrfs_check_space_for_delayed_refs(trans->fs_info); +} + +/** + * btrfs_delayed_refs_rsv_release - release a ref head's reservation. + * @fs_info - the fs_info for our fs. + * @nr - the number of items to drop. + * + * This drops the delayed ref head's count from the delayed refs rsv and frees + * any excess reservation we had. 
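+ * + * Assuming the usual definition of btrfs_calc_trans_metadata_size() + * (nodesize * BTRFS_MAX_LEVEL * 2 * nr), dropping nr = 1 head on a + * 16K-nodesize filesystem hands back up to 16K * 8 * 2 = 256K.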
+ */ +void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr) +{ + struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv; + u64 num_bytes = btrfs_calc_trans_metadata_size(fs_info, nr); + u64 released = 0; + + released = __btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, + NULL); + if (released) + trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", + 0, released, 0); +} + +/** + * btrfs_update_delayed_refs_rsv - adjust the size of the delayed refs rsv + * @trans - the trans that may have generated delayed refs + * + * This is to be called anytime we may have adjusted trans->delayed_ref_updates; + * it will calculate the additional size and add it to the delayed_refs_rsv. + */ +void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv; + u64 num_bytes; + + if (!trans->delayed_ref_updates) + return; + + num_bytes = btrfs_calc_trans_metadata_size(fs_info, + trans->delayed_ref_updates); + spin_lock(&delayed_rsv->lock); + delayed_rsv->size += num_bytes; + delayed_rsv->full = 0; + spin_unlock(&delayed_rsv->lock); + trans->delayed_ref_updates = 0; +} + +/** + * btrfs_migrate_to_delayed_refs_rsv - transfer bytes to our delayed refs rsv. + * @fs_info - the fs info for our fs. + * @src - the source block rsv to transfer from. + * @num_bytes - the number of bytes to transfer. + * + * This transfers up to the num_bytes amount from the src rsv to the + * delayed_refs_rsv. Any extra bytes are returned to the space info. + */ +void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *src, + u64 num_bytes) +{ + struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; + u64 to_free = 0; + + spin_lock(&src->lock); + src->reserved -= num_bytes; + src->size -= num_bytes; + spin_unlock(&src->lock); + + spin_lock(&delayed_refs_rsv->lock); + if (delayed_refs_rsv->size > delayed_refs_rsv->reserved) { + u64 delta = delayed_refs_rsv->size - + delayed_refs_rsv->reserved; + if (num_bytes > delta) { + to_free = num_bytes - delta; + num_bytes = delta; + } + } else { + to_free = num_bytes; + num_bytes = 0; + } + + if (num_bytes) + delayed_refs_rsv->reserved += num_bytes; + if (delayed_refs_rsv->reserved >= delayed_refs_rsv->size) + delayed_refs_rsv->full = 1; + spin_unlock(&delayed_refs_rsv->lock); + + if (num_bytes) + trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", + 0, num_bytes, 1); + if (to_free) + btrfs_space_info_add_old_bytes(fs_info, + delayed_refs_rsv->space_info, to_free); +} + +/** + * btrfs_delayed_refs_rsv_refill - refill based on our delayed refs usage. + * @fs_info - the fs_info for our fs. + * @flush - control how we can flush for this reservation. + * + * This will refill the delayed block_rsv up to one item's worth of space and + * will return -ENOSPC if we can't make the reservation. 
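+ * + * A hypothetical caller that must avoid a full space flush could use: + * + * ret = btrfs_delayed_refs_rsv_refill(fs_info, BTRFS_RESERVE_FLUSH_LIMIT); + * if (ret) ... treat it as ENOSPC and back off ...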
+ */ +int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, + enum btrfs_reserve_flush_enum flush) +{ + struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv; + u64 limit = btrfs_calc_trans_metadata_size(fs_info, 1); + u64 num_bytes = 0; + int ret = -ENOSPC; + + spin_lock(&block_rsv->lock); + if (block_rsv->reserved < block_rsv->size) { + num_bytes = block_rsv->size - block_rsv->reserved; + num_bytes = min(num_bytes, limit); + } + spin_unlock(&block_rsv->lock); + + if (!num_bytes) + return 0; + + ret = btrfs_reserve_metadata_bytes(fs_info->extent_root, block_rsv, + num_bytes, flush); + if (ret) + return ret; + btrfs_block_rsv_add_bytes(block_rsv, num_bytes, 0); + trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", + 0, num_bytes, 1); + return 0; +} + /* * compare two delayed tree backrefs with same bytenr and type */ @@ -957,13 +1131,14 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, } /* - * this does a simple search for the head node for a given extent. - * It must be called with the delayed ref spinlock held, and it returns - * the head node if any where found, or NULL if not. + * This does a simple search for the head node for a given extent. Returns the + * head node if found, or NULL if not. */ struct btrfs_delayed_ref_head * btrfs_find_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, u64 bytenr) { + lockdep_assert_held(&delayed_refs->lock); + return find_ref_head(delayed_refs, bytenr, false); } diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index c18f93ea88ed..1c977e6d45dc 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -364,6 +364,16 @@ struct btrfs_delayed_ref_head *btrfs_select_ref_head( int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq); +void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr); +void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans); +int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, + enum btrfs_reserve_flush_enum flush); +void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *src, + u64 num_bytes); +int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans); +bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info); + /* * helper functions to cast a node into its container */ diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index ee0989c7e3a9..6b2e9aa83ffa 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -201,7 +201,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, return PTR_ERR(bdev); } - filemap_write_and_wait(bdev->bd_inode->i_mapping); + sync_blockdev(bdev); devices = &fs_info->fs_devices->devices; list_for_each_entry(device, devices, dev_list) { @@ -237,7 +237,6 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, } rcu_assign_pointer(device->name, name); - mutex_lock(&fs_info->fs_devices->device_list_mutex); set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); device->generation = 0; device->io_width = fs_info->sectorsize; @@ -256,6 +255,8 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, device->dev_stats_valid = 1; set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE); device->fs_devices = fs_info->fs_devices; + + mutex_lock(&fs_info->fs_devices->device_list_mutex); list_add(&device->dev_list, &fs_info->fs_devices->devices); fs_info->fs_devices->num_devices++; fs_info->fs_devices->open_devices++; @@ -399,7 +400,6 @@ static 
int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, int ret; struct btrfs_device *tgt_device = NULL; struct btrfs_device *src_device = NULL; - bool need_unlock; src_device = btrfs_find_device_by_devspec(fs_info, srcdevid, srcdev_name); @@ -413,11 +413,6 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, return -ETXTBSY; } - ret = btrfs_init_dev_replace_tgtdev(fs_info, tgtdev_name, - src_device, &tgt_device); - if (ret) - return ret; - /* * Here we commit the transaction to make sure commit_total_bytes * of all the devices are updated. @@ -431,7 +426,11 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, return PTR_ERR(trans); } - need_unlock = true; + ret = btrfs_init_dev_replace_tgtdev(fs_info, tgtdev_name, + src_device, &tgt_device); + if (ret) + return ret; + down_write(&dev_replace->rwsem); switch (dev_replace->replace_state) { case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: @@ -442,11 +441,11 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: ASSERT(0); ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED; + up_write(&dev_replace->rwsem); goto leave; } dev_replace->cont_reading_from_srcdev_mode = read_src; - WARN_ON(!src_device); dev_replace->srcdev = src_device; dev_replace->tgtdev = tgt_device; @@ -471,7 +470,6 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, atomic64_set(&dev_replace->num_write_errors, 0); atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0); up_write(&dev_replace->rwsem); - need_unlock = false; ret = btrfs_sysfs_add_device_link(tgt_device->fs_devices, tgt_device); if (ret) @@ -479,16 +477,16 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); - /* force writing the updated state information to disk */ - trans = btrfs_start_transaction(root, 0); + /* Commit dev_replace state and reserve 1 item for it. 
*/ + trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { ret = PTR_ERR(trans); - need_unlock = true; down_write(&dev_replace->rwsem); dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED; dev_replace->srcdev = NULL; dev_replace->tgtdev = NULL; + up_write(&dev_replace->rwsem); goto leave; } @@ -510,8 +508,6 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, return ret; leave: - if (need_unlock) - up_write(&dev_replace->rwsem); btrfs_destroy_dev_replace_tgtdev(tgt_device); return ret; } @@ -678,7 +674,6 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, btrfs_device_set_disk_total_bytes(tgt_device, src_device->disk_total_bytes); btrfs_device_set_bytes_used(tgt_device, src_device->bytes_used); - tgt_device->commit_total_bytes = src_device->commit_total_bytes; tgt_device->commit_bytes_used = src_device->bytes_used; btrfs_assign_next_active_device(src_device, tgt_device); @@ -728,7 +723,7 @@ static void btrfs_dev_replace_update_device_in_mapping_tree( struct btrfs_device *srcdev, struct btrfs_device *tgtdev) { - struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree; + struct extent_map_tree *em_tree = &fs_info->mapping_tree; struct extent_map *em; struct map_lookup *map; u64 start = 0; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index deb74a8c191a..41a2bd2e0c56 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -19,6 +19,7 @@ #include <linux/crc32c.h> #include <linux/sched/mm.h> #include <asm/unaligned.h> +#include <crypto/hash.h> #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -40,10 +41,6 @@ #include "tree-checker.h" #include "ref-verify.h" -#ifdef CONFIG_X86 -#include <asm/cpufeature.h> -#endif - #define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\ BTRFS_HEADER_FLAG_RELOC |\ BTRFS_SUPER_FLAG_ERROR |\ @@ -249,16 +246,6 @@ out: return em; } -u32 btrfs_csum_data(const char *data, u32 seed, size_t len) -{ - return crc32c(seed, data, len); -} - -void btrfs_csum_final(u32 crc, u8 *result) -{ - put_unaligned_le32(~crc, result); -} - /* * Compute the csum of a btree block and store the result to provided buffer. 
* @@ -266,6 +253,8 @@ void btrfs_csum_final(u32 crc, u8 *result) */ static int csum_tree_block(struct extent_buffer *buf, u8 *result) { + struct btrfs_fs_info *fs_info = buf->fs_info; + SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); unsigned long len; unsigned long cur_len; unsigned long offset = BTRFS_CSUM_SIZE; @@ -273,9 +262,12 @@ static int csum_tree_block(struct extent_buffer *buf, u8 *result) unsigned long map_start; unsigned long map_len; int err; - u32 crc = ~(u32)0; + + shash->tfm = fs_info->csum_shash; + crypto_shash_init(shash); len = buf->len - offset; + while (len > 0) { /* * Note: we don't need to check for the err == 1 case here, as @@ -288,14 +280,13 @@ static int csum_tree_block(struct extent_buffer *buf, u8 *result) if (WARN_ON(err)) return err; cur_len = min(len, map_len - (offset - map_start)); - crc = btrfs_csum_data(kaddr + offset - map_start, - crc, cur_len); + crypto_shash_update(shash, kaddr + offset - map_start, cur_len); len -= cur_len; offset += cur_len; } memset(result, 0, BTRFS_CSUM_SIZE); - btrfs_csum_final(crc, result); + crypto_shash_final(shash, result); return 0; } @@ -356,6 +347,16 @@ out: return ret; } +static bool btrfs_supported_super_csum(u16 csum_type) +{ + switch (csum_type) { + case BTRFS_CSUM_TYPE_CRC32: + return true; + default: + return false; + } +} + /* * Return 0 if the superblock checksum type matches the checksum value of that * algorithm. Pass the raw disk superblock data. @@ -365,33 +366,25 @@ static int btrfs_check_super_csum(struct btrfs_fs_info *fs_info, { struct btrfs_super_block *disk_sb = (struct btrfs_super_block *)raw_disk_sb; - u16 csum_type = btrfs_super_csum_type(disk_sb); - int ret = 0; + char result[BTRFS_CSUM_SIZE]; + SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); - if (csum_type == BTRFS_CSUM_TYPE_CRC32) { - u32 crc = ~(u32)0; - char result[sizeof(crc)]; + shash->tfm = fs_info->csum_shash; + crypto_shash_init(shash); - /* - * The super_block structure does not span the whole - * BTRFS_SUPER_INFO_SIZE range, we expect that the unused space - * is filled with zeros and is included in the checksum. - */ - crc = btrfs_csum_data(raw_disk_sb + BTRFS_CSUM_SIZE, - crc, BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE); - btrfs_csum_final(crc, result); + /* + * The super_block structure does not span the whole + * BTRFS_SUPER_INFO_SIZE range, we expect that the unused space is + * filled with zeros and is included in the checksum. 
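+ * + * For illustration, with the 4K superblock and crc32c (4-byte csum): + * bytes [0, 32) hold the on-disk csum field (BTRFS_CSUM_SIZE), bytes + * [32, 4096) are what crypto_shash_update() hashes below, and only the + * first btrfs_super_csum_size() bytes of the result are compared.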
+ */ + crypto_shash_update(shash, raw_disk_sb + BTRFS_CSUM_SIZE, + BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE); + crypto_shash_final(shash, result); - if (memcmp(raw_disk_sb, result, sizeof(result))) - ret = 1; - } + if (memcmp(disk_sb->csum, result, btrfs_super_csum_size(disk_sb))) + return 1; - if (csum_type >= ARRAY_SIZE(btrfs_csum_sizes)) { - btrfs_err(fs_info, "unsupported checksum algorithm %u", - csum_type); - ret = 1; - } - - return ret; + return 0; } int btrfs_verify_level_key(struct extent_buffer *eb, int level, @@ -873,14 +866,13 @@ static blk_status_t btree_submit_bio_start(void *private_data, struct bio *bio, return btree_csum_one_bio(bio); } -static int check_async_write(struct btrfs_inode *bi) +static int check_async_write(struct btrfs_fs_info *fs_info, + struct btrfs_inode *bi) { if (atomic_read(&bi->sync_writers)) return 0; -#ifdef CONFIG_X86 - if (static_cpu_has(X86_FEATURE_XMM4_2)) + if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags)) return 0; -#endif return 1; } @@ -889,7 +881,7 @@ static blk_status_t btree_submit_bio_hook(struct inode *inode, struct bio *bio, unsigned long bio_flags) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - int async = check_async_write(BTRFS_I(inode)); + int async = check_async_write(fs_info, BTRFS_I(inode)); blk_status_t ret; if (bio_op(bio) != REQ_OP_WRITE) { @@ -2262,6 +2254,29 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info, return 0; } +static int btrfs_init_csum_hash(struct btrfs_fs_info *fs_info, u16 csum_type) +{ + struct crypto_shash *csum_shash; + const char *csum_name = btrfs_super_csum_name(csum_type); + + csum_shash = crypto_alloc_shash(csum_name, 0, 0); + + if (IS_ERR(csum_shash)) { + btrfs_err(fs_info, "error allocating %s hash for checksum", + csum_name); + return PTR_ERR(csum_shash); + } + + fs_info->csum_shash = csum_shash; + + return 0; +} + +static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info) +{ + crypto_free_shash(fs_info->csum_shash); +} + static int btrfs_replay_log(struct btrfs_fs_info *fs_info, struct btrfs_fs_devices *fs_devices) { @@ -2577,7 +2592,7 @@ static int btrfs_validate_write_super(struct btrfs_fs_info *fs_info, ret = validate_super(fs_info, sb, -1); if (ret < 0) goto out; - if (btrfs_super_csum_type(sb) != BTRFS_CSUM_TYPE_CRC32) { + if (!btrfs_supported_super_csum(btrfs_super_csum_type(sb))) { ret = -EUCLEAN; btrfs_err(fs_info, "invalid csum type, has %u want %u", btrfs_super_csum_type(sb), BTRFS_CSUM_TYPE_CRC32); @@ -2607,6 +2622,7 @@ int open_ctree(struct super_block *sb, u32 stripesize; u64 generation; u64 features; + u16 csum_type; struct btrfs_key location; struct buffer_head *bh; struct btrfs_super_block *disk_super; @@ -2689,7 +2705,7 @@ int open_ctree(struct super_block *sb, INIT_LIST_HEAD(&fs_info->space_info); INIT_LIST_HEAD(&fs_info->tree_mod_seq_list); INIT_LIST_HEAD(&fs_info->unused_bgs); - btrfs_mapping_init(&fs_info->mapping_tree); + extent_map_tree_init(&fs_info->mapping_tree); btrfs_init_block_rsv(&fs_info->global_block_rsv, BTRFS_BLOCK_RSV_GLOBAL); btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS); @@ -2793,6 +2809,8 @@ int open_ctree(struct super_block *sb, spin_lock_init(&fs_info->swapfile_pins_lock); fs_info->swapfile_pins = RB_ROOT; + fs_info->send_in_progress = 0; + ret = btrfs_alloc_stripe_hash_table(fs_info); if (ret) { err = ret; @@ -2813,6 +2831,25 @@ int open_ctree(struct super_block *sb, } /* + * Verify the type first, if that or the checksum value is + * corrupted, we'll find out + */ + csum_type = 
btrfs_super_csum_type((struct btrfs_super_block *)bh->b_data); + if (!btrfs_supported_super_csum(csum_type)) { + btrfs_err(fs_info, "unsupported checksum algorithm: %u", + csum_type); + err = -EINVAL; + brelse(bh); + goto fail_alloc; + } + + ret = btrfs_init_csum_hash(fs_info, csum_type); + if (ret) { + err = ret; + goto fail_alloc; + } + + /* * We want to check superblock checksum, the type is stored inside. * Pass the whole disk block of size BTRFS_SUPER_INFO_SIZE (4k). */ @@ -2820,7 +2857,7 @@ int open_ctree(struct super_block *sb, btrfs_err(fs_info, "superblock checksum mismatch"); err = -EINVAL; brelse(bh); - goto fail_alloc; + goto fail_csum; } /* @@ -2857,11 +2894,11 @@ int open_ctree(struct super_block *sb, if (ret) { btrfs_err(fs_info, "superblock contains fatal errors"); err = -EINVAL; - goto fail_alloc; + goto fail_csum; } if (!btrfs_super_root(disk_super)) - goto fail_alloc; + goto fail_csum; /* check FS state, whether FS is broken. */ if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_ERROR) @@ -2883,7 +2920,7 @@ int open_ctree(struct super_block *sb, ret = btrfs_parse_options(fs_info, options, sb->s_flags); if (ret) { err = ret; - goto fail_alloc; + goto fail_csum; } features = btrfs_super_incompat_flags(disk_super) & @@ -2893,7 +2930,7 @@ int open_ctree(struct super_block *sb, "cannot mount because of unsupported optional features (%llx)", features); err = -EINVAL; - goto fail_alloc; + goto fail_csum; } features = btrfs_super_incompat_flags(disk_super); @@ -2937,7 +2974,7 @@ int open_ctree(struct super_block *sb, btrfs_err(fs_info, "unequal nodesize/sectorsize (%u != %u) are not allowed for mixed block groups", nodesize, sectorsize); - goto fail_alloc; + goto fail_csum; } /* @@ -2953,7 +2990,7 @@ int open_ctree(struct super_block *sb, "cannot mount read-write because of unsupported optional features (%llx)", features); err = -EINVAL; - goto fail_alloc; + goto fail_csum; } ret = btrfs_init_workqueues(fs_info, fs_devices); @@ -3331,6 +3368,8 @@ fail_tree_roots: fail_sb_buffer: btrfs_stop_all_workers(fs_info); btrfs_free_block_groups(fs_info); +fail_csum: + btrfs_free_csum_hash(fs_info); fail_alloc: fail_iput: btrfs_mapping_tree_free(&fs_info->mapping_tree); @@ -3472,17 +3511,20 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev) static int write_dev_supers(struct btrfs_device *device, struct btrfs_super_block *sb, int max_mirrors) { + struct btrfs_fs_info *fs_info = device->fs_info; + SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); struct buffer_head *bh; int i; int ret; int errors = 0; - u32 crc; u64 bytenr; int op_flags; if (max_mirrors == 0) max_mirrors = BTRFS_SUPER_MIRROR_MAX; + shash->tfm = fs_info->csum_shash; + for (i = 0; i < max_mirrors; i++) { bytenr = btrfs_sb_offset(i); if (bytenr + BTRFS_SUPER_INFO_SIZE >= @@ -3491,10 +3533,10 @@ static int write_dev_supers(struct btrfs_device *device, btrfs_set_super_bytenr(sb, bytenr); - crc = ~(u32)0; - crc = btrfs_csum_data((const char *)sb + BTRFS_CSUM_SIZE, crc, - BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE); - btrfs_csum_final(crc, sb->csum); + crypto_shash_init(shash); + crypto_shash_update(shash, (const char *)sb + BTRFS_CSUM_SIZE, + BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE); + crypto_shash_final(shash, sb->csum); /* One reference for us, and we leave it for the caller */ bh = __getblk(device->bdev, bytenr / BTRFS_BDEV_BLOCKSIZE, @@ -3709,7 +3751,7 @@ int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags) if ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 || (flags & BTRFS_AVAIL_ALLOC_BIT_SINGLE)) - 
min_tolerated = min(min_tolerated, + min_tolerated = min_t(int, min_tolerated, btrfs_raid_array[BTRFS_RAID_SINGLE]. tolerated_failures); @@ -3718,7 +3760,7 @@ int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags) continue; if (!(flags & btrfs_raid_array[raid_type].bg_flag)) continue; - min_tolerated = min(min_tolerated, + min_tolerated = min_t(int, min_tolerated, btrfs_raid_array[raid_type]. tolerated_failures); } diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index a0161aa1ea0b..e80f7c45a307 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -115,8 +115,6 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, int atomic); int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid, int level, struct btrfs_key *first_key); -u32 btrfs_csum_data(const char *data, u32 seed, size_t len); -void btrfs_csum_final(u32 crc, u8 *result); blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, enum btrfs_wq_endio_type metadata); blk_status_t btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 5faf057f6f37..d3b58e388535 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -28,46 +28,12 @@ #include "sysfs.h" #include "qgroup.h" #include "ref-verify.h" +#include "space-info.h" +#include "block-rsv.h" +#include "delalloc-space.h" #undef SCRAMBLE_DELAYED_REFS -/* - * control flags for do_chunk_alloc's force field - * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk - * if we really need one. - * - * CHUNK_ALLOC_LIMITED means to only try and allocate one - * if we have very few chunks already allocated. This is - * used as part of the clustering code to help make sure - * we have a good pool of storage to cluster in, without - * filling the FS with empty chunks - * - * CHUNK_ALLOC_FORCE means it must try to allocate one - * - */ -enum { - CHUNK_ALLOC_NO_FORCE = 0, - CHUNK_ALLOC_LIMITED = 1, - CHUNK_ALLOC_FORCE = 2, -}; - -/* - * Declare a helper function to detect underflow of various space info members - */ -#define DECLARE_SPACE_INFO_UPDATE(name) \ -static inline void update_##name(struct btrfs_space_info *sinfo, \ - s64 bytes) \ -{ \ - if (bytes < 0 && sinfo->name < -bytes) { \ - WARN_ON(1); \ - sinfo->name = 0; \ - return; \ - } \ - sinfo->name += bytes; \ -} - -DECLARE_SPACE_INFO_UPDATE(bytes_may_use); -DECLARE_SPACE_INFO_UPDATE(bytes_pinned); static int __btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *node, u64 parent, @@ -84,21 +50,8 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extent_op); -static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, - int force); static int find_next_key(struct btrfs_path *path, int level, struct btrfs_key *key); -static void dump_space_info(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *info, u64 bytes, - int dump_block_groups); -static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, - u64 num_bytes); -static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, - u64 num_bytes); -static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, - u64 num_bytes); static noinline int block_group_cache_done(struct btrfs_block_group_cache *cache) @@ -737,62 +690,39 @@ struct 
btrfs_block_group_cache *btrfs_lookup_block_group( return block_group_cache_tree_search(info, bytenr, 1); } -static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info, - u64 flags) +static u64 generic_ref_to_space_flags(struct btrfs_ref *ref) { - struct list_head *head = &info->space_info; - struct btrfs_space_info *found; - - flags &= BTRFS_BLOCK_GROUP_TYPE_MASK; - - rcu_read_lock(); - list_for_each_entry_rcu(found, head, list) { - if (found->flags & flags) { - rcu_read_unlock(); - return found; - } + if (ref->type == BTRFS_REF_METADATA) { + if (ref->tree_ref.root == BTRFS_CHUNK_TREE_OBJECTID) + return BTRFS_BLOCK_GROUP_SYSTEM; + else + return BTRFS_BLOCK_GROUP_METADATA; } - rcu_read_unlock(); - return NULL; + return BTRFS_BLOCK_GROUP_DATA; } static void add_pinned_bytes(struct btrfs_fs_info *fs_info, - struct btrfs_ref *ref, int sign) + struct btrfs_ref *ref) { struct btrfs_space_info *space_info; - s64 num_bytes; - u64 flags; - - ASSERT(sign == 1 || sign == -1); - num_bytes = sign * ref->len; - if (ref->type == BTRFS_REF_METADATA) { - if (ref->tree_ref.root == BTRFS_CHUNK_TREE_OBJECTID) - flags = BTRFS_BLOCK_GROUP_SYSTEM; - else - flags = BTRFS_BLOCK_GROUP_METADATA; - } else { - flags = BTRFS_BLOCK_GROUP_DATA; - } + u64 flags = generic_ref_to_space_flags(ref); - space_info = __find_space_info(fs_info, flags); + space_info = btrfs_find_space_info(fs_info, flags); ASSERT(space_info); - percpu_counter_add_batch(&space_info->total_bytes_pinned, num_bytes, + percpu_counter_add_batch(&space_info->total_bytes_pinned, ref->len, BTRFS_TOTAL_BYTES_PINNED_BATCH); } -/* - * after adding space to the filesystem, we need to clear the full flags - * on all the space infos. - */ -void btrfs_clear_space_info_full(struct btrfs_fs_info *info) +static void sub_pinned_bytes(struct btrfs_fs_info *fs_info, + struct btrfs_ref *ref) { - struct list_head *head = &info->space_info; - struct btrfs_space_info *found; + struct btrfs_space_info *space_info; + u64 flags = generic_ref_to_space_flags(ref); - rcu_read_lock(); - list_for_each_entry_rcu(found, head, list) - found->full = 0; - rcu_read_unlock(); + space_info = btrfs_find_space_info(fs_info, flags); + ASSERT(space_info); + percpu_counter_add_batch(&space_info->total_bytes_pinned, -ref->len, + BTRFS_TOTAL_BYTES_PINNED_BATCH); } /* simple helper to search for an existing data extent at a given offset */ @@ -1121,11 +1051,11 @@ static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset) __le64 lenum; lenum = cpu_to_le64(root_objectid); - high_crc = crc32c(high_crc, &lenum, sizeof(lenum)); + high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum)); lenum = cpu_to_le64(owner); - low_crc = crc32c(low_crc, &lenum, sizeof(lenum)); + low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum)); lenum = cpu_to_le64(offset); - low_crc = crc32c(low_crc, &lenum, sizeof(lenum)); + low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum)); return ((u64)high_crc << 31) ^ (u64)low_crc; } @@ -2065,7 +1995,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, btrfs_ref_tree_mod(fs_info, generic_ref); if (ret == 0 && old_ref_mod < 0 && new_ref_mod >= 0) - add_pinned_bytes(fs_info, generic_ref, -1); + sub_pinned_bytes(fs_info, generic_ref); return ret; } @@ -2462,7 +2392,7 @@ void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info, flags = BTRFS_BLOCK_GROUP_SYSTEM; else flags = BTRFS_BLOCK_GROUP_METADATA; - space_info = __find_space_info(fs_info, flags); + space_info = btrfs_find_space_info(fs_info, flags); ASSERT(space_info); 
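
The hash_extent_data_ref() hunk above only swaps the bare crc32c() calls for the btrfs_crc32c() wrapper; the hash itself is unchanged: two independent CRC32C streams, the root objectid feeding the high word and owner plus offset feeding the low word, folded into one 64-bit key. A minimal userspace sketch of the same shape, using a bitwise software CRC32C in place of the kernel helper and assuming a little-endian host so the cpu_to_le64() conversions are no-ops (names here are illustrative, not kernel API):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Software CRC32C (Castagnoli, reflected polynomial 0x82F63B78); stands in
 * for the kernel's btrfs_crc32c() helper in this sketch. */
static uint32_t sw_crc32c(uint32_t crc, const void *buf, size_t len)
{
	const uint8_t *p = buf;

	while (len--) {
		crc ^= *p++;
		for (int i = 0; i < 8; i++)
			crc = (crc >> 1) ^ (0x82F63B78U & -(crc & 1));
	}
	return crc;
}

/* Same folding as hash_extent_data_ref(): root hashes into the high word,
 * owner and offset into the low word, then the halves are combined. */
static uint64_t hash_edr(uint64_t root, uint64_t owner, uint64_t offset)
{
	uint32_t high = ~0U, low = ~0U;

	high = sw_crc32c(high, &root, sizeof(root));
	low = sw_crc32c(low, &owner, sizeof(owner));
	low = sw_crc32c(low, &offset, sizeof(offset));
	return ((uint64_t)high << 31) ^ (uint64_t)low;
}

int main(void)
{
	printf("%016llx\n", (unsigned long long)hash_edr(5, 257, 0));
	return 0;
}
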
percpu_counter_add_batch(&space_info->total_bytes_pinned, -head->num_bytes, @@ -2824,49 +2754,6 @@ u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes) return num_csums; } -bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info) -{ - struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; - struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; - bool ret = false; - u64 reserved; - - spin_lock(&global_rsv->lock); - reserved = global_rsv->reserved; - spin_unlock(&global_rsv->lock); - - /* - * Since the global reserve is just kind of magic we don't really want - * to rely on it to save our bacon, so if our size is more than the - * delayed_refs_rsv and the global rsv then it's time to think about - * bailing. - */ - spin_lock(&delayed_refs_rsv->lock); - reserved += delayed_refs_rsv->reserved; - if (delayed_refs_rsv->size >= reserved) - ret = true; - spin_unlock(&delayed_refs_rsv->lock); - return ret; -} - -int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans) -{ - u64 num_entries = - atomic_read(&trans->transaction->delayed_refs.num_entries); - u64 avg_runtime; - u64 val; - - smp_mb(); - avg_runtime = trans->fs_info->avg_delayed_ref_runtime; - val = num_entries * avg_runtime; - if (val >= NSEC_PER_SEC) - return 1; - if (val >= NSEC_PER_SEC / 2) - return 2; - - return btrfs_check_space_for_delayed_refs(trans->fs_info); -} - /* * this starts processing the delayed reference count updates and * extent insertions we have queued up so far. count can be @@ -3834,93 +3721,6 @@ void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg) wait_var_event(&bg->nocow_writers, !atomic_read(&bg->nocow_writers)); } -static const char *alloc_name(u64 flags) -{ - switch (flags) { - case BTRFS_BLOCK_GROUP_METADATA|BTRFS_BLOCK_GROUP_DATA: - return "mixed"; - case BTRFS_BLOCK_GROUP_METADATA: - return "metadata"; - case BTRFS_BLOCK_GROUP_DATA: - return "data"; - case BTRFS_BLOCK_GROUP_SYSTEM: - return "system"; - default: - WARN_ON(1); - return "invalid-combination"; - }; -} - -static int create_space_info(struct btrfs_fs_info *info, u64 flags) -{ - - struct btrfs_space_info *space_info; - int i; - int ret; - - space_info = kzalloc(sizeof(*space_info), GFP_NOFS); - if (!space_info) - return -ENOMEM; - - ret = percpu_counter_init(&space_info->total_bytes_pinned, 0, - GFP_KERNEL); - if (ret) { - kfree(space_info); - return ret; - } - - for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) - INIT_LIST_HEAD(&space_info->block_groups[i]); - init_rwsem(&space_info->groups_sem); - spin_lock_init(&space_info->lock); - space_info->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK; - space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; - init_waitqueue_head(&space_info->wait); - INIT_LIST_HEAD(&space_info->ro_bgs); - INIT_LIST_HEAD(&space_info->tickets); - INIT_LIST_HEAD(&space_info->priority_tickets); - - ret = kobject_init_and_add(&space_info->kobj, &space_info_ktype, - info->space_info_kobj, "%s", - alloc_name(space_info->flags)); - if (ret) { - kobject_put(&space_info->kobj); - return ret; - } - - list_add_rcu(&space_info->list, &info->space_info); - if (flags & BTRFS_BLOCK_GROUP_DATA) - info->data_sinfo = space_info; - - return ret; -} - -static void update_space_info(struct btrfs_fs_info *info, u64 flags, - u64 total_bytes, u64 bytes_used, - u64 bytes_readonly, - struct btrfs_space_info **space_info) -{ - struct btrfs_space_info *found; - int factor; - - factor = btrfs_bg_type_to_factor(flags); - - found = __find_space_info(info, flags); - 
ASSERT(found); - spin_lock(&found->lock); - found->total_bytes += total_bytes; - found->disk_total += total_bytes * factor; - found->bytes_used += bytes_used; - found->disk_used += bytes_used * factor; - found->bytes_readonly += bytes_readonly; - if (total_bytes > 0) - found->full = 0; - space_info_add_new_bytes(info, found, total_bytes - - bytes_used - bytes_readonly); - spin_unlock(&found->lock); - *space_info = found; -} - static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) { u64 extra_flags = chunk_to_extended(flags) & @@ -4068,215 +3868,6 @@ u64 btrfs_system_alloc_profile(struct btrfs_fs_info *fs_info) return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); } -static u64 btrfs_space_info_used(struct btrfs_space_info *s_info, - bool may_use_included) -{ - ASSERT(s_info); - return s_info->bytes_used + s_info->bytes_reserved + - s_info->bytes_pinned + s_info->bytes_readonly + - (may_use_included ? s_info->bytes_may_use : 0); -} - -int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes) -{ - struct btrfs_root *root = inode->root; - struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_space_info *data_sinfo = fs_info->data_sinfo; - u64 used; - int ret = 0; - int need_commit = 2; - int have_pinned_space; - - /* make sure bytes are sectorsize aligned */ - bytes = ALIGN(bytes, fs_info->sectorsize); - - if (btrfs_is_free_space_inode(inode)) { - need_commit = 0; - ASSERT(current->journal_info); - } - -again: - /* make sure we have enough space to handle the data first */ - spin_lock(&data_sinfo->lock); - used = btrfs_space_info_used(data_sinfo, true); - - if (used + bytes > data_sinfo->total_bytes) { - struct btrfs_trans_handle *trans; - - /* - * if we don't have enough free bytes in this space then we need - * to alloc a new chunk. - */ - if (!data_sinfo->full) { - u64 alloc_target; - - data_sinfo->force_alloc = CHUNK_ALLOC_FORCE; - spin_unlock(&data_sinfo->lock); - - alloc_target = btrfs_data_alloc_profile(fs_info); - /* - * It is ugly that we don't call nolock join - * transaction for the free space inode case here. - * But it is safe because we only do the data space - * reservation for the free space cache in the - * transaction context, the common join transaction - * just increase the counter of the current transaction - * handler, doesn't try to acquire the trans_lock of - * the fs. - */ - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) - return PTR_ERR(trans); - - ret = do_chunk_alloc(trans, alloc_target, - CHUNK_ALLOC_NO_FORCE); - btrfs_end_transaction(trans); - if (ret < 0) { - if (ret != -ENOSPC) - return ret; - else { - have_pinned_space = 1; - goto commit_trans; - } - } - - goto again; - } - - /* - * If we don't have enough pinned space to deal with this - * allocation, and no removed chunk in current transaction, - * don't bother committing the transaction. 
- */ - have_pinned_space = __percpu_counter_compare( - &data_sinfo->total_bytes_pinned, - used + bytes - data_sinfo->total_bytes, - BTRFS_TOTAL_BYTES_PINNED_BATCH); - spin_unlock(&data_sinfo->lock); - - /* commit the current transaction and try again */ -commit_trans: - if (need_commit) { - need_commit--; - - if (need_commit > 0) { - btrfs_start_delalloc_roots(fs_info, -1); - btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, - (u64)-1); - } - - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) - return PTR_ERR(trans); - if (have_pinned_space >= 0 || - test_bit(BTRFS_TRANS_HAVE_FREE_BGS, - &trans->transaction->flags) || - need_commit > 0) { - ret = btrfs_commit_transaction(trans); - if (ret) - return ret; - /* - * The cleaner kthread might still be doing iput - * operations. Wait for it to finish so that - * more space is released. We don't need to - * explicitly run the delayed iputs here because - * the commit_transaction would have woken up - * the cleaner. - */ - ret = btrfs_wait_on_delayed_iputs(fs_info); - if (ret) - return ret; - goto again; - } else { - btrfs_end_transaction(trans); - } - } - - trace_btrfs_space_reservation(fs_info, - "space_info:enospc", - data_sinfo->flags, bytes, 1); - return -ENOSPC; - } - update_bytes_may_use(data_sinfo, bytes); - trace_btrfs_space_reservation(fs_info, "space_info", - data_sinfo->flags, bytes, 1); - spin_unlock(&data_sinfo->lock); - - return 0; -} - -int btrfs_check_data_free_space(struct inode *inode, - struct extent_changeset **reserved, u64 start, u64 len) -{ - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - int ret; - - /* align the range */ - len = round_up(start + len, fs_info->sectorsize) - - round_down(start, fs_info->sectorsize); - start = round_down(start, fs_info->sectorsize); - - ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), len); - if (ret < 0) - return ret; - - /* Use new btrfs_qgroup_reserve_data to reserve precious data space. */ - ret = btrfs_qgroup_reserve_data(inode, reserved, start, len); - if (ret < 0) - btrfs_free_reserved_data_space_noquota(inode, start, len); - else - ret = 0; - return ret; -} - -/* - * Called if we need to clear a data reservation for this inode - * Normally in an error case. - * - * This one will *NOT* use accurate qgroup reserved space API, just for the case - * where we can't sleep and are sure it won't affect qgroup reserved space. - * Like clear_bit_hook(). - */ -void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start, - u64 len) -{ - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_space_info *data_sinfo; - - /* Make sure the range is aligned to sectorsize */ - len = round_up(start + len, fs_info->sectorsize) - - round_down(start, fs_info->sectorsize); - start = round_down(start, fs_info->sectorsize); - - data_sinfo = fs_info->data_sinfo; - spin_lock(&data_sinfo->lock); - update_bytes_may_use(data_sinfo, -len); - trace_btrfs_space_reservation(fs_info, "space_info", - data_sinfo->flags, len, 0); - spin_unlock(&data_sinfo->lock); -} - -/* - * Called if we need to clear a data reservation for this inode - * Normally in an error case. - * - * This one will handle the per-inode data rsv map for accurate reserved - * space framework. 
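
Both the reserve path (btrfs_check_data_free_space()) and the free paths above first widen the byte range to sector boundaries, so reserve and release always account the same length. The arithmetic by itself, as a standalone sketch assuming a 4K sectorsize:

#include <stdint.h>
#include <stdio.h>

/* Power-of-two rounding, same effect as the kernel's round_down()/round_up() */
#define RD(x, a) ((x) & ~((uint64_t)(a) - 1))
#define RU(x, a) RD((x) + (a) - 1, (a))

int main(void)
{
	uint64_t sectorsize = 4096;
	uint64_t start = 5000, len = 100;

	/* widen [start, start + len) to whole sectors before accounting */
	uint64_t alen = RU(start + len, sectorsize) - RD(start, sectorsize);
	uint64_t astart = RD(start, sectorsize);

	printf("[%llu, +%llu)\n", (unsigned long long)astart,
	       (unsigned long long)alen);	/* prints [4096, +4096) */
	return 0;
}
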
- */ -void btrfs_free_reserved_data_space(struct inode *inode, - struct extent_changeset *reserved, u64 start, u64 len) -{ - struct btrfs_root *root = BTRFS_I(inode)->root; - - /* Make sure the range is aligned to sectorsize */ - len = round_up(start + len, root->fs_info->sectorsize) - - round_down(start, root->fs_info->sectorsize); - start = round_down(start, root->fs_info->sectorsize); - - btrfs_free_reserved_data_space_noquota(inode, start, len); - btrfs_qgroup_free_data(inode, reserved, start, len); -} - static void force_metadata_allocation(struct btrfs_fs_info *info) { struct list_head *head = &info->space_info; @@ -4290,11 +3881,6 @@ static void force_metadata_allocation(struct btrfs_fs_info *info) rcu_read_unlock(); } -static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global) -{ - return (global->size << 1); -} - static int should_alloc_chunk(struct btrfs_fs_info *fs_info, struct btrfs_space_info *sinfo, int force) { @@ -4325,15 +3911,9 @@ static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type) { u64 num_dev; - if (type & (BTRFS_BLOCK_GROUP_RAID10 | - BTRFS_BLOCK_GROUP_RAID0 | - BTRFS_BLOCK_GROUP_RAID5 | - BTRFS_BLOCK_GROUP_RAID6)) + num_dev = btrfs_raid_array[btrfs_bg_flags_to_raid_index(type)].devs_max; + if (!num_dev) num_dev = fs_info->fs_devices->rw_devices; - else if (type & BTRFS_BLOCK_GROUP_RAID1) - num_dev = 2; - else - num_dev = 1; /* DUP or single */ return num_dev; } @@ -4358,7 +3938,7 @@ void check_system_chunk(struct btrfs_trans_handle *trans, u64 type) */ lockdep_assert_held(&fs_info->chunk_mutex); - info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); + info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); spin_lock(&info->lock); left = info->total_bytes - btrfs_space_info_used(info, true); spin_unlock(&info->lock); @@ -4372,7 +3952,7 @@ void check_system_chunk(struct btrfs_trans_handle *trans, u64 type) if (left < thresh && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu", left, thresh, type); - dump_space_info(fs_info, info, 0, 0); + btrfs_dump_space_info(fs_info, info, 0, 0); } if (left < thresh) { @@ -4405,8 +3985,8 @@ void check_system_chunk(struct btrfs_trans_handle *trans, u64 type) * - return 1 if it successfully allocates a chunk, * - return errors including -ENOSPC otherwise. */ -static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, - int force) +int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, + enum btrfs_chunk_alloc_enum force) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_space_info *space_info; @@ -4418,7 +3998,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, if (trans->allocating_chunk) return -ENOSPC; - space_info = __find_space_info(fs_info, flags); + space_info = btrfs_find_space_info(fs_info, flags); ASSERT(space_info); do { @@ -4525,1714 +4105,6 @@ out: return ret; } -static int can_overcommit(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, u64 bytes, - enum btrfs_reserve_flush_enum flush, - bool system_chunk) -{ - struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; - u64 profile; - u64 space_size; - u64 avail; - u64 used; - int factor; - - /* Don't overcommit when in mixed mode. 
*/ - if (space_info->flags & BTRFS_BLOCK_GROUP_DATA) - return 0; - - if (system_chunk) - profile = btrfs_system_alloc_profile(fs_info); - else - profile = btrfs_metadata_alloc_profile(fs_info); - - used = btrfs_space_info_used(space_info, false); - - /* - * We only want to allow over committing if we have lots of actual space - * free, but if we don't have enough space to handle the global reserve - * space then we could end up having a real enospc problem when trying - * to allocate a chunk or some other such important allocation. - */ - spin_lock(&global_rsv->lock); - space_size = calc_global_rsv_need_space(global_rsv); - spin_unlock(&global_rsv->lock); - if (used + space_size >= space_info->total_bytes) - return 0; - - used += space_info->bytes_may_use; - - avail = atomic64_read(&fs_info->free_chunk_space); - - /* - * If we have dup, raid1 or raid10 then only half of the free - * space is actually usable. For raid56, the space info used - * doesn't include the parity drive, so we don't have to - * change the math - */ - factor = btrfs_bg_type_to_factor(profile); - avail = div_u64(avail, factor); - - /* - * If we aren't flushing all things, let us overcommit up to - * 1/2 of the space. If we can flush, don't let us overcommit - * too much, let it overcommit up to 1/8 of the space. - */ - if (flush == BTRFS_RESERVE_FLUSH_ALL) - avail >>= 3; - else - avail >>= 1; - - if (used + bytes < space_info->total_bytes + avail) - return 1; - return 0; -} - -static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info, - unsigned long nr_pages, int nr_items) -{ - struct super_block *sb = fs_info->sb; - - if (down_read_trylock(&sb->s_umount)) { - writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE); - up_read(&sb->s_umount); - } else { - /* - * We needn't worry about the filesystem going from r/w to r/o though - * we don't acquire ->s_umount mutex, because the filesystem - * should guarantee the delalloc inodes list be empty after - * the filesystem is readonly (all dirty pages are written to - * the disk). 
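
The removed can_overcommit() boils down to one inequality: a metadata reservation may exceed what is currently allocated as long as it stays under a slice of the remaining unallocated device space, scaled down by the RAID factor. A condensed model of that decision (names and the standalone framing are illustrative, not kernel API; the data-space refusal and the global-reserve headroom check above are omitted):

#include <stdbool.h>
#include <stdint.h>

/*
 * total:    space_info->total_bytes
 * used:     bytes_used + reserved + pinned + readonly + may_use
 * free_dev: fs_info->free_chunk_space (unallocated device bytes)
 * factor:   2 for DUP/RAID1/RAID10, 1 otherwise
 */
static bool can_overcommit(uint64_t total, uint64_t used, uint64_t bytes,
			   uint64_t free_dev, int factor, bool flush_all)
{
	uint64_t avail = free_dev / factor;

	/* flushable callers may only overcommit 1/8th, others 1/2 */
	avail >>= flush_all ? 3 : 1;
	return used + bytes < total + avail;
}
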
- */ - btrfs_start_delalloc_roots(fs_info, nr_items); - if (!current->journal_info) - btrfs_wait_ordered_roots(fs_info, nr_items, 0, (u64)-1); - } -} - -static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info, - u64 to_reclaim) -{ - u64 bytes; - u64 nr; - - bytes = btrfs_calc_trans_metadata_size(fs_info, 1); - nr = div64_u64(to_reclaim, bytes); - if (!nr) - nr = 1; - return nr; -} - -#define EXTENT_SIZE_PER_ITEM SZ_256K - -/* - * shrink metadata reservation for delalloc - */ -static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim, - u64 orig, bool wait_ordered) -{ - struct btrfs_space_info *space_info; - struct btrfs_trans_handle *trans; - u64 delalloc_bytes; - u64 dio_bytes; - u64 async_pages; - u64 items; - long time_left; - unsigned long nr_pages; - int loops; - - /* Calc the number of pages we need to flush for space reservation */ - items = calc_reclaim_items_nr(fs_info, to_reclaim); - to_reclaim = items * EXTENT_SIZE_PER_ITEM; - - trans = (struct btrfs_trans_handle *)current->journal_info; - space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); - - delalloc_bytes = percpu_counter_sum_positive( - &fs_info->delalloc_bytes); - dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes); - if (delalloc_bytes == 0 && dio_bytes == 0) { - if (trans) - return; - if (wait_ordered) - btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1); - return; - } - - /* - * If we are doing more ordered than delalloc we need to just wait on - * ordered extents, otherwise we'll waste time trying to flush delalloc - * that likely won't give us the space back we need. - */ - if (dio_bytes > delalloc_bytes) - wait_ordered = true; - - loops = 0; - while ((delalloc_bytes || dio_bytes) && loops < 3) { - nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT; - - /* - * Triggers inode writeback for up to nr_pages. This will invoke - * ->writepages callback and trigger delalloc filling - * (btrfs_run_delalloc_range()). - */ - btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items); - - /* - * We need to wait for the compressed pages to start before - * we continue. - */ - async_pages = atomic_read(&fs_info->async_delalloc_pages); - if (!async_pages) - goto skip_async; - - /* - * Calculate how many compressed pages we want to be written - * before we continue. I.e. if there are more async pages than we - * require, wait_event will wait until nr_pages are written. 
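
shrink_delalloc() above converts a byte target into work units: calc_reclaim_items_nr() prices one metadata item, each item is then assumed to be worth about 256K of delalloc (EXTENT_SIZE_PER_ITEM), and the resulting byte count caps how many dirty pages get pushed per loop. The sizing alone, sketched with a hypothetical per-item cost standing in for btrfs_calc_trans_metadata_size():

#include <stdint.h>
#include <stdio.h>

#define EXTENT_SIZE_PER_ITEM	(256 * 1024ULL)
#define PAGE_SHIFT		12

int main(void)
{
	uint64_t to_reclaim = 8ULL << 20;	/* 8M of metadata to free up */
	uint64_t item_bytes = 3 * 16384;	/* assumed cost of one item */

	uint64_t items = to_reclaim / item_bytes;
	if (!items)
		items = 1;
	to_reclaim = items * EXTENT_SIZE_PER_ITEM;

	uint64_t delalloc_bytes = 64ULL << 20;
	uint64_t nr_pages = (delalloc_bytes < to_reclaim ?
			     delalloc_bytes : to_reclaim) >> PAGE_SHIFT;

	printf("%llu items, %llu pages per loop\n",
	       (unsigned long long)items, (unsigned long long)nr_pages);
	return 0;
}
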
- */ - if (async_pages <= nr_pages) - async_pages = 0; - else - async_pages -= nr_pages; - - wait_event(fs_info->async_submit_wait, - atomic_read(&fs_info->async_delalloc_pages) <= - (int)async_pages); -skip_async: - spin_lock(&space_info->lock); - if (list_empty(&space_info->tickets) && - list_empty(&space_info->priority_tickets)) { - spin_unlock(&space_info->lock); - break; - } - spin_unlock(&space_info->lock); - - loops++; - if (wait_ordered && !trans) { - btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1); - } else { - time_left = schedule_timeout_killable(1); - if (time_left) - break; - } - delalloc_bytes = percpu_counter_sum_positive( - &fs_info->delalloc_bytes); - dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes); - } -} - -struct reserve_ticket { - u64 orig_bytes; - u64 bytes; - int error; - struct list_head list; - wait_queue_head_t wait; -}; - -/** - * maybe_commit_transaction - possibly commit the transaction if it's ok to - * @root - the root we're allocating for - * @bytes - the number of bytes we want to reserve - * @force - force the commit - * - * This will check to make sure that committing the transaction will actually - * get us somewhere and then commit the transaction if it does. Otherwise it - * will return -ENOSPC. - */ -static int may_commit_transaction(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info) -{ - struct reserve_ticket *ticket = NULL; - struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv; - struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; - struct btrfs_trans_handle *trans; - u64 bytes_needed; - u64 reclaim_bytes = 0; - - trans = (struct btrfs_trans_handle *)current->journal_info; - if (trans) - return -EAGAIN; - - spin_lock(&space_info->lock); - if (!list_empty(&space_info->priority_tickets)) - ticket = list_first_entry(&space_info->priority_tickets, - struct reserve_ticket, list); - else if (!list_empty(&space_info->tickets)) - ticket = list_first_entry(&space_info->tickets, - struct reserve_ticket, list); - bytes_needed = (ticket) ? ticket->bytes : 0; - spin_unlock(&space_info->lock); - - if (!bytes_needed) - return 0; - - trans = btrfs_join_transaction(fs_info->extent_root); - if (IS_ERR(trans)) - return PTR_ERR(trans); - - /* - * See if there is enough pinned space to make this reservation, or if - * we have block groups that are going to be freed, allowing us to - * possibly do a chunk allocation the next loop through. - */ - if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags) || - __percpu_counter_compare(&space_info->total_bytes_pinned, - bytes_needed, - BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0) - goto commit; - - /* - * See if there is some space in the delayed insertion reservation for - * this reservation. - */ - if (space_info != delayed_rsv->space_info) - goto enospc; - - spin_lock(&delayed_rsv->lock); - reclaim_bytes += delayed_rsv->reserved; - spin_unlock(&delayed_rsv->lock); - - spin_lock(&delayed_refs_rsv->lock); - reclaim_bytes += delayed_refs_rsv->reserved; - spin_unlock(&delayed_refs_rsv->lock); - if (reclaim_bytes >= bytes_needed) - goto commit; - bytes_needed -= reclaim_bytes; - - if (__percpu_counter_compare(&space_info->total_bytes_pinned, - bytes_needed, - BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0) - goto enospc; - -commit: - return btrfs_commit_transaction(trans); -enospc: - btrfs_end_transaction(trans); - return -ENOSPC; -} - -/* - * Try to flush some data based on policy set by @state. This is only advisory - * and may fail for various reasons. 
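
Stripped of locking and transaction plumbing, may_commit_transaction() above asks one question: would a commit actually return enough space to satisfy the first waiting ticket? Pinned bytes count directly; the delayed rsvs count as reclaimable only when their space_info matches. A sketch of just that decision (illustrative names, not kernel API):

#include <stdbool.h>
#include <stdint.h>

static bool commit_would_help(uint64_t bytes_needed, uint64_t pinned,
			      uint64_t delayed_rsv_reserved,
			      uint64_t delayed_refs_rsv_reserved)
{
	uint64_t reclaim = delayed_rsv_reserved + delayed_refs_rsv_reserved;

	if (pinned >= bytes_needed)
		return true;		/* pinned space alone suffices */
	if (reclaim >= bytes_needed)
		return true;		/* the two rsvs alone suffice */
	/* otherwise pinned plus the rsvs together must cover it */
	return pinned >= bytes_needed - reclaim;
}
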
The caller is supposed to examine the - * state of @space_info to detect the outcome. - */ -static void flush_space(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, u64 num_bytes, - int state) -{ - struct btrfs_root *root = fs_info->extent_root; - struct btrfs_trans_handle *trans; - int nr; - int ret = 0; - - switch (state) { - case FLUSH_DELAYED_ITEMS_NR: - case FLUSH_DELAYED_ITEMS: - if (state == FLUSH_DELAYED_ITEMS_NR) - nr = calc_reclaim_items_nr(fs_info, num_bytes) * 2; - else - nr = -1; - - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - break; - } - ret = btrfs_run_delayed_items_nr(trans, nr); - btrfs_end_transaction(trans); - break; - case FLUSH_DELALLOC: - case FLUSH_DELALLOC_WAIT: - shrink_delalloc(fs_info, num_bytes * 2, num_bytes, - state == FLUSH_DELALLOC_WAIT); - break; - case FLUSH_DELAYED_REFS_NR: - case FLUSH_DELAYED_REFS: - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - break; - } - if (state == FLUSH_DELAYED_REFS_NR) - nr = calc_reclaim_items_nr(fs_info, num_bytes); - else - nr = 0; - btrfs_run_delayed_refs(trans, nr); - btrfs_end_transaction(trans); - break; - case ALLOC_CHUNK: - case ALLOC_CHUNK_FORCE: - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - break; - } - ret = do_chunk_alloc(trans, - btrfs_metadata_alloc_profile(fs_info), - (state == ALLOC_CHUNK) ? - CHUNK_ALLOC_NO_FORCE : CHUNK_ALLOC_FORCE); - btrfs_end_transaction(trans); - if (ret > 0 || ret == -ENOSPC) - ret = 0; - break; - case COMMIT_TRANS: - /* - * If we have pending delayed iputs then we could free up a - * bunch of pinned space, so make sure we run the iputs before - * we do our pinned bytes check below. - */ - btrfs_run_delayed_iputs(fs_info); - btrfs_wait_on_delayed_iputs(fs_info); - - ret = may_commit_transaction(fs_info, space_info); - break; - default: - ret = -ENOSPC; - break; - } - - trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, state, - ret); - return; -} - -static inline u64 -btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, - bool system_chunk) -{ - struct reserve_ticket *ticket; - u64 used; - u64 expected; - u64 to_reclaim = 0; - - list_for_each_entry(ticket, &space_info->tickets, list) - to_reclaim += ticket->bytes; - list_for_each_entry(ticket, &space_info->priority_tickets, list) - to_reclaim += ticket->bytes; - if (to_reclaim) - return to_reclaim; - - to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M); - if (can_overcommit(fs_info, space_info, to_reclaim, - BTRFS_RESERVE_FLUSH_ALL, system_chunk)) - return 0; - - used = btrfs_space_info_used(space_info, true); - - if (can_overcommit(fs_info, space_info, SZ_1M, - BTRFS_RESERVE_FLUSH_ALL, system_chunk)) - expected = div_factor_fine(space_info->total_bytes, 95); - else - expected = div_factor_fine(space_info->total_bytes, 90); - - if (used > expected) - to_reclaim = used - expected; - else - to_reclaim = 0; - to_reclaim = min(to_reclaim, space_info->bytes_may_use + - space_info->bytes_reserved); - return to_reclaim; -} - -static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, - u64 used, bool system_chunk) -{ - u64 thresh = div_factor_fine(space_info->total_bytes, 98); - - /* If we're just plain full then async reclaim just slows us down. 
*/ - if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh) - return 0; - - if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info, - system_chunk)) - return 0; - - return (used >= thresh && !btrfs_fs_closing(fs_info) && - !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)); -} - -static bool wake_all_tickets(struct list_head *head) -{ - struct reserve_ticket *ticket; - - while (!list_empty(head)) { - ticket = list_first_entry(head, struct reserve_ticket, list); - list_del_init(&ticket->list); - ticket->error = -ENOSPC; - wake_up(&ticket->wait); - if (ticket->bytes != ticket->orig_bytes) - return true; - } - return false; -} - -/* - * This is for normal flushers, we can wait all goddamned day if we want to. We - * will loop and continuously try to flush as long as we are making progress. - * We count progress as clearing off tickets each time we have to loop. - */ -static void btrfs_async_reclaim_metadata_space(struct work_struct *work) -{ - struct btrfs_fs_info *fs_info; - struct btrfs_space_info *space_info; - u64 to_reclaim; - int flush_state; - int commit_cycles = 0; - u64 last_tickets_id; - - fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work); - space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); - - spin_lock(&space_info->lock); - to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info, - false); - if (!to_reclaim) { - space_info->flush = 0; - spin_unlock(&space_info->lock); - return; - } - last_tickets_id = space_info->tickets_id; - spin_unlock(&space_info->lock); - - flush_state = FLUSH_DELAYED_ITEMS_NR; - do { - flush_space(fs_info, space_info, to_reclaim, flush_state); - spin_lock(&space_info->lock); - if (list_empty(&space_info->tickets)) { - space_info->flush = 0; - spin_unlock(&space_info->lock); - return; - } - to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, - space_info, - false); - if (last_tickets_id == space_info->tickets_id) { - flush_state++; - } else { - last_tickets_id = space_info->tickets_id; - flush_state = FLUSH_DELAYED_ITEMS_NR; - if (commit_cycles) - commit_cycles--; - } - - /* - * We don't want to force a chunk allocation until we've tried - * pretty hard to reclaim space. Think of the case where we - * freed up a bunch of space and so have a lot of pinned space - * to reclaim. We would rather use that than possibly create an - * underutilized metadata chunk. So if this is our first run - * through the flushing state machine skip ALLOC_CHUNK_FORCE and - * commit the transaction. If nothing has changed the next go - * around then we can force a chunk allocation. 
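
The worker above escalates through a fixed ladder of flush states and restarts from the cheapest step whenever a ticket was satisfied (tickets_id advanced); only a full fruitless pass moves it to the next, more expensive state. A toy model of that control shape; the enum ordering mirrors the states handled by the removed flush_space() switch, but the values and the toy "cost" are illustrative only:

#include <stdio.h>

/* escalation ladder, cheapest first (values illustrative) */
enum flush_state {
	FLUSH_DELAYED_ITEMS_NR = 1,
	FLUSH_DELAYED_ITEMS,
	FLUSH_DELALLOC,
	FLUSH_DELALLOC_WAIT,
	FLUSH_DELAYED_REFS_NR,
	FLUSH_DELAYED_REFS,
	ALLOC_CHUNK,
	ALLOC_CHUNK_FORCE,
	COMMIT_TRANS,
};

static int owed = 25;	/* toy: bytes the waiting tickets still miss */

/* toy flush: pretend only chunk allocation and beyond free anything */
static void flush_space(int state)
{
	if (state >= ALLOC_CHUNK)
		owed -= 10;
	if (owed < 0)
		owed = 0;
}

int main(void)
{
	int state = FLUSH_DELAYED_ITEMS_NR;

	while (owed && state <= COMMIT_TRANS) {
		int before = owed;

		flush_space(state);
		printf("state %d: owed %d -> %d\n", state, before, owed);
		if (owed < before)
			state = FLUSH_DELAYED_ITEMS_NR;	/* progress: restart cheap */
		else
			state++;			/* no progress: escalate */
	}
	return 0;
}
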
- */ - if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles) - flush_state++; - - if (flush_state > COMMIT_TRANS) { - commit_cycles++; - if (commit_cycles > 2) { - if (wake_all_tickets(&space_info->tickets)) { - flush_state = FLUSH_DELAYED_ITEMS_NR; - commit_cycles--; - } else { - space_info->flush = 0; - } - } else { - flush_state = FLUSH_DELAYED_ITEMS_NR; - } - } - spin_unlock(&space_info->lock); - } while (flush_state <= COMMIT_TRANS); -} - -void btrfs_init_async_reclaim_work(struct work_struct *work) -{ - INIT_WORK(work, btrfs_async_reclaim_metadata_space); -} - -static const enum btrfs_flush_state priority_flush_states[] = { - FLUSH_DELAYED_ITEMS_NR, - FLUSH_DELAYED_ITEMS, - ALLOC_CHUNK, -}; - -static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, - struct reserve_ticket *ticket) -{ - u64 to_reclaim; - int flush_state; - - spin_lock(&space_info->lock); - to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info, - false); - if (!to_reclaim) { - spin_unlock(&space_info->lock); - return; - } - spin_unlock(&space_info->lock); - - flush_state = 0; - do { - flush_space(fs_info, space_info, to_reclaim, - priority_flush_states[flush_state]); - flush_state++; - spin_lock(&space_info->lock); - if (ticket->bytes == 0) { - spin_unlock(&space_info->lock); - return; - } - spin_unlock(&space_info->lock); - } while (flush_state < ARRAY_SIZE(priority_flush_states)); -} - -static int wait_reserve_ticket(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, - struct reserve_ticket *ticket) - -{ - DEFINE_WAIT(wait); - u64 reclaim_bytes = 0; - int ret = 0; - - spin_lock(&space_info->lock); - while (ticket->bytes > 0 && ticket->error == 0) { - ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE); - if (ret) { - ret = -EINTR; - break; - } - spin_unlock(&space_info->lock); - - schedule(); - - finish_wait(&ticket->wait, &wait); - spin_lock(&space_info->lock); - } - if (!ret) - ret = ticket->error; - if (!list_empty(&ticket->list)) - list_del_init(&ticket->list); - if (ticket->bytes && ticket->bytes < ticket->orig_bytes) - reclaim_bytes = ticket->orig_bytes - ticket->bytes; - spin_unlock(&space_info->lock); - - if (reclaim_bytes) - space_info_add_old_bytes(fs_info, space_info, reclaim_bytes); - return ret; -} - -/** - * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space - * @root - the root we're allocating for - * @space_info - the space info we want to allocate from - * @orig_bytes - the number of bytes we want - * @flush - whether or not we can flush to make our reservation - * - * This will reserve orig_bytes number of bytes from the space info associated - * with the block_rsv. If there is not enough space it will make an attempt to - * flush out space to make room. It will do this by flushing delalloc if - * possible or committing the transaction. If flush is 0 then no attempts to - * regain reservations will be made and this will fail if there is not enough - * space already. 
- */ -static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, - u64 orig_bytes, - enum btrfs_reserve_flush_enum flush, - bool system_chunk) -{ - struct reserve_ticket ticket; - u64 used; - u64 reclaim_bytes = 0; - int ret = 0; - - ASSERT(orig_bytes); - ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL); - - spin_lock(&space_info->lock); - ret = -ENOSPC; - used = btrfs_space_info_used(space_info, true); - - /* - * If we have enough space then hooray, make our reservation and carry - * on. If not see if we can overcommit, and if we can, hooray carry on. - * If not things get more complicated. - */ - if (used + orig_bytes <= space_info->total_bytes) { - update_bytes_may_use(space_info, orig_bytes); - trace_btrfs_space_reservation(fs_info, "space_info", - space_info->flags, orig_bytes, 1); - ret = 0; - } else if (can_overcommit(fs_info, space_info, orig_bytes, flush, - system_chunk)) { - update_bytes_may_use(space_info, orig_bytes); - trace_btrfs_space_reservation(fs_info, "space_info", - space_info->flags, orig_bytes, 1); - ret = 0; - } - - /* - * If we couldn't make a reservation then setup our reservation ticket - * and kick the async worker if it's not already running. - * - * If we are a priority flusher then we just need to add our ticket to - * the list and we will do our own flushing further down. - */ - if (ret && flush != BTRFS_RESERVE_NO_FLUSH) { - ticket.orig_bytes = orig_bytes; - ticket.bytes = orig_bytes; - ticket.error = 0; - init_waitqueue_head(&ticket.wait); - if (flush == BTRFS_RESERVE_FLUSH_ALL) { - list_add_tail(&ticket.list, &space_info->tickets); - if (!space_info->flush) { - space_info->flush = 1; - trace_btrfs_trigger_flush(fs_info, - space_info->flags, - orig_bytes, flush, - "enospc"); - queue_work(system_unbound_wq, - &fs_info->async_reclaim_work); - } - } else { - list_add_tail(&ticket.list, - &space_info->priority_tickets); - } - } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) { - used += orig_bytes; - /* - * We will do the space reservation dance during log replay, - * which means we won't have fs_info->fs_root set, so don't do - * the async reclaim as we will panic. 
- */ - if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) && - need_do_async_reclaim(fs_info, space_info, - used, system_chunk) && - !work_busy(&fs_info->async_reclaim_work)) { - trace_btrfs_trigger_flush(fs_info, space_info->flags, - orig_bytes, flush, "preempt"); - queue_work(system_unbound_wq, - &fs_info->async_reclaim_work); - } - } - spin_unlock(&space_info->lock); - if (!ret || flush == BTRFS_RESERVE_NO_FLUSH) - return ret; - - if (flush == BTRFS_RESERVE_FLUSH_ALL) - return wait_reserve_ticket(fs_info, space_info, &ticket); - - ret = 0; - priority_reclaim_metadata_space(fs_info, space_info, &ticket); - spin_lock(&space_info->lock); - if (ticket.bytes) { - if (ticket.bytes < orig_bytes) - reclaim_bytes = orig_bytes - ticket.bytes; - list_del_init(&ticket.list); - ret = -ENOSPC; - } - spin_unlock(&space_info->lock); - - if (reclaim_bytes) - space_info_add_old_bytes(fs_info, space_info, reclaim_bytes); - ASSERT(list_empty(&ticket.list)); - return ret; -} - -/** - * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space - * @root - the root we're allocating for - * @block_rsv - the block_rsv we're allocating for - * @orig_bytes - the number of bytes we want - * @flush - whether or not we can flush to make our reservation - * - * This will reserve orig_bytes number of bytes from the space info associated - * with the block_rsv. If there is not enough space it will make an attempt to - * flush out space to make room. It will do this by flushing delalloc if - * possible or committing the transaction. If flush is 0 then no attempts to - * regain reservations will be made and this will fail if there is not enough - * space already. - */ -static int reserve_metadata_bytes(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, - u64 orig_bytes, - enum btrfs_reserve_flush_enum flush) -{ - struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; - int ret; - bool system_chunk = (root == fs_info->chunk_root); - - ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info, - orig_bytes, flush, system_chunk); - if (ret == -ENOSPC && - unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) { - if (block_rsv != global_rsv && - !block_rsv_use_bytes(global_rsv, orig_bytes)) - ret = 0; - } - if (ret == -ENOSPC) { - trace_btrfs_space_reservation(fs_info, "space_info:enospc", - block_rsv->space_info->flags, - orig_bytes, 1); - - if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) - dump_space_info(fs_info, block_rsv->space_info, - orig_bytes, 0); - } - return ret; -} - -static struct btrfs_block_rsv *get_block_rsv( - const struct btrfs_trans_handle *trans, - const struct btrfs_root *root) -{ - struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_block_rsv *block_rsv = NULL; - - if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) || - (root == fs_info->csum_root && trans->adding_csums) || - (root == fs_info->uuid_root)) - block_rsv = trans->block_rsv; - - if (!block_rsv) - block_rsv = root->block_rsv; - - if (!block_rsv) - block_rsv = &fs_info->empty_block_rsv; - - return block_rsv; -} - -static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, - u64 num_bytes) -{ - int ret = -ENOSPC; - spin_lock(&block_rsv->lock); - if (block_rsv->reserved >= num_bytes) { - block_rsv->reserved -= num_bytes; - if (block_rsv->reserved < block_rsv->size) - block_rsv->full = 0; - ret = 0; - } - spin_unlock(&block_rsv->lock); - return ret; -} - -static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv, - u64 
num_bytes, bool update_size) -{ - spin_lock(&block_rsv->lock); - block_rsv->reserved += num_bytes; - if (update_size) - block_rsv->size += num_bytes; - else if (block_rsv->reserved >= block_rsv->size) - block_rsv->full = 1; - spin_unlock(&block_rsv->lock); -} - -int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *dest, u64 num_bytes, - int min_factor) -{ - struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; - u64 min_bytes; - - if (global_rsv->space_info != dest->space_info) - return -ENOSPC; - - spin_lock(&global_rsv->lock); - min_bytes = div_factor(global_rsv->size, min_factor); - if (global_rsv->reserved < min_bytes + num_bytes) { - spin_unlock(&global_rsv->lock); - return -ENOSPC; - } - global_rsv->reserved -= num_bytes; - if (global_rsv->reserved < global_rsv->size) - global_rsv->full = 0; - spin_unlock(&global_rsv->lock); - - block_rsv_add_bytes(dest, num_bytes, true); - return 0; -} - -/** - * btrfs_migrate_to_delayed_refs_rsv - transfer bytes to our delayed refs rsv. - * @fs_info - the fs info for our fs. - * @src - the source block rsv to transfer from. - * @num_bytes - the number of bytes to transfer. - * - * This transfers up to the num_bytes amount from the src rsv to the - * delayed_refs_rsv. Any extra bytes are returned to the space info. - */ -void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *src, - u64 num_bytes) -{ - struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; - u64 to_free = 0; - - spin_lock(&src->lock); - src->reserved -= num_bytes; - src->size -= num_bytes; - spin_unlock(&src->lock); - - spin_lock(&delayed_refs_rsv->lock); - if (delayed_refs_rsv->size > delayed_refs_rsv->reserved) { - u64 delta = delayed_refs_rsv->size - - delayed_refs_rsv->reserved; - if (num_bytes > delta) { - to_free = num_bytes - delta; - num_bytes = delta; - } - } else { - to_free = num_bytes; - num_bytes = 0; - } - - if (num_bytes) - delayed_refs_rsv->reserved += num_bytes; - if (delayed_refs_rsv->reserved >= delayed_refs_rsv->size) - delayed_refs_rsv->full = 1; - spin_unlock(&delayed_refs_rsv->lock); - - if (num_bytes) - trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", - 0, num_bytes, 1); - if (to_free) - space_info_add_old_bytes(fs_info, delayed_refs_rsv->space_info, - to_free); -} - -/** - * btrfs_delayed_refs_rsv_refill - refill based on our delayed refs usage. - * @fs_info - the fs_info for our fs. - * @flush - control how we can flush for this reservation. - * - * This will refill the delayed block_rsv up to 1 item's size worth of space and - * will return -ENOSPC if we can't make the reservation. 
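
The two primitives shown above carry all of the rsv arithmetic: consuming reserved bytes may clear the full flag, and returning bytes either grows the target size or re-fills toward it. A lock-free model of the pair (the kernel versions hold block_rsv->lock; names here are illustrative):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct block_rsv {
	uint64_t size;		/* target: how much we want on hand */
	uint64_t reserved;	/* how much we actually hold */
	bool full;
};

static int rsv_use_bytes(struct block_rsv *rsv, uint64_t n)
{
	if (rsv->reserved < n)
		return -1;			/* -ENOSPC in the kernel */
	rsv->reserved -= n;
	if (rsv->reserved < rsv->size)
		rsv->full = false;
	return 0;
}

static void rsv_add_bytes(struct block_rsv *rsv, uint64_t n, bool update_size)
{
	rsv->reserved += n;
	if (update_size)
		rsv->size += n;			/* grow the target too */
	else if (rsv->reserved >= rsv->size)
		rsv->full = true;		/* refilled back to target */
}

int main(void)
{
	struct block_rsv rsv = { .size = 4096, .reserved = 4096, .full = true };

	rsv_use_bytes(&rsv, 1024);		/* spend: 3072 left, not full */
	rsv_add_bytes(&rsv, 1024, false);	/* refill: full again */
	printf("size %llu reserved %llu full %d\n",
	       (unsigned long long)rsv.size,
	       (unsigned long long)rsv.reserved, rsv.full);
	return 0;
}
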
- */ -int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, - enum btrfs_reserve_flush_enum flush) -{ - struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv; - u64 limit = btrfs_calc_trans_metadata_size(fs_info, 1); - u64 num_bytes = 0; - int ret = -ENOSPC; - - spin_lock(&block_rsv->lock); - if (block_rsv->reserved < block_rsv->size) { - num_bytes = block_rsv->size - block_rsv->reserved; - num_bytes = min(num_bytes, limit); - } - spin_unlock(&block_rsv->lock); - - if (!num_bytes) - return 0; - - ret = reserve_metadata_bytes(fs_info->extent_root, block_rsv, - num_bytes, flush); - if (ret) - return ret; - block_rsv_add_bytes(block_rsv, num_bytes, 0); - trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", - 0, num_bytes, 1); - return 0; -} - -/* - * This is for space we already have accounted in space_info->bytes_may_use, so - * basically when we're returning space from block_rsv's. - */ -static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, - u64 num_bytes) -{ - struct reserve_ticket *ticket; - struct list_head *head; - u64 used; - enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH; - bool check_overcommit = false; - - spin_lock(&space_info->lock); - head = &space_info->priority_tickets; - - /* - * If we are over our limit then we need to check and see if we can - * overcommit, and if we can't then we just need to free up our space - * and not satisfy any requests. - */ - used = btrfs_space_info_used(space_info, true); - if (used - num_bytes >= space_info->total_bytes) - check_overcommit = true; -again: - while (!list_empty(head) && num_bytes) { - ticket = list_first_entry(head, struct reserve_ticket, - list); - /* - * We use 0 bytes because this space is already reserved, so - * adding the ticket space would be a double count. - */ - if (check_overcommit && - !can_overcommit(fs_info, space_info, 0, flush, false)) - break; - if (num_bytes >= ticket->bytes) { - list_del_init(&ticket->list); - num_bytes -= ticket->bytes; - ticket->bytes = 0; - space_info->tickets_id++; - wake_up(&ticket->wait); - } else { - ticket->bytes -= num_bytes; - num_bytes = 0; - } - } - - if (num_bytes && head == &space_info->priority_tickets) { - head = &space_info->tickets; - flush = BTRFS_RESERVE_FLUSH_ALL; - goto again; - } - update_bytes_may_use(space_info, -num_bytes); - trace_btrfs_space_reservation(fs_info, "space_info", - space_info->flags, num_bytes, 0); - spin_unlock(&space_info->lock); -} - -/* - * This is for newly allocated space that isn't accounted in - * space_info->bytes_may_use yet. So if we allocate a chunk or unpin an extent - * we use this helper. 
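
space_info_add_old_bytes() above (and space_info_add_new_bytes() below it) walk the ticket queues in FIFO order, granting each waiter as much of the returned space as possible; only what survives both queues actually lands back in, or leaves, bytes_may_use. The core loop, reduced to an array walk over a hypothetical ticket struct:

#include <stdint.h>
#include <stdio.h>

struct ticket { uint64_t bytes; };	/* bytes still missing; 0 = woken */

static uint64_t satisfy(struct ticket *t, int nr, uint64_t space)
{
	for (int i = 0; i < nr && space; i++) {
		uint64_t grant = t[i].bytes < space ? t[i].bytes : space;

		t[i].bytes -= grant;	/* a ticket hitting 0 gets woken */
		space -= grant;
	}
	return space;			/* leftover after serving the queue */
}

int main(void)
{
	struct ticket q[] = { {4096}, {8192}, {16384} };
	uint64_t left = satisfy(q, 3, 10000);

	printf("left %llu, second ticket still needs %llu\n",
	       (unsigned long long)left, (unsigned long long)q[1].bytes);
	return 0;
}
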
- */ -static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, - u64 num_bytes) -{ - struct reserve_ticket *ticket; - struct list_head *head = &space_info->priority_tickets; - -again: - while (!list_empty(head) && num_bytes) { - ticket = list_first_entry(head, struct reserve_ticket, - list); - if (num_bytes >= ticket->bytes) { - trace_btrfs_space_reservation(fs_info, "space_info", - space_info->flags, - ticket->bytes, 1); - list_del_init(&ticket->list); - num_bytes -= ticket->bytes; - update_bytes_may_use(space_info, ticket->bytes); - ticket->bytes = 0; - space_info->tickets_id++; - wake_up(&ticket->wait); - } else { - trace_btrfs_space_reservation(fs_info, "space_info", - space_info->flags, - num_bytes, 1); - update_bytes_may_use(space_info, num_bytes); - ticket->bytes -= num_bytes; - num_bytes = 0; - } - } - - if (num_bytes && head == &space_info->priority_tickets) { - head = &space_info->tickets; - goto again; - } -} - -static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *block_rsv, - struct btrfs_block_rsv *dest, u64 num_bytes, - u64 *qgroup_to_release_ret) -{ - struct btrfs_space_info *space_info = block_rsv->space_info; - u64 qgroup_to_release = 0; - u64 ret; - - spin_lock(&block_rsv->lock); - if (num_bytes == (u64)-1) { - num_bytes = block_rsv->size; - qgroup_to_release = block_rsv->qgroup_rsv_size; - } - block_rsv->size -= num_bytes; - if (block_rsv->reserved >= block_rsv->size) { - num_bytes = block_rsv->reserved - block_rsv->size; - block_rsv->reserved = block_rsv->size; - block_rsv->full = 1; - } else { - num_bytes = 0; - } - if (block_rsv->qgroup_rsv_reserved >= block_rsv->qgroup_rsv_size) { - qgroup_to_release = block_rsv->qgroup_rsv_reserved - - block_rsv->qgroup_rsv_size; - block_rsv->qgroup_rsv_reserved = block_rsv->qgroup_rsv_size; - } else { - qgroup_to_release = 0; - } - spin_unlock(&block_rsv->lock); - - ret = num_bytes; - if (num_bytes > 0) { - if (dest) { - spin_lock(&dest->lock); - if (!dest->full) { - u64 bytes_to_add; - - bytes_to_add = dest->size - dest->reserved; - bytes_to_add = min(num_bytes, bytes_to_add); - dest->reserved += bytes_to_add; - if (dest->reserved >= dest->size) - dest->full = 1; - num_bytes -= bytes_to_add; - } - spin_unlock(&dest->lock); - } - if (num_bytes) - space_info_add_old_bytes(fs_info, space_info, - num_bytes); - } - if (qgroup_to_release_ret) - *qgroup_to_release_ret = qgroup_to_release; - return ret; -} - -int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src, - struct btrfs_block_rsv *dst, u64 num_bytes, - bool update_size) -{ - int ret; - - ret = block_rsv_use_bytes(src, num_bytes); - if (ret) - return ret; - - block_rsv_add_bytes(dst, num_bytes, update_size); - return 0; -} - -void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type) -{ - memset(rsv, 0, sizeof(*rsv)); - spin_lock_init(&rsv->lock); - rsv->type = type; -} - -void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *rsv, - unsigned short type) -{ - btrfs_init_block_rsv(rsv, type); - rsv->space_info = __find_space_info(fs_info, - BTRFS_BLOCK_GROUP_METADATA); -} - -struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info, - unsigned short type) -{ - struct btrfs_block_rsv *block_rsv; - - block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS); - if (!block_rsv) - return NULL; - - btrfs_init_metadata_block_rsv(fs_info, block_rsv, type); - return block_rsv; -} - -void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info, 
- struct btrfs_block_rsv *rsv) -{ - if (!rsv) - return; - btrfs_block_rsv_release(fs_info, rsv, (u64)-1); - kfree(rsv); -} - -int btrfs_block_rsv_add(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, u64 num_bytes, - enum btrfs_reserve_flush_enum flush) -{ - int ret; - - if (num_bytes == 0) - return 0; - - ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); - if (!ret) - block_rsv_add_bytes(block_rsv, num_bytes, true); - - return ret; -} - -int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor) -{ - u64 num_bytes = 0; - int ret = -ENOSPC; - - if (!block_rsv) - return 0; - - spin_lock(&block_rsv->lock); - num_bytes = div_factor(block_rsv->size, min_factor); - if (block_rsv->reserved >= num_bytes) - ret = 0; - spin_unlock(&block_rsv->lock); - - return ret; -} - -int btrfs_block_rsv_refill(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, u64 min_reserved, - enum btrfs_reserve_flush_enum flush) -{ - u64 num_bytes = 0; - int ret = -ENOSPC; - - if (!block_rsv) - return 0; - - spin_lock(&block_rsv->lock); - num_bytes = min_reserved; - if (block_rsv->reserved >= num_bytes) - ret = 0; - else - num_bytes -= block_rsv->reserved; - spin_unlock(&block_rsv->lock); - - if (!ret) - return 0; - - ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); - if (!ret) { - block_rsv_add_bytes(block_rsv, num_bytes, false); - return 0; - } - - return ret; -} - -static u64 __btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *block_rsv, - u64 num_bytes, u64 *qgroup_to_release) -{ - struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; - struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv; - struct btrfs_block_rsv *target = delayed_rsv; - - if (target->full || target == block_rsv) - target = global_rsv; - - if (block_rsv->space_info != target->space_info) - target = NULL; - - return block_rsv_release_bytes(fs_info, block_rsv, target, num_bytes, - qgroup_to_release); -} - -void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *block_rsv, - u64 num_bytes) -{ - __btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL); -} - -/** - * btrfs_inode_rsv_release - release any excessive reservation. - * @inode - the inode we need to release from. - * @qgroup_free - free or convert qgroup meta. - * Unlike normal operation, qgroup meta reservation needs to know if we are - * freeing qgroup reservation or just converting it into per-trans. Normally - * @qgroup_free is true for error handling, and false for normal release. - * - * This is the same as btrfs_block_rsv_release, except that it handles the - * tracepoint for the reservation. - */ -static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free) -{ - struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct btrfs_block_rsv *block_rsv = &inode->block_rsv; - u64 released = 0; - u64 qgroup_to_release = 0; - - /* - * Since we statically set the block_rsv->size we just want to say we - * are releasing 0 bytes, and then we'll just get the reservation over - * the size free'd. 
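
block_rsv_release_bytes() above is the drain side of the same accounting: shrink the target size, skim off anything now held beyond it, top up a destination rsv (delayed refs or global) first, and only then hand the remainder back to the space info. A sketch reusing the struct block_rsv model from the earlier snippet, with qgroup bookkeeping omitted; note the kernel version returns the total amount skimmed, while this sketch returns the remainder for simplicity:

#include <stdbool.h>
#include <stdint.h>

struct block_rsv { uint64_t size, reserved; bool full; };	/* as above */

static uint64_t rsv_release(struct block_rsv *rsv, struct block_rsv *dest,
			    uint64_t num_bytes)
{
	if (num_bytes == UINT64_MAX)		/* (u64)-1: drain everything */
		num_bytes = rsv->size;
	rsv->size -= num_bytes;

	if (rsv->reserved >= rsv->size) {	/* skim what we now over-hold */
		num_bytes = rsv->reserved - rsv->size;
		rsv->reserved = rsv->size;
		rsv->full = true;
	} else {
		num_bytes = 0;
	}

	if (num_bytes && dest && !dest->full) {	/* top up the destination */
		uint64_t fill = dest->size - dest->reserved;

		if (fill > num_bytes)
			fill = num_bytes;
		dest->reserved += fill;
		if (dest->reserved >= dest->size)
			dest->full = true;
		num_bytes -= fill;
	}
	/* in the kernel the remainder goes to space_info_add_old_bytes() */
	return num_bytes;
}
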
- */ - released = __btrfs_block_rsv_release(fs_info, block_rsv, 0, - &qgroup_to_release); - if (released > 0) - trace_btrfs_space_reservation(fs_info, "delalloc", - btrfs_ino(inode), released, 0); - if (qgroup_free) - btrfs_qgroup_free_meta_prealloc(inode->root, qgroup_to_release); - else - btrfs_qgroup_convert_reserved_meta(inode->root, - qgroup_to_release); -} - -/** - * btrfs_delayed_refs_rsv_release - release a ref head's reservation. - * @fs_info - the fs_info for our fs. - * @nr - the number of items to drop. - * - * This drops the delayed ref head's count from the delayed refs rsv and frees - * any excess reservation we had. - */ -void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr) -{ - struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv; - struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; - u64 num_bytes = btrfs_calc_trans_metadata_size(fs_info, nr); - u64 released = 0; - - released = block_rsv_release_bytes(fs_info, block_rsv, global_rsv, - num_bytes, NULL); - if (released) - trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", - 0, released, 0); -} - -static void update_global_block_rsv(struct btrfs_fs_info *fs_info) -{ - struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; - struct btrfs_space_info *sinfo = block_rsv->space_info; - u64 num_bytes; - - /* - * The global block rsv is based on the size of the extent tree, the - * checksum tree and the root tree. If the fs is empty we want to set - * it to a minimal amount for safety. - */ - num_bytes = btrfs_root_used(&fs_info->extent_root->root_item) + - btrfs_root_used(&fs_info->csum_root->root_item) + - btrfs_root_used(&fs_info->tree_root->root_item); - num_bytes = max_t(u64, num_bytes, SZ_16M); - - spin_lock(&sinfo->lock); - spin_lock(&block_rsv->lock); - - block_rsv->size = min_t(u64, num_bytes, SZ_512M); - - if (block_rsv->reserved < block_rsv->size) { - num_bytes = btrfs_space_info_used(sinfo, true); - if (sinfo->total_bytes > num_bytes) { - num_bytes = sinfo->total_bytes - num_bytes; - num_bytes = min(num_bytes, - block_rsv->size - block_rsv->reserved); - block_rsv->reserved += num_bytes; - update_bytes_may_use(sinfo, num_bytes); - trace_btrfs_space_reservation(fs_info, "space_info", - sinfo->flags, num_bytes, - 1); - } - } else if (block_rsv->reserved > block_rsv->size) { - num_bytes = block_rsv->reserved - block_rsv->size; - update_bytes_may_use(sinfo, -num_bytes); - trace_btrfs_space_reservation(fs_info, "space_info", - sinfo->flags, num_bytes, 0); - block_rsv->reserved = block_rsv->size; - } - - if (block_rsv->reserved == block_rsv->size) - block_rsv->full = 1; - else - block_rsv->full = 0; - - spin_unlock(&block_rsv->lock); - spin_unlock(&sinfo->lock); -} - -static void init_global_block_rsv(struct btrfs_fs_info *fs_info) -{ - struct btrfs_space_info *space_info; - - space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); - fs_info->chunk_block_rsv.space_info = space_info; - - space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); - fs_info->global_block_rsv.space_info = space_info; - fs_info->trans_block_rsv.space_info = space_info; - fs_info->empty_block_rsv.space_info = space_info; - fs_info->delayed_block_rsv.space_info = space_info; - fs_info->delayed_refs_rsv.space_info = space_info; - - fs_info->extent_root->block_rsv = &fs_info->delayed_refs_rsv; - fs_info->csum_root->block_rsv = &fs_info->delayed_refs_rsv; - fs_info->dev_root->block_rsv = &fs_info->global_block_rsv; - fs_info->tree_root->block_rsv = 
&fs_info->global_block_rsv; - if (fs_info->quota_root) - fs_info->quota_root->block_rsv = &fs_info->global_block_rsv; - fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv; - - update_global_block_rsv(fs_info); -} - -static void release_global_block_rsv(struct btrfs_fs_info *fs_info) -{ - block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL, - (u64)-1, NULL); - WARN_ON(fs_info->trans_block_rsv.size > 0); - WARN_ON(fs_info->trans_block_rsv.reserved > 0); - WARN_ON(fs_info->chunk_block_rsv.size > 0); - WARN_ON(fs_info->chunk_block_rsv.reserved > 0); - WARN_ON(fs_info->delayed_block_rsv.size > 0); - WARN_ON(fs_info->delayed_block_rsv.reserved > 0); - WARN_ON(fs_info->delayed_refs_rsv.reserved > 0); - WARN_ON(fs_info->delayed_refs_rsv.size > 0); -} - -/* - * btrfs_update_delayed_refs_rsv - adjust the size of the delayed refs rsv - * @trans - the trans that may have generated delayed refs - * - * This is to be called anytime we may have adjusted trans->delayed_ref_updates, - * it'll calculate the additional size and add it to the delayed_refs_rsv. - */ -void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans) -{ - struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv; - u64 num_bytes; - - if (!trans->delayed_ref_updates) - return; - - num_bytes = btrfs_calc_trans_metadata_size(fs_info, - trans->delayed_ref_updates); - spin_lock(&delayed_rsv->lock); - delayed_rsv->size += num_bytes; - delayed_rsv->full = 0; - spin_unlock(&delayed_rsv->lock); - trans->delayed_ref_updates = 0; -} - -/* - * To be called after all the new block groups attached to the transaction - * handle have been created (btrfs_create_pending_block_groups()). - */ -void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans) -{ - struct btrfs_fs_info *fs_info = trans->fs_info; - - if (!trans->chunk_bytes_reserved) - return; - - WARN_ON_ONCE(!list_empty(&trans->new_bgs)); - - block_rsv_release_bytes(fs_info, &fs_info->chunk_block_rsv, NULL, - trans->chunk_bytes_reserved, NULL); - trans->chunk_bytes_reserved = 0; -} - -/* - * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation - * root: the root of the parent directory - * rsv: block reservation - * items: the number of items that we need do reservation - * use_global_rsv: allow fallback to the global block reservation - * - * This function is used to reserve the space for snapshot/subvolume - * creation and deletion. Those operations are different with the - * common file/directory operations, they change two fs/file trees - * and root tree, the number of items that the qgroup reserves is - * different with the free space reservation. So we can not use - * the space reservation mechanism in start_transaction(). 
- */ -int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, - struct btrfs_block_rsv *rsv, int items, - bool use_global_rsv) -{ - u64 qgroup_num_bytes = 0; - u64 num_bytes; - int ret; - struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; - - if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) { - /* One for parent inode, two for dir entries */ - qgroup_num_bytes = 3 * fs_info->nodesize; - ret = btrfs_qgroup_reserve_meta_prealloc(root, - qgroup_num_bytes, true); - if (ret) - return ret; - } - - num_bytes = btrfs_calc_trans_metadata_size(fs_info, items); - rsv->space_info = __find_space_info(fs_info, - BTRFS_BLOCK_GROUP_METADATA); - ret = btrfs_block_rsv_add(root, rsv, num_bytes, - BTRFS_RESERVE_FLUSH_ALL); - - if (ret == -ENOSPC && use_global_rsv) - ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, true); - - if (ret && qgroup_num_bytes) - btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes); - - return ret; -} - -void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *rsv) -{ - btrfs_block_rsv_release(fs_info, rsv, (u64)-1); -} - -static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info, - struct btrfs_inode *inode) -{ - struct btrfs_block_rsv *block_rsv = &inode->block_rsv; - u64 reserve_size = 0; - u64 qgroup_rsv_size = 0; - u64 csum_leaves; - unsigned outstanding_extents; - - lockdep_assert_held(&inode->lock); - outstanding_extents = inode->outstanding_extents; - if (outstanding_extents) - reserve_size = btrfs_calc_trans_metadata_size(fs_info, - outstanding_extents + 1); - csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, - inode->csum_bytes); - reserve_size += btrfs_calc_trans_metadata_size(fs_info, - csum_leaves); - /* - * For qgroup rsv, the calculation is very simple: - * account one nodesize for each outstanding extent - * - * This is overestimating in most cases. - */ - qgroup_rsv_size = (u64)outstanding_extents * fs_info->nodesize; - - spin_lock(&block_rsv->lock); - block_rsv->size = reserve_size; - block_rsv->qgroup_rsv_size = qgroup_rsv_size; - spin_unlock(&block_rsv->lock); -} - -static void calc_inode_reservations(struct btrfs_fs_info *fs_info, - u64 num_bytes, u64 *meta_reserve, - u64 *qgroup_reserve) -{ - u64 nr_extents = count_max_extents(num_bytes); - u64 csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, num_bytes); - - /* We add one for the inode update at finish ordered time */ - *meta_reserve = btrfs_calc_trans_metadata_size(fs_info, - nr_extents + csum_leaves + 1); - *qgroup_reserve = nr_extents * fs_info->nodesize; -} - -int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) -{ - struct btrfs_root *root = inode->root; - struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_block_rsv *block_rsv = &inode->block_rsv; - u64 meta_reserve, qgroup_reserve; - unsigned nr_extents; - enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL; - int ret = 0; - bool delalloc_lock = true; - - /* If we are a free space inode we need to not flush since we will be in - * the middle of a transaction commit. We also don't need the delalloc - * mutex since we won't race with anybody. We need this mostly to make - * lockdep shut its filthy mouth. - * - * If we have a transaction open (can happen if we call truncate_block - * from truncate), then we need FLUSH_LIMIT so we don't deadlock. 
- */ - if (btrfs_is_free_space_inode(inode)) { - flush = BTRFS_RESERVE_NO_FLUSH; - delalloc_lock = false; - } else { - if (current->journal_info) - flush = BTRFS_RESERVE_FLUSH_LIMIT; - - if (btrfs_transaction_in_commit(fs_info)) - schedule_timeout(1); - } - - if (delalloc_lock) - mutex_lock(&inode->delalloc_mutex); - - num_bytes = ALIGN(num_bytes, fs_info->sectorsize); - - /* - * We always want to do it this way, every other way is wrong and ends - * in tears. Pre-reserving the amount we are going to add will always - * be the right way, because otherwise if we have enough parallelism we - * could end up with thousands of inodes all holding little bits of - * reservations they were able to make previously and the only way to - * reclaim that space is to ENOSPC out the operations and clear - * everything out and try again, which is bad. This way we just - * over-reserve slightly, and clean up the mess when we are done. - */ - calc_inode_reservations(fs_info, num_bytes, &meta_reserve, - &qgroup_reserve); - ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true); - if (ret) - goto out_fail; - ret = reserve_metadata_bytes(root, block_rsv, meta_reserve, flush); - if (ret) - goto out_qgroup; - - /* - * Now we need to update our outstanding extents and csum bytes _first_ - * and then add the reservation to the block_rsv. This keeps us from - * racing with an ordered completion or some such that would think it - * needs to free the reservation we just made. - */ - spin_lock(&inode->lock); - nr_extents = count_max_extents(num_bytes); - btrfs_mod_outstanding_extents(inode, nr_extents); - inode->csum_bytes += num_bytes; - btrfs_calculate_inode_block_rsv_size(fs_info, inode); - spin_unlock(&inode->lock); - - /* Now we can safely add our space to our block rsv */ - block_rsv_add_bytes(block_rsv, meta_reserve, false); - trace_btrfs_space_reservation(root->fs_info, "delalloc", - btrfs_ino(inode), meta_reserve, 1); - - spin_lock(&block_rsv->lock); - block_rsv->qgroup_rsv_reserved += qgroup_reserve; - spin_unlock(&block_rsv->lock); - - if (delalloc_lock) - mutex_unlock(&inode->delalloc_mutex); - return 0; -out_qgroup: - btrfs_qgroup_free_meta_prealloc(root, qgroup_reserve); -out_fail: - btrfs_inode_rsv_release(inode, true); - if (delalloc_lock) - mutex_unlock(&inode->delalloc_mutex); - return ret; -} - -/** - * btrfs_delalloc_release_metadata - release a metadata reservation for an inode - * @inode: the inode to release the reservation for. - * @num_bytes: the number of bytes we are releasing. - * @qgroup_free: free qgroup reservation or convert it to per-trans reservation - * - * This will release the metadata reservation for an inode. This can be called - * once we complete IO for a given set of bytes to release their metadata - * reservations, or on error for the same reason. - */ -void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes, - bool qgroup_free) -{ - struct btrfs_fs_info *fs_info = inode->root->fs_info; - - num_bytes = ALIGN(num_bytes, fs_info->sectorsize); - spin_lock(&inode->lock); - inode->csum_bytes -= num_bytes; - btrfs_calculate_inode_block_rsv_size(fs_info, inode); - spin_unlock(&inode->lock); - - if (btrfs_is_testing(fs_info)) - return; - - btrfs_inode_rsv_release(inode, qgroup_free); -} - -/** - * btrfs_delalloc_release_extents - release our outstanding_extents - * @inode: the inode to balance the reservation for. 
- * @num_bytes: the number of bytes we originally reserved with - * @qgroup_free: do we need to free qgroup meta reservation or convert them. - * - * When we reserve space we increase outstanding_extents for the extents we may - * add. Once we've set the range as delalloc or created our ordered extents we - * have outstanding_extents to track the real usage, so we use this to free our - * temporarily tracked outstanding_extents. This _must_ be used in conjunction - * with btrfs_delalloc_reserve_metadata. - */ -void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes, - bool qgroup_free) -{ - struct btrfs_fs_info *fs_info = inode->root->fs_info; - unsigned num_extents; - - spin_lock(&inode->lock); - num_extents = count_max_extents(num_bytes); - btrfs_mod_outstanding_extents(inode, -num_extents); - btrfs_calculate_inode_block_rsv_size(fs_info, inode); - spin_unlock(&inode->lock); - - if (btrfs_is_testing(fs_info)) - return; - - btrfs_inode_rsv_release(inode, qgroup_free); -} - -/** - * btrfs_delalloc_reserve_space - reserve data and metadata space for - * delalloc - * @inode: inode we're writing to - * @start: start range we are writing to - * @len: how long the range we are writing to - * @reserved: mandatory parameter, record actually reserved qgroup ranges of - * current reservation. - * - * This will do the following things - * - * o reserve space in data space info for num bytes - * and reserve precious corresponding qgroup space - * (Done in check_data_free_space) - * - * o reserve space for metadata space, based on the number of outstanding - * extents and how much csums will be needed - * also reserve metadata space in a per root over-reserve method. - * o add to the inodes->delalloc_bytes - * o add it to the fs_info's delalloc inodes list. - * (Above 3 all done in delalloc_reserve_metadata) - * - * Return 0 for success - * Return <0 for error(-ENOSPC or -EQUOT) - */ -int btrfs_delalloc_reserve_space(struct inode *inode, - struct extent_changeset **reserved, u64 start, u64 len) -{ - int ret; - - ret = btrfs_check_data_free_space(inode, reserved, start, len); - if (ret < 0) - return ret; - ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len); - if (ret < 0) - btrfs_free_reserved_data_space(inode, *reserved, start, len); - return ret; -} - -/** - * btrfs_delalloc_release_space - release data and metadata space for delalloc - * @inode: inode we're releasing space for - * @start: start position of the space already reserved - * @len: the len of the space already reserved - * @release_bytes: the len of the space we consumed or didn't use - * - * This function will release the metadata space that was not used and will - * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes - * list if there are no delalloc bytes left. - * Also it will handle the qgroup reserved space. 
- */ -void btrfs_delalloc_release_space(struct inode *inode, - struct extent_changeset *reserved, - u64 start, u64 len, bool qgroup_free) -{ - btrfs_delalloc_release_metadata(BTRFS_I(inode), len, qgroup_free); - btrfs_free_reserved_data_space(inode, reserved, start, len); -} - static int update_block_group(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, int alloc) { @@ -6296,7 +4168,8 @@ static int update_block_group(struct btrfs_trans_handle *trans, old_val -= num_bytes; btrfs_set_block_group_used(&cache->item, old_val); cache->pinned += num_bytes; - update_bytes_pinned(cache->space_info, num_bytes); + btrfs_space_info_update_bytes_pinned(info, + cache->space_info, num_bytes); cache->space_info->bytes_used -= num_bytes; cache->space_info->disk_used -= num_bytes * factor; spin_unlock(&cache->lock); @@ -6371,7 +4244,8 @@ static int pin_down_extent(struct btrfs_block_group_cache *cache, spin_lock(&cache->space_info->lock); spin_lock(&cache->lock); cache->pinned += num_bytes; - update_bytes_pinned(cache->space_info, num_bytes); + btrfs_space_info_update_bytes_pinned(fs_info, cache->space_info, + num_bytes); if (reserved) { cache->reserved -= num_bytes; cache->space_info->bytes_reserved -= num_bytes; @@ -6580,7 +4454,8 @@ static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache, } else { cache->reserved += num_bytes; space_info->bytes_reserved += num_bytes; - update_bytes_may_use(space_info, -ram_bytes); + btrfs_space_info_update_bytes_may_use(cache->fs_info, + space_info, -ram_bytes); if (delalloc) cache->delalloc_bytes += num_bytes; } @@ -6646,7 +4521,7 @@ void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info) up_write(&fs_info->commit_root_sem); - update_global_block_rsv(fs_info); + btrfs_update_global_block_rsv(fs_info); } /* @@ -6736,7 +4611,7 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, spin_lock(&space_info->lock); spin_lock(&cache->lock); cache->pinned -= len; - update_bytes_pinned(space_info, -len); + btrfs_space_info_update_bytes_pinned(fs_info, space_info, -len); trace_btrfs_space_reservation(fs_info, "pinned", space_info->flags, len, 0); @@ -6757,7 +4632,8 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, to_add = min(len, global_rsv->size - global_rsv->reserved); global_rsv->reserved += to_add; - update_bytes_may_use(space_info, to_add); + btrfs_space_info_update_bytes_may_use(fs_info, + space_info, to_add); if (global_rsv->reserved >= global_rsv->size) global_rsv->full = 1; trace_btrfs_space_reservation(fs_info, @@ -6769,8 +4645,8 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, spin_unlock(&global_rsv->lock); /* Add to any tickets we may have */ if (len) - space_info_add_new_bytes(fs_info, space_info, - len); + btrfs_space_info_add_new_bytes(fs_info, + space_info, len); } spin_unlock(&space_info->lock); } @@ -7191,7 +5067,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, } out: if (pin) - add_pinned_bytes(fs_info, &generic_ref, 1); + add_pinned_bytes(fs_info, &generic_ref); if (last_ref) { /* @@ -7239,7 +5115,7 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref) btrfs_ref_tree_mod(fs_info, ref); if (ret == 0 && old_ref_mod >= 0 && new_ref_mod < 0) - add_pinned_bytes(fs_info, ref, 1); + add_pinned_bytes(fs_info, ref); return ret; } @@ -7292,10 +5168,10 @@ wait_block_group_cache_done(struct btrfs_block_group_cache *cache) } enum btrfs_loop_type { - LOOP_CACHING_NOWAIT = 0, - LOOP_CACHING_WAIT = 1, - LOOP_ALLOC_CHUNK = 2, - LOOP_NO_EMPTY_SIZE = 3, + 
LOOP_CACHING_NOWAIT, + LOOP_CACHING_WAIT, + LOOP_ALLOC_CHUNK, + LOOP_NO_EMPTY_SIZE, }; static inline void @@ -7661,8 +5537,8 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, return ret; } - ret = do_chunk_alloc(trans, ffe_ctl->flags, - CHUNK_ALLOC_FORCE); + ret = btrfs_chunk_alloc(trans, ffe_ctl->flags, + CHUNK_ALLOC_FORCE); /* * If we can't allocate a new chunk we've already looped @@ -7758,7 +5634,7 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info, trace_find_free_extent(fs_info, num_bytes, empty_size, flags); - space_info = __find_space_info(fs_info, flags); + space_info = btrfs_find_space_info(fs_info, flags); if (!space_info) { btrfs_err(fs_info, "No space info for %llu", flags); return -ENOSPC; @@ -7863,9 +5739,8 @@ search: */ if (!block_group_bits(block_group, flags)) { u64 extra = BTRFS_BLOCK_GROUP_DUP | - BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID5 | - BTRFS_BLOCK_GROUP_RAID6 | + BTRFS_BLOCK_GROUP_RAID1_MASK | + BTRFS_BLOCK_GROUP_RAID56_MASK | BTRFS_BLOCK_GROUP_RAID10; /* @@ -7984,60 +5859,6 @@ loop: return ret; } -#define DUMP_BLOCK_RSV(fs_info, rsv_name) \ -do { \ - struct btrfs_block_rsv *__rsv = &(fs_info)->rsv_name; \ - spin_lock(&__rsv->lock); \ - btrfs_info(fs_info, #rsv_name ": size %llu reserved %llu", \ - __rsv->size, __rsv->reserved); \ - spin_unlock(&__rsv->lock); \ -} while (0) - -static void dump_space_info(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *info, u64 bytes, - int dump_block_groups) -{ - struct btrfs_block_group_cache *cache; - int index = 0; - - spin_lock(&info->lock); - btrfs_info(fs_info, "space_info %llu has %llu free, is %sfull", - info->flags, - info->total_bytes - btrfs_space_info_used(info, true), - info->full ? "" : "not "); - btrfs_info(fs_info, - "space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu", - info->total_bytes, info->bytes_used, info->bytes_pinned, - info->bytes_reserved, info->bytes_may_use, - info->bytes_readonly); - spin_unlock(&info->lock); - - DUMP_BLOCK_RSV(fs_info, global_block_rsv); - DUMP_BLOCK_RSV(fs_info, trans_block_rsv); - DUMP_BLOCK_RSV(fs_info, chunk_block_rsv); - DUMP_BLOCK_RSV(fs_info, delayed_block_rsv); - DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv); - - if (!dump_block_groups) - return; - - down_read(&info->groups_sem); -again: - list_for_each_entry(cache, &info->block_groups[index], list) { - spin_lock(&cache->lock); - btrfs_info(fs_info, - "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s", - cache->key.objectid, cache->key.offset, - btrfs_block_group_used(&cache->item), cache->pinned, - cache->reserved, cache->ro ? "[readonly]" : ""); - btrfs_dump_free_space(cache, bytes); - spin_unlock(&cache->lock); - } - if (++index < BTRFS_NR_RAID_TYPES) - goto again; - up_read(&info->groups_sem); -} - /* * btrfs_reserve_extent - entry point to the extent allocator. Tries to find a * hole that is at least as big as @num_bytes. 
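The dump helpers deleted above (DUMP_BLOCK_RSV and dump_space_info) survive in the new space-info.c as btrfs_dump_space_info(), which the later hunks call. The "free" figure they print is simply total_bytes minus the sum of every accounted category, as the removed btrfs_info() line shows via btrfs_space_info_used(info, true). Below is a minimal user-space sketch of that accounting identity; the struct and helper are simplified stand-ins for btrfs_space_info and btrfs_space_info_used(), and all names here are illustrative, not the kernel's:

#include <stdio.h>

struct space_info_model {
	unsigned long long total_bytes;
	unsigned long long bytes_used;      /* allocated to extents */
	unsigned long long bytes_pinned;    /* freed, reclaimed at commit */
	unsigned long long bytes_reserved;  /* in-flight allocations */
	unsigned long long bytes_may_use;   /* delalloc etc. reservations */
	unsigned long long bytes_readonly;
};

/* Stand-in for btrfs_space_info_used(): sum all accounted categories,
 * optionally including the speculative bytes_may_use reservations. */
static unsigned long long
model_space_info_used(const struct space_info_model *s, int may_use_included)
{
	return s->bytes_used + s->bytes_reserved + s->bytes_pinned +
	       s->bytes_readonly + (may_use_included ? s->bytes_may_use : 0);
}

int main(void)
{
	struct space_info_model s = {
		.total_bytes = 8ULL << 30, .bytes_used = 5ULL << 30,
		.bytes_pinned = 1ULL << 28, .bytes_reserved = 1ULL << 27,
		.bytes_may_use = 1ULL << 29, .bytes_readonly = 0,
	};

	/* Mirrors the removed "space_info %llu has %llu free" message. */
	printf("free=%llu\n", s.total_bytes - model_space_info_used(&s, 1));
	return 0;
}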
@@ -8113,12 +5934,13 @@ again: } else if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { struct btrfs_space_info *sinfo; - sinfo = __find_space_info(fs_info, flags); + sinfo = btrfs_find_space_info(fs_info, flags); btrfs_err(fs_info, "allocation failed flags %llu, wanted %llu", flags, num_bytes); if (sinfo) - dump_space_info(fs_info, sinfo, num_bytes, 1); + btrfs_dump_space_info(fs_info, sinfo, + num_bytes, 1); } } @@ -8456,73 +6278,6 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, return buf; } -static struct btrfs_block_rsv * -use_block_rsv(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u32 blocksize) -{ - struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_block_rsv *block_rsv; - struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; - int ret; - bool global_updated = false; - - block_rsv = get_block_rsv(trans, root); - - if (unlikely(block_rsv->size == 0)) - goto try_reserve; -again: - ret = block_rsv_use_bytes(block_rsv, blocksize); - if (!ret) - return block_rsv; - - if (block_rsv->failfast) - return ERR_PTR(ret); - - if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) { - global_updated = true; - update_global_block_rsv(fs_info); - goto again; - } - - /* - * The global reserve still exists to save us from ourselves, so don't - * warn_on if we are short on our delayed refs reserve. - */ - if (block_rsv->type != BTRFS_BLOCK_RSV_DELREFS && - btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { - static DEFINE_RATELIMIT_STATE(_rs, - DEFAULT_RATELIMIT_INTERVAL * 10, - /*DEFAULT_RATELIMIT_BURST*/ 1); - if (__ratelimit(&_rs)) - WARN(1, KERN_DEBUG - "BTRFS: block rsv returned %d\n", ret); - } -try_reserve: - ret = reserve_metadata_bytes(root, block_rsv, blocksize, - BTRFS_RESERVE_NO_FLUSH); - if (!ret) - return block_rsv; - /* - * If we couldn't reserve metadata bytes try and use some from - * the global reserve if its space type is the same as the global - * reservation. - */ - if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL && - block_rsv->space_info == global_rsv->space_info) { - ret = block_rsv_use_bytes(global_rsv, blocksize); - if (!ret) - return global_rsv; - } - return ERR_PTR(ret); -} - -static void unuse_block_rsv(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *block_rsv, u32 blocksize) -{ - block_rsv_add_bytes(block_rsv, blocksize, false); - block_rsv_release_bytes(fs_info, block_rsv, NULL, 0, NULL); -} - /* * finds a free extent and does all the dirty work required for allocation * returns the tree buffer or an ERR_PTR on error. 
@@ -8555,7 +6310,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, } #endif - block_rsv = use_block_rsv(trans, root, blocksize); + block_rsv = btrfs_use_block_rsv(trans, root, blocksize); if (IS_ERR(block_rsv)) return ERR_CAST(block_rsv); @@ -8613,7 +6368,7 @@ out_free_buf: out_free_reserved: btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0); out_unuse: - unuse_block_rsv(fs_info, block_rsv, blocksize); + btrfs_unuse_block_rsv(fs_info, block_rsv, blocksize); return ERR_PTR(ret); } @@ -9552,9 +7307,8 @@ static u64 update_block_group_flags(struct btrfs_fs_info *fs_info, u64 flags) num_devices = fs_info->fs_devices->rw_devices; - stripped = BTRFS_BLOCK_GROUP_RAID0 | - BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 | - BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; + stripped = BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID56_MASK | + BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10; if (num_devices == 1) { stripped |= BTRFS_BLOCK_GROUP_DUP; @@ -9565,7 +7319,7 @@ static u64 update_block_group_flags(struct btrfs_fs_info *fs_info, u64 flags) return stripped; /* turn mirroring into duplication */ - if (flags & (BTRFS_BLOCK_GROUP_RAID1 | + if (flags & (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10)) return stripped | BTRFS_BLOCK_GROUP_DUP; } else { @@ -9636,7 +7390,7 @@ out: btrfs_info(cache->fs_info, "sinfo_used=%llu bg_num_bytes=%llu min_allocable=%llu", sinfo_used, num_bytes, min_allocable_bytes); - dump_space_info(cache->fs_info, cache->space_info, 0, 0); + btrfs_dump_space_info(cache->fs_info, cache->space_info, 0, 0); } return ret; } @@ -9678,8 +7432,7 @@ again: */ alloc_flags = update_block_group_flags(fs_info, cache->flags); if (alloc_flags != cache->flags) { - ret = do_chunk_alloc(trans, alloc_flags, - CHUNK_ALLOC_FORCE); + ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); /* * ENOSPC is allowed here, we may have enough space * already allocated at the new raid level to @@ -9695,7 +7448,7 @@ again: if (!ret) goto out; alloc_flags = get_alloc_profile(fs_info, cache->space_info->flags); - ret = do_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); + ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); if (ret < 0) goto out; ret = inc_block_group_ro(cache, 0); @@ -9716,7 +7469,7 @@ int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type) { u64 alloc_flags = get_alloc_profile(trans->fs_info, type); - return do_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); + return btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); } /* @@ -9949,7 +7702,7 @@ static int find_first_block_group(struct btrfs_fs_info *fs_info, struct extent_map_tree *em_tree; struct extent_map *em; - em_tree = &root->fs_info->mapping_tree.map_tree; + em_tree = &root->fs_info->mapping_tree; read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, found_key.objectid, found_key.offset); @@ -10102,7 +7855,7 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) */ synchronize_rcu(); - release_global_block_rsv(info); + btrfs_release_global_block_rsv(info); while (!list_empty(&info->space_info)) { int i; @@ -10118,7 +7871,7 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) if (WARN_ON(space_info->bytes_pinned > 0 || space_info->bytes_reserved > 0 || space_info->bytes_may_use > 0)) - dump_space_info(info, space_info, 0, 0); + btrfs_dump_space_info(info, space_info, 0, 0); list_del(&space_info->list); for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { struct kobject *kobj; @@ -10141,7 +7894,6 @@ void 
btrfs_add_raid_kobjects(struct btrfs_fs_info *fs_info) struct btrfs_space_info *space_info; struct raid_kobject *rkobj; LIST_HEAD(list); - int index; int ret = 0; spin_lock(&fs_info->pending_raid_kobjs_lock); @@ -10149,11 +7901,10 @@ void btrfs_add_raid_kobjects(struct btrfs_fs_info *fs_info) spin_unlock(&fs_info->pending_raid_kobjs_lock); list_for_each_entry(rkobj, &list, list) { - space_info = __find_space_info(fs_info, rkobj->flags); - index = btrfs_bg_flags_to_raid_index(rkobj->flags); + space_info = btrfs_find_space_info(fs_info, rkobj->flags); ret = kobject_add(&rkobj->kobj, &space_info->kobj, - "%s", get_raid_name(index)); + "%s", btrfs_bg_type_to_raid_name(rkobj->flags)); if (ret) { kobject_put(&rkobj->kobj); break; @@ -10243,21 +7994,21 @@ btrfs_create_block_group_cache(struct btrfs_fs_info *fs_info, */ static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info) { - struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; + struct extent_map_tree *map_tree = &fs_info->mapping_tree; struct extent_map *em; struct btrfs_block_group_cache *bg; u64 start = 0; int ret = 0; while (1) { - read_lock(&map_tree->map_tree.lock); + read_lock(&map_tree->lock); /* * lookup_extent_mapping will return the first extent map * intersecting the range, so setting @len to 1 is enough to * get the first chunk. */ - em = lookup_extent_mapping(&map_tree->map_tree, start, 1); - read_unlock(&map_tree->map_tree.lock); + em = lookup_extent_mapping(map_tree, start, 1); + read_unlock(&map_tree->lock); if (!em) break; @@ -10417,9 +8168,9 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) } trace_btrfs_add_block_group(info, cache, 0); - update_space_info(info, cache->flags, found_key.offset, - btrfs_block_group_used(&cache->item), - cache->bytes_super, &space_info); + btrfs_update_space_info(info, cache->flags, found_key.offset, + btrfs_block_group_used(&cache->item), + cache->bytes_super, &space_info); cache->space_info = space_info; @@ -10437,9 +8188,8 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) list_for_each_entry_rcu(space_info, &info->space_info, list) { if (!(get_alloc_profile(info, space_info->flags) & (BTRFS_BLOCK_GROUP_RAID10 | - BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID5 | - BTRFS_BLOCK_GROUP_RAID6 | + BTRFS_BLOCK_GROUP_RAID1_MASK | + BTRFS_BLOCK_GROUP_RAID56_MASK | BTRFS_BLOCK_GROUP_DUP))) continue; /* @@ -10457,7 +8207,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) } btrfs_add_raid_kobjects(info); - init_global_block_rsv(info); + btrfs_init_global_block_rsv(info); ret = check_chunk_block_group_mappings(info); error: btrfs_free_path(path); @@ -10554,7 +8304,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used, * assigned to our block group. We want our bg to be added to the rbtree * with its ->space_info set. */ - cache->space_info = __find_space_info(fs_info, cache->flags); + cache->space_info = btrfs_find_space_info(fs_info, cache->flags); ASSERT(cache->space_info); ret = btrfs_add_block_group_cache(fs_info, cache); @@ -10569,9 +8319,9 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used, * the rbtree, update the space info's counters. 
*/ trace_btrfs_add_block_group(fs_info, cache, 1); - update_space_info(fs_info, cache->flags, size, bytes_used, + btrfs_update_space_info(fs_info, cache->flags, size, bytes_used, cache->bytes_super, &cache->space_info); - update_global_block_rsv(fs_info); + btrfs_update_global_block_rsv(fs_info); link_block_group(cache); @@ -10598,6 +8348,35 @@ static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) write_sequnlock(&fs_info->profiles_lock); } +/* + * Clear incompat bits for the following feature(s): + * + * - RAID56 - in case there's neither RAID5 nor RAID6 profile block group + * in the whole filesystem + */ +static void clear_incompat_bg_bits(struct btrfs_fs_info *fs_info, u64 flags) +{ + if (flags & BTRFS_BLOCK_GROUP_RAID56_MASK) { + struct list_head *head = &fs_info->space_info; + struct btrfs_space_info *sinfo; + + list_for_each_entry_rcu(sinfo, head, list) { + bool found = false; + + down_read(&sinfo->groups_sem); + if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID5])) + found = true; + if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID6])) + found = true; + up_read(&sinfo->groups_sem); + + if (found) + return; + } + btrfs_clear_fs_incompat(fs_info, RAID56); + } +} + int btrfs_remove_block_group(struct btrfs_trans_handle *trans, u64 group_start, struct extent_map *em) { @@ -10744,6 +8523,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, clear_avail_alloc_bits(fs_info, block_group->flags); } up_write(&block_group->space_info->groups_sem); + clear_incompat_bg_bits(fs_info, block_group->flags); if (kobj) { kobject_del(kobj); kobject_put(kobj); @@ -10853,7 +8633,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, if (remove_em) { struct extent_map_tree *em_tree; - em_tree = &fs_info->mapping_tree.map_tree; + em_tree = &fs_info->mapping_tree; write_lock(&em_tree->lock); remove_extent_mapping(em_tree, em); write_unlock(&em_tree->lock); @@ -10871,7 +8651,7 @@ struct btrfs_trans_handle * btrfs_start_trans_remove_block_group(struct btrfs_fs_info *fs_info, const u64 chunk_offset) { - struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree; + struct extent_map_tree *em_tree = &fs_info->mapping_tree; struct extent_map *em; struct map_lookup *map; unsigned int num_items; @@ -11020,7 +8800,8 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) spin_lock(&space_info->lock); spin_lock(&block_group->lock); - update_bytes_pinned(space_info, -block_group->pinned); + btrfs_space_info_update_bytes_pinned(fs_info, space_info, + -block_group->pinned); space_info->bytes_readonly += block_group->pinned; percpu_counter_add_batch(&space_info->total_bytes_pinned, -block_group->pinned, @@ -11076,43 +8857,6 @@ next: spin_unlock(&fs_info->unused_bgs_lock); } -int btrfs_init_space_info(struct btrfs_fs_info *fs_info) -{ - struct btrfs_super_block *disk_super; - u64 features; - u64 flags; - int mixed = 0; - int ret; - - disk_super = fs_info->super_copy; - if (!btrfs_super_root(disk_super)) - return -EINVAL; - - features = btrfs_super_incompat_flags(disk_super); - if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) - mixed = 1; - - flags = BTRFS_BLOCK_GROUP_SYSTEM; - ret = create_space_info(fs_info, flags); - if (ret) - goto out; - - if (mixed) { - flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA; - ret = create_space_info(fs_info, flags); - } else { - flags = BTRFS_BLOCK_GROUP_METADATA; - ret = create_space_info(fs_info, flags); - if (ret) - goto out; - - flags = BTRFS_BLOCK_GROUP_DATA; - ret = create_space_info(fs_info, 
flags); - } -out: - return ret; -} - int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end) { @@ -11171,12 +8915,17 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed) find_first_clear_extent_bit(&device->alloc_state, start, &start, &end, CHUNK_TRIMMED | CHUNK_ALLOCATED); + + /* Ensure we skip the reserved area in the first 1M */ + start = max_t(u64, start, SZ_1M); + /* * If find_first_clear_extent_bit find a range that spans the * end of the device it will set end to -1, in this case it's up * to the caller to trim the value to the size of the device. */ end = min(end, device->total_bytes - 1); + len = end - start + 1; /* We didn't find any extents */ diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index db337e53aab3..1ff438fd5bc2 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -359,6 +359,24 @@ do_insert: return NULL; } +/** + * __etree_search - searche @tree for an entry that contains @offset. Such + * entry would have entry->start <= offset && entry->end >= offset. + * + * @tree - the tree to search + * @offset - offset that should fall within an entry in @tree + * @next_ret - pointer to the first entry whose range ends after @offset + * @prev - pointer to the first entry whose range begins before @offset + * @p_ret - pointer where new node should be anchored (used when inserting an + * entry in the tree) + * @parent_ret - points to entry which would have been the parent of the entry, + * containing @offset + * + * This function returns a pointer to the entry that contains @offset byte + * address. If no such entry exists, then NULL is returned and the other + * pointer arguments to the function are filled, otherwise the found entry is + * returned and other pointers are left untouched. + */ static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset, struct rb_node **next_ret, struct rb_node **prev_ret, @@ -504,9 +522,11 @@ static int insert_state(struct extent_io_tree *tree, { struct rb_node *node; - if (end < start) - WARN(1, KERN_ERR "BTRFS: end < start %llu %llu\n", - end, start); + if (end < start) { + btrfs_err(tree->fs_info, + "insert state: end < start %llu %llu", end, start); + WARN_ON(1); + } state->start = start; state->end = end; @@ -516,7 +536,8 @@ static int insert_state(struct extent_io_tree *tree, if (node) { struct extent_state *found; found = rb_entry(node, struct extent_state, rb_node); - pr_err("BTRFS: found node %llu %llu on insert of %llu %llu\n", + btrfs_err(tree->fs_info, + "found node %llu %llu on insert of %llu %llu", found->start, found->end, start, end); return -EEXIST; } @@ -1537,8 +1558,8 @@ out: } /** - * find_first_clear_extent_bit - finds the first range that has @bits not set - * and that starts after @start + * find_first_clear_extent_bit - find the first range that has @bits not set. + * This range could start before @start. 
* * @tree - the tree to search * @start - the offset at/after which the found extent should start @@ -1578,12 +1599,52 @@ void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start, goto out; } } + /* + * At this point 'node' either contains 'start' or start is + * before 'node' + */ state = rb_entry(node, struct extent_state, rb_node); - if (in_range(start, state->start, state->end - state->start + 1) && - (state->state & bits)) { - start = state->end + 1; + + if (in_range(start, state->start, state->end - state->start + 1)) { + if (state->state & bits) { + /* + * |--range with bits sets--| + * | + * start + */ + start = state->end + 1; + } else { + /* + * 'start' falls within a range that doesn't + * have the bits set, so take its start as + * the beginning of the desired range + * + * |--range with bits cleared----| + * | + * start + */ + *start_ret = state->start; + break; + } } else { - *start_ret = start; + /* + * |---prev range---|---hole/unset---|---node range---| + * | + * start + * + * or + * + * |---hole/unset--||--first node--| + * 0 | + * start + */ + if (prev) { + state = rb_entry(prev, struct extent_state, + rb_node); + *start_ret = state->end + 1; + } else { + *start_ret = 0; + } break; } } @@ -1719,10 +1780,10 @@ static noinline int lock_delalloc_pages(struct inode *inode, */ EXPORT_FOR_TESTS noinline_for_stack bool find_lock_delalloc_range(struct inode *inode, - struct extent_io_tree *tree, struct page *locked_page, u64 *start, u64 *end) { + struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; u64 max_bytes = BTRFS_MAX_EXTENT_SIZE; u64 delalloc_start; u64 delalloc_end; @@ -2800,12 +2861,11 @@ static inline void btrfs_io_bio_init(struct btrfs_io_bio *btrfs_bio) * never fail. We're returning a bio right now but you can call btrfs_io_bio * for the appropriate container_of magic */ -struct bio *btrfs_bio_alloc(struct block_device *bdev, u64 first_byte) +struct bio *btrfs_bio_alloc(u64 first_byte) { struct bio *bio; bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, &btrfs_bioset); - bio_set_dev(bio, bdev); bio->bi_iter.bi_sector = first_byte >> 9; btrfs_io_bio_init(btrfs_io_bio(bio)); return bio; @@ -2911,12 +2971,13 @@ static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree, bio = NULL; } else { if (wbc) - wbc_account_io(wbc, page, page_size); + wbc_account_cgroup_owner(wbc, page, page_size); return 0; } } - bio = btrfs_bio_alloc(bdev, offset); + bio = btrfs_bio_alloc(offset); + bio_set_dev(bio, bdev); bio_add_page(bio, page, page_size, pg_offset); bio->bi_end_io = end_io_func; bio->bi_private = tree; @@ -2924,7 +2985,7 @@ static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree, bio->bi_opf = opf; if (wbc) { wbc_init_bio(wbc, bio); - wbc_account_io(wbc, page, page_size); + wbc_account_cgroup_owner(wbc, page, page_size); } *bio_ret = bio; @@ -3204,21 +3265,10 @@ static inline void contiguous_readpages(struct extent_io_tree *tree, unsigned long *bio_flags, u64 *prev_em_start) { - struct inode *inode; - struct btrfs_ordered_extent *ordered; + struct btrfs_inode *inode = BTRFS_I(pages[0]->mapping->host); int index; - inode = pages[0]->mapping->host; - while (1) { - lock_extent(tree, start, end); - ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), start, - end - start + 1); - if (!ordered) - break; - unlock_extent(tree, start, end); - btrfs_start_ordered_extent(inode, ordered, 1); - btrfs_put_ordered_extent(ordered); - } + btrfs_lock_and_flush_ordered_range(tree, inode, start, end, NULL); for (index = 0; index < nr_pages; 
index++) { __do_readpage(tree, pages[index], btrfs_get_extent, em_cached, @@ -3234,22 +3284,12 @@ static int __extent_read_full_page(struct extent_io_tree *tree, unsigned long *bio_flags, unsigned int read_flags) { - struct inode *inode = page->mapping->host; - struct btrfs_ordered_extent *ordered; + struct btrfs_inode *inode = BTRFS_I(page->mapping->host); u64 start = page_offset(page); u64 end = start + PAGE_SIZE - 1; int ret; - while (1) { - lock_extent(tree, start, end); - ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), start, - PAGE_SIZE); - if (!ordered) - break; - unlock_extent(tree, start, end); - btrfs_start_ordered_extent(inode, ordered, 1); - btrfs_put_ordered_extent(ordered); - } + btrfs_lock_and_flush_ordered_range(tree, inode, start, end, NULL); ret = __do_readpage(tree, page, get_extent, NULL, bio, mirror_num, bio_flags, read_flags, NULL); @@ -3290,7 +3330,6 @@ static noinline_for_stack int writepage_delalloc(struct inode *inode, struct page *page, struct writeback_control *wbc, u64 delalloc_start, unsigned long *nr_written) { - struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; u64 page_end = delalloc_start + PAGE_SIZE - 1; bool found; u64 delalloc_to_write = 0; @@ -3300,8 +3339,7 @@ static noinline_for_stack int writepage_delalloc(struct inode *inode, while (delalloc_end < page_end) { - found = find_lock_delalloc_range(inode, tree, - page, + found = find_lock_delalloc_range(inode, page, &delalloc_start, &delalloc_end); if (!found) { @@ -3310,7 +3348,6 @@ static noinline_for_stack int writepage_delalloc(struct inode *inode, } ret = btrfs_run_delalloc_range(inode, page, delalloc_start, delalloc_end, &page_started, nr_written, wbc); - /* File system has been set read-only */ if (ret) { SetPageError(page); /* @@ -4542,6 +4579,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, struct btrfs_path *path; struct btrfs_root *root = BTRFS_I(inode)->root; struct fiemap_cache cache = { 0 }; + struct ulist *roots; + struct ulist *tmp_ulist; int end = 0; u64 em_start = 0; u64 em_len = 0; @@ -4555,6 +4594,13 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, return -ENOMEM; path->leave_spinning = 1; + roots = ulist_alloc(GFP_KERNEL); + tmp_ulist = ulist_alloc(GFP_KERNEL); + if (!roots || !tmp_ulist) { + ret = -ENOMEM; + goto out_free_ulist; + } + start = round_down(start, btrfs_inode_sectorsize(inode)); len = round_up(max, btrfs_inode_sectorsize(inode)) - start; @@ -4565,8 +4611,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(BTRFS_I(inode)), -1, 0); if (ret < 0) { - btrfs_free_path(path); - return ret; + goto out_free_ulist; } else { WARN_ON(!ret); if (ret == 1) @@ -4675,7 +4720,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, */ ret = btrfs_check_shared(root, btrfs_ino(BTRFS_I(inode)), - bytenr); + bytenr, roots, tmp_ulist); if (ret < 0) goto out_free; if (ret) @@ -4718,9 +4763,13 @@ out_free: ret = emit_last_fiemap_cache(fieinfo, &cache); free_extent_map(em); out: - btrfs_free_path(path); unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len - 1, &cached_state); + +out_free_ulist: + btrfs_free_path(path); + ulist_free(roots); + ulist_free(tmp_ulist); return ret; } @@ -4808,7 +4857,7 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, eb->bflags = 0; rwlock_init(&eb->lock); atomic_set(&eb->blocking_readers, 0); - atomic_set(&eb->blocking_writers, 0); + eb->blocking_writers = 
0; eb->lock_nested = false; init_waitqueue_head(&eb->write_lock_wq); init_waitqueue_head(&eb->read_lock_wq); @@ -4827,10 +4876,10 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, BUG_ON(len > MAX_INLINE_EXTENT_BUFFER_SIZE); #ifdef CONFIG_BTRFS_DEBUG - atomic_set(&eb->spinning_writers, 0); + eb->spinning_writers = 0; atomic_set(&eb->spinning_readers, 0); atomic_set(&eb->read_locks, 0); - atomic_set(&eb->write_locks, 0); + eb->write_locks = 0; #endif return eb; diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index aa18a16a6ed7..401423b16976 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -167,7 +167,7 @@ struct extent_buffer { struct rcu_head rcu_head; pid_t lock_owner; - atomic_t blocking_writers; + int blocking_writers; atomic_t blocking_readers; bool lock_nested; /* >= 0 if eb belongs to a log tree, -1 otherwise */ @@ -187,10 +187,10 @@ struct extent_buffer { wait_queue_head_t read_lock_wq; struct page *pages[INLINE_EXTENT_BUFFER_PAGES]; #ifdef CONFIG_BTRFS_DEBUG - atomic_t spinning_writers; + int spinning_writers; atomic_t spinning_readers; atomic_t read_locks; - atomic_t write_locks; + int write_locks; struct list_head leak_list; #endif }; @@ -497,7 +497,7 @@ void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, u64 delalloc_end, struct page *locked_page, unsigned bits_to_clear, unsigned long page_ops); -struct bio *btrfs_bio_alloc(struct block_device *bdev, u64 first_byte); +struct bio *btrfs_bio_alloc(u64 first_byte); struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs); struct bio *btrfs_bio_clone(struct bio *bio); struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size); @@ -549,7 +549,7 @@ int free_io_failure(struct extent_io_tree *failure_tree, struct extent_io_tree *io_tree, struct io_failure_record *rec); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS -bool find_lock_delalloc_range(struct inode *inode, struct extent_io_tree *tree, +bool find_lock_delalloc_range(struct inode *inode, struct page *locked_page, u64 *start, u64 *end); #endif diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index d431ea8198e4..1a599f50837b 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -8,6 +8,7 @@ #include <linux/pagemap.h> #include <linux/highmem.h> #include <linux/sched/mm.h> +#include <crypto/hash.h> #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -22,9 +23,13 @@ #define MAX_CSUM_ITEMS(r, size) (min_t(u32, __MAX_CSUM_ITEMS(r, size), \ PAGE_SIZE)) -#define MAX_ORDERED_SUM_BYTES(fs_info) ((PAGE_SIZE - \ - sizeof(struct btrfs_ordered_sum)) / \ - sizeof(u32) * (fs_info)->sectorsize) +static inline u32 max_ordered_sum_bytes(struct btrfs_fs_info *fs_info, + u16 csum_size) +{ + u32 ncsums = (PAGE_SIZE - sizeof(struct btrfs_ordered_sum)) / csum_size; + + return ncsums * fs_info->sectorsize; +} int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -144,7 +149,7 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, } static blk_status_t __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, - u64 logical_offset, u32 *dst, int dio) + u64 logical_offset, u8 *dst, int dio) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct bio_vec bvec; @@ -182,7 +187,7 @@ static blk_status_t __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio } csum = btrfs_bio->csum; } else { - csum = (u8 *)dst; + csum = dst; } if (bio->bi_iter.bi_size > PAGE_SIZE * 8) @@ -211,7 +216,7 @@ static blk_status_t __btrfs_lookup_bio_sums(struct inode 
*inode, struct bio *bio if (!dio) offset = page_offset(bvec.bv_page) + bvec.bv_offset; count = btrfs_find_ordered_sum(inode, offset, disk_bytenr, - (u32 *)csum, nblocks); + csum, nblocks); if (count) goto found; @@ -283,7 +288,8 @@ next: return 0; } -blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u32 *dst) +blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, + u8 *dst) { return __btrfs_lookup_bio_sums(inode, bio, 0, dst, 0); } @@ -374,7 +380,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, struct btrfs_csum_item); while (start < csum_end) { size = min_t(size_t, csum_end - start, - MAX_ORDERED_SUM_BYTES(fs_info)); + max_ordered_sum_bytes(fs_info, csum_size)); sums = kzalloc(btrfs_ordered_sum_size(fs_info, size), GFP_NOFS); if (!sums) { @@ -427,6 +433,7 @@ blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio, u64 file_start, int contig) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); struct btrfs_ordered_sum *sums; struct btrfs_ordered_extent *ordered = NULL; char *data; @@ -439,6 +446,7 @@ blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio, int i; u64 offset; unsigned nofs_flag; + const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); nofs_flag = memalloc_nofs_save(); sums = kvzalloc(btrfs_ordered_sum_size(fs_info, bio->bi_iter.bi_size), @@ -459,6 +467,8 @@ blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio, sums->bytenr = (u64)bio->bi_iter.bi_sector << 9; index = 0; + shash->tfm = fs_info->csum_shash; + bio_for_each_segment(bvec, bio, iter) { if (!contig) offset = page_offset(bvec.bv_page) + bvec.bv_offset; @@ -498,17 +508,14 @@ blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio, index = 0; } - sums->sums[index] = ~(u32)0; + crypto_shash_init(shash); data = kmap_atomic(bvec.bv_page); - sums->sums[index] - = btrfs_csum_data(data + bvec.bv_offset - + (i * fs_info->sectorsize), - sums->sums[index], - fs_info->sectorsize); + crypto_shash_update(shash, data + bvec.bv_offset + + (i * fs_info->sectorsize), + fs_info->sectorsize); kunmap_atomic(data); - btrfs_csum_final(sums->sums[index], - (char *)(sums->sums + index)); - index++; + crypto_shash_final(shash, (char *)(sums->sums + index)); + index += csum_size; offset += fs_info->sectorsize; this_sum_bytes += fs_info->sectorsize; total_bytes += fs_info->sectorsize; @@ -904,9 +911,9 @@ found: write_extent_buffer(leaf, sums->sums + index, (unsigned long)item, ins_size); + index += ins_size; ins_size /= csum_size; total_bytes += ins_size * fs_info->sectorsize; - index += ins_size; btrfs_mark_buffer_dirty(path->nodes[0]); if (total_bytes < sums->len) { diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 89f5be2bfb43..58a18ed11546 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -26,6 +26,7 @@ #include "volumes.h" #include "qgroup.h" #include "compression.h" +#include "delalloc-space.h" static struct kmem_cache *btrfs_inode_defrag_cachep; /* @@ -1550,30 +1551,20 @@ static noinline int check_can_nocow(struct btrfs_inode *inode, loff_t pos, { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_root *root = inode->root; - struct btrfs_ordered_extent *ordered; u64 lockstart, lockend; u64 num_bytes; int ret; ret = btrfs_start_write_no_snapshotting(root); if (!ret) - return -ENOSPC; + return -EAGAIN; lockstart = round_down(pos, fs_info->sectorsize); lockend = round_up(pos + *write_bytes, fs_info->sectorsize) - 1; - while 
(1) { - lock_extent(&inode->io_tree, lockstart, lockend); - ordered = btrfs_lookup_ordered_range(inode, lockstart, - lockend - lockstart + 1); - if (!ordered) { - break; - } - unlock_extent(&inode->io_tree, lockstart, lockend); - btrfs_start_ordered_extent(&inode->vfs_inode, ordered, 1); - btrfs_put_ordered_extent(ordered); - } + btrfs_lock_and_flush_ordered_range(&inode->io_tree, inode, lockstart, + lockend, NULL); num_bytes = lockend - lockstart + 1; ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes, @@ -2721,6 +2712,11 @@ out_only_mutex: * for detecting, at fsync time, if the inode isn't yet in the * log tree or it's there but not up to date. */ + struct timespec64 now = current_time(inode); + + inode_inc_iversion(inode); + inode->i_mtime = now; + inode->i_ctime = now; trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { err = PTR_ERR(trans); @@ -2801,9 +2797,9 @@ static int btrfs_fallocate_update_isize(struct inode *inode, } enum { - RANGE_BOUNDARY_WRITTEN_EXTENT = 0, - RANGE_BOUNDARY_PREALLOC_EXTENT = 1, - RANGE_BOUNDARY_HOLE = 2, + RANGE_BOUNDARY_WRITTEN_EXTENT, + RANGE_BOUNDARY_PREALLOC_EXTENT, + RANGE_BOUNDARY_HOLE, }; static int btrfs_zero_range_check_range_boundary(struct inode *inode, diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index f74dc259307b..062be9dde4c6 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -18,6 +18,8 @@ #include "extent_io.h" #include "inode-map.h" #include "volumes.h" +#include "space-info.h" +#include "delalloc-space.h" #define BITS_PER_BITMAP (PAGE_SIZE * 8UL) #define MAX_CACHE_BYTES_PER_GIG SZ_32K @@ -465,9 +467,8 @@ static void io_ctl_set_crc(struct btrfs_io_ctl *io_ctl, int index) if (index == 0) offset = sizeof(u32) * io_ctl->num_pages; - crc = btrfs_csum_data(io_ctl->orig + offset, crc, - PAGE_SIZE - offset); - btrfs_csum_final(crc, (u8 *)&crc); + crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset); + btrfs_crc32c_final(crc, (u8 *)&crc); io_ctl_unmap_page(io_ctl); tmp = page_address(io_ctl->pages[0]); tmp += index; @@ -493,9 +494,8 @@ static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index) val = *tmp; io_ctl_map_page(io_ctl, 0); - crc = btrfs_csum_data(io_ctl->orig + offset, crc, - PAGE_SIZE - offset); - btrfs_csum_final(crc, (u8 *)&crc); + crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset); + btrfs_crc32c_final(crc, (u8 *)&crc); if (val != crc) { btrfs_err_rl(io_ctl->fs_info, "csum mismatch on free space cache"); @@ -3166,8 +3166,8 @@ static int do_trimming(struct btrfs_block_group_cache *block_group, space_info->bytes_readonly += reserved_bytes; block_group->reserved -= reserved_bytes; space_info->bytes_reserved -= reserved_bytes; - spin_unlock(&space_info->lock); spin_unlock(&block_group->lock); + spin_unlock(&space_info->lock); } return ret; @@ -3358,7 +3358,7 @@ void btrfs_put_block_group_trimming(struct btrfs_block_group_cache *block_group) if (cleanup) { mutex_lock(&fs_info->chunk_mutex); - em_tree = &fs_info->mapping_tree.map_tree; + em_tree = &fs_info->mapping_tree; write_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, block_group->key.objectid, 1); diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index ffca2abf13d0..2e8bb402050b 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -11,6 +11,7 @@ #include "free-space-cache.h" #include "inode-map.h" #include "transaction.h" +#include "delalloc-space.h" static int caching_kthread(void *data) { diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c 
index a2aabdb85226..1af069a9a0c7 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -47,6 +47,7 @@ #include "props.h" #include "qgroup.h" #include "dedupe.h" +#include "delalloc-space.h" struct btrfs_iget_args { struct btrfs_key *location; @@ -1932,17 +1933,19 @@ int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio, u64 length = 0; u64 map_length; int ret; + struct btrfs_io_geometry geom; if (bio_flags & EXTENT_BIO_COMPRESSED) return 0; length = bio->bi_iter.bi_size; map_length = length; - ret = btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length, - NULL, 0); + ret = btrfs_get_io_geometry(fs_info, btrfs_op(bio), logical, map_length, + &geom); if (ret < 0) return ret; - if (map_length < length + size) + + if (geom.len < length + size) return 1; return 0; } @@ -3203,16 +3206,23 @@ static int __readpage_endio_check(struct inode *inode, int icsum, struct page *page, int pgoff, u64 start, size_t len) { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); char *kaddr; - u32 csum_expected; - u32 csum = ~(u32)0; + u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); + u8 *csum_expected; + u8 csum[BTRFS_CSUM_SIZE]; - csum_expected = *(((u32 *)io_bio->csum) + icsum); + csum_expected = ((u8 *)io_bio->csum) + icsum * csum_size; kaddr = kmap_atomic(page); - csum = btrfs_csum_data(kaddr + pgoff, csum, len); - btrfs_csum_final(csum, (u8 *)&csum); - if (csum != csum_expected) + shash->tfm = fs_info->csum_shash; + + crypto_shash_init(shash); + crypto_shash_update(shash, kaddr + pgoff, len); + crypto_shash_final(shash, csum); + + if (memcmp(csum, csum_expected, csum_size)) goto zeroit; kunmap_atomic(kaddr); @@ -3286,6 +3296,28 @@ void btrfs_add_delayed_iput(struct inode *inode) wake_up_process(fs_info->cleaner_kthread); } +static void run_delayed_iput_locked(struct btrfs_fs_info *fs_info, + struct btrfs_inode *inode) +{ + list_del_init(&inode->delayed_iput); + spin_unlock(&fs_info->delayed_iput_lock); + iput(&inode->vfs_inode); + if (atomic_dec_and_test(&fs_info->nr_delayed_iputs)) + wake_up(&fs_info->delayed_iputs_wait); + spin_lock(&fs_info->delayed_iput_lock); +} + +static void btrfs_run_delayed_iput(struct btrfs_fs_info *fs_info, + struct btrfs_inode *inode) +{ + if (!list_empty(&inode->delayed_iput)) { + spin_lock(&fs_info->delayed_iput_lock); + if (!list_empty(&inode->delayed_iput)) + run_delayed_iput_locked(fs_info, inode); + spin_unlock(&fs_info->delayed_iput_lock); + } +} + void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info) { @@ -3295,12 +3327,7 @@ void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info) inode = list_first_entry(&fs_info->delayed_iputs, struct btrfs_inode, delayed_iput); - list_del_init(&inode->delayed_iput); - spin_unlock(&fs_info->delayed_iput_lock); - iput(&inode->vfs_inode); - if (atomic_dec_and_test(&fs_info->nr_delayed_iputs)) - wake_up(&fs_info->delayed_iputs_wait); - spin_lock(&fs_info->delayed_iput_lock); + run_delayed_iput_locked(fs_info, inode); } spin_unlock(&fs_info->delayed_iput_lock); } @@ -3935,9 +3962,7 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_path *path; int ret = 0; - struct extent_buffer *leaf; struct btrfs_dir_item *di; - struct btrfs_key key; u64 index; u64 ino = btrfs_ino(inode); u64 dir_ino = btrfs_ino(dir); @@ -3955,8 +3980,6 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, ret = di ? 
PTR_ERR(di) : -ENOENT; goto err; } - leaf = path->nodes[0]; - btrfs_dir_item_key_to_cpu(leaf, di, &key); ret = btrfs_delete_one_dir_name(trans, root, path, di); if (ret) goto err; @@ -4009,6 +4032,17 @@ skip_backref: ret = 0; else if (ret) btrfs_abort_transaction(trans, ret); + + /* + * If we have a pending delayed iput we could end up with the final iput + * being run in btrfs-cleaner context. If we have enough of these built + * up we can end up burning a lot of time in btrfs-cleaner without any + * way to throttle the unlinks. Since we're currently holding a ref on + * the inode we can run the delayed iput here without any issues as the + * final iput won't be done until after we drop the ref we're currently + * holding. + */ + btrfs_run_delayed_iput(fs_info, inode); err: btrfs_free_path(path); if (ret) @@ -5008,21 +5042,8 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) if (size <= hole_start) return 0; - while (1) { - struct btrfs_ordered_extent *ordered; - - lock_extent_bits(io_tree, hole_start, block_end - 1, - &cached_state); - ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), hole_start, - block_end - hole_start); - if (!ordered) - break; - unlock_extent_cached(io_tree, hole_start, block_end - 1, - &cached_state); - btrfs_start_ordered_extent(inode, ordered, 1); - btrfs_put_ordered_extent(ordered); - } - + btrfs_lock_and_flush_ordered_range(io_tree, BTRFS_I(inode), hole_start, + block_end - 1, &cached_state); cur_offset = hole_start; while (1) { em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset, @@ -8318,22 +8339,21 @@ static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip) struct bio *orig_bio = dip->orig_bio; u64 start_sector = orig_bio->bi_iter.bi_sector; u64 file_offset = dip->logical_offset; - u64 map_length; int async_submit = 0; u64 submit_len; int clone_offset = 0; int clone_len; int ret; blk_status_t status; + struct btrfs_io_geometry geom; - map_length = orig_bio->bi_iter.bi_size; - submit_len = map_length; - ret = btrfs_map_block(fs_info, btrfs_op(orig_bio), start_sector << 9, - &map_length, NULL, 0); + submit_len = orig_bio->bi_iter.bi_size; + ret = btrfs_get_io_geometry(fs_info, btrfs_op(orig_bio), + start_sector << 9, submit_len, &geom); if (ret) return -EIO; - if (map_length >= submit_len) { + if (geom.len >= submit_len) { bio = orig_bio; dip->flags |= BTRFS_DIO_ORIG_BIO_SUBMITTED; goto submit; @@ -8346,10 +8366,10 @@ static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip) async_submit = 1; /* bio split */ - ASSERT(map_length <= INT_MAX); + ASSERT(geom.len <= INT_MAX); atomic_inc(&dip->pending_bios); do { - clone_len = min_t(int, submit_len, map_length); + clone_len = min_t(int, submit_len, geom.len); /* * This will never fail as it's passing GPF_NOFS and @@ -8386,9 +8406,8 @@ static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip) start_sector += clone_len >> 9; file_offset += clone_len; - map_length = submit_len; - ret = btrfs_map_block(fs_info, btrfs_op(orig_bio), - start_sector << 9, &map_length, NULL, 0); + ret = btrfs_get_io_geometry(fs_info, btrfs_op(orig_bio), + start_sector << 9, submit_len, &geom); if (ret) goto out_err; } while (submit_len > 0); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index cfeff1b8dce0..818f7ec8bb0e 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -43,6 +43,8 @@ #include "qgroup.h" #include "tree-log.h" #include "compression.h" +#include "space-info.h" +#include "delalloc-space.h" #ifdef CONFIG_64BIT /* If we have a 32-bit userspace and 64-bit kernel, 
then the UAPI @@ -3993,6 +3995,27 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in, if (!same_inode) inode_dio_wait(inode_out); + /* + * Workaround to make sure NOCOW buffered writes reach disk as NOCOW. + * + * Btrfs' back references do not have block-level granularity; they + * work at the whole extent level. + * A NOCOW buffered write without data space reserved may not be able + * to fall back to CoW due to lack of data space, and thus could cause + * data loss. + * + * Here we take a shortcut by flushing the whole inode, so that all + * NOCOW writes reach disk as NOCOW before we increase the + * reference of the extent. We could do better by only flushing NOCOW + * data, but that needs extra accounting. + * + * Also we don't need to check ASYNC_EXTENT, as async extents will be + * CoWed anyway and do not affect the NOCOW part. + */ + ret = filemap_flush(inode_in->i_mapping); + if (ret < 0) + return ret; + ret = btrfs_wait_ordered_range(inode_in, ALIGN_DOWN(pos_in, bs), wb_len); if (ret < 0) diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 2f6c3c7851ed..98fccce4208c 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -15,19 +15,19 @@ #ifdef CONFIG_BTRFS_DEBUG static void btrfs_assert_spinning_writers_get(struct extent_buffer *eb) { - WARN_ON(atomic_read(&eb->spinning_writers)); - atomic_inc(&eb->spinning_writers); + WARN_ON(eb->spinning_writers); + eb->spinning_writers++; } static void btrfs_assert_spinning_writers_put(struct extent_buffer *eb) { - WARN_ON(atomic_read(&eb->spinning_writers) != 1); - atomic_dec(&eb->spinning_writers); + WARN_ON(eb->spinning_writers != 1); + eb->spinning_writers--; } static void btrfs_assert_no_spinning_writers(struct extent_buffer *eb) { - WARN_ON(atomic_read(&eb->spinning_writers)); + WARN_ON(eb->spinning_writers); } static void btrfs_assert_spinning_readers_get(struct extent_buffer *eb) @@ -58,17 +58,17 @@ static void btrfs_assert_tree_read_locked(struct extent_buffer *eb) static void btrfs_assert_tree_write_locks_get(struct extent_buffer *eb) { - atomic_inc(&eb->write_locks); + eb->write_locks++; } static void btrfs_assert_tree_write_locks_put(struct extent_buffer *eb) { - atomic_dec(&eb->write_locks); + eb->write_locks--; } void btrfs_assert_tree_locked(struct extent_buffer *eb) { - BUG_ON(!atomic_read(&eb->write_locks)); + BUG_ON(!eb->write_locks); } #else @@ -111,10 +111,10 @@ void btrfs_set_lock_blocking_write(struct extent_buffer *eb) */ if (eb->lock_nested && current->pid == eb->lock_owner) return; - if (atomic_read(&eb->blocking_writers) == 0) { + if (eb->blocking_writers == 0) { btrfs_assert_spinning_writers_put(eb); btrfs_assert_tree_locked(eb); - atomic_inc(&eb->blocking_writers); + eb->blocking_writers++; write_unlock(&eb->lock); } } @@ -148,12 +148,11 @@ void btrfs_clear_lock_blocking_write(struct extent_buffer *eb) */ if (eb->lock_nested && current->pid == eb->lock_owner) return; - BUG_ON(atomic_read(&eb->blocking_writers) != 1); write_lock(&eb->lock); + BUG_ON(eb->blocking_writers != 1); btrfs_assert_spinning_writers_get(eb); - /* atomic_dec_and_test implies a barrier */ - if (atomic_dec_and_test(&eb->blocking_writers)) - cond_wake_up_nomb(&eb->write_lock_wq); + if (--eb->blocking_writers == 0) + cond_wake_up(&eb->write_lock_wq); } /* @@ -167,12 +166,10 @@ void btrfs_tree_read_lock(struct extent_buffer *eb) if (trace_btrfs_tree_read_lock_enabled()) start_ns = ktime_get_ns(); again: - BUG_ON(!atomic_read(&eb->blocking_writers) && - current->pid == eb->lock_owner); - read_lock(&eb->lock); - if
(atomic_read(&eb->blocking_writers) && - current->pid == eb->lock_owner) { + BUG_ON(eb->blocking_writers == 0 && + current->pid == eb->lock_owner); + if (eb->blocking_writers && current->pid == eb->lock_owner) { /* * This extent is already write-locked by our thread. We allow * an additional read lock to be added because it's for the same @@ -185,10 +182,10 @@ again: trace_btrfs_tree_read_lock(eb, start_ns); return; } - if (atomic_read(&eb->blocking_writers)) { + if (eb->blocking_writers) { read_unlock(&eb->lock); wait_event(eb->write_lock_wq, - atomic_read(&eb->blocking_writers) == 0); + eb->blocking_writers == 0); goto again; } btrfs_assert_tree_read_locks_get(eb); @@ -203,11 +200,11 @@ again: */ int btrfs_tree_read_lock_atomic(struct extent_buffer *eb) { - if (atomic_read(&eb->blocking_writers)) + if (eb->blocking_writers) return 0; read_lock(&eb->lock); - if (atomic_read(&eb->blocking_writers)) { + if (eb->blocking_writers) { read_unlock(&eb->lock); return 0; } @@ -223,13 +220,13 @@ int btrfs_tree_read_lock_atomic(struct extent_buffer *eb) */ int btrfs_try_tree_read_lock(struct extent_buffer *eb) { - if (atomic_read(&eb->blocking_writers)) + if (eb->blocking_writers) return 0; if (!read_trylock(&eb->lock)) return 0; - if (atomic_read(&eb->blocking_writers)) { + if (eb->blocking_writers) { read_unlock(&eb->lock); return 0; } @@ -245,13 +242,11 @@ int btrfs_try_tree_read_lock(struct extent_buffer *eb) */ int btrfs_try_tree_write_lock(struct extent_buffer *eb) { - if (atomic_read(&eb->blocking_writers) || - atomic_read(&eb->blocking_readers)) + if (eb->blocking_writers || atomic_read(&eb->blocking_readers)) return 0; write_lock(&eb->lock); - if (atomic_read(&eb->blocking_writers) || - atomic_read(&eb->blocking_readers)) { + if (eb->blocking_writers || atomic_read(&eb->blocking_readers)) { write_unlock(&eb->lock); return 0; } @@ -322,10 +317,9 @@ void btrfs_tree_lock(struct extent_buffer *eb) WARN_ON(eb->lock_owner == current->pid); again: wait_event(eb->read_lock_wq, atomic_read(&eb->blocking_readers) == 0); - wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0); + wait_event(eb->write_lock_wq, eb->blocking_writers == 0); write_lock(&eb->lock); - if (atomic_read(&eb->blocking_readers) || - atomic_read(&eb->blocking_writers)) { + if (atomic_read(&eb->blocking_readers) || eb->blocking_writers) { write_unlock(&eb->lock); goto again; } @@ -340,7 +334,7 @@ again: */ void btrfs_tree_unlock(struct extent_buffer *eb) { - int blockers = atomic_read(&eb->blocking_writers); + int blockers = eb->blocking_writers; BUG_ON(blockers > 1); @@ -351,7 +345,7 @@ void btrfs_tree_unlock(struct extent_buffer *eb) if (blockers) { btrfs_assert_no_spinning_writers(eb); - atomic_dec(&eb->blocking_writers); + eb->blocking_writers--; /* Use the lighter barrier after atomic */ smp_mb__after_atomic(); cond_wake_up_nomb(&eb->write_lock_wq); diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 52889da69113..1744ba8b2754 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -13,6 +13,7 @@ #include "extent_io.h" #include "disk-io.h" #include "compression.h" +#include "delalloc-space.h" static struct kmem_cache *btrfs_ordered_extent_cache; @@ -924,14 +925,16 @@ out: * be reclaimed before their checksum is actually put into the btree */ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, - u32 *sum, int len) + u8 *sum, int len) { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_ordered_sum *ordered_sum; struct btrfs_ordered_extent 
*ordered; struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree; unsigned long num_sectors; unsigned long i; u32 sectorsize = btrfs_inode_sectorsize(inode); + const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); int index = 0; ordered = btrfs_lookup_ordered_extent(inode, offset); @@ -947,10 +950,10 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, num_sectors = ordered_sum->len >> inode->i_sb->s_blocksize_bits; num_sectors = min_t(int, len - index, num_sectors - i); - memcpy(sum + index, ordered_sum->sums + i, - num_sectors); + memcpy(sum + index, ordered_sum->sums + i * csum_size, + num_sectors * csum_size); - index += (int)num_sectors; + index += (int)num_sectors * csum_size; if (index == len) goto out; disk_bytenr += num_sectors * sectorsize; @@ -962,6 +965,51 @@ out: return index; } +/* + * btrfs_lock_and_flush_ordered_range - Lock the passed range and ensure all + * pending ordered extents in it are run to completion. + * + * @tree: IO tree used for locking out other users of the range + * @inode: Inode whose ordered tree is to be searched + * @start: Beginning of range to flush + * @end: Last byte of range to lock + * @cached_state: If passed, will return the extent state responsible for the + * locked range. It's the caller's responsibility to free the cached state. + * + * This function always returns with the given range locked, ensuring that + * after it's called no ordered extent can be pending. + */ +void btrfs_lock_and_flush_ordered_range(struct extent_io_tree *tree, + struct btrfs_inode *inode, u64 start, + u64 end, + struct extent_state **cached_state) +{ + struct btrfs_ordered_extent *ordered; + struct extent_state *cachedp = NULL; + + if (cached_state) + cachedp = *cached_state; + + while (1) { + lock_extent_bits(tree, start, end, &cachedp); + ordered = btrfs_lookup_ordered_range(inode, start, + end - start + 1); + if (!ordered) { + /* + * If no external cached_state has been passed then + * decrement the extra ref taken for cachedp since we + * aren't exposing it outside of this function + */ + if (!cached_state) + refcount_dec(&cachedp->refs); + break; + } + unlock_extent_cached(tree, start, end, &cachedp); + btrfs_start_ordered_extent(&inode->vfs_inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + } +} + int __init ordered_data_init(void) { btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent", diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 4c5991c3de14..5204171ea962 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -23,7 +23,7 @@ struct btrfs_ordered_sum { int len; struct list_head list; /* last field is a variable length array of csums */ - u32 sums[]; + u8 sums[]; }; /* @@ -183,11 +183,15 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range( int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, struct btrfs_ordered_extent *ordered); int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, - u32 *sum, int len); + u8 *sum, int len); u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr, const u64 range_start, const u64 range_len); u64 btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, const u64 range_start, const u64 range_len); +void btrfs_lock_and_flush_ordered_range(struct extent_io_tree *tree, + struct btrfs_inode *inode, u64 start, + u64 end, + struct extent_state **cached_state); int __init ordered_data_init(void); void __cold ordered_data_exit(void);
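For readers tracking the API change: the new helper consolidates the lock-then-drain loops that callers such as btrfs_cont_expand() used to open-code (see the inode.c hunk earlier). A minimal caller sketch, with illustrative variable names that are not part of the patch:

	struct extent_state *cached = NULL;
	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;

	/* Returns with [start, end] locked and no ordered extent pending. */
	btrfs_lock_and_flush_ordered_range(io_tree, BTRFS_I(inode),
					   start, end, &cached);
	/* ... operate on the range ... */
	unlock_extent_cached(io_tree, start, end, &cached);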
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 1141ca5fae6a..9cb50577d982 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -153,11 +153,11 @@ static void print_eb_refs_lock(struct extent_buffer *eb) #ifdef CONFIG_BTRFS_DEBUG btrfs_info(eb->fs_info, "refs %u lock (w:%d r:%d bw:%d br:%d sw:%d sr:%d) lock_owner %u current %u", - atomic_read(&eb->refs), atomic_read(&eb->write_locks), + atomic_read(&eb->refs), eb->write_locks, atomic_read(&eb->read_locks), - atomic_read(&eb->blocking_writers), + eb->blocking_writers, atomic_read(&eb->blocking_readers), - atomic_read(&eb->spinning_writers), + eb->spinning_writers, atomic_read(&eb->spinning_readers), eb->lock_owner, current->pid); #endif diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index a9e2e66152ee..e0469816c678 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -257,11 +257,7 @@ static int prop_compression_validate(const char *value, size_t len) if (!value) return 0; - if (!strncmp("lzo", value, 3)) - return 0; - else if (!strncmp("zlib", value, 4)) - return 0; - else if (!strncmp("zstd", value, 4)) + if (btrfs_compress_is_valid_type(value, len)) return 0; return -EINVAL; @@ -341,7 +337,7 @@ static int inherit_props(struct btrfs_trans_handle *trans, for (i = 0; i < ARRAY_SIZE(prop_handlers); i++) { const struct prop_handler *h = &prop_handlers[i]; const char *value; - u64 num_bytes; + u64 num_bytes = 0; if (!h->inheritable) continue; diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 3e6ffbbd8b0a..f8a3c1b0a15a 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -2614,6 +2614,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, int ret = 0; int i; u64 *i_qgroups; + bool committing = false; struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *quota_root; struct btrfs_qgroup *srcgroup; @@ -2621,7 +2622,25 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, u32 level_size = 0; u64 nums; - mutex_lock(&fs_info->qgroup_ioctl_lock); + /* + * There are only two callers of this function. + * + * One is in create_subvol() in the ioctl context, which needs to hold + * the qgroup_ioctl_lock. + * + * The other is in create_pending_snapshot(), where no other qgroup + * code can modify the fs, as all other users need to either start a + * new transaction or hold a transaction handle; thus we don't need to + * hold qgroup_ioctl_lock. + * This avoids a long and complex lock chain and makes lockdep happy. + */
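Because the lock and unlock sites sit in separate hunks below, here is the whole pattern gathered into one skeleton (an editorial sketch, not additional patch content):

	bool committing = false;

	spin_lock(&fs_info->trans_lock);
	if (trans->transaction->state == TRANS_STATE_COMMIT_DOING)
		committing = true;
	spin_unlock(&fs_info->trans_lock);

	if (!committing)
		mutex_lock(&fs_info->qgroup_ioctl_lock);
	/* ... qgroup inheritance work ... */
	if (!committing)
		mutex_unlock(&fs_info->qgroup_ioctl_lock);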
+ spin_lock(&fs_info->trans_lock); + if (trans->transaction->state == TRANS_STATE_COMMIT_DOING) + committing = true; + spin_unlock(&fs_info->trans_lock); + + if (!committing) + mutex_lock(&fs_info->qgroup_ioctl_lock); if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) goto out; @@ -2785,7 +2804,8 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, unlock: spin_unlock(&fs_info->qgroup_lock); out: - mutex_unlock(&fs_info->qgroup_ioctl_lock); + if (!committing) + mutex_unlock(&fs_info->qgroup_ioctl_lock); return ret; } diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index f5d4c13a8dbc..2503485db859 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h @@ -7,7 +7,7 @@ #ifndef BTRFS_RAID56_H #define BTRFS_RAID56_H -static inline int nr_parity_stripes(struct map_lookup *map) +static inline int nr_parity_stripes(const struct map_lookup *map) { if (map->type & BTRFS_BLOCK_GROUP_RAID5) return 1; @@ -17,7 +17,7 @@ static inline int nr_parity_stripes(struct map_lookup *map) return 0; } -static inline int nr_data_stripes(struct map_lookup *map) +static inline int nr_data_stripes(const struct map_lookup *map) { return map->num_stripes - nr_parity_stripes(map); } diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 22a3c69864fa..7f219851fa23 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -20,6 +20,7 @@ #include "inode-map.h" #include "qgroup.h" #include "print-tree.h" +#include "delalloc-space.h" /* * backref_node, mapping_node and tree_block start with this diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 22124122728c..47733fb55df7 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -9,6 +9,8 @@ #include "transaction.h" #include "disk-io.h" #include "print-tree.h" +#include "qgroup.h" +#include "space-info.h" /* * Read a root item from the tree. In case we detect a root item smaller than @@ -497,3 +499,57 @@ void btrfs_update_root_times(struct btrfs_trans_handle *trans, btrfs_set_stack_timespec_nsec(&item->ctime, ct.tv_nsec); spin_unlock(&root->root_item_lock); } + +/* + * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation + * root: the root of the parent directory + * rsv: block reservation + * items: the number of items that we need to reserve space for + * use_global_rsv: allow fallback to the global block reservation + * + * This function is used to reserve the space for snapshot/subvolume + * creation and deletion. Those operations differ from the common + * file/directory operations: they change two fs/file trees + * and the root tree, and the number of items that the qgroup reserves + * differs from the free space reservation. So we cannot use + * the space reservation mechanism in start_transaction().
+ */ +int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, + struct btrfs_block_rsv *rsv, int items, + bool use_global_rsv) +{ + u64 qgroup_num_bytes = 0; + u64 num_bytes; + int ret; + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; + + if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) { + /* One for parent inode, two for dir entries */ + qgroup_num_bytes = 3 * fs_info->nodesize; + ret = btrfs_qgroup_reserve_meta_prealloc(root, + qgroup_num_bytes, true); + if (ret) + return ret; + } + + num_bytes = btrfs_calc_trans_metadata_size(fs_info, items); + rsv->space_info = btrfs_find_space_info(fs_info, + BTRFS_BLOCK_GROUP_METADATA); + ret = btrfs_block_rsv_add(root, rsv, num_bytes, + BTRFS_RESERVE_FLUSH_ALL); + + if (ret == -ENOSPC && use_global_rsv) + ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, true); + + if (ret && qgroup_num_bytes) + btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes); + + return ret; +} + +void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *rsv) +{ + btrfs_block_rsv_release(fs_info, rsv, (u64)-1); +} diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index f7b29f9db5e2..0c99cf9fb595 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -6,6 +6,7 @@ #include <linux/blkdev.h> #include <linux/ratelimit.h> #include <linux/sched/mm.h> +#include <crypto/hash.h> #include "ctree.h" #include "volumes.h" #include "disk-io.h" @@ -1787,11 +1788,12 @@ static int scrub_checksum(struct scrub_block *sblock) static int scrub_checksum_data(struct scrub_block *sblock) { struct scrub_ctx *sctx = sblock->sctx; + struct btrfs_fs_info *fs_info = sctx->fs_info; + SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); u8 csum[BTRFS_CSUM_SIZE]; u8 *on_disk_csum; struct page *page; void *buffer; - u32 crc = ~(u32)0; u64 len; int index; @@ -1799,6 +1801,9 @@ static int scrub_checksum_data(struct scrub_block *sblock) if (!sblock->pagev[0]->have_csum) return 0; + shash->tfm = fs_info->csum_shash; + crypto_shash_init(shash); + on_disk_csum = sblock->pagev[0]->csum; page = sblock->pagev[0]->page; buffer = kmap_atomic(page); @@ -1808,7 +1813,7 @@ static int scrub_checksum_data(struct scrub_block *sblock) for (;;) { u64 l = min_t(u64, len, PAGE_SIZE); - crc = btrfs_csum_data(buffer, crc, l); + crypto_shash_update(shash, buffer, l); kunmap_atomic(buffer); len -= l; if (len == 0) @@ -1820,7 +1825,7 @@ static int scrub_checksum_data(struct scrub_block *sblock) buffer = kmap_atomic(page); } - btrfs_csum_final(crc, csum); + crypto_shash_final(shash, csum); if (memcmp(csum, on_disk_csum, sctx->csum_size)) sblock->checksum_error = 1; @@ -1832,16 +1837,19 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) struct scrub_ctx *sctx = sblock->sctx; struct btrfs_header *h; struct btrfs_fs_info *fs_info = sctx->fs_info; + SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); u8 calculated_csum[BTRFS_CSUM_SIZE]; u8 on_disk_csum[BTRFS_CSUM_SIZE]; struct page *page; void *mapped_buffer; u64 mapped_size; void *p; - u32 crc = ~(u32)0; u64 len; int index; + shash->tfm = fs_info->csum_shash; + crypto_shash_init(shash); + BUG_ON(sblock->page_count < 1); page = sblock->pagev[0]->page; mapped_buffer = kmap_atomic(page); @@ -1875,7 +1883,7 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) for (;;) { u64 l = min_t(u64, len, mapped_size); - crc = btrfs_csum_data(p, crc, l); + crypto_shash_update(shash, p, l); kunmap_atomic(mapped_buffer); len -= l; if (len == 0) @@ -1889,7 
+1897,7 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) p = mapped_buffer; } - btrfs_csum_final(crc, calculated_csum); + crypto_shash_final(shash, calculated_csum); if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size)) sblock->checksum_error = 1; @@ -1900,18 +1908,22 @@ static int scrub_checksum_super(struct scrub_block *sblock) { struct btrfs_super_block *s; struct scrub_ctx *sctx = sblock->sctx; + struct btrfs_fs_info *fs_info = sctx->fs_info; + SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); u8 calculated_csum[BTRFS_CSUM_SIZE]; u8 on_disk_csum[BTRFS_CSUM_SIZE]; struct page *page; void *mapped_buffer; u64 mapped_size; void *p; - u32 crc = ~(u32)0; int fail_gen = 0; int fail_cor = 0; u64 len; int index; + shash->tfm = fs_info->csum_shash; + crypto_shash_init(shash); + BUG_ON(sblock->page_count < 1); page = sblock->pagev[0]->page; mapped_buffer = kmap_atomic(page); @@ -1934,7 +1946,7 @@ static int scrub_checksum_super(struct scrub_block *sblock) for (;;) { u64 l = min_t(u64, len, mapped_size); - crc = btrfs_csum_data(p, crc, l); + crypto_shash_update(shash, p, l); kunmap_atomic(mapped_buffer); len -= l; if (len == 0) @@ -1948,7 +1960,7 @@ static int scrub_checksum_super(struct scrub_block *sblock) p = mapped_buffer; } - btrfs_csum_final(crc, calculated_csum); + crypto_shash_final(shash, calculated_csum); if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size)) ++fail_cor; @@ -2448,7 +2460,7 @@ static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum) ASSERT(index < UINT_MAX); num_sectors = sum->len / sctx->fs_info->sectorsize; - memcpy(csum, sum->sums + index, sctx->csum_size); + memcpy(csum, sum->sums + index * sctx->csum_size, sctx->csum_size); if (index == num_sectors - 1) { list_del(&sum->list); kfree(sum); @@ -2660,18 +2672,18 @@ static int get_raid56_logic_offset(u64 physical, int num, u64 last_offset; u32 stripe_index; u32 rot; + const int data_stripes = nr_data_stripes(map); - last_offset = (physical - map->stripes[num].physical) * - nr_data_stripes(map); + last_offset = (physical - map->stripes[num].physical) * data_stripes; if (stripe_start) *stripe_start = last_offset; *offset = last_offset; - for (i = 0; i < nr_data_stripes(map); i++) { + for (i = 0; i < data_stripes; i++) { *offset = last_offset + i * map->stripe_len; stripe_nr = div64_u64(*offset, map->stripe_len); - stripe_nr = div_u64(stripe_nr, nr_data_stripes(map)); + stripe_nr = div_u64(stripe_nr, data_stripes); /* Work out the disk rotation on this stripe-set */ stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &rot); @@ -3079,7 +3091,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, offset = map->stripe_len * (num / map->sub_stripes); increment = map->stripe_len * factor; mirror_num = num % map->sub_stripes + 1; - } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { + } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) { increment = map->stripe_len; mirror_num = num % map->num_stripes + 1; } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { @@ -3410,15 +3422,15 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx, struct btrfs_block_group_cache *cache) { struct btrfs_fs_info *fs_info = sctx->fs_info; - struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; + struct extent_map_tree *map_tree = &fs_info->mapping_tree; struct map_lookup *map; struct extent_map *em; int i; int ret = 0; - read_lock(&map_tree->map_tree.lock); - em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1); - read_unlock(&map_tree->map_tree.lock); + 
read_lock(&map_tree->lock); + em = lookup_extent_mapping(map_tree, chunk_offset, 1); + read_unlock(&map_tree->lock); if (!em) { /* diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index f7fe4770f0e5..69b59bf75882 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -686,7 +686,7 @@ static int send_cmd(struct send_ctx *sctx) hdr->len = cpu_to_le32(sctx->send_size - sizeof(*hdr)); hdr->crc = 0; - crc = crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size); + crc = btrfs_crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size); hdr->crc = cpu_to_le32(crc); ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size, @@ -6929,9 +6929,23 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) if (ret) goto out; + mutex_lock(&fs_info->balance_mutex); + if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { + mutex_unlock(&fs_info->balance_mutex); + btrfs_warn_rl(fs_info, + "cannot run send because a balance operation is in progress"); + ret = -EAGAIN; + goto out; + } + fs_info->send_in_progress++; + mutex_unlock(&fs_info->balance_mutex); + current->journal_info = BTRFS_SEND_TRANS_STUB; ret = send_subvol(sctx); current->journal_info = NULL; + mutex_lock(&fs_info->balance_mutex); + fs_info->send_in_progress--; + mutex_unlock(&fs_info->balance_mutex); if (ret < 0) goto out; diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c new file mode 100644 index 000000000000..ab7b9ec4c240 --- /dev/null +++ b/fs/btrfs/space-info.c @@ -0,0 +1,1094 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "ctree.h" +#include "space-info.h" +#include "sysfs.h" +#include "volumes.h" +#include "free-space-cache.h" +#include "ordered-data.h" +#include "transaction.h" +#include "math.h" + +u64 btrfs_space_info_used(struct btrfs_space_info *s_info, + bool may_use_included) +{ + ASSERT(s_info); + return s_info->bytes_used + s_info->bytes_reserved + + s_info->bytes_pinned + s_info->bytes_readonly + + (may_use_included ? s_info->bytes_may_use : 0); +} + +/* + * after adding space to the filesystem, we need to clear the full flags + * on all the space infos. 
+ */ +void btrfs_clear_space_info_full(struct btrfs_fs_info *info) +{ + struct list_head *head = &info->space_info; + struct btrfs_space_info *found; + + rcu_read_lock(); + list_for_each_entry_rcu(found, head, list) + found->full = 0; + rcu_read_unlock(); +} + +static const char *alloc_name(u64 flags) +{ + switch (flags) { + case BTRFS_BLOCK_GROUP_METADATA|BTRFS_BLOCK_GROUP_DATA: + return "mixed"; + case BTRFS_BLOCK_GROUP_METADATA: + return "metadata"; + case BTRFS_BLOCK_GROUP_DATA: + return "data"; + case BTRFS_BLOCK_GROUP_SYSTEM: + return "system"; + default: + WARN_ON(1); + return "invalid-combination"; + }; +} + +static int create_space_info(struct btrfs_fs_info *info, u64 flags) +{ + + struct btrfs_space_info *space_info; + int i; + int ret; + + space_info = kzalloc(sizeof(*space_info), GFP_NOFS); + if (!space_info) + return -ENOMEM; + + ret = percpu_counter_init(&space_info->total_bytes_pinned, 0, + GFP_KERNEL); + if (ret) { + kfree(space_info); + return ret; + } + + for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) + INIT_LIST_HEAD(&space_info->block_groups[i]); + init_rwsem(&space_info->groups_sem); + spin_lock_init(&space_info->lock); + space_info->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK; + space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; + init_waitqueue_head(&space_info->wait); + INIT_LIST_HEAD(&space_info->ro_bgs); + INIT_LIST_HEAD(&space_info->tickets); + INIT_LIST_HEAD(&space_info->priority_tickets); + + ret = kobject_init_and_add(&space_info->kobj, &space_info_ktype, + info->space_info_kobj, "%s", + alloc_name(space_info->flags)); + if (ret) { + kobject_put(&space_info->kobj); + return ret; + } + + list_add_rcu(&space_info->list, &info->space_info); + if (flags & BTRFS_BLOCK_GROUP_DATA) + info->data_sinfo = space_info; + + return ret; +} + +int btrfs_init_space_info(struct btrfs_fs_info *fs_info) +{ + struct btrfs_super_block *disk_super; + u64 features; + u64 flags; + int mixed = 0; + int ret; + + disk_super = fs_info->super_copy; + if (!btrfs_super_root(disk_super)) + return -EINVAL; + + features = btrfs_super_incompat_flags(disk_super); + if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) + mixed = 1; + + flags = BTRFS_BLOCK_GROUP_SYSTEM; + ret = create_space_info(fs_info, flags); + if (ret) + goto out; + + if (mixed) { + flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA; + ret = create_space_info(fs_info, flags); + } else { + flags = BTRFS_BLOCK_GROUP_METADATA; + ret = create_space_info(fs_info, flags); + if (ret) + goto out; + + flags = BTRFS_BLOCK_GROUP_DATA; + ret = create_space_info(fs_info, flags); + } +out: + return ret; +} + +void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags, + u64 total_bytes, u64 bytes_used, + u64 bytes_readonly, + struct btrfs_space_info **space_info) +{ + struct btrfs_space_info *found; + int factor; + + factor = btrfs_bg_type_to_factor(flags); + + found = btrfs_find_space_info(info, flags); + ASSERT(found); + spin_lock(&found->lock); + found->total_bytes += total_bytes; + found->disk_total += total_bytes * factor; + found->bytes_used += bytes_used; + found->disk_used += bytes_used * factor; + found->bytes_readonly += bytes_readonly; + if (total_bytes > 0) + found->full = 0; + btrfs_space_info_add_new_bytes(info, found, + total_bytes - bytes_used - + bytes_readonly); + spin_unlock(&found->lock); + *space_info = found; +} + +struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info, + u64 flags) +{ + struct list_head *head = &info->space_info; + struct btrfs_space_info *found; + + flags &= 
BTRFS_BLOCK_GROUP_TYPE_MASK; + + rcu_read_lock(); + list_for_each_entry_rcu(found, head, list) { + if (found->flags & flags) { + rcu_read_unlock(); + return found; + } + } + rcu_read_unlock(); + return NULL; +} + +static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global) +{ + return (global->size << 1); +} + +static int can_overcommit(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, u64 bytes, + enum btrfs_reserve_flush_enum flush, + bool system_chunk) +{ + struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; + u64 profile; + u64 space_size; + u64 avail; + u64 used; + int factor; + + /* Don't overcommit when in mixed mode. */ + if (space_info->flags & BTRFS_BLOCK_GROUP_DATA) + return 0; + + if (system_chunk) + profile = btrfs_system_alloc_profile(fs_info); + else + profile = btrfs_metadata_alloc_profile(fs_info); + + used = btrfs_space_info_used(space_info, false); + + /* + * We only want to allow overcommitting if we have lots of actual space + * free, but if we don't have enough space to handle the global reserve + * space then we could end up having a real enospc problem when trying + * to allocate a chunk or some other such important allocation. + */ + spin_lock(&global_rsv->lock); + space_size = calc_global_rsv_need_space(global_rsv); + spin_unlock(&global_rsv->lock); + if (used + space_size >= space_info->total_bytes) + return 0; + + used += space_info->bytes_may_use; + + avail = atomic64_read(&fs_info->free_chunk_space); + + /* + * If we have dup, raid1 or raid10 then only half of the free + * space is actually usable. For raid56, the space info used + * doesn't include the parity drive, so we don't have to + * change the math. + */ + factor = btrfs_bg_type_to_factor(profile); + avail = div_u64(avail, factor); + + /* + * If we aren't flushing all things, let us overcommit up to + * 1/2 of the space. If we can flush, don't let us overcommit + * too much, let it overcommit up to 1/8 of the space. + */ + if (flush == BTRFS_RESERVE_FLUSH_ALL) + avail >>= 3; + else + avail >>= 1; + + if (used + bytes < space_info->total_bytes + avail) + return 1; + return 0; +} + +/* + * This is for space we already have accounted in space_info->bytes_may_use, so + * basically when we're returning space from block_rsv's. + */ +void btrfs_space_info_add_old_bytes(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + u64 num_bytes) +{ + struct reserve_ticket *ticket; + struct list_head *head; + u64 used; + enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH; + bool check_overcommit = false; + + spin_lock(&space_info->lock); + head = &space_info->priority_tickets; + + /* + * If we are over our limit then we need to check and see if we can + * overcommit, and if we can't then we just need to free up our space + * and not satisfy any requests. + */ + used = btrfs_space_info_used(space_info, true); + if (used - num_bytes >= space_info->total_bytes) + check_overcommit = true; +again: + while (!list_empty(head) && num_bytes) { + ticket = list_first_entry(head, struct reserve_ticket, + list); + /* + * We use 0 bytes because this space is already reserved, so + * adding the ticket space would be a double count.
+ */ + if (check_overcommit && + !can_overcommit(fs_info, space_info, 0, flush, false)) + break; + if (num_bytes >= ticket->bytes) { + list_del_init(&ticket->list); + num_bytes -= ticket->bytes; + ticket->bytes = 0; + space_info->tickets_id++; + wake_up(&ticket->wait); + } else { + ticket->bytes -= num_bytes; + num_bytes = 0; + } + } + + if (num_bytes && head == &space_info->priority_tickets) { + head = &space_info->tickets; + flush = BTRFS_RESERVE_FLUSH_ALL; + goto again; + } + btrfs_space_info_update_bytes_may_use(fs_info, space_info, -num_bytes); + trace_btrfs_space_reservation(fs_info, "space_info", + space_info->flags, num_bytes, 0); + spin_unlock(&space_info->lock); +} + +/* + * This is for newly allocated space that isn't accounted in + * space_info->bytes_may_use yet. So if we allocate a chunk or unpin an extent + * we use this helper. + */ +void btrfs_space_info_add_new_bytes(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + u64 num_bytes) +{ + struct reserve_ticket *ticket; + struct list_head *head = &space_info->priority_tickets; + +again: + while (!list_empty(head) && num_bytes) { + ticket = list_first_entry(head, struct reserve_ticket, + list); + if (num_bytes >= ticket->bytes) { + trace_btrfs_space_reservation(fs_info, "space_info", + space_info->flags, + ticket->bytes, 1); + list_del_init(&ticket->list); + num_bytes -= ticket->bytes; + btrfs_space_info_update_bytes_may_use(fs_info, + space_info, + ticket->bytes); + ticket->bytes = 0; + space_info->tickets_id++; + wake_up(&ticket->wait); + } else { + trace_btrfs_space_reservation(fs_info, "space_info", + space_info->flags, + num_bytes, 1); + btrfs_space_info_update_bytes_may_use(fs_info, + space_info, + num_bytes); + ticket->bytes -= num_bytes; + num_bytes = 0; + } + } + + if (num_bytes && head == &space_info->priority_tickets) { + head = &space_info->tickets; + goto again; + } +} + +#define DUMP_BLOCK_RSV(fs_info, rsv_name) \ +do { \ + struct btrfs_block_rsv *__rsv = &(fs_info)->rsv_name; \ + spin_lock(&__rsv->lock); \ + btrfs_info(fs_info, #rsv_name ": size %llu reserved %llu", \ + __rsv->size, __rsv->reserved); \ + spin_unlock(&__rsv->lock); \ +} while (0) + +void btrfs_dump_space_info(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *info, u64 bytes, + int dump_block_groups) +{ + struct btrfs_block_group_cache *cache; + int index = 0; + + spin_lock(&info->lock); + btrfs_info(fs_info, "space_info %llu has %llu free, is %sfull", + info->flags, + info->total_bytes - btrfs_space_info_used(info, true), + info->full ? "" : "not "); + btrfs_info(fs_info, + "space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu", + info->total_bytes, info->bytes_used, info->bytes_pinned, + info->bytes_reserved, info->bytes_may_use, + info->bytes_readonly); + spin_unlock(&info->lock); + + DUMP_BLOCK_RSV(fs_info, global_block_rsv); + DUMP_BLOCK_RSV(fs_info, trans_block_rsv); + DUMP_BLOCK_RSV(fs_info, chunk_block_rsv); + DUMP_BLOCK_RSV(fs_info, delayed_block_rsv); + DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv); + + if (!dump_block_groups) + return; + + down_read(&info->groups_sem); +again: + list_for_each_entry(cache, &info->block_groups[index], list) { + spin_lock(&cache->lock); + btrfs_info(fs_info, + "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s", + cache->key.objectid, cache->key.offset, + btrfs_block_group_used(&cache->item), cache->pinned, + cache->reserved, cache->ro ? 
"[readonly]" : ""); + btrfs_dump_free_space(cache, bytes); + spin_unlock(&cache->lock); + } + if (++index < BTRFS_NR_RAID_TYPES) + goto again; + up_read(&info->groups_sem); +} + +static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info, + unsigned long nr_pages, int nr_items) +{ + struct super_block *sb = fs_info->sb; + + if (down_read_trylock(&sb->s_umount)) { + writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE); + up_read(&sb->s_umount); + } else { + /* + * We needn't worry the filesystem going from r/w to r/o though + * we don't acquire ->s_umount mutex, because the filesystem + * should guarantee the delalloc inodes list be empty after + * the filesystem is readonly(all dirty pages are written to + * the disk). + */ + btrfs_start_delalloc_roots(fs_info, nr_items); + if (!current->journal_info) + btrfs_wait_ordered_roots(fs_info, nr_items, 0, (u64)-1); + } +} + +static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info, + u64 to_reclaim) +{ + u64 bytes; + u64 nr; + + bytes = btrfs_calc_trans_metadata_size(fs_info, 1); + nr = div64_u64(to_reclaim, bytes); + if (!nr) + nr = 1; + return nr; +} + +#define EXTENT_SIZE_PER_ITEM SZ_256K + +/* + * shrink metadata reservation for delalloc + */ +static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim, + u64 orig, bool wait_ordered) +{ + struct btrfs_space_info *space_info; + struct btrfs_trans_handle *trans; + u64 delalloc_bytes; + u64 dio_bytes; + u64 async_pages; + u64 items; + long time_left; + unsigned long nr_pages; + int loops; + + /* Calc the number of the pages we need flush for space reservation */ + items = calc_reclaim_items_nr(fs_info, to_reclaim); + to_reclaim = items * EXTENT_SIZE_PER_ITEM; + + trans = (struct btrfs_trans_handle *)current->journal_info; + space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); + + delalloc_bytes = percpu_counter_sum_positive( + &fs_info->delalloc_bytes); + dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes); + if (delalloc_bytes == 0 && dio_bytes == 0) { + if (trans) + return; + if (wait_ordered) + btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1); + return; + } + + /* + * If we are doing more ordered than delalloc we need to just wait on + * ordered extents, otherwise we'll waste time trying to flush delalloc + * that likely won't give us the space back we need. + */ + if (dio_bytes > delalloc_bytes) + wait_ordered = true; + + loops = 0; + while ((delalloc_bytes || dio_bytes) && loops < 3) { + nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT; + + /* + * Triggers inode writeback for up to nr_pages. This will invoke + * ->writepages callback and trigger delalloc filling + * (btrfs_run_delalloc_range()). + */ + btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items); + + /* + * We need to wait for the compressed pages to start before + * we continue. + */ + async_pages = atomic_read(&fs_info->async_delalloc_pages); + if (!async_pages) + goto skip_async; + + /* + * Calculate how many compressed pages we want to be written + * before we continue. I.e if there are more async pages than we + * require wait_event will wait until nr_pages are written. 
+ */ + if (async_pages <= nr_pages) + async_pages = 0; + else + async_pages -= nr_pages; + + wait_event(fs_info->async_submit_wait, + atomic_read(&fs_info->async_delalloc_pages) <= + (int)async_pages); +skip_async: + spin_lock(&space_info->lock); + if (list_empty(&space_info->tickets) && + list_empty(&space_info->priority_tickets)) { + spin_unlock(&space_info->lock); + break; + } + spin_unlock(&space_info->lock); + + loops++; + if (wait_ordered && !trans) { + btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1); + } else { + time_left = schedule_timeout_killable(1); + if (time_left) + break; + } + delalloc_bytes = percpu_counter_sum_positive( + &fs_info->delalloc_bytes); + dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes); + } +} + +/** + * may_commit_transaction - possibly commit the transaction if it's OK to + * @fs_info - the filesystem + * @space_info - the space info we are trying to reserve from + * + * This will check to make sure that committing the transaction will actually + * get us somewhere and then commit the transaction if it does. Otherwise it + * will return -ENOSPC. + */ +static int may_commit_transaction(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info) +{ + struct reserve_ticket *ticket = NULL; + struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv; + struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; + struct btrfs_trans_handle *trans; + u64 bytes_needed; + u64 reclaim_bytes = 0; + + trans = (struct btrfs_trans_handle *)current->journal_info; + if (trans) + return -EAGAIN; + + spin_lock(&space_info->lock); + if (!list_empty(&space_info->priority_tickets)) + ticket = list_first_entry(&space_info->priority_tickets, + struct reserve_ticket, list); + else if (!list_empty(&space_info->tickets)) + ticket = list_first_entry(&space_info->tickets, + struct reserve_ticket, list); + bytes_needed = (ticket) ? ticket->bytes : 0; + spin_unlock(&space_info->lock); + + if (!bytes_needed) + return 0; + + trans = btrfs_join_transaction(fs_info->extent_root); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + /* + * See if there is enough pinned space to make this reservation, or if + * we have block groups that are going to be freed, allowing us to + * possibly do a chunk allocation the next loop through. + */ + if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags) || + __percpu_counter_compare(&space_info->total_bytes_pinned, + bytes_needed, + BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0) + goto commit; + + /* + * See if there is some space in the delayed insertion reservation for + * this reservation. + */ + if (space_info != delayed_rsv->space_info) + goto enospc; + + spin_lock(&delayed_rsv->lock); + reclaim_bytes += delayed_rsv->reserved; + spin_unlock(&delayed_rsv->lock); + + spin_lock(&delayed_refs_rsv->lock); + reclaim_bytes += delayed_refs_rsv->reserved; + spin_unlock(&delayed_refs_rsv->lock); + if (reclaim_bytes >= bytes_needed) + goto commit; + bytes_needed -= reclaim_bytes; + + if (__percpu_counter_compare(&space_info->total_bytes_pinned, + bytes_needed, + BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0) + goto enospc; + +commit: + return btrfs_commit_transaction(trans); +enospc: + btrfs_end_transaction(trans); + return -ENOSPC; +} + +/* + * Try to flush some data based on policy set by @state. This is only advisory + * and may fail for various reasons. The caller is supposed to examine the + * state of @space_info to detect the outcome.
+ */ +static void flush_space(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, u64 num_bytes, + int state) +{ + struct btrfs_root *root = fs_info->extent_root; + struct btrfs_trans_handle *trans; + int nr; + int ret = 0; + + switch (state) { + case FLUSH_DELAYED_ITEMS_NR: + case FLUSH_DELAYED_ITEMS: + if (state == FLUSH_DELAYED_ITEMS_NR) + nr = calc_reclaim_items_nr(fs_info, num_bytes) * 2; + else + nr = -1; + + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + break; + } + ret = btrfs_run_delayed_items_nr(trans, nr); + btrfs_end_transaction(trans); + break; + case FLUSH_DELALLOC: + case FLUSH_DELALLOC_WAIT: + shrink_delalloc(fs_info, num_bytes * 2, num_bytes, + state == FLUSH_DELALLOC_WAIT); + break; + case FLUSH_DELAYED_REFS_NR: + case FLUSH_DELAYED_REFS: + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + break; + } + if (state == FLUSH_DELAYED_REFS_NR) + nr = calc_reclaim_items_nr(fs_info, num_bytes); + else + nr = 0; + btrfs_run_delayed_refs(trans, nr); + btrfs_end_transaction(trans); + break; + case ALLOC_CHUNK: + case ALLOC_CHUNK_FORCE: + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + break; + } + ret = btrfs_chunk_alloc(trans, + btrfs_metadata_alloc_profile(fs_info), + (state == ALLOC_CHUNK) ? CHUNK_ALLOC_NO_FORCE : + CHUNK_ALLOC_FORCE); + btrfs_end_transaction(trans); + if (ret > 0 || ret == -ENOSPC) + ret = 0; + break; + case COMMIT_TRANS: + /* + * If we have pending delayed iputs then we could free up a + * bunch of pinned space, so make sure we run the iputs before + * we do our pinned bytes check below. + */ + btrfs_run_delayed_iputs(fs_info); + btrfs_wait_on_delayed_iputs(fs_info); + + ret = may_commit_transaction(fs_info, space_info); + break; + default: + ret = -ENOSPC; + break; + } + + trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, state, + ret); + return; +} + +static inline u64 +btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + bool system_chunk) +{ + struct reserve_ticket *ticket; + u64 used; + u64 expected; + u64 to_reclaim = 0; + + list_for_each_entry(ticket, &space_info->tickets, list) + to_reclaim += ticket->bytes; + list_for_each_entry(ticket, &space_info->priority_tickets, list) + to_reclaim += ticket->bytes; + if (to_reclaim) + return to_reclaim; + + to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M); + if (can_overcommit(fs_info, space_info, to_reclaim, + BTRFS_RESERVE_FLUSH_ALL, system_chunk)) + return 0; + + used = btrfs_space_info_used(space_info, true); + + if (can_overcommit(fs_info, space_info, SZ_1M, + BTRFS_RESERVE_FLUSH_ALL, system_chunk)) + expected = div_factor_fine(space_info->total_bytes, 95); + else + expected = div_factor_fine(space_info->total_bytes, 90); + + if (used > expected) + to_reclaim = used - expected; + else + to_reclaim = 0; + to_reclaim = min(to_reclaim, space_info->bytes_may_use + + space_info->bytes_reserved); + return to_reclaim; +} + +static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + u64 used, bool system_chunk) +{ + u64 thresh = div_factor_fine(space_info->total_bytes, 98); + + /* If we're just plain full then async reclaim just slows us down. 
*/ + if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh) + return 0; + + if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info, + system_chunk)) + return 0; + + return (used >= thresh && !btrfs_fs_closing(fs_info) && + !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)); +} + +static bool wake_all_tickets(struct list_head *head) +{ + struct reserve_ticket *ticket; + + while (!list_empty(head)) { + ticket = list_first_entry(head, struct reserve_ticket, list); + list_del_init(&ticket->list); + ticket->error = -ENOSPC; + wake_up(&ticket->wait); + if (ticket->bytes != ticket->orig_bytes) + return true; + } + return false; +} + +/* + * This is for normal flushers; we can wait all goddamned day if we want to. We + * will loop and continuously try to flush as long as we are making progress. + * We count progress as clearing off tickets each time we have to loop. + */ +static void btrfs_async_reclaim_metadata_space(struct work_struct *work) +{ + struct btrfs_fs_info *fs_info; + struct btrfs_space_info *space_info; + u64 to_reclaim; + int flush_state; + int commit_cycles = 0; + u64 last_tickets_id; + + fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work); + space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); + + spin_lock(&space_info->lock); + to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info, + false); + if (!to_reclaim) { + space_info->flush = 0; + spin_unlock(&space_info->lock); + return; + } + last_tickets_id = space_info->tickets_id; + spin_unlock(&space_info->lock); + + flush_state = FLUSH_DELAYED_ITEMS_NR; + do { + flush_space(fs_info, space_info, to_reclaim, flush_state); + spin_lock(&space_info->lock); + if (list_empty(&space_info->tickets)) { + space_info->flush = 0; + spin_unlock(&space_info->lock); + return; + } + to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, + space_info, + false); + if (last_tickets_id == space_info->tickets_id) { + flush_state++; + } else { + last_tickets_id = space_info->tickets_id; + flush_state = FLUSH_DELAYED_ITEMS_NR; + if (commit_cycles) + commit_cycles--; + } + + /* + * We don't want to force a chunk allocation until we've tried + * pretty hard to reclaim space. Think of the case where we + * freed up a bunch of space and so have a lot of pinned space + * to reclaim. We would rather use that than possibly create an + * underutilized metadata chunk. So if this is our first run + * through the flushing state machine, skip ALLOC_CHUNK_FORCE and + * commit the transaction. If nothing has changed the next time + * around, then we can force a chunk allocation. + */
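As a reading aid, the ladder this worker walks is the existing btrfs_flush_state enum; a condensed view of the loop's progression rules (the ordering is assumed from the enum as of this series):

	/*
	 * FLUSH_DELAYED_ITEMS_NR -> FLUSH_DELAYED_ITEMS ->
	 * FLUSH_DELALLOC -> FLUSH_DELALLOC_WAIT ->
	 * FLUSH_DELAYED_REFS_NR -> FLUSH_DELAYED_REFS ->
	 * ALLOC_CHUNK -> ALLOC_CHUNK_FORCE -> COMMIT_TRANS
	 *
	 * Any ticket satisfied since the last pass resets the ladder to
	 * FLUSH_DELAYED_ITEMS_NR; roughly three full passes without
	 * progress wake the remaining tickets with -ENOSPC.
	 */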
+ if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles) + flush_state++; + + if (flush_state > COMMIT_TRANS) { + commit_cycles++; + if (commit_cycles > 2) { + if (wake_all_tickets(&space_info->tickets)) { + flush_state = FLUSH_DELAYED_ITEMS_NR; + commit_cycles--; + } else { + space_info->flush = 0; + } + } else { + flush_state = FLUSH_DELAYED_ITEMS_NR; + } + } + spin_unlock(&space_info->lock); + } while (flush_state <= COMMIT_TRANS); +} + +void btrfs_init_async_reclaim_work(struct work_struct *work) +{ + INIT_WORK(work, btrfs_async_reclaim_metadata_space); +} + +static const enum btrfs_flush_state priority_flush_states[] = { + FLUSH_DELAYED_ITEMS_NR, + FLUSH_DELAYED_ITEMS, + ALLOC_CHUNK, +}; + +static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + struct reserve_ticket *ticket) +{ + u64 to_reclaim; + int flush_state; + + spin_lock(&space_info->lock); + to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info, + false); + if (!to_reclaim) { + spin_unlock(&space_info->lock); + return; + } + spin_unlock(&space_info->lock); + + flush_state = 0; + do { + flush_space(fs_info, space_info, to_reclaim, + priority_flush_states[flush_state]); + flush_state++; + spin_lock(&space_info->lock); + if (ticket->bytes == 0) { + spin_unlock(&space_info->lock); + return; + } + spin_unlock(&space_info->lock); + } while (flush_state < ARRAY_SIZE(priority_flush_states)); +} + +static int wait_reserve_ticket(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + struct reserve_ticket *ticket) + +{ + DEFINE_WAIT(wait); + u64 reclaim_bytes = 0; + int ret = 0; + + spin_lock(&space_info->lock); + while (ticket->bytes > 0 && ticket->error == 0) { + ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE); + if (ret) { + ret = -EINTR; + break; + } + spin_unlock(&space_info->lock); + + schedule(); + + finish_wait(&ticket->wait, &wait); + spin_lock(&space_info->lock); + } + if (!ret) + ret = ticket->error; + if (!list_empty(&ticket->list)) + list_del_init(&ticket->list); + if (ticket->bytes && ticket->bytes < ticket->orig_bytes) + reclaim_bytes = ticket->orig_bytes - ticket->bytes; + spin_unlock(&space_info->lock); + + if (reclaim_bytes) + btrfs_space_info_add_old_bytes(fs_info, space_info, + reclaim_bytes); + return ret; +} + +/** + * __reserve_metadata_bytes - try to reserve bytes from a space_info + * @fs_info - the filesystem + * @space_info - the space info we want to allocate from + * @orig_bytes - the number of bytes we want + * @flush - whether or not we can flush to make our reservation + * @system_chunk - set when allocating for the system chunk root + * + * This will reserve orig_bytes number of bytes from the given space_info. If + * there is not enough space it will make an attempt to flush out space to make + * room. It will do this by flushing delalloc if possible or committing the + * transaction. If flush is 0 then no attempts to regain reservations will be + * made and this will fail if there is not enough space already. + */
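Condensed control flow of the function below for the BTRFS_RESERVE_FLUSH_ALL case (an editorial sketch, not patch content):

	/*
	 * used = btrfs_space_info_used(space_info, true);
	 * if (used + orig_bytes <= total_bytes || can_overcommit(...))
	 *	bytes_may_use += orig_bytes;		// immediate success
	 * else
	 *	queue a reserve_ticket, kick async_reclaim_work, and
	 *	wait_reserve_ticket() until ticket.bytes == 0 (success)
	 *	or ticket.error is set (-EINTR / -ENOSPC).
	 */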
+static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + u64 orig_bytes, + enum btrfs_reserve_flush_enum flush, + bool system_chunk) +{ + struct reserve_ticket ticket; + u64 used; + u64 reclaim_bytes = 0; + int ret = 0; + + ASSERT(orig_bytes); + ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL); + + spin_lock(&space_info->lock); + ret = -ENOSPC; + used = btrfs_space_info_used(space_info, true); + + /* + * Carry on if we have enough space (short-circuit) OR call + * can_overcommit() to ensure we can overcommit to continue. + */ + if ((used + orig_bytes <= space_info->total_bytes) || + can_overcommit(fs_info, space_info, orig_bytes, flush, + system_chunk)) { + btrfs_space_info_update_bytes_may_use(fs_info, space_info, + orig_bytes); + trace_btrfs_space_reservation(fs_info, "space_info", + space_info->flags, orig_bytes, 1); + ret = 0; + } + + /* + * If we couldn't make a reservation then set up our reservation ticket + * and kick the async worker if it's not already running. + * + * If we are a priority flusher then we just need to add our ticket to + * the list and we will do our own flushing further down. + */ + if (ret && flush != BTRFS_RESERVE_NO_FLUSH) { + ticket.orig_bytes = orig_bytes; + ticket.bytes = orig_bytes; + ticket.error = 0; + init_waitqueue_head(&ticket.wait); + if (flush == BTRFS_RESERVE_FLUSH_ALL) { + list_add_tail(&ticket.list, &space_info->tickets); + if (!space_info->flush) { + space_info->flush = 1; + trace_btrfs_trigger_flush(fs_info, + space_info->flags, + orig_bytes, flush, + "enospc"); + queue_work(system_unbound_wq, + &fs_info->async_reclaim_work); + } + } else { + list_add_tail(&ticket.list, + &space_info->priority_tickets); + } + } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) { + used += orig_bytes; + /* + * We will do the space reservation dance during log replay, + * which means we won't have fs_info->fs_root set, so don't do + * the async reclaim as we will panic. + */ + if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) && + need_do_async_reclaim(fs_info, space_info, + used, system_chunk) && + !work_busy(&fs_info->async_reclaim_work)) { + trace_btrfs_trigger_flush(fs_info, space_info->flags, + orig_bytes, flush, "preempt"); + queue_work(system_unbound_wq, + &fs_info->async_reclaim_work); + } + } + spin_unlock(&space_info->lock); + if (!ret || flush == BTRFS_RESERVE_NO_FLUSH) + return ret; + + if (flush == BTRFS_RESERVE_FLUSH_ALL) + return wait_reserve_ticket(fs_info, space_info, &ticket); + + ret = 0; + priority_reclaim_metadata_space(fs_info, space_info, &ticket); + spin_lock(&space_info->lock); + if (ticket.bytes) { + if (ticket.bytes < orig_bytes) + reclaim_bytes = orig_bytes - ticket.bytes; + list_del_init(&ticket.list); + ret = -ENOSPC; + } + spin_unlock(&space_info->lock); + + if (reclaim_bytes) + btrfs_space_info_add_old_bytes(fs_info, space_info, + reclaim_bytes); + ASSERT(list_empty(&ticket.list)); + return ret; +} + +/** + * btrfs_reserve_metadata_bytes - try to reserve bytes from the block_rsv's space + * @root - the root we're allocating for + * @block_rsv - the block_rsv we're allocating for + * @orig_bytes - the number of bytes we want + * @flush - whether or not we can flush to make our reservation + * + * This will reserve orig_bytes number of bytes from the space info associated + * with the block_rsv. If there is not enough space it will make an attempt to + * flush out space to make room.
It will do this by flushing delalloc if + * possible or committing the transaction. If flush is 0 then no attempts to + * regain reservations will be made and this will fail if there is not enough + * space already. + */ +int btrfs_reserve_metadata_bytes(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 orig_bytes, + enum btrfs_reserve_flush_enum flush) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; + int ret; + bool system_chunk = (root == fs_info->chunk_root); + + ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info, + orig_bytes, flush, system_chunk); + if (ret == -ENOSPC && + unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) { + if (block_rsv != global_rsv && + !btrfs_block_rsv_use_bytes(global_rsv, orig_bytes)) + ret = 0; + } + if (ret == -ENOSPC) { + trace_btrfs_space_reservation(fs_info, "space_info:enospc", + block_rsv->space_info->flags, + orig_bytes, 1); + + if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) + btrfs_dump_space_info(fs_info, block_rsv->space_info, + orig_bytes, 0); + } + return ret; +} diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h new file mode 100644 index 000000000000..c2b54b8e1a14 --- /dev/null +++ b/fs/btrfs/space-info.h @@ -0,0 +1,133 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_SPACE_INFO_H +#define BTRFS_SPACE_INFO_H + +struct btrfs_space_info { + spinlock_t lock; + + u64 total_bytes; /* total bytes in the space, + this doesn't take mirrors into account */ + u64 bytes_used; /* total bytes used, + this doesn't take mirrors into account */ + u64 bytes_pinned; /* total bytes pinned, will be freed when the + transaction finishes */ + u64 bytes_reserved; /* total bytes the allocator has reserved for + current allocations */ + u64 bytes_may_use; /* number of bytes that may be used for + delalloc/allocations */ + u64 bytes_readonly; /* total bytes that are read only */ + + u64 max_extent_size; /* This will hold the maximum extent size of + the space info if we had an ENOSPC in the + allocator. */ + + unsigned int full:1; /* indicates that we cannot allocate any more + chunks for this space */ + unsigned int chunk_alloc:1; /* set if we are allocating a chunk */ + + unsigned int flush:1; /* set if we are trying to make space */ + + unsigned int force_alloc; /* set if we need to force a chunk + alloc for this space */ + + u64 disk_used; /* total bytes used on disk */ + u64 disk_total; /* total bytes on disk, takes mirrors into + account */ + + u64 flags; + + /* + * bytes_pinned is kept in line with what is actually pinned, as in + * we've called update_block_group and dropped the bytes_used counter + * and increased the bytes_pinned counter. However this means that + * bytes_pinned does not reflect the bytes that will be pinned once the + * delayed refs are flushed, so this counter is inc'ed every time we + * call btrfs_free_extent so it is a realtime count of what will be + * freed once the transaction is committed. It will be zeroed every + * time the transaction commits. + */ + struct percpu_counter total_bytes_pinned; + + struct list_head list; + /* Protected by the spinlock 'lock'. */ + struct list_head ro_bgs; + struct list_head priority_tickets; + struct list_head tickets; + /* + * tickets_id just indicates the next ticket will be handled, so note + * it's not stored per ticket. 
+ */ + u64 tickets_id; + + struct rw_semaphore groups_sem; + /* for block groups in our same type */ + struct list_head block_groups[BTRFS_NR_RAID_TYPES]; + wait_queue_head_t wait; + + struct kobject kobj; + struct kobject *block_group_kobjs[BTRFS_NR_RAID_TYPES]; +}; + +struct reserve_ticket { + u64 orig_bytes; + u64 bytes; + int error; + struct list_head list; + wait_queue_head_t wait; +}; + +static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info) +{ + return ((space_info->flags & BTRFS_BLOCK_GROUP_METADATA) && + (space_info->flags & BTRFS_BLOCK_GROUP_DATA)); +} + +/* + * + * Declare a helper function to detect underflow of various space info members + */ +#define DECLARE_SPACE_INFO_UPDATE(name) \ +static inline void \ +btrfs_space_info_update_##name(struct btrfs_fs_info *fs_info, \ + struct btrfs_space_info *sinfo, \ + s64 bytes) \ +{ \ + lockdep_assert_held(&sinfo->lock); \ + trace_update_##name(fs_info, sinfo, sinfo->name, bytes); \ + if (bytes < 0 && sinfo->name < -bytes) { \ + WARN_ON(1); \ + sinfo->name = 0; \ + return; \ + } \ + sinfo->name += bytes; \ +} + +DECLARE_SPACE_INFO_UPDATE(bytes_may_use); +DECLARE_SPACE_INFO_UPDATE(bytes_pinned); + +void btrfs_space_info_add_new_bytes(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + u64 num_bytes); +void btrfs_space_info_add_old_bytes(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + u64 num_bytes); +int btrfs_init_space_info(struct btrfs_fs_info *fs_info); +void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags, + u64 total_bytes, u64 bytes_used, + u64 bytes_readonly, + struct btrfs_space_info **space_info); +struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info, + u64 flags); +u64 btrfs_space_info_used(struct btrfs_space_info *s_info, + bool may_use_included); +void btrfs_clear_space_info_full(struct btrfs_fs_info *info); +void btrfs_dump_space_info(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *info, u64 bytes, + int dump_block_groups); +int btrfs_reserve_metadata_bytes(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 orig_bytes, + enum btrfs_reserve_flush_enum flush); + +#endif /* BTRFS_SPACE_INFO_H */ diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 0645ec428b4f..78de9d5d80c6 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -42,6 +42,7 @@ #include "dev-replace.h" #include "free-space-cache.h" #include "backref.h" +#include "space-info.h" #include "tests/btrfs-tests.h" #include "qgroup.h" @@ -1553,6 +1554,8 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type, } else { snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev); btrfs_sb(s)->bdev_holder = fs_type; + if (!strstr(crc32c_impl(), "generic")) + set_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags); error = btrfs_fill_super(s, fs_devices, data); } if (!error) @@ -1601,14 +1604,10 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, { struct vfsmount *mnt_root; struct dentry *root; - fmode_t mode = FMODE_READ; char *subvol_name = NULL; u64 subvol_objectid = 0; int error = 0; - if (!(flags & SB_RDONLY)) - mode |= FMODE_WRITE; - error = btrfs_parse_subvol_options(data, &subvol_name, &subvol_objectid); if (error) { @@ -1904,8 +1903,9 @@ static inline int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info, u64 type; u64 avail_space; u64 min_stripe_size; - int min_stripes = 1, num_stripes = 1; + int min_stripes, num_stripes = 1; int i = 0, nr_devices; + const struct btrfs_raid_attr *rattr; /* * We 
aren't under the device list lock, so this is racy-ish, but good @@ -1929,21 +1929,18 @@ static inline int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info, /* calc min stripe number for data space allocation */ type = btrfs_data_alloc_profile(fs_info); - if (type & BTRFS_BLOCK_GROUP_RAID0) { - min_stripes = 2; + rattr = &btrfs_raid_array[btrfs_bg_flags_to_raid_index(type)]; + min_stripes = rattr->devs_min; + + if (type & BTRFS_BLOCK_GROUP_RAID0) num_stripes = nr_devices; - } else if (type & BTRFS_BLOCK_GROUP_RAID1) { - min_stripes = 2; + else if (type & BTRFS_BLOCK_GROUP_RAID1) num_stripes = 2; - } else if (type & BTRFS_BLOCK_GROUP_RAID10) { - min_stripes = 4; + else if (type & BTRFS_BLOCK_GROUP_RAID10) num_stripes = 4; - } - if (type & BTRFS_BLOCK_GROUP_DUP) - min_stripe_size = 2 * BTRFS_STRIPE_LEN; - else - min_stripe_size = BTRFS_STRIPE_LEN; + /* Adjust for more than 1 stripe per device */ + min_stripe_size = rattr->dev_stripes * BTRFS_STRIPE_LEN; rcu_read_lock(); list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) { @@ -2466,3 +2463,4 @@ late_initcall(init_btrfs_fs); module_exit(exit_btrfs_fs) MODULE_LICENSE("GPL"); +MODULE_SOFTDEP("pre: crc32c"); diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index c1dfc97893ba..9539f8143b7a 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -16,6 +16,7 @@ #include "transaction.h" #include "sysfs.h" #include "volumes.h" +#include "space-info.h" static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj); static inline struct btrfs_fs_devices *to_fs_devs(struct kobject *kobj); diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index 9238fd4f1734..1e3ba4949399 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -5,6 +5,7 @@ #include <linux/fs.h> #include <linux/mount.h> +#include <linux/pseudo_fs.h> #include <linux/magic.h> #include "btrfs-tests.h" #include "../ctree.h" @@ -32,17 +33,19 @@ static const struct super_operations btrfs_test_super_ops = { .destroy_inode = btrfs_test_destroy_inode, }; -static struct dentry *btrfs_test_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, - void *data) + +static int btrfs_test_init_fs_context(struct fs_context *fc) { - return mount_pseudo(fs_type, "btrfs_test:", &btrfs_test_super_ops, - NULL, BTRFS_TEST_MAGIC); + struct pseudo_fs_context *ctx = init_pseudo(fc, BTRFS_TEST_MAGIC); + if (!ctx) + return -ENOMEM; + ctx->ops = &btrfs_test_super_ops; + return 0; } static struct file_system_type test_type = { .name = "btrfs_test_fs", - .mount = btrfs_test_mount, + .init_fs_context = btrfs_test_init_fs_context, .kill_sb = kill_anon_super, }; diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index 7bf4d5734dbe..1bf6b5a79191 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -10,6 +10,7 @@ #include "btrfs-tests.h" #include "../ctree.h" #include "../extent_io.h" +#include "../btrfs_inode.h" #define PROCESS_UNLOCK (1 << 0) #define PROCESS_RELEASE (1 << 1) @@ -58,7 +59,7 @@ static noinline int process_page_range(struct inode *inode, u64 start, u64 end, static int test_find_delalloc(u32 sectorsize) { struct inode *inode; - struct extent_io_tree tmp; + struct extent_io_tree *tmp; struct page *page; struct page *locked_page = NULL; unsigned long index = 0; @@ -76,12 +77,13 @@ static int test_find_delalloc(u32 sectorsize) test_std_err(TEST_ALLOC_INODE); return -ENOMEM; } + tmp = &BTRFS_I(inode)->io_tree; /* * Passing NULL as we don't have fs_info but 
tracepoints are not used * at this point */ - extent_io_tree_init(NULL, &tmp, IO_TREE_SELFTEST, NULL); + extent_io_tree_init(NULL, tmp, IO_TREE_SELFTEST, NULL); /* * First go through and create and mark all of our pages dirty, we pin @@ -108,10 +110,10 @@ static int test_find_delalloc(u32 sectorsize) * |--- delalloc ---| * |--- search ---| */ - set_extent_delalloc(&tmp, 0, sectorsize - 1, 0, NULL); + set_extent_delalloc(tmp, 0, sectorsize - 1, 0, NULL); start = 0; end = 0; - found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, + found = find_lock_delalloc_range(inode, locked_page, &start, &end); if (!found) { test_err("should have found at least one delalloc"); @@ -122,7 +124,7 @@ static int test_find_delalloc(u32 sectorsize) sectorsize - 1, start, end); goto out_bits; } - unlock_extent(&tmp, start, end); + unlock_extent(tmp, start, end); unlock_page(locked_page); put_page(locked_page); @@ -139,10 +141,10 @@ static int test_find_delalloc(u32 sectorsize) test_err("couldn't find the locked page"); goto out_bits; } - set_extent_delalloc(&tmp, sectorsize, max_bytes - 1, 0, NULL); + set_extent_delalloc(tmp, sectorsize, max_bytes - 1, 0, NULL); start = test_start; end = 0; - found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, + found = find_lock_delalloc_range(inode, locked_page, &start, &end); if (!found) { test_err("couldn't find delalloc in our range"); @@ -158,7 +160,7 @@ static int test_find_delalloc(u32 sectorsize) test_err("there were unlocked pages in the range"); goto out_bits; } - unlock_extent(&tmp, start, end); + unlock_extent(tmp, start, end); /* locked_page was unlocked above */ put_page(locked_page); @@ -176,7 +178,7 @@ static int test_find_delalloc(u32 sectorsize) } start = test_start; end = 0; - found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, + found = find_lock_delalloc_range(inode, locked_page, &start, &end); if (found) { test_err("found range when we shouldn't have"); @@ -194,10 +196,10 @@ static int test_find_delalloc(u32 sectorsize) * * We are re-using our test_start from above since it works out well. */ - set_extent_delalloc(&tmp, max_bytes, total_dirty - 1, 0, NULL); + set_extent_delalloc(tmp, max_bytes, total_dirty - 1, 0, NULL); start = test_start; end = 0; - found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, + found = find_lock_delalloc_range(inode, locked_page, &start, &end); if (!found) { test_err("didn't find our range"); @@ -213,7 +215,7 @@ static int test_find_delalloc(u32 sectorsize) test_err("pages in range were not all locked"); goto out_bits; } - unlock_extent(&tmp, start, end); + unlock_extent(tmp, start, end); /* * Now to test where we run into a page that is no longer dirty in the @@ -238,7 +240,7 @@ static int test_find_delalloc(u32 sectorsize) * this changes at any point in the future we will need to fix this * test's expected behavior. 
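For orientation amid the selftest churn above: every rewritten call exercises one primitive, namely "starting at an offset, find the first contiguous delalloc run, clamped to a maximum length". A toy model over a bitmap of dirty sectors (the names and the MAX_RUN clamp are illustrative, not the kernel API):

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdio.h>

    #define NSEC 16
    #define MAX_RUN 4 /* stand-in for the kernel's max-extent clamping */

    /* Find the first run of delalloc sectors at or after *start; returns
     * false if none. On success *start/*end delimit the clamped run,
     * inclusive. */
    static bool find_delalloc(const bool d[NSEC], size_t *start, size_t *end)
    {
        size_t i = *start;
        while (i < NSEC && !d[i])
            i++;
        if (i == NSEC)
            return false;
        *start = i;
        while (i + 1 < NSEC && d[i + 1] && (i + 1 - *start) < MAX_RUN)
            i++;
        *end = i;
        return true;
    }

    int main(void)
    {
        bool dirty[NSEC] = {0};
        for (size_t i = 5; i < 12; i++)
            dirty[i] = true;
        size_t s = 0, e = 0;
        if (find_delalloc(dirty, &s, &e))
            printf("found [%zu, %zu]\n", s, e); /* [5, 8]: clamped to 4 */
        return 0;
    }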
*/ - found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, + found = find_lock_delalloc_range(inode, locked_page, &start, &end); if (!found) { test_err("didn't find our range"); @@ -256,7 +258,7 @@ static int test_find_delalloc(u32 sectorsize) } ret = 0; out_bits: - clear_extent_bits(&tmp, 0, total_dirty - 1, (unsigned)-1); + clear_extent_bits(tmp, 0, total_dirty - 1, (unsigned)-1); out: if (locked_page) put_page(locked_page); @@ -432,6 +434,89 @@ out: return ret; } +static int test_find_first_clear_extent_bit(void) +{ + struct extent_io_tree tree; + u64 start, end; + + test_msg("running find_first_clear_extent_bit test"); + extent_io_tree_init(NULL, &tree, IO_TREE_SELFTEST, NULL); + + /* + * Set 1M-4M alloc/discard and 32M-64M thus leaving a hole between + * 4M-32M + */ + set_extent_bits(&tree, SZ_1M, SZ_4M - 1, + CHUNK_TRIMMED | CHUNK_ALLOCATED); + + find_first_clear_extent_bit(&tree, SZ_512K, &start, &end, + CHUNK_TRIMMED | CHUNK_ALLOCATED); + + if (start != 0 || end != SZ_1M -1) + test_err("error finding beginning range: start %llu end %llu", + start, end); + + /* Now add 32M-64M so that we have a hole between 4M-32M */ + set_extent_bits(&tree, SZ_32M, SZ_64M - 1, + CHUNK_TRIMMED | CHUNK_ALLOCATED); + + /* + * Request first hole starting at 12M, we should get 4M-32M + */ + find_first_clear_extent_bit(&tree, 12 * SZ_1M, &start, &end, + CHUNK_TRIMMED | CHUNK_ALLOCATED); + + if (start != SZ_4M || end != SZ_32M - 1) + test_err("error finding trimmed range: start %llu end %llu", + start, end); + + /* + * Search in the middle of allocated range, should get the next one + * available, which happens to be unallocated -> 4M-32M + */ + find_first_clear_extent_bit(&tree, SZ_2M, &start, &end, + CHUNK_TRIMMED | CHUNK_ALLOCATED); + + if (start != SZ_4M || end != SZ_32M -1) + test_err("error finding next unalloc range: start %llu end %llu", + start, end); + + /* + * Set 64M-72M with CHUNK_ALLOC flag, then search for CHUNK_TRIMMED flag + * being unset in this range, we should get the entry in range 64M-72M + */ + set_extent_bits(&tree, SZ_64M, SZ_64M + SZ_8M - 1, CHUNK_ALLOCATED); + find_first_clear_extent_bit(&tree, SZ_64M + SZ_1M, &start, &end, + CHUNK_TRIMMED); + + if (start != SZ_64M || end != SZ_64M + SZ_8M - 1) + test_err("error finding exact range: start %llu end %llu", + start, end); + + find_first_clear_extent_bit(&tree, SZ_64M - SZ_8M, &start, &end, + CHUNK_TRIMMED); + + /* + * Search in the middle of set range whose immediate neighbour doesn't + * have the bits set so it must be returned + */ + if (start != SZ_64M || end != SZ_64M + SZ_8M - 1) + test_err("error finding next alloc range: start %llu end %llu", + start, end); + + /* + * Search beyond any known range, shall return after last known range + * and end should be -1 + */ + find_first_clear_extent_bit(&tree, -1, &start, &end, CHUNK_TRIMMED); + if (start != SZ_64M + SZ_8M || end != -1) + test_err( + "error handling beyond end of range search: start %llu end %llu", + start, end); + + return 0; +} + int btrfs_test_extent_io(u32 sectorsize, u32 nodesize) { int ret; @@ -442,6 +527,10 @@ int btrfs_test_extent_io(u32 sectorsize, u32 nodesize) if (ret) goto out; + ret = test_find_first_clear_extent_bit(); + if (ret) + goto out; + ret = test_eb_bitmaps(sectorsize, nodesize); out: return ret; diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c index 87aeabe9d610..4a7f796c9900 100644 --- a/fs/btrfs/tests/extent-map-tests.c +++ b/fs/btrfs/tests/extent-map-tests.c @@ -66,7 +66,9 @@ static int 
test_case_1(struct btrfs_fs_info *fs_info, em->len = SZ_16K; em->block_start = 0; em->block_len = SZ_16K; + write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em, 0); + write_unlock(&em_tree->lock); if (ret < 0) { test_err("cannot add extent range [0, 16K)"); goto out; @@ -85,7 +87,9 @@ static int test_case_1(struct btrfs_fs_info *fs_info, em->len = SZ_4K; em->block_start = SZ_32K; /* avoid merging */ em->block_len = SZ_4K; + write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em, 0); + write_unlock(&em_tree->lock); if (ret < 0) { test_err("cannot add extent range [16K, 20K)"); goto out; @@ -104,7 +108,9 @@ static int test_case_1(struct btrfs_fs_info *fs_info, em->len = len; em->block_start = start; em->block_len = len; + write_lock(&em_tree->lock); ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len); + write_unlock(&em_tree->lock); if (ret) { test_err("case1 [%llu %llu]: ret %d", start, start + len, ret); goto out; @@ -148,7 +154,9 @@ static int test_case_2(struct btrfs_fs_info *fs_info, em->len = SZ_1K; em->block_start = EXTENT_MAP_INLINE; em->block_len = (u64)-1; + write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em, 0); + write_unlock(&em_tree->lock); if (ret < 0) { test_err("cannot add extent range [0, 1K)"); goto out; @@ -167,7 +175,9 @@ static int test_case_2(struct btrfs_fs_info *fs_info, em->len = SZ_4K; em->block_start = SZ_4K; em->block_len = SZ_4K; + write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em, 0); + write_unlock(&em_tree->lock); if (ret < 0) { test_err("cannot add extent range [4K, 8K)"); goto out; @@ -186,7 +196,9 @@ static int test_case_2(struct btrfs_fs_info *fs_info, em->len = SZ_1K; em->block_start = EXTENT_MAP_INLINE; em->block_len = (u64)-1; + write_lock(&em_tree->lock); ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len); + write_unlock(&em_tree->lock); if (ret) { test_err("case2 [0 1K]: ret %d", ret); goto out; @@ -225,7 +237,9 @@ static int __test_case_3(struct btrfs_fs_info *fs_info, em->len = SZ_4K; em->block_start = SZ_4K; em->block_len = SZ_4K; + write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em, 0); + write_unlock(&em_tree->lock); if (ret < 0) { test_err("cannot add extent range [4K, 8K)"); goto out; @@ -244,7 +258,9 @@ static int __test_case_3(struct btrfs_fs_info *fs_info, em->len = SZ_16K; em->block_start = 0; em->block_len = SZ_16K; + write_lock(&em_tree->lock); ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len); + write_unlock(&em_tree->lock); if (ret) { test_err("case3 [0x%llx 0x%llx): ret %d", start, start + len, ret); @@ -320,7 +336,9 @@ static int __test_case_4(struct btrfs_fs_info *fs_info, em->len = SZ_8K; em->block_start = 0; em->block_len = SZ_8K; + write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em, 0); + write_unlock(&em_tree->lock); if (ret < 0) { test_err("cannot add extent range [0, 8K)"); goto out; @@ -339,7 +357,9 @@ static int __test_case_4(struct btrfs_fs_info *fs_info, em->len = 24 * SZ_1K; em->block_start = SZ_16K; /* avoid merging */ em->block_len = 24 * SZ_1K; + write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em, 0); + write_unlock(&em_tree->lock); if (ret < 0) { test_err("cannot add extent range [8K, 32K)"); goto out; @@ -357,7 +377,9 @@ static int __test_case_4(struct btrfs_fs_info *fs_info, em->len = SZ_32K; em->block_start = 0; em->block_len = SZ_32K; + write_lock(&em_tree->lock); ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len); + write_unlock(&em_tree->lock); 
if (ret) { test_err("case4 [0x%llx 0x%llx): ret %d", start, len, ret); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 3f6811cdf803..3b8ae1a8f02d 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -129,6 +129,24 @@ static inline int extwriter_counter_read(struct btrfs_transaction *trans) } /* + * To be called after all the new block groups attached to the transaction + * handle have been created (btrfs_create_pending_block_groups()). + */ +void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + + if (!trans->chunk_bytes_reserved) + return; + + WARN_ON_ONCE(!list_empty(&trans->new_bgs)); + + btrfs_block_rsv_release(fs_info, &fs_info->chunk_block_rsv, + trans->chunk_bytes_reserved); + trans->chunk_bytes_reserved = 0; +} + +/* * either allocate a new transaction or hop into the existing one */ static noinline int join_transaction(struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 78c446c222b7..527ea94b57d9 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -224,5 +224,6 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction); void btrfs_apply_pending_changes(struct btrfs_fs_info *fs_info); void btrfs_add_dropped_root(struct btrfs_trans_handle *trans, struct btrfs_root *root); +void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans); #endif diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 96fce4bef4e7..ccd5706199d7 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -132,6 +132,7 @@ static int check_extent_data_item(struct extent_buffer *leaf, struct btrfs_file_extent_item *fi; u32 sectorsize = fs_info->sectorsize; u32 item_size = btrfs_item_size_nr(leaf, slot); + u64 extent_end; if (!IS_ALIGNED(key->offset, sectorsize)) { file_extent_err(leaf, slot, @@ -207,6 +208,16 @@ static int check_extent_data_item(struct extent_buffer *leaf, CHECK_FE_ALIGNED(leaf, slot, fi, num_bytes, sectorsize)) return -EUCLEAN; + /* Catch extent end overflow */ + if (check_add_overflow(btrfs_file_extent_num_bytes(leaf, fi), + key->offset, &extent_end)) { + file_extent_err(leaf, slot, + "extent end overflow, have file offset %llu extent num bytes %llu", + key->offset, + btrfs_file_extent_num_bytes(leaf, fi)); + return -EUCLEAN; + } + /* * Check that no two consecutive file extent items, in the same leaf, * present ranges that overlap each other. diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 3fc8d854d7fb..6c8297bcfeb7 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3323,6 +3323,30 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, } /* + * Check if an inode was logged in the current transaction. We can't always rely + * on an inode's logged_trans value, because it's an in-memory only field and + * therefore not persisted. This means that its value is lost if the inode gets + * evicted and loaded again from disk (in which case it has a value of 0, and + * certainly it is smaller than any possible transaction ID). When that happens, + * the full_sync flag is set in the inode's runtime flags, so in that case we + * assume eviction happened, ignore the logged_trans value and assume the + * worst case: that the inode was logged before in the current transaction. 
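The tree-checker hunk above guards the extent end computation with check_add_overflow() so a crafted item cannot wrap past U64_MAX. The same guard can be written in userspace with the GCC/Clang builtin (an illustration, not the kernel macro):

    #include <stdint.h>
    #include <stdio.h>

    /* Returns 0 and stores the extent end, or -1 if offset + num_bytes
     * would wrap; the kernel returns -EUCLEAN on this path. */
    static int extent_end(uint64_t offset, uint64_t num_bytes, uint64_t *end)
    {
        if (__builtin_add_overflow(offset, num_bytes, end))
            return -1;
        return 0;
    }

    int main(void)
    {
        uint64_t end;
        printf("%d\n", extent_end(4096, 8192, &end));           /* 0 */
        printf("%d\n", extent_end(UINT64_MAX - 1, 4096, &end)); /* -1 */
        return 0;
    }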
+ */ +static bool inode_logged(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode) +{ + if (inode->logged_trans == trans->transid) + return true; + + if (inode->last_trans == trans->transid && + test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) && + !test_bit(BTRFS_FS_LOG_RECOVERING, &trans->fs_info->flags)) + return true; + + return false; +} + +/* * If both a file and directory are logged, and unlinks or renames are * mixed in, we have a few interesting corners: * @@ -3356,7 +3380,7 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, int bytes_del = 0; u64 dir_ino = btrfs_ino(dir); - if (dir->logged_trans < trans->transid) + if (!inode_logged(trans, dir)) return 0; ret = join_running_log_trans(root); @@ -3460,7 +3484,7 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, u64 index; int ret; - if (inode->logged_trans < trans->transid) + if (!inode_logged(trans, inode)) return 0; ret = join_running_log_trans(root); @@ -5420,9 +5444,19 @@ log_extents: } } + /* + * Don't update last_log_commit if we logged that an inode exists after + * it was loaded to memory (full_sync bit set). + * This is to prevent data loss when we do a write to the inode, then + * the inode gets evicted after all delalloc was flushed, then we log + * it exists (due to a rename for example) and then fsync it. This last + * fsync would do nothing (not logging the extents previously written). + */ spin_lock(&inode->lock); inode->logged_trans = trans->transid; - inode->last_log_commit = inode->last_sub_trans; + if (inode_only != LOG_INODE_EXISTS || + !test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags)) + inode->last_log_commit = inode->last_sub_trans; spin_unlock(&inode->lock); out_unlock: mutex_unlock(&inode->log_mutex); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 1c2a6e4b39da..a13ddba1ebc3 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -28,6 +28,7 @@ #include "dev-replace.h" #include "sysfs.h" #include "tree-checker.h" +#include "space-info.h" const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { [BTRFS_RAID_RAID10] = { @@ -123,12 +124,14 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { }, }; -const char *get_raid_name(enum btrfs_raid_types type) +const char *btrfs_bg_type_to_raid_name(u64 flags) { - if (type >= BTRFS_NR_RAID_TYPES) + const int index = btrfs_bg_flags_to_raid_index(flags); + + if (index >= BTRFS_NR_RAID_TYPES) return NULL; - return btrfs_raid_array[type].raid_name; + return btrfs_raid_array[index].raid_name; } /* @@ -237,7 +240,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, * chunk_mutex * ----------- * protects chunks, adding or removing during allocation, trim or when a new - * device is added/removed + * device is added/removed. Additionally it also protects post_commit_list of + * individual devices, since they can be added to the transaction's + * post_commit_list only with chunk_mutex held. 
* * cleaner_mutex * ------------- @@ -1818,7 +1823,7 @@ static u64 find_next_chunk(struct btrfs_fs_info *fs_info) struct rb_node *n; u64 ret = 0; - em_tree = &fs_info->mapping_tree.map_tree; + em_tree = &fs_info->mapping_tree; read_lock(&em_tree->lock); n = rb_last(&em_tree->map.rb_root); if (n) { @@ -2941,7 +2946,7 @@ struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info, struct extent_map_tree *em_tree; struct extent_map *em; - em_tree = &fs_info->mapping_tree.map_tree; + em_tree = &fs_info->mapping_tree; read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, logical, length); read_unlock(&em_tree->lock); @@ -3474,6 +3479,18 @@ static int chunk_devid_filter(struct extent_buffer *leaf, return 1; } +static u64 calc_data_stripes(u64 type, int num_stripes) +{ + const int index = btrfs_bg_flags_to_raid_index(type); + const int ncopies = btrfs_raid_array[index].ncopies; + const int nparity = btrfs_raid_array[index].nparity; + + if (nparity) + return num_stripes - nparity; + else + return num_stripes / ncopies; +} + /* [pstart, pend) */ static int chunk_drange_filter(struct extent_buffer *leaf, struct btrfs_chunk *chunk, @@ -3483,22 +3500,15 @@ static int chunk_drange_filter(struct extent_buffer *leaf, int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); u64 stripe_offset; u64 stripe_length; + u64 type; int factor; int i; if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID)) return 0; - if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP | - BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) { - factor = num_stripes / 2; - } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID5) { - factor = num_stripes - 1; - } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID6) { - factor = num_stripes - 2; - } else { - factor = num_stripes; - } + type = btrfs_chunk_type(leaf, chunk); + factor = calc_data_stripes(type, num_stripes); for (i = 0; i < num_stripes; i++) { stripe = btrfs_stripe_nr(chunk, i); @@ -3921,11 +3931,9 @@ static void describe_balance_args(struct btrfs_balance_args *bargs, char *buf, bp += ret; \ } while (0) - if (flags & BTRFS_BALANCE_ARGS_CONVERT) { - int index = btrfs_bg_flags_to_raid_index(bargs->target); - - CHECK_APPEND_1ARG("convert=%s,", get_raid_name(index)); - } + if (flags & BTRFS_BALANCE_ARGS_CONVERT) + CHECK_APPEND_1ARG("convert=%s,", + btrfs_bg_type_to_raid_name(bargs->target)); if (flags & BTRFS_BALANCE_ARGS_SOFT) CHECK_APPEND_NOARG("soft,"); @@ -4047,6 +4055,7 @@ int btrfs_balance(struct btrfs_fs_info *fs_info, u64 num_devices; unsigned seq; bool reducing_integrity; + int i; if (btrfs_fs_closing(fs_info) || atomic_read(&fs_info->balance_pause_req) || @@ -4076,48 +4085,43 @@ int btrfs_balance(struct btrfs_fs_info *fs_info, } num_devices = btrfs_num_devices(fs_info); + allowed = 0; + for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) + if (num_devices >= btrfs_raid_array[i].devs_min) + allowed |= btrfs_raid_array[i].bg_flag; - allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE | BTRFS_BLOCK_GROUP_DUP; - if (num_devices > 1) - allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1); - if (num_devices > 2) - allowed |= BTRFS_BLOCK_GROUP_RAID5; - if (num_devices > 3) - allowed |= (BTRFS_BLOCK_GROUP_RAID10 | - BTRFS_BLOCK_GROUP_RAID6); if (validate_convert_profile(&bctl->data, allowed)) { - int index = btrfs_bg_flags_to_raid_index(bctl->data.target); - btrfs_err(fs_info, "balance: invalid convert data profile %s", - get_raid_name(index)); + btrfs_bg_type_to_raid_name(bctl->data.target)); ret = -EINVAL; goto out; } if 
(validate_convert_profile(&bctl->meta, allowed)) { - int index = btrfs_bg_flags_to_raid_index(bctl->meta.target); - btrfs_err(fs_info, "balance: invalid convert metadata profile %s", - get_raid_name(index)); + btrfs_bg_type_to_raid_name(bctl->meta.target)); ret = -EINVAL; goto out; } if (validate_convert_profile(&bctl->sys, allowed)) { - int index = btrfs_bg_flags_to_raid_index(bctl->sys.target); - btrfs_err(fs_info, "balance: invalid convert system profile %s", - get_raid_name(index)); + btrfs_bg_type_to_raid_name(bctl->sys.target)); ret = -EINVAL; goto out; } - /* allow to reduce meta or sys integrity only if force set */ - allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID10 | - BTRFS_BLOCK_GROUP_RAID5 | - BTRFS_BLOCK_GROUP_RAID6; + /* + * Allow to reduce metadata or system integrity only if force set for + * profiles with redundancy (copies, parity) + */ + allowed = 0; + for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) { + if (btrfs_raid_array[i].ncopies >= 2 || + btrfs_raid_array[i].tolerated_failures >= 1) + allowed |= btrfs_raid_array[i].bg_flag; + } do { seq = read_seqbegin(&fs_info->profiles_lock); @@ -4152,12 +4156,18 @@ int btrfs_balance(struct btrfs_fs_info *fs_info, if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) < btrfs_get_num_tolerated_disk_barrier_failures(data_target)) { - int meta_index = btrfs_bg_flags_to_raid_index(meta_target); - int data_index = btrfs_bg_flags_to_raid_index(data_target); - btrfs_warn(fs_info, "balance: metadata profile %s has lower redundancy than data profile %s", - get_raid_name(meta_index), get_raid_name(data_index)); + btrfs_bg_type_to_raid_name(meta_target), + btrfs_bg_type_to_raid_name(data_target)); + } + + if (fs_info->send_in_progress) { + btrfs_warn_rl(fs_info, +"cannot run balance while send operations are in progress (%d in progress)", + fs_info->send_in_progress); + ret = -EAGAIN; + goto out; } ret = insert_balance_item(fs_info, bctl); @@ -4949,6 +4959,8 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, sub_stripes = btrfs_raid_array[index].sub_stripes; dev_stripes = btrfs_raid_array[index].dev_stripes; devs_max = btrfs_raid_array[index].devs_max; + if (!devs_max) + devs_max = BTRFS_MAX_DEVS(info); devs_min = btrfs_raid_array[index].devs_min; devs_increment = btrfs_raid_array[index].devs_increment; ncopies = btrfs_raid_array[index].ncopies; @@ -4957,8 +4969,6 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, if (type & BTRFS_BLOCK_GROUP_DATA) { max_stripe_size = SZ_1G; max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE; - if (!devs_max) - devs_max = BTRFS_MAX_DEVS(info); } else if (type & BTRFS_BLOCK_GROUP_METADATA) { /* for larger filesystems, use larger metadata chunks */ if (fs_devices->total_rw_bytes > 50ULL * SZ_1G) @@ -4966,13 +4976,9 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, else max_stripe_size = SZ_256M; max_chunk_size = max_stripe_size; - if (!devs_max) - devs_max = BTRFS_MAX_DEVS(info); } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { max_stripe_size = SZ_32M; max_chunk_size = 2 * max_stripe_size; - if (!devs_max) - devs_max = BTRFS_MAX_DEVS_SYS_CHUNK; } else { btrfs_err(info, "invalid chunk type 0x%llx requested", type); @@ -5143,7 +5149,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, em->block_len = em->len; em->orig_block_len = stripe_size; - em_tree = &info->mapping_tree.map_tree; + em_tree = &info->mapping_tree; write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em, 0); if (ret) { @@ 
-5324,20 +5330,9 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans) static inline int btrfs_chunk_max_errors(struct map_lookup *map) { - int max_errors; + const int index = btrfs_bg_flags_to_raid_index(map->type); - if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID10 | - BTRFS_BLOCK_GROUP_RAID5 | - BTRFS_BLOCK_GROUP_DUP)) { - max_errors = 1; - } else if (map->type & BTRFS_BLOCK_GROUP_RAID6) { - max_errors = 2; - } else { - max_errors = 0; - } - - return max_errors; + return btrfs_raid_array[index].tolerated_failures; } int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset) @@ -5378,21 +5373,16 @@ end: return readonly; } -void btrfs_mapping_init(struct btrfs_mapping_tree *tree) -{ - extent_map_tree_init(&tree->map_tree); -} - -void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree) +void btrfs_mapping_tree_free(struct extent_map_tree *tree) { struct extent_map *em; while (1) { - write_lock(&tree->map_tree.lock); - em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1); + write_lock(&tree->lock); + em = lookup_extent_mapping(tree, 0, (u64)-1); if (em) - remove_extent_mapping(&tree->map_tree, em); - write_unlock(&tree->map_tree.lock); + remove_extent_mapping(tree, em); + write_unlock(&tree->lock); if (!em) break; /* once for us */ @@ -5419,7 +5409,7 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) return 1; map = em->map_lookup; - if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1)) + if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1_MASK)) ret = map->num_stripes; else if (map->type & BTRFS_BLOCK_GROUP_RAID10) ret = map->sub_stripes; @@ -5493,7 +5483,7 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info, struct btrfs_device *srcdev; ASSERT((map->type & - (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10))); + (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10))); if (map->type & BTRFS_BLOCK_GROUP_RAID10) num_stripes = map->sub_stripes; @@ -5682,7 +5672,7 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info, &remaining_stripes); div_u64_rem(stripe_nr_end - 1, factor, &last_stripe); last_stripe *= sub_stripes; - } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | + } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_DUP)) { num_stripes = map->num_stripes; } else { @@ -5926,6 +5916,102 @@ static bool need_full_stripe(enum btrfs_map_op op) return (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS); } +/* + * btrfs_get_io_geometry - calculates the geometry of a particular (address, len) + * tuple. This information is used to calculate how big a + * particular bio can get before it straddles a stripe. + * + * @fs_info - the filesystem + * @logical - address that we want to figure out the geometry of + * @len - the length of IO we are going to perform, starting at @logical + * @op - type of operation - write or read + * @io_geom - pointer used to return values + * + * Returns < 0 in case a chunk for the given logical address cannot be found, + * usually shouldn't happen unless @logical is corrupted, 0 otherwise. 
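Before the function body below, the arithmetic it performs is easiest to follow with concrete numbers: the stripe number is the chunk-relative offset divided by the stripe length, the in-stripe offset is the remainder, and for simple profiles the bio may extend to the end of that stripe. A self-contained sketch, assuming a 64 KiB stripe and a 1 MiB chunk start:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        const uint64_t stripe_len = 64 * 1024; /* assumed map->stripe_len */
        const uint64_t chunk_start = 1 << 20;  /* em->start */
        const uint64_t logical = (1 << 20) + 200 * 1024;

        uint64_t offset = logical - chunk_start;  /* 200 KiB into chunk */
        uint64_t stripe_nr = offset / stripe_len; /* stripe 3 */
        uint64_t stripe_offset = offset - stripe_nr * stripe_len; /* 8 KiB */
        uint64_t max_len = stripe_len - stripe_offset; /* bio limit: 56 KiB */

        printf("stripe_nr=%llu stripe_offset=%llu max_len=%llu\n",
               (unsigned long long)stripe_nr,
               (unsigned long long)stripe_offset,
               (unsigned long long)max_len);
        return 0;
    }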
+ */ +int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, + u64 logical, u64 len, struct btrfs_io_geometry *io_geom) +{ + struct extent_map *em; + struct map_lookup *map; + u64 offset; + u64 stripe_offset; + u64 stripe_nr; + u64 stripe_len; + u64 raid56_full_stripe_start = (u64)-1; + int data_stripes; + + ASSERT(op != BTRFS_MAP_DISCARD); + + em = btrfs_get_chunk_map(fs_info, logical, len); + if (IS_ERR(em)) + return PTR_ERR(em); + + map = em->map_lookup; + /* Offset of this logical address in the chunk */ + offset = logical - em->start; + /* Len of a stripe in a chunk */ + stripe_len = map->stripe_len; + /* Stripe where this block falls in */ + stripe_nr = div64_u64(offset, stripe_len); + /* Offset of stripe in the chunk */ + stripe_offset = stripe_nr * stripe_len; + if (offset < stripe_offset) { + btrfs_crit(fs_info, +"stripe math has gone wrong, stripe_offset=%llu offset=%llu start=%llu logical=%llu stripe_len=%llu", + stripe_offset, offset, em->start, logical, stripe_len); + free_extent_map(em); + return -EINVAL; + } + + /* stripe_offset is the offset of this block in its stripe */ + stripe_offset = offset - stripe_offset; + data_stripes = nr_data_stripes(map); + + if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { + u64 max_len = stripe_len - stripe_offset; + + /* + * In case of raid56, we need to know the stripe aligned start + */ + if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { + unsigned long full_stripe_len = stripe_len * data_stripes; + raid56_full_stripe_start = offset; + + /* + * Allow a write of a full stripe, but make sure we + * don't allow straddling of stripes + */ + raid56_full_stripe_start = div64_u64(raid56_full_stripe_start, + full_stripe_len); + raid56_full_stripe_start *= full_stripe_len; + + /* + * For writes to RAID[56], allow a full stripeset across + * all disks. For other RAID types and for RAID[56] + * reads, just allow a single stripe (on a single disk). 
+ */ + if (op == BTRFS_MAP_WRITE) { + max_len = stripe_len * data_stripes - + (offset - raid56_full_stripe_start); + } + } + len = min_t(u64, em->len - offset, max_len); + } else { + len = em->len - offset; + } + + io_geom->len = len; + io_geom->offset = offset; + io_geom->stripe_len = stripe_len; + io_geom->stripe_nr = stripe_nr; + io_geom->stripe_offset = stripe_offset; + io_geom->raid56_stripe_offset = raid56_full_stripe_start; + + return 0; +} + static int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, @@ -5939,6 +6025,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, u64 stripe_nr; u64 stripe_len; u32 stripe_index; + int data_stripes; int i; int ret = 0; int num_stripes; @@ -5951,76 +6038,29 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int patch_the_first_stripe_for_dev_replace = 0; u64 physical_to_patch_in_first_stripe = 0; u64 raid56_full_stripe_start = (u64)-1; + struct btrfs_io_geometry geom; + + ASSERT(bbio_ret); if (op == BTRFS_MAP_DISCARD) return __btrfs_map_block_for_discard(fs_info, logical, *length, bbio_ret); - em = btrfs_get_chunk_map(fs_info, logical, *length); - if (IS_ERR(em)) - return PTR_ERR(em); + ret = btrfs_get_io_geometry(fs_info, op, logical, *length, &geom); + if (ret < 0) + return ret; + em = btrfs_get_chunk_map(fs_info, logical, *length); + ASSERT(em); map = em->map_lookup; - offset = logical - em->start; - - stripe_len = map->stripe_len; - stripe_nr = offset; - /* - * stripe_nr counts the total number of stripes we have to stride - * to get to this block - */ - stripe_nr = div64_u64(stripe_nr, stripe_len); - - stripe_offset = stripe_nr * stripe_len; - if (offset < stripe_offset) { - btrfs_crit(fs_info, - "stripe math has gone wrong, stripe_offset=%llu, offset=%llu, start=%llu, logical=%llu, stripe_len=%llu", - stripe_offset, offset, em->start, logical, - stripe_len); - free_extent_map(em); - return -EINVAL; - } - - /* stripe_offset is the offset of this block in its stripe*/ - stripe_offset = offset - stripe_offset; - - /* if we're here for raid56, we need to know the stripe aligned start */ - if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { - unsigned long full_stripe_len = stripe_len * nr_data_stripes(map); - raid56_full_stripe_start = offset; - /* allow a write of a full stripe, but make sure we don't - * allow straddling of stripes - */ - raid56_full_stripe_start = div64_u64(raid56_full_stripe_start, - full_stripe_len); - raid56_full_stripe_start *= full_stripe_len; - } - - if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { - u64 max_len; - /* For writes to RAID[56], allow a full stripeset across all disks. - For other RAID types and for RAID[56] reads, just allow a single - stripe (on a single disk). 
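A quantity that recurs throughout this refactor is the number of data stripes in a full stripe, now derived from the raid table rather than open-coded flag tests: subtract the parity stripes if the profile has any, otherwise divide by the copy count. A table-driven sketch (the values mirror common profiles but are illustrative only, not the kernel's btrfs_raid_array):

    #include <stdio.h>

    struct raid_attr { const char *name; int ncopies; int nparity; };

    /* Illustrative subset of a raid attribute table. */
    static const struct raid_attr raids[] = {
        { "raid0", 1, 0 },
        { "raid1", 2, 0 },
        { "raid5", 1, 1 },
        { "raid6", 1, 2 },
    };

    static int calc_data_stripes(const struct raid_attr *r, int num_stripes)
    {
        if (r->nparity)
            return num_stripes - r->nparity;
        return num_stripes / r->ncopies;
    }

    int main(void)
    {
        for (unsigned i = 0; i < sizeof(raids) / sizeof(raids[0]); i++)
            printf("%-6s 6 stripes -> %d data stripes\n", raids[i].name,
                   calc_data_stripes(&raids[i], 6));
        return 0;
    }

With six stripes this prints 6, 3, 5 and 4 data stripes respectively, which is exactly what the old RAID0/RAID1/RAID5/RAID6 branches computed by hand.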
*/ - if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && - (op == BTRFS_MAP_WRITE)) { - max_len = stripe_len * nr_data_stripes(map) - - (offset - raid56_full_stripe_start); - } else { - /* we limit the length of each bio to what fits in a stripe */ - max_len = stripe_len - stripe_offset; - } - *length = min_t(u64, em->len - offset, max_len); - } else { - *length = em->len - offset; - } - - /* - * This is for when we're called from btrfs_bio_fits_in_stripe and all - * it cares about is the length - */ - if (!bbio_ret) - goto out; + *length = geom.len; + offset = geom.offset; + stripe_len = geom.stripe_len; + stripe_nr = geom.stripe_nr; + stripe_offset = geom.stripe_offset; + raid56_full_stripe_start = geom.raid56_stripe_offset; + data_stripes = nr_data_stripes(map); down_read(&dev_replace->rwsem); dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); @@ -6052,7 +6092,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, &stripe_index); if (!need_full_stripe(op)) mirror_num = 1; - } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { + } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) { if (need_full_stripe(op)) num_stripes = map->num_stripes; else if (mirror_num) @@ -6094,7 +6134,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) { /* push stripe_nr back to the start of the full stripe */ stripe_nr = div64_u64(raid56_full_stripe_start, - stripe_len * nr_data_stripes(map)); + stripe_len * data_stripes); /* RAID[56] write or recovery. Return all stripes */ num_stripes = map->num_stripes; @@ -6110,10 +6150,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, * Mirror #3 is RAID6 Q block. */ stripe_nr = div_u64_rem(stripe_nr, - nr_data_stripes(map), &stripe_index); + data_stripes, &stripe_index); if (mirror_num > 1) - stripe_index = nr_data_stripes(map) + - mirror_num - 2; + stripe_index = data_stripes + mirror_num - 2; /* We distribute the parity blocks across stripes */ div_u64_rem(stripe_nr + stripe_index, map->num_stripes, @@ -6171,8 +6210,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, div_u64_rem(stripe_nr, num_stripes, &rot); /* Fill in the logical address of each stripe */ - tmp = stripe_nr * nr_data_stripes(map); - for (i = 0; i < nr_data_stripes(map); i++) + tmp = stripe_nr * data_stripes; + for (i = 0; i < data_stripes; i++) bbio->raid_map[(i+rot) % num_stripes] = em->start + (tmp + i) * map->stripe_len; @@ -6687,7 +6726,7 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, struct btrfs_chunk *chunk) { struct btrfs_fs_info *fs_info = leaf->fs_info; - struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; + struct extent_map_tree *map_tree = &fs_info->mapping_tree; struct map_lookup *map; struct extent_map *em; u64 logical; @@ -6712,9 +6751,9 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, return ret; } - read_lock(&map_tree->map_tree.lock); - em = lookup_extent_mapping(&map_tree->map_tree, logical, 1); - read_unlock(&map_tree->map_tree.lock); + read_lock(&map_tree->lock); + em = lookup_extent_mapping(map_tree, logical, 1); + read_unlock(&map_tree->lock); /* already mapped? 
*/ if (em && em->start <= logical && em->start + em->len > logical) { @@ -6783,9 +6822,9 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, } - write_lock(&map_tree->map_tree.lock); - ret = add_extent_mapping(&map_tree->map_tree, em, 0); - write_unlock(&map_tree->map_tree.lock); + write_lock(&map_tree->lock); + ret = add_extent_mapping(map_tree, em, 0); + write_unlock(&map_tree->lock); if (ret < 0) { btrfs_err(fs_info, "failed to add chunk map, start=%llu len=%llu: %d", @@ -7103,14 +7142,14 @@ out_short_read: bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info, struct btrfs_device *failing_dev) { - struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; + struct extent_map_tree *map_tree = &fs_info->mapping_tree; struct extent_map *em; u64 next_start = 0; bool ret = true; - read_lock(&map_tree->map_tree.lock); - em = lookup_extent_mapping(&map_tree->map_tree, 0, (u64)-1); - read_unlock(&map_tree->map_tree.lock); + read_lock(&map_tree->lock); + em = lookup_extent_mapping(map_tree, 0, (u64)-1); + read_unlock(&map_tree->lock); /* No chunk at all? Return false anyway */ if (!em) { ret = false; @@ -7148,10 +7187,10 @@ bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info, next_start = extent_map_end(em); free_extent_map(em); - read_lock(&map_tree->map_tree.lock); - em = lookup_extent_mapping(&map_tree->map_tree, next_start, + read_lock(&map_tree->lock); + em = lookup_extent_mapping(map_tree, next_start, (u64)(-1) - next_start); - read_unlock(&map_tree->map_tree.lock); + read_unlock(&map_tree->lock); } out: return ret; @@ -7600,10 +7639,9 @@ void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info) */ int btrfs_bg_type_to_factor(u64 flags) { - if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID10)) - return 2; - return 1; + const int index = btrfs_bg_flags_to_raid_index(flags); + + return btrfs_raid_array[index].ncopies; } @@ -7612,7 +7650,7 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, u64 chunk_offset, u64 devid, u64 physical_offset, u64 physical_len) { - struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree; + struct extent_map_tree *em_tree = &fs_info->mapping_tree; struct extent_map *em; struct map_lookup *map; struct btrfs_device *dev; @@ -7701,7 +7739,7 @@ out: static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info) { - struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree; + struct extent_map_tree *em_tree = &fs_info->mapping_tree; struct extent_map *em; struct rb_node *node; int ret = 0; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 136a3eb64604..7f6aa1816409 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -23,6 +23,21 @@ struct btrfs_pending_bios { struct bio *tail; }; +struct btrfs_io_geometry { + /* remaining bytes before crossing a stripe */ + u64 len; + /* offset of logical address in chunk */ + u64 offset; + /* length of single IO stripe */ + u64 stripe_len; + /* number of stripe where address falls */ + u64 stripe_nr; + /* offset of address in stripe */ + u64 stripe_offset; + /* offset of raid56 stripe into the chunk */ + u64 raid56_stripe_offset; +}; + /* * Use sequence counter to get consistent device stat data on * 32-bit processors. 
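The sequence-counter comment above refers to the usual seqcount pattern: a reader retries whenever a writer bumped the sequence while it was copying. Roughly, as a userspace sketch of the idea (not the kernel's seqlock API, and ignoring the memory-ordering details the kernel primitives handle):

    #include <stdatomic.h>
    #include <stdint.h>
    #include <stdio.h>

    struct dev_stats {
        atomic_uint seq;   /* even = stable, odd = write in progress */
        uint64_t read_errs;
        uint64_t write_errs;
    };

    /* Reader: retry until the same even sequence is seen before and after. */
    static void stats_read(struct dev_stats *s, uint64_t *r, uint64_t *w)
    {
        unsigned start;
        do {
            start = atomic_load(&s->seq);
            *r = s->read_errs;
            *w = s->write_errs;
        } while ((start & 1) || atomic_load(&s->seq) != start);
    }

    /* Writer: make the sequence odd while updating, even again when done. */
    static void stats_add_read_err(struct dev_stats *s)
    {
        atomic_fetch_add(&s->seq, 1);
        s->read_errs++;
        atomic_fetch_add(&s->seq, 1);
    }

    int main(void)
    {
        struct dev_stats s = {0};
        uint64_t r, w;
        stats_add_read_err(&s);
        stats_read(&s, &r, &w);
        printf("read_errs=%llu write_errs=%llu\n",
               (unsigned long long)r, (unsigned long long)w);
        return 0;
    }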
@@ -43,8 +58,8 @@ struct btrfs_pending_bios { #define BTRFS_DEV_STATE_FLUSH_SENT (4) struct btrfs_device { - struct list_head dev_list; - struct list_head dev_alloc_list; + struct list_head dev_list; /* device_list_mutex */ + struct list_head dev_alloc_list; /* chunk mutex */ struct list_head post_commit_list; /* chunk mutex */ struct btrfs_fs_devices *fs_devices; struct btrfs_fs_info *fs_info; @@ -229,9 +244,14 @@ struct btrfs_fs_devices { * this mutex lock. */ struct mutex device_list_mutex; + + /* List of all devices, protected by device_list_mutex */ struct list_head devices; - /* devices not currently being allocated */ + /* + * Devices which can satisfy space allocation. Protected by + * chunk_mutex + */ struct list_head alloc_list; struct btrfs_fs_devices *seed; @@ -336,16 +356,16 @@ struct btrfs_device_info { }; struct btrfs_raid_attr { - int sub_stripes; /* sub_stripes info for map */ - int dev_stripes; /* stripes per dev */ - int devs_max; /* max devs to use */ - int devs_min; /* min devs needed */ - int tolerated_failures; /* max tolerated fail devs */ - int devs_increment; /* ndevs has to be a multiple of this */ - int ncopies; /* how many copies the data has */ - int nparity; /* number of stripes worth of bytes to store + u8 sub_stripes; /* sub_stripes info for map */ + u8 dev_stripes; /* stripes per dev */ + u8 devs_max; /* max devs to use */ + u8 devs_min; /* min devs needed */ + u8 tolerated_failures; /* max tolerated fail devs */ + u8 devs_increment; /* ndevs has to be a multiple of this */ + u8 ncopies; /* how many copies the data has */ + u8 nparity; /* number of stripes worth of bytes to store * parity information */ - int mindev_error; /* error code if min devs requisite is unmet */ + u8 mindev_error; /* error code if min devs requisite is unmet */ const char raid_name[8]; /* name of the raid */ u64 bg_flag; /* block group flag of the raid */ }; @@ -408,13 +428,14 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, struct btrfs_bio **bbio_ret); +int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, + u64 logical, u64 len, struct btrfs_io_geometry *io_geom); int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, u64 physical, u64 **logical, int *naddrs, int *stripe_len); int btrfs_read_sys_array(struct btrfs_fs_info *fs_info); int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info); int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type); -void btrfs_mapping_init(struct btrfs_mapping_tree *tree); -void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree); +void btrfs_mapping_tree_free(struct extent_map_tree *tree); blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror_num, int async_submit); int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, @@ -557,8 +578,6 @@ static inline enum btrfs_raid_types btrfs_bg_flags_to_raid_index(u64 flags) return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */ } -const char *get_raid_name(enum btrfs_raid_types type); - void btrfs_commit_device_sizes(struct btrfs_transaction *trans); struct list_head *btrfs_get_fs_uuids(void); @@ -568,6 +587,7 @@ bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info, struct btrfs_device *failing_dev); int btrfs_bg_type_to_factor(u64 flags); +const char *btrfs_bg_type_to_raid_name(u64 flags); int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info); #endif diff --git a/fs/buffer.c 
b/fs/buffer.c index 49a871570092..86a38b979323 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -3089,7 +3089,7 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, if (wbc) { wbc_init_bio(wbc, bio); - wbc_account_io(wbc, bh->b_page, bh->b_size); + wbc_account_cgroup_owner(wbc, bh->b_page, bh->b_size); } submit_bio(bio); diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig index 7f7d92d6b024..cf235f6eacf9 100644 --- a/fs/ceph/Kconfig +++ b/fs/ceph/Kconfig @@ -36,3 +36,15 @@ config CEPH_FS_POSIX_ACL groups beyond the owner/group/world scheme. If you don't know what Access Control Lists are, say N + +config CEPH_FS_SECURITY_LABEL + bool "CephFS Security Labels" + depends on CEPH_FS && SECURITY + help + Security labels support alternative access control models + implemented by security modules like SELinux. This option + enables an extended attribute handler for file security + labels in the Ceph filesystem. + + If you are not using a security module that requires using + extended attributes for file security labels, say N. diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c index 8a19c249036c..aa55f412a6e3 100644 --- a/fs/ceph/acl.c +++ b/fs/ceph/acl.c @@ -159,7 +159,7 @@ out: } int ceph_pre_init_acls(struct inode *dir, umode_t *mode, - struct ceph_acls_info *info) + struct ceph_acl_sec_ctx *as_ctx) { struct posix_acl *acl, *default_acl; size_t val_size1 = 0, val_size2 = 0; @@ -234,9 +234,9 @@ int ceph_pre_init_acls(struct inode *dir, umode_t *mode, kfree(tmp_buf); - info->acl = acl; - info->default_acl = default_acl; - info->pagelist = pagelist; + as_ctx->acl = acl; + as_ctx->default_acl = default_acl; + as_ctx->pagelist = pagelist; return 0; out_err: @@ -248,18 +248,10 @@ out_err: return err; } -void ceph_init_inode_acls(struct inode* inode, struct ceph_acls_info *info) +void ceph_init_inode_acls(struct inode *inode, struct ceph_acl_sec_ctx *as_ctx) { if (!inode) return; - ceph_set_cached_acl(inode, ACL_TYPE_ACCESS, info->acl); - ceph_set_cached_acl(inode, ACL_TYPE_DEFAULT, info->default_acl); -} - -void ceph_release_acls_info(struct ceph_acls_info *info) -{ - posix_acl_release(info->acl); - posix_acl_release(info->default_acl); - if (info->pagelist) - ceph_pagelist_release(info->pagelist); + ceph_set_cached_acl(inode, ACL_TYPE_ACCESS, as_ctx->acl); + ceph_set_cached_acl(inode, ACL_TYPE_DEFAULT, as_ctx->default_acl); } diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index a47c541f8006..e078cc55b989 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -10,6 +10,7 @@ #include <linux/pagevec.h> #include <linux/task_io_accounting_ops.h> #include <linux/signal.h> +#include <linux/iversion.h> #include "super.h" #include "mds_client.h" @@ -1576,6 +1577,7 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf) /* Update time before taking page lock */ file_update_time(vma->vm_file); + inode_inc_iversion_raw(inode); do { lock_page(page); diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 0176241eaea7..d98dcd976c80 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -8,6 +8,7 @@ #include <linux/vmalloc.h> #include <linux/wait.h> #include <linux/writeback.h> +#include <linux/iversion.h> #include "super.h" #include "mds_client.h" @@ -1138,8 +1139,9 @@ struct cap_msg_args { u64 ino, cid, follows; u64 flush_tid, oldest_flush_tid, size, max_size; u64 xattr_version; + u64 change_attr; struct ceph_buffer *xattr_buf; - struct timespec64 atime, mtime, ctime; + struct timespec64 atime, mtime, ctime, btime; int op, caps, wanted, dirty; u32 seq, issue_seq, mseq, time_warp_seq; u32 flags; @@ -1160,7 +1162,6 
@@ static int send_cap_msg(struct cap_msg_args *arg) struct ceph_msg *msg; void *p; size_t extra_len; - struct timespec64 zerotime = {0}; struct ceph_osd_client *osdc = &arg->session->s_mdsc->fsc->client->osdc; dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s" @@ -1245,15 +1246,10 @@ static int send_cap_msg(struct cap_msg_args *arg) /* pool namespace (version 8) (mds always ignores this) */ ceph_encode_32(&p, 0); - /* - * btime and change_attr (version 9) - * - * We just zero these out for now, as the MDS ignores them unless - * the requisite feature flags are set (which we don't do yet). - */ - ceph_encode_timespec64(p, &zerotime); + /* btime and change_attr (version 9) */ + ceph_encode_timespec64(p, &arg->btime); p += sizeof(struct ceph_timespec); - ceph_encode_64(&p, 0); + ceph_encode_64(&p, arg->change_attr); /* Advisory flags (version 10) */ ceph_encode_32(&p, arg->flags); @@ -1263,20 +1259,22 @@ static int send_cap_msg(struct cap_msg_args *arg) } /* - * Queue cap releases when an inode is dropped from our cache. Since - * inode is about to be destroyed, there is no need for i_ceph_lock. + * Queue cap releases when an inode is dropped from our cache. */ -void __ceph_remove_caps(struct inode *inode) +void __ceph_remove_caps(struct ceph_inode_info *ci) { - struct ceph_inode_info *ci = ceph_inode(inode); struct rb_node *p; + /* lock i_ceph_lock, because ceph_d_revalidate(..., LOOKUP_RCU) + * may call __ceph_caps_issued_mask() on a freeing inode. */ + spin_lock(&ci->i_ceph_lock); p = rb_first(&ci->i_caps); while (p) { struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node); p = rb_next(p); __ceph_remove_cap(cap, true); } + spin_unlock(&ci->i_ceph_lock); } /* @@ -1297,7 +1295,7 @@ void __ceph_remove_caps(struct inode *inode) * caller should hold snap_rwsem (read), s_mutex. */ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, - int op, bool sync, int used, int want, int retain, + int op, int flags, int used, int want, int retain, int flushing, u64 flush_tid, u64 oldest_flush_tid) __releases(cap->ci->i_ceph_lock) { @@ -1377,6 +1375,8 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, arg.mtime = inode->i_mtime; arg.atime = inode->i_atime; arg.ctime = inode->i_ctime; + arg.btime = ci->i_btime; + arg.change_attr = inode_peek_iversion_raw(inode); arg.op = op; arg.caps = cap->implemented; @@ -1393,12 +1393,19 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, arg.mode = inode->i_mode; arg.inline_data = ci->i_inline_version != CEPH_INLINE_NONE; - if (list_empty(&ci->i_cap_snaps)) - arg.flags = CEPH_CLIENT_CAPS_NO_CAPSNAP; - else - arg.flags = CEPH_CLIENT_CAPS_PENDING_CAPSNAP; - if (sync) - arg.flags |= CEPH_CLIENT_CAPS_SYNC; + if (!(flags & CEPH_CLIENT_CAPS_PENDING_CAPSNAP) && + !list_empty(&ci->i_cap_snaps)) { + struct ceph_cap_snap *capsnap; + list_for_each_entry_reverse(capsnap, &ci->i_cap_snaps, ci_item) { + if (capsnap->cap_flush.tid) + break; + if (capsnap->need_flush) { + flags |= CEPH_CLIENT_CAPS_PENDING_CAPSNAP; + break; + } + } + } + arg.flags = flags; spin_unlock(&ci->i_ceph_lock); @@ -1436,6 +1443,8 @@ static inline int __send_flush_snap(struct inode *inode, arg.atime = capsnap->atime; arg.mtime = capsnap->mtime; arg.ctime = capsnap->ctime; + arg.btime = capsnap->btime; + arg.change_attr = capsnap->change_attr; arg.op = CEPH_CAP_OP_FLUSHSNAP; arg.caps = capsnap->issued; @@ -1603,10 +1612,8 @@ retry: } // make sure flushsnap messages are sent in proper order. 
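The list_for_each_entry_reverse() walk added to __send_cap() above answers one question: does some cap snapshot still need flushing without yet having been assigned a flush tid? Scanning newest to oldest lets it stop at the first snapshot already in flight. A hypothetical model of that test (struct capsnap here is a stand-in, not the ceph type):

    #include <stdbool.h>
    #include <stdio.h>

    struct capsnap { unsigned long long flush_tid; bool need_flush; };

    /* Walk snapshots newest-first; a snapshot with a flush tid (and
     * everything older than it) is already being flushed, so only newer
     * entries can still be pending. */
    static bool pending_capsnap(const struct capsnap *snaps, int n)
    {
        for (int i = n - 1; i >= 0; i--) {
            if (snaps[i].flush_tid)
                return false; /* newest unflushed work already queued */
            if (snaps[i].need_flush)
                return true;
        }
        return false;
    }

    int main(void)
    {
        struct capsnap snaps[] = {
            { .flush_tid = 7, .need_flush = false }, /* oldest, in flight */
            { .flush_tid = 0, .need_flush = true },  /* newest, pending */
        };
        printf("pending: %d\n", pending_capsnap(snaps, 2)); /* 1 */
        return 0;
    }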
- if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) { + if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) __kick_flushing_caps(mdsc, session, ci, 0); - ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH; - } __ceph_flush_snaps(ci, session); out: @@ -2048,10 +2055,8 @@ ack: if (cap == ci->i_auth_cap && (ci->i_ceph_flags & (CEPH_I_KICK_FLUSH | CEPH_I_FLUSH_SNAPS))) { - if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) { + if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) __kick_flushing_caps(mdsc, session, ci, 0); - ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH; - } if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS) __ceph_flush_snaps(ci, session); @@ -2087,7 +2092,7 @@ ack: sent++; /* __send_cap drops i_ceph_lock */ - delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, false, + delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, 0, cap_used, want, retain, flushing, flush_tid, oldest_flush_tid); goto retry; /* retake i_ceph_lock and restart our cap scan. */ @@ -2121,6 +2126,7 @@ static int try_flush_caps(struct inode *inode, u64 *ptid) retry: spin_lock(&ci->i_ceph_lock); +retry_locked: if (ci->i_ceph_flags & CEPH_I_NOFLUSH) { spin_unlock(&ci->i_ceph_lock); dout("try_flush_caps skipping %p I_NOFLUSH set\n", inode); @@ -2128,8 +2134,6 @@ retry: } if (ci->i_dirty_caps && ci->i_auth_cap) { struct ceph_cap *cap = ci->i_auth_cap; - int used = __ceph_caps_used(ci); - int want = __ceph_caps_wanted(ci); int delayed; if (!session || session != cap->session) { @@ -2145,13 +2149,25 @@ retry: goto out; } + if (ci->i_ceph_flags & + (CEPH_I_KICK_FLUSH | CEPH_I_FLUSH_SNAPS)) { + if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) + __kick_flushing_caps(mdsc, session, ci, 0); + if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS) + __ceph_flush_snaps(ci, session); + goto retry_locked; + } + flushing = __mark_caps_flushing(inode, session, true, &flush_tid, &oldest_flush_tid); /* __send_cap drops i_ceph_lock */ - delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, true, - used, want, (cap->issued | cap->implemented), - flushing, flush_tid, oldest_flush_tid); + delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, + CEPH_CLIENT_CAPS_SYNC, + __ceph_caps_used(ci), + __ceph_caps_wanted(ci), + (cap->issued | cap->implemented), + flushing, flush_tid, oldest_flush_tid); if (delayed) { spin_lock(&ci->i_ceph_lock); @@ -2320,6 +2336,16 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc, struct ceph_cap_flush *cf; int ret; u64 first_tid = 0; + u64 last_snap_flush = 0; + + ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH; + + list_for_each_entry_reverse(cf, &ci->i_cap_flush_list, i_list) { + if (!cf->caps) { + last_snap_flush = cf->tid; + break; + } + } list_for_each_entry(cf, &ci->i_cap_flush_list, i_list) { if (cf->tid < first_tid) @@ -2338,10 +2364,13 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc, dout("kick_flushing_caps %p cap %p tid %llu %s\n", inode, cap, cf->tid, ceph_cap_string(cf->caps)); ci->i_ceph_flags |= CEPH_I_NODELAY; + ret = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, - false, __ceph_caps_used(ci), + (cf->tid < last_snap_flush ? + CEPH_CLIENT_CAPS_PENDING_CAPSNAP : 0), + __ceph_caps_used(ci), __ceph_caps_wanted(ci), - cap->issued | cap->implemented, + (cap->issued | cap->implemented), cf->caps, cf->tid, oldest_flush_tid); if (ret) { pr_err("kick_flushing_caps: error sending " @@ -2410,7 +2439,6 @@ void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc, */ if ((cap->issued & ci->i_flushing_caps) != ci->i_flushing_caps) { - ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH; /* encode_caps_cb() also will reset these sequence * numbers. 
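The __kick_flushing_caps() change above first records last_snap_flush, the tid of the most recent snap flush (modeled as a cap flush with no cap bits), then marks every cap flush older than it with CEPH_CLIENT_CAPS_PENDING_CAPSNAP. A sketch of that selection, with a hypothetical mirror of struct ceph_cap_flush:

    #include <stdio.h>

    /* Hypothetical mirror for illustration: caps == 0 models a snap flush. */
    struct cap_flush { unsigned long long tid; int caps; };

    /* tid of the newest snap flush in the list, 0 if none (newest-first). */
    static unsigned long long last_snap_flush(const struct cap_flush *l, int n)
    {
        for (int i = n - 1; i >= 0; i--)
            if (!l[i].caps)
                return l[i].tid;
        return 0;
    }

    int main(void)
    {
        struct cap_flush list[] = { {1, 0}, {2, 0x5}, {3, 0}, {4, 0x5} };
        unsigned long long snap = last_snap_flush(list, 4);
        for (int i = 0; i < 4; i++)
            if (list[i].caps)
                printf("tid %llu: %s\n", list[i].tid,
                       list[i].tid < snap ? "PENDING_CAPSNAP" : "no flag");
        return 0;
    }

Here tid 2 predates the newest snap flush (tid 3) and gets the pending flag, while tid 4 does not, matching the cf->tid < last_snap_flush test in the hunk.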
make sure sequence numbers in cap flush * message match later reconnect message */ @@ -2450,7 +2478,6 @@ void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, continue; } if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) { - ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH; __kick_flushing_caps(mdsc, session, ci, oldest_flush_tid); } @@ -2478,7 +2505,6 @@ static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc, oldest_flush_tid = __get_oldest_flush_tid(mdsc); spin_unlock(&mdsc->cap_dirty_lock); - ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH; __kick_flushing_caps(mdsc, session, ci, oldest_flush_tid); spin_unlock(&ci->i_ceph_lock); } else { @@ -3040,8 +3066,10 @@ struct cap_extra_info { bool dirstat_valid; u64 nfiles; u64 nsubdirs; + u64 change_attr; /* currently issued */ int issued; + struct timespec64 btime; }; /* @@ -3123,11 +3151,14 @@ static void handle_cap_grant(struct inode *inode, __check_cap_issue(ci, cap, newcaps); + inode_set_max_iversion_raw(inode, extra_info->change_attr); + if ((newcaps & CEPH_CAP_AUTH_SHARED) && (extra_info->issued & CEPH_CAP_AUTH_EXCL) == 0) { inode->i_mode = le32_to_cpu(grant->mode); inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(grant->uid)); inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(grant->gid)); + ci->i_btime = extra_info->btime; dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode, from_kuid(&init_user_ns, inode->i_uid), from_kgid(&init_user_ns, inode->i_gid)); @@ -3154,6 +3185,7 @@ static void handle_cap_grant(struct inode *inode, ci->i_xattrs.blob = ceph_buffer_get(xattr_buf); ci->i_xattrs.version = version; ceph_forget_all_cached_acls(inode); + ceph_security_invalidate_secctx(inode); } } @@ -3848,17 +3880,19 @@ void ceph_handle_caps(struct ceph_mds_session *session, } } - if (msg_version >= 11) { + if (msg_version >= 9) { struct ceph_timespec *btime; - u64 change_attr; - u32 flags; - /* version >= 9 */ if (p + sizeof(*btime) > end) goto bad; btime = p; + ceph_decode_timespec64(&extra_info.btime, btime); p += sizeof(*btime); - ceph_decode_64_safe(&p, end, change_attr, bad); + ceph_decode_64_safe(&p, end, extra_info.change_attr, bad); + } + + if (msg_version >= 11) { + u32 flags; /* version >= 10 */ ceph_decode_32_safe(&p, end, flags, bad); /* version >= 11 */ diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 83cd41fa2b01..2eb88ed22993 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -52,7 +52,7 @@ static int mdsc_show(struct seq_file *s, void *p) struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_request *req; struct rb_node *rp; - int pathlen; + int pathlen = 0; u64 pathbase; char *path; diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 0637149fb9f9..4ca0b8ff9a72 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -825,7 +825,7 @@ static int ceph_mknod(struct inode *dir, struct dentry *dentry, struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_request *req; - struct ceph_acls_info acls = {}; + struct ceph_acl_sec_ctx as_ctx = {}; int err; if (ceph_snap(dir) != CEPH_NOSNAP) @@ -836,7 +836,10 @@ static int ceph_mknod(struct inode *dir, struct dentry *dentry, goto out; } - err = ceph_pre_init_acls(dir, &mode, &acls); + err = ceph_pre_init_acls(dir, &mode, &as_ctx); + if (err < 0) + goto out; + err = ceph_security_init_secctx(dentry, mode, &as_ctx); if (err < 0) goto out; @@ -855,9 +858,9 @@ static int ceph_mknod(struct inode *dir, struct dentry *dentry, req->r_args.mknod.rdev = cpu_to_le32(rdev); req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL; 
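/*
 * Editor's note: ceph_mknod(), ceph_mkdir() and ceph_atomic_open() all
 * gain the same two-step setup in this series (ceph_symlink() only
 * needs the second step): POSIX ACLs first, then the security label,
 * both accumulated into a single ceph_acl_sec_ctx whose pagelist is
 * handed to the MDS via req->r_pagelist.  A condensed sketch of the
 * shared pattern (hypothetical helper; the real callers inline this):
 */
static int ceph_prepare_as_ctx(struct inode *dir, struct dentry *dentry,
			       umode_t *mode, struct ceph_acl_sec_ctx *as_ctx)
{
	int err;

	err = ceph_pre_init_acls(dir, mode, as_ctx);
	if (err < 0)
		return err;
	/* appends a security.* KV pair to the same pagelist when enabled */
	return ceph_security_init_secctx(dentry, *mode, as_ctx);
}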
req->r_dentry_unless = CEPH_CAP_FILE_EXCL; - if (acls.pagelist) { - req->r_pagelist = acls.pagelist; - acls.pagelist = NULL; + if (as_ctx.pagelist) { + req->r_pagelist = as_ctx.pagelist; + as_ctx.pagelist = NULL; } err = ceph_mdsc_do_request(mdsc, dir, req); if (!err && !req->r_reply_info.head->is_dentry) @@ -865,10 +868,10 @@ static int ceph_mknod(struct inode *dir, struct dentry *dentry, ceph_mdsc_put_request(req); out: if (!err) - ceph_init_inode_acls(d_inode(dentry), &acls); + ceph_init_inode_acls(d_inode(dentry), &as_ctx); else d_drop(dentry); - ceph_release_acls_info(&acls); + ceph_release_acl_sec_ctx(&as_ctx); return err; } @@ -884,6 +887,7 @@ static int ceph_symlink(struct inode *dir, struct dentry *dentry, struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_request *req; + struct ceph_acl_sec_ctx as_ctx = {}; int err; if (ceph_snap(dir) != CEPH_NOSNAP) @@ -894,6 +898,10 @@ static int ceph_symlink(struct inode *dir, struct dentry *dentry, goto out; } + err = ceph_security_init_secctx(dentry, S_IFLNK | 0777, &as_ctx); + if (err < 0) + goto out; + dout("symlink in dir %p dentry %p to '%s'\n", dir, dentry, dest); req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SYMLINK, USE_AUTH_MDS); if (IS_ERR(req)) { @@ -919,6 +927,7 @@ static int ceph_symlink(struct inode *dir, struct dentry *dentry, out: if (err) d_drop(dentry); + ceph_release_acl_sec_ctx(&as_ctx); return err; } @@ -927,7 +936,7 @@ static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_request *req; - struct ceph_acls_info acls = {}; + struct ceph_acl_sec_ctx as_ctx = {}; int err = -EROFS; int op; @@ -950,7 +959,10 @@ static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) } mode |= S_IFDIR; - err = ceph_pre_init_acls(dir, &mode, &acls); + err = ceph_pre_init_acls(dir, &mode, &as_ctx); + if (err < 0) + goto out; + err = ceph_security_init_secctx(dentry, mode, &as_ctx); if (err < 0) goto out; @@ -967,9 +979,9 @@ static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) req->r_args.mkdir.mode = cpu_to_le32(mode); req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL; req->r_dentry_unless = CEPH_CAP_FILE_EXCL; - if (acls.pagelist) { - req->r_pagelist = acls.pagelist; - acls.pagelist = NULL; + if (as_ctx.pagelist) { + req->r_pagelist = as_ctx.pagelist; + as_ctx.pagelist = NULL; } err = ceph_mdsc_do_request(mdsc, dir, req); if (!err && @@ -979,10 +991,10 @@ static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) ceph_mdsc_put_request(req); out: if (!err) - ceph_init_inode_acls(d_inode(dentry), &acls); + ceph_init_inode_acls(d_inode(dentry), &as_ctx); else d_drop(dentry); - ceph_release_acls_info(&acls); + ceph_release_acl_sec_ctx(&as_ctx); return err; } @@ -1255,7 +1267,7 @@ __dentry_leases_walk(struct ceph_mds_client *mdsc, if (!spin_trylock(&dentry->d_lock)) continue; - if (dentry->d_lockref.count < 0) { + if (__lockref_is_dead(&dentry->d_lockref)) { list_del_init(&di->lease_list); goto next; } @@ -1433,8 +1445,7 @@ static bool __dentry_lease_is_valid(struct ceph_dentry_info *di) return false; } -static int dentry_lease_is_valid(struct dentry *dentry, unsigned int flags, - struct inode *dir) +static int dentry_lease_is_valid(struct dentry *dentry, unsigned int flags) { struct ceph_dentry_info *di; struct ceph_mds_session *session = NULL; @@ -1466,7 +1477,7 @@ 
static int dentry_lease_is_valid(struct dentry *dentry, unsigned int flags, spin_unlock(&dentry->d_lock); if (session) { - ceph_mdsc_lease_send_msg(session, dir, dentry, + ceph_mdsc_lease_send_msg(session, dentry, CEPH_MDS_LEASE_RENEW, seq); ceph_put_mds_session(session); } @@ -1512,18 +1523,26 @@ static int __dir_lease_try_check(const struct dentry *dentry) static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry) { struct ceph_inode_info *ci = ceph_inode(dir); - struct ceph_dentry_info *di = ceph_dentry(dentry); - int valid = 0; + int valid; + int shared_gen; spin_lock(&ci->i_ceph_lock); - if (atomic_read(&ci->i_shared_gen) == di->lease_shared_gen) - valid = __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1); + valid = __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1); + shared_gen = atomic_read(&ci->i_shared_gen); spin_unlock(&ci->i_ceph_lock); - if (valid) - __ceph_dentry_dir_lease_touch(di); - dout("dir_lease_is_valid dir %p v%u dentry %p v%u = %d\n", - dir, (unsigned)atomic_read(&ci->i_shared_gen), - dentry, (unsigned)di->lease_shared_gen, valid); + if (valid) { + struct ceph_dentry_info *di; + spin_lock(&dentry->d_lock); + di = ceph_dentry(dentry); + if (dir == d_inode(dentry->d_parent) && + di && di->lease_shared_gen == shared_gen) + __ceph_dentry_dir_lease_touch(di); + else + valid = 0; + spin_unlock(&dentry->d_lock); + } + dout("dir_lease_is_valid dir %p v%u dentry %p = %d\n", + dir, (unsigned)atomic_read(&ci->i_shared_gen), dentry, valid); return valid; } @@ -1558,7 +1577,7 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags) ceph_snap(d_inode(dentry)) == CEPH_SNAPDIR) { valid = 1; } else { - valid = dentry_lease_is_valid(dentry, flags, dir); + valid = dentry_lease_is_valid(dentry, flags); if (valid == -ECHILD) return valid; if (valid || dir_lease_is_valid(dir, dentry)) { diff --git a/fs/ceph/export.c b/fs/ceph/export.c index d3ef7ee429ec..15ff1b09cfa2 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -368,7 +368,7 @@ static struct dentry *ceph_get_parent(struct dentry *child) } out: dout("get_parent %p ino %llx.%llx err=%ld\n", - child, ceph_vinop(inode), (IS_ERR(dn) ? 
PTR_ERR(dn) : 0)); + child, ceph_vinop(inode), (long)PTR_ERR_OR_ZERO(dn)); return dn; } diff --git a/fs/ceph/file.c b/fs/ceph/file.c index c5517ffeb11c..685a03cc4b77 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -10,6 +10,7 @@ #include <linux/namei.h> #include <linux/writeback.h> #include <linux/falloc.h> +#include <linux/iversion.h> #include "super.h" #include "mds_client.h" @@ -437,7 +438,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_request *req; struct dentry *dn; - struct ceph_acls_info acls = {}; + struct ceph_acl_sec_ctx as_ctx = {}; int mask; int err; @@ -451,25 +452,28 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, if (flags & O_CREAT) { if (ceph_quota_is_max_files_exceeded(dir)) return -EDQUOT; - err = ceph_pre_init_acls(dir, &mode, &acls); + err = ceph_pre_init_acls(dir, &mode, &as_ctx); if (err < 0) return err; + err = ceph_security_init_secctx(dentry, mode, &as_ctx); + if (err < 0) + goto out_ctx; } /* do the open */ req = prepare_open_request(dir->i_sb, flags, mode); if (IS_ERR(req)) { err = PTR_ERR(req); - goto out_acl; + goto out_ctx; } req->r_dentry = dget(dentry); req->r_num_caps = 2; if (flags & O_CREAT) { req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL; req->r_dentry_unless = CEPH_CAP_FILE_EXCL; - if (acls.pagelist) { - req->r_pagelist = acls.pagelist; - acls.pagelist = NULL; + if (as_ctx.pagelist) { + req->r_pagelist = as_ctx.pagelist; + as_ctx.pagelist = NULL; } } @@ -507,7 +511,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, } else { dout("atomic_open finish_open on dn %p\n", dn); if (req->r_op == CEPH_MDS_OP_CREATE && req->r_reply_info.has_create_ino) { - ceph_init_inode_acls(d_inode(dentry), &acls); + ceph_init_inode_acls(d_inode(dentry), &as_ctx); file->f_mode |= FMODE_CREATED; } err = finish_open(file, dentry, ceph_open); @@ -516,8 +520,8 @@ out_req: if (!req->r_err && req->r_target_inode) ceph_put_fmode(ceph_inode(req->r_target_inode), req->r_fmode); ceph_mdsc_put_request(req); -out_acl: - ceph_release_acls_info(&acls); +out_ctx: + ceph_release_acl_sec_ctx(&as_ctx); dout("atomic_open result=%d\n", err); return err; } @@ -1007,7 +1011,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, * may block. 
*/ truncate_inode_pages_range(inode->i_mapping, pos, - (pos+len) | (PAGE_SIZE - 1)); + PAGE_ALIGN(pos + len) - 1); req->r_mtime = mtime; } @@ -1022,7 +1026,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, req->r_callback = ceph_aio_complete_req; req->r_inode = inode; req->r_priv = aio_req; - list_add_tail(&req->r_unsafe_item, &aio_req->osd_reqs); + list_add_tail(&req->r_private_item, &aio_req->osd_reqs); pos += len; continue; @@ -1082,8 +1086,8 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, while (!list_empty(&osd_reqs)) { req = list_first_entry(&osd_reqs, struct ceph_osd_request, - r_unsafe_item); - list_del_init(&req->r_unsafe_item); + r_private_item); + list_del_init(&req->r_private_item); if (ret >= 0) ret = ceph_osdc_start_request(req->r_osdc, req, false); @@ -1432,6 +1436,8 @@ retry_snap: if (err) goto out; + inode_inc_iversion_raw(inode); + if (ci->i_inline_version != CEPH_INLINE_NONE) { err = ceph_uninline_data(file, NULL); if (err < 0) @@ -2063,6 +2069,8 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off, do_final_copy = true; file_update_time(dst_file); + inode_inc_iversion_raw(dst_inode); + if (endoff > size) { int caps_flags = 0; diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 761451f36e2d..791f84a13bb8 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -13,6 +13,7 @@ #include <linux/posix_acl.h> #include <linux/random.h> #include <linux/sort.h> +#include <linux/iversion.h> #include "super.h" #include "mds_client.h" @@ -42,6 +43,7 @@ static int ceph_set_ino_cb(struct inode *inode, void *data) { ceph_inode(inode)->i_vino = *(struct ceph_vino *)data; inode->i_ino = ceph_vino_to_ino(*(struct ceph_vino *)data); + inode_set_iversion_raw(inode, 0); return 0; } @@ -509,6 +511,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb) INIT_WORK(&ci->i_work, ceph_inode_work); ci->i_work_mask = 0; + memset(&ci->i_btime, '\0', sizeof(ci->i_btime)); ceph_fscache_inode_init(ci); @@ -523,17 +526,20 @@ void ceph_free_inode(struct inode *inode) kmem_cache_free(ceph_inode_cachep, ci); } -void ceph_destroy_inode(struct inode *inode) +void ceph_evict_inode(struct inode *inode) { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_inode_frag *frag; struct rb_node *n; - dout("destroy_inode %p ino %llx.%llx\n", inode, ceph_vinop(inode)); + dout("evict_inode %p ino %llx.%llx\n", inode, ceph_vinop(inode)); + + truncate_inode_pages_final(&inode->i_data); + clear_inode(inode); ceph_fscache_unregister_inode_cookie(ci); - __ceph_remove_caps(inode); + __ceph_remove_caps(ci); if (__ceph_has_any_quota(ci)) ceph_adjust_quota_realms_count(inode, false); @@ -578,16 +584,6 @@ void ceph_destroy_inode(struct inode *inode) ceph_put_string(rcu_dereference_raw(ci->i_layout.pool_ns)); } -int ceph_drop_inode(struct inode *inode) -{ - /* - * Positve dentry and corresponding inode are always accompanied - * in MDS reply. So no need to keep inode in the cache after - * dropping all its aliases. 
- */ - return 1; -} - static inline blkcnt_t calc_inode_blocks(u64 size) { return (size + (1<<9) - 1) >> 9; @@ -795,6 +791,9 @@ static int fill_inode(struct inode *inode, struct page *locked_page, le64_to_cpu(info->version) > (ci->i_version & ~1))) new_version = true; + /* Update change_attribute */ + inode_set_max_iversion_raw(inode, iinfo->change_attr); + __ceph_caps_issued(ci, &issued); issued |= __ceph_caps_dirty(ci); new_issued = ~issued & info_caps; @@ -813,6 +812,8 @@ static int fill_inode(struct inode *inode, struct page *locked_page, dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode, from_kuid(&init_user_ns, inode->i_uid), from_kgid(&init_user_ns, inode->i_gid)); + ceph_decode_timespec64(&ci->i_btime, &iinfo->btime); + ceph_decode_timespec64(&ci->i_snap_btime, &iinfo->snap_btime); } if ((new_version || (new_issued & CEPH_CAP_LINK_SHARED)) && @@ -887,6 +888,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page, iinfo->xattr_data, iinfo->xattr_len); ci->i_xattrs.version = le64_to_cpu(info->xattr_version); ceph_forget_all_cached_acls(inode); + ceph_security_invalidate_secctx(inode); xattr_blob = NULL; } @@ -1027,59 +1029,38 @@ out: } /* - * caller should hold session s_mutex. + * caller should hold session s_mutex and dentry->d_lock. */ -static void update_dentry_lease(struct dentry *dentry, - struct ceph_mds_reply_lease *lease, - struct ceph_mds_session *session, - unsigned long from_time, - struct ceph_vino *tgt_vino, - struct ceph_vino *dir_vino) +static void __update_dentry_lease(struct inode *dir, struct dentry *dentry, + struct ceph_mds_reply_lease *lease, + struct ceph_mds_session *session, + unsigned long from_time, + struct ceph_mds_session **old_lease_session) { struct ceph_dentry_info *di = ceph_dentry(dentry); long unsigned duration = le32_to_cpu(lease->duration_ms); long unsigned ttl = from_time + (duration * HZ) / 1000; long unsigned half_ttl = from_time + (duration * HZ / 2) / 1000; - struct inode *dir; - struct ceph_mds_session *old_lease_session = NULL; - /* - * Make sure dentry's inode matches tgt_vino. NULL tgt_vino means that - * we expect a negative dentry. - */ - if (!tgt_vino && d_really_is_positive(dentry)) - return; - - if (tgt_vino && (d_really_is_negative(dentry) || - !ceph_ino_compare(d_inode(dentry), tgt_vino))) - return; - - spin_lock(&dentry->d_lock); dout("update_dentry_lease %p duration %lu ms ttl %lu\n", dentry, duration, ttl); - dir = d_inode(dentry->d_parent); - - /* make sure parent matches dir_vino */ - if (!ceph_ino_compare(dir, dir_vino)) - goto out_unlock; - /* only track leases on regular dentries */ if (ceph_snap(dir) != CEPH_NOSNAP) - goto out_unlock; + return; di->lease_shared_gen = atomic_read(&ceph_inode(dir)->i_shared_gen); if (duration == 0) { __ceph_dentry_dir_lease_touch(di); - goto out_unlock; + return; } if (di->lease_gen == session->s_cap_gen && time_before(ttl, di->time)) - goto out_unlock; /* we already have a newer lease. */ + return; /* we already have a newer lease. 
*/ if (di->lease_session && di->lease_session != session) { - old_lease_session = di->lease_session; + *old_lease_session = di->lease_session; di->lease_session = NULL; } @@ -1092,6 +1073,62 @@ static void update_dentry_lease(struct dentry *dentry, di->time = ttl; __ceph_dentry_lease_touch(di); +} + +static inline void update_dentry_lease(struct inode *dir, struct dentry *dentry, + struct ceph_mds_reply_lease *lease, + struct ceph_mds_session *session, + unsigned long from_time) +{ + struct ceph_mds_session *old_lease_session = NULL; + spin_lock(&dentry->d_lock); + __update_dentry_lease(dir, dentry, lease, session, from_time, + &old_lease_session); + spin_unlock(&dentry->d_lock); + if (old_lease_session) + ceph_put_mds_session(old_lease_session); +} + +/* + * update dentry lease without having parent inode locked + */ +static void update_dentry_lease_careful(struct dentry *dentry, + struct ceph_mds_reply_lease *lease, + struct ceph_mds_session *session, + unsigned long from_time, + char *dname, u32 dname_len, + struct ceph_vino *pdvino, + struct ceph_vino *ptvino) + +{ + struct inode *dir; + struct ceph_mds_session *old_lease_session = NULL; + + spin_lock(&dentry->d_lock); + /* make sure dentry's name matches target */ + if (dentry->d_name.len != dname_len || + memcmp(dentry->d_name.name, dname, dname_len)) + goto out_unlock; + + dir = d_inode(dentry->d_parent); + /* make sure parent matches dvino */ + if (!ceph_ino_compare(dir, pdvino)) + goto out_unlock; + + /* make sure dentry's inode matches target. NULL ptvino means that + * we expect a negative dentry */ + if (ptvino) { + if (d_really_is_negative(dentry)) + goto out_unlock; + if (!ceph_ino_compare(d_inode(dentry), ptvino)) + goto out_unlock; + } else { + if (d_really_is_positive(dentry)) + goto out_unlock; + } + + __update_dentry_lease(dir, dentry, lease, session, + from_time, &old_lease_session); out_unlock: spin_unlock(&dentry->d_lock); if (old_lease_session) @@ -1156,19 +1193,6 @@ static int splice_dentry(struct dentry **pdn, struct inode *in) return 0; } -static int d_name_cmp(struct dentry *dentry, const char *name, size_t len) -{ - int ret; - - /* take d_lock to ensure dentry->d_name stability */ - spin_lock(&dentry->d_lock); - ret = dentry->d_name.len - len; - if (!ret) - ret = memcmp(dentry->d_name.name, name, len); - spin_unlock(&dentry->d_lock); - return ret; -} - /* * Incorporate results into the local cache. 
This is either just * one inode, or a directory, dentry, and possibly linked-to inode (e.g., @@ -1371,10 +1395,9 @@ retry_lookup: } else if (have_lease) { if (d_unhashed(dn)) d_add(dn, NULL); - update_dentry_lease(dn, rinfo->dlease, - session, - req->r_request_started, - NULL, &dvino); + update_dentry_lease(dir, dn, + rinfo->dlease, session, + req->r_request_started); } goto done; } @@ -1396,11 +1419,9 @@ retry_lookup: } if (have_lease) { - tvino.ino = le64_to_cpu(rinfo->targeti.in->ino); - tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid); - update_dentry_lease(dn, rinfo->dlease, session, - req->r_request_started, - &tvino, &dvino); + update_dentry_lease(dir, dn, + rinfo->dlease, session, + req->r_request_started); } dout(" final dn %p\n", dn); } else if ((req->r_op == CEPH_MDS_OP_LOOKUPSNAP || @@ -1418,27 +1439,20 @@ retry_lookup: err = splice_dentry(&req->r_dentry, in); if (err < 0) goto done; - } else if (rinfo->head->is_dentry && - !d_name_cmp(req->r_dentry, rinfo->dname, rinfo->dname_len)) { + } else if (rinfo->head->is_dentry && req->r_dentry) { + /* parent inode is not locked, be careful */ struct ceph_vino *ptvino = NULL; - - if ((le32_to_cpu(rinfo->diri.in->cap.caps) & CEPH_CAP_FILE_SHARED) || - le32_to_cpu(rinfo->dlease->duration_ms)) { - dvino.ino = le64_to_cpu(rinfo->diri.in->ino); - dvino.snap = le64_to_cpu(rinfo->diri.in->snapid); - - if (rinfo->head->is_target) { - tvino.ino = le64_to_cpu(rinfo->targeti.in->ino); - tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid); - ptvino = &tvino; - } - - update_dentry_lease(req->r_dentry, rinfo->dlease, - session, req->r_request_started, ptvino, - &dvino); - } else { - dout("%s: no dentry lease or dir cap\n", __func__); + dvino.ino = le64_to_cpu(rinfo->diri.in->ino); + dvino.snap = le64_to_cpu(rinfo->diri.in->snapid); + if (rinfo->head->is_target) { + tvino.ino = le64_to_cpu(rinfo->targeti.in->ino); + tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid); + ptvino = &tvino; + } + update_dentry_lease_careful(req->r_dentry, rinfo->dlease, + session, req->r_request_started, + rinfo->dname, rinfo->dname_len, + &dvino, ptvino); } done: dout("fill_trace done err=%d\n", err); @@ -1600,7 +1614,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, /* FIXME: release caps/leases if error occurs */ for (i = 0; i < rinfo->dir_nr; i++) { struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i; - struct ceph_vino tvino, dvino; + struct ceph_vino tvino; dname.name = rde->name; dname.len = rde->name_len; @@ -1701,9 +1715,9 @@ retry_lookup: ceph_dentry(dn)->offset = rde->offset; - dvino = ceph_vino(d_inode(parent)); - update_dentry_lease(dn, rde->lease, req->r_session, - req->r_request_started, &tvino, &dvino); + update_dentry_lease(d_inode(parent), dn, + rde->lease, req->r_session, + req->r_request_started); if (err == 0 && skipped == 0 && cache_ctl.index >= 0) { ret = fill_readdir_cache(d_inode(parent), dn, @@ -2282,7 +2296,7 @@ static int statx_to_caps(u32 want) { int mask = 0; - if (want & (STATX_MODE|STATX_UID|STATX_GID|STATX_CTIME)) + if (want & (STATX_MODE|STATX_UID|STATX_GID|STATX_CTIME|STATX_BTIME)) mask |= CEPH_CAP_AUTH_SHARED; if (want & (STATX_NLINK|STATX_CTIME)) @@ -2307,6 +2321,7 @@ int ceph_getattr(const struct path *path, struct kstat *stat, { struct inode *inode = d_inode(path->dentry); struct ceph_inode_info *ci = ceph_inode(inode); + u32 valid_mask = STATX_BASIC_STATS; int err = 0; /* Skip the getattr altogether if we're asked not to sync */ @@ -2319,6 +2334,16 @@ int ceph_getattr(const struct path *path, struct kstat
*stat, generic_fillattr(inode, stat); stat->ino = ceph_translate_ino(inode->i_sb, inode->i_ino); + + /* + * btime on newly-allocated inodes is 0, so if this is still set to + * that, then assume that it's not valid. + */ + if (ci->i_btime.tv_sec || ci->i_btime.tv_nsec) { + stat->btime = ci->i_btime; + valid_mask |= STATX_BTIME; + } + if (ceph_snap(inode) == CEPH_NOSNAP) stat->dev = inode->i_sb->s_dev; else @@ -2342,7 +2367,6 @@ int ceph_getattr(const struct path *path, struct kstat *stat, stat->nlink = 1 + 1 + ci->i_subdirs; } - /* Mask off any higher bits (e.g. btime) until we have support */ - stat->result_mask = request_mask & STATX_BASIC_STATS; + stat->result_mask = request_mask & valid_mask; return err; } diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index c8a9b89b922d..920e9f048bd8 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -150,14 +150,13 @@ static int parse_reply_info_in(void **p, void *end, info->pool_ns_data = *p; *p += info->pool_ns_len; } - /* btime, change_attr */ - { - struct ceph_timespec btime; - u64 change_attr; - ceph_decode_need(p, end, sizeof(btime), bad); - ceph_decode_copy(p, &btime, sizeof(btime)); - ceph_decode_64_safe(p, end, change_attr, bad); - } + + /* btime */ + ceph_decode_need(p, end, sizeof(info->btime), bad); + ceph_decode_copy(p, &info->btime, sizeof(info->btime)); + + /* change attribute */ + ceph_decode_64_safe(p, end, info->change_attr, bad); /* dir pin */ if (struct_v >= 2) { @@ -166,6 +165,15 @@ static int parse_reply_info_in(void **p, void *end, info->dir_pin = -ENODATA; } + /* snapshot birth time, remains zero for v<=2 */ + if (struct_v >= 3) { + ceph_decode_need(p, end, sizeof(info->snap_btime), bad); + ceph_decode_copy(p, &info->snap_btime, + sizeof(info->snap_btime)); + } else { + memset(&info->snap_btime, 0, sizeof(info->snap_btime)); + } + *p = end; } else { if (features & CEPH_FEATURE_MDS_INLINE_DATA) { @@ -197,7 +205,14 @@ static int parse_reply_info_in(void **p, void *end, } } + if (features & CEPH_FEATURE_FS_BTIME) { + ceph_decode_need(p, end, sizeof(info->btime), bad); + ceph_decode_copy(p, &info->btime, sizeof(info->btime)); + ceph_decode_64_safe(p, end, info->change_attr, bad); + } + info->dir_pin = -ENODATA; + /* info->snap_btime remains zero */ } return 0; bad: @@ -717,6 +732,7 @@ void ceph_mdsc_release_request(struct kref *kref) ceph_pagelist_release(req->r_pagelist); put_request_session(req); ceph_unreserve_caps(req->r_mdsc, &req->r_caps_reservation); + WARN_ON_ONCE(!list_empty(&req->r_wait)); kfree(req); } @@ -903,7 +919,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc, struct inode *dir; rcu_read_lock(); - parent = req->r_dentry->d_parent; + parent = READ_ONCE(req->r_dentry->d_parent); dir = req->r_parent ? : d_inode_rcu(parent); if (!dir || dir->i_sb != mdsc->fsc->sb) { @@ -2135,7 +2151,7 @@ retry: memcpy(path + pos, temp->d_name.name, temp->d_name.len); } spin_unlock(&temp->d_lock); - temp = temp->d_parent; + temp = READ_ONCE(temp->d_parent); /* Are we at the root? */ if (IS_ROOT(temp)) @@ -3727,42 +3743,35 @@ static void check_new_map(struct ceph_mds_client *mdsc, ceph_mdsmap_is_laggy(newmap, i) ? 
" (laggy)" : "", ceph_session_state_name(s->s_state)); - if (i >= newmap->m_num_mds || - memcmp(ceph_mdsmap_get_addr(oldmap, i), - ceph_mdsmap_get_addr(newmap, i), - sizeof(struct ceph_entity_addr))) { - if (s->s_state == CEPH_MDS_SESSION_OPENING) { - /* the session never opened, just close it - * out now */ - get_session(s); - __unregister_session(mdsc, s); - __wake_requests(mdsc, &s->s_waiting); - ceph_put_mds_session(s); - } else if (i >= newmap->m_num_mds) { - /* force close session for stopped mds */ - get_session(s); - __unregister_session(mdsc, s); - __wake_requests(mdsc, &s->s_waiting); - kick_requests(mdsc, i); - mutex_unlock(&mdsc->mutex); + if (i >= newmap->m_num_mds) { + /* force close session for stopped mds */ + get_session(s); + __unregister_session(mdsc, s); + __wake_requests(mdsc, &s->s_waiting); + mutex_unlock(&mdsc->mutex); - mutex_lock(&s->s_mutex); - cleanup_session_requests(mdsc, s); - remove_session_caps(s); - mutex_unlock(&s->s_mutex); + mutex_lock(&s->s_mutex); + cleanup_session_requests(mdsc, s); + remove_session_caps(s); + mutex_unlock(&s->s_mutex); - ceph_put_mds_session(s); + ceph_put_mds_session(s); - mutex_lock(&mdsc->mutex); - } else { - /* just close it */ - mutex_unlock(&mdsc->mutex); - mutex_lock(&s->s_mutex); - mutex_lock(&mdsc->mutex); - ceph_con_close(&s->s_con); - mutex_unlock(&s->s_mutex); - s->s_state = CEPH_MDS_SESSION_RESTARTING; - } + mutex_lock(&mdsc->mutex); + kick_requests(mdsc, i); + continue; + } + + if (memcmp(ceph_mdsmap_get_addr(oldmap, i), + ceph_mdsmap_get_addr(newmap, i), + sizeof(struct ceph_entity_addr))) { + /* just close it */ + mutex_unlock(&mdsc->mutex); + mutex_lock(&s->s_mutex); + mutex_lock(&mdsc->mutex); + ceph_con_close(&s->s_con); + mutex_unlock(&s->s_mutex); + s->s_state = CEPH_MDS_SESSION_RESTARTING; } else if (oldstate == newstate) { continue; /* nothing new with this mds */ } @@ -3931,31 +3940,33 @@ bad: } void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session, - struct inode *inode, struct dentry *dentry, char action, u32 seq) { struct ceph_msg *msg; struct ceph_mds_lease *lease; - int len = sizeof(*lease) + sizeof(u32); - int dnamelen = 0; + struct inode *dir; + int len = sizeof(*lease) + sizeof(u32) + NAME_MAX; - dout("lease_send_msg inode %p dentry %p %s to mds%d\n", - inode, dentry, ceph_lease_op_name(action), session->s_mds); - dnamelen = dentry->d_name.len; - len += dnamelen; + dout("lease_send_msg identry %p %s to mds%d\n", + dentry, ceph_lease_op_name(action), session->s_mds); msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, GFP_NOFS, false); if (!msg) return; lease = msg->front.iov_base; lease->action = action; - lease->ino = cpu_to_le64(ceph_vino(inode).ino); - lease->first = lease->last = cpu_to_le64(ceph_vino(inode).snap); lease->seq = cpu_to_le32(seq); - put_unaligned_le32(dnamelen, lease + 1); - memcpy((void *)(lease + 1) + 4, dentry->d_name.name, dnamelen); + spin_lock(&dentry->d_lock); + dir = d_inode(dentry->d_parent); + lease->ino = cpu_to_le64(ceph_ino(dir)); + lease->first = lease->last = cpu_to_le64(ceph_snap(dir)); + + put_unaligned_le32(dentry->d_name.len, lease + 1); + memcpy((void *)(lease + 1) + 4, + dentry->d_name.name, dentry->d_name.len); + spin_unlock(&dentry->d_lock); /* * if this is a preemptive lease RELEASE, no need to * flush request stream, since the actual request will @@ -4157,6 +4168,7 @@ static void wait_requests(struct ceph_mds_client *mdsc) while ((req = __get_oldest_req(mdsc))) { dout("wait_requests timed out on tid %llu\n", req->r_tid); + list_del_init(&req->r_wait); 
__unregister_request(mdsc, req); } } diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index a83f28bc2387..f7c8603484fe 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -69,6 +69,9 @@ struct ceph_mds_reply_info_in { u64 max_bytes; u64 max_files; s32 dir_pin; + struct ceph_timespec btime; + struct ceph_timespec snap_btime; + u64 change_attr; }; struct ceph_mds_reply_dir_entry { @@ -504,7 +507,6 @@ extern char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base, extern void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry); extern void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session, - struct inode *inode, struct dentry *dentry, char action, u32 seq); diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c index 701b4fb0fb5a..ce2d00da5096 100644 --- a/fs/ceph/mdsmap.c +++ b/fs/ceph/mdsmap.c @@ -107,7 +107,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) struct ceph_mdsmap *m; const void *start = *p; int i, j, n; - int err = -EINVAL; + int err; u8 mdsmap_v, mdsmap_cv; u16 mdsmap_ev; @@ -183,8 +183,9 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) inc = ceph_decode_32(p); state = ceph_decode_32(p); state_seq = ceph_decode_64(p); - ceph_decode_copy(p, &addr, sizeof(addr)); - ceph_decode_addr(&addr); + err = ceph_decode_entity_addr(p, end, &addr); + if (err) + goto corrupt; ceph_decode_copy(p, &laggy_since, sizeof(laggy_since)); *p += sizeof(u32); ceph_decode_32_safe(p, end, namelen, bad); @@ -357,7 +358,7 @@ bad_ext: nomem: err = -ENOMEM; goto out_err; -bad: +corrupt: pr_err("corrupt mdsmap\n"); print_hex_dump(KERN_DEBUG, "mdsmap: ", DUMP_PREFIX_OFFSET, 16, 1, @@ -365,6 +366,9 @@ bad: out_err: ceph_mdsmap_destroy(m); return ERR_PTR(err); +bad: + err = -EINVAL; + goto corrupt; } void ceph_mdsmap_destroy(struct ceph_mdsmap *m) diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c index d629fc857450..de56dee60540 100644 --- a/fs/ceph/quota.c +++ b/fs/ceph/quota.c @@ -135,7 +135,7 @@ static struct inode *lookup_quotarealm_inode(struct ceph_mds_client *mdsc, return NULL; mutex_lock(&qri->mutex); - if (qri->inode) { + if (qri->inode && ceph_is_any_caps(qri->inode)) { /* A request has already returned the inode */ mutex_unlock(&qri->mutex); return qri->inode; @@ -146,7 +146,18 @@ static struct inode *lookup_quotarealm_inode(struct ceph_mds_client *mdsc, mutex_unlock(&qri->mutex); return NULL; } - in = ceph_lookup_inode(sb, realm->ino); + if (qri->inode) { + /* get caps */ + int ret = __ceph_do_getattr(qri->inode, NULL, + CEPH_STAT_CAP_INODE, true); + if (ret >= 0) + in = qri->inode; + else + in = ERR_PTR(ret); + } else { + in = ceph_lookup_inode(sb, realm->ino); + } + if (IS_ERR(in)) { pr_warn("Can't lookup inode %llx (err: %ld)\n", realm->ino, PTR_ERR(in)); diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index 72c6c022f02b..4c6494eb02b5 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -3,6 +3,7 @@ #include <linux/sort.h> #include <linux/slab.h> +#include <linux/iversion.h> #include "super.h" #include "mds_client.h" #include <linux/ceph/decode.h> @@ -606,6 +607,8 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci, capsnap->mtime = inode->i_mtime; capsnap->atime = inode->i_atime; capsnap->ctime = inode->i_ctime; + capsnap->btime = ci->i_btime; + capsnap->change_attr = inode_peek_iversion_raw(inode); capsnap->time_warp_seq = ci->i_time_warp_seq; capsnap->truncate_size = ci->i_truncate_size; capsnap->truncate_seq = ci->i_truncate_seq; diff --git a/fs/ceph/super.c b/fs/ceph/super.c index ed1b65a6c2c3..ab4868c7308e 100644 --- 
a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -840,10 +840,10 @@ static int ceph_remount(struct super_block *sb, int *flags, char *data) static const struct super_operations ceph_super_ops = { .alloc_inode = ceph_alloc_inode, - .destroy_inode = ceph_destroy_inode, .free_inode = ceph_free_inode, .write_inode = ceph_write_inode, - .drop_inode = ceph_drop_inode, + .drop_inode = generic_delete_inode, + .evict_inode = ceph_evict_inode, .sync_fs = ceph_sync_fs, .put_super = ceph_put_super, .remount_fs = ceph_remount, @@ -978,7 +978,7 @@ static int ceph_set_super(struct super_block *s, void *data) s->s_d_op = &ceph_dentry_ops; s->s_export_op = &ceph_export_ops; - s->s_time_gran = 1000; /* 1000 ns == 1 us */ + s->s_time_gran = 1; ret = set_anon_super(s, NULL); /* what is that second arg for? */ if (ret != 0) @@ -1159,17 +1159,15 @@ static int __init init_ceph(void) goto out; ceph_flock_init(); - ceph_xattr_init(); ret = register_filesystem(&ceph_fs_type); if (ret) - goto out_xattr; + goto out_caches; pr_info("loaded (mds proto %d)\n", CEPH_MDSC_PROTOCOL); return 0; -out_xattr: - ceph_xattr_exit(); +out_caches: destroy_caches(); out: return ret; @@ -1179,7 +1177,6 @@ static void __exit exit_ceph(void) { dout("exit_ceph\n"); unregister_filesystem(&ceph_fs_type); - ceph_xattr_exit(); destroy_caches(); } diff --git a/fs/ceph/super.h b/fs/ceph/super.h index fbe6869a3f95..d2352fd95dbc 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -197,7 +197,8 @@ struct ceph_cap_snap { u64 xattr_version; u64 size; - struct timespec64 mtime, atime, ctime; + u64 change_attr; + struct timespec64 mtime, atime, ctime, btime; u64 time_warp_seq; u64 truncate_size; u32 truncate_seq; @@ -384,6 +385,8 @@ struct ceph_inode_info { int i_snap_realm_counter; /* snap realm (if caps) */ struct list_head i_snap_realm_item; struct list_head i_snap_flush_item; + struct timespec64 i_btime; + struct timespec64 i_snap_btime; struct work_struct i_work; unsigned long i_work_mask; @@ -544,7 +547,12 @@ static inline void __ceph_dir_set_complete(struct ceph_inode_info *ci, long long release_count, long long ordered_count) { - smp_mb__before_atomic(); + /* + * Makes sure operations that setup readdir cache (update page + * cache and i_size) are strongly ordered w.r.t. the following + * atomic64_set() operations. 
+ */ + smp_mb(); atomic64_set(&ci->i_complete_seq[0], release_count); atomic64_set(&ci->i_complete_seq[1], ordered_count); } @@ -876,9 +884,8 @@ static inline bool __ceph_have_pending_cap_snap(struct ceph_inode_info *ci) extern const struct inode_operations ceph_file_iops; extern struct inode *ceph_alloc_inode(struct super_block *sb); -extern void ceph_destroy_inode(struct inode *inode); +extern void ceph_evict_inode(struct inode *inode); extern void ceph_free_inode(struct inode *inode); -extern int ceph_drop_inode(struct inode *inode); extern struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino); @@ -921,10 +928,20 @@ ssize_t __ceph_getxattr(struct inode *, const char *, void *, size_t); extern ssize_t ceph_listxattr(struct dentry *, char *, size_t); extern void __ceph_build_xattrs_blob(struct ceph_inode_info *ci); extern void __ceph_destroy_xattrs(struct ceph_inode_info *ci); -extern void __init ceph_xattr_init(void); -extern void ceph_xattr_exit(void); extern const struct xattr_handler *ceph_xattr_handlers[]; +struct ceph_acl_sec_ctx { +#ifdef CONFIG_CEPH_FS_POSIX_ACL + void *default_acl; + void *acl; +#endif +#ifdef CONFIG_CEPH_FS_SECURITY_LABEL + void *sec_ctx; + u32 sec_ctxlen; +#endif + struct ceph_pagelist *pagelist; +}; + #ifdef CONFIG_SECURITY extern bool ceph_security_xattr_deadlock(struct inode *in); extern bool ceph_security_xattr_wanted(struct inode *in); @@ -939,21 +956,32 @@ static inline bool ceph_security_xattr_wanted(struct inode *in) } #endif -/* acl.c */ -struct ceph_acls_info { - void *default_acl; - void *acl; - struct ceph_pagelist *pagelist; -}; +#ifdef CONFIG_CEPH_FS_SECURITY_LABEL +extern int ceph_security_init_secctx(struct dentry *dentry, umode_t mode, + struct ceph_acl_sec_ctx *ctx); +extern void ceph_security_invalidate_secctx(struct inode *inode); +#else +static inline int ceph_security_init_secctx(struct dentry *dentry, umode_t mode, + struct ceph_acl_sec_ctx *ctx) +{ + return 0; +} +static inline void ceph_security_invalidate_secctx(struct inode *inode) +{ +} +#endif + +void ceph_release_acl_sec_ctx(struct ceph_acl_sec_ctx *as_ctx); +/* acl.c */ #ifdef CONFIG_CEPH_FS_POSIX_ACL struct posix_acl *ceph_get_acl(struct inode *, int); int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type); int ceph_pre_init_acls(struct inode *dir, umode_t *mode, - struct ceph_acls_info *info); -void ceph_init_inode_acls(struct inode *inode, struct ceph_acls_info *info); -void ceph_release_acls_info(struct ceph_acls_info *info); + struct ceph_acl_sec_ctx *as_ctx); +void ceph_init_inode_acls(struct inode *inode, + struct ceph_acl_sec_ctx *as_ctx); static inline void ceph_forget_all_cached_acls(struct inode *inode) { @@ -966,15 +994,12 @@ static inline void ceph_forget_all_cached_acls(struct inode *inode) #define ceph_set_acl NULL static inline int ceph_pre_init_acls(struct inode *dir, umode_t *mode, - struct ceph_acls_info *info) + struct ceph_acl_sec_ctx *as_ctx) { return 0; } static inline void ceph_init_inode_acls(struct inode *inode, - struct ceph_acls_info *info) -{ -} -static inline void ceph_release_acls_info(struct ceph_acls_info *info) + struct ceph_acl_sec_ctx *as_ctx) { } static inline int ceph_acl_chmod(struct dentry *dentry, struct inode *inode) @@ -1000,7 +1025,7 @@ extern void ceph_add_cap(struct inode *inode, unsigned cap, unsigned seq, u64 realmino, int flags, struct ceph_cap **new_cap); extern void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release); -extern void __ceph_remove_caps(struct inode* inode); +extern void 
__ceph_remove_caps(struct ceph_inode_info *ci); extern void ceph_put_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap); extern int ceph_is_any_caps(struct inode *inode); diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 0cc42c8879e9..37b458a9af3a 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -8,6 +8,7 @@ #include <linux/ceph/decode.h> #include <linux/xattr.h> +#include <linux/security.h> #include <linux/posix_acl_xattr.h> #include <linux/slab.h> @@ -17,26 +18,9 @@ static int __remove_xattr(struct ceph_inode_info *ci, struct ceph_inode_xattr *xattr); -static const struct xattr_handler ceph_other_xattr_handler; - -/* - * List of handlers for synthetic system.* attributes. Other - * attributes are handled directly. - */ -const struct xattr_handler *ceph_xattr_handlers[] = { -#ifdef CONFIG_CEPH_FS_POSIX_ACL - &posix_acl_access_xattr_handler, - &posix_acl_default_xattr_handler, -#endif - &ceph_other_xattr_handler, - NULL, -}; - static bool ceph_is_valid_xattr(const char *name) { return !strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN) || - !strncmp(name, XATTR_SECURITY_PREFIX, - XATTR_SECURITY_PREFIX_LEN) || !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) || !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN); } @@ -48,8 +32,8 @@ static bool ceph_is_valid_xattr(const char *name) struct ceph_vxattr { char *name; size_t name_size; /* strlen(name) + 1 (for '\0') */ - size_t (*getxattr_cb)(struct ceph_inode_info *ci, char *val, - size_t size); + ssize_t (*getxattr_cb)(struct ceph_inode_info *ci, char *val, + size_t size); bool (*exists_cb)(struct ceph_inode_info *ci); unsigned int flags; }; @@ -68,8 +52,8 @@ static bool ceph_vxattrcb_layout_exists(struct ceph_inode_info *ci) rcu_dereference_raw(fl->pool_ns) != NULL); } -static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val, - size_t size) +static ssize_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val, + size_t size) { struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb); struct ceph_osd_client *osdc = &fsc->client->osdc; @@ -79,7 +63,7 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val, const char *ns_field = " pool_namespace="; char buf[128]; size_t len, total_len = 0; - int ret; + ssize_t ret; pool_ns = ceph_try_get_string(ci->i_layout.pool_ns); @@ -96,18 +80,15 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val, len = snprintf(buf, sizeof(buf), "stripe_unit=%u stripe_count=%u object_size=%u pool=%lld", ci->i_layout.stripe_unit, ci->i_layout.stripe_count, - ci->i_layout.object_size, (unsigned long long)pool); + ci->i_layout.object_size, pool); total_len = len; } if (pool_ns) total_len += strlen(ns_field) + pool_ns->len; - if (!size) { - ret = total_len; - } else if (total_len > size) { - ret = -ERANGE; - } else { + ret = total_len; + if (size >= total_len) { memcpy(val, buf, len); ret = len; if (pool_name) { @@ -128,28 +109,55 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val, return ret; } -static size_t ceph_vxattrcb_layout_stripe_unit(struct ceph_inode_info *ci, - char *val, size_t size) +/* + * The convention with strings in xattrs is that they should not be NULL + * terminated, since we're returning the length with them. snprintf always + * NULL terminates however, so call it on a temporary buffer and then memcpy + * the result into place. + */ +static int ceph_fmt_xattr(char *val, size_t size, const char *fmt, ...) 
{ - return snprintf(val, size, "%u", ci->i_layout.stripe_unit); + int ret; + va_list args; + char buf[96]; /* NB: reevaluate size if new vxattrs are added */ + + va_start(args, fmt); + ret = vsnprintf(buf, size ? sizeof(buf) : 0, fmt, args); + va_end(args); + + /* Sanity check */ + if (size && ret + 1 > sizeof(buf)) { + WARN_ONCE(true, "Returned length too big (%d)", ret); + return -E2BIG; + } + + if (ret <= size) + memcpy(val, buf, ret); + return ret; } -static size_t ceph_vxattrcb_layout_stripe_count(struct ceph_inode_info *ci, +static ssize_t ceph_vxattrcb_layout_stripe_unit(struct ceph_inode_info *ci, char *val, size_t size) { - return snprintf(val, size, "%u", ci->i_layout.stripe_count); + return ceph_fmt_xattr(val, size, "%u", ci->i_layout.stripe_unit); +} + +static ssize_t ceph_vxattrcb_layout_stripe_count(struct ceph_inode_info *ci, + char *val, size_t size) +{ + return ceph_fmt_xattr(val, size, "%u", ci->i_layout.stripe_count); } -static size_t ceph_vxattrcb_layout_object_size(struct ceph_inode_info *ci, - char *val, size_t size) +static ssize_t ceph_vxattrcb_layout_object_size(struct ceph_inode_info *ci, + char *val, size_t size) { - return snprintf(val, size, "%u", ci->i_layout.object_size); + return ceph_fmt_xattr(val, size, "%u", ci->i_layout.object_size); } -static size_t ceph_vxattrcb_layout_pool(struct ceph_inode_info *ci, - char *val, size_t size) +static ssize_t ceph_vxattrcb_layout_pool(struct ceph_inode_info *ci, + char *val, size_t size) { - int ret; + ssize_t ret; struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb); struct ceph_osd_client *osdc = &fsc->client->osdc; s64 pool = ci->i_layout.pool_id; @@ -157,21 +165,27 @@ static size_t ceph_vxattrcb_layout_pool(struct ceph_inode_info *ci, down_read(&osdc->lock); pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool); - if (pool_name) - ret = snprintf(val, size, "%s", pool_name); - else - ret = snprintf(val, size, "%lld", (unsigned long long)pool); + if (pool_name) { + ret = strlen(pool_name); + if (ret <= size) + memcpy(val, pool_name, ret); + } else { + ret = ceph_fmt_xattr(val, size, "%lld", pool); + } up_read(&osdc->lock); return ret; } -static size_t ceph_vxattrcb_layout_pool_namespace(struct ceph_inode_info *ci, - char *val, size_t size) +static ssize_t ceph_vxattrcb_layout_pool_namespace(struct ceph_inode_info *ci, + char *val, size_t size) { - int ret = 0; + ssize_t ret = 0; struct ceph_string *ns = ceph_try_get_string(ci->i_layout.pool_ns); + if (ns) { - ret = snprintf(val, size, "%.*s", (int)ns->len, ns->str); + ret = ns->len; + if (ret <= size) + memcpy(val, ns->str, ret); ceph_put_string(ns); } return ret; @@ -179,53 +193,54 @@ static size_t ceph_vxattrcb_layout_pool_namespace(struct ceph_inode_info *ci, /* directories */ -static size_t ceph_vxattrcb_dir_entries(struct ceph_inode_info *ci, char *val, - size_t size) +static ssize_t ceph_vxattrcb_dir_entries(struct ceph_inode_info *ci, char *val, + size_t size) { - return snprintf(val, size, "%lld", ci->i_files + ci->i_subdirs); + return ceph_fmt_xattr(val, size, "%lld", ci->i_files + ci->i_subdirs); } -static size_t ceph_vxattrcb_dir_files(struct ceph_inode_info *ci, char *val, - size_t size) +static ssize_t ceph_vxattrcb_dir_files(struct ceph_inode_info *ci, char *val, + size_t size) { - return snprintf(val, size, "%lld", ci->i_files); + return ceph_fmt_xattr(val, size, "%lld", ci->i_files); } -static size_t ceph_vxattrcb_dir_subdirs(struct ceph_inode_info *ci, char *val, - size_t size) +static ssize_t ceph_vxattrcb_dir_subdirs(struct 
ceph_inode_info *ci, char *val, + size_t size) { - return snprintf(val, size, "%lld", ci->i_subdirs); + return ceph_fmt_xattr(val, size, "%lld", ci->i_subdirs); } -static size_t ceph_vxattrcb_dir_rentries(struct ceph_inode_info *ci, char *val, - size_t size) +static ssize_t ceph_vxattrcb_dir_rentries(struct ceph_inode_info *ci, char *val, + size_t size) { - return snprintf(val, size, "%lld", ci->i_rfiles + ci->i_rsubdirs); + return ceph_fmt_xattr(val, size, "%lld", + ci->i_rfiles + ci->i_rsubdirs); } -static size_t ceph_vxattrcb_dir_rfiles(struct ceph_inode_info *ci, char *val, - size_t size) +static ssize_t ceph_vxattrcb_dir_rfiles(struct ceph_inode_info *ci, char *val, + size_t size) { - return snprintf(val, size, "%lld", ci->i_rfiles); + return ceph_fmt_xattr(val, size, "%lld", ci->i_rfiles); } -static size_t ceph_vxattrcb_dir_rsubdirs(struct ceph_inode_info *ci, char *val, - size_t size) +static ssize_t ceph_vxattrcb_dir_rsubdirs(struct ceph_inode_info *ci, char *val, + size_t size) { - return snprintf(val, size, "%lld", ci->i_rsubdirs); + return ceph_fmt_xattr(val, size, "%lld", ci->i_rsubdirs); } -static size_t ceph_vxattrcb_dir_rbytes(struct ceph_inode_info *ci, char *val, - size_t size) +static ssize_t ceph_vxattrcb_dir_rbytes(struct ceph_inode_info *ci, char *val, + size_t size) { - return snprintf(val, size, "%lld", ci->i_rbytes); + return ceph_fmt_xattr(val, size, "%lld", ci->i_rbytes); } -static size_t ceph_vxattrcb_dir_rctime(struct ceph_inode_info *ci, char *val, - size_t size) +static ssize_t ceph_vxattrcb_dir_rctime(struct ceph_inode_info *ci, char *val, + size_t size) { - return snprintf(val, size, "%lld.09%ld", ci->i_rctime.tv_sec, - ci->i_rctime.tv_nsec); + return ceph_fmt_xattr(val, size, "%lld.%09ld", ci->i_rctime.tv_sec, + ci->i_rctime.tv_nsec); } /* dir pin */ @@ -234,10 +249,10 @@ static bool ceph_vxattrcb_dir_pin_exists(struct ceph_inode_info *ci) return ci->i_dir_pin != -ENODATA; } -static size_t ceph_vxattrcb_dir_pin(struct ceph_inode_info *ci, char *val, - size_t size) +static ssize_t ceph_vxattrcb_dir_pin(struct ceph_inode_info *ci, char *val, + size_t size) { - return snprintf(val, size, "%d", (int)ci->i_dir_pin); + return ceph_fmt_xattr(val, size, "%d", (int)ci->i_dir_pin); } /* quotas */ @@ -254,23 +269,36 @@ static bool ceph_vxattrcb_quota_exists(struct ceph_inode_info *ci) return ret; } -static size_t ceph_vxattrcb_quota(struct ceph_inode_info *ci, char *val, - size_t size) +static ssize_t ceph_vxattrcb_quota(struct ceph_inode_info *ci, char *val, + size_t size) +{ + return ceph_fmt_xattr(val, size, "max_bytes=%llu max_files=%llu", + ci->i_max_bytes, ci->i_max_files); +} + +static ssize_t ceph_vxattrcb_quota_max_bytes(struct ceph_inode_info *ci, + char *val, size_t size) { - return snprintf(val, size, "max_bytes=%llu max_files=%llu", - ci->i_max_bytes, ci->i_max_files); + return ceph_fmt_xattr(val, size, "%llu", ci->i_max_bytes); } -static size_t ceph_vxattrcb_quota_max_bytes(struct ceph_inode_info *ci, - char *val, size_t size) +static ssize_t ceph_vxattrcb_quota_max_files(struct ceph_inode_info *ci, + char *val, size_t size) { - return snprintf(val, size, "%llu", ci->i_max_bytes); + return ceph_fmt_xattr(val, size, "%llu", ci->i_max_files); } -static size_t ceph_vxattrcb_quota_max_files(struct ceph_inode_info *ci, - char *val, size_t size) +/* snapshots */ +static bool ceph_vxattrcb_snap_btime_exists(struct ceph_inode_info *ci) { - return snprintf(val, size, "%llu", ci->i_max_files); + return (ci->i_snap_btime.tv_sec != 0 || ci->i_snap_btime.tv_nsec != 
0); +} + +static ssize_t ceph_vxattrcb_snap_btime(struct ceph_inode_info *ci, char *val, + size_t size) +{ + return ceph_fmt_xattr(val, size, "%lld.%09ld", ci->i_snap_btime.tv_sec, + ci->i_snap_btime.tv_nsec); } #define CEPH_XATTR_NAME(_type, _name) XATTR_CEPH_PREFIX #_type "." #_name @@ -327,7 +355,7 @@ static struct ceph_vxattr ceph_dir_vxattrs[] = { XATTR_RSTAT_FIELD(dir, rctime), { .name = "ceph.dir.pin", - .name_size = sizeof("ceph.dir_pin"), + .name_size = sizeof("ceph.dir.pin"), .getxattr_cb = ceph_vxattrcb_dir_pin, .exists_cb = ceph_vxattrcb_dir_pin_exists, .flags = VXATTR_FLAG_HIDDEN, @@ -341,9 +369,15 @@ static struct ceph_vxattr ceph_dir_vxattrs[] = { }, XATTR_QUOTA_FIELD(quota, max_bytes), XATTR_QUOTA_FIELD(quota, max_files), + { + .name = "ceph.snap.btime", + .name_size = sizeof("ceph.snap.btime"), + .getxattr_cb = ceph_vxattrcb_snap_btime, + .exists_cb = ceph_vxattrcb_snap_btime_exists, + .flags = VXATTR_FLAG_READONLY, + }, { .name = NULL, 0 } /* Required table terminator */ }; -static size_t ceph_dir_vxattrs_name_size; /* total size of all names */ /* files */ @@ -360,9 +394,15 @@ static struct ceph_vxattr ceph_file_vxattrs[] = { XATTR_LAYOUT_FIELD(file, layout, object_size), XATTR_LAYOUT_FIELD(file, layout, pool), XATTR_LAYOUT_FIELD(file, layout, pool_namespace), + { + .name = "ceph.snap.btime", + .name_size = sizeof("ceph.snap.btime"), + .getxattr_cb = ceph_vxattrcb_snap_btime, + .exists_cb = ceph_vxattrcb_snap_btime_exists, + .flags = VXATTR_FLAG_READONLY, + }, { .name = NULL, 0 } /* Required table terminator */ }; -static size_t ceph_file_vxattrs_name_size; /* total size of all names */ static struct ceph_vxattr *ceph_inode_vxattrs(struct inode *inode) { @@ -373,47 +413,6 @@ static struct ceph_vxattr *ceph_inode_vxattrs(struct inode *inode) return NULL; } -static size_t ceph_vxattrs_name_size(struct ceph_vxattr *vxattrs) -{ - if (vxattrs == ceph_dir_vxattrs) - return ceph_dir_vxattrs_name_size; - if (vxattrs == ceph_file_vxattrs) - return ceph_file_vxattrs_name_size; - BUG_ON(vxattrs); - return 0; -} - -/* - * Compute the aggregate size (including terminating '\0') of all - * virtual extended attribute names in the given vxattr table. 
- */ -static size_t __init vxattrs_name_size(struct ceph_vxattr *vxattrs) -{ - struct ceph_vxattr *vxattr; - size_t size = 0; - - for (vxattr = vxattrs; vxattr->name; vxattr++) { - if (!(vxattr->flags & VXATTR_FLAG_HIDDEN)) - size += vxattr->name_size; - } - - return size; -} - -/* Routines called at initialization and exit time */ - -void __init ceph_xattr_init(void) -{ - ceph_dir_vxattrs_name_size = vxattrs_name_size(ceph_dir_vxattrs); - ceph_file_vxattrs_name_size = vxattrs_name_size(ceph_file_vxattrs); -} - -void ceph_xattr_exit(void) -{ - ceph_dir_vxattrs_name_size = 0; - ceph_file_vxattrs_name_size = 0; -} - static struct ceph_vxattr *ceph_match_vxattr(struct inode *inode, const char *name) { @@ -523,8 +522,8 @@ static int __set_xattr(struct ceph_inode_info *ci, dout("__set_xattr_val p=%p\n", p); } - dout("__set_xattr_val added %llx.%llx xattr %p %s=%.*s\n", - ceph_vinop(&ci->vfs_inode), xattr, name, val_len, val); + dout("__set_xattr_val added %llx.%llx xattr %p %.*s=%.*s\n", + ceph_vinop(&ci->vfs_inode), xattr, name_len, name, val_len, val); return 0; } @@ -823,7 +822,7 @@ ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value, struct ceph_inode_xattr *xattr; struct ceph_vxattr *vxattr = NULL; int req_mask; - int err; + ssize_t err; /* let's see if a virtual xattr was requested */ vxattr = ceph_match_vxattr(inode, name); @@ -835,8 +834,11 @@ ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value, if (err) return err; err = -ENODATA; - if (!(vxattr->exists_cb && !vxattr->exists_cb(ci))) + if (!(vxattr->exists_cb && !vxattr->exists_cb(ci))) { err = vxattr->getxattr_cb(ci, value, size); + if (size && size < err) + err = -ERANGE; + } return err; } @@ -897,10 +899,9 @@ ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size) struct inode *inode = d_inode(dentry); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_vxattr *vxattrs = ceph_inode_vxattrs(inode); - u32 vir_namelen = 0; + bool len_only = (size == 0); u32 namelen; int err; - u32 len; int i; spin_lock(&ci->i_ceph_lock); @@ -919,38 +920,45 @@ ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size) err = __build_xattrs(inode); if (err < 0) goto out; - /* - * Start with virtual dir xattr names (if any) (including - * terminating '\0' characters for each). 
- */ - vir_namelen = ceph_vxattrs_name_size(vxattrs); - /* adding 1 byte per each variable due to the null termination */ + /* add 1 byte for each xattr due to the null termination */ namelen = ci->i_xattrs.names_size + ci->i_xattrs.count; - err = -ERANGE; - if (size && vir_namelen + namelen > size) - goto out; - - err = namelen + vir_namelen; - if (size == 0) - goto out; + if (!len_only) { + if (namelen > size) { + err = -ERANGE; + goto out; + } + names = __copy_xattr_names(ci, names); + size -= namelen; + } - names = __copy_xattr_names(ci, names); /* virtual xattr names, too */ - err = namelen; if (vxattrs) { for (i = 0; vxattrs[i].name; i++) { - if (!(vxattrs[i].flags & VXATTR_FLAG_HIDDEN) && - !(vxattrs[i].exists_cb && - !vxattrs[i].exists_cb(ci))) { - len = sprintf(names, "%s", vxattrs[i].name); - names += len + 1; - err += len + 1; + size_t this_len; + + if (vxattrs[i].flags & VXATTR_FLAG_HIDDEN) + continue; + if (vxattrs[i].exists_cb && !vxattrs[i].exists_cb(ci)) + continue; + + this_len = strlen(vxattrs[i].name) + 1; + namelen += this_len; + if (len_only) + continue; + + if (this_len > size) { + err = -ERANGE; + goto out; } + + memcpy(names, vxattrs[i].name, this_len); + names += this_len; + size -= this_len; } } - + err = namelen; out: spin_unlock(&ci->i_ceph_lock); return err; @@ -1206,4 +1214,138 @@ bool ceph_security_xattr_deadlock(struct inode *in) spin_unlock(&ci->i_ceph_lock); return ret; } + +#ifdef CONFIG_CEPH_FS_SECURITY_LABEL +int ceph_security_init_secctx(struct dentry *dentry, umode_t mode, + struct ceph_acl_sec_ctx *as_ctx) +{ + struct ceph_pagelist *pagelist = as_ctx->pagelist; + const char *name; + size_t name_len; + int err; + + err = security_dentry_init_security(dentry, mode, &dentry->d_name, + &as_ctx->sec_ctx, + &as_ctx->sec_ctxlen); + if (err < 0) { + WARN_ON_ONCE(err != -EOPNOTSUPP); + err = 0; /* do nothing */ + goto out; + } + + err = -ENOMEM; + if (!pagelist) { + pagelist = ceph_pagelist_alloc(GFP_KERNEL); + if (!pagelist) + goto out; + err = ceph_pagelist_reserve(pagelist, PAGE_SIZE); + if (err) + goto out; + ceph_pagelist_encode_32(pagelist, 1); + } + + /* + * FIXME: Make security_dentry_init_security() generic. Currently + * It only supports single security module and only selinux has + * dentry_init_security hook. 
+ */ + name = XATTR_NAME_SELINUX; + name_len = strlen(name); + err = ceph_pagelist_reserve(pagelist, + 4 * 2 + name_len + as_ctx->sec_ctxlen); + if (err) + goto out; + + if (as_ctx->pagelist) { + /* update count of KV pairs */ + BUG_ON(pagelist->length <= sizeof(__le32)); + if (list_is_singular(&pagelist->head)) { + le32_add_cpu((__le32*)pagelist->mapped_tail, 1); + } else { + struct page *page = list_first_entry(&pagelist->head, + struct page, lru); + void *addr = kmap_atomic(page); + le32_add_cpu((__le32*)addr, 1); + kunmap_atomic(addr); + } + } else { + as_ctx->pagelist = pagelist; + } + + ceph_pagelist_encode_32(pagelist, name_len); + ceph_pagelist_append(pagelist, name, name_len); + + ceph_pagelist_encode_32(pagelist, as_ctx->sec_ctxlen); + ceph_pagelist_append(pagelist, as_ctx->sec_ctx, as_ctx->sec_ctxlen); + + err = 0; +out: + if (pagelist && !as_ctx->pagelist) + ceph_pagelist_release(pagelist); + return err; +} + +void ceph_security_invalidate_secctx(struct inode *inode) +{ + security_inode_invalidate_secctx(inode); +} + +static int ceph_xattr_set_security_label(const struct xattr_handler *handler, + struct dentry *unused, struct inode *inode, + const char *key, const void *buf, + size_t buflen, int flags) +{ + if (security_ismaclabel(key)) { + const char *name = xattr_full_name(handler, key); + return __ceph_setxattr(inode, name, buf, buflen, flags); + } + return -EOPNOTSUPP; +} + +static int ceph_xattr_get_security_label(const struct xattr_handler *handler, + struct dentry *unused, struct inode *inode, + const char *key, void *buf, size_t buflen) +{ + if (security_ismaclabel(key)) { + const char *name = xattr_full_name(handler, key); + return __ceph_getxattr(inode, name, buf, buflen); + } + return -EOPNOTSUPP; +} + +static const struct xattr_handler ceph_security_label_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .get = ceph_xattr_get_security_label, + .set = ceph_xattr_set_security_label, +}; +#endif #endif + +void ceph_release_acl_sec_ctx(struct ceph_acl_sec_ctx *as_ctx) +{ +#ifdef CONFIG_CEPH_FS_POSIX_ACL + posix_acl_release(as_ctx->acl); + posix_acl_release(as_ctx->default_acl); +#endif +#ifdef CONFIG_CEPH_FS_SECURITY_LABEL + security_release_secctx(as_ctx->sec_ctx, as_ctx->sec_ctxlen); +#endif + if (as_ctx->pagelist) + ceph_pagelist_release(as_ctx->pagelist); +} + +/* + * List of handlers for synthetic system.* attributes. Other + * attributes are handled directly. + */ +const struct xattr_handler *ceph_xattr_handlers[] = { +#ifdef CONFIG_CEPH_FS_POSIX_ACL + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, +#endif +#ifdef CONFIG_CEPH_FS_SECURITY_LABEL + &ceph_security_label_handler, +#endif + &ceph_other_xattr_handler, + NULL, +}; diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig index 523e9ea78a28..b16219e5dac9 100644 --- a/fs/cifs/Kconfig +++ b/fs/cifs/Kconfig @@ -13,9 +13,11 @@ config CIFS select CRYPTO_LIB_ARC4 select CRYPTO_AEAD2 select CRYPTO_CCM + select CRYPTO_GCM select CRYPTO_ECB select CRYPTO_AES select CRYPTO_DES + select KEYS help This is the client VFS module for the SMB3 family of NAS protocols, (including support for the most recent, most secure dialect SMB3.1.1) @@ -109,7 +111,7 @@ config CIFS_WEAK_PW_HASH config CIFS_UPCALL bool "Kerberos/SPNEGO advanced session setup" - depends on CIFS && KEYS + depends on CIFS select DNS_RESOLVER help Enables an upcall mechanism for CIFS which accesses userspace helper @@ -144,14 +146,6 @@ config CIFS_POSIX (such as Samba 3.10 and later) which can negotiate CIFS POSIX ACL support. If unsure, say N. 
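The hunk below drops the CIFS_ACL Kconfig option entirely: the cifsacl code is now always built (cifsacl.o moves into cifs-y in the Makefile change further down), and ACL handling becomes purely a mount-time decision, which is also why KEYS turns into a hard select above. A minimal user-space sketch of that shift from a compile-time switch to a run-time flag; the flag value and helper name here are illustrative stand-ins, not the kernel's:

/* gcc demo.c && ./a.out
 * Sketch only: MNT_CIFS_ACL stands in for CIFS_MOUNT_CIFS_ACL, and
 * fetch_acl() for the ACL entry points that used to sit behind
 * #ifdef CONFIG_CIFS_ACL. */
#include <stdio.h>

#define MNT_CIFS_ACL 0x1u

static int fetch_acl(unsigned int mnt_flags)
{
	/* code is always compiled in; this mount simply opted out */
	if (!(mnt_flags & MNT_CIFS_ACL))
		return 0;
	puts("querying server for the security descriptor");
	return 1;
}

int main(void)
{
	fetch_acl(0);            /* mounted without "cifsacl": quiet no-op */
	fetch_acl(MNT_CIFS_ACL); /* mounted with "cifsacl": ACL path runs */
	return 0;
}

The trade-off is that a kernel that never mounts with "cifsacl" still carries the code and the KEYS dependency, in exchange for one less build permutation to test.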
-config CIFS_ACL - bool "Provide CIFS ACL support" - depends on CIFS_XATTR && KEYS - help - Allows fetching CIFS/NTFS ACL from the server. The DACL blob - is handed over to the application/caller. See the man - page for getcifsacl for more information. If unsure, say Y. - config CIFS_DEBUG bool "Enable CIFS debugging routines" default y @@ -184,7 +178,7 @@ config CIFS_DEBUG_DUMP_KEYS config CIFS_DFS_UPCALL bool "DFS feature support" - depends on CIFS && KEYS + depends on CIFS select DNS_RESOLVER help Distributed File System (DFS) support is used to access shares @@ -203,10 +197,10 @@ config CIFS_NFSD_EXPORT Allows NFS server to export a CIFS mounted share (nfsd over cifs) config CIFS_SMB_DIRECT - bool "SMB Direct support (Experimental)" + bool "SMB Direct support" depends on CIFS=m && INFINIBAND && INFINIBAND_ADDR_TRANS || CIFS=y && INFINIBAND=y && INFINIBAND_ADDR_TRANS=y help - Enables SMB Direct experimental support for SMB 3.0, 3.02 and 3.1.1. + Enables SMB Direct support for SMB 3.0, 3.02 and 3.1.1. SMB Direct allows transferring SMB packets over RDMA. If unsure, say N. diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile index 51af69a1a328..41332f20055b 100644 --- a/fs/cifs/Makefile +++ b/fs/cifs/Makefile @@ -10,10 +10,9 @@ cifs-y := trace.o cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o \ cifs_unicode.o nterr.o cifsencrypt.o \ readdir.o ioctl.o sess.o export.o smb1ops.o winucase.o \ smb2ops.o smb2maperror.o smb2transport.o \ - smb2misc.o smb2pdu.o smb2inode.o smb2file.o + smb2misc.o smb2pdu.o smb2inode.o smb2file.o cifsacl.o cifs-$(CONFIG_CIFS_XATTR) += xattr.o -cifs-$(CONFIG_CIFS_ACL) += cifsacl.o cifs-$(CONFIG_CIFS_UPCALL) += cifs_spnego.o diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index ec933fb0b36e..a38d796f5ffe 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -240,9 +240,7 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v) #ifdef CONFIG_CIFS_XATTR seq_printf(m, ",XATTR"); #endif -#ifdef CONFIG_CIFS_ACL seq_printf(m, ",ACL"); -#endif seq_putc(m, '\n'); seq_printf(m, "CIFSMaxBufSize: %d\n", CIFSMaxBufSize); seq_printf(m, "Active VFS Requests: %d\n", GlobalTotalActiveXid); diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h index ed49222abecb..b326d2ca3765 100644 --- a/fs/cifs/cifs_fs_sb.h +++ b/fs/cifs/cifs_fs_sb.h @@ -52,6 +52,7 @@ #define CIFS_MOUNT_UID_FROM_ACL 0x2000000 /* try to get UID via special SID */ #define CIFS_MOUNT_NO_HANDLE_CACHE 0x4000000 /* disable caching dir handles */ #define CIFS_MOUNT_NO_DFS 0x8000000 /* disable DFS resolving */ +#define CIFS_MOUNT_MODE_FROM_SID 0x10000000 /* retrieve mode from special ACE */ struct cifs_sb_info { struct rb_root tlink_tree; @@ -83,5 +84,10 @@ struct cifs_sb_info { * failover properly. */ char *origin_fullpath; /* \\HOST\SHARE\[OPTIONAL PATH] */ + /* + * Indicate whether serverino option was turned off later + * (cifs_autodisable_serverino) in order to match new mounts. 
+ */ + bool mnt_cifs_serverino_autodisabled; }; #endif /* _CIFS_FS_SB_H */ diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 24635b65effa..3289b566463f 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -526,6 +526,8 @@ cifs_show_options(struct seq_file *s, struct dentry *root) seq_puts(s, ",nobrl"); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_HANDLE_CACHE) seq_puts(s, ",nohandlecache"); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MODE_FROM_SID) + seq_puts(s, ",modefromsid"); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) seq_puts(s, ",cifsacl"); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM) @@ -554,6 +556,11 @@ cifs_show_options(struct seq_file *s, struct dentry *root) seq_printf(s, ",bsize=%u", cifs_sb->bsize); seq_printf(s, ",echo_interval=%lu", tcon->ses->server->echo_interval / HZ); + + /* Only display max_credits if it was overridden on mount */ + if (tcon->ses->server->max_credits != SMB2_MAX_CREDITS_AVAILABLE) + seq_printf(s, ",max_credits=%u", tcon->ses->server->max_credits); + if (tcon->snapshot_time) seq_printf(s, ",snapshot=%llu", tcon->snapshot_time); if (tcon->handle_timeout) @@ -1097,6 +1104,10 @@ ssize_t cifs_file_copychunk_range(unsigned int xid, goto out; } + rc = -EOPNOTSUPP; + if (!target_tcon->ses->server->ops->copychunk_range) + goto out; + /* * Note: cifs case is easier than btrfs since server responsible for * checks for proper open modes and file type and if it wants @@ -1108,11 +1119,12 @@ ssize_t cifs_file_copychunk_range(unsigned int xid, /* should we flush first and last page first */ truncate_inode_pages(&target_inode->i_data, 0); - if (target_tcon->ses->server->ops->copychunk_range) + rc = file_modified(dst_file); + if (!rc) rc = target_tcon->ses->server->ops->copychunk_range(xid, smb_file_src, smb_file_target, off, len, destoff); - else - rc = -EOPNOTSUPP; + + file_accessed(src_file); /* force revalidate of size and timestamps of target file now * that target is updated on the server @@ -1517,11 +1529,9 @@ init_cifs(void) goto out_destroy_dfs_cache; #endif /* CONFIG_CIFS_UPCALL */ -#ifdef CONFIG_CIFS_ACL rc = init_cifs_idmap(); if (rc) goto out_register_key_type; -#endif /* CONFIG_CIFS_ACL */ rc = register_filesystem(&cifs_fs_type); if (rc) @@ -1536,10 +1546,8 @@ init_cifs(void) return 0; out_init_cifs_idmap: -#ifdef CONFIG_CIFS_ACL exit_cifs_idmap(); out_register_key_type: -#endif #ifdef CONFIG_CIFS_UPCALL exit_cifs_spnego(); out_destroy_dfs_cache: @@ -1571,9 +1579,7 @@ exit_cifs(void) unregister_filesystem(&cifs_fs_type); unregister_filesystem(&smb3_fs_type); cifs_dfs_release_automount_timer(); -#ifdef CONFIG_CIFS_ACL exit_cifs_idmap(); -#endif #ifdef CONFIG_CIFS_UPCALL exit_cifs_spnego(); #endif @@ -1607,5 +1613,6 @@ MODULE_SOFTDEP("pre: sha256"); MODULE_SOFTDEP("pre: sha512"); MODULE_SOFTDEP("pre: aead2"); MODULE_SOFTDEP("pre: ccm"); +MODULE_SOFTDEP("pre: gcm"); module_init(init_cifs) module_exit(exit_cifs) diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index aea005703785..4b21a90015a9 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -152,5 +152,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); extern const struct export_operations cifs_export_ops; #endif /* CONFIG_CIFS_NFSD_EXPORT */ -#define CIFS_VERSION "2.20" +#define CIFS_VERSION "2.21" #endif /* _CIFSFS_H */ diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 4777b3c4a92c..fe610e7e3670 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -550,6 +550,7 @@ struct smb_vol { bool override_gid:1; bool dynperm:1; bool noperm:1; + 
bool mode_ace:1; bool no_psx_acl:1; /* set if posix acl support should be disabled */ bool cifs_acl:1; bool backupuid_specified; /* mount option backupuid is specified */ @@ -600,6 +601,7 @@ struct smb_vol { __u64 snapshot_time; /* needed for timewarp tokens */ __u32 handle_timeout; /* persistent and durable handle timeout in ms */ unsigned int max_credits; /* smb3 max_credits 10 < credits < 60000 */ + __u16 compression; /* compression algorithm 0xFFFF default 0=disabled */ }; /** @@ -617,7 +619,8 @@ struct smb_vol { CIFS_MOUNT_FSCACHE | CIFS_MOUNT_MF_SYMLINKS | \ CIFS_MOUNT_MULTIUSER | CIFS_MOUNT_STRICT_IO | \ CIFS_MOUNT_CIFS_BACKUPUID | CIFS_MOUNT_CIFS_BACKUPGID | \ - CIFS_MOUNT_NO_DFS) + CIFS_MOUNT_UID_FROM_ACL | CIFS_MOUNT_NO_HANDLE_CACHE | \ + CIFS_MOUNT_NO_DFS | CIFS_MOUNT_MODE_FROM_SID) /** * Generic VFS superblock mount flags (s_flags) to consider when @@ -1870,7 +1873,6 @@ extern unsigned int cifs_min_small; /* min size of small buf pool */ extern unsigned int cifs_max_pending; /* MAX requests at once to server*/ extern bool disable_legacy_dialects; /* forbid vers=1.0 and vers=2.0 mounts */ -#ifdef CONFIG_CIFS_ACL GLOBAL_EXTERN struct rb_root uidtree; GLOBAL_EXTERN struct rb_root gidtree; GLOBAL_EXTERN spinlock_t siduidlock; @@ -1879,7 +1881,6 @@ GLOBAL_EXTERN struct rb_root siduidtree; GLOBAL_EXTERN struct rb_root sidgidtree; GLOBAL_EXTERN spinlock_t uidsidlock; GLOBAL_EXTERN spinlock_t gidsidlock; -#endif /* CONFIG_CIFS_ACL */ void cifs_oplock_break(struct work_struct *work); void cifs_queue_oplock_break(struct cifsFileInfo *cfile); diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 1fbd92843a73..e2f95965065d 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -3600,11 +3600,9 @@ static int cifs_copy_posix_acl(char *trgt, char *src, const int buflen, return size; } -static __u16 convert_ace_to_cifs_ace(struct cifs_posix_ace *cifs_ace, +static void convert_ace_to_cifs_ace(struct cifs_posix_ace *cifs_ace, const struct posix_acl_xattr_entry *local_ace) { - __u16 rc = 0; /* 0 = ACL converted ok */ - cifs_ace->cifs_e_perm = le16_to_cpu(local_ace->e_perm); cifs_ace->cifs_e_tag = le16_to_cpu(local_ace->e_tag); /* BB is there a better way to handle the large uid? */ @@ -3617,7 +3615,6 @@ static __u16 convert_ace_to_cifs_ace(struct cifs_posix_ace *cifs_ace, cifs_dbg(FYI, "perm %d tag %d id %d\n", ace->e_perm, ace->e_tag, ace->e_id); */ - return rc; } /* Convert ACL from local Linux POSIX xattr to CIFS POSIX ACL wire format */ @@ -3653,13 +3650,8 @@ static __u16 ACL_to_cifs_posix(char *parm_data, const char *pACL, cifs_dbg(FYI, "unknown ACL type %d\n", acl_type); return 0; } - for (i = 0; i < count; i++) { - rc = convert_ace_to_cifs_ace(&cifs_acl->ace_array[i], &ace[i]); - if (rc != 0) { - /* ACE not converted */ - break; - } - } + for (i = 0; i < count; i++) + convert_ace_to_cifs_ace(&cifs_acl->ace_array[i], &ace[i]); if (rc == 0) { rc = (__u16)(count * sizeof(struct cifs_posix_ace)); rc += sizeof(struct cifs_posix_acl); @@ -3920,7 +3912,6 @@ GetExtAttrOut: #endif /* CONFIG_POSIX */ -#ifdef CONFIG_CIFS_ACL /* * Initialize NT TRANSACT SMB into small smb request buffer. 
This assumes that * all NT TRANSACTS that we init here have total parm and data under about 400 @@ -4164,7 +4155,6 @@ setCifsAclRetry: return (rc); } -#endif /* CONFIG_CIFS_ACL */ /* Legacy Query Path Information call for lookup to old servers such as Win9x/WinME */ diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 714a359c7c8d..a4830ced0f98 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -96,7 +96,8 @@ enum { Opt_multiuser, Opt_sloppy, Opt_nosharesock, Opt_persistent, Opt_nopersistent, Opt_resilient, Opt_noresilient, - Opt_domainauto, Opt_rdma, + Opt_domainauto, Opt_rdma, Opt_modesid, + Opt_compress, /* Mount options which take numeric value */ Opt_backupuid, Opt_backupgid, Opt_uid, @@ -175,6 +176,7 @@ static const match_table_t cifs_mount_option_tokens = { { Opt_serverino, "serverino" }, { Opt_noserverino, "noserverino" }, { Opt_rwpidforward, "rwpidforward" }, + { Opt_modesid, "modefromsid" }, { Opt_cifsacl, "cifsacl" }, { Opt_nocifsacl, "nocifsacl" }, { Opt_acl, "acl" }, @@ -212,6 +214,7 @@ static const match_table_t cifs_mount_option_tokens = { { Opt_echo_interval, "echo_interval=%s" }, { Opt_max_credits, "max_credits=%s" }, { Opt_snapshot, "snapshot=%s" }, + { Opt_compress, "compress=%s" }, { Opt_blank_user, "user=" }, { Opt_blank_user, "username=" }, @@ -706,10 +709,10 @@ static bool server_unresponsive(struct TCP_Server_Info *server) { /* - * We need to wait 2 echo intervals to make sure we handle such + * We need to wait 3 echo intervals to make sure we handle such * situations right: * 1s client sends a normal SMB request - * 2s client gets a response + * 3s client gets a response * 30s echo workqueue job pops, and decides we got a response recently * and don't need to send another * ... @@ -718,9 +721,9 @@ server_unresponsive(struct TCP_Server_Info *server) */ if ((server->tcpStatus == CifsGood || server->tcpStatus == CifsNeedNegotiate) && - time_after(jiffies, server->lstrp + 2 * server->echo_interval)) { + time_after(jiffies, server->lstrp + 3 * server->echo_interval)) { cifs_dbg(VFS, "Server %s has not responded in %lu seconds. 
Reconnecting...\n", - server->hostname, (2 * server->echo_interval) / HZ); + server->hostname, (3 * server->echo_interval) / HZ); cifs_reconnect(server); wake_up(&server->response_q); return true; @@ -1223,11 +1226,11 @@ next_pdu: atomic_read(&midCount)); cifs_dump_mem("Received Data is: ", bufs[i], HEADER_SIZE(server)); + smb2_add_credits_from_hdr(bufs[i], server); #ifdef CONFIG_CIFS_DEBUG2 if (server->ops->dump_detail) server->ops->dump_detail(bufs[i], server); - smb2_add_credits_from_hdr(bufs[i], server); cifs_dump_mids(server); #endif /* CIFS_DEBUG2 */ } @@ -1830,6 +1833,9 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, case Opt_rwpidforward: vol->rwpidforward = 1; break; + case Opt_modesid: + vol->mode_ace = 1; + break; case Opt_cifsacl: vol->cifs_acl = 1; break; @@ -1911,6 +1917,11 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, case Opt_rdma: vol->rdma = true; break; + case Opt_compress: + vol->compression = UNKNOWN_TYPE; + cifs_dbg(VFS, + "SMB3 compression support is experimental\n"); + break; /* Numeric Values */ case Opt_backupuid: @@ -2544,8 +2555,15 @@ static int match_server(struct TCP_Server_Info *server, struct smb_vol *vol) if (vol->nosharesock) return 0; - /* BB update this for smb3any and default case */ - if ((server->vals != vol->vals) || (server->ops != vol->ops)) + /* If multidialect negotiation see if existing sessions match one */ + if (strcmp(vol->vals->version_string, SMB3ANY_VERSION_STRING) == 0) { + if (server->vals->protocol_id < SMB30_PROT_ID) + return 0; + } else if (strcmp(vol->vals->version_string, + SMBDEFAULT_VERSION_STRING) == 0) { + if (server->vals->protocol_id < SMB21_PROT_ID) + return 0; + } else if ((server->vals != vol->vals) || (server->ops != vol->ops)) return 0; if (!net_eq(cifs_net_ns(server), current->nsproxy->net_ns)) @@ -2680,6 +2698,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info) tcp_ses->sequence_number = 0; tcp_ses->reconnect_instance = 1; tcp_ses->lstrp = jiffies; + tcp_ses->compress_algorithm = cpu_to_le16(volume_info->compression); spin_lock_init(&tcp_ses->req_lock); INIT_LIST_HEAD(&tcp_ses->tcp_ses_list); INIT_LIST_HEAD(&tcp_ses->smb_ses_list); @@ -3460,12 +3479,16 @@ compare_mount_options(struct super_block *sb, struct cifs_mnt_data *mnt_data) { struct cifs_sb_info *old = CIFS_SB(sb); struct cifs_sb_info *new = mnt_data->cifs_sb; + unsigned int oldflags = old->mnt_cifs_flags & CIFS_MOUNT_MASK; + unsigned int newflags = new->mnt_cifs_flags & CIFS_MOUNT_MASK; if ((sb->s_flags & CIFS_MS_MASK) != (mnt_data->flags & CIFS_MS_MASK)) return 0; - if ((old->mnt_cifs_flags & CIFS_MOUNT_MASK) != - (new->mnt_cifs_flags & CIFS_MOUNT_MASK)) + if (old->mnt_cifs_serverino_autodisabled) + newflags &= ~CIFS_MOUNT_SERVER_INUM; + + if (oldflags != newflags) return 0; /* @@ -3965,6 +3988,8 @@ int cifs_setup_cifs_sb(struct smb_vol *pvolume_info, cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NOPOSIXBRL; if (pvolume_info->rwpidforward) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_RWPIDFORWARD; + if (pvolume_info->mode_ace) + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_MODE_FROM_SID; if (pvolume_info->cifs_acl) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_CIFS_ACL; if (pvolume_info->backupuid_specified) { @@ -4459,11 +4484,13 @@ cifs_are_all_path_components_accessible(struct TCP_Server_Info *server, unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, - char *full_path) + char *full_path, + int added_treename) { int rc; char *s; char sep, tmp; + int skip = added_treename ? 
1 : 0; sep = CIFS_DIR_SEP(cifs_sb); s = full_path; @@ -4478,7 +4505,14 @@ cifs_are_all_path_components_accessible(struct TCP_Server_Info *server, /* next separator */ while (*s && *s != sep) s++; - + /* + * if the treename is added, we then have to skip the first + * part within the separators + */ + if (skip) { + skip = 0; + continue; + } /* * temporarily null-terminate the path at the end of * the current component @@ -4526,8 +4560,7 @@ static int is_path_remote(struct cifs_sb_info *cifs_sb, struct smb_vol *vol, if (rc != -EREMOTE) { rc = cifs_are_all_path_components_accessible(server, xid, tcon, - cifs_sb, - full_path); + cifs_sb, full_path, tcon->Flags & SMB_SHARE_IS_IN_DFS); if (rc != 0) { cifs_dbg(VFS, "cannot query dirs between root and final path, " "enabling CIFS_MOUNT_USE_PREFIX_PATH\n"); diff --git a/fs/cifs/dfs_cache.c b/fs/cifs/dfs_cache.c index e3e1c13df439..1692c0c6c23a 100644 --- a/fs/cifs/dfs_cache.c +++ b/fs/cifs/dfs_cache.c @@ -492,7 +492,7 @@ static struct dfs_cache_entry *__find_cache_entry(unsigned int hash, #ifdef CONFIG_CIFS_DEBUG2 char *name = get_tgt_name(ce); - if (unlikely(IS_ERR(name))) { + if (IS_ERR(name)) { rcu_read_unlock(); return ERR_CAST(name); } diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index d7cc62252634..56ca4b8ccaba 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -892,7 +892,6 @@ cifs_get_inode_info(struct inode **inode, const char *full_path, cifs_dbg(FYI, "cifs_sfu_type failed: %d\n", tmprc); } -#ifdef CONFIG_CIFS_ACL /* fill in 0777 bits from ACL */ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) { rc = cifs_acl_to_fattr(cifs_sb, &fattr, *inode, full_path, fid); @@ -902,7 +901,6 @@ cifs_get_inode_info(struct inode **inode, const char *full_path, goto cgii_exit; } } -#endif /* CONFIG_CIFS_ACL */ /* fill in remaining high mode bits e.g. 
SUID, VTX */ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) @@ -2408,6 +2406,8 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) struct inode *inode = d_inode(direntry); struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct cifsInodeInfo *cifsInode = CIFS_I(inode); + struct cifsFileInfo *wfile; + struct cifs_tcon *tcon; char *full_path = NULL; int rc = -EACCES; __u32 dosattr = 0; @@ -2415,7 +2415,7 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) xid = get_xid(); - cifs_dbg(FYI, "setattr on file %pd attrs->iavalid 0x%x\n", + cifs_dbg(FYI, "setattr on file %pd attrs->ia_valid 0x%x\n", direntry, attrs->ia_valid); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) @@ -2454,6 +2454,20 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) mapping_set_error(inode->i_mapping, rc); rc = 0; + if (attrs->ia_valid & ATTR_MTIME) { + rc = cifs_get_writable_file(cifsInode, false, &wfile); + if (!rc) { + tcon = tlink_tcon(wfile->tlink); + rc = tcon->ses->server->ops->flush(xid, tcon, &wfile->fid); + cifsFileInfo_put(wfile); + if (rc) + return rc; + } else if (rc != -EBADF) + return rc; + else + rc = 0; + } + if (attrs->ia_valid & ATTR_SIZE) { rc = cifs_set_file_size(inode, attrs, xid, full_path); if (rc != 0) @@ -2466,7 +2480,6 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) if (attrs->ia_valid & ATTR_GID) gid = attrs->ia_gid; -#ifdef CONFIG_CIFS_ACL if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) { if (uid_valid(uid) || gid_valid(gid)) { rc = id_mode_to_cifs_acl(inode, full_path, NO_CHANGE_64, @@ -2478,7 +2491,6 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) } } } else -#endif /* CONFIG_CIFS_ACL */ if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID)) attrs->ia_valid &= ~(ATTR_UID | ATTR_GID); @@ -2489,7 +2501,6 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) if (attrs->ia_valid & ATTR_MODE) { mode = attrs->ia_mode; rc = 0; -#ifdef CONFIG_CIFS_ACL if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) { rc = id_mode_to_cifs_acl(inode, full_path, mode, INVALID_UID, INVALID_GID); @@ -2499,7 +2510,6 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) goto cifs_setattr_exit; } } else -#endif /* CONFIG_CIFS_ACL */ if (((mode & S_IWUGO) == 0) && (cifsInode->cifsAttrs & ATTR_READONLY) == 0) { diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index b1a696a73f7c..f383877a6511 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -539,6 +539,7 @@ cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb) tcon = cifs_sb_master_tcon(cifs_sb); cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_SERVER_INUM; + cifs_sb->mnt_cifs_serverino_autodisabled = true; cifs_dbg(VFS, "Autodisabling the use of server inode numbers on %s.\n", tcon ? 
tcon->treeName : "new server"); cifs_dbg(VFS, "The server doesn't seem to support them properly or the files might be on different servers (DFS).\n"); diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index 9e430ae9314f..b7421a096319 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c @@ -1223,16 +1223,15 @@ struct smb_version_operations smb1_operations = { .query_all_EAs = CIFSSMBQAllEAs, .set_EA = CIFSSMBSetEA, #endif /* CIFS_XATTR */ -#ifdef CONFIG_CIFS_ACL .get_acl = get_cifs_acl, .get_acl_by_fid = get_cifs_acl_by_fid, .set_acl = set_cifs_acl, -#endif /* CIFS_ACL */ .make_node = cifs_make_node, }; struct smb_version_values smb1_values = { .version_string = SMB1_VERSION_STRING, + .protocol_id = SMB10_PROT_ID, .large_lock_type = LOCKING_ANDX_LARGE_FILES, .exclusive_lock_type = 0, .shared_lock_type = LOCKING_ANDX_SHARED_LOCK, diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c index 54bffb2a1786..e6a1fc72018f 100644 --- a/fs/cifs/smb2file.c +++ b/fs/cifs/smb2file.c @@ -88,14 +88,20 @@ smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms, } if (buf) { - /* open response does not have IndexNumber field - get it */ - rc = SMB2_get_srv_num(xid, oparms->tcon, fid->persistent_fid, + /* if open response does not have IndexNumber field - get it */ + if (smb2_data->IndexNumber == 0) { + rc = SMB2_get_srv_num(xid, oparms->tcon, + fid->persistent_fid, fid->volatile_fid, &smb2_data->IndexNumber); - if (rc) { - /* let get_inode_info disable server inode numbers */ - smb2_data->IndexNumber = 0; - rc = 0; + if (rc) { + /* + * let get_inode_info disable server inode + * numbers + */ + smb2_data->IndexNumber = 0; + rc = 0; + } } move_smb2_info_to_cifs(buf, smb2_data); } diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c index 278405d26c47..d8d9cdfa30b6 100644 --- a/fs/cifs/smb2inode.c +++ b/fs/cifs/smb2inode.c @@ -120,6 +120,8 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, SMB2_O_INFO_FILE, 0, sizeof(struct smb2_file_all_info) + PATH_MAX * 2, 0, NULL); + if (rc) + goto finished; smb2_set_next_command(tcon, &rqst[num_rqst]); smb2_set_related(&rqst[num_rqst++]); trace_smb3_query_info_compound_enter(xid, ses->Suid, tcon->tid, @@ -147,6 +149,8 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, COMPOUND_FID, current->tgid, FILE_DISPOSITION_INFORMATION, SMB2_O_INFO_FILE, 0, data, size); + if (rc) + goto finished; smb2_set_next_command(tcon, &rqst[num_rqst]); smb2_set_related(&rqst[num_rqst++]); trace_smb3_rmdir_enter(xid, ses->Suid, tcon->tid, full_path); @@ -163,6 +167,8 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, COMPOUND_FID, current->tgid, FILE_END_OF_FILE_INFORMATION, SMB2_O_INFO_FILE, 0, data, size); + if (rc) + goto finished; smb2_set_next_command(tcon, &rqst[num_rqst]); smb2_set_related(&rqst[num_rqst++]); trace_smb3_set_eof_enter(xid, ses->Suid, tcon->tid, full_path); @@ -180,6 +186,8 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, COMPOUND_FID, current->tgid, FILE_BASIC_INFORMATION, SMB2_O_INFO_FILE, 0, data, size); + if (rc) + goto finished; smb2_set_next_command(tcon, &rqst[num_rqst]); smb2_set_related(&rqst[num_rqst++]); trace_smb3_set_info_compound_enter(xid, ses->Suid, tcon->tid, @@ -206,6 +214,8 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, COMPOUND_FID, current->tgid, FILE_RENAME_INFORMATION, SMB2_O_INFO_FILE, 0, data, size); + if (rc) + goto finished; smb2_set_next_command(tcon, &rqst[num_rqst]); smb2_set_related(&rqst[num_rqst++]); trace_smb3_rename_enter(xid, 
ses->Suid, tcon->tid, full_path); @@ -231,6 +241,8 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, COMPOUND_FID, current->tgid, FILE_LINK_INFORMATION, SMB2_O_INFO_FILE, 0, data, size); + if (rc) + goto finished; smb2_set_next_command(tcon, &rqst[num_rqst]); smb2_set_related(&rqst[num_rqst++]); trace_smb3_hardlink_enter(xid, ses->Suid, tcon->tid, full_path); diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 9fd56b0acd7e..a5bc1b671c12 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -694,8 +694,51 @@ int open_shroot(unsigned int xid, struct cifs_tcon *tcon, struct cifs_fid *pfid) smb2_set_related(&rqst[1]); + /* + * We do not hold the lock for the open because in case + * SMB2_open needs to reconnect, it will end up calling + * cifs_mark_open_files_invalid() which takes the lock again + * thus causing a deadlock + */ + + mutex_unlock(&tcon->crfid.fid_mutex); rc = compound_send_recv(xid, ses, flags, 2, rqst, resp_buftype, rsp_iov); + mutex_lock(&tcon->crfid.fid_mutex); + + /* + * Now we need to check again as the cached root might have + * been successfully re-opened from a concurrent process + */ + + if (tcon->crfid.is_valid) { + /* work was already done */ + + /* stash fids for close() later */ + struct cifs_fid fid = { + .persistent_fid = pfid->persistent_fid, + .volatile_fid = pfid->volatile_fid, + }; + + /* + * caller expects this func to set pfid to a valid + * cached root, so we copy the existing one and get a + * reference. + */ + memcpy(pfid, tcon->crfid.fid, sizeof(*pfid)); + kref_get(&tcon->crfid.refcount); + + mutex_unlock(&tcon->crfid.fid_mutex); + + if (rc == 0) { + /* close extra handle outside of crit sec */ + SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid); + } + goto oshr_free; + } + + /* Cached root is still invalid, continue normaly */ + if (rc) goto oshr_exit; @@ -711,11 +754,12 @@ int open_shroot(unsigned int xid, struct cifs_tcon *tcon, struct cifs_fid *pfid) tcon->crfid.is_valid = true; kref_init(&tcon->crfid.refcount); + /* BB TBD check to see if oplock level check can be removed below */ if (o_rsp->OplockLevel == SMB2_OPLOCK_LEVEL_LEASE) { kref_get(&tcon->crfid.refcount); - oplock = smb2_parse_lease_state(server, o_rsp, - &oparms.fid->epoch, - oparms.fid->lease_key); + smb2_parse_contexts(server, o_rsp, + &oparms.fid->epoch, + oparms.fid->lease_key, &oplock, NULL); } else goto oshr_exit; @@ -729,8 +773,9 @@ int open_shroot(unsigned int xid, struct cifs_tcon *tcon, struct cifs_fid *pfid) (char *)&tcon->crfid.file_all_info)) tcon->crfid.file_all_info_is_valid = 1; - oshr_exit: +oshr_exit: mutex_unlock(&tcon->crfid.fid_mutex); +oshr_free: SMB2_open_free(&rqst[0]); SMB2_query_info_free(&rqst[1]); free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base); @@ -2027,6 +2072,10 @@ smb2_set_related(struct smb_rqst *rqst) struct smb2_sync_hdr *shdr; shdr = (struct smb2_sync_hdr *)(rqst->rq_iov[0].iov_base); + if (shdr == NULL) { + cifs_dbg(FYI, "shdr NULL in smb2_set_related\n"); + return; + } shdr->Flags |= SMB2_FLAGS_RELATED_OPERATIONS; } @@ -2041,6 +2090,12 @@ smb2_set_next_command(struct cifs_tcon *tcon, struct smb_rqst *rqst) unsigned long len = smb_rqst_len(server, rqst); int i, num_padding; + shdr = (struct smb2_sync_hdr *)(rqst->rq_iov[0].iov_base); + if (shdr == NULL) { + cifs_dbg(FYI, "shdr NULL in smb2_set_next_command\n"); + return; + } + /* SMB headers in a compound are 8 byte aligned. 
*/ /* No padding needed */ @@ -2080,7 +2135,6 @@ smb2_set_next_command(struct cifs_tcon *tcon, struct smb_rqst *rqst) } finished: - shdr = (struct smb2_sync_hdr *)(rqst->rq_iov[0].iov_base); shdr->NextCommand = cpu_to_le32(len); } @@ -2374,6 +2428,34 @@ smb2_get_dfs_refer(const unsigned int xid, struct cifs_ses *ses, } static int +parse_reparse_posix(struct reparse_posix_data *symlink_buf, + u32 plen, char **target_path, + struct cifs_sb_info *cifs_sb) +{ + unsigned int len; + + /* See MS-FSCC 2.1.2.6 for the 'NFS' style reparse tags */ + len = le16_to_cpu(symlink_buf->ReparseDataLength); + + if (le64_to_cpu(symlink_buf->InodeType) != NFS_SPECFILE_LNK) { + cifs_dbg(VFS, "%lld not a supported symlink type\n", + le64_to_cpu(symlink_buf->InodeType)); + return -EOPNOTSUPP; + } + + *target_path = cifs_strndup_from_utf16( + symlink_buf->PathBuffer, + len, true, cifs_sb->local_nls); + if (!(*target_path)) + return -ENOMEM; + + convert_delimiter(*target_path, '/'); + cifs_dbg(FYI, "%s: target path: %s\n", __func__, *target_path); + + return 0; +} + +static int parse_reparse_symlink(struct reparse_symlink_data_buffer *symlink_buf, u32 plen, char **target_path, struct cifs_sb_info *cifs_sb) @@ -2381,11 +2463,7 @@ parse_reparse_symlink(struct reparse_symlink_data_buffer *symlink_buf, unsigned int sub_len; unsigned int sub_offset; - /* We only handle Symbolic Link : MS-FSCC 2.1.2.4 */ - if (le32_to_cpu(symlink_buf->ReparseTag) != IO_REPARSE_TAG_SYMLINK) { - cifs_dbg(VFS, "srv returned invalid symlink buffer\n"); - return -EIO; - } + /* We handle Symbolic Link reparse tag here. See: MS-FSCC 2.1.2.4 */ sub_offset = le16_to_cpu(symlink_buf->SubstituteNameOffset); sub_len = le16_to_cpu(symlink_buf->SubstituteNameLength); @@ -2407,6 +2485,41 @@ parse_reparse_symlink(struct reparse_symlink_data_buffer *symlink_buf, return 0; } +static int +parse_reparse_point(struct reparse_data_buffer *buf, + u32 plen, char **target_path, + struct cifs_sb_info *cifs_sb) +{ + if (plen < sizeof(struct reparse_data_buffer)) { + cifs_dbg(VFS, "reparse buffer is too small. Must be " + "at least 8 bytes but was %d\n", plen); + return -EIO; + } + + if (plen < le16_to_cpu(buf->ReparseDataLength) + + sizeof(struct reparse_data_buffer)) { + cifs_dbg(VFS, "srv returned invalid reparse buf " + "length: %d\n", plen); + return -EIO; + } + + /* See MS-FSCC 2.1.2 */ + switch (le32_to_cpu(buf->ReparseTag)) { + case IO_REPARSE_TAG_NFS: + return parse_reparse_posix( + (struct reparse_posix_data *)buf, + plen, target_path, cifs_sb); + case IO_REPARSE_TAG_SYMLINK: + return parse_reparse_symlink( + (struct reparse_symlink_data_buffer *)buf, + plen, target_path, cifs_sb); + default: + cifs_dbg(VFS, "srv returned unknown symlink buffer " + "tag:0x%08x\n", le32_to_cpu(buf->ReparseTag)); + return -EOPNOTSUPP; + } +} + #define SMB2_SYMLINK_STRUCT_SIZE \ (sizeof(struct smb2_err_rsp) - 1 + sizeof(struct smb2_symlink_err_rsp)) @@ -2533,23 +2646,8 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, goto querty_exit; } - if (plen < 8) { - cifs_dbg(VFS, "reparse buffer is too small. 
Must be " - "at least 8 bytes but was %d\n", plen); - rc = -EIO; - goto querty_exit; - } - - if (plen < le16_to_cpu(reparse_buf->ReparseDataLength) + 8) { - cifs_dbg(VFS, "srv returned invalid reparse buf " - "length: %d\n", plen); - rc = -EIO; - goto querty_exit; - } - - rc = parse_reparse_symlink( - (struct reparse_symlink_data_buffer *)reparse_buf, - plen, target_path, cifs_sb); + rc = parse_reparse_point(reparse_buf, plen, target_path, + cifs_sb); goto querty_exit; } @@ -2561,26 +2659,32 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, err_buf = err_iov.iov_base; if (le32_to_cpu(err_buf->ByteCount) < sizeof(struct smb2_symlink_err_rsp) || err_iov.iov_len < SMB2_SYMLINK_STRUCT_SIZE) { - rc = -ENOENT; + rc = -EINVAL; + goto querty_exit; + } + + symlink = (struct smb2_symlink_err_rsp *)err_buf->ErrorData; + if (le32_to_cpu(symlink->SymLinkErrorTag) != SYMLINK_ERROR_TAG || + le32_to_cpu(symlink->ReparseTag) != IO_REPARSE_TAG_SYMLINK) { + rc = -EINVAL; goto querty_exit; } /* open must fail on symlink - reset rc */ rc = 0; - symlink = (struct smb2_symlink_err_rsp *)err_buf->ErrorData; sub_len = le16_to_cpu(symlink->SubstituteNameLength); sub_offset = le16_to_cpu(symlink->SubstituteNameOffset); print_len = le16_to_cpu(symlink->PrintNameLength); print_offset = le16_to_cpu(symlink->PrintNameOffset); if (err_iov.iov_len < SMB2_SYMLINK_STRUCT_SIZE + sub_offset + sub_len) { - rc = -ENOENT; + rc = -EINVAL; goto querty_exit; } if (err_iov.iov_len < SMB2_SYMLINK_STRUCT_SIZE + print_offset + print_len) { - rc = -ENOENT; + rc = -EINVAL; goto querty_exit; } @@ -2606,7 +2710,6 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, return rc; } -#ifdef CONFIG_CIFS_ACL static struct cifs_ntsd * get_smb2_acl_by_fid(struct cifs_sb_info *cifs_sb, const struct cifs_fid *cifsfid, u32 *pacllen) @@ -2691,7 +2794,6 @@ get_smb2_acl_by_path(struct cifs_sb_info *cifs_sb, return pntsd; } -#ifdef CONFIG_CIFS_ACL static int set_smb2_acl(struct cifs_ntsd *pnntsd, __u32 acllen, struct inode *inode, const char *path, int aclflag) @@ -2749,7 +2851,6 @@ set_smb2_acl(struct cifs_ntsd *pnntsd, __u32 acllen, free_xid(xid); return rc; } -#endif /* CIFS_ACL */ /* Retrieve an ACL from the server */ static struct cifs_ntsd * @@ -2769,7 +2870,6 @@ get_smb2_acl(struct cifs_sb_info *cifs_sb, cifsFileInfo_put(open_file); return pntsd; } -#endif static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon, loff_t offset, loff_t len, bool keep_size) @@ -3367,7 +3467,7 @@ smb2_dir_needs_close(struct cifsFileInfo *cfile) static void fill_transform_hdr(struct smb2_transform_hdr *tr_hdr, unsigned int orig_len, - struct smb_rqst *old_rq) + struct smb_rqst *old_rq, __le16 cipher_type) { struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)old_rq->rq_iov[0].iov_base; @@ -3376,7 +3476,10 @@ fill_transform_hdr(struct smb2_transform_hdr *tr_hdr, unsigned int orig_len, tr_hdr->ProtocolId = SMB2_TRANSFORM_PROTO_NUM; tr_hdr->OriginalMessageSize = cpu_to_le32(orig_len); tr_hdr->Flags = cpu_to_le16(0x01); - get_random_bytes(&tr_hdr->Nonce, SMB3_AES128CMM_NONCE); + if (cipher_type == SMB2_ENCRYPTION_AES128_GCM) + get_random_bytes(&tr_hdr->Nonce, SMB3_AES128GCM_NONCE); + else + get_random_bytes(&tr_hdr->Nonce, SMB3_AES128CCM_NONCE); memcpy(&tr_hdr->SessionId, &shdr->SessionId, 8); } @@ -3534,8 +3637,13 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst, rc = -ENOMEM; goto free_sg; } - iv[0] = 3; - memcpy(iv + 1, (char *)tr_hdr->Nonce, SMB3_AES128CMM_NONCE); + + if (server->cipher_type == 
SMB2_ENCRYPTION_AES128_GCM) + memcpy(iv, (char *)tr_hdr->Nonce, SMB3_AES128GCM_NONCE); + else { + iv[0] = 3; + memcpy(iv + 1, (char *)tr_hdr->Nonce, SMB3_AES128CCM_NONCE); + } aead_request_set_crypt(req, sg, sg, crypt_len, iv); aead_request_set_ad(req, assoc_data_len); @@ -3635,7 +3743,7 @@ smb3_init_transform_rq(struct TCP_Server_Info *server, int num_rqst, } /* fill the 1st iov with a transform header */ - fill_transform_hdr(tr_hdr, orig_len, old_rq); + fill_transform_hdr(tr_hdr, orig_len, old_rq, server->cipher_type); rc = crypt_message(server, num_rqst, new_rq, 1); cifs_dbg(FYI, "Encrypt message returned %d\n", rc); @@ -4284,11 +4392,9 @@ struct smb_version_operations smb20_operations = { .query_all_EAs = smb2_query_eas, .set_EA = smb2_set_ea, #endif /* CIFS_XATTR */ -#ifdef CONFIG_CIFS_ACL .get_acl = get_smb2_acl, .get_acl_by_fid = get_smb2_acl_by_fid, .set_acl = set_smb2_acl, -#endif /* CIFS_ACL */ .next_header = smb2_next_header, .ioctl_query_info = smb2_ioctl_query_info, .make_node = smb2_make_node, @@ -4385,11 +4491,9 @@ struct smb_version_operations smb21_operations = { .query_all_EAs = smb2_query_eas, .set_EA = smb2_set_ea, #endif /* CIFS_XATTR */ -#ifdef CONFIG_CIFS_ACL .get_acl = get_smb2_acl, .get_acl_by_fid = get_smb2_acl_by_fid, .set_acl = set_smb2_acl, -#endif /* CIFS_ACL */ .next_header = smb2_next_header, .ioctl_query_info = smb2_ioctl_query_info, .make_node = smb2_make_node, @@ -4495,11 +4599,9 @@ struct smb_version_operations smb30_operations = { .query_all_EAs = smb2_query_eas, .set_EA = smb2_set_ea, #endif /* CIFS_XATTR */ -#ifdef CONFIG_CIFS_ACL .get_acl = get_smb2_acl, .get_acl_by_fid = get_smb2_acl_by_fid, .set_acl = set_smb2_acl, -#endif /* CIFS_ACL */ .next_header = smb2_next_header, .ioctl_query_info = smb2_ioctl_query_info, .make_node = smb2_make_node, @@ -4606,11 +4708,9 @@ struct smb_version_operations smb311_operations = { .query_all_EAs = smb2_query_eas, .set_EA = smb2_set_ea, #endif /* CIFS_XATTR */ -#ifdef CONFIG_CIFS_ACL .get_acl = get_smb2_acl, .get_acl_by_fid = get_smb2_acl_by_fid, .set_acl = set_smb2_acl, -#endif /* CIFS_ACL */ .next_header = smb2_next_header, .ioctl_query_info = smb2_ioctl_query_info, .make_node = smb2_make_node, diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 75311a8a68bf..c8cd7b6cdda2 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -489,10 +489,25 @@ static void build_encrypt_ctxt(struct smb2_encryption_neg_context *pneg_ctxt) { pneg_ctxt->ContextType = SMB2_ENCRYPTION_CAPABILITIES; - pneg_ctxt->DataLength = cpu_to_le16(4); /* Cipher Count + le16 cipher */ - pneg_ctxt->CipherCount = cpu_to_le16(1); -/* pneg_ctxt->Ciphers[0] = SMB2_ENCRYPTION_AES128_GCM;*/ /* not supported yet */ - pneg_ctxt->Ciphers[0] = SMB2_ENCRYPTION_AES128_CCM; + pneg_ctxt->DataLength = cpu_to_le16(6); /* Cipher Count + two ciphers */ + pneg_ctxt->CipherCount = cpu_to_le16(2); + pneg_ctxt->Ciphers[0] = SMB2_ENCRYPTION_AES128_GCM; + pneg_ctxt->Ciphers[1] = SMB2_ENCRYPTION_AES128_CCM; +} + +static unsigned int +build_netname_ctxt(struct smb2_netname_neg_context *pneg_ctxt, char *hostname) +{ + struct nls_table *cp = load_nls_default(); + + pneg_ctxt->ContextType = SMB2_NETNAME_NEGOTIATE_CONTEXT_ID; + + /* copy up to max of first 100 bytes of server name to NetName field */ + pneg_ctxt->DataLength = cpu_to_le16(2 + + (2 * cifs_strtoUTF16(pneg_ctxt->NetName, hostname, 100, cp))); + /* context size is DataLength + minimal smb2_neg_context */ + return DIV_ROUND_UP(le16_to_cpu(pneg_ctxt->DataLength) + + sizeof(struct smb2_neg_context), 8) * 8; } 
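The new build_netname_ctxt() above returns the on-the-wire size of the netname negotiate context: a fixed 8-byte header (ContextType, DataLength, Reserved) plus a DataLength of 2 + 2 bytes per UTF-16 character of the host name (capped at 100 characters), rounded up to the 8-byte boundary SMB 3.1.1 requires between contexts. A stand-alone sketch of the same arithmetic; it approximates cifs_strtoUTF16() with strlen(), which only holds for ASCII host names:

/* gcc netname.c && ./netname
 * Mirrors the size math in build_netname_ctxt(); NEG_CTXT_HDR is the
 * 8-byte smb2_neg_context header (ContextType + DataLength + Reserved). */
#include <stdio.h>
#include <string.h>

#define NEG_CTXT_HDR 8
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

static unsigned int netname_ctxt_len(const char *hostname)
{
	size_t chars = strlen(hostname);   /* ASCII stand-in for UTF-16 count */
	unsigned int datalen;

	if (chars > 100)                   /* kernel copies at most 100 chars */
		chars = 100;
	datalen = 2 + 2 * (unsigned int)chars;          /* UTF-16LE payload */
	return DIV_ROUND_UP(datalen + NEG_CTXT_HDR, 8) * 8; /* 8-byte align */
}

int main(void)
{
	/* 22 chars -> DataLength 46, +8 header = 54, aligned up to 56 */
	printf("%u\n", netname_ctxt_len("fileserver.example.com"));
	return 0;
}

The alignment matters because the peer walks negotiate contexts sequentially, stepping by each context's aligned length; an unpadded context would misplace every context that follows it.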
static void @@ -521,7 +536,7 @@ build_posix_ctxt(struct smb2_posix_neg_context *pneg_ctxt) static void assemble_neg_contexts(struct smb2_negotiate_req *req, - unsigned int *total_len) + struct TCP_Server_Info *server, unsigned int *total_len) { char *pneg_ctxt = (char *)req; unsigned int ctxt_len; @@ -551,17 +566,25 @@ assemble_neg_contexts(struct smb2_negotiate_req *req, *total_len += ctxt_len; pneg_ctxt += ctxt_len; - build_compression_ctxt((struct smb2_compression_capabilities_context *) + if (server->compress_algorithm) { + build_compression_ctxt((struct smb2_compression_capabilities_context *) pneg_ctxt); - ctxt_len = DIV_ROUND_UP( - sizeof(struct smb2_compression_capabilities_context), 8) * 8; + ctxt_len = DIV_ROUND_UP( + sizeof(struct smb2_compression_capabilities_context), + 8) * 8; + *total_len += ctxt_len; + pneg_ctxt += ctxt_len; + req->NegotiateContextCount = cpu_to_le16(5); + } else + req->NegotiateContextCount = cpu_to_le16(4); + + ctxt_len = build_netname_ctxt((struct smb2_netname_neg_context *)pneg_ctxt, + server->hostname); *total_len += ctxt_len; pneg_ctxt += ctxt_len; build_posix_ctxt((struct smb2_posix_neg_context *)pneg_ctxt); *total_len += sizeof(struct smb2_posix_neg_context); - - req->NegotiateContextCount = cpu_to_le16(4); } static void decode_preauth_context(struct smb2_preauth_neg_context *ctxt) @@ -829,7 +852,7 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) if ((ses->server->vals->protocol_id == SMB311_PROT_ID) || (strcmp(ses->server->vals->version_string, SMBDEFAULT_VERSION_STRING) == 0)) - assemble_neg_contexts(req, &total_len); + assemble_neg_contexts(req, server, &total_len); } iov[0].iov_base = (char *)req; iov[0].iov_len = total_len; @@ -1850,10 +1873,21 @@ create_reconnect_durable_buf(struct cifs_fid *fid) return buf; } -__u8 -smb2_parse_lease_state(struct TCP_Server_Info *server, +static void +parse_query_id_ctxt(struct create_context *cc, struct smb2_file_all_info *buf) +{ + struct create_on_disk_id *pdisk_id = (struct create_on_disk_id *)cc; + + cifs_dbg(FYI, "parse query id context 0x%llx 0x%llx\n", + pdisk_id->DiskFileId, pdisk_id->VolumeId); + buf->IndexNumber = pdisk_id->DiskFileId; +} + +void +smb2_parse_contexts(struct TCP_Server_Info *server, struct smb2_create_rsp *rsp, - unsigned int *epoch, char *lease_key) + unsigned int *epoch, char *lease_key, __u8 *oplock, + struct smb2_file_all_info *buf) { char *data_offset; struct create_context *cc; @@ -1861,15 +1895,24 @@ smb2_parse_lease_state(struct TCP_Server_Info *server, unsigned int remaining; char *name; + *oplock = 0; data_offset = (char *)rsp + le32_to_cpu(rsp->CreateContextsOffset); remaining = le32_to_cpu(rsp->CreateContextsLength); cc = (struct create_context *)data_offset; + + /* Initialize inode number to 0 in case no valid data in qfid context */ + if (buf) + buf->IndexNumber = 0; + while (remaining >= sizeof(struct create_context)) { name = le16_to_cpu(cc->NameOffset) + (char *)cc; if (le16_to_cpu(cc->NameLength) == 4 && - strncmp(name, "RqLs", 4) == 0) - return server->ops->parse_lease_buf(cc, epoch, - lease_key); + strncmp(name, SMB2_CREATE_REQUEST_LEASE, 4) == 0) + *oplock = server->ops->parse_lease_buf(cc, epoch, + lease_key); + else if (buf && (le16_to_cpu(cc->NameLength) == 4) && + strncmp(name, SMB2_CREATE_QUERY_ON_DISK_ID, 4) == 0) + parse_query_id_ctxt(cc, buf); next = le32_to_cpu(cc->Next); if (!next) @@ -1878,7 +1921,10 @@ smb2_parse_lease_state(struct TCP_Server_Info *server, cc = (struct create_context *)((char *)cc + next); } - return 0; + if 
(rsp->OplockLevel != SMB2_OPLOCK_LEVEL_LEASE) + *oplock = rsp->OplockLevel; + + return; } static int @@ -2095,6 +2141,48 @@ add_twarp_context(struct kvec *iov, unsigned int *num_iovec, __u64 timewarp) return 0; } +static struct crt_query_id_ctxt * +create_query_id_buf(void) +{ + struct crt_query_id_ctxt *buf; + + buf = kzalloc(sizeof(struct crt_query_id_ctxt), GFP_KERNEL); + if (!buf) + return NULL; + + buf->ccontext.DataOffset = cpu_to_le16(0); + buf->ccontext.DataLength = cpu_to_le32(0); + buf->ccontext.NameOffset = cpu_to_le16(offsetof + (struct crt_query_id_ctxt, Name)); + buf->ccontext.NameLength = cpu_to_le16(4); + /* SMB2_CREATE_QUERY_ON_DISK_ID is "QFid" */ + buf->Name[0] = 'Q'; + buf->Name[1] = 'F'; + buf->Name[2] = 'i'; + buf->Name[3] = 'd'; + return buf; +} + +/* See MS-SMB2 2.2.13.2.9 */ +static int +add_query_id_context(struct kvec *iov, unsigned int *num_iovec) +{ + struct smb2_create_req *req = iov[0].iov_base; + unsigned int num = *num_iovec; + + iov[num].iov_base = create_query_id_buf(); + if (iov[num].iov_base == NULL) + return -ENOMEM; + iov[num].iov_len = sizeof(struct crt_query_id_ctxt); + if (!req->CreateContextsOffset) + req->CreateContextsOffset = cpu_to_le32( + sizeof(struct smb2_create_req) + + iov[num - 1].iov_len); + le32_add_cpu(&req->CreateContextsLength, sizeof(struct crt_query_id_ctxt)); + *num_iovec = num + 1; + return 0; +} + static int alloc_path_with_tree_prefix(__le16 **out_path, int *out_size, int *out_len, const char *treename, const __le16 *path) @@ -2423,6 +2511,12 @@ SMB2_open_init(struct cifs_tcon *tcon, struct smb_rqst *rqst, __u8 *oplock, return rc; } + if (n_iov > 2) { + struct create_context *ccontext = + (struct create_context *)iov[n_iov-1].iov_base; + ccontext->Next = cpu_to_le32(iov[n_iov-1].iov_len); + } + add_query_id_context(iov, &n_iov); rqst->rq_nvec = n_iov; return 0; @@ -2517,12 +2611,9 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, buf->DeletePending = 0; } - if (rsp->OplockLevel == SMB2_OPLOCK_LEVEL_LEASE) - *oplock = smb2_parse_lease_state(server, rsp, - &oparms->fid->epoch, - oparms->fid->lease_key); - else - *oplock = rsp->OplockLevel; + + smb2_parse_contexts(server, rsp, &oparms->fid->epoch, + oparms->fid->lease_key, oplock, buf); creat_exit: SMB2_open_free(&rqst); free_rsp_buf(resp_buftype, rsp); @@ -2550,12 +2641,11 @@ SMB2_ioctl_init(struct cifs_tcon *tcon, struct smb_rqst *rqst, * indatalen is usually small at a couple of bytes max, so * just allocate through generic pool */ - in_data_buf = kmalloc(indatalen, GFP_NOFS); + in_data_buf = kmemdup(in_data, indatalen, GFP_NOFS); if (!in_data_buf) { cifs_small_buf_release(req); return -ENOMEM; } - memcpy(in_data_buf, in_data, indatalen); } req->CtlCode = cpu_to_le32(opcode); diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index 858353d20c39..747de9317659 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -123,7 +123,7 @@ struct smb2_sync_pdu { __le16 StructureSize2; /* size of wct area (varies, request specific) */ } __packed; -#define SMB3_AES128CMM_NONCE 11 +#define SMB3_AES128CCM_NONCE 11 #define SMB3_AES128GCM_NONCE 12 struct smb2_transform_hdr { @@ -166,6 +166,8 @@ struct smb2_err_rsp { __u8 ErrorData[1]; /* variable length */ } __packed; +#define SYMLINK_ERROR_TAG 0x4c4d5953 + struct smb2_symlink_err_rsp { __le32 SymLinkLength; __le32 SymLinkErrorTag; @@ -227,6 +229,7 @@ struct smb2_negotiate_req { } __packed; /* Dialects */ +#define SMB10_PROT_ID 0x0000 /* local only, not sent on wire w/CIFS negprot */ #define SMB20_PROT_ID 
0x0202 #define SMB21_PROT_ID 0x0210 #define SMB30_PROT_ID 0x0300 @@ -293,7 +296,7 @@ struct smb2_encryption_neg_context { __le16 DataLength; __le32 Reserved; __le16 CipherCount; /* AES-128-GCM and AES-128-CCM */ - __le16 Ciphers[1]; /* Ciphers[0] since only one used now */ + __le16 Ciphers[2]; } __packed; /* See MS-SMB2 2.2.3.1.3 */ @@ -316,6 +319,12 @@ struct smb2_compression_capabilities_context { * For smb2_netname_negotiate_context_id See MS-SMB2 2.2.3.1.4. * Its struct simply contains NetName, an array of Unicode characters */ +struct smb2_netname_neg_context { + __le16 ContextType; /* 0x100 */ + __le16 DataLength; + __le32 Reserved; + __le16 NetName[0]; /* hostname of target converted to UCS-2 */ +} __packed; #define POSIX_CTXT_DATA_LEN 16 struct smb2_posix_neg_context { @@ -640,6 +649,7 @@ struct smb2_tree_disconnect_rsp { #define SMB2_CREATE_DURABLE_HANDLE_REQUEST_V2 "DH2Q" #define SMB2_CREATE_DURABLE_HANDLE_RECONNECT_V2 "DH2C" #define SMB2_CREATE_APP_INSTANCE_ID 0x45BCA66AEFA7F74A9008FA462E144D74 +#define SMB2_CREATE_APP_INSTANCE_VERSION 0xB982D0B73B56074FA07B524A8116A010 #define SVHDX_OPEN_DEVICE_CONTEX 0x9CCBCF9E04C1E643980E158DA1F6EC83 #define SMB2_CREATE_TAG_POSIX 0x93AD25509CB411E7B42383DE968BCD7C @@ -654,9 +664,10 @@ struct smb2_tree_disconnect_rsp { * [3] : durable context * [4] : posix context * [5] : time warp context - * [6] : compound padding + * [6] : query id context + * [7] : compound padding */ -#define SMB2_CREATE_IOV_SIZE 7 +#define SMB2_CREATE_IOV_SIZE 8 struct smb2_create_req { struct smb2_sync_hdr sync_hdr; @@ -680,10 +691,10 @@ struct smb2_create_req { /* * Maximum size of a SMB2_CREATE response is 64 (smb2 header) + - * 88 (fixed part of create response) + 520 (path) + 150 (contexts) + + * 88 (fixed part of create response) + 520 (path) + 208 (contexts) + * 2 bytes of padding. 
*/ -#define MAX_SMB2_CREATE_RESPONSE_SIZE 824 +#define MAX_SMB2_CREATE_RESPONSE_SIZE 880 struct smb2_create_rsp { struct smb2_sync_hdr sync_hdr; @@ -806,6 +817,15 @@ struct durable_reconnect_context_v2 { __le32 Flags; /* see above DHANDLE_FLAG_PERSISTENT */ } __packed; +/* See MS-SMB2 2.2.14.2.9 */ +struct create_on_disk_id { + struct create_context ccontext; + __u8 Name[8]; + __le64 DiskFileId; + __le64 VolumeId; + __u32 Reserved[4]; +} __packed; + /* See MS-SMB2 2.2.14.2.12 */ struct durable_reconnect_context_v2_rsp { __le32 Timeout; @@ -826,6 +846,12 @@ struct crt_twarp_ctxt { } __packed; +/* See MS-SMB2 2.2.13.2.9 */ +struct crt_query_id_ctxt { + struct create_context ccontext; + __u8 Name[8]; +} __packed; + #define COPY_CHUNK_RES_KEY_SIZE 24 struct resume_key_req { char ResumeKey[COPY_CHUNK_RES_KEY_SIZE]; diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index 52df125e9189..07ca72486cfa 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ -228,9 +228,10 @@ extern int smb3_validate_negotiate(const unsigned int, struct cifs_tcon *); extern enum securityEnum smb2_select_sectype(struct TCP_Server_Info *, enum securityEnum); -extern __u8 smb2_parse_lease_state(struct TCP_Server_Info *server, - struct smb2_create_rsp *rsp, - unsigned int *epoch, char *lease_key); +extern void smb2_parse_contexts(struct TCP_Server_Info *server, + struct smb2_create_rsp *rsp, + unsigned int *epoch, char *lease_key, + __u8 *oplock, struct smb2_file_all_info *buf); extern int smb3_encryption_required(const struct cifs_tcon *tcon); extern int smb2_validate_iov(unsigned int offset, unsigned int buffer_length, struct kvec *iov, unsigned int min_buf_size); diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c index d1181572758b..1ccbcf9c2c3b 100644 --- a/fs/cifs/smb2transport.c +++ b/fs/cifs/smb2transport.c @@ -734,7 +734,10 @@ smb3_crypto_aead_allocate(struct TCP_Server_Info *server) struct crypto_aead *tfm; if (!server->secmech.ccmaesencrypt) { - tfm = crypto_alloc_aead("ccm(aes)", 0, 0); + if (server->cipher_type == SMB2_ENCRYPTION_AES128_GCM) + tfm = crypto_alloc_aead("gcm(aes)", 0, 0); + else + tfm = crypto_alloc_aead("ccm(aes)", 0, 0); if (IS_ERR(tfm)) { cifs_dbg(VFS, "%s: Failed to alloc encrypt aead\n", __func__); @@ -744,7 +747,10 @@ smb3_crypto_aead_allocate(struct TCP_Server_Info *server) } if (!server->secmech.ccmaesdecrypt) { - tfm = crypto_alloc_aead("ccm(aes)", 0, 0); + if (server->cipher_type == SMB2_ENCRYPTION_AES128_GCM) + tfm = crypto_alloc_aead("gcm(aes)", 0, 0); + else + tfm = crypto_alloc_aead("ccm(aes)", 0, 0); if (IS_ERR(tfm)) { crypto_free_aead(server->secmech.ccmaesencrypt); server->secmech.ccmaesencrypt = NULL; diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 60661b3f983a..5d6d44bfe10a 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -979,6 +979,7 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, }; unsigned int instance; char *buf; + struct TCP_Server_Info *server; optype = flags & CIFS_OP_MASK; @@ -990,7 +991,8 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, return -EIO; } - if (ses->server->tcpStatus == CifsExiting) + server = ses->server; + if (server->tcpStatus == CifsExiting) return -ENOENT; /* @@ -1001,7 +1003,7 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, * other requests. * This can be handled by the eventual session reconnect. 
*/ - rc = wait_for_compound_request(ses->server, num_rqst, flags, + rc = wait_for_compound_request(server, num_rqst, flags, &instance); if (rc) return rc; @@ -1017,7 +1019,7 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, * of smb data. */ - mutex_lock(&ses->server->srv_mutex); + mutex_lock(&server->srv_mutex); /* * All the parts of the compound chain belong obtained credits from the @@ -1026,24 +1028,24 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, * we obtained credits and return -EAGAIN in such cases to let callers * handle it. */ - if (instance != ses->server->reconnect_instance) { - mutex_unlock(&ses->server->srv_mutex); + if (instance != server->reconnect_instance) { + mutex_unlock(&server->srv_mutex); for (j = 0; j < num_rqst; j++) - add_credits(ses->server, &credits[j], optype); + add_credits(server, &credits[j], optype); return -EAGAIN; } for (i = 0; i < num_rqst; i++) { - midQ[i] = ses->server->ops->setup_request(ses, &rqst[i]); + midQ[i] = server->ops->setup_request(ses, &rqst[i]); if (IS_ERR(midQ[i])) { - revert_current_mid(ses->server, i); + revert_current_mid(server, i); for (j = 0; j < i; j++) cifs_delete_mid(midQ[j]); - mutex_unlock(&ses->server->srv_mutex); + mutex_unlock(&server->srv_mutex); /* Update # of requests on wire to server */ for (j = 0; j < num_rqst; j++) - add_credits(ses->server, &credits[j], optype); + add_credits(server, &credits[j], optype); return PTR_ERR(midQ[i]); } @@ -1059,19 +1061,19 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, else midQ[i]->callback = cifs_compound_last_callback; } - cifs_in_send_inc(ses->server); - rc = smb_send_rqst(ses->server, num_rqst, rqst, flags); - cifs_in_send_dec(ses->server); + cifs_in_send_inc(server); + rc = smb_send_rqst(server, num_rqst, rqst, flags); + cifs_in_send_dec(server); for (i = 0; i < num_rqst; i++) cifs_save_when_sent(midQ[i]); if (rc < 0) { - revert_current_mid(ses->server, num_rqst); - ses->server->sequence_number -= 2; + revert_current_mid(server, num_rqst); + server->sequence_number -= 2; } - mutex_unlock(&ses->server->srv_mutex); + mutex_unlock(&server->srv_mutex); /* * If sending failed for some reason or it is an oplock break that we @@ -1079,7 +1081,7 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, */ if (rc < 0 || (flags & CIFS_NO_SRV_RSP)) { for (i = 0; i < num_rqst; i++) - add_credits(ses->server, &credits[i], optype); + add_credits(server, &credits[i], optype); goto out; } @@ -1099,7 +1101,7 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, rqst[0].rq_nvec); for (i = 0; i < num_rqst; i++) { - rc = wait_for_response(ses->server, midQ[i]); + rc = wait_for_response(server, midQ[i]); if (rc != 0) break; } @@ -1107,7 +1109,7 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, for (; i < num_rqst; i++) { cifs_dbg(VFS, "Cancelling wait for mid %llu cmd: %d\n", midQ[i]->mid, le16_to_cpu(midQ[i]->command)); - send_cancel(ses->server, &rqst[i], midQ[i]); + send_cancel(server, &rqst[i], midQ[i]); spin_lock(&GlobalMid_Lock); if (midQ[i]->mid_state == MID_REQUEST_SUBMITTED) { midQ[i]->mid_flags |= MID_WAIT_CANCELLED; @@ -1123,7 +1125,7 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, if (rc < 0) goto out; - rc = cifs_sync_mid_result(midQ[i], ses->server); + rc = cifs_sync_mid_result(midQ[i], server); if (rc != 0) { /* mark this mid as cancelled to not free it below */ cancelled_mid[i] = true; @@ -1140,14 +1142,14 @@ compound_send_recv(const unsigned int xid, struct cifs_ses 
*ses, buf = (char *)midQ[i]->resp_buf; resp_iov[i].iov_base = buf; resp_iov[i].iov_len = midQ[i]->resp_buf_size + - ses->server->vals->header_preamble_size; + server->vals->header_preamble_size; if (midQ[i]->large_buf) resp_buf_type[i] = CIFS_LARGE_BUFFER; else resp_buf_type[i] = CIFS_SMALL_BUFFER; - rc = ses->server->ops->check_receive(midQ[i], ses->server, + rc = server->ops->check_receive(midQ[i], server, flags & CIFS_LOG_ERROR); /* mark it so buf will not be freed by cifs_delete_mid */ diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c index 50ddb795aaeb..9076150758d8 100644 --- a/fs/cifs/xattr.c +++ b/fs/cifs/xattr.c @@ -96,7 +96,6 @@ static int cifs_xattr_set(const struct xattr_handler *handler, break; case XATTR_CIFS_ACL: { -#ifdef CONFIG_CIFS_ACL struct cifs_ntsd *pacl; if (!value) @@ -117,7 +116,6 @@ static int cifs_xattr_set(const struct xattr_handler *handler, CIFS_I(inode)->time = 0; kfree(pacl); } -#endif /* CONFIG_CIFS_ACL */ break; } @@ -247,7 +245,6 @@ static int cifs_xattr_get(const struct xattr_handler *handler, break; case XATTR_CIFS_ACL: { -#ifdef CONFIG_CIFS_ACL u32 acllen; struct cifs_ntsd *pacl; @@ -270,7 +267,6 @@ static int cifs_xattr_get(const struct xattr_handler *handler, rc = acllen; kfree(pacl); } -#endif /* CONFIG_CIFS_ACL */ break; } diff --git a/fs/coda/Makefile b/fs/coda/Makefile index 1ce66819da2a..78befb8369c9 100644 --- a/fs/coda/Makefile +++ b/fs/coda/Makefile @@ -6,7 +6,8 @@ obj-$(CONFIG_CODA_FS) += coda.o coda-objs := psdev.o cache.o cnode.o inode.o dir.o file.o upcall.o \ - coda_linux.o symlink.o pioctl.o sysctl.o + coda_linux.o symlink.o pioctl.o +coda-$(CONFIG_SYSCTL) += sysctl.o # If you want debugging output, please uncomment the following line. diff --git a/fs/coda/cache.c b/fs/coda/cache.c index 201fc08a8b4f..3b8c4513118f 100644 --- a/fs/coda/cache.c +++ b/fs/coda/cache.c @@ -21,7 +21,7 @@ #include <linux/spinlock.h> #include <linux/coda.h> -#include <linux/coda_psdev.h> +#include "coda_psdev.h" #include "coda_linux.h" #include "coda_cache.h" diff --git a/fs/coda/cnode.c b/fs/coda/cnode.c index 845b5a66952a..06855f6c7902 100644 --- a/fs/coda/cnode.c +++ b/fs/coda/cnode.c @@ -8,8 +8,8 @@ #include <linux/time.h> #include <linux/coda.h> -#include <linux/coda_psdev.h> #include <linux/pagemap.h> +#include "coda_psdev.h" #include "coda_linux.h" static inline int coda_fideq(struct CodaFid *fid1, struct CodaFid *fid2) @@ -137,11 +137,6 @@ struct inode *coda_fid_to_inode(struct CodaFid *fid, struct super_block *sb) struct inode *inode; unsigned long hash = coda_f2i(fid); - if ( !sb ) { - pr_warn("%s: no sb!\n", __func__); - return NULL; - } - inode = ilookup5(sb, hash, coda_test_inode, fid); if ( !inode ) return NULL; @@ -153,6 +148,16 @@ struct inode *coda_fid_to_inode(struct CodaFid *fid, struct super_block *sb) return inode; } +struct coda_file_info *coda_ftoc(struct file *file) +{ + struct coda_file_info *cfi = file->private_data; + + BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC); + + return cfi; + +} + /* the CONTROL inode is made without asking attributes from Venus */ struct inode *coda_cnode_makectl(struct super_block *sb) { diff --git a/fs/coda/coda_fs_i.h b/fs/coda/coda_fs_i.h index d702ba1a2bf9..1763ff95d865 100644 --- a/fs/coda/coda_fs_i.h +++ b/fs/coda/coda_fs_i.h @@ -40,10 +40,9 @@ struct coda_file_info { int cfi_magic; /* magic number */ struct file *cfi_container; /* container file for this cnode */ unsigned int cfi_mapcount; /* nr of times this file is mapped */ + bool cfi_access_intent; /* is access intent supported */ }; -#define 
CODA_FTOC(file) ((struct coda_file_info *)((file)->private_data)) - /* flags */ #define C_VATTR 0x1 /* Validity of vattr in inode */ #define C_FLUSH 0x2 /* used after a flush */ @@ -54,6 +53,7 @@ struct inode *coda_cnode_make(struct CodaFid *, struct super_block *); struct inode *coda_iget(struct super_block *sb, struct CodaFid *fid, struct coda_vattr *attr); struct inode *coda_cnode_makectl(struct super_block *sb); struct inode *coda_fid_to_inode(struct CodaFid *fid, struct super_block *sb); +struct coda_file_info *coda_ftoc(struct file *file); void coda_replace_fid(struct inode *, struct CodaFid *, struct CodaFid *); #endif diff --git a/fs/coda/coda_int.h b/fs/coda/coda_int.h index bb0b3e0ed6c2..f82b59c9dd28 100644 --- a/fs/coda/coda_int.h +++ b/fs/coda/coda_int.h @@ -13,9 +13,19 @@ extern int coda_fake_statfs; void coda_destroy_inodecache(void); int __init coda_init_inodecache(void); int coda_fsync(struct file *coda_file, loff_t start, loff_t end, int datasync); + +#ifdef CONFIG_SYSCTL void coda_sysctl_init(void); void coda_sysctl_clean(void); +#else +static inline void coda_sysctl_init(void) +{ +} +static inline void coda_sysctl_clean(void) +{ +} +#endif #endif /* _CODA_INT_ */ diff --git a/fs/coda/coda_linux.c b/fs/coda/coda_linux.c index f3d543dd9a98..2e1a5a192074 100644 --- a/fs/coda/coda_linux.c +++ b/fs/coda/coda_linux.c @@ -18,7 +18,7 @@ #include <linux/string.h> #include <linux/coda.h> -#include <linux/coda_psdev.h> +#include "coda_psdev.h" #include "coda_linux.h" /* initialize the debugging variables */ @@ -66,6 +66,25 @@ unsigned short coda_flags_to_cflags(unsigned short flags) return coda_flags; } +static struct timespec64 coda_to_timespec64(struct coda_timespec ts) +{ + struct timespec64 ts64 = { + .tv_sec = ts.tv_sec, + .tv_nsec = ts.tv_nsec, + }; + + return ts64; +} + +static struct coda_timespec timespec64_to_coda(struct timespec64 ts64) +{ + struct coda_timespec ts = { + .tv_sec = ts64.tv_sec, + .tv_nsec = ts64.tv_nsec, + }; + + return ts; +} /* utility functions below */ void coda_vattr_to_iattr(struct inode *inode, struct coda_vattr *attr) @@ -105,11 +124,11 @@ void coda_vattr_to_iattr(struct inode *inode, struct coda_vattr *attr) if (attr->va_size != -1) inode->i_blocks = (attr->va_size + 511) >> 9; if (attr->va_atime.tv_sec != -1) - inode->i_atime = timespec_to_timespec64(attr->va_atime); + inode->i_atime = coda_to_timespec64(attr->va_atime); if (attr->va_mtime.tv_sec != -1) - inode->i_mtime = timespec_to_timespec64(attr->va_mtime); + inode->i_mtime = coda_to_timespec64(attr->va_mtime); if (attr->va_ctime.tv_sec != -1) - inode->i_ctime = timespec_to_timespec64(attr->va_ctime); + inode->i_ctime = coda_to_timespec64(attr->va_ctime); } @@ -130,12 +149,12 @@ void coda_iattr_to_vattr(struct iattr *iattr, struct coda_vattr *vattr) vattr->va_uid = (vuid_t) -1; vattr->va_gid = (vgid_t) -1; vattr->va_size = (off_t) -1; - vattr->va_atime.tv_sec = (time_t) -1; - vattr->va_atime.tv_nsec = (time_t) -1; - vattr->va_mtime.tv_sec = (time_t) -1; - vattr->va_mtime.tv_nsec = (time_t) -1; - vattr->va_ctime.tv_sec = (time_t) -1; - vattr->va_ctime.tv_nsec = (time_t) -1; + vattr->va_atime.tv_sec = (int64_t) -1; + vattr->va_atime.tv_nsec = (long) -1; + vattr->va_mtime.tv_sec = (int64_t) -1; + vattr->va_mtime.tv_nsec = (long) -1; + vattr->va_ctime.tv_sec = (int64_t) -1; + vattr->va_ctime.tv_nsec = (long) -1; vattr->va_type = C_VNON; vattr->va_fileid = -1; vattr->va_gen = -1; @@ -175,13 +194,13 @@ void coda_iattr_to_vattr(struct iattr *iattr, struct coda_vattr *vattr) vattr->va_size = 
iattr->ia_size; } if ( valid & ATTR_ATIME ) { - vattr->va_atime = timespec64_to_timespec(iattr->ia_atime); + vattr->va_atime = timespec64_to_coda(iattr->ia_atime); } if ( valid & ATTR_MTIME ) { - vattr->va_mtime = timespec64_to_timespec(iattr->ia_mtime); + vattr->va_mtime = timespec64_to_coda(iattr->ia_mtime); } if ( valid & ATTR_CTIME ) { - vattr->va_ctime = timespec64_to_timespec(iattr->ia_ctime); + vattr->va_ctime = timespec64_to_coda(iattr->ia_ctime); } } diff --git a/fs/coda/coda_linux.h b/fs/coda/coda_linux.h index 126155cadfa9..d5ebd36fb2cc 100644 --- a/fs/coda/coda_linux.h +++ b/fs/coda/coda_linux.h @@ -59,22 +59,6 @@ void coda_vattr_to_iattr(struct inode *, struct coda_vattr *); void coda_iattr_to_vattr(struct iattr *, struct coda_vattr *); unsigned short coda_flags_to_cflags(unsigned short); -/* sysctl.h */ -void coda_sysctl_init(void); -void coda_sysctl_clean(void); - -#define CODA_ALLOC(ptr, cast, size) do { \ - if (size < PAGE_SIZE) \ - ptr = kzalloc((unsigned long) size, GFP_KERNEL); \ - else \ - ptr = (cast)vzalloc((unsigned long) size); \ - if (!ptr) \ - pr_warn("kernel malloc returns 0 at %s:%d\n", __FILE__, __LINE__); \ -} while (0) - - -#define CODA_FREE(ptr, size) kvfree((ptr)) - /* inode to cnode access functions */ static inline struct coda_inode_info *ITOC(struct inode *inode) diff --git a/fs/coda/coda_psdev.h b/fs/coda/coda_psdev.h new file mode 100644 index 000000000000..52da08c770b0 --- /dev/null +++ b/fs/coda/coda_psdev.h @@ -0,0 +1,95 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __CODA_PSDEV_H +#define __CODA_PSDEV_H + +#include <linux/backing-dev.h> +#include <linux/magic.h> +#include <linux/mutex.h> + +#define CODA_PSDEV_MAJOR 67 +#define MAX_CODADEVS 5 /* how many do we allow */ + +struct kstatfs; + +/* messages between coda filesystem in kernel and Venus */ +struct upc_req { + struct list_head uc_chain; + caddr_t uc_data; + u_short uc_flags; + u_short uc_inSize; /* Size is at most 5000 bytes */ + u_short uc_outSize; + u_short uc_opcode; /* copied from data to save lookup */ + int uc_unique; + wait_queue_head_t uc_sleep; /* process' wait queue */ +}; + +#define CODA_REQ_ASYNC 0x1 +#define CODA_REQ_READ 0x2 +#define CODA_REQ_WRITE 0x4 +#define CODA_REQ_ABORT 0x8 + +/* communication pending/processing queues */ +struct venus_comm { + u_long vc_seq; + wait_queue_head_t vc_waitq; /* Venus wait queue */ + struct list_head vc_pending; + struct list_head vc_processing; + int vc_inuse; + struct super_block *vc_sb; + struct mutex vc_mutex; +}; + +static inline struct venus_comm *coda_vcp(struct super_block *sb) +{ + return (struct venus_comm *)((sb)->s_fs_info); +} + +/* upcalls */ +int venus_rootfid(struct super_block *sb, struct CodaFid *fidp); +int venus_getattr(struct super_block *sb, struct CodaFid *fid, + struct coda_vattr *attr); +int venus_setattr(struct super_block *, struct CodaFid *, struct coda_vattr *); +int venus_lookup(struct super_block *sb, struct CodaFid *fid, + const char *name, int length, int *type, + struct CodaFid *resfid); +int venus_close(struct super_block *sb, struct CodaFid *fid, int flags, + kuid_t uid); +int venus_open(struct super_block *sb, struct CodaFid *fid, int flags, + struct file **f); +int venus_mkdir(struct super_block *sb, struct CodaFid *dirfid, + const char *name, int length, + struct CodaFid *newfid, struct coda_vattr *attrs); +int venus_create(struct super_block *sb, struct CodaFid *dirfid, + const char *name, int length, int excl, int mode, + struct CodaFid *newfid, struct coda_vattr *attrs); +int venus_rmdir(struct 
super_block *sb, struct CodaFid *dirfid, + const char *name, int length); +int venus_remove(struct super_block *sb, struct CodaFid *dirfid, + const char *name, int length); +int venus_readlink(struct super_block *sb, struct CodaFid *fid, + char *buffer, int *length); +int venus_rename(struct super_block *sb, struct CodaFid *new_fid, + struct CodaFid *old_fid, size_t old_length, + size_t new_length, const char *old_name, + const char *new_name); +int venus_link(struct super_block *sb, struct CodaFid *fid, + struct CodaFid *dirfid, const char *name, int len ); +int venus_symlink(struct super_block *sb, struct CodaFid *fid, + const char *name, int len, const char *symname, int symlen); +int venus_access(struct super_block *sb, struct CodaFid *fid, int mask); +int venus_pioctl(struct super_block *sb, struct CodaFid *fid, + unsigned int cmd, struct PioctlData *data); +int coda_downcall(struct venus_comm *vcp, int opcode, union outputArgs *out, + size_t nbytes); +int venus_fsync(struct super_block *sb, struct CodaFid *fid); +int venus_statfs(struct dentry *dentry, struct kstatfs *sfs); +int venus_access_intent(struct super_block *sb, struct CodaFid *fid, + bool *access_intent_supported, + size_t count, loff_t ppos, int type); + +/* + * Statistics + */ + +extern struct venus_comm coda_comms[]; +#endif diff --git a/fs/coda/dir.c b/fs/coda/dir.c index 00876ddadb43..ca40c2556ba6 100644 --- a/fs/coda/dir.c +++ b/fs/coda/dir.c @@ -23,7 +23,7 @@ #include <linux/uaccess.h> #include <linux/coda.h> -#include <linux/coda_psdev.h> +#include "coda_psdev.h" #include "coda_linux.h" #include "coda_cache.h" @@ -47,8 +47,8 @@ static struct dentry *coda_lookup(struct inode *dir, struct dentry *entry, unsig int type = 0; if (length > CODA_MAXNAMLEN) { - pr_err("name too long: lookup, %s (%*s)\n", - coda_i2s(dir), (int)length, name); + pr_err("name too long: lookup, %s %zu\n", + coda_i2s(dir), length); return ERR_PTR(-ENAMETOOLONG); } @@ -356,8 +356,7 @@ static int coda_venus_readdir(struct file *coda_file, struct dir_context *ctx) ino_t ino; int ret; - cfi = CODA_FTOC(coda_file); - BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC); + cfi = coda_ftoc(coda_file); host_file = cfi->cfi_container; cii = ITOC(file_inode(coda_file)); @@ -426,8 +425,7 @@ static int coda_readdir(struct file *coda_file, struct dir_context *ctx) struct file *host_file; int ret; - cfi = CODA_FTOC(coda_file); - BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC); + cfi = coda_ftoc(coda_file); host_file = cfi->cfi_container; if (host_file->f_op->iterate || host_file->f_op->iterate_shared) { diff --git a/fs/coda/file.c b/fs/coda/file.c index 1cbc1f2298ee..128d63df5bfb 100644 --- a/fs/coda/file.c +++ b/fs/coda/file.c @@ -20,22 +20,43 @@ #include <linux/string.h> #include <linux/slab.h> #include <linux/uaccess.h> +#include <linux/uio.h> #include <linux/coda.h> -#include <linux/coda_psdev.h> - +#include "coda_psdev.h" #include "coda_linux.h" #include "coda_int.h" +struct coda_vm_ops { + atomic_t refcnt; + struct file *coda_file; + const struct vm_operations_struct *host_vm_ops; + struct vm_operations_struct vm_ops; +}; + static ssize_t coda_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct file *coda_file = iocb->ki_filp; - struct coda_file_info *cfi = CODA_FTOC(coda_file); + struct inode *coda_inode = file_inode(coda_file); + struct coda_file_info *cfi = coda_ftoc(coda_file); + loff_t ki_pos = iocb->ki_pos; + size_t count = iov_iter_count(to); + ssize_t ret; - BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC); + ret = 
venus_access_intent(coda_inode->i_sb, coda_i2f(coda_inode), + &cfi->cfi_access_intent, + count, ki_pos, CODA_ACCESS_TYPE_READ); + if (ret) + goto finish_read; - return vfs_iter_read(cfi->cfi_container, to, &iocb->ki_pos, 0); + ret = vfs_iter_read(cfi->cfi_container, to, &iocb->ki_pos, 0); + +finish_read: + venus_access_intent(coda_inode->i_sb, coda_i2f(coda_inode), + &cfi->cfi_access_intent, + count, ki_pos, CODA_ACCESS_TYPE_READ_FINISH); + return ret; } static ssize_t @@ -43,13 +64,18 @@ coda_file_write_iter(struct kiocb *iocb, struct iov_iter *to) { struct file *coda_file = iocb->ki_filp; struct inode *coda_inode = file_inode(coda_file); - struct coda_file_info *cfi = CODA_FTOC(coda_file); - struct file *host_file; + struct coda_file_info *cfi = coda_ftoc(coda_file); + struct file *host_file = cfi->cfi_container; + loff_t ki_pos = iocb->ki_pos; + size_t count = iov_iter_count(to); ssize_t ret; - BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC); + ret = venus_access_intent(coda_inode->i_sb, coda_i2f(coda_inode), + &cfi->cfi_access_intent, + count, ki_pos, CODA_ACCESS_TYPE_WRITE); + if (ret) + goto finish_write; - host_file = cfi->cfi_container; file_start_write(host_file); inode_lock(coda_inode); ret = vfs_iter_write(cfi->cfi_container, to, &iocb->ki_pos, 0); @@ -58,26 +84,73 @@ coda_file_write_iter(struct kiocb *iocb, struct iov_iter *to) coda_inode->i_mtime = coda_inode->i_ctime = current_time(coda_inode); inode_unlock(coda_inode); file_end_write(host_file); + +finish_write: + venus_access_intent(coda_inode->i_sb, coda_i2f(coda_inode), + &cfi->cfi_access_intent, + count, ki_pos, CODA_ACCESS_TYPE_WRITE_FINISH); return ret; } +static void +coda_vm_open(struct vm_area_struct *vma) +{ + struct coda_vm_ops *cvm_ops = + container_of(vma->vm_ops, struct coda_vm_ops, vm_ops); + + atomic_inc(&cvm_ops->refcnt); + + if (cvm_ops->host_vm_ops && cvm_ops->host_vm_ops->open) + cvm_ops->host_vm_ops->open(vma); +} + +static void +coda_vm_close(struct vm_area_struct *vma) +{ + struct coda_vm_ops *cvm_ops = + container_of(vma->vm_ops, struct coda_vm_ops, vm_ops); + + if (cvm_ops->host_vm_ops && cvm_ops->host_vm_ops->close) + cvm_ops->host_vm_ops->close(vma); + + if (atomic_dec_and_test(&cvm_ops->refcnt)) { + vma->vm_ops = cvm_ops->host_vm_ops; + fput(cvm_ops->coda_file); + kfree(cvm_ops); + } +} + static int coda_file_mmap(struct file *coda_file, struct vm_area_struct *vma) { - struct coda_file_info *cfi; + struct inode *coda_inode = file_inode(coda_file); + struct coda_file_info *cfi = coda_ftoc(coda_file); + struct file *host_file = cfi->cfi_container; + struct inode *host_inode = file_inode(host_file); struct coda_inode_info *cii; - struct file *host_file; - struct inode *coda_inode, *host_inode; - - cfi = CODA_FTOC(coda_file); - BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC); - host_file = cfi->cfi_container; + struct coda_vm_ops *cvm_ops; + loff_t ppos; + size_t count; + int ret; if (!host_file->f_op->mmap) return -ENODEV; - coda_inode = file_inode(coda_file); - host_inode = file_inode(host_file); + if (WARN_ON(coda_file != vma->vm_file)) + return -EIO; + + count = vma->vm_end - vma->vm_start; + ppos = vma->vm_pgoff * PAGE_SIZE; + + ret = venus_access_intent(coda_inode->i_sb, coda_i2f(coda_inode), + &cfi->cfi_access_intent, + count, ppos, CODA_ACCESS_TYPE_MMAP); + if (ret) + return ret; + + cvm_ops = kmalloc(sizeof(struct coda_vm_ops), GFP_KERNEL); + if (!cvm_ops) + return -ENOMEM; cii = ITOC(coda_inode); spin_lock(&cii->c_lock); @@ -89,6 +162,7 @@ coda_file_mmap(struct file *coda_file, struct 
vm_area_struct *vma) * the container file on us! */ else if (coda_inode->i_mapping != host_inode->i_mapping) { spin_unlock(&cii->c_lock); + kfree(cvm_ops); return -EBUSY; } @@ -97,7 +171,29 @@ coda_file_mmap(struct file *coda_file, struct vm_area_struct *vma) cfi->cfi_mapcount++; spin_unlock(&cii->c_lock); - return call_mmap(host_file, vma); + vma->vm_file = get_file(host_file); + ret = call_mmap(vma->vm_file, vma); + + if (ret) { + /* if call_mmap fails, our caller will put coda_file so we + * should drop the reference to the host_file that we got. + */ + fput(host_file); + kfree(cvm_ops); + } else { + /* here we add redirects for the open/close vm_operations */ + cvm_ops->host_vm_ops = vma->vm_ops; + if (vma->vm_ops) + cvm_ops->vm_ops = *vma->vm_ops; + + cvm_ops->vm_ops.open = coda_vm_open; + cvm_ops->vm_ops.close = coda_vm_close; + cvm_ops->coda_file = coda_file; + atomic_set(&cvm_ops->refcnt, 1); + + vma->vm_ops = &cvm_ops->vm_ops; + } + return ret; } int coda_open(struct inode *coda_inode, struct file *coda_file) @@ -127,6 +223,8 @@ int coda_open(struct inode *coda_inode, struct file *coda_file) cfi->cfi_magic = CODA_MAGIC; cfi->cfi_mapcount = 0; cfi->cfi_container = host_file; + /* assume access intents are supported unless we hear otherwise */ + cfi->cfi_access_intent = true; BUG_ON(coda_file->private_data != NULL); coda_file->private_data = cfi; @@ -142,8 +240,7 @@ int coda_release(struct inode *coda_inode, struct file *coda_file) struct inode *host_inode; int err; - cfi = CODA_FTOC(coda_file); - BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC); + cfi = coda_ftoc(coda_file); err = venus_close(coda_inode->i_sb, coda_i2f(coda_inode), coda_flags, coda_file->f_cred->fsuid); @@ -185,8 +282,7 @@ int coda_fsync(struct file *coda_file, loff_t start, loff_t end, int datasync) return err; inode_lock(coda_inode); - cfi = CODA_FTOC(coda_file); - BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC); + cfi = coda_ftoc(coda_file); host_file = cfi->cfi_container; err = vfs_fsync(host_file, datasync); @@ -207,4 +303,3 @@ const struct file_operations coda_file_operations = { .fsync = coda_fsync, .splice_read = generic_file_splice_read, }; - diff --git a/fs/coda/inode.c b/fs/coda/inode.c index 23f6ebd08e80..321f56e487cb 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -27,7 +27,7 @@ #include <linux/vmalloc.h> #include <linux/coda.h> -#include <linux/coda_psdev.h> +#include "coda_psdev.h" #include "coda_linux.h" #include "coda_cache.h" @@ -236,6 +236,7 @@ static void coda_put_super(struct super_block *sb) vcp->vc_sb = NULL; sb->s_fs_info = NULL; mutex_unlock(&vcp->vc_mutex); + mutex_destroy(&vcp->vc_mutex); pr_info("Bye bye.\n"); } diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c index e0c17b7dccce..644d48c12ce8 100644 --- a/fs/coda/pioctl.c +++ b/fs/coda/pioctl.c @@ -20,8 +20,7 @@ #include <linux/uaccess.h> #include <linux/coda.h> -#include <linux/coda_psdev.h> - +#include "coda_psdev.h" #include "coda_linux.h" /* pioctl ops */ diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c index 0ceef32e6fae..240669f51eac 100644 --- a/fs/coda/psdev.c +++ b/fs/coda/psdev.c @@ -38,8 +38,7 @@ #include <linux/uaccess.h> #include <linux/coda.h> -#include <linux/coda_psdev.h> - +#include "coda_psdev.h" #include "coda_linux.h" #include "coda_int.h" @@ -100,8 +99,12 @@ static ssize_t coda_psdev_write(struct file *file, const char __user *buf, ssize_t retval = 0, count = 0; int error; + /* make sure there is enough to copy out the (opcode, unique) values */ + if (nbytes < (2 * sizeof(u_int32_t))) + return -EINVAL; + /* Peek at the 
opcode, uniquefier */ - if (copy_from_user(&hdr, buf, 2 * sizeof(u_long))) + if (copy_from_user(&hdr, buf, 2 * sizeof(u_int32_t))) return -EFAULT; if (DOWNCALL(hdr.opcode)) { @@ -119,17 +122,21 @@ static ssize_t coda_psdev_write(struct file *file, const char __user *buf, hdr.opcode, hdr.unique); nbytes = size; } - CODA_ALLOC(dcbuf, union outputArgs *, nbytes); + dcbuf = kvmalloc(nbytes, GFP_KERNEL); + if (!dcbuf) { + retval = -ENOMEM; + goto out; + } if (copy_from_user(dcbuf, buf, nbytes)) { - CODA_FREE(dcbuf, nbytes); + kvfree(dcbuf); retval = -EFAULT; goto out; } /* what downcall errors does Venus handle ? */ - error = coda_downcall(vcp, hdr.opcode, dcbuf); + error = coda_downcall(vcp, hdr.opcode, dcbuf, nbytes); - CODA_FREE(dcbuf, nbytes); + kvfree(dcbuf); if (error) { pr_warn("%s: coda_downcall error: %d\n", __func__, error); @@ -182,8 +189,11 @@ static ssize_t coda_psdev_write(struct file *file, const char __user *buf, if (req->uc_opcode == CODA_OPEN_BY_FD) { struct coda_open_by_fd_out *outp = (struct coda_open_by_fd_out *)req->uc_data; - if (!outp->oh.result) + if (!outp->oh.result) { outp->fh = fget(outp->fd); + if (!outp->fh) + return -EBADF; + } } wake_up(&req->uc_sleep); @@ -252,7 +262,7 @@ static ssize_t coda_psdev_read(struct file * file, char __user * buf, goto out; } - CODA_FREE(req->uc_data, sizeof(struct coda_in_hdr)); + kvfree(req->uc_data); kfree(req); out: mutex_unlock(&vcp->vc_mutex); @@ -314,7 +324,7 @@ static int coda_psdev_release(struct inode * inode, struct file * file) /* Async requests need to be freed here */ if (req->uc_flags & CODA_REQ_ASYNC) { - CODA_FREE(req->uc_data, sizeof(struct coda_in_hdr)); + kvfree(req->uc_data); kfree(req); continue; } @@ -347,13 +357,13 @@ static const struct file_operations coda_psdev_fops = { .llseek = noop_llseek, }; -static int init_coda_psdev(void) +static int __init init_coda_psdev(void) { int i, err = 0; if (register_chrdev(CODA_PSDEV_MAJOR, "coda", &coda_psdev_fops)) { pr_err("%s: unable to get major %d\n", __func__, CODA_PSDEV_MAJOR); - return -EIO; + return -EIO; } coda_psdev_class = class_create(THIS_MODULE, "coda"); if (IS_ERR(coda_psdev_class)) { @@ -378,7 +388,7 @@ MODULE_AUTHOR("Jan Harkes, Peter J. 
Braam"); MODULE_DESCRIPTION("Coda Distributed File System VFS interface"); MODULE_ALIAS_CHARDEV_MAJOR(CODA_PSDEV_MAJOR); MODULE_LICENSE("GPL"); -MODULE_VERSION("6.6"); +MODULE_VERSION("7.0"); static int __init init_coda(void) { diff --git a/fs/coda/symlink.c b/fs/coda/symlink.c index 202297d156df..8907d0508198 100644 --- a/fs/coda/symlink.c +++ b/fs/coda/symlink.c @@ -17,8 +17,7 @@ #include <linux/pagemap.h> #include <linux/coda.h> -#include <linux/coda_psdev.h> - +#include "coda_psdev.h" #include "coda_linux.h" static int coda_symlink_filler(struct file *file, struct page *page) diff --git a/fs/coda/sysctl.c b/fs/coda/sysctl.c index 0301d45000a8..fda3b702b1c5 100644 --- a/fs/coda/sysctl.c +++ b/fs/coda/sysctl.c @@ -12,7 +12,6 @@ #include "coda_int.h" -#ifdef CONFIG_SYSCTL static struct ctl_table_header *fs_table_header; static struct ctl_table coda_table[] = { @@ -62,13 +61,3 @@ void coda_sysctl_clean(void) fs_table_header = NULL; } } - -#else -void coda_sysctl_init(void) -{ -} - -void coda_sysctl_clean(void) -{ -} -#endif diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c index 1175a1722411..eb3b1898da46 100644 --- a/fs/coda/upcall.c +++ b/fs/coda/upcall.c @@ -33,7 +33,7 @@ #include <linux/vfs.h> #include <linux/coda.h> -#include <linux/coda_psdev.h> +#include "coda_psdev.h" #include "coda_linux.h" #include "coda_cache.h" @@ -46,7 +46,7 @@ static void *alloc_upcall(int opcode, int size) { union inputArgs *inp; - CODA_ALLOC(inp, union inputArgs *, size); + inp = kvzalloc(size, GFP_KERNEL); if (!inp) return ERR_PTR(-ENOMEM); @@ -85,7 +85,7 @@ int venus_rootfid(struct super_block *sb, struct CodaFid *fidp) if (!error) *fidp = outp->coda_root.VFid; - CODA_FREE(inp, insize); + kvfree(inp); return error; } @@ -104,7 +104,7 @@ int venus_getattr(struct super_block *sb, struct CodaFid *fid, if (!error) *attr = outp->coda_getattr.attr; - CODA_FREE(inp, insize); + kvfree(inp); return error; } @@ -123,7 +123,7 @@ int venus_setattr(struct super_block *sb, struct CodaFid *fid, error = coda_upcall(coda_vcp(sb), insize, &outsize, inp); - CODA_FREE(inp, insize); + kvfree(inp); return error; } @@ -153,7 +153,7 @@ int venus_lookup(struct super_block *sb, struct CodaFid *fid, *type = outp->coda_lookup.vtype; } - CODA_FREE(inp, insize); + kvfree(inp); return error; } @@ -173,7 +173,7 @@ int venus_close(struct super_block *sb, struct CodaFid *fid, int flags, error = coda_upcall(coda_vcp(sb), insize, &outsize, inp); - CODA_FREE(inp, insize); + kvfree(inp); return error; } @@ -194,7 +194,7 @@ int venus_open(struct super_block *sb, struct CodaFid *fid, if (!error) *fh = outp->coda_open_by_fd.fh; - CODA_FREE(inp, insize); + kvfree(inp); return error; } @@ -224,7 +224,7 @@ int venus_mkdir(struct super_block *sb, struct CodaFid *dirfid, *newfid = outp->coda_mkdir.VFid; } - CODA_FREE(inp, insize); + kvfree(inp); return error; } @@ -262,7 +262,7 @@ int venus_rename(struct super_block *sb, struct CodaFid *old_fid, error = coda_upcall(coda_vcp(sb), insize, &outsize, inp); - CODA_FREE(inp, insize); + kvfree(inp); return error; } @@ -295,7 +295,7 @@ int venus_create(struct super_block *sb, struct CodaFid *dirfid, *newfid = outp->coda_create.VFid; } - CODA_FREE(inp, insize); + kvfree(inp); return error; } @@ -318,7 +318,7 @@ int venus_rmdir(struct super_block *sb, struct CodaFid *dirfid, error = coda_upcall(coda_vcp(sb), insize, &outsize, inp); - CODA_FREE(inp, insize); + kvfree(inp); return error; } @@ -340,7 +340,7 @@ int venus_remove(struct super_block *sb, struct CodaFid *dirfid, error = coda_upcall(coda_vcp(sb), 
insize, &outsize, inp); - CODA_FREE(inp, insize); + kvfree(inp); return error; } @@ -370,7 +370,7 @@ int venus_readlink(struct super_block *sb, struct CodaFid *fid, *(buffer + retlen) = '\0'; } - CODA_FREE(inp, insize); + kvfree(inp); return error; } @@ -398,7 +398,7 @@ int venus_link(struct super_block *sb, struct CodaFid *fid, error = coda_upcall(coda_vcp(sb), insize, &outsize, inp); - CODA_FREE(inp, insize); + kvfree(inp); return error; } @@ -433,7 +433,7 @@ int venus_symlink(struct super_block *sb, struct CodaFid *fid, error = coda_upcall(coda_vcp(sb), insize, &outsize, inp); - CODA_FREE(inp, insize); + kvfree(inp); return error; } @@ -449,7 +449,7 @@ int venus_fsync(struct super_block *sb, struct CodaFid *fid) inp->coda_fsync.VFid = *fid; error = coda_upcall(coda_vcp(sb), insize, &outsize, inp); - CODA_FREE(inp, insize); + kvfree(inp); return error; } @@ -467,7 +467,7 @@ int venus_access(struct super_block *sb, struct CodaFid *fid, int mask) error = coda_upcall(coda_vcp(sb), insize, &outsize, inp); - CODA_FREE(inp, insize); + kvfree(inp); return error; } @@ -543,7 +543,7 @@ int venus_pioctl(struct super_block *sb, struct CodaFid *fid, } exit: - CODA_FREE(inp, insize); + kvfree(inp); return error; } @@ -553,7 +553,7 @@ int venus_statfs(struct dentry *dentry, struct kstatfs *sfs) union outputArgs *outp; int insize, outsize, error; - insize = max_t(unsigned int, INSIZE(statfs), OUTSIZE(statfs)); + insize = SIZE(statfs); UPARG(CODA_STATFS); error = coda_upcall(coda_vcp(dentry->d_sb), insize, &outsize, inp); @@ -565,10 +565,51 @@ int venus_statfs(struct dentry *dentry, struct kstatfs *sfs) sfs->f_ffree = outp->coda_statfs.stat.f_ffree; } - CODA_FREE(inp, insize); + kvfree(inp); return error; } +int venus_access_intent(struct super_block *sb, struct CodaFid *fid, + bool *access_intent_supported, + size_t count, loff_t ppos, int type) +{ + union inputArgs *inp; + union outputArgs *outp; + int insize, outsize, error; + bool finalizer = + type == CODA_ACCESS_TYPE_READ_FINISH || + type == CODA_ACCESS_TYPE_WRITE_FINISH; + + if (!*access_intent_supported && !finalizer) + return 0; + + insize = SIZE(access_intent); + UPARG(CODA_ACCESS_INTENT); + + inp->coda_access_intent.VFid = *fid; + inp->coda_access_intent.count = count; + inp->coda_access_intent.pos = ppos; + inp->coda_access_intent.type = type; + + error = coda_upcall(coda_vcp(sb), insize, + finalizer ? NULL : &outsize, inp); + + /* + * we have to free the request buffer for synchronous upcalls + * or when asynchronous upcalls fail, but not when asynchronous + * upcalls succeed + */ + if (!finalizer || error) + kvfree(inp); + + /* Chunked access is not supported or an old Coda client */ + if (error == -EOPNOTSUPP) { + *access_intent_supported = false; + error = 0; + } + return error; +} + /* * coda_upcall and coda_downcall routines. */ @@ -598,10 +639,12 @@ static void coda_unblock_signals(sigset_t *old) * has seen them, * - CODA_CLOSE or CODA_RELEASE upcall (to avoid reference count problems) * - CODA_STORE (to avoid data loss) + * - CODA_ACCESS_INTENT (to avoid reference count problems) */ #define CODA_INTERRUPTIBLE(r) (!coda_hard && \ (((r)->uc_opcode != CODA_CLOSE && \ (r)->uc_opcode != CODA_STORE && \ + (r)->uc_opcode != CODA_ACCESS_INTENT && \ (r)->uc_opcode != CODA_RELEASE) || \ (r)->uc_flags & CODA_REQ_READ)) @@ -687,21 +730,25 @@ static int coda_upcall(struct venus_comm *vcp, goto exit; } + buffer->ih.unique = ++vcp->vc_seq; + req->uc_data = (void *)buffer; - req->uc_flags = 0; + req->uc_flags = outSize ? 
0 : CODA_REQ_ASYNC; req->uc_inSize = inSize; - req->uc_outSize = *outSize ? *outSize : inSize; - req->uc_opcode = ((union inputArgs *)buffer)->ih.opcode; - req->uc_unique = ++vcp->vc_seq; + req->uc_outSize = (outSize && *outSize) ? *outSize : inSize; + req->uc_opcode = buffer->ih.opcode; + req->uc_unique = buffer->ih.unique; init_waitqueue_head(&req->uc_sleep); - /* Fill in the common input args. */ - ((union inputArgs *)buffer)->ih.unique = req->uc_unique; - /* Append msg to pending queue and poke Venus. */ list_add_tail(&req->uc_chain, &vcp->vc_pending); - wake_up_interruptible(&vcp->vc_waitq); + + if (req->uc_flags & CODA_REQ_ASYNC) { + mutex_unlock(&vcp->vc_mutex); + return 0; + } + /* We can be interrupted while we wait for Venus to process * our request. If the interrupt occurs before Venus has read * the request, we dequeue and return. If it occurs after the @@ -743,20 +790,20 @@ static int coda_upcall(struct venus_comm *vcp, sig_req = kmalloc(sizeof(struct upc_req), GFP_KERNEL); if (!sig_req) goto exit; - CODA_ALLOC((sig_req->uc_data), char *, sizeof(struct coda_in_hdr)); - if (!sig_req->uc_data) { + sig_inputArgs = kvzalloc(sizeof(struct coda_in_hdr), GFP_KERNEL); + if (!sig_inputArgs) { kfree(sig_req); goto exit; } error = -EINTR; - sig_inputArgs = (union inputArgs *)sig_req->uc_data; sig_inputArgs->ih.opcode = CODA_SIGNAL; sig_inputArgs->ih.unique = req->uc_unique; sig_req->uc_flags = CODA_REQ_ASYNC; sig_req->uc_opcode = sig_inputArgs->ih.opcode; sig_req->uc_unique = sig_inputArgs->ih.unique; + sig_req->uc_data = (void *)sig_inputArgs; sig_req->uc_inSize = sizeof(struct coda_in_hdr); sig_req->uc_outSize = sizeof(struct coda_in_hdr); @@ -804,12 +851,44 @@ exit: * * CODA_REPLACE -- replace one CodaFid with another throughout the name cache */ -int coda_downcall(struct venus_comm *vcp, int opcode, union outputArgs *out) +int coda_downcall(struct venus_comm *vcp, int opcode, union outputArgs *out, + size_t nbytes) { struct inode *inode = NULL; struct CodaFid *fid = NULL, *newfid; struct super_block *sb; + /* + * Make sure we have received enough data from the cache + * manager to populate the necessary fields in the buffer + */ + switch (opcode) { + case CODA_PURGEUSER: + if (nbytes < sizeof(struct coda_purgeuser_out)) + return -EINVAL; + break; + + case CODA_ZAPDIR: + if (nbytes < sizeof(struct coda_zapdir_out)) + return -EINVAL; + break; + + case CODA_ZAPFILE: + if (nbytes < sizeof(struct coda_zapfile_out)) + return -EINVAL; + break; + + case CODA_PURGEFID: + if (nbytes < sizeof(struct coda_purgefid_out)) + return -EINVAL; + break; + + case CODA_REPLACE: + if (nbytes < sizeof(struct coda_replace_out)) + return -EINVAL; + break; + } + /* Handle invalidation requests. 
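 * (Editorial note, not part of the patch: every opcode recognized below was size-checked in the switch above, so the out->coda_* fields dereferenced for it are known to be present.)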
*/ mutex_lock(&vcp->vc_mutex); sb = vcp->vc_sb; @@ -879,4 +958,3 @@ unlock_out: iput(inode); return 0; } - diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c index 791304fdde9d..55438dd58189 100644 --- a/fs/configfs/mount.c +++ b/fs/configfs/mount.c @@ -13,6 +13,7 @@ #include <linux/fs.h> #include <linux/module.h> #include <linux/mount.h> +#include <linux/fs_context.h> #include <linux/pagemap.h> #include <linux/init.h> #include <linux/slab.h> @@ -52,7 +53,7 @@ static struct configfs_dirent configfs_root = { .s_iattr = NULL, }; -static int configfs_fill_super(struct super_block *sb, void *data, int silent) +static int configfs_fill_super(struct super_block *sb, struct fs_context *fc) { struct inode *inode; struct dentry *root; @@ -88,16 +89,25 @@ static int configfs_fill_super(struct super_block *sb, void *data, int silent) return 0; } -static struct dentry *configfs_do_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int configfs_get_tree(struct fs_context *fc) { - return mount_single(fs_type, flags, data, configfs_fill_super); + return get_tree_single(fc, configfs_fill_super); +} + +static const struct fs_context_operations configfs_context_ops = { + .get_tree = configfs_get_tree, +}; + +static int configfs_init_fs_context(struct fs_context *fc) +{ + fc->ops = &configfs_context_ops; + return 0; } static struct file_system_type configfs_fs_type = { .owner = THIS_MODULE, .name = "configfs", - .mount = configfs_do_mount, + .init_fs_context = configfs_init_fs_context, .kill_sb = kill_litter_super, }; MODULE_ALIAS_FS("configfs"); diff --git a/fs/d_path.c b/fs/d_path.c index e8fce6b1174f..a7d0a96b35ce 100644 --- a/fs/d_path.c +++ b/fs/d_path.c @@ -316,7 +316,6 @@ char *simple_dname(struct dentry *dentry, char *buffer, int buflen) end = ERR_PTR(-ENAMETOOLONG); return end; } -EXPORT_SYMBOL(simple_dname); /* * Write full pathname from the root of the filesystem into the buffer. @@ -26,7 +26,6 @@ #include <linux/mmu_notifier.h> #include <linux/iomap.h> #include <asm/pgalloc.h> -#include "internal.h" #define CREATE_TRACE_POINTS #include <trace/events/fs_dax.h> @@ -124,6 +123,15 @@ static int dax_is_empty_entry(void *entry) } /* + * true if the entry that was found is of a smaller order than the entry + * we were looking for + */ +static bool dax_is_conflict(void *entry) +{ + return entry == XA_RETRY_ENTRY; +} + +/* * DAX page cache entry locking */ struct exceptional_entry_key { @@ -195,11 +203,13 @@ static void dax_wake_entry(struct xa_state *xas, void *entry, bool wake_all) * Look up entry in page cache, wait for it to become unlocked if it * is a DAX entry and return it. The caller must subsequently call * put_unlocked_entry() if it did not lock the entry or dax_unlock_entry() - * if it did. + * if it did. The entry returned may have a larger order than @order. + * If @order is larger than the order of the entry found in i_pages, this + * function returns a dax_is_conflict entry. * * Must be called with the i_pages lock held. 
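 * (Editorial illustration, not part of the patch: a caller that can meet
 * entries of a different order is expected to test the conflict marker
 * before anything else, roughly
 *
 *	entry = get_unlocked_entry(xas, order);
 *	if (dax_is_conflict(entry))
 *		goto fallback;
 *
 * which is the shape grab_mapping_entry() below takes.)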
*/ -static void *get_unlocked_entry(struct xa_state *xas) +static void *get_unlocked_entry(struct xa_state *xas, unsigned int order) { void *entry; struct wait_exceptional_entry_queue ewait; @@ -210,6 +220,8 @@ static void *get_unlocked_entry(struct xa_state *xas) for (;;) { entry = xas_find_conflict(xas); + if (dax_entry_order(entry) < order) + return XA_RETRY_ENTRY; if (!entry || WARN_ON_ONCE(!xa_is_value(entry)) || !dax_is_locked(entry)) return entry; @@ -254,7 +266,7 @@ static void wait_entry_unlocked(struct xa_state *xas, void *entry) static void put_unlocked_entry(struct xa_state *xas, void *entry) { /* If we were the only waiter woken, wake the next one */ - if (entry) + if (entry && !dax_is_conflict(entry)) dax_wake_entry(xas, entry, false); } @@ -461,7 +473,7 @@ void dax_unlock_page(struct page *page, dax_entry_t cookie) * overlap with xarray value entries. */ static void *grab_mapping_entry(struct xa_state *xas, - struct address_space *mapping, unsigned long size_flag) + struct address_space *mapping, unsigned int order) { unsigned long index = xas->xa_index; bool pmd_downgrade = false; /* splitting PMD entry into PTE entries? */ @@ -469,20 +481,17 @@ static void *grab_mapping_entry(struct xa_state *xas, retry: xas_lock_irq(xas); - entry = get_unlocked_entry(xas); + entry = get_unlocked_entry(xas, order); if (entry) { + if (dax_is_conflict(entry)) + goto fallback; if (!xa_is_value(entry)) { xas_set_err(xas, EIO); goto out_unlock; } - if (size_flag & DAX_PMD) { - if (dax_is_pte_entry(entry)) { - put_unlocked_entry(xas, entry); - goto fallback; - } - } else { /* trying to grab a PTE entry */ + if (order == 0) { if (dax_is_pmd_entry(entry) && (dax_is_zero_entry(entry) || dax_is_empty_entry(entry))) { @@ -523,7 +532,11 @@ retry: if (entry) { dax_lock_entry(xas, entry); } else { - entry = dax_make_entry(pfn_to_pfn_t(0), size_flag | DAX_EMPTY); + unsigned long flags = DAX_EMPTY; + + if (order > 0) + flags |= DAX_PMD; + entry = dax_make_entry(pfn_to_pfn_t(0), flags); dax_lock_entry(xas, entry); if (xas_error(xas)) goto out_unlock; @@ -594,7 +607,7 @@ struct page *dax_layout_busy_page(struct address_space *mapping) if (WARN_ON_ONCE(!xa_is_value(entry))) continue; if (unlikely(dax_is_locked(entry))) - entry = get_unlocked_entry(&xas); + entry = get_unlocked_entry(&xas, 0); if (entry) page = dax_busy_page(entry); put_unlocked_entry(&xas, entry); @@ -621,7 +634,7 @@ static int __dax_invalidate_entry(struct address_space *mapping, void *entry; xas_lock_irq(&xas); - entry = get_unlocked_entry(&xas); + entry = get_unlocked_entry(&xas, 0); if (!entry || WARN_ON_ONCE(!xa_is_value(entry))) goto out; if (!trunc && @@ -848,7 +861,7 @@ static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev, if (unlikely(dax_is_locked(entry))) { void *old_entry = entry; - entry = get_unlocked_entry(xas); + entry = get_unlocked_entry(xas, 0); /* Entry got punched out / reallocated? */ if (!entry || WARN_ON_ONCE(!xa_is_value(entry)) @@ -1509,7 +1522,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, * entry is already in the array, for instance), it will return * VM_FAULT_FALLBACK.
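 * (Editorial note, not part of the patch: with the order-aware lookup, the
 * "PTE entry already present" case now surfaces as a dax_is_conflict()
 * marker from get_unlocked_entry(), which grab_mapping_entry() turns into
 * the fallback result checked below, replacing the old size_flag/DAX_PMD
 * comparison.)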
*/ - entry = grab_mapping_entry(&xas, mapping, DAX_PMD); + entry = grab_mapping_entry(&xas, mapping, PMD_ORDER); if (xa_is_internal(entry)) { result = xa_to_internal(entry); goto fallback; @@ -1658,11 +1671,10 @@ dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn, unsigned int order) vm_fault_t ret; xas_lock_irq(&xas); - entry = get_unlocked_entry(&xas); + entry = get_unlocked_entry(&xas, order); /* Did we race with someone splitting entry or so? */ - if (!entry || - (order == 0 && !dax_is_pte_entry(entry)) || - (order == PMD_ORDER && !dax_is_pmd_entry(entry))) { + if (!entry || dax_is_conflict(entry) || + (order == 0 && !dax_is_pte_entry(entry))) { put_unlocked_entry(&xas, entry); xas_unlock_irq(&xas); trace_dax_insert_pfn_mkwrite_no_entry(mapping->host, vmf, diff --git a/fs/dcache.c b/fs/dcache.c index f41121e5d1ec..e88cf0554e65 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -861,6 +861,32 @@ void dput(struct dentry *dentry) } EXPORT_SYMBOL(dput); +static void __dput_to_list(struct dentry *dentry, struct list_head *list) +__must_hold(&dentry->d_lock) +{ + if (dentry->d_flags & DCACHE_SHRINK_LIST) { + /* let the owner of the list it's on deal with it */ + --dentry->d_lockref.count; + } else { + if (dentry->d_flags & DCACHE_LRU_LIST) + d_lru_del(dentry); + if (!--dentry->d_lockref.count) + d_shrink_add(dentry, list); + } +} + +void dput_to_list(struct dentry *dentry, struct list_head *list) +{ + rcu_read_lock(); + if (likely(fast_dput(dentry))) { + rcu_read_unlock(); + return; + } + rcu_read_unlock(); + if (!retain_dentry(dentry)) + __dput_to_list(dentry, list); + spin_unlock(&dentry->d_lock); +} /* This must be called with d_lock held */ static inline void __dget_dlock(struct dentry *dentry) @@ -1067,7 +1093,7 @@ out: return false; } -static void shrink_dentry_list(struct list_head *list) +void shrink_dentry_list(struct list_head *list) { while (!list_empty(list)) { struct dentry *dentry, *parent; @@ -1089,18 +1115,9 @@ static void shrink_dentry_list(struct list_head *list) rcu_read_unlock(); d_shrink_del(dentry); parent = dentry->d_parent; + if (parent != dentry) + __dput_to_list(parent, list); __dentry_kill(dentry); - if (parent == dentry) - continue; - /* - * We need to prune ancestors too. This is necessary to prevent - * quadratic behavior of shrink_dcache_parent(), but is also - * expected to be beneficial in reducing dentry cache - * fragmentation. - */ - dentry = parent; - while (dentry && !lockref_put_or_lock(&dentry->d_lockref)) - dentry = dentry_kill(dentry); } } @@ -1445,8 +1462,11 @@ out: struct select_data { struct dentry *start; + union { + long found; + struct dentry *victim; + }; struct list_head dispose; - int found; }; static enum d_walk_ret select_collect(void *_data, struct dentry *dentry) @@ -1478,6 +1498,37 @@ out: return ret; } +static enum d_walk_ret select_collect2(void *_data, struct dentry *dentry) +{ + struct select_data *data = _data; + enum d_walk_ret ret = D_WALK_CONTINUE; + + if (data->start == dentry) + goto out; + + if (dentry->d_flags & DCACHE_SHRINK_LIST) { + if (!dentry->d_lockref.count) { + rcu_read_lock(); + data->victim = dentry; + return D_WALK_QUIT; + } + } else { + if (dentry->d_flags & DCACHE_LRU_LIST) + d_lru_del(dentry); + if (!dentry->d_lockref.count) + d_shrink_add(dentry, &data->dispose); + } + /* + * We can return to the caller if we have found some (this + * ensures forward progress). We'll be coming back to find + * the rest. + */ + if (!list_empty(&data->dispose)) + ret = need_resched() ? 
D_WALK_QUIT : D_WALK_NORETRY; +out: + return ret; +} + /** * shrink_dcache_parent - prune dcache * @parent: parent of entries to prune @@ -1487,12 +1538,9 @@ out: void shrink_dcache_parent(struct dentry *parent) { for (;;) { - struct select_data data; + struct select_data data = {.start = parent}; INIT_LIST_HEAD(&data.dispose); - data.start = parent; - data.found = 0; - d_walk(parent, &data, select_collect); if (!list_empty(&data.dispose)) { @@ -1503,6 +1551,24 @@ void shrink_dcache_parent(struct dentry *parent) cond_resched(); if (!data.found) break; + data.victim = NULL; + d_walk(parent, &data, select_collect2); + if (data.victim) { + struct dentry *parent; + spin_lock(&data.victim->d_lock); + if (!shrink_lock_dentry(data.victim)) { + spin_unlock(&data.victim->d_lock); + rcu_read_unlock(); + } else { + rcu_read_unlock(); + parent = data.victim->d_parent; + if (parent != data.victim) + __dput_to_list(parent, &data.dispose); + __dentry_kill(data.victim); + } + } + if (!list_empty(&data.dispose)) + shrink_dentry_list(&data.dispose); } } EXPORT_SYMBOL(shrink_dcache_parent); diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index 91d65f337d87..f91db24bbf3b 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -23,6 +23,7 @@ #include <linux/slab.h> #include <asm/unaligned.h> #include <linux/kernel.h> +#include <linux/xattr.h> #include "ecryptfs_kernel.h" #define DECRYPT 0 @@ -860,13 +861,10 @@ static struct ecryptfs_flag_map_elem ecryptfs_flag_map[] = { * @crypt_stat: The cryptographic context * @page_virt: Source data to be parsed * @bytes_read: Updated with the number of bytes read - * - * Returns zero on success; non-zero if the flag set is invalid */ -static int ecryptfs_process_flags(struct ecryptfs_crypt_stat *crypt_stat, +static void ecryptfs_process_flags(struct ecryptfs_crypt_stat *crypt_stat, char *page_virt, int *bytes_read) { - int rc = 0; int i; u32 flags; @@ -879,7 +877,6 @@ static int ecryptfs_process_flags(struct ecryptfs_crypt_stat *crypt_stat, /* Version is in top 8 bits of the 32-bit flag vector */ crypt_stat->file_version = ((flags >> 24) & 0xFF); (*bytes_read) = 4; - return rc; } /** @@ -1004,8 +1001,10 @@ int ecryptfs_read_and_validate_header_region(struct inode *inode) rc = ecryptfs_read_lower(file_size, 0, ECRYPTFS_SIZE_AND_MARKER_BYTES, inode); - if (rc < ECRYPTFS_SIZE_AND_MARKER_BYTES) - return rc >= 0 ? 
-EINVAL : rc; + if (rc < 0) + return rc; + else if (rc < ECRYPTFS_SIZE_AND_MARKER_BYTES) + return -EINVAL; rc = ecryptfs_validate_marker(marker); if (!rc) ecryptfs_i_size_init(file_size, inode); @@ -1115,9 +1114,21 @@ ecryptfs_write_metadata_to_xattr(struct dentry *ecryptfs_dentry, char *page_virt, size_t size) { int rc; + struct dentry *lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry); + struct inode *lower_inode = d_inode(lower_dentry); - rc = ecryptfs_setxattr(ecryptfs_dentry, ecryptfs_inode, - ECRYPTFS_XATTR_NAME, page_virt, size, 0); + if (!(lower_inode->i_opflags & IOP_XATTR)) { + rc = -EOPNOTSUPP; + goto out; + } + + inode_lock(lower_inode); + rc = __vfs_setxattr(lower_dentry, lower_inode, ECRYPTFS_XATTR_NAME, + page_virt, size, 0); + if (!rc && ecryptfs_inode) + fsstack_copy_attr_all(ecryptfs_inode, lower_inode); + inode_unlock(lower_inode); +out: return rc; } @@ -1291,12 +1302,7 @@ static int ecryptfs_read_headers_virt(char *page_virt, if (!(crypt_stat->flags & ECRYPTFS_I_SIZE_INITIALIZED)) ecryptfs_i_size_init(page_virt, d_inode(ecryptfs_dentry)); offset += MAGIC_ECRYPTFS_MARKER_SIZE_BYTES; - rc = ecryptfs_process_flags(crypt_stat, (page_virt + offset), - &bytes_read); - if (rc) { - ecryptfs_printk(KERN_WARNING, "Error processing flags\n"); - goto out; - } + ecryptfs_process_flags(crypt_stat, (page_virt + offset), &bytes_read); if (crypt_stat->file_version > ECRYPTFS_SUPPORTED_FILE_VERSION) { ecryptfs_printk(KERN_WARNING, "File version is [%d]; only " "file version [%d] is supported by this " @@ -1367,8 +1373,10 @@ int ecryptfs_read_and_validate_xattr_region(struct dentry *dentry, ecryptfs_inode_to_lower(inode), ECRYPTFS_XATTR_NAME, file_size, ECRYPTFS_SIZE_AND_MARKER_BYTES); - if (rc < ECRYPTFS_SIZE_AND_MARKER_BYTES) - return rc >= 0 ? 
-EINVAL : rc; + if (rc < 0) + return rc; + else if (rc < ECRYPTFS_SIZE_AND_MARKER_BYTES) + return -EINVAL; rc = ecryptfs_validate_marker(marker); if (!rc) ecryptfs_i_size_init(file_size, inode); diff --git a/fs/ecryptfs/debug.c b/fs/ecryptfs/debug.c index d131d070826f..1f65e99f9a41 100644 --- a/fs/ecryptfs/debug.c +++ b/fs/ecryptfs/debug.c @@ -83,25 +83,9 @@ void ecryptfs_dump_auth_tok(struct ecryptfs_auth_tok *auth_tok) */ void ecryptfs_dump_hex(char *data, int bytes) { - int i = 0; - int add_newline = 1; - if (ecryptfs_verbosity < 1) return; - if (bytes != 0) { - printk(KERN_DEBUG "0x%.2x.", (unsigned char)data[i]); - i++; - } - while (i < bytes) { - printk("0x%.2x.", (unsigned char)data[i]); - i++; - if (i % 16 == 0) { - printk("\n"); - add_newline = 0; - } else - add_newline = 1; - } - if (add_newline) - printk("\n"); -} + print_hex_dump(KERN_DEBUG, "ecryptfs: ", DUMP_PREFIX_OFFSET, 16, 1, + data, bytes, false); +} diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 1e994d780f37..18426f4855f1 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -1121,7 +1121,7 @@ static int ecryptfs_xattr_set(const struct xattr_handler *handler, } } -const struct xattr_handler ecryptfs_xattr_handler = { +static const struct xattr_handler ecryptfs_xattr_handler = { .prefix = "", /* match anything */ .get = ecryptfs_xattr_get, .set = ecryptfs_xattr_set, diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c index 9536e592e25a..216fbe6a4837 100644 --- a/fs/ecryptfs/keystore.c +++ b/fs/ecryptfs/keystore.c @@ -1048,8 +1048,9 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size, "rc = [%d]\n", __func__, rc); goto out_free_unlock; } - while (s->decrypted_filename[s->i] != '\0' - && s->i < s->block_aligned_filename_size) + + while (s->i < s->block_aligned_filename_size && + s->decrypted_filename[s->i] != '\0') s->i++; if (s->i == s->block_aligned_filename_size) { printk(KERN_WARNING "%s: Invalid tag 70 packet; could not " @@ -1611,9 +1612,9 @@ int ecryptfs_keyring_auth_tok_for_sig(struct key **auth_tok_key, int rc = 0; (*auth_tok_key) = request_key(&key_type_user, sig, NULL); - if (!(*auth_tok_key) || IS_ERR(*auth_tok_key)) { + if (IS_ERR(*auth_tok_key)) { (*auth_tok_key) = ecryptfs_get_encrypted_key(sig); - if (!(*auth_tok_key) || IS_ERR(*auth_tok_key)) { + if (IS_ERR(*auth_tok_key)) { printk(KERN_ERR "Could not find key with description: [%s]\n", sig); rc = process_request_key_err(PTR_ERR(*auth_tok_key)); diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index 5bc3c4a4c563..fa4f6447ddad 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -7,6 +7,7 @@ #include <linux/ctype.h> #include <linux/efi.h> #include <linux/fs.h> +#include <linux/fs_context.h> #include <linux/module.h> #include <linux/pagemap.h> #include <linux/ucs2_string.h> @@ -28,8 +29,6 @@ static const struct super_operations efivarfs_ops = { .evict_inode = efivarfs_evict_inode, }; -static struct super_block *efivarfs_sb; - /* * Compare two efivarfs file names. 
* @@ -188,14 +187,12 @@ static int efivarfs_destroy(struct efivar_entry *entry, void *data) return 0; } -static int efivarfs_fill_super(struct super_block *sb, void *data, int silent) +static int efivarfs_fill_super(struct super_block *sb, struct fs_context *fc) { struct inode *inode = NULL; struct dentry *root; int err; - efivarfs_sb = sb; - sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_blocksize = PAGE_SIZE; sb->s_blocksize_bits = PAGE_SHIFT; @@ -223,16 +220,24 @@ static int efivarfs_fill_super(struct super_block *sb, void *data, int silent) return err; } -static struct dentry *efivarfs_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int efivarfs_get_tree(struct fs_context *fc) +{ + return get_tree_single(fc, efivarfs_fill_super); +} + +static const struct fs_context_operations efivarfs_context_ops = { + .get_tree = efivarfs_get_tree, +}; + +static int efivarfs_init_fs_context(struct fs_context *fc) { - return mount_single(fs_type, flags, data, efivarfs_fill_super); + fc->ops = &efivarfs_context_ops; + return 0; } static void efivarfs_kill_sb(struct super_block *sb) { kill_litter_super(sb); - efivarfs_sb = NULL; /* Remove all entries and destroy */ __efivar_entry_iter(efivarfs_destroy, &efivarfs_list, NULL, NULL); @@ -241,7 +246,7 @@ static void efivarfs_kill_sb(struct super_block *sb) static struct file_system_type efivarfs_type = { .owner = THIS_MODULE, .name = "efivarfs", - .mount = efivarfs_mount, + .init_fs_context = efivarfs_init_fs_context, .kill_sb = efivarfs_kill_sb, }; diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 4c74c768ae43..d7f1f5011fac 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -291,7 +291,7 @@ static LIST_HEAD(tfile_check_list); #include <linux/sysctl.h> -static long zero; +static long long_zero; static long long_max = LONG_MAX; struct ctl_table epoll_table[] = { @@ -301,7 +301,7 @@ struct ctl_table epoll_table[] = { .maxlen = sizeof(max_user_watches), .mode = 0644, .proc_handler = proc_doulongvec_minmax, - .extra1 = &zero, + .extra1 = &long_zero, .extra2 = &long_max, }, { } @@ -2313,19 +2313,17 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events, size_t, sigsetsize) { int error; - sigset_t ksigmask, sigsaved; /* * If the caller wants a certain signal mask to be set during the wait, * we apply it here. */ - error = set_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize); + error = set_user_sigmask(sigmask, sigsetsize); if (error) return error; error = do_epoll_wait(epfd, events, maxevents, timeout); - - restore_user_sigmask(sigmask, &sigsaved, error == -EINTR); + restore_saved_sigmask_unless(error == -EINTR); return error; } @@ -2338,19 +2336,17 @@ COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd, compat_size_t, sigsetsize) { long err; - sigset_t ksigmask, sigsaved; /* * If the caller wants a certain signal mask to be set during the wait, * we apply it here. 
*/ - err = set_compat_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize); + err = set_compat_user_sigmask(sigmask, sigsetsize); if (err) return err; err = do_epoll_wait(epfd, events, maxevents, timeout); - - restore_user_sigmask(sigmask, &sigsaved, err == -EINTR); + restore_saved_sigmask_unless(err == -EINTR); return err; } diff --git a/fs/ext4/file.c b/fs/ext4/file.c index f4a24a46245e..70b0438dbc94 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -371,15 +371,17 @@ static const struct vm_operations_struct ext4_file_vm_ops = { static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) { struct inode *inode = file->f_mapping->host; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct dax_device *dax_dev = sbi->s_daxdev; - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + if (unlikely(ext4_forced_shutdown(sbi))) return -EIO; /* - * We don't support synchronous mappings for non-DAX files. At least - * until someone comes with a sensible use case. + * We don't support synchronous mappings for non-DAX files and + * for DAX files if underneath dax_device is not synchronous. */ - if (!IS_DAX(file_inode(file)) && (vma->vm_flags & VM_SYNC)) + if (!daxdev_mapping_supported(vma, dax_dev)) return -EOPNOTSUPP; file_accessed(file); diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index a18a47a2a1d1..12ceadef32c5 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -396,7 +396,7 @@ submit_and_retry: ret = bio_add_page(io->io_bio, page, bh->b_size, bh_offset(bh)); if (ret != bh->b_size) goto submit_and_retry; - wbc_account_io(io->io_wbc, page, bh->b_size); + wbc_account_cgroup_owner(io->io_wbc, page, bh->b_size); io->io_next_block++; return 0; } diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 0ca530afc684..abbf14e9bd72 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -470,7 +470,7 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) } if (fio->io_wbc && !is_read_io(fio->op)) - wbc_account_io(fio->io_wbc, page, PAGE_SIZE); + wbc_account_cgroup_owner(fio->io_wbc, page, PAGE_SIZE); bio_set_op_attrs(bio, fio->op, fio->op_flags); @@ -513,7 +513,7 @@ alloc_new: } if (fio->io_wbc) - wbc_account_io(fio->io_wbc, page, PAGE_SIZE); + wbc_account_cgroup_owner(fio->io_wbc, page, PAGE_SIZE); inc_page_count(fio->sbi, WB_DATA_TYPE(page)); @@ -592,7 +592,7 @@ alloc_new: } if (fio->io_wbc) - wbc_account_io(fio->io_wbc, bio_page, PAGE_SIZE); + wbc_account_cgroup_owner(fio->io_wbc, bio_page, PAGE_SIZE); io->last_block_in_bio = fio->new_blkaddr; f2fs_trace_ios(fio, 0); @@ -2919,7 +2919,7 @@ int f2fs_migrate_page(struct address_space *mapping, /* one extra reference was held for atomic_write page */ extra_count = atomic_written ? 
1 : 0; rc = migrate_page_move_mapping(mapping, newpage, - page, mode, extra_count); + page, extra_count); if (rc != MIGRATEPAGE_SUCCESS) { if (atomic_written) mutex_unlock(&fi->inmem_lock); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index d95a681ef7c9..6de6cda44031 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2818,9 +2818,7 @@ static int init_blkz_info(struct f2fs_sb_info *sbi, int devi) while (zones && sector < nr_sectors) { nr_zones = F2FS_REPORT_NR_ZONES; - err = blkdev_report_zones(bdev, sector, - zones, &nr_zones, - GFP_KERNEL); + err = blkdev_report_zones(bdev, sector, zones, &nr_zones); if (err) break; if (!nr_zones) { diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 9ebfb1b28430..542b02d170f8 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -270,6 +270,7 @@ void __inode_attach_wb(struct inode *inode, struct page *page) if (unlikely(cmpxchg(&inode->i_wb, NULL, wb))) wb_put(wb); } +EXPORT_SYMBOL_GPL(__inode_attach_wb); /** * locked_inode_to_wb_and_lock_list - determine a locked inode's wb and lock it @@ -582,6 +583,7 @@ void wbc_attach_and_unlock_inode(struct writeback_control *wbc, if (unlikely(wb_dying(wbc->wb))) inode_switch_wbs(inode, wbc->wb_id); } +EXPORT_SYMBOL_GPL(wbc_attach_and_unlock_inode); /** * wbc_detach_inode - disassociate wbc from inode and perform foreign detection @@ -701,9 +703,10 @@ void wbc_detach_inode(struct writeback_control *wbc) wb_put(wbc->wb); wbc->wb = NULL; } +EXPORT_SYMBOL_GPL(wbc_detach_inode); /** - * wbc_account_io - account IO issued during writeback + * wbc_account_cgroup_owner - account writeback to update inode cgroup ownership * @wbc: writeback_control of the writeback in progress * @page: page being written out * @bytes: number of bytes being written out @@ -712,8 +715,8 @@ void wbc_detach_inode(struct writeback_control *wbc) * controlled by @wbc. Keep the book for foreign inode detection. See * wbc_detach_inode(). */ -void wbc_account_io(struct writeback_control *wbc, struct page *page, - size_t bytes) +void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page, + size_t bytes) { struct cgroup_subsys_state *css; int id; @@ -724,7 +727,7 @@ void wbc_account_io(struct writeback_control *wbc, struct page *page, * behind a slow cgroup. Ultimately, we want pageout() to kick off * regular writeback instead of writing things out itself. */ - if (!wbc->wb) + if (!wbc->wb || wbc->no_cgroup_owner) return; css = mem_cgroup_css_from_page(page); @@ -750,7 +753,7 @@ void wbc_account_io(struct writeback_control *wbc, struct page *page, else wbc->wb_tcand_bytes -= min(bytes, wbc->wb_tcand_bytes); } -EXPORT_SYMBOL_GPL(wbc_account_io); +EXPORT_SYMBOL_GPL(wbc_account_cgroup_owner); /** * inode_congested - test whether an inode is congested diff --git a/fs/fs_parser.c b/fs/fs_parser.c index 0d388faa25d1..460ea4206fa2 100644 --- a/fs/fs_parser.c +++ b/fs/fs_parser.c @@ -264,6 +264,7 @@ int fs_lookup_param(struct fs_context *fc, return invalf(fc, "%s: not usable as path", param->key); } + f->refcnt++; /* filename_lookup() drops our ref. 
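 * (Editorial note, not part of the patch: the refcnt bump above compensates for that drop, so the reference accounting on @f stays balanced for whoever still holds it afterwards.)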
*/ ret = filename_lookup(param->dirfd, f, flags, _path, NULL); if (ret < 0) { errorf(fc, "%s: Lookup failure for '%s'", param->key, f->name); diff --git a/fs/fs_pin.c b/fs/fs_pin.c index a6497cf8ae53..47ef3c71ce90 100644 --- a/fs/fs_pin.c +++ b/fs/fs_pin.c @@ -19,20 +19,14 @@ void pin_remove(struct fs_pin *pin) spin_unlock_irq(&pin->wait.lock); } -void pin_insert_group(struct fs_pin *pin, struct vfsmount *m, struct hlist_head *p) +void pin_insert(struct fs_pin *pin, struct vfsmount *m) { spin_lock(&pin_lock); - if (p) - hlist_add_head(&pin->s_list, p); + hlist_add_head(&pin->s_list, &m->mnt_sb->s_pins); hlist_add_head(&pin->m_list, &real_mount(m)->mnt_pins); spin_unlock(&pin_lock); } -void pin_insert(struct fs_pin *pin, struct vfsmount *m) -{ - pin_insert_group(pin, m, &m->mnt_sb->s_pins); -} - void pin_kill(struct fs_pin *p) { wait_queue_entry_t wait; diff --git a/fs/fsopen.c b/fs/fsopen.c index a8bf83ce8d4e..043ffa8dc263 100644 --- a/fs/fsopen.c +++ b/fs/fsopen.c @@ -226,6 +226,8 @@ static int vfs_fsconfig_locked(struct fs_context *fc, int cmd, case FSCONFIG_CMD_CREATE: if (fc->phase != FS_CONTEXT_CREATE_PARAMS) return -EBUSY; + if (!mount_capable(fc)) + return -EPERM; fc->phase = FS_CONTEXT_CREATING; ret = vfs_get_tree(fc); if (ret) diff --git a/fs/fuse/control.c b/fs/fuse/control.c index 14ce1e47f980..c23f6f243ad4 100644 --- a/fs/fuse/control.c +++ b/fs/fuse/control.c @@ -346,7 +346,7 @@ static int fuse_ctl_fill_super(struct super_block *sb, struct fs_context *fctx) static int fuse_ctl_get_tree(struct fs_context *fc) { - return vfs_get_super(fc, vfs_get_single_super, fuse_ctl_fill_super); + return get_tree_single(fc, fuse_ctl_fill_super); } static const struct fs_context_operations fuse_ctl_context_ops = { diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c index d5403b4004c9..bb0b27d88e50 100644 --- a/fs/hfsplus/xattr.c +++ b/fs/hfsplus/xattr.c @@ -407,7 +407,7 @@ static int copy_name(char *buffer, const char *xattr_name, int name_len) int offset = 0; if (!is_known_namespace(xattr_name)) { - strncpy(buffer, XATTR_MAC_OSX_PREFIX, XATTR_MAC_OSX_PREFIX_LEN); + memcpy(buffer, XATTR_MAC_OSX_PREFIX, XATTR_MAC_OSX_PREFIX_LEN); offset += XATTR_MAC_OSX_PREFIX_LEN; len += XATTR_MAC_OSX_PREFIX_LEN; } diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 1dcc57189382..a478df035651 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -1299,7 +1299,7 @@ static int hugetlbfs_get_tree(struct fs_context *fc) int err = hugetlbfs_validate(fc); if (err) return err; - return vfs_get_super(fc, vfs_get_independent_super, hugetlbfs_fill_super); + return get_tree_nodev(fc, hugetlbfs_fill_super); } static void hugetlbfs_fs_context_free(struct fs_context *fc) diff --git a/fs/internal.h b/fs/internal.h index 2f3c3de51fad..315fcd8d237c 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -14,6 +14,7 @@ struct path; struct mount; struct shrink_control; struct fs_context; +struct user_namespace; /* * block_dev.c @@ -107,6 +108,7 @@ extern struct file *alloc_empty_file_noaccount(int, const struct cred *); extern int reconfigure_super(struct fs_context *); extern bool trylock_super(struct super_block *sb); extern struct super_block *user_get_super(dev_t); +extern bool mount_capable(struct fs_context *); /* * open.c @@ -154,6 +156,9 @@ extern int d_set_mounted(struct dentry *dentry); extern long prune_dcache_sb(struct super_block *sb, struct shrink_control *sc); extern struct dentry *d_alloc_cursor(struct dentry *); extern struct dentry * d_alloc_pseudo(struct super_block *, const struct qstr *); 
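/*
 * (Editorial sketch, not part of the patch: dput_to_list() and
 * shrink_dentry_list(), declared below for fs/ internal use, pair up the
 * same way shrink_dcache_parent() uses them in fs/dcache.c above; a
 * hypothetical caller collects deferred puts on a private list and then
 * disposes of the batch in one pass:
 *
 *	LIST_HEAD(dispose);
 *
 *	dput_to_list(dentry, &dispose);
 *	...
 *	if (!list_empty(&dispose))
 *		shrink_dentry_list(&dispose);
 *
 * which matches the select_collect()/dispose pattern shown above.)
 */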
+extern char *simple_dname(struct dentry *, char *, int); +extern void dput_to_list(struct dentry *, struct list_head *); +extern void shrink_dentry_list(struct list_head *); /* * read_write.c @@ -182,15 +187,5 @@ extern const struct dentry_operations ns_dentry_operations; extern int do_vfs_ioctl(struct file *file, unsigned int fd, unsigned int cmd, unsigned long arg); -/* - * iomap support: - */ -typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len, - void *data, struct iomap *iomap); - -loff_t iomap_apply(struct inode *inode, loff_t pos, loff_t length, - unsigned flags, const struct iomap_ops *ops, void *data, - iomap_actor_t actor); - /* direct-io.c: */ int sb_init_dio_done_wq(struct super_block *sb); diff --git a/fs/io_uring.c b/fs/io_uring.c index 4ed4b110a154..e2a66e12fbc6 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -231,6 +231,7 @@ struct io_ring_ctx { struct task_struct *sqo_thread; /* if using sq thread polling */ struct mm_struct *sqo_mm; wait_queue_head_t sqo_wait; + struct completion sqo_thread_started; struct { /* CQ ring */ @@ -322,6 +323,7 @@ struct io_kiocb { struct io_ring_ctx *ctx; struct list_head list; + struct list_head link_list; unsigned int flags; refcount_t refs; #define REQ_F_NOWAIT 1 /* must not punt to workers */ @@ -330,8 +332,10 @@ struct io_kiocb { #define REQ_F_SEQ_PREV 8 /* sequential with previous */ #define REQ_F_IO_DRAIN 16 /* drain existing IO first */ #define REQ_F_IO_DRAINED 32 /* drain done */ +#define REQ_F_LINK 64 /* linked sqes */ +#define REQ_F_FAIL_LINK 128 /* fail rest of links */ u64 user_data; - u32 error; /* iopoll result from callback */ + u32 result; u32 sequence; struct work_struct work; @@ -395,7 +399,8 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) if (!ctx) return NULL; - if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free, 0, GFP_KERNEL)) { + if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free, + PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) { kfree(ctx); return NULL; } @@ -403,6 +408,7 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) ctx->flags = p->flags; init_waitqueue_head(&ctx->cq_wait); init_completion(&ctx->ctx_done); + init_completion(&ctx->sqo_thread_started); mutex_init(&ctx->uring_lock); init_waitqueue_head(&ctx->wait); for (i = 0; i < ARRAY_SIZE(ctx->pending_async); i++) { @@ -584,6 +590,7 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx, req->flags = 0; /* one is dropped after submission, the other at completion */ refcount_set(&req->refs, 2); + req->result = 0; return req; out: io_ring_drop_ctx_refs(ctx, 1); @@ -599,7 +606,7 @@ static void io_free_req_many(struct io_ring_ctx *ctx, void **reqs, int *nr) } } -static void io_free_req(struct io_kiocb *req) +static void __io_free_req(struct io_kiocb *req) { if (req->file && !(req->flags & REQ_F_FIXED_FILE)) fput(req->file); @@ -607,6 +614,63 @@ static void io_free_req(struct io_kiocb *req) kmem_cache_free(req_cachep, req); } +static void io_req_link_next(struct io_kiocb *req) +{ + struct io_kiocb *nxt; + + /* + * The list should never be empty when we are called here. But could + * potentially happen if the chain is messed up, check to be on the + * safe side. 
+ */ + nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb, list); + if (nxt) { + list_del(&nxt->list); + if (!list_empty(&req->link_list)) { + INIT_LIST_HEAD(&nxt->link_list); + list_splice(&req->link_list, &nxt->link_list); + nxt->flags |= REQ_F_LINK; + } + + INIT_WORK(&nxt->work, io_sq_wq_submit_work); + queue_work(req->ctx->sqo_wq, &nxt->work); + } +} + +/* + * Called if REQ_F_LINK is set, and we fail the head request + */ +static void io_fail_links(struct io_kiocb *req) +{ + struct io_kiocb *link; + + while (!list_empty(&req->link_list)) { + link = list_first_entry(&req->link_list, struct io_kiocb, list); + list_del(&link->list); + + io_cqring_add_event(req->ctx, link->user_data, -ECANCELED); + __io_free_req(link); + } +} + +static void io_free_req(struct io_kiocb *req) +{ + /* + * If LINK is set, we have dependent requests in this chain. If we + * didn't fail this request, queue the first one up, moving any other + * dependencies to the next request. In case of failure, fail the rest + * of the chain. + */ + if (req->flags & REQ_F_LINK) { + if (req->flags & REQ_F_FAIL_LINK) + io_fail_links(req); + else + io_req_link_next(req); + } + + __io_free_req(req); +} + static void io_put_req(struct io_kiocb *req) { if (refcount_dec_and_test(&req->refs)) @@ -628,16 +692,17 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, req = list_first_entry(done, struct io_kiocb, list); list_del(&req->list); - io_cqring_fill_event(ctx, req->user_data, req->error); + io_cqring_fill_event(ctx, req->user_data, req->result); (*nr_events)++; if (refcount_dec_and_test(&req->refs)) { /* If we're not using fixed files, we have to pair the * completion part with the file put. Use regular * completions for those, only batch free for fixed - * file. + * file and non-linked commands. 
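/*
 * The user-visible contract of io_fail_links(): when the head of a linked
 * chain fails, its dependents are not silently dropped but completed with
 * -ECANCELED.  Sketched from the application side, with get_cqe() standing
 * in for whatever CQ-ring reaping helper the application uses (assumed, not
 * part of this patch):
 */
        /* chain: a read that fails with -EBADF  ->  a dependent write */
        struct io_uring_cqe *cqe;

        get_cqe(ring, &cqe);            /* assumed helper */
        /* head completes with its own error, here cqe->res == -EBADF */
        get_cqe(ring, &cqe);
        /* the dependent write completes with cqe->res == -ECANCELED */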
*/ - if (req->flags & REQ_F_FIXED_FILE) { + if ((req->flags & (REQ_F_FIXED_FILE|REQ_F_LINK)) == + REQ_F_FIXED_FILE) { reqs[to_free++] = req; if (to_free == ARRAY_SIZE(reqs)) io_free_req_many(ctx, reqs, &to_free); @@ -776,6 +841,8 @@ static void io_complete_rw(struct kiocb *kiocb, long res, long res2) kiocb_end_write(kiocb); + if ((req->flags & REQ_F_LINK) && res != req->result) + req->flags |= REQ_F_FAIL_LINK; io_cqring_add_event(req->ctx, req->user_data, res); io_put_req(req); } @@ -786,7 +853,9 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2) kiocb_end_write(kiocb); - req->error = res; + if ((req->flags & REQ_F_LINK) && res != req->result) + req->flags |= REQ_F_FAIL_LINK; + req->result = res; if (res != -EAGAIN) req->flags |= REQ_F_IOPOLL_COMPLETED; } @@ -929,7 +998,6 @@ static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s, !kiocb->ki_filp->f_op->iopoll) return -EOPNOTSUPP; - req->error = 0; kiocb->ki_flags |= IOCB_HIPRI; kiocb->ki_complete = io_complete_rw_iopoll; } else { @@ -1001,9 +1069,9 @@ static int io_import_fixed(struct io_ring_ctx *ctx, int rw, return 0; } -static int io_import_iovec(struct io_ring_ctx *ctx, int rw, - const struct sqe_submit *s, struct iovec **iovec, - struct iov_iter *iter) +static ssize_t io_import_iovec(struct io_ring_ctx *ctx, int rw, + const struct sqe_submit *s, struct iovec **iovec, + struct iov_iter *iter) { const struct io_uring_sqe *sqe = s->sqe; void __user *buf = u64_to_user_ptr(READ_ONCE(sqe->addr)); @@ -1021,7 +1089,7 @@ static int io_import_iovec(struct io_ring_ctx *ctx, int rw, opcode = READ_ONCE(sqe->opcode); if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) { - int ret = io_import_fixed(ctx, rw, sqe, iter); + ssize_t ret = io_import_fixed(ctx, rw, sqe, iter); *iovec = NULL; return ret; } @@ -1087,7 +1155,7 @@ static int io_read(struct io_kiocb *req, const struct sqe_submit *s, struct iov_iter iter; struct file *file; size_t iov_count; - int ret; + ssize_t read_size, ret; ret = io_prep_rw(req, s, force_nonblock); if (ret) @@ -1100,16 +1168,30 @@ static int io_read(struct io_kiocb *req, const struct sqe_submit *s, return -EINVAL; ret = io_import_iovec(req->ctx, READ, s, &iovec, &iter); - if (ret) + if (ret < 0) return ret; + read_size = ret; + if (req->flags & REQ_F_LINK) + req->result = read_size; + iov_count = iov_iter_count(&iter); ret = rw_verify_area(READ, file, &kiocb->ki_pos, iov_count); if (!ret) { ssize_t ret2; - /* Catch -EAGAIN return for forced non-blocking submission */ ret2 = call_read_iter(file, kiocb, &iter); + /* + * In case of a short read, punt to async. This can happen + * if we have data partially cached. Alternatively we can + * return the short read, in which case the application will + * need to issue another SQE and wait for it. That SQE will + * need async punt anyway, so it's more efficient to do it + * here. 
+ */ + if (force_nonblock && ret2 > 0 && ret2 < read_size) + ret2 = -EAGAIN; + /* Catch -EAGAIN return for forced non-blocking submission */ if (!force_nonblock || ret2 != -EAGAIN) { io_rw_done(kiocb, ret2); } else { @@ -1134,7 +1216,7 @@ static int io_write(struct io_kiocb *req, const struct sqe_submit *s, struct iov_iter iter; struct file *file; size_t iov_count; - int ret; + ssize_t ret; ret = io_prep_rw(req, s, force_nonblock); if (ret) @@ -1147,9 +1229,12 @@ static int io_write(struct io_kiocb *req, const struct sqe_submit *s, return -EINVAL; ret = io_import_iovec(req->ctx, WRITE, s, &iovec, &iter); - if (ret) + if (ret < 0) return ret; + if (req->flags & REQ_F_LINK) + req->result = ret; + iov_count = iov_iter_count(&iter); ret = -EAGAIN; @@ -1253,6 +1338,8 @@ static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe, end > 0 ? end : LLONG_MAX, fsync_flags & IORING_FSYNC_DATASYNC); + if (ret < 0 && (req->flags & REQ_F_LINK)) + req->flags |= REQ_F_FAIL_LINK; io_cqring_add_event(req->ctx, sqe->user_data, ret); io_put_req(req); return 0; @@ -1297,11 +1384,70 @@ static int io_sync_file_range(struct io_kiocb *req, ret = sync_file_range(req->rw.ki_filp, sqe_off, sqe_len, flags); + if (ret < 0 && (req->flags & REQ_F_LINK)) + req->flags |= REQ_F_FAIL_LINK; io_cqring_add_event(req->ctx, sqe->user_data, ret); io_put_req(req); return 0; } +#if defined(CONFIG_NET) +static int io_send_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe, + bool force_nonblock, + long (*fn)(struct socket *, struct user_msghdr __user *, + unsigned int)) +{ + struct socket *sock; + int ret; + + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; + + sock = sock_from_file(req->file, &ret); + if (sock) { + struct user_msghdr __user *msg; + unsigned flags; + + flags = READ_ONCE(sqe->msg_flags); + if (flags & MSG_DONTWAIT) + req->flags |= REQ_F_NOWAIT; + else if (force_nonblock) + flags |= MSG_DONTWAIT; + + msg = (struct user_msghdr __user *) (unsigned long) + READ_ONCE(sqe->addr); + + ret = fn(sock, msg, flags); + if (force_nonblock && ret == -EAGAIN) + return ret; + } + + io_cqring_add_event(req->ctx, sqe->user_data, ret); + io_put_req(req); + return 0; +} +#endif + +static int io_sendmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe, + bool force_nonblock) +{ +#if defined(CONFIG_NET) + return io_send_recvmsg(req, sqe, force_nonblock, __sys_sendmsg_sock); +#else + return -EOPNOTSUPP; +#endif +} + +static int io_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe, + bool force_nonblock) +{ +#if defined(CONFIG_NET) + return io_send_recvmsg(req, sqe, force_nonblock, __sys_recvmsg_sock); +#else + return -EOPNOTSUPP; +#endif +} + static void io_poll_remove_one(struct io_kiocb *req) { struct io_poll_iocb *poll = &req->poll; @@ -1549,9 +1695,10 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, { int ret, opcode; + req->user_data = READ_ONCE(s->sqe->user_data); + if (unlikely(s->index >= ctx->sq_entries)) return -EINVAL; - req->user_data = READ_ONCE(s->sqe->user_data); opcode = READ_ONCE(s->sqe->opcode); switch (opcode) { @@ -1586,6 +1733,12 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, case IORING_OP_SYNC_FILE_RANGE: ret = io_sync_file_range(req, s->sqe, force_nonblock); break; + case IORING_OP_SENDMSG: + ret = io_sendmsg(req, s->sqe, force_nonblock); + break; + case IORING_OP_RECVMSG: + ret = io_recvmsg(req, s->sqe, force_nonblock); + break; default: ret = -EINVAL; break; @@ -1595,7 +1748,7 @@ static int 
__io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, return ret; if (ctx->flags & IORING_SETUP_IOPOLL) { - if (req->error == -EAGAIN) + if (req->result == -EAGAIN) return -EAGAIN; /* workqueue context doesn't hold uring_lock, grab it now */ @@ -1819,31 +1972,11 @@ static int io_req_set_file(struct io_ring_ctx *ctx, const struct sqe_submit *s, return 0; } -static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s, - struct io_submit_state *state) +static int io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, + struct sqe_submit *s) { - struct io_kiocb *req; int ret; - /* enforce forwards compatibility on users */ - if (unlikely(s->sqe->flags & ~(IOSQE_FIXED_FILE | IOSQE_IO_DRAIN))) - return -EINVAL; - - req = io_get_req(ctx, state); - if (unlikely(!req)) - return -EAGAIN; - - ret = io_req_set_file(ctx, s, state, req); - if (unlikely(ret)) - goto out; - - ret = io_req_defer(ctx, req, s->sqe); - if (ret) { - if (ret == -EIOCBQUEUED) - ret = 0; - return ret; - } - ret = __io_submit_sqe(ctx, req, s, true); if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) { struct io_uring_sqe *sqe_copy; @@ -1866,24 +1999,93 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s, /* * Queued up for async execution, worker will release - * submit reference when the iocb is actually - * submitted. + * submit reference when the iocb is actually submitted. */ return 0; } } -out: /* drop submission reference */ io_put_req(req); /* and drop final reference, if we failed */ - if (ret) + if (ret) { + io_cqring_add_event(ctx, req->user_data, ret); + if (req->flags & REQ_F_LINK) + req->flags |= REQ_F_FAIL_LINK; io_put_req(req); + } return ret; } +#define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK) + +static void io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s, + struct io_submit_state *state, struct io_kiocb **link) +{ + struct io_uring_sqe *sqe_copy; + struct io_kiocb *req; + int ret; + + /* enforce forwards compatibility on users */ + if (unlikely(s->sqe->flags & ~SQE_VALID_FLAGS)) { + ret = -EINVAL; + goto err; + } + + req = io_get_req(ctx, state); + if (unlikely(!req)) { + ret = -EAGAIN; + goto err; + } + + ret = io_req_set_file(ctx, s, state, req); + if (unlikely(ret)) { +err_req: + io_free_req(req); +err: + io_cqring_add_event(ctx, s->sqe->user_data, ret); + return; + } + + ret = io_req_defer(ctx, req, s->sqe); + if (ret) { + if (ret != -EIOCBQUEUED) + goto err_req; + return; + } + + /* + * If we already have a head request, queue this one for async + * submittal once the head completes. If we don't have a head but + * IOSQE_IO_LINK is set in the sqe, start a new head. This one will be + * submitted sync once the chain is complete. If none of those + * conditions are true (normal request), then just queue it. + */ + if (*link) { + struct io_kiocb *prev = *link; + + sqe_copy = kmemdup(s->sqe, sizeof(*sqe_copy), GFP_KERNEL); + if (!sqe_copy) { + ret = -EAGAIN; + goto err_req; + } + + s->sqe = sqe_copy; + memcpy(&req->submit, s, sizeof(*s)); + list_add_tail(&req->list, &prev->link_list); + } else if (s->sqe->flags & IOSQE_IO_LINK) { + req->flags |= REQ_F_LINK; + + memcpy(&req->submit, s, sizeof(*s)); + INIT_LIST_HEAD(&req->link_list); + *link = req; + } else { + io_queue_sqe(ctx, req, s); + } +} + /* * Batched submission is done, ensure local IO is flushed out. 
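/*
 * What all of this plumbing serves: the new IOSQE_IO_LINK flag.  Userspace
 * builds a chain by flagging every sqe except the last, e.g. an ordered
 * write-then-fsync.  Ring setup and the get_sqe() SQ helper are assumed and
 * not shown; io_uring_enter() has no libc wrapper, so it goes via syscall(2):
 */
        struct io_uring_sqe *sqe;

        sqe = get_sqe(ring);                    /* assumed helper */
        memset(sqe, 0, sizeof(*sqe));
        sqe->opcode     = IORING_OP_WRITEV;
        sqe->fd         = fd;
        sqe->addr       = (unsigned long)&iov;
        sqe->len        = 1;
        sqe->flags      = IOSQE_IO_LINK;        /* next sqe waits for this */

        sqe = get_sqe(ring);
        memset(sqe, 0, sizeof(*sqe));
        sqe->opcode     = IORING_OP_FSYNC;
        sqe->fd         = fd;                   /* runs only after the write */

        syscall(__NR_io_uring_enter, ring_fd, 2, 0, 0, NULL);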
*/ @@ -1966,7 +2168,9 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, struct sqe_submit *sqes, unsigned int nr, bool has_user, bool mm_fault) { struct io_submit_state state, *statep = NULL; - int ret, i, submitted = 0; + struct io_kiocb *link = NULL; + bool prev_was_link = false; + int i, submitted = 0; if (nr > IO_PLUG_THRESHOLD) { io_submit_state_start(&state, ctx, nr); @@ -1974,22 +2178,30 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, struct sqe_submit *sqes, } for (i = 0; i < nr; i++) { + /* + * If previous wasn't linked and we have a linked command, + * that's the end of the chain. Submit the previous link. + */ + if (!prev_was_link && link) { + io_queue_sqe(ctx, link, &link->submit); + link = NULL; + } + prev_was_link = (sqes[i].sqe->flags & IOSQE_IO_LINK) != 0; + if (unlikely(mm_fault)) { - ret = -EFAULT; + io_cqring_add_event(ctx, sqes[i].sqe->user_data, + -EFAULT); } else { sqes[i].has_user = has_user; sqes[i].needs_lock = true; sqes[i].needs_fixed_file = true; - ret = io_submit_sqe(ctx, &sqes[i], statep); - } - if (!ret) { + io_submit_sqe(ctx, &sqes[i], statep, &link); submitted++; - continue; } - - io_cqring_add_event(ctx, sqes[i].sqe->user_data, ret); } + if (link) + io_queue_sqe(ctx, link, &link->submit); if (statep) io_submit_state_end(&state); @@ -2006,6 +2218,8 @@ static int io_sq_thread(void *data) unsigned inflight; unsigned long timeout; + complete(&ctx->sqo_thread_started); + old_fs = get_fs(); set_fs(USER_DS); @@ -2130,6 +2344,8 @@ static int io_sq_thread(void *data) static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit) { struct io_submit_state state, *statep = NULL; + struct io_kiocb *link = NULL; + bool prev_was_link = false; int i, submit = 0; if (to_submit > IO_PLUG_THRESHOLD) { @@ -2139,22 +2355,30 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit) for (i = 0; i < to_submit; i++) { struct sqe_submit s; - int ret; if (!io_get_sqring(ctx, &s)) break; + /* + * If previous wasn't linked and we have a linked command, + * that's the end of the chain. Submit the previous link. 
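/*
 * The new sqo_thread_started completion closes a startup race: the stop
 * path (below) parks the SQPOLL thread, which is only safe once the thread
 * has actually begun running.  The pattern, reduced to its three touch
 * points:
 */
        init_completion(&ctx->sqo_thread_started);      /* at ring setup   */

        complete(&ctx->sqo_thread_started);             /* io_sq_thread(): *
                                                         * "I am running"  */

        wait_for_completion(&ctx->sqo_thread_started);  /* ..._stop(),     *
                                                         * before parking  */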
+ */ + if (!prev_was_link && link) { + io_queue_sqe(ctx, link, &link->submit); + link = NULL; + } + prev_was_link = (s.sqe->flags & IOSQE_IO_LINK) != 0; + s.has_user = true; s.needs_lock = false; s.needs_fixed_file = false; submit++; - - ret = io_submit_sqe(ctx, &s, statep); - if (ret) - io_cqring_add_event(ctx, s.sqe->user_data, ret); + io_submit_sqe(ctx, &s, statep, &link); } io_commit_sqring(ctx); + if (link) + io_queue_sqe(ctx, link, &link->submit); if (statep) io_submit_state_end(statep); @@ -2176,7 +2400,6 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, const sigset_t __user *sig, size_t sigsz) { struct io_cq_ring *ring = ctx->cq_ring; - sigset_t ksigmask, sigsaved; int ret; if (io_cqring_events(ring) >= min_events) @@ -2186,21 +2409,17 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, #ifdef CONFIG_COMPAT if (in_compat_syscall()) ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig, - &ksigmask, &sigsaved, sigsz); + sigsz); else #endif - ret = set_user_sigmask(sig, &ksigmask, - &sigsaved, sigsz); + ret = set_user_sigmask(sig, sigsz); if (ret) return ret; } ret = wait_event_interruptible(ctx->wait, io_cqring_events(ring) >= min_events); - - if (sig) - restore_user_sigmask(sig, &sigsaved, ret == -ERESTARTSYS); - + restore_saved_sigmask_unless(ret == -ERESTARTSYS); if (ret == -ERESTARTSYS) ret = -EINTR; @@ -2240,6 +2459,7 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx) static void io_sq_thread_stop(struct io_ring_ctx *ctx) { if (ctx->sqo_thread) { + wait_for_completion(&ctx->sqo_thread_started); /* * The park is a bit of a work-around, without it we get * warning spews on shutdown with SQPOLL set and affinity diff --git a/fs/iomap.c b/fs/iomap.c deleted file mode 100644 index 217c3e5a13d6..000000000000 --- a/fs/iomap.c +++ /dev/null @@ -1,2205 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2010 Red Hat, Inc. - * Copyright (c) 2016-2018 Christoph Hellwig. - */ -#include <linux/module.h> -#include <linux/compiler.h> -#include <linux/fs.h> -#include <linux/iomap.h> -#include <linux/uaccess.h> -#include <linux/gfp.h> -#include <linux/migrate.h> -#include <linux/mm.h> -#include <linux/mm_inline.h> -#include <linux/swap.h> -#include <linux/pagemap.h> -#include <linux/pagevec.h> -#include <linux/file.h> -#include <linux/uio.h> -#include <linux/backing-dev.h> -#include <linux/buffer_head.h> -#include <linux/task_io_accounting_ops.h> -#include <linux/dax.h> -#include <linux/sched/signal.h> - -#include "internal.h" - -/* - * Execute a iomap write on a segment of the mapping that spans a - * contiguous range of pages that have identical block mapping state. - * - * This avoids the need to map pages individually, do individual allocations - * for each page and most importantly avoid the need for filesystem specific - * locking per page. Instead, all the operations are amortised over the entire - * range of pages. It is assumed that the filesystems will lock whatever - * resources they require in the iomap_begin call, and release them in the - * iomap_end call. - */ -loff_t -iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags, - const struct iomap_ops *ops, void *data, iomap_actor_t actor) -{ - struct iomap iomap = { 0 }; - loff_t written = 0, ret; - - /* - * Need to map a range from start position for length bytes. This can - * span multiple pages - it is only guaranteed to return a range of a - * single type of pages (e.g. all into a hole, all mapped or all - * unwritten). 
Failure at this point has nothing to undo. - * - * If allocation is required for this range, reserve the space now so - * that the allocation is guaranteed to succeed later on. Once we copy - * the data into the page cache pages, then we cannot fail otherwise we - * expose transient stale data. If the reserve fails, we can safely - * back out at this point as there is nothing to undo. - */ - ret = ops->iomap_begin(inode, pos, length, flags, &iomap); - if (ret) - return ret; - if (WARN_ON(iomap.offset > pos)) - return -EIO; - if (WARN_ON(iomap.length == 0)) - return -EIO; - - /* - * Cut down the length to the one actually provided by the filesystem, - * as it might not be able to give us the whole size that we requested. - */ - if (iomap.offset + iomap.length < pos + length) - length = iomap.offset + iomap.length - pos; - - /* - * Now that we have guaranteed that the space allocation will succeed. - * we can do the copy-in page by page without having to worry about - * failures exposing transient data. - */ - written = actor(inode, pos, length, data, &iomap); - - /* - * Now the data has been copied, commit the range we've copied. This - * should not fail unless the filesystem has had a fatal error. - */ - if (ops->iomap_end) { - ret = ops->iomap_end(inode, pos, length, - written > 0 ? written : 0, - flags, &iomap); - } - - return written ? written : ret; -} - -static sector_t -iomap_sector(struct iomap *iomap, loff_t pos) -{ - return (iomap->addr + pos - iomap->offset) >> SECTOR_SHIFT; -} - -static struct iomap_page * -iomap_page_create(struct inode *inode, struct page *page) -{ - struct iomap_page *iop = to_iomap_page(page); - - if (iop || i_blocksize(inode) == PAGE_SIZE) - return iop; - - iop = kmalloc(sizeof(*iop), GFP_NOFS | __GFP_NOFAIL); - atomic_set(&iop->read_count, 0); - atomic_set(&iop->write_count, 0); - bitmap_zero(iop->uptodate, PAGE_SIZE / SECTOR_SIZE); - - /* - * migrate_page_move_mapping() assumes that pages with private data have - * their count elevated by 1. - */ - get_page(page); - set_page_private(page, (unsigned long)iop); - SetPagePrivate(page); - return iop; -} - -static void -iomap_page_release(struct page *page) -{ - struct iomap_page *iop = to_iomap_page(page); - - if (!iop) - return; - WARN_ON_ONCE(atomic_read(&iop->read_count)); - WARN_ON_ONCE(atomic_read(&iop->write_count)); - ClearPagePrivate(page); - set_page_private(page, 0); - put_page(page); - kfree(iop); -} - -/* - * Calculate the range inside the page that we actually need to read. - */ -static void -iomap_adjust_read_range(struct inode *inode, struct iomap_page *iop, - loff_t *pos, loff_t length, unsigned *offp, unsigned *lenp) -{ - loff_t orig_pos = *pos; - loff_t isize = i_size_read(inode); - unsigned block_bits = inode->i_blkbits; - unsigned block_size = (1 << block_bits); - unsigned poff = offset_in_page(*pos); - unsigned plen = min_t(loff_t, PAGE_SIZE - poff, length); - unsigned first = poff >> block_bits; - unsigned last = (poff + plen - 1) >> block_bits; - - /* - * If the block size is smaller than the page size we need to check the - * per-block uptodate status and adjust the offset and length if needed - * to avoid reading in already uptodate ranges. 
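/*
 * A concrete walk through iomap_adjust_read_range(): assume a 4096-byte
 * page, 1024-byte blocks (i_blkbits = 10) and a request covering the whole
 * page, with blocks 0 and 3 already uptodate:
 *
 *      start:                          poff = 0,    plen = 4096
 *      skip leading uptodate block 0:  poff = 1024, plen = 3072
 *      trim trailing uptodate block 3: poff = 1024, plen = 2048
 *
 * so only blocks 1 and 2 (bytes 1024..3071 of the page) are actually read.
 */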
- */ - if (iop) { - unsigned int i; - - /* move forward for each leading block marked uptodate */ - for (i = first; i <= last; i++) { - if (!test_bit(i, iop->uptodate)) - break; - *pos += block_size; - poff += block_size; - plen -= block_size; - first++; - } - - /* truncate len if we find any trailing uptodate block(s) */ - for ( ; i <= last; i++) { - if (test_bit(i, iop->uptodate)) { - plen -= (last - i + 1) * block_size; - last = i - 1; - break; - } - } - } - - /* - * If the extent spans the block that contains the i_size we need to - * handle both halves separately so that we properly zero data in the - * page cache for blocks that are entirely outside of i_size. - */ - if (orig_pos <= isize && orig_pos + length > isize) { - unsigned end = offset_in_page(isize - 1) >> block_bits; - - if (first <= end && last > end) - plen -= (last - end) * block_size; - } - - *offp = poff; - *lenp = plen; -} - -static void -iomap_set_range_uptodate(struct page *page, unsigned off, unsigned len) -{ - struct iomap_page *iop = to_iomap_page(page); - struct inode *inode = page->mapping->host; - unsigned first = off >> inode->i_blkbits; - unsigned last = (off + len - 1) >> inode->i_blkbits; - unsigned int i; - bool uptodate = true; - - if (iop) { - for (i = 0; i < PAGE_SIZE / i_blocksize(inode); i++) { - if (i >= first && i <= last) - set_bit(i, iop->uptodate); - else if (!test_bit(i, iop->uptodate)) - uptodate = false; - } - } - - if (uptodate && !PageError(page)) - SetPageUptodate(page); -} - -static void -iomap_read_finish(struct iomap_page *iop, struct page *page) -{ - if (!iop || atomic_dec_and_test(&iop->read_count)) - unlock_page(page); -} - -static void -iomap_read_page_end_io(struct bio_vec *bvec, int error) -{ - struct page *page = bvec->bv_page; - struct iomap_page *iop = to_iomap_page(page); - - if (unlikely(error)) { - ClearPageUptodate(page); - SetPageError(page); - } else { - iomap_set_range_uptodate(page, bvec->bv_offset, bvec->bv_len); - } - - iomap_read_finish(iop, page); -} - -static void -iomap_read_end_io(struct bio *bio) -{ - int error = blk_status_to_errno(bio->bi_status); - struct bio_vec *bvec; - struct bvec_iter_all iter_all; - - bio_for_each_segment_all(bvec, bio, iter_all) - iomap_read_page_end_io(bvec, error); - bio_put(bio); -} - -struct iomap_readpage_ctx { - struct page *cur_page; - bool cur_page_in_bio; - bool is_readahead; - struct bio *bio; - struct list_head *pages; -}; - -static void -iomap_read_inline_data(struct inode *inode, struct page *page, - struct iomap *iomap) -{ - size_t size = i_size_read(inode); - void *addr; - - if (PageUptodate(page)) - return; - - BUG_ON(page->index); - BUG_ON(size > PAGE_SIZE - offset_in_page(iomap->inline_data)); - - addr = kmap_atomic(page); - memcpy(addr, iomap->inline_data, size); - memset(addr + size, 0, PAGE_SIZE - size); - kunmap_atomic(addr); - SetPageUptodate(page); -} - -static loff_t -iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data, - struct iomap *iomap) -{ - struct iomap_readpage_ctx *ctx = data; - struct page *page = ctx->cur_page; - struct iomap_page *iop = iomap_page_create(inode, page); - bool same_page = false, is_contig = false; - loff_t orig_pos = pos; - unsigned poff, plen; - sector_t sector; - - if (iomap->type == IOMAP_INLINE) { - WARN_ON_ONCE(pos); - iomap_read_inline_data(inode, page, iomap); - return PAGE_SIZE; - } - - /* zero post-eof blocks as the page may be mapped */ - iomap_adjust_read_range(inode, iop, &pos, length, &poff, &plen); - if (plen == 0) - goto done; - - if 
(iomap->type != IOMAP_MAPPED || pos >= i_size_read(inode)) { - zero_user(page, poff, plen); - iomap_set_range_uptodate(page, poff, plen); - goto done; - } - - ctx->cur_page_in_bio = true; - - /* - * Try to merge into a previous segment if we can. - */ - sector = iomap_sector(iomap, pos); - if (ctx->bio && bio_end_sector(ctx->bio) == sector) - is_contig = true; - - if (is_contig && - __bio_try_merge_page(ctx->bio, page, plen, poff, &same_page)) { - if (!same_page && iop) - atomic_inc(&iop->read_count); - goto done; - } - - /* - * If we start a new segment we need to increase the read count, and we - * need to do so before submitting any previous full bio to make sure - * that we don't prematurely unlock the page. - */ - if (iop) - atomic_inc(&iop->read_count); - - if (!ctx->bio || !is_contig || bio_full(ctx->bio, plen)) { - gfp_t gfp = mapping_gfp_constraint(page->mapping, GFP_KERNEL); - int nr_vecs = (length + PAGE_SIZE - 1) >> PAGE_SHIFT; - - if (ctx->bio) - submit_bio(ctx->bio); - - if (ctx->is_readahead) /* same as readahead_gfp_mask */ - gfp |= __GFP_NORETRY | __GFP_NOWARN; - ctx->bio = bio_alloc(gfp, min(BIO_MAX_PAGES, nr_vecs)); - ctx->bio->bi_opf = REQ_OP_READ; - if (ctx->is_readahead) - ctx->bio->bi_opf |= REQ_RAHEAD; - ctx->bio->bi_iter.bi_sector = sector; - bio_set_dev(ctx->bio, iomap->bdev); - ctx->bio->bi_end_io = iomap_read_end_io; - } - - bio_add_page(ctx->bio, page, plen, poff); -done: - /* - * Move the caller beyond our range so that it keeps making progress. - * For that we have to include any leading non-uptodate ranges, but - * we can skip trailing ones as they will be handled in the next - * iteration. - */ - return pos - orig_pos + plen; -} - -int -iomap_readpage(struct page *page, const struct iomap_ops *ops) -{ - struct iomap_readpage_ctx ctx = { .cur_page = page }; - struct inode *inode = page->mapping->host; - unsigned poff; - loff_t ret; - - for (poff = 0; poff < PAGE_SIZE; poff += ret) { - ret = iomap_apply(inode, page_offset(page) + poff, - PAGE_SIZE - poff, 0, ops, &ctx, - iomap_readpage_actor); - if (ret <= 0) { - WARN_ON_ONCE(ret == 0); - SetPageError(page); - break; - } - } - - if (ctx.bio) { - submit_bio(ctx.bio); - WARN_ON_ONCE(!ctx.cur_page_in_bio); - } else { - WARN_ON_ONCE(ctx.cur_page_in_bio); - unlock_page(page); - } - - /* - * Just like mpage_readpages and block_read_full_page we always - * return 0 and just mark the page as PageError on errors. This - * should be cleaned up all through the stack eventually. - */ - return 0; -} -EXPORT_SYMBOL_GPL(iomap_readpage); - -static struct page * -iomap_next_page(struct inode *inode, struct list_head *pages, loff_t pos, - loff_t length, loff_t *done) -{ - while (!list_empty(pages)) { - struct page *page = lru_to_page(pages); - - if (page_offset(page) >= (u64)pos + length) - break; - - list_del(&page->lru); - if (!add_to_page_cache_lru(page, inode->i_mapping, page->index, - GFP_NOFS)) - return page; - - /* - * If we already have a page in the page cache at index we are - * done. Upper layers don't care if it is uptodate after the - * readpages call itself as every page gets checked again once - * actually needed. 
- */ - *done += PAGE_SIZE; - put_page(page); - } - - return NULL; -} - -static loff_t -iomap_readpages_actor(struct inode *inode, loff_t pos, loff_t length, - void *data, struct iomap *iomap) -{ - struct iomap_readpage_ctx *ctx = data; - loff_t done, ret; - - for (done = 0; done < length; done += ret) { - if (ctx->cur_page && offset_in_page(pos + done) == 0) { - if (!ctx->cur_page_in_bio) - unlock_page(ctx->cur_page); - put_page(ctx->cur_page); - ctx->cur_page = NULL; - } - if (!ctx->cur_page) { - ctx->cur_page = iomap_next_page(inode, ctx->pages, - pos, length, &done); - if (!ctx->cur_page) - break; - ctx->cur_page_in_bio = false; - } - ret = iomap_readpage_actor(inode, pos + done, length - done, - ctx, iomap); - } - - return done; -} - -int -iomap_readpages(struct address_space *mapping, struct list_head *pages, - unsigned nr_pages, const struct iomap_ops *ops) -{ - struct iomap_readpage_ctx ctx = { - .pages = pages, - .is_readahead = true, - }; - loff_t pos = page_offset(list_entry(pages->prev, struct page, lru)); - loff_t last = page_offset(list_entry(pages->next, struct page, lru)); - loff_t length = last - pos + PAGE_SIZE, ret = 0; - - while (length > 0) { - ret = iomap_apply(mapping->host, pos, length, 0, ops, - &ctx, iomap_readpages_actor); - if (ret <= 0) { - WARN_ON_ONCE(ret == 0); - goto done; - } - pos += ret; - length -= ret; - } - ret = 0; -done: - if (ctx.bio) - submit_bio(ctx.bio); - if (ctx.cur_page) { - if (!ctx.cur_page_in_bio) - unlock_page(ctx.cur_page); - put_page(ctx.cur_page); - } - - /* - * Check that we didn't lose a page due to the arcance calling - * conventions.. - */ - WARN_ON_ONCE(!ret && !list_empty(ctx.pages)); - return ret; -} -EXPORT_SYMBOL_GPL(iomap_readpages); - -/* - * iomap_is_partially_uptodate checks whether blocks within a page are - * uptodate or not. - * - * Returns true if all blocks which correspond to a file portion - * we want to read within the page are uptodate. - */ -int -iomap_is_partially_uptodate(struct page *page, unsigned long from, - unsigned long count) -{ - struct iomap_page *iop = to_iomap_page(page); - struct inode *inode = page->mapping->host; - unsigned len, first, last; - unsigned i; - - /* Limit range to one page */ - len = min_t(unsigned, PAGE_SIZE - from, count); - - /* First and last blocks in range within page */ - first = from >> inode->i_blkbits; - last = (from + len - 1) >> inode->i_blkbits; - - if (iop) { - for (i = first; i <= last; i++) - if (!test_bit(i, iop->uptodate)) - return 0; - return 1; - } - - return 0; -} -EXPORT_SYMBOL_GPL(iomap_is_partially_uptodate); - -int -iomap_releasepage(struct page *page, gfp_t gfp_mask) -{ - /* - * mm accommodates an old ext3 case where clean pages might not have had - * the dirty bit cleared. Thus, it can send actual dirty pages to - * ->releasepage() via shrink_active_list(), skip those here. - */ - if (PageDirty(page) || PageWriteback(page)) - return 0; - iomap_page_release(page); - return 1; -} -EXPORT_SYMBOL_GPL(iomap_releasepage); - -void -iomap_invalidatepage(struct page *page, unsigned int offset, unsigned int len) -{ - /* - * If we are invalidating the entire page, clear the dirty state from it - * and release it to avoid unnecessary buildup of the LRU. 
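/*
 * Taken together, a filesystem adopts this code by (a) implementing the
 * provider side of the iomap_apply() contract described earlier, where
 * ->iomap_begin() maps (and for writes reserves) one extent at a time and
 * ->iomap_end() releases whatever the actor did not consume, and (b) wiring
 * the exported helpers into its address_space_operations, in the style of
 * xfs.  A skeletal sketch; every myfs_* name and the myfs_block_for()
 * helper are assumptions, not part of this patch:
 */
static int myfs_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
                unsigned flags, struct iomap *iomap)
{
        iomap->offset   = pos & ~((loff_t)i_blocksize(inode) - 1);
        iomap->length   = i_blocksize(inode);
        iomap->type     = IOMAP_MAPPED;
        /* disk byte address, or IOMAP_NULL_ADDR for a hole */
        iomap->addr     = myfs_block_for(inode, pos);
        iomap->bdev     = inode->i_sb->s_bdev;
        return 0;
}

static int myfs_iomap_end(struct inode *inode, loff_t pos, loff_t length,
                ssize_t written, unsigned flags, struct iomap *iomap)
{
        /* undo any reservation beyond 'written' here */
        return 0;
}

static const struct iomap_ops myfs_iomap_ops = {
        .iomap_begin    = myfs_iomap_begin,
        .iomap_end      = myfs_iomap_end,
};

static int myfs_readpage(struct file *unused, struct page *page)
{
        return iomap_readpage(page, &myfs_iomap_ops);
}

static int myfs_readpages(struct file *unused, struct address_space *mapping,
                struct list_head *pages, unsigned nr_pages)
{
        return iomap_readpages(mapping, pages, nr_pages, &myfs_iomap_ops);
}

static const struct address_space_operations myfs_aops = {
        .readpage               = myfs_readpage,
        .readpages              = myfs_readpages,
        .set_page_dirty         = iomap_set_page_dirty,
        .releasepage            = iomap_releasepage,
        .invalidatepage         = iomap_invalidatepage,
        .is_partially_uptodate  = iomap_is_partially_uptodate,
};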
- */ - if (offset == 0 && len == PAGE_SIZE) { - WARN_ON_ONCE(PageWriteback(page)); - cancel_dirty_page(page); - iomap_page_release(page); - } -} -EXPORT_SYMBOL_GPL(iomap_invalidatepage); - -#ifdef CONFIG_MIGRATION -int -iomap_migrate_page(struct address_space *mapping, struct page *newpage, - struct page *page, enum migrate_mode mode) -{ - int ret; - - ret = migrate_page_move_mapping(mapping, newpage, page, mode, 0); - if (ret != MIGRATEPAGE_SUCCESS) - return ret; - - if (page_has_private(page)) { - ClearPagePrivate(page); - get_page(newpage); - set_page_private(newpage, page_private(page)); - set_page_private(page, 0); - put_page(page); - SetPagePrivate(newpage); - } - - if (mode != MIGRATE_SYNC_NO_COPY) - migrate_page_copy(newpage, page); - else - migrate_page_states(newpage, page); - return MIGRATEPAGE_SUCCESS; -} -EXPORT_SYMBOL_GPL(iomap_migrate_page); -#endif /* CONFIG_MIGRATION */ - -static void -iomap_write_failed(struct inode *inode, loff_t pos, unsigned len) -{ - loff_t i_size = i_size_read(inode); - - /* - * Only truncate newly allocated pages beyoned EOF, even if the - * write started inside the existing inode size. - */ - if (pos + len > i_size) - truncate_pagecache_range(inode, max(pos, i_size), pos + len); -} - -static int -iomap_read_page_sync(struct inode *inode, loff_t block_start, struct page *page, - unsigned poff, unsigned plen, unsigned from, unsigned to, - struct iomap *iomap) -{ - struct bio_vec bvec; - struct bio bio; - - if (iomap->type != IOMAP_MAPPED || block_start >= i_size_read(inode)) { - zero_user_segments(page, poff, from, to, poff + plen); - iomap_set_range_uptodate(page, poff, plen); - return 0; - } - - bio_init(&bio, &bvec, 1); - bio.bi_opf = REQ_OP_READ; - bio.bi_iter.bi_sector = iomap_sector(iomap, block_start); - bio_set_dev(&bio, iomap->bdev); - __bio_add_page(&bio, page, plen, poff); - return submit_bio_wait(&bio); -} - -static int -__iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, - struct page *page, struct iomap *iomap) -{ - struct iomap_page *iop = iomap_page_create(inode, page); - loff_t block_size = i_blocksize(inode); - loff_t block_start = pos & ~(block_size - 1); - loff_t block_end = (pos + len + block_size - 1) & ~(block_size - 1); - unsigned from = offset_in_page(pos), to = from + len, poff, plen; - int status = 0; - - if (PageUptodate(page)) - return 0; - - do { - iomap_adjust_read_range(inode, iop, &block_start, - block_end - block_start, &poff, &plen); - if (plen == 0) - break; - - if ((from > poff && from < poff + plen) || - (to > poff && to < poff + plen)) { - status = iomap_read_page_sync(inode, block_start, page, - poff, plen, from, to, iomap); - if (status) - break; - } - - } while ((block_start += plen) < block_end); - - return status; -} - -static int -iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags, - struct page **pagep, struct iomap *iomap) -{ - const struct iomap_page_ops *page_ops = iomap->page_ops; - pgoff_t index = pos >> PAGE_SHIFT; - struct page *page; - int status = 0; - - BUG_ON(pos + len > iomap->offset + iomap->length); - - if (fatal_signal_pending(current)) - return -EINTR; - - if (page_ops && page_ops->page_prepare) { - status = page_ops->page_prepare(inode, pos, len, iomap); - if (status) - return status; - } - - page = grab_cache_page_write_begin(inode->i_mapping, index, flags); - if (!page) { - status = -ENOMEM; - goto out_no_page; - } - - if (iomap->type == IOMAP_INLINE) - iomap_read_inline_data(inode, page, iomap); - else if (iomap->flags & IOMAP_F_BUFFER_HEAD) 
- status = __block_write_begin_int(page, pos, len, NULL, iomap); - else - status = __iomap_write_begin(inode, pos, len, page, iomap); - - if (unlikely(status)) - goto out_unlock; - - *pagep = page; - return 0; - -out_unlock: - unlock_page(page); - put_page(page); - iomap_write_failed(inode, pos, len); - -out_no_page: - if (page_ops && page_ops->page_done) - page_ops->page_done(inode, pos, 0, NULL, iomap); - return status; -} - -int -iomap_set_page_dirty(struct page *page) -{ - struct address_space *mapping = page_mapping(page); - int newly_dirty; - - if (unlikely(!mapping)) - return !TestSetPageDirty(page); - - /* - * Lock out page->mem_cgroup migration to keep PageDirty - * synchronized with per-memcg dirty page counters. - */ - lock_page_memcg(page); - newly_dirty = !TestSetPageDirty(page); - if (newly_dirty) - __set_page_dirty(page, mapping, 0); - unlock_page_memcg(page); - - if (newly_dirty) - __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); - return newly_dirty; -} -EXPORT_SYMBOL_GPL(iomap_set_page_dirty); - -static int -__iomap_write_end(struct inode *inode, loff_t pos, unsigned len, - unsigned copied, struct page *page, struct iomap *iomap) -{ - flush_dcache_page(page); - - /* - * The blocks that were entirely written will now be uptodate, so we - * don't have to worry about a readpage reading them and overwriting a - * partial write. However if we have encountered a short write and only - * partially written into a block, it will not be marked uptodate, so a - * readpage might come in and destroy our partial write. - * - * Do the simplest thing, and just treat any short write to a non - * uptodate page as a zero-length write, and force the caller to redo - * the whole thing. - */ - if (unlikely(copied < len && !PageUptodate(page))) - return 0; - iomap_set_range_uptodate(page, offset_in_page(pos), len); - iomap_set_page_dirty(page); - return copied; -} - -static int -iomap_write_end_inline(struct inode *inode, struct page *page, - struct iomap *iomap, loff_t pos, unsigned copied) -{ - void *addr; - - WARN_ON_ONCE(!PageUptodate(page)); - BUG_ON(pos + copied > PAGE_SIZE - offset_in_page(iomap->inline_data)); - - addr = kmap_atomic(page); - memcpy(iomap->inline_data + pos, addr + pos, copied); - kunmap_atomic(addr); - - mark_inode_dirty(inode); - return copied; -} - -static int -iomap_write_end(struct inode *inode, loff_t pos, unsigned len, - unsigned copied, struct page *page, struct iomap *iomap) -{ - const struct iomap_page_ops *page_ops = iomap->page_ops; - loff_t old_size = inode->i_size; - int ret; - - if (iomap->type == IOMAP_INLINE) { - ret = iomap_write_end_inline(inode, page, iomap, pos, copied); - } else if (iomap->flags & IOMAP_F_BUFFER_HEAD) { - ret = block_write_end(NULL, inode->i_mapping, pos, len, copied, - page, NULL); - } else { - ret = __iomap_write_end(inode, pos, len, copied, page, iomap); - } - - /* - * Update the in-memory inode size after copying the data into the page - * cache. It's up to the file system to write the updated size to disk, - * preferably after I/O completion so that no stale data is exposed. 
- */ - if (pos + ret > old_size) { - i_size_write(inode, pos + ret); - iomap->flags |= IOMAP_F_SIZE_CHANGED; - } - unlock_page(page); - - if (old_size < pos) - pagecache_isize_extended(inode, old_size, pos); - if (page_ops && page_ops->page_done) - page_ops->page_done(inode, pos, ret, page, iomap); - put_page(page); - - if (ret < len) - iomap_write_failed(inode, pos, len); - return ret; -} - -static loff_t -iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data, - struct iomap *iomap) -{ - struct iov_iter *i = data; - long status = 0; - ssize_t written = 0; - unsigned int flags = AOP_FLAG_NOFS; - - do { - struct page *page; - unsigned long offset; /* Offset into pagecache page */ - unsigned long bytes; /* Bytes to write to page */ - size_t copied; /* Bytes copied from user */ - - offset = offset_in_page(pos); - bytes = min_t(unsigned long, PAGE_SIZE - offset, - iov_iter_count(i)); -again: - if (bytes > length) - bytes = length; - - /* - * Bring in the user page that we will copy from _first_. - * Otherwise there's a nasty deadlock on copying from the - * same page as we're writing to, without it being marked - * up-to-date. - * - * Not only is this an optimisation, but it is also required - * to check that the address is actually valid, when atomic - * usercopies are used, below. - */ - if (unlikely(iov_iter_fault_in_readable(i, bytes))) { - status = -EFAULT; - break; - } - - status = iomap_write_begin(inode, pos, bytes, flags, &page, - iomap); - if (unlikely(status)) - break; - - if (mapping_writably_mapped(inode->i_mapping)) - flush_dcache_page(page); - - copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes); - - flush_dcache_page(page); - - status = iomap_write_end(inode, pos, bytes, copied, page, - iomap); - if (unlikely(status < 0)) - break; - copied = status; - - cond_resched(); - - iov_iter_advance(i, copied); - if (unlikely(copied == 0)) { - /* - * If we were unable to copy any data at all, we must - * fall back to a single segment length write. - * - * If we didn't fallback here, we could livelock - * because not all segments in the iov can be copied at - * once without a pagefault. - */ - bytes = min_t(unsigned long, PAGE_SIZE - offset, - iov_iter_single_seg_count(i)); - goto again; - } - pos += copied; - written += copied; - length -= copied; - - balance_dirty_pages_ratelimited(inode->i_mapping); - } while (iov_iter_count(i) && length); - - return written ? written : status; -} - -ssize_t -iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *iter, - const struct iomap_ops *ops) -{ - struct inode *inode = iocb->ki_filp->f_mapping->host; - loff_t pos = iocb->ki_pos, ret = 0, written = 0; - - while (iov_iter_count(iter)) { - ret = iomap_apply(inode, pos, iov_iter_count(iter), - IOMAP_WRITE, ops, iter, iomap_write_actor); - if (ret <= 0) - break; - pos += ret; - written += ret; - } - - return written ? 
written : ret; -} -EXPORT_SYMBOL_GPL(iomap_file_buffered_write); - -static struct page * -__iomap_read_page(struct inode *inode, loff_t offset) -{ - struct address_space *mapping = inode->i_mapping; - struct page *page; - - page = read_mapping_page(mapping, offset >> PAGE_SHIFT, NULL); - if (IS_ERR(page)) - return page; - if (!PageUptodate(page)) { - put_page(page); - return ERR_PTR(-EIO); - } - return page; -} - -static loff_t -iomap_dirty_actor(struct inode *inode, loff_t pos, loff_t length, void *data, - struct iomap *iomap) -{ - long status = 0; - ssize_t written = 0; - - do { - struct page *page, *rpage; - unsigned long offset; /* Offset into pagecache page */ - unsigned long bytes; /* Bytes to write to page */ - - offset = offset_in_page(pos); - bytes = min_t(loff_t, PAGE_SIZE - offset, length); - - rpage = __iomap_read_page(inode, pos); - if (IS_ERR(rpage)) - return PTR_ERR(rpage); - - status = iomap_write_begin(inode, pos, bytes, - AOP_FLAG_NOFS, &page, iomap); - put_page(rpage); - if (unlikely(status)) - return status; - - WARN_ON_ONCE(!PageUptodate(page)); - - status = iomap_write_end(inode, pos, bytes, bytes, page, iomap); - if (unlikely(status <= 0)) { - if (WARN_ON_ONCE(status == 0)) - return -EIO; - return status; - } - - cond_resched(); - - pos += status; - written += status; - length -= status; - - balance_dirty_pages_ratelimited(inode->i_mapping); - } while (length); - - return written; -} - -int -iomap_file_dirty(struct inode *inode, loff_t pos, loff_t len, - const struct iomap_ops *ops) -{ - loff_t ret; - - while (len) { - ret = iomap_apply(inode, pos, len, IOMAP_WRITE, ops, NULL, - iomap_dirty_actor); - if (ret <= 0) - return ret; - pos += ret; - len -= ret; - } - - return 0; -} -EXPORT_SYMBOL_GPL(iomap_file_dirty); - -static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset, - unsigned bytes, struct iomap *iomap) -{ - struct page *page; - int status; - - status = iomap_write_begin(inode, pos, bytes, AOP_FLAG_NOFS, &page, - iomap); - if (status) - return status; - - zero_user(page, offset, bytes); - mark_page_accessed(page); - - return iomap_write_end(inode, pos, bytes, bytes, page, iomap); -} - -static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes, - struct iomap *iomap) -{ - return __dax_zero_page_range(iomap->bdev, iomap->dax_dev, - iomap_sector(iomap, pos & PAGE_MASK), offset, bytes); -} - -static loff_t -iomap_zero_range_actor(struct inode *inode, loff_t pos, loff_t count, - void *data, struct iomap *iomap) -{ - bool *did_zero = data; - loff_t written = 0; - int status; - - /* already zeroed? we're done. 
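/*
 * The zeroing machinery that begins here is typically driven from a
 * filesystem's truncate/setattr path, so a shrinking truncate cannot leave
 * stale bytes in the sub-block tail for later mmap reads.  Roughly, reusing
 * the assumed myfs_iomap_ops from above:
 */
static int myfs_shrink(struct inode *inode, loff_t newsize)
{
        bool did_zero = false;
        int error;

        /* zero from the new EOF to the end of its block */
        error = iomap_truncate_page(inode, newsize, &did_zero,
                        &myfs_iomap_ops);
        if (error)
                return error;

        truncate_setsize(inode, newsize);
        return 0;
}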
*/ - if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN) - return count; - - do { - unsigned offset, bytes; - - offset = offset_in_page(pos); - bytes = min_t(loff_t, PAGE_SIZE - offset, count); - - if (IS_DAX(inode)) - status = iomap_dax_zero(pos, offset, bytes, iomap); - else - status = iomap_zero(inode, pos, offset, bytes, iomap); - if (status < 0) - return status; - - pos += bytes; - count -= bytes; - written += bytes; - if (did_zero) - *did_zero = true; - } while (count > 0); - - return written; -} - -int -iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, - const struct iomap_ops *ops) -{ - loff_t ret; - - while (len > 0) { - ret = iomap_apply(inode, pos, len, IOMAP_ZERO, - ops, did_zero, iomap_zero_range_actor); - if (ret <= 0) - return ret; - - pos += ret; - len -= ret; - } - - return 0; -} -EXPORT_SYMBOL_GPL(iomap_zero_range); - -int -iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, - const struct iomap_ops *ops) -{ - unsigned int blocksize = i_blocksize(inode); - unsigned int off = pos & (blocksize - 1); - - /* Block boundary? Nothing to do */ - if (!off) - return 0; - return iomap_zero_range(inode, pos, blocksize - off, did_zero, ops); -} -EXPORT_SYMBOL_GPL(iomap_truncate_page); - -static loff_t -iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length, - void *data, struct iomap *iomap) -{ - struct page *page = data; - int ret; - - if (iomap->flags & IOMAP_F_BUFFER_HEAD) { - ret = __block_write_begin_int(page, pos, length, NULL, iomap); - if (ret) - return ret; - block_commit_write(page, 0, length); - } else { - WARN_ON_ONCE(!PageUptodate(page)); - iomap_page_create(inode, page); - set_page_dirty(page); - } - - return length; -} - -vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops) -{ - struct page *page = vmf->page; - struct inode *inode = file_inode(vmf->vma->vm_file); - unsigned long length; - loff_t offset, size; - ssize_t ret; - - lock_page(page); - size = i_size_read(inode); - if ((page->mapping != inode->i_mapping) || - (page_offset(page) > size)) { - /* We overload EFAULT to mean page got truncated */ - ret = -EFAULT; - goto out_unlock; - } - - /* page is wholly or partially inside EOF */ - if (((page->index + 1) << PAGE_SHIFT) > size) - length = offset_in_page(size); - else - length = PAGE_SIZE; - - offset = page_offset(page); - while (length > 0) { - ret = iomap_apply(inode, offset, length, - IOMAP_WRITE | IOMAP_FAULT, ops, page, - iomap_page_mkwrite_actor); - if (unlikely(ret <= 0)) - goto out_unlock; - offset += ret; - length -= ret; - } - - wait_for_stable_page(page); - return VM_FAULT_LOCKED; -out_unlock: - unlock_page(page); - return block_page_mkwrite_return(ret); -} -EXPORT_SYMBOL_GPL(iomap_page_mkwrite); - -struct fiemap_ctx { - struct fiemap_extent_info *fi; - struct iomap prev; -}; - -static int iomap_to_fiemap(struct fiemap_extent_info *fi, - struct iomap *iomap, u32 flags) -{ - switch (iomap->type) { - case IOMAP_HOLE: - /* skip holes */ - return 0; - case IOMAP_DELALLOC: - flags |= FIEMAP_EXTENT_DELALLOC | FIEMAP_EXTENT_UNKNOWN; - break; - case IOMAP_MAPPED: - break; - case IOMAP_UNWRITTEN: - flags |= FIEMAP_EXTENT_UNWRITTEN; - break; - case IOMAP_INLINE: - flags |= FIEMAP_EXTENT_DATA_INLINE; - break; - } - - if (iomap->flags & IOMAP_F_MERGED) - flags |= FIEMAP_EXTENT_MERGED; - if (iomap->flags & IOMAP_F_SHARED) - flags |= FIEMAP_EXTENT_SHARED; - - return fiemap_fill_next_extent(fi, iomap->offset, - iomap->addr != IOMAP_NULL_ADDR ? 
iomap->addr : 0, - iomap->length, flags); -} - -static loff_t -iomap_fiemap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, - struct iomap *iomap) -{ - struct fiemap_ctx *ctx = data; - loff_t ret = length; - - if (iomap->type == IOMAP_HOLE) - return length; - - ret = iomap_to_fiemap(ctx->fi, &ctx->prev, 0); - ctx->prev = *iomap; - switch (ret) { - case 0: /* success */ - return length; - case 1: /* extent array full */ - return 0; - default: - return ret; - } -} - -int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi, - loff_t start, loff_t len, const struct iomap_ops *ops) -{ - struct fiemap_ctx ctx; - loff_t ret; - - memset(&ctx, 0, sizeof(ctx)); - ctx.fi = fi; - ctx.prev.type = IOMAP_HOLE; - - ret = fiemap_check_flags(fi, FIEMAP_FLAG_SYNC); - if (ret) - return ret; - - if (fi->fi_flags & FIEMAP_FLAG_SYNC) { - ret = filemap_write_and_wait(inode->i_mapping); - if (ret) - return ret; - } - - while (len > 0) { - ret = iomap_apply(inode, start, len, IOMAP_REPORT, ops, &ctx, - iomap_fiemap_actor); - /* inode with no (attribute) mapping will give ENOENT */ - if (ret == -ENOENT) - break; - if (ret < 0) - return ret; - if (ret == 0) - break; - - start += ret; - len -= ret; - } - - if (ctx.prev.type != IOMAP_HOLE) { - ret = iomap_to_fiemap(fi, &ctx.prev, FIEMAP_EXTENT_LAST); - if (ret < 0) - return ret; - } - - return 0; -} -EXPORT_SYMBOL_GPL(iomap_fiemap); - -/* - * Seek for SEEK_DATA / SEEK_HOLE within @page, starting at @lastoff. - * Returns true if found and updates @lastoff to the offset in file. - */ -static bool -page_seek_hole_data(struct inode *inode, struct page *page, loff_t *lastoff, - int whence) -{ - const struct address_space_operations *ops = inode->i_mapping->a_ops; - unsigned int bsize = i_blocksize(inode), off; - bool seek_data = whence == SEEK_DATA; - loff_t poff = page_offset(page); - - if (WARN_ON_ONCE(*lastoff >= poff + PAGE_SIZE)) - return false; - - if (*lastoff < poff) { - /* - * Last offset smaller than the start of the page means we found - * a hole: - */ - if (whence == SEEK_HOLE) - return true; - *lastoff = poff; - } - - /* - * Just check the page unless we can and should check block ranges: - */ - if (bsize == PAGE_SIZE || !ops->is_partially_uptodate) - return PageUptodate(page) == seek_data; - - lock_page(page); - if (unlikely(page->mapping != inode->i_mapping)) - goto out_unlock_not_found; - - for (off = 0; off < PAGE_SIZE; off += bsize) { - if (offset_in_page(*lastoff) >= off + bsize) - continue; - if (ops->is_partially_uptodate(page, off, bsize) == seek_data) { - unlock_page(page); - return true; - } - *lastoff = poff + off + bsize; - } - -out_unlock_not_found: - unlock_page(page); - return false; -} - -/* - * Seek for SEEK_DATA / SEEK_HOLE in the page cache. - * - * Within unwritten extents, the page cache determines which parts are holes - * and which are data: uptodate buffer heads count as data; everything else - * counts as a hole. - * - * Returns the resulting offset on successs, and -ENOENT otherwise. 
- */ -static loff_t -page_cache_seek_hole_data(struct inode *inode, loff_t offset, loff_t length, - int whence) -{ - pgoff_t index = offset >> PAGE_SHIFT; - pgoff_t end = DIV_ROUND_UP(offset + length, PAGE_SIZE); - loff_t lastoff = offset; - struct pagevec pvec; - - if (length <= 0) - return -ENOENT; - - pagevec_init(&pvec); - - do { - unsigned nr_pages, i; - - nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping, &index, - end - 1); - if (nr_pages == 0) - break; - - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; - - if (page_seek_hole_data(inode, page, &lastoff, whence)) - goto check_range; - lastoff = page_offset(page) + PAGE_SIZE; - } - pagevec_release(&pvec); - } while (index < end); - - /* When no page at lastoff and we are not done, we found a hole. */ - if (whence != SEEK_HOLE) - goto not_found; - -check_range: - if (lastoff < offset + length) - goto out; -not_found: - lastoff = -ENOENT; -out: - pagevec_release(&pvec); - return lastoff; -} - - -static loff_t -iomap_seek_hole_actor(struct inode *inode, loff_t offset, loff_t length, - void *data, struct iomap *iomap) -{ - switch (iomap->type) { - case IOMAP_UNWRITTEN: - offset = page_cache_seek_hole_data(inode, offset, length, - SEEK_HOLE); - if (offset < 0) - return length; - /* fall through */ - case IOMAP_HOLE: - *(loff_t *)data = offset; - return 0; - default: - return length; - } -} - -loff_t -iomap_seek_hole(struct inode *inode, loff_t offset, const struct iomap_ops *ops) -{ - loff_t size = i_size_read(inode); - loff_t length = size - offset; - loff_t ret; - - /* Nothing to be found before or beyond the end of the file. */ - if (offset < 0 || offset >= size) - return -ENXIO; - - while (length > 0) { - ret = iomap_apply(inode, offset, length, IOMAP_REPORT, ops, - &offset, iomap_seek_hole_actor); - if (ret < 0) - return ret; - if (ret == 0) - break; - - offset += ret; - length -= ret; - } - - return offset; -} -EXPORT_SYMBOL_GPL(iomap_seek_hole); - -static loff_t -iomap_seek_data_actor(struct inode *inode, loff_t offset, loff_t length, - void *data, struct iomap *iomap) -{ - switch (iomap->type) { - case IOMAP_HOLE: - return length; - case IOMAP_UNWRITTEN: - offset = page_cache_seek_hole_data(inode, offset, length, - SEEK_DATA); - if (offset < 0) - return length; - /*FALLTHRU*/ - default: - *(loff_t *)data = offset; - return 0; - } -} - -loff_t -iomap_seek_data(struct inode *inode, loff_t offset, const struct iomap_ops *ops) -{ - loff_t size = i_size_read(inode); - loff_t length = size - offset; - loff_t ret; - - /* Nothing to be found before or beyond the end of the file. 
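/*
 * The net effect of these actors is the plain lseek(2) interface; unwritten
 * extents are classified via the page cache, so dirty, not-yet-written data
 * still counts as data.  Walking every data extent of an (assumed open) fd
 * from userspace, with _GNU_SOURCE and <unistd.h> for SEEK_DATA/SEEK_HOLE:
 */
        off_t data = 0, hole;

        while ((data = lseek(fd, data, SEEK_DATA)) >= 0) {
                hole = lseek(fd, data, SEEK_HOLE);
                printf("data [%lld, %lld)\n",
                        (long long)data, (long long)hole);
                data = hole;
        }
        /* lseek() fails with ENXIO once the offset reaches EOF */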
*/ - if (offset < 0 || offset >= size) - return -ENXIO; - - while (length > 0) { - ret = iomap_apply(inode, offset, length, IOMAP_REPORT, ops, - &offset, iomap_seek_data_actor); - if (ret < 0) - return ret; - if (ret == 0) - break; - - offset += ret; - length -= ret; - } - - if (length <= 0) - return -ENXIO; - return offset; -} -EXPORT_SYMBOL_GPL(iomap_seek_data); - -/* - * Private flags for iomap_dio, must not overlap with the public ones in - * iomap.h: - */ -#define IOMAP_DIO_WRITE_FUA (1 << 28) -#define IOMAP_DIO_NEED_SYNC (1 << 29) -#define IOMAP_DIO_WRITE (1 << 30) -#define IOMAP_DIO_DIRTY (1 << 31) - -struct iomap_dio { - struct kiocb *iocb; - iomap_dio_end_io_t *end_io; - loff_t i_size; - loff_t size; - atomic_t ref; - unsigned flags; - int error; - bool wait_for_completion; - - union { - /* used during submission and for synchronous completion: */ - struct { - struct iov_iter *iter; - struct task_struct *waiter; - struct request_queue *last_queue; - blk_qc_t cookie; - } submit; - - /* used for aio completion: */ - struct { - struct work_struct work; - } aio; - }; -}; - -int iomap_dio_iopoll(struct kiocb *kiocb, bool spin) -{ - struct request_queue *q = READ_ONCE(kiocb->private); - - if (!q) - return 0; - return blk_poll(q, READ_ONCE(kiocb->ki_cookie), spin); -} -EXPORT_SYMBOL_GPL(iomap_dio_iopoll); - -static void iomap_dio_submit_bio(struct iomap_dio *dio, struct iomap *iomap, - struct bio *bio) -{ - atomic_inc(&dio->ref); - - if (dio->iocb->ki_flags & IOCB_HIPRI) - bio_set_polled(bio, dio->iocb); - - dio->submit.last_queue = bdev_get_queue(iomap->bdev); - dio->submit.cookie = submit_bio(bio); -} - -static ssize_t iomap_dio_complete(struct iomap_dio *dio) -{ - struct kiocb *iocb = dio->iocb; - struct inode *inode = file_inode(iocb->ki_filp); - loff_t offset = iocb->ki_pos; - ssize_t ret; - - if (dio->end_io) { - ret = dio->end_io(iocb, - dio->error ? dio->error : dio->size, - dio->flags); - } else { - ret = dio->error; - } - - if (likely(!ret)) { - ret = dio->size; - /* check for short read */ - if (offset + ret > dio->i_size && - !(dio->flags & IOMAP_DIO_WRITE)) - ret = dio->i_size - offset; - iocb->ki_pos += ret; - } - - /* - * Try again to invalidate clean pages which might have been cached by - * non-direct readahead, or faulted in by get_user_pages() if the source - * of the write was an mmap'ed region of the file we're writing. Either - * one is a pretty crazy thing to do, so we don't support it 100%. If - * this invalidation fails, tough, the write still worked... - * - * And this page cache invalidation has to be after dio->end_io(), as - * some filesystems convert unwritten extents to real allocations in - * end_io() when necessary, otherwise a racing buffer read would cache - * zeros from unwritten extents. - */ - if (!dio->error && - (dio->flags & IOMAP_DIO_WRITE) && inode->i_mapping->nrpages) { - int err; - err = invalidate_inode_pages2_range(inode->i_mapping, - offset >> PAGE_SHIFT, - (offset + dio->size - 1) >> PAGE_SHIFT); - if (err) - dio_warn_stale_pagecache(iocb->ki_filp); - } - - /* - * If this is a DSYNC write, make sure we push it to stable storage now - * that we've written data. 
- */ - if (ret > 0 && (dio->flags & IOMAP_DIO_NEED_SYNC)) - ret = generic_write_sync(iocb, ret); - - inode_dio_end(file_inode(iocb->ki_filp)); - kfree(dio); - - return ret; -} - -static void iomap_dio_complete_work(struct work_struct *work) -{ - struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work); - struct kiocb *iocb = dio->iocb; - - iocb->ki_complete(iocb, iomap_dio_complete(dio), 0); -} - -/* - * Set an error in the dio if none is set yet. We have to use cmpxchg - * as the submission context and the completion context(s) can race to - * update the error. - */ -static inline void iomap_dio_set_error(struct iomap_dio *dio, int ret) -{ - cmpxchg(&dio->error, 0, ret); -} - -static void iomap_dio_bio_end_io(struct bio *bio) -{ - struct iomap_dio *dio = bio->bi_private; - bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY); - - if (bio->bi_status) - iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status)); - - if (atomic_dec_and_test(&dio->ref)) { - if (dio->wait_for_completion) { - struct task_struct *waiter = dio->submit.waiter; - WRITE_ONCE(dio->submit.waiter, NULL); - blk_wake_io_task(waiter); - } else if (dio->flags & IOMAP_DIO_WRITE) { - struct inode *inode = file_inode(dio->iocb->ki_filp); - - INIT_WORK(&dio->aio.work, iomap_dio_complete_work); - queue_work(inode->i_sb->s_dio_done_wq, &dio->aio.work); - } else { - iomap_dio_complete_work(&dio->aio.work); - } - } - - if (should_dirty) { - bio_check_pages_dirty(bio); - } else { - bio_release_pages(bio, false); - bio_put(bio); - } -} - -static void -iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos, - unsigned len) -{ - struct page *page = ZERO_PAGE(0); - int flags = REQ_SYNC | REQ_IDLE; - struct bio *bio; - - bio = bio_alloc(GFP_KERNEL, 1); - bio_set_dev(bio, iomap->bdev); - bio->bi_iter.bi_sector = iomap_sector(iomap, pos); - bio->bi_private = dio; - bio->bi_end_io = iomap_dio_bio_end_io; - - get_page(page); - __bio_add_page(bio, page, len, 0); - bio_set_op_attrs(bio, REQ_OP_WRITE, flags); - iomap_dio_submit_bio(dio, iomap, bio); -} - -static loff_t -iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, - struct iomap_dio *dio, struct iomap *iomap) -{ - unsigned int blkbits = blksize_bits(bdev_logical_block_size(iomap->bdev)); - unsigned int fs_block_size = i_blocksize(inode), pad; - unsigned int align = iov_iter_alignment(dio->submit.iter); - struct iov_iter iter; - struct bio *bio; - bool need_zeroout = false; - bool use_fua = false; - int nr_pages, ret = 0; - size_t copied = 0; - - if ((pos | length | align) & ((1 << blkbits) - 1)) - return -EINVAL; - - if (iomap->type == IOMAP_UNWRITTEN) { - dio->flags |= IOMAP_DIO_UNWRITTEN; - need_zeroout = true; - } - - if (iomap->flags & IOMAP_F_SHARED) - dio->flags |= IOMAP_DIO_COW; - - if (iomap->flags & IOMAP_F_NEW) { - need_zeroout = true; - } else if (iomap->type == IOMAP_MAPPED) { - /* - * Use a FUA write if we need datasync semantics, this is a pure - * data IO that doesn't require any metadata updates (including - * after IO completion such as unwritten extent conversion) and - * the underlying device supports FUA. This allows us to avoid - * cache flushes on IO completion. - */ - if (!(iomap->flags & (IOMAP_F_SHARED|IOMAP_F_DIRTY)) && - (dio->flags & IOMAP_DIO_WRITE_FUA) && - blk_queue_fua(bdev_get_queue(iomap->bdev))) - use_fua = true; - } - - /* - * Operate on a partial iter trimmed to the extent we were called for. - * We'll update the iter in the dio once we're done with this extent. 
- */ - iter = *dio->submit.iter; - iov_iter_truncate(&iter, length); - - nr_pages = iov_iter_npages(&iter, BIO_MAX_PAGES); - if (nr_pages <= 0) - return nr_pages; - - if (need_zeroout) { - /* zero out from the start of the block to the write offset */ - pad = pos & (fs_block_size - 1); - if (pad) - iomap_dio_zero(dio, iomap, pos - pad, pad); - } - - do { - size_t n; - if (dio->error) { - iov_iter_revert(dio->submit.iter, copied); - return 0; - } - - bio = bio_alloc(GFP_KERNEL, nr_pages); - bio_set_dev(bio, iomap->bdev); - bio->bi_iter.bi_sector = iomap_sector(iomap, pos); - bio->bi_write_hint = dio->iocb->ki_hint; - bio->bi_ioprio = dio->iocb->ki_ioprio; - bio->bi_private = dio; - bio->bi_end_io = iomap_dio_bio_end_io; - - ret = bio_iov_iter_get_pages(bio, &iter); - if (unlikely(ret)) { - /* - * We have to stop part way through an IO. We must fall - * through to the sub-block tail zeroing here, otherwise - * this short IO may expose stale data in the tail of - * the block we haven't written data to. - */ - bio_put(bio); - goto zero_tail; - } - - n = bio->bi_iter.bi_size; - if (dio->flags & IOMAP_DIO_WRITE) { - bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE; - if (use_fua) - bio->bi_opf |= REQ_FUA; - else - dio->flags &= ~IOMAP_DIO_WRITE_FUA; - task_io_account_write(n); - } else { - bio->bi_opf = REQ_OP_READ; - if (dio->flags & IOMAP_DIO_DIRTY) - bio_set_pages_dirty(bio); - } - - iov_iter_advance(dio->submit.iter, n); - - dio->size += n; - pos += n; - copied += n; - - nr_pages = iov_iter_npages(&iter, BIO_MAX_PAGES); - iomap_dio_submit_bio(dio, iomap, bio); - } while (nr_pages); - - /* - * We need to zeroout the tail of a sub-block write if the extent type - * requires zeroing or the write extends beyond EOF. If we don't zero - * the block tail in the latter case, we can expose stale data via mmap - * reads of the EOF block. - */ -zero_tail: - if (need_zeroout || - ((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode))) { - /* zero out from the end of the write to the end of the block */ - pad = pos & (fs_block_size - 1); - if (pad) - iomap_dio_zero(dio, iomap, pos, fs_block_size - pad); - } - return copied ? 
copied : ret; -} - -static loff_t -iomap_dio_hole_actor(loff_t length, struct iomap_dio *dio) -{ - length = iov_iter_zero(length, dio->submit.iter); - dio->size += length; - return length; -} - -static loff_t -iomap_dio_inline_actor(struct inode *inode, loff_t pos, loff_t length, - struct iomap_dio *dio, struct iomap *iomap) -{ - struct iov_iter *iter = dio->submit.iter; - size_t copied; - - BUG_ON(pos + length > PAGE_SIZE - offset_in_page(iomap->inline_data)); - - if (dio->flags & IOMAP_DIO_WRITE) { - loff_t size = inode->i_size; - - if (pos > size) - memset(iomap->inline_data + size, 0, pos - size); - copied = copy_from_iter(iomap->inline_data + pos, length, iter); - if (copied) { - if (pos + copied > size) - i_size_write(inode, pos + copied); - mark_inode_dirty(inode); - } - } else { - copied = copy_to_iter(iomap->inline_data + pos, length, iter); - } - dio->size += copied; - return copied; -} - -static loff_t -iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length, - void *data, struct iomap *iomap) -{ - struct iomap_dio *dio = data; - - switch (iomap->type) { - case IOMAP_HOLE: - if (WARN_ON_ONCE(dio->flags & IOMAP_DIO_WRITE)) - return -EIO; - return iomap_dio_hole_actor(length, dio); - case IOMAP_UNWRITTEN: - if (!(dio->flags & IOMAP_DIO_WRITE)) - return iomap_dio_hole_actor(length, dio); - return iomap_dio_bio_actor(inode, pos, length, dio, iomap); - case IOMAP_MAPPED: - return iomap_dio_bio_actor(inode, pos, length, dio, iomap); - case IOMAP_INLINE: - return iomap_dio_inline_actor(inode, pos, length, dio, iomap); - default: - WARN_ON_ONCE(1); - return -EIO; - } -} - -/* - * iomap_dio_rw() always completes O_[D]SYNC writes regardless of whether the IO - * is being issued as AIO or not. This allows us to optimise pure data writes - * to use REQ_FUA rather than requiring generic_write_sync() to issue a - * REQ_FLUSH post write. This is slightly tricky because a single request here - * can be mapped into multiple disjoint IOs and only a subset of the IOs issued - * may be pure data writes. In that case, we still need to do a full data sync - * completion. - */ -ssize_t -iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, - const struct iomap_ops *ops, iomap_dio_end_io_t end_io) -{ - struct address_space *mapping = iocb->ki_filp->f_mapping; - struct inode *inode = file_inode(iocb->ki_filp); - size_t count = iov_iter_count(iter); - loff_t pos = iocb->ki_pos, start = pos; - loff_t end = iocb->ki_pos + count - 1, ret = 0; - unsigned int flags = IOMAP_DIRECT; - bool wait_for_completion = is_sync_kiocb(iocb); - struct blk_plug plug; - struct iomap_dio *dio; - - lockdep_assert_held(&inode->i_rwsem); - - if (!count) - return 0; - - dio = kmalloc(sizeof(*dio), GFP_KERNEL); - if (!dio) - return -ENOMEM; - - dio->iocb = iocb; - atomic_set(&dio->ref, 1); - dio->size = 0; - dio->i_size = i_size_read(inode); - dio->end_io = end_io; - dio->error = 0; - dio->flags = 0; - - dio->submit.iter = iter; - dio->submit.waiter = current; - dio->submit.cookie = BLK_QC_T_NONE; - dio->submit.last_queue = NULL; - - if (iov_iter_rw(iter) == READ) { - if (pos >= dio->i_size) - goto out_free_dio; - - if (iter_is_iovec(iter) && iov_iter_rw(iter) == READ) - dio->flags |= IOMAP_DIO_DIRTY; - } else { - flags |= IOMAP_WRITE; - dio->flags |= IOMAP_DIO_WRITE; - - /* for data sync or sync, we need sync completion processing */ - if (iocb->ki_flags & IOCB_DSYNC) - dio->flags |= IOMAP_DIO_NEED_SYNC; - - /* - * For datasync only writes, we optimistically try using FUA for - * this IO. 
Any non-FUA write that occurs will clear this flag, - * hence we know before completion whether a cache flush is - * necessary. - */ - if ((iocb->ki_flags & (IOCB_DSYNC | IOCB_SYNC)) == IOCB_DSYNC) - dio->flags |= IOMAP_DIO_WRITE_FUA; - } - - if (iocb->ki_flags & IOCB_NOWAIT) { - if (filemap_range_has_page(mapping, start, end)) { - ret = -EAGAIN; - goto out_free_dio; - } - flags |= IOMAP_NOWAIT; - } - - ret = filemap_write_and_wait_range(mapping, start, end); - if (ret) - goto out_free_dio; - - /* - * Try to invalidate cache pages for the range we're direct - * writing. If this invalidation fails, tough, the write will - * still work, but racing two incompatible write paths is a - * pretty crazy thing to do, so we don't support it 100%. - */ - ret = invalidate_inode_pages2_range(mapping, - start >> PAGE_SHIFT, end >> PAGE_SHIFT); - if (ret) - dio_warn_stale_pagecache(iocb->ki_filp); - ret = 0; - - if (iov_iter_rw(iter) == WRITE && !wait_for_completion && - !inode->i_sb->s_dio_done_wq) { - ret = sb_init_dio_done_wq(inode->i_sb); - if (ret < 0) - goto out_free_dio; - } - - inode_dio_begin(inode); - - blk_start_plug(&plug); - do { - ret = iomap_apply(inode, pos, count, flags, ops, dio, - iomap_dio_actor); - if (ret <= 0) { - /* magic error code to fall back to buffered I/O */ - if (ret == -ENOTBLK) { - wait_for_completion = true; - ret = 0; - } - break; - } - pos += ret; - - if (iov_iter_rw(iter) == READ && pos >= dio->i_size) - break; - } while ((count = iov_iter_count(iter)) > 0); - blk_finish_plug(&plug); - - if (ret < 0) - iomap_dio_set_error(dio, ret); - - /* - * If all the writes we issued were FUA, we don't need to flush the - * cache on IO completion. Clear the sync flag for this case. - */ - if (dio->flags & IOMAP_DIO_WRITE_FUA) - dio->flags &= ~IOMAP_DIO_NEED_SYNC; - - WRITE_ONCE(iocb->ki_cookie, dio->submit.cookie); - WRITE_ONCE(iocb->private, dio->submit.last_queue); - - /* - * We are about to drop our additional submission reference, which - * might be the last reference to the dio. There are three - * different ways we can progress here: - * - * (a) If this is the last reference we will always complete and free - * the dio ourselves. - * (b) If this is not the last reference, and we serve an asynchronous - * iocb, we must never touch the dio after the decrement, the - * I/O completion handler will complete and free it. - * (c) If this is not the last reference, but we serve a synchronous - * iocb, the I/O completion handler will wake us up on the drop - * of the final reference, and we will complete and free it here - * after we got woken by the I/O completion handler. 
- */ - dio->wait_for_completion = wait_for_completion; - if (!atomic_dec_and_test(&dio->ref)) { - if (!wait_for_completion) - return -EIOCBQUEUED; - - for (;;) { - set_current_state(TASK_UNINTERRUPTIBLE); - if (!READ_ONCE(dio->submit.waiter)) - break; - - if (!(iocb->ki_flags & IOCB_HIPRI) || - !dio->submit.last_queue || - !blk_poll(dio->submit.last_queue, - dio->submit.cookie, true)) - io_schedule(); - } - __set_current_state(TASK_RUNNING); - } - - return iomap_dio_complete(dio); - -out_free_dio: - kfree(dio); - return ret; -} -EXPORT_SYMBOL_GPL(iomap_dio_rw); - -/* Swapfile activation */ - -#ifdef CONFIG_SWAP -struct iomap_swapfile_info { - struct iomap iomap; /* accumulated iomap */ - struct swap_info_struct *sis; - uint64_t lowest_ppage; /* lowest physical addr seen (pages) */ - uint64_t highest_ppage; /* highest physical addr seen (pages) */ - unsigned long nr_pages; /* number of pages collected */ - int nr_extents; /* extent count */ -}; - -/* - * Collect physical extents for this swap file. Physical extents reported to - * the swap code must be trimmed to align to a page boundary. The logical - * offset within the file is irrelevant since the swapfile code maps logical - * page numbers of the swap device to the physical page-aligned extents. - */ -static int iomap_swapfile_add_extent(struct iomap_swapfile_info *isi) -{ - struct iomap *iomap = &isi->iomap; - unsigned long nr_pages; - uint64_t first_ppage; - uint64_t first_ppage_reported; - uint64_t next_ppage; - int error; - - /* - * Round the start up and the end down so that the physical - * extent aligns to a page boundary. - */ - first_ppage = ALIGN(iomap->addr, PAGE_SIZE) >> PAGE_SHIFT; - next_ppage = ALIGN_DOWN(iomap->addr + iomap->length, PAGE_SIZE) >> - PAGE_SHIFT; - - /* Skip too-short physical extents. */ - if (first_ppage >= next_ppage) - return 0; - nr_pages = next_ppage - first_ppage; - - /* - * Calculate how much swap space we're adding; the first page contains - * the swap header and doesn't count. The mm still wants that first - * page fed to add_swap_extent, however. - */ - first_ppage_reported = first_ppage; - if (iomap->offset == 0) - first_ppage_reported++; - if (isi->lowest_ppage > first_ppage_reported) - isi->lowest_ppage = first_ppage_reported; - if (isi->highest_ppage < (next_ppage - 1)) - isi->highest_ppage = next_ppage - 1; - - /* Add extent, set up for the next call. */ - error = add_swap_extent(isi->sis, isi->nr_pages, nr_pages, first_ppage); - if (error < 0) - return error; - isi->nr_extents += error; - isi->nr_pages += nr_pages; - return 0; -} - -/* - * Accumulate iomaps for this swap file. We have to accumulate iomaps because - * swap only cares about contiguous page-aligned physical extents and makes no - * distinction between written and unwritten extents. - */ -static loff_t iomap_swapfile_activate_actor(struct inode *inode, loff_t pos, - loff_t count, void *data, struct iomap *iomap) -{ - struct iomap_swapfile_info *isi = data; - int error; - - switch (iomap->type) { - case IOMAP_MAPPED: - case IOMAP_UNWRITTEN: - /* Only real or unwritten extents. */ - break; - case IOMAP_INLINE: - /* No inline data. */ - pr_err("swapon: file is inline\n"); - return -EINVAL; - default: - pr_err("swapon: file has unallocated extents\n"); - return -EINVAL; - } - - /* No uncommitted metadata or shared blocks. 
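To make the rounding in iomap_swapfile_add_extent() concrete, here is a worked example with assumed values (PAGE_SIZE of 4096, and an extent that neither starts nor ends page-aligned; not from this diff):

    u64 addr = 0x1200;     /* physical byte address of the extent */
    u64 length = 0x3000;   /* extent length in bytes */

    u64 first_ppage = ALIGN(addr, PAGE_SIZE) >> PAGE_SHIFT;              /* 0x2000 >> 12 = 2 */
    u64 next_ppage = ALIGN_DOWN(addr + length, PAGE_SIZE) >> PAGE_SHIFT; /* 0x4000 >> 12 = 4 */
    /* pages 2 and 3 are usable; the partial pages at both ends are dropped */

Rounding the start up and the end down is what guarantees only whole physical pages are ever handed to add_swap_extent().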
*/ - if (iomap->flags & IOMAP_F_DIRTY) { - pr_err("swapon: file is not committed\n"); - return -EINVAL; - } - if (iomap->flags & IOMAP_F_SHARED) { - pr_err("swapon: file has shared extents\n"); - return -EINVAL; - } - - /* Only one bdev per swap file. */ - if (iomap->bdev != isi->sis->bdev) { - pr_err("swapon: file is on multiple devices\n"); - return -EINVAL; - } - - if (isi->iomap.length == 0) { - /* No accumulated extent, so just store it. */ - memcpy(&isi->iomap, iomap, sizeof(isi->iomap)); - } else if (isi->iomap.addr + isi->iomap.length == iomap->addr) { - /* Append this to the accumulated extent. */ - isi->iomap.length += iomap->length; - } else { - /* Otherwise, add the retained iomap and store this one. */ - error = iomap_swapfile_add_extent(isi); - if (error) - return error; - memcpy(&isi->iomap, iomap, sizeof(isi->iomap)); - } - return count; -} - -/* - * Iterate a swap file's iomaps to construct physical extents that can be - * passed to the swapfile subsystem. - */ -int iomap_swapfile_activate(struct swap_info_struct *sis, - struct file *swap_file, sector_t *pagespan, - const struct iomap_ops *ops) -{ - struct iomap_swapfile_info isi = { - .sis = sis, - .lowest_ppage = (sector_t)-1ULL, - }; - struct address_space *mapping = swap_file->f_mapping; - struct inode *inode = mapping->host; - loff_t pos = 0; - loff_t len = ALIGN_DOWN(i_size_read(inode), PAGE_SIZE); - loff_t ret; - - /* - * Persist all file mapping metadata so that we won't have any - * IOMAP_F_DIRTY iomaps. - */ - ret = vfs_fsync(swap_file, 1); - if (ret) - return ret; - - while (len > 0) { - ret = iomap_apply(inode, pos, len, IOMAP_REPORT, - ops, &isi, iomap_swapfile_activate_actor); - if (ret <= 0) - return ret; - - pos += ret; - len -= ret; - } - - if (isi.iomap.length) { - ret = iomap_swapfile_add_extent(&isi); - if (ret) - return ret; - } - - *pagespan = 1 + isi.highest_ppage - isi.lowest_ppage; - sis->max = isi.nr_pages; - sis->pages = isi.nr_pages - 1; - sis->highest_bit = isi.nr_pages - 1; - return isi.nr_extents; -} -EXPORT_SYMBOL_GPL(iomap_swapfile_activate); -#endif /* CONFIG_SWAP */ - -static loff_t -iomap_bmap_actor(struct inode *inode, loff_t pos, loff_t length, - void *data, struct iomap *iomap) -{ - sector_t *bno = data, addr; - - if (iomap->type == IOMAP_MAPPED) { - addr = (pos - iomap->offset + iomap->addr) >> inode->i_blkbits; - if (addr > INT_MAX) - WARN(1, "would truncate bmap result\n"); - else - *bno = addr; - } - return 0; -} - -/* legacy ->bmap interface. 0 is the error return (!) */ -sector_t -iomap_bmap(struct address_space *mapping, sector_t bno, - const struct iomap_ops *ops) -{ - struct inode *inode = mapping->host; - loff_t pos = bno << inode->i_blkbits; - unsigned blocksize = i_blocksize(inode); - - if (filemap_write_and_wait(mapping)) - return 0; - - bno = 0; - iomap_apply(inode, pos, blocksize, 0, ops, &bno, iomap_bmap_actor); - return bno; -} -EXPORT_SYMBOL_GPL(iomap_bmap); diff --git a/fs/iomap/Makefile b/fs/iomap/Makefile new file mode 100644 index 000000000000..2d165388d952 --- /dev/null +++ b/fs/iomap/Makefile @@ -0,0 +1,15 @@ +# SPDX-License-Identifier: GPL-2.0-or-later +# +# Copyright (c) 2019 Oracle. +# All Rights Reserved. 
+# +obj-$(CONFIG_FS_IOMAP) += iomap.o + +iomap-y += \ + apply.o \ + buffered-io.o \ + direct-io.o \ + fiemap.o \ + seek.o + +iomap-$(CONFIG_SWAP) += swapfile.o diff --git a/fs/iomap/apply.c b/fs/iomap/apply.c new file mode 100644 index 000000000000..54c02aecf3cd --- /dev/null +++ b/fs/iomap/apply.c @@ -0,0 +1,74 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2010 Red Hat, Inc. + * Copyright (c) 2016-2018 Christoph Hellwig. + */ +#include <linux/module.h> +#include <linux/compiler.h> +#include <linux/fs.h> +#include <linux/iomap.h> + +/* + * Execute an iomap write on a segment of the mapping that spans a + * contiguous range of pages that have identical block mapping state. + * + * This avoids the need to map pages individually, do individual allocations + * for each page and most importantly avoid the need for filesystem specific + * locking per page. Instead, all the operations are amortised over the entire + * range of pages. It is assumed that the filesystems will lock whatever + * resources they require in the iomap_begin call, and release them in the + * iomap_end call. + */ +loff_t +iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags, + const struct iomap_ops *ops, void *data, iomap_actor_t actor) +{ + struct iomap iomap = { 0 }; + loff_t written = 0, ret; + + /* + * Need to map a range from start position for length bytes. This can + * span multiple pages - it is only guaranteed to return a range of a + * single type of pages (e.g. all into a hole, all mapped or all + * unwritten). Failure at this point has nothing to undo. + * + * If allocation is required for this range, reserve the space now so + * that the allocation is guaranteed to succeed later on. Once we copy + * the data into the page cache pages, then we cannot fail otherwise we + * expose transient stale data. If the reserve fails, we can safely + * back out at this point as there is nothing to undo. + */ + ret = ops->iomap_begin(inode, pos, length, flags, &iomap); + if (ret) + return ret; + if (WARN_ON(iomap.offset > pos)) + return -EIO; + if (WARN_ON(iomap.length == 0)) + return -EIO; + + /* + * Cut down the length to the one actually provided by the filesystem, + * as it might not be able to give us the whole size that we requested. + */ + if (iomap.offset + iomap.length < pos + length) + length = iomap.offset + iomap.length - pos; + + /* + * Now that we have guaranteed that the space allocation will succeed, + * we can do the copy-in page by page without having to worry about + * failures exposing transient data. + */ + written = actor(inode, pos, length, data, &iomap); + + /* + * Now the data has been copied, commit the range we've copied. This + * should not fail unless the filesystem has had a fatal error. + */ + if (ops->iomap_end) { + ret = ops->iomap_end(inode, pos, length, + written > 0 ? written : 0, + flags, &iomap); + } + + return written ? written : ret; +} diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c new file mode 100644 index 000000000000..e25901ae3ff4 --- /dev/null +++ b/fs/iomap/buffered-io.c @@ -0,0 +1,1073 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2010 Red Hat, Inc. + * Copyright (c) 2016-2018 Christoph Hellwig. 
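Everything else in this series is built on the loop-plus-actor pattern around iomap_apply(): the caller loops until its byte range is consumed, and the actor sees exactly one mapping per call. A minimal sketch of a hypothetical caller (the myfs_* names and myfs_iomap_ops are placeholders, not part of this diff):

    static loff_t
    myfs_count_mapped_actor(struct inode *inode, loff_t pos, loff_t length,
            void *data, struct iomap *iomap)
    {
        loff_t *bytes_mapped = data;

        if (iomap->type == IOMAP_MAPPED)
            *bytes_mapped += length;
        return length;    /* consume the whole extent, keep iterating */
    }

    static loff_t myfs_count_mapped(struct inode *inode, loff_t pos, loff_t len)
    {
        loff_t mapped = 0, ret;

        while (len > 0) {
            ret = iomap_apply(inode, pos, len, IOMAP_REPORT,
                    &myfs_iomap_ops, &mapped, myfs_count_mapped_actor);
            if (ret <= 0)
                return ret;
            pos += ret;
            len -= ret;
        }
        return mapped;
    }

Returning the full length from the actor consumes the extent; returning a short count hands control back to the caller early, which is how the real actors bail out on errors.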
+ */ +#include <linux/module.h> +#include <linux/compiler.h> +#include <linux/fs.h> +#include <linux/iomap.h> +#include <linux/pagemap.h> +#include <linux/uio.h> +#include <linux/buffer_head.h> +#include <linux/dax.h> +#include <linux/writeback.h> +#include <linux/swap.h> +#include <linux/bio.h> +#include <linux/sched/signal.h> +#include <linux/migrate.h> + +#include "../internal.h" + +static struct iomap_page * +iomap_page_create(struct inode *inode, struct page *page) +{ + struct iomap_page *iop = to_iomap_page(page); + + if (iop || i_blocksize(inode) == PAGE_SIZE) + return iop; + + iop = kmalloc(sizeof(*iop), GFP_NOFS | __GFP_NOFAIL); + atomic_set(&iop->read_count, 0); + atomic_set(&iop->write_count, 0); + bitmap_zero(iop->uptodate, PAGE_SIZE / SECTOR_SIZE); + + /* + * migrate_page_move_mapping() assumes that pages with private data have + * their count elevated by 1. + */ + get_page(page); + set_page_private(page, (unsigned long)iop); + SetPagePrivate(page); + return iop; +} + +static void +iomap_page_release(struct page *page) +{ + struct iomap_page *iop = to_iomap_page(page); + + if (!iop) + return; + WARN_ON_ONCE(atomic_read(&iop->read_count)); + WARN_ON_ONCE(atomic_read(&iop->write_count)); + ClearPagePrivate(page); + set_page_private(page, 0); + put_page(page); + kfree(iop); +} + +/* + * Calculate the range inside the page that we actually need to read. + */ +static void +iomap_adjust_read_range(struct inode *inode, struct iomap_page *iop, + loff_t *pos, loff_t length, unsigned *offp, unsigned *lenp) +{ + loff_t orig_pos = *pos; + loff_t isize = i_size_read(inode); + unsigned block_bits = inode->i_blkbits; + unsigned block_size = (1 << block_bits); + unsigned poff = offset_in_page(*pos); + unsigned plen = min_t(loff_t, PAGE_SIZE - poff, length); + unsigned first = poff >> block_bits; + unsigned last = (poff + plen - 1) >> block_bits; + + /* + * If the block size is smaller than the page size we need to check the + * per-block uptodate status and adjust the offset and length if needed + * to avoid reading in already uptodate ranges. + */ + if (iop) { + unsigned int i; + + /* move forward for each leading block marked uptodate */ + for (i = first; i <= last; i++) { + if (!test_bit(i, iop->uptodate)) + break; + *pos += block_size; + poff += block_size; + plen -= block_size; + first++; + } + + /* truncate len if we find any trailing uptodate block(s) */ + for ( ; i <= last; i++) { + if (test_bit(i, iop->uptodate)) { + plen -= (last - i + 1) * block_size; + last = i - 1; + break; + } + } + } + + /* + * If the extent spans the block that contains the i_size we need to + * handle both halves separately so that we properly zero data in the + * page cache for blocks that are entirely outside of i_size. 
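For sub-page block sizes the first/last arithmetic above works in units of blocks within the page. A worked example with assumed values (1k blocks on a 4k page, so block_bits = 10):

    loff_t pos = 5632;                        /* 1536 bytes into the second page */
    unsigned poff = offset_in_page(pos);      /* 1536 */
    unsigned plen = 2048;
    unsigned first = poff >> 10;              /* block 1 */
    unsigned last = (poff + plen - 1) >> 10;  /* 3583 >> 10 = block 3 */

Blocks 1 through 3 are then trimmed further against the per-page uptodate bitmap, so a read never re-fetches ranges the page already holds.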
+ */ + if (orig_pos <= isize && orig_pos + length > isize) { + unsigned end = offset_in_page(isize - 1) >> block_bits; + + if (first <= end && last > end) + plen -= (last - end) * block_size; + } + + *offp = poff; + *lenp = plen; +} + +static void +iomap_set_range_uptodate(struct page *page, unsigned off, unsigned len) +{ + struct iomap_page *iop = to_iomap_page(page); + struct inode *inode = page->mapping->host; + unsigned first = off >> inode->i_blkbits; + unsigned last = (off + len - 1) >> inode->i_blkbits; + unsigned int i; + bool uptodate = true; + + if (iop) { + for (i = 0; i < PAGE_SIZE / i_blocksize(inode); i++) { + if (i >= first && i <= last) + set_bit(i, iop->uptodate); + else if (!test_bit(i, iop->uptodate)) + uptodate = false; + } + } + + if (uptodate && !PageError(page)) + SetPageUptodate(page); +} + +static void +iomap_read_finish(struct iomap_page *iop, struct page *page) +{ + if (!iop || atomic_dec_and_test(&iop->read_count)) + unlock_page(page); +} + +static void +iomap_read_page_end_io(struct bio_vec *bvec, int error) +{ + struct page *page = bvec->bv_page; + struct iomap_page *iop = to_iomap_page(page); + + if (unlikely(error)) { + ClearPageUptodate(page); + SetPageError(page); + } else { + iomap_set_range_uptodate(page, bvec->bv_offset, bvec->bv_len); + } + + iomap_read_finish(iop, page); +} + +static void +iomap_read_end_io(struct bio *bio) +{ + int error = blk_status_to_errno(bio->bi_status); + struct bio_vec *bvec; + struct bvec_iter_all iter_all; + + bio_for_each_segment_all(bvec, bio, iter_all) + iomap_read_page_end_io(bvec, error); + bio_put(bio); +} + +struct iomap_readpage_ctx { + struct page *cur_page; + bool cur_page_in_bio; + bool is_readahead; + struct bio *bio; + struct list_head *pages; +}; + +static void +iomap_read_inline_data(struct inode *inode, struct page *page, + struct iomap *iomap) +{ + size_t size = i_size_read(inode); + void *addr; + + if (PageUptodate(page)) + return; + + BUG_ON(page->index); + BUG_ON(size > PAGE_SIZE - offset_in_page(iomap->inline_data)); + + addr = kmap_atomic(page); + memcpy(addr, iomap->inline_data, size); + memset(addr + size, 0, PAGE_SIZE - size); + kunmap_atomic(addr); + SetPageUptodate(page); +} + +static loff_t +iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data, + struct iomap *iomap) +{ + struct iomap_readpage_ctx *ctx = data; + struct page *page = ctx->cur_page; + struct iomap_page *iop = iomap_page_create(inode, page); + bool same_page = false, is_contig = false; + loff_t orig_pos = pos; + unsigned poff, plen; + sector_t sector; + + if (iomap->type == IOMAP_INLINE) { + WARN_ON_ONCE(pos); + iomap_read_inline_data(inode, page, iomap); + return PAGE_SIZE; + } + + /* zero post-eof blocks as the page may be mapped */ + iomap_adjust_read_range(inode, iop, &pos, length, &poff, &plen); + if (plen == 0) + goto done; + + if (iomap->type != IOMAP_MAPPED || pos >= i_size_read(inode)) { + zero_user(page, poff, plen); + iomap_set_range_uptodate(page, poff, plen); + goto done; + } + + ctx->cur_page_in_bio = true; + + /* + * Try to merge into a previous segment if we can. 
+ */ + sector = iomap_sector(iomap, pos); + if (ctx->bio && bio_end_sector(ctx->bio) == sector) + is_contig = true; + + if (is_contig && + __bio_try_merge_page(ctx->bio, page, plen, poff, &same_page)) { + if (!same_page && iop) + atomic_inc(&iop->read_count); + goto done; + } + + /* + * If we start a new segment we need to increase the read count, and we + * need to do so before submitting any previous full bio to make sure + * that we don't prematurely unlock the page. + */ + if (iop) + atomic_inc(&iop->read_count); + + if (!ctx->bio || !is_contig || bio_full(ctx->bio, plen)) { + gfp_t gfp = mapping_gfp_constraint(page->mapping, GFP_KERNEL); + int nr_vecs = (length + PAGE_SIZE - 1) >> PAGE_SHIFT; + + if (ctx->bio) + submit_bio(ctx->bio); + + if (ctx->is_readahead) /* same as readahead_gfp_mask */ + gfp |= __GFP_NORETRY | __GFP_NOWARN; + ctx->bio = bio_alloc(gfp, min(BIO_MAX_PAGES, nr_vecs)); + ctx->bio->bi_opf = REQ_OP_READ; + if (ctx->is_readahead) + ctx->bio->bi_opf |= REQ_RAHEAD; + ctx->bio->bi_iter.bi_sector = sector; + bio_set_dev(ctx->bio, iomap->bdev); + ctx->bio->bi_end_io = iomap_read_end_io; + } + + bio_add_page(ctx->bio, page, plen, poff); +done: + /* + * Move the caller beyond our range so that it keeps making progress. + * For that we have to include any leading non-uptodate ranges, but + * we can skip trailing ones as they will be handled in the next + * iteration. + */ + return pos - orig_pos + plen; +} + +int +iomap_readpage(struct page *page, const struct iomap_ops *ops) +{ + struct iomap_readpage_ctx ctx = { .cur_page = page }; + struct inode *inode = page->mapping->host; + unsigned poff; + loff_t ret; + + for (poff = 0; poff < PAGE_SIZE; poff += ret) { + ret = iomap_apply(inode, page_offset(page) + poff, + PAGE_SIZE - poff, 0, ops, &ctx, + iomap_readpage_actor); + if (ret <= 0) { + WARN_ON_ONCE(ret == 0); + SetPageError(page); + break; + } + } + + if (ctx.bio) { + submit_bio(ctx.bio); + WARN_ON_ONCE(!ctx.cur_page_in_bio); + } else { + WARN_ON_ONCE(ctx.cur_page_in_bio); + unlock_page(page); + } + + /* + * Just like mpage_readpages and block_read_full_page we always + * return 0 and just mark the page as PageError on errors. This + * should be cleaned up all through the stack eventually. + */ + return 0; +} +EXPORT_SYMBOL_GPL(iomap_readpage); + +static struct page * +iomap_next_page(struct inode *inode, struct list_head *pages, loff_t pos, + loff_t length, loff_t *done) +{ + while (!list_empty(pages)) { + struct page *page = lru_to_page(pages); + + if (page_offset(page) >= (u64)pos + length) + break; + + list_del(&page->lru); + if (!add_to_page_cache_lru(page, inode->i_mapping, page->index, + GFP_NOFS)) + return page; + + /* + * If we already have a page in the page cache at index we are + * done. Upper layers don't care if it is uptodate after the + * readpages call itself as every page gets checked again once + * actually needed. 
*/ + *done += PAGE_SIZE; + put_page(page); + } + + return NULL; +} + +static loff_t +iomap_readpages_actor(struct inode *inode, loff_t pos, loff_t length, + void *data, struct iomap *iomap) +{ + struct iomap_readpage_ctx *ctx = data; + loff_t done, ret; + + for (done = 0; done < length; done += ret) { + if (ctx->cur_page && offset_in_page(pos + done) == 0) { + if (!ctx->cur_page_in_bio) + unlock_page(ctx->cur_page); + put_page(ctx->cur_page); + ctx->cur_page = NULL; + } + if (!ctx->cur_page) { + ctx->cur_page = iomap_next_page(inode, ctx->pages, + pos, length, &done); + if (!ctx->cur_page) + break; + ctx->cur_page_in_bio = false; + } + ret = iomap_readpage_actor(inode, pos + done, length - done, + ctx, iomap); + } + + return done; +} + +int +iomap_readpages(struct address_space *mapping, struct list_head *pages, + unsigned nr_pages, const struct iomap_ops *ops) +{ + struct iomap_readpage_ctx ctx = { + .pages = pages, + .is_readahead = true, + }; + loff_t pos = page_offset(list_entry(pages->prev, struct page, lru)); + loff_t last = page_offset(list_entry(pages->next, struct page, lru)); + loff_t length = last - pos + PAGE_SIZE, ret = 0; + + while (length > 0) { + ret = iomap_apply(mapping->host, pos, length, 0, ops, + &ctx, iomap_readpages_actor); + if (ret <= 0) { + WARN_ON_ONCE(ret == 0); + goto done; + } + pos += ret; + length -= ret; + } + ret = 0; +done: + if (ctx.bio) + submit_bio(ctx.bio); + if (ctx.cur_page) { + if (!ctx.cur_page_in_bio) + unlock_page(ctx.cur_page); + put_page(ctx.cur_page); + } + + /* + * Check that we didn't lose a page due to the arcane calling + * conventions. + */ + WARN_ON_ONCE(!ret && !list_empty(ctx.pages)); + return ret; +} +EXPORT_SYMBOL_GPL(iomap_readpages); + +/* + * iomap_is_partially_uptodate checks whether blocks within a page are + * uptodate or not. + * + * Returns true if all blocks which correspond to a file portion + * we want to read within the page are uptodate. + */ +int +iomap_is_partially_uptodate(struct page *page, unsigned long from, + unsigned long count) +{ + struct iomap_page *iop = to_iomap_page(page); + struct inode *inode = page->mapping->host; + unsigned len, first, last; + unsigned i; + + /* Limit range to one page */ + len = min_t(unsigned, PAGE_SIZE - from, count); + + /* First and last blocks in range within page */ + first = from >> inode->i_blkbits; + last = (from + len - 1) >> inode->i_blkbits; + + if (iop) { + for (i = first; i <= last; i++) + if (!test_bit(i, iop->uptodate)) + return 0; + return 1; + } + + return 0; +} +EXPORT_SYMBOL_GPL(iomap_is_partially_uptodate); + +int +iomap_releasepage(struct page *page, gfp_t gfp_mask) +{ + /* + * mm accommodates an old ext3 case where clean pages might not have had + * the dirty bit cleared. Thus, it can send actual dirty pages to + * ->releasepage() via shrink_active_list(), skip those here. + */ + if (PageDirty(page) || PageWriteback(page)) + return 0; + iomap_page_release(page); + return 1; +} +EXPORT_SYMBOL_GPL(iomap_releasepage); + +void +iomap_invalidatepage(struct page *page, unsigned int offset, unsigned int len) +{ + /* + * If we are invalidating the entire page, clear the dirty state from it + * and release it to avoid unnecessary buildup of the LRU. 
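These exports are designed to slot straight into an address_space_operations table. A sketch of the wiring under the hypothetical myfs naming (the writeback side is not part of this refactor, so .writepage and friends stay filesystem-specific):

    static int myfs_readpage(struct file *unused, struct page *page)
    {
        return iomap_readpage(page, &myfs_iomap_ops);
    }

    static int myfs_readpages(struct file *unused, struct address_space *mapping,
            struct list_head *pages, unsigned nr_pages)
    {
        return iomap_readpages(mapping, pages, nr_pages, &myfs_iomap_ops);
    }

    static const struct address_space_operations myfs_aops = {
        .readpage               = myfs_readpage,
        .readpages              = myfs_readpages,
        .set_page_dirty         = iomap_set_page_dirty,
        .releasepage            = iomap_releasepage,
        .invalidatepage         = iomap_invalidatepage,
        .is_partially_uptodate  = iomap_is_partially_uptodate,
    #ifdef CONFIG_MIGRATION
        .migratepage            = iomap_migrate_page,
    #endif
    };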
*/ + if (offset == 0 && len == PAGE_SIZE) { + WARN_ON_ONCE(PageWriteback(page)); + cancel_dirty_page(page); + iomap_page_release(page); + } +} +EXPORT_SYMBOL_GPL(iomap_invalidatepage); + +#ifdef CONFIG_MIGRATION +int +iomap_migrate_page(struct address_space *mapping, struct page *newpage, + struct page *page, enum migrate_mode mode) +{ + int ret; + + ret = migrate_page_move_mapping(mapping, newpage, page, 0); + if (ret != MIGRATEPAGE_SUCCESS) + return ret; + + if (page_has_private(page)) { + ClearPagePrivate(page); + get_page(newpage); + set_page_private(newpage, page_private(page)); + set_page_private(page, 0); + put_page(page); + SetPagePrivate(newpage); + } + + if (mode != MIGRATE_SYNC_NO_COPY) + migrate_page_copy(newpage, page); + else + migrate_page_states(newpage, page); + return MIGRATEPAGE_SUCCESS; +} +EXPORT_SYMBOL_GPL(iomap_migrate_page); +#endif /* CONFIG_MIGRATION */ + +static void +iomap_write_failed(struct inode *inode, loff_t pos, unsigned len) +{ + loff_t i_size = i_size_read(inode); + + /* + * Only truncate newly allocated pages beyond EOF, even if the + * write started inside the existing inode size. + */ + if (pos + len > i_size) + truncate_pagecache_range(inode, max(pos, i_size), pos + len); +} + +static int +iomap_read_page_sync(struct inode *inode, loff_t block_start, struct page *page, + unsigned poff, unsigned plen, unsigned from, unsigned to, + struct iomap *iomap) +{ + struct bio_vec bvec; + struct bio bio; + + if (iomap->type != IOMAP_MAPPED || block_start >= i_size_read(inode)) { + zero_user_segments(page, poff, from, to, poff + plen); + iomap_set_range_uptodate(page, poff, plen); + return 0; + } + + bio_init(&bio, &bvec, 1); + bio.bi_opf = REQ_OP_READ; + bio.bi_iter.bi_sector = iomap_sector(iomap, block_start); + bio_set_dev(&bio, iomap->bdev); + __bio_add_page(&bio, page, plen, poff); + return submit_bio_wait(&bio); +} + +static int +__iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, + struct page *page, struct iomap *iomap) +{ + struct iomap_page *iop = iomap_page_create(inode, page); + loff_t block_size = i_blocksize(inode); + loff_t block_start = pos & ~(block_size - 1); + loff_t block_end = (pos + len + block_size - 1) & ~(block_size - 1); + unsigned from = offset_in_page(pos), to = from + len, poff, plen; + int status = 0; + + if (PageUptodate(page)) + return 0; + + do { + iomap_adjust_read_range(inode, iop, &block_start, + block_end - block_start, &poff, &plen); + if (plen == 0) + break; + + if ((from > poff && from < poff + plen) || + (to > poff && to < poff + plen)) { + status = iomap_read_page_sync(inode, block_start, page, + poff, plen, from, to, iomap); + if (status) + break; + } + + } while ((block_start += plen) < block_end); + + return status; +} + +static int +iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags, + struct page **pagep, struct iomap *iomap) +{ + const struct iomap_page_ops *page_ops = iomap->page_ops; + pgoff_t index = pos >> PAGE_SHIFT; + struct page *page; + int status = 0; + + BUG_ON(pos + len > iomap->offset + iomap->length); + + if (fatal_signal_pending(current)) + return -EINTR; + + if (page_ops && page_ops->page_prepare) { + status = page_ops->page_prepare(inode, pos, len, iomap); + if (status) + return status; + } + + page = grab_cache_page_write_begin(inode->i_mapping, index, flags); + if (!page) { + status = -ENOMEM; + goto out_no_page; + } + + if (iomap->type == IOMAP_INLINE) + iomap_read_inline_data(inode, page, iomap); + else if (iomap->flags & IOMAP_F_BUFFER_HEAD) + 
status = __block_write_begin_int(page, pos, len, NULL, iomap); + else + status = __iomap_write_begin(inode, pos, len, page, iomap); + + if (unlikely(status)) + goto out_unlock; + + *pagep = page; + return 0; + +out_unlock: + unlock_page(page); + put_page(page); + iomap_write_failed(inode, pos, len); + +out_no_page: + if (page_ops && page_ops->page_done) + page_ops->page_done(inode, pos, 0, NULL, iomap); + return status; +} + +int +iomap_set_page_dirty(struct page *page) +{ + struct address_space *mapping = page_mapping(page); + int newly_dirty; + + if (unlikely(!mapping)) + return !TestSetPageDirty(page); + + /* + * Lock out page->mem_cgroup migration to keep PageDirty + * synchronized with per-memcg dirty page counters. + */ + lock_page_memcg(page); + newly_dirty = !TestSetPageDirty(page); + if (newly_dirty) + __set_page_dirty(page, mapping, 0); + unlock_page_memcg(page); + + if (newly_dirty) + __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); + return newly_dirty; +} +EXPORT_SYMBOL_GPL(iomap_set_page_dirty); + +static int +__iomap_write_end(struct inode *inode, loff_t pos, unsigned len, + unsigned copied, struct page *page, struct iomap *iomap) +{ + flush_dcache_page(page); + + /* + * The blocks that were entirely written will now be uptodate, so we + * don't have to worry about a readpage reading them and overwriting a + * partial write. However if we have encountered a short write and only + * partially written into a block, it will not be marked uptodate, so a + * readpage might come in and destroy our partial write. + * + * Do the simplest thing, and just treat any short write to a non + * uptodate page as a zero-length write, and force the caller to redo + * the whole thing. + */ + if (unlikely(copied < len && !PageUptodate(page))) + return 0; + iomap_set_range_uptodate(page, offset_in_page(pos), len); + iomap_set_page_dirty(page); + return copied; +} + +static int +iomap_write_end_inline(struct inode *inode, struct page *page, + struct iomap *iomap, loff_t pos, unsigned copied) +{ + void *addr; + + WARN_ON_ONCE(!PageUptodate(page)); + BUG_ON(pos + copied > PAGE_SIZE - offset_in_page(iomap->inline_data)); + + addr = kmap_atomic(page); + memcpy(iomap->inline_data + pos, addr + pos, copied); + kunmap_atomic(addr); + + mark_inode_dirty(inode); + return copied; +} + +static int +iomap_write_end(struct inode *inode, loff_t pos, unsigned len, + unsigned copied, struct page *page, struct iomap *iomap) +{ + const struct iomap_page_ops *page_ops = iomap->page_ops; + loff_t old_size = inode->i_size; + int ret; + + if (iomap->type == IOMAP_INLINE) { + ret = iomap_write_end_inline(inode, page, iomap, pos, copied); + } else if (iomap->flags & IOMAP_F_BUFFER_HEAD) { + ret = block_write_end(NULL, inode->i_mapping, pos, len, copied, + page, NULL); + } else { + ret = __iomap_write_end(inode, pos, len, copied, page, iomap); + } + + /* + * Update the in-memory inode size after copying the data into the page + * cache. It's up to the file system to write the updated size to disk, + * preferably after I/O completion so that no stale data is exposed. 
+ */ + if (pos + ret > old_size) { + i_size_write(inode, pos + ret); + iomap->flags |= IOMAP_F_SIZE_CHANGED; + } + unlock_page(page); + + if (old_size < pos) + pagecache_isize_extended(inode, old_size, pos); + if (page_ops && page_ops->page_done) + page_ops->page_done(inode, pos, ret, page, iomap); + put_page(page); + + if (ret < len) + iomap_write_failed(inode, pos, len); + return ret; +} + +static loff_t +iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data, + struct iomap *iomap) +{ + struct iov_iter *i = data; + long status = 0; + ssize_t written = 0; + unsigned int flags = AOP_FLAG_NOFS; + + do { + struct page *page; + unsigned long offset; /* Offset into pagecache page */ + unsigned long bytes; /* Bytes to write to page */ + size_t copied; /* Bytes copied from user */ + + offset = offset_in_page(pos); + bytes = min_t(unsigned long, PAGE_SIZE - offset, + iov_iter_count(i)); +again: + if (bytes > length) + bytes = length; + + /* + * Bring in the user page that we will copy from _first_. + * Otherwise there's a nasty deadlock on copying from the + * same page as we're writing to, without it being marked + * up-to-date. + * + * Not only is this an optimisation, but it is also required + * to check that the address is actually valid, when atomic + * usercopies are used, below. + */ + if (unlikely(iov_iter_fault_in_readable(i, bytes))) { + status = -EFAULT; + break; + } + + status = iomap_write_begin(inode, pos, bytes, flags, &page, + iomap); + if (unlikely(status)) + break; + + if (mapping_writably_mapped(inode->i_mapping)) + flush_dcache_page(page); + + copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes); + + flush_dcache_page(page); + + status = iomap_write_end(inode, pos, bytes, copied, page, + iomap); + if (unlikely(status < 0)) + break; + copied = status; + + cond_resched(); + + iov_iter_advance(i, copied); + if (unlikely(copied == 0)) { + /* + * If we were unable to copy any data at all, we must + * fall back to a single segment length write. + * + * If we didn't fallback here, we could livelock + * because not all segments in the iov can be copied at + * once without a pagefault. + */ + bytes = min_t(unsigned long, PAGE_SIZE - offset, + iov_iter_single_seg_count(i)); + goto again; + } + pos += copied; + written += copied; + length -= copied; + + balance_dirty_pages_ratelimited(inode->i_mapping); + } while (iov_iter_count(i) && length); + + return written ? written : status; +} + +ssize_t +iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *iter, + const struct iomap_ops *ops) +{ + struct inode *inode = iocb->ki_filp->f_mapping->host; + loff_t pos = iocb->ki_pos, ret = 0, written = 0; + + while (iov_iter_count(iter)) { + ret = iomap_apply(inode, pos, iov_iter_count(iter), + IOMAP_WRITE, ops, iter, iomap_write_actor); + if (ret <= 0) + break; + pos += ret; + written += ret; + } + + return written ? 
written : ret; +} +EXPORT_SYMBOL_GPL(iomap_file_buffered_write); + +static struct page * +__iomap_read_page(struct inode *inode, loff_t offset) +{ + struct address_space *mapping = inode->i_mapping; + struct page *page; + + page = read_mapping_page(mapping, offset >> PAGE_SHIFT, NULL); + if (IS_ERR(page)) + return page; + if (!PageUptodate(page)) { + put_page(page); + return ERR_PTR(-EIO); + } + return page; +} + +static loff_t +iomap_dirty_actor(struct inode *inode, loff_t pos, loff_t length, void *data, + struct iomap *iomap) +{ + long status = 0; + ssize_t written = 0; + + do { + struct page *page, *rpage; + unsigned long offset; /* Offset into pagecache page */ + unsigned long bytes; /* Bytes to write to page */ + + offset = offset_in_page(pos); + bytes = min_t(loff_t, PAGE_SIZE - offset, length); + + rpage = __iomap_read_page(inode, pos); + if (IS_ERR(rpage)) + return PTR_ERR(rpage); + + status = iomap_write_begin(inode, pos, bytes, + AOP_FLAG_NOFS, &page, iomap); + put_page(rpage); + if (unlikely(status)) + return status; + + WARN_ON_ONCE(!PageUptodate(page)); + + status = iomap_write_end(inode, pos, bytes, bytes, page, iomap); + if (unlikely(status <= 0)) { + if (WARN_ON_ONCE(status == 0)) + return -EIO; + return status; + } + + cond_resched(); + + pos += status; + written += status; + length -= status; + + balance_dirty_pages_ratelimited(inode->i_mapping); + } while (length); + + return written; +} + +int +iomap_file_dirty(struct inode *inode, loff_t pos, loff_t len, + const struct iomap_ops *ops) +{ + loff_t ret; + + while (len) { + ret = iomap_apply(inode, pos, len, IOMAP_WRITE, ops, NULL, + iomap_dirty_actor); + if (ret <= 0) + return ret; + pos += ret; + len -= ret; + } + + return 0; +} +EXPORT_SYMBOL_GPL(iomap_file_dirty); + +static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset, + unsigned bytes, struct iomap *iomap) +{ + struct page *page; + int status; + + status = iomap_write_begin(inode, pos, bytes, AOP_FLAG_NOFS, &page, + iomap); + if (status) + return status; + + zero_user(page, offset, bytes); + mark_page_accessed(page); + + return iomap_write_end(inode, pos, bytes, bytes, page, iomap); +} + +static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes, + struct iomap *iomap) +{ + return __dax_zero_page_range(iomap->bdev, iomap->dax_dev, + iomap_sector(iomap, pos & PAGE_MASK), offset, bytes); +} + +static loff_t +iomap_zero_range_actor(struct inode *inode, loff_t pos, loff_t count, + void *data, struct iomap *iomap) +{ + bool *did_zero = data; + loff_t written = 0; + int status; + + /* already zeroed? we're done. 
*/ + if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN) + return count; + + do { + unsigned offset, bytes; + + offset = offset_in_page(pos); + bytes = min_t(loff_t, PAGE_SIZE - offset, count); + + if (IS_DAX(inode)) + status = iomap_dax_zero(pos, offset, bytes, iomap); + else + status = iomap_zero(inode, pos, offset, bytes, iomap); + if (status < 0) + return status; + + pos += bytes; + count -= bytes; + written += bytes; + if (did_zero) + *did_zero = true; + } while (count > 0); + + return written; +} + +int +iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, + const struct iomap_ops *ops) +{ + loff_t ret; + + while (len > 0) { + ret = iomap_apply(inode, pos, len, IOMAP_ZERO, + ops, did_zero, iomap_zero_range_actor); + if (ret <= 0) + return ret; + + pos += ret; + len -= ret; + } + + return 0; +} +EXPORT_SYMBOL_GPL(iomap_zero_range); + +int +iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, + const struct iomap_ops *ops) +{ + unsigned int blocksize = i_blocksize(inode); + unsigned int off = pos & (blocksize - 1); + + /* Block boundary? Nothing to do */ + if (!off) + return 0; + return iomap_zero_range(inode, pos, blocksize - off, did_zero, ops); +} +EXPORT_SYMBOL_GPL(iomap_truncate_page); + +static loff_t +iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length, + void *data, struct iomap *iomap) +{ + struct page *page = data; + int ret; + + if (iomap->flags & IOMAP_F_BUFFER_HEAD) { + ret = __block_write_begin_int(page, pos, length, NULL, iomap); + if (ret) + return ret; + block_commit_write(page, 0, length); + } else { + WARN_ON_ONCE(!PageUptodate(page)); + iomap_page_create(inode, page); + set_page_dirty(page); + } + + return length; +} + +vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops) +{ + struct page *page = vmf->page; + struct inode *inode = file_inode(vmf->vma->vm_file); + unsigned long length; + loff_t offset, size; + ssize_t ret; + + lock_page(page); + size = i_size_read(inode); + if ((page->mapping != inode->i_mapping) || + (page_offset(page) > size)) { + /* We overload EFAULT to mean page got truncated */ + ret = -EFAULT; + goto out_unlock; + } + + /* page is wholly or partially inside EOF */ + if (((page->index + 1) << PAGE_SHIFT) > size) + length = offset_in_page(size); + else + length = PAGE_SIZE; + + offset = page_offset(page); + while (length > 0) { + ret = iomap_apply(inode, offset, length, + IOMAP_WRITE | IOMAP_FAULT, ops, page, + iomap_page_mkwrite_actor); + if (unlikely(ret <= 0)) + goto out_unlock; + offset += ret; + length -= ret; + } + + wait_for_stable_page(page); + return VM_FAULT_LOCKED; +out_unlock: + unlock_page(page); + return block_page_mkwrite_return(ret); +} +EXPORT_SYMBOL_GPL(iomap_page_mkwrite); diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c new file mode 100644 index 000000000000..10517cea9682 --- /dev/null +++ b/fs/iomap/direct-io.c @@ -0,0 +1,562 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2010 Red Hat, Inc. + * Copyright (c) 2016-2018 Christoph Hellwig. 
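iomap_truncate_page() above is the piece a filesystem reaches for when shrinking a file, so that the tail of the new last block cannot leak stale data through mmap reads. A sketch of a setattr-time caller (hypothetical; myfs_remove_blocks_past_eof stands in for the filesystem's own block teardown):

    static int myfs_truncate(struct inode *inode, loff_t newsize)
    {
        bool did_zero = false;
        int error;

        /* zero the part-block tail before the new size becomes visible */
        error = iomap_truncate_page(inode, newsize, &did_zero, &myfs_iomap_ops);
        if (error)
            return error;

        truncate_setsize(inode, newsize);
        return myfs_remove_blocks_past_eof(inode, newsize);    /* fs-specific */
    }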
+ */ +#include <linux/module.h> +#include <linux/compiler.h> +#include <linux/fs.h> +#include <linux/iomap.h> +#include <linux/backing-dev.h> +#include <linux/uio.h> +#include <linux/task_io_accounting_ops.h> + +#include "../internal.h" + +/* + * Private flags for iomap_dio, must not overlap with the public ones in + * iomap.h: + */ +#define IOMAP_DIO_WRITE_FUA (1 << 28) +#define IOMAP_DIO_NEED_SYNC (1 << 29) +#define IOMAP_DIO_WRITE (1 << 30) +#define IOMAP_DIO_DIRTY (1 << 31) + +struct iomap_dio { + struct kiocb *iocb; + iomap_dio_end_io_t *end_io; + loff_t i_size; + loff_t size; + atomic_t ref; + unsigned flags; + int error; + bool wait_for_completion; + + union { + /* used during submission and for synchronous completion: */ + struct { + struct iov_iter *iter; + struct task_struct *waiter; + struct request_queue *last_queue; + blk_qc_t cookie; + } submit; + + /* used for aio completion: */ + struct { + struct work_struct work; + } aio; + }; +}; + +int iomap_dio_iopoll(struct kiocb *kiocb, bool spin) +{ + struct request_queue *q = READ_ONCE(kiocb->private); + + if (!q) + return 0; + return blk_poll(q, READ_ONCE(kiocb->ki_cookie), spin); +} +EXPORT_SYMBOL_GPL(iomap_dio_iopoll); + +static void iomap_dio_submit_bio(struct iomap_dio *dio, struct iomap *iomap, + struct bio *bio) +{ + atomic_inc(&dio->ref); + + if (dio->iocb->ki_flags & IOCB_HIPRI) + bio_set_polled(bio, dio->iocb); + + dio->submit.last_queue = bdev_get_queue(iomap->bdev); + dio->submit.cookie = submit_bio(bio); +} + +static ssize_t iomap_dio_complete(struct iomap_dio *dio) +{ + struct kiocb *iocb = dio->iocb; + struct inode *inode = file_inode(iocb->ki_filp); + loff_t offset = iocb->ki_pos; + ssize_t ret; + + if (dio->end_io) { + ret = dio->end_io(iocb, + dio->error ? dio->error : dio->size, + dio->flags); + } else { + ret = dio->error; + } + + if (likely(!ret)) { + ret = dio->size; + /* check for short read */ + if (offset + ret > dio->i_size && + !(dio->flags & IOMAP_DIO_WRITE)) + ret = dio->i_size - offset; + iocb->ki_pos += ret; + } + + /* + * Try again to invalidate clean pages which might have been cached by + * non-direct readahead, or faulted in by get_user_pages() if the source + * of the write was an mmap'ed region of the file we're writing. Either + * one is a pretty crazy thing to do, so we don't support it 100%. If + * this invalidation fails, tough, the write still worked... + * + * And this page cache invalidation has to be after dio->end_io(), as + * some filesystems convert unwritten extents to real allocations in + * end_io() when necessary, otherwise a racing buffer read would cache + * zeros from unwritten extents. + */ + if (!dio->error && + (dio->flags & IOMAP_DIO_WRITE) && inode->i_mapping->nrpages) { + int err; + err = invalidate_inode_pages2_range(inode->i_mapping, + offset >> PAGE_SHIFT, + (offset + dio->size - 1) >> PAGE_SHIFT); + if (err) + dio_warn_stale_pagecache(iocb->ki_filp); + } + + /* + * If this is a DSYNC write, make sure we push it to stable storage now + * that we've written data. + */ + if (ret > 0 && (dio->flags & IOMAP_DIO_NEED_SYNC)) + ret = generic_write_sync(iocb, ret); + + inode_dio_end(file_inode(iocb->ki_filp)); + kfree(dio); + + return ret; +} + +static void iomap_dio_complete_work(struct work_struct *work) +{ + struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work); + struct kiocb *iocb = dio->iocb; + + iocb->ki_complete(iocb, iomap_dio_complete(dio), 0); +} + +/* + * Set an error in the dio if none is set yet. 
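The cmpxchg() latch described here is a small but load-bearing idiom: whichever context fails first wins, and every later racer sees a nonzero old value and leaves it alone. The same pattern in isolation (a standalone sketch, not from this diff):

    static int first_error;    /* 0 means no error recorded yet */

    static void record_error(int err)
    {
        /* only the first failure is kept; cmpxchg is a no-op for later callers */
        cmpxchg(&first_error, 0, err);
    }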
We have to use cmpxchg + * as the submission context and the completion context(s) can race to + * update the error. + */ +static inline void iomap_dio_set_error(struct iomap_dio *dio, int ret) +{ + cmpxchg(&dio->error, 0, ret); +} + +static void iomap_dio_bio_end_io(struct bio *bio) +{ + struct iomap_dio *dio = bio->bi_private; + bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY); + + if (bio->bi_status) + iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status)); + + if (atomic_dec_and_test(&dio->ref)) { + if (dio->wait_for_completion) { + struct task_struct *waiter = dio->submit.waiter; + WRITE_ONCE(dio->submit.waiter, NULL); + blk_wake_io_task(waiter); + } else if (dio->flags & IOMAP_DIO_WRITE) { + struct inode *inode = file_inode(dio->iocb->ki_filp); + + INIT_WORK(&dio->aio.work, iomap_dio_complete_work); + queue_work(inode->i_sb->s_dio_done_wq, &dio->aio.work); + } else { + iomap_dio_complete_work(&dio->aio.work); + } + } + + if (should_dirty) { + bio_check_pages_dirty(bio); + } else { + bio_release_pages(bio, false); + bio_put(bio); + } +} + +static void +iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos, + unsigned len) +{ + struct page *page = ZERO_PAGE(0); + int flags = REQ_SYNC | REQ_IDLE; + struct bio *bio; + + bio = bio_alloc(GFP_KERNEL, 1); + bio_set_dev(bio, iomap->bdev); + bio->bi_iter.bi_sector = iomap_sector(iomap, pos); + bio->bi_private = dio; + bio->bi_end_io = iomap_dio_bio_end_io; + + get_page(page); + __bio_add_page(bio, page, len, 0); + bio_set_op_attrs(bio, REQ_OP_WRITE, flags); + iomap_dio_submit_bio(dio, iomap, bio); +} + +static loff_t +iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, + struct iomap_dio *dio, struct iomap *iomap) +{ + unsigned int blkbits = blksize_bits(bdev_logical_block_size(iomap->bdev)); + unsigned int fs_block_size = i_blocksize(inode), pad; + unsigned int align = iov_iter_alignment(dio->submit.iter); + struct iov_iter iter; + struct bio *bio; + bool need_zeroout = false; + bool use_fua = false; + int nr_pages, ret = 0; + size_t copied = 0; + + if ((pos | length | align) & ((1 << blkbits) - 1)) + return -EINVAL; + + if (iomap->type == IOMAP_UNWRITTEN) { + dio->flags |= IOMAP_DIO_UNWRITTEN; + need_zeroout = true; + } + + if (iomap->flags & IOMAP_F_SHARED) + dio->flags |= IOMAP_DIO_COW; + + if (iomap->flags & IOMAP_F_NEW) { + need_zeroout = true; + } else if (iomap->type == IOMAP_MAPPED) { + /* + * Use a FUA write if we need datasync semantics, this is a pure + * data IO that doesn't require any metadata updates (including + * after IO completion such as unwritten extent conversion) and + * the underlying device supports FUA. This allows us to avoid + * cache flushes on IO completion. + */ + if (!(iomap->flags & (IOMAP_F_SHARED|IOMAP_F_DIRTY)) && + (dio->flags & IOMAP_DIO_WRITE_FUA) && + blk_queue_fua(bdev_get_queue(iomap->bdev))) + use_fua = true; + } + + /* + * Operate on a partial iter trimmed to the extent we were called for. + * We'll update the iter in the dio once we're done with this extent. 
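The trim is done on a by-value copy of the iter: bios are built from the bounded copy, while the shared submission iter is advanced only by the bytes actually queued, so a failure can rewind it with iov_iter_revert(). Reduced to its skeleton (names as in the surrounding code):

    struct iov_iter iter = *dio->submit.iter;  /* struct copy, no page refs taken */

    iov_iter_truncate(&iter, length);          /* cap the work at this extent */
    /* ... bio_iov_iter_get_pages(bio, &iter) consumes the copy bio by bio ... */
    iov_iter_advance(dio->submit.iter, n);     /* commit n bytes to the real iter */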
+ */ + iter = *dio->submit.iter; + iov_iter_truncate(&iter, length); + + nr_pages = iov_iter_npages(&iter, BIO_MAX_PAGES); + if (nr_pages <= 0) + return nr_pages; + + if (need_zeroout) { + /* zero out from the start of the block to the write offset */ + pad = pos & (fs_block_size - 1); + if (pad) + iomap_dio_zero(dio, iomap, pos - pad, pad); + } + + do { + size_t n; + if (dio->error) { + iov_iter_revert(dio->submit.iter, copied); + return 0; + } + + bio = bio_alloc(GFP_KERNEL, nr_pages); + bio_set_dev(bio, iomap->bdev); + bio->bi_iter.bi_sector = iomap_sector(iomap, pos); + bio->bi_write_hint = dio->iocb->ki_hint; + bio->bi_ioprio = dio->iocb->ki_ioprio; + bio->bi_private = dio; + bio->bi_end_io = iomap_dio_bio_end_io; + + ret = bio_iov_iter_get_pages(bio, &iter); + if (unlikely(ret)) { + /* + * We have to stop part way through an IO. We must fall + * through to the sub-block tail zeroing here, otherwise + * this short IO may expose stale data in the tail of + * the block we haven't written data to. + */ + bio_put(bio); + goto zero_tail; + } + + n = bio->bi_iter.bi_size; + if (dio->flags & IOMAP_DIO_WRITE) { + bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE; + if (use_fua) + bio->bi_opf |= REQ_FUA; + else + dio->flags &= ~IOMAP_DIO_WRITE_FUA; + task_io_account_write(n); + } else { + bio->bi_opf = REQ_OP_READ; + if (dio->flags & IOMAP_DIO_DIRTY) + bio_set_pages_dirty(bio); + } + + iov_iter_advance(dio->submit.iter, n); + + dio->size += n; + pos += n; + copied += n; + + nr_pages = iov_iter_npages(&iter, BIO_MAX_PAGES); + iomap_dio_submit_bio(dio, iomap, bio); + } while (nr_pages); + + /* + * We need to zeroout the tail of a sub-block write if the extent type + * requires zeroing or the write extends beyond EOF. If we don't zero + * the block tail in the latter case, we can expose stale data via mmap + * reads of the EOF block. + */ +zero_tail: + if (need_zeroout || + ((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode))) { + /* zero out from the end of the write to the end of the block */ + pad = pos & (fs_block_size - 1); + if (pad) + iomap_dio_zero(dio, iomap, pos, fs_block_size - pad); + } + return copied ? 
copied : ret; +} + +static loff_t +iomap_dio_hole_actor(loff_t length, struct iomap_dio *dio) +{ + length = iov_iter_zero(length, dio->submit.iter); + dio->size += length; + return length; +} + +static loff_t +iomap_dio_inline_actor(struct inode *inode, loff_t pos, loff_t length, + struct iomap_dio *dio, struct iomap *iomap) +{ + struct iov_iter *iter = dio->submit.iter; + size_t copied; + + BUG_ON(pos + length > PAGE_SIZE - offset_in_page(iomap->inline_data)); + + if (dio->flags & IOMAP_DIO_WRITE) { + loff_t size = inode->i_size; + + if (pos > size) + memset(iomap->inline_data + size, 0, pos - size); + copied = copy_from_iter(iomap->inline_data + pos, length, iter); + if (copied) { + if (pos + copied > size) + i_size_write(inode, pos + copied); + mark_inode_dirty(inode); + } + } else { + copied = copy_to_iter(iomap->inline_data + pos, length, iter); + } + dio->size += copied; + return copied; +} + +static loff_t +iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length, + void *data, struct iomap *iomap) +{ + struct iomap_dio *dio = data; + + switch (iomap->type) { + case IOMAP_HOLE: + if (WARN_ON_ONCE(dio->flags & IOMAP_DIO_WRITE)) + return -EIO; + return iomap_dio_hole_actor(length, dio); + case IOMAP_UNWRITTEN: + if (!(dio->flags & IOMAP_DIO_WRITE)) + return iomap_dio_hole_actor(length, dio); + return iomap_dio_bio_actor(inode, pos, length, dio, iomap); + case IOMAP_MAPPED: + return iomap_dio_bio_actor(inode, pos, length, dio, iomap); + case IOMAP_INLINE: + return iomap_dio_inline_actor(inode, pos, length, dio, iomap); + default: + WARN_ON_ONCE(1); + return -EIO; + } +} + +/* + * iomap_dio_rw() always completes O_[D]SYNC writes regardless of whether the IO + * is being issued as AIO or not. This allows us to optimise pure data writes + * to use REQ_FUA rather than requiring generic_write_sync() to issue a + * REQ_FLUSH post write. This is slightly tricky because a single request here + * can be mapped into multiple disjoint IOs and only a subset of the IOs issued + * may be pure data writes. In that case, we still need to do a full data sync + * completion. + */ +ssize_t +iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, + const struct iomap_ops *ops, iomap_dio_end_io_t end_io) +{ + struct address_space *mapping = iocb->ki_filp->f_mapping; + struct inode *inode = file_inode(iocb->ki_filp); + size_t count = iov_iter_count(iter); + loff_t pos = iocb->ki_pos, start = pos; + loff_t end = iocb->ki_pos + count - 1, ret = 0; + unsigned int flags = IOMAP_DIRECT; + bool wait_for_completion = is_sync_kiocb(iocb); + struct blk_plug plug; + struct iomap_dio *dio; + + lockdep_assert_held(&inode->i_rwsem); + + if (!count) + return 0; + + dio = kmalloc(sizeof(*dio), GFP_KERNEL); + if (!dio) + return -ENOMEM; + + dio->iocb = iocb; + atomic_set(&dio->ref, 1); + dio->size = 0; + dio->i_size = i_size_read(inode); + dio->end_io = end_io; + dio->error = 0; + dio->flags = 0; + + dio->submit.iter = iter; + dio->submit.waiter = current; + dio->submit.cookie = BLK_QC_T_NONE; + dio->submit.last_queue = NULL; + + if (iov_iter_rw(iter) == READ) { + if (pos >= dio->i_size) + goto out_free_dio; + + if (iter_is_iovec(iter) && iov_iter_rw(iter) == READ) + dio->flags |= IOMAP_DIO_DIRTY; + } else { + flags |= IOMAP_WRITE; + dio->flags |= IOMAP_DIO_WRITE; + + /* for data sync or sync, we need sync completion processing */ + if (iocb->ki_flags & IOCB_DSYNC) + dio->flags |= IOMAP_DIO_NEED_SYNC; + + /* + * For datasync only writes, we optimistically try using FUA for + * this IO. 
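/*
 * Condensed sketch of the optimistic-FUA state machine this comment
 * describes; the IOMAP_DIO_* flags are the patch's own, but the three
 * helpers here are hypothetical illustrations, not kernel functions:
 */
static void fua_start(struct iomap_dio *dio, int ki_flags)
{
	dio->flags |= IOMAP_DIO_NEED_SYNC;
	if ((ki_flags & (IOCB_DSYNC | IOCB_SYNC)) == IOCB_DSYNC)
		dio->flags |= IOMAP_DIO_WRITE_FUA;	/* optimistic */
}

static void fua_one_bio_submitted(struct iomap_dio *dio, bool was_fua)
{
	if (!was_fua)
		dio->flags &= ~IOMAP_DIO_WRITE_FUA;	/* one miss spoils it */
}

static bool fua_needs_flush(struct iomap_dio *dio)
{
	if (dio->flags & IOMAP_DIO_WRITE_FUA)		/* every bio carried REQ_FUA */
		dio->flags &= ~IOMAP_DIO_NEED_SYNC;	/* skip the cache flush */
	return dio->flags & IOMAP_DIO_NEED_SYNC;
}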
Any non-FUA write that occurs will clear this flag, + * hence we know before completion whether a cache flush is + * necessary. + */ + if ((iocb->ki_flags & (IOCB_DSYNC | IOCB_SYNC)) == IOCB_DSYNC) + dio->flags |= IOMAP_DIO_WRITE_FUA; + } + + if (iocb->ki_flags & IOCB_NOWAIT) { + if (filemap_range_has_page(mapping, start, end)) { + ret = -EAGAIN; + goto out_free_dio; + } + flags |= IOMAP_NOWAIT; + } + + ret = filemap_write_and_wait_range(mapping, start, end); + if (ret) + goto out_free_dio; + + /* + * Try to invalidate cache pages for the range we're direct + * writing. If this invalidation fails, tough, the write will + * still work, but racing two incompatible write paths is a + * pretty crazy thing to do, so we don't support it 100%. + */ + ret = invalidate_inode_pages2_range(mapping, + start >> PAGE_SHIFT, end >> PAGE_SHIFT); + if (ret) + dio_warn_stale_pagecache(iocb->ki_filp); + ret = 0; + + if (iov_iter_rw(iter) == WRITE && !wait_for_completion && + !inode->i_sb->s_dio_done_wq) { + ret = sb_init_dio_done_wq(inode->i_sb); + if (ret < 0) + goto out_free_dio; + } + + inode_dio_begin(inode); + + blk_start_plug(&plug); + do { + ret = iomap_apply(inode, pos, count, flags, ops, dio, + iomap_dio_actor); + if (ret <= 0) { + /* magic error code to fall back to buffered I/O */ + if (ret == -ENOTBLK) { + wait_for_completion = true; + ret = 0; + } + break; + } + pos += ret; + + if (iov_iter_rw(iter) == READ && pos >= dio->i_size) + break; + } while ((count = iov_iter_count(iter)) > 0); + blk_finish_plug(&plug); + + if (ret < 0) + iomap_dio_set_error(dio, ret); + + /* + * If all the writes we issued were FUA, we don't need to flush the + * cache on IO completion. Clear the sync flag for this case. + */ + if (dio->flags & IOMAP_DIO_WRITE_FUA) + dio->flags &= ~IOMAP_DIO_NEED_SYNC; + + WRITE_ONCE(iocb->ki_cookie, dio->submit.cookie); + WRITE_ONCE(iocb->private, dio->submit.last_queue); + + /* + * We are about to drop our additional submission reference, which + * might be the last reference to the dio. There are three + * different ways we can progress here: + * + * (a) If this is the last reference we will always complete and free + * the dio ourselves. + * (b) If this is not the last reference, and we serve an asynchronous + * iocb, we must never touch the dio after the decrement, the + * I/O completion handler will complete and free it. + * (c) If this is not the last reference, but we serve a synchronous + * iocb, the I/O completion handler will wake us up on the drop + * of the final reference, and we will complete and free it here + * after we got woken by the I/O completion handler. + */ + dio->wait_for_completion = wait_for_completion; + if (!atomic_dec_and_test(&dio->ref)) { + if (!wait_for_completion) + return -EIOCBQUEUED; + + for (;;) { + set_current_state(TASK_UNINTERRUPTIBLE); + if (!READ_ONCE(dio->submit.waiter)) + break; + + if (!(iocb->ki_flags & IOCB_HIPRI) || + !dio->submit.last_queue || + !blk_poll(dio->submit.last_queue, + dio->submit.cookie, true)) + io_schedule(); + } + __set_current_state(TASK_RUNNING); + } + + return iomap_dio_complete(dio); + +out_free_dio: + kfree(dio); + return ret; +} +EXPORT_SYMBOL_GPL(iomap_dio_rw); diff --git a/fs/iomap/fiemap.c b/fs/iomap/fiemap.c new file mode 100644 index 000000000000..f26fdd36e383 --- /dev/null +++ b/fs/iomap/fiemap.c @@ -0,0 +1,144 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2016-2018 Christoph Hellwig.
+ */ +#include <linux/module.h> +#include <linux/compiler.h> +#include <linux/fs.h> +#include <linux/iomap.h> + +struct fiemap_ctx { + struct fiemap_extent_info *fi; + struct iomap prev; +}; + +static int iomap_to_fiemap(struct fiemap_extent_info *fi, + struct iomap *iomap, u32 flags) +{ + switch (iomap->type) { + case IOMAP_HOLE: + /* skip holes */ + return 0; + case IOMAP_DELALLOC: + flags |= FIEMAP_EXTENT_DELALLOC | FIEMAP_EXTENT_UNKNOWN; + break; + case IOMAP_MAPPED: + break; + case IOMAP_UNWRITTEN: + flags |= FIEMAP_EXTENT_UNWRITTEN; + break; + case IOMAP_INLINE: + flags |= FIEMAP_EXTENT_DATA_INLINE; + break; + } + + if (iomap->flags & IOMAP_F_MERGED) + flags |= FIEMAP_EXTENT_MERGED; + if (iomap->flags & IOMAP_F_SHARED) + flags |= FIEMAP_EXTENT_SHARED; + + return fiemap_fill_next_extent(fi, iomap->offset, + iomap->addr != IOMAP_NULL_ADDR ? iomap->addr : 0, + iomap->length, flags); +} + +static loff_t +iomap_fiemap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, + struct iomap *iomap) +{ + struct fiemap_ctx *ctx = data; + loff_t ret = length; + + if (iomap->type == IOMAP_HOLE) + return length; + + ret = iomap_to_fiemap(ctx->fi, &ctx->prev, 0); + ctx->prev = *iomap; + switch (ret) { + case 0: /* success */ + return length; + case 1: /* extent array full */ + return 0; + default: + return ret; + } +} + +int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi, + loff_t start, loff_t len, const struct iomap_ops *ops) +{ + struct fiemap_ctx ctx; + loff_t ret; + + memset(&ctx, 0, sizeof(ctx)); + ctx.fi = fi; + ctx.prev.type = IOMAP_HOLE; + + ret = fiemap_check_flags(fi, FIEMAP_FLAG_SYNC); + if (ret) + return ret; + + if (fi->fi_flags & FIEMAP_FLAG_SYNC) { + ret = filemap_write_and_wait(inode->i_mapping); + if (ret) + return ret; + } + + while (len > 0) { + ret = iomap_apply(inode, start, len, IOMAP_REPORT, ops, &ctx, + iomap_fiemap_actor); + /* inode with no (attribute) mapping will give ENOENT */ + if (ret == -ENOENT) + break; + if (ret < 0) + return ret; + if (ret == 0) + break; + + start += ret; + len -= ret; + } + + if (ctx.prev.type != IOMAP_HOLE) { + ret = iomap_to_fiemap(fi, &ctx.prev, FIEMAP_EXTENT_LAST); + if (ret < 0) + return ret; + } + + return 0; +} +EXPORT_SYMBOL_GPL(iomap_fiemap); + +static loff_t +iomap_bmap_actor(struct inode *inode, loff_t pos, loff_t length, + void *data, struct iomap *iomap) +{ + sector_t *bno = data, addr; + + if (iomap->type == IOMAP_MAPPED) { + addr = (pos - iomap->offset + iomap->addr) >> inode->i_blkbits; + if (addr > INT_MAX) + WARN(1, "would truncate bmap result\n"); + else + *bno = addr; + } + return 0; +} + +/* legacy ->bmap interface. 0 is the error return (!) */ +sector_t +iomap_bmap(struct address_space *mapping, sector_t bno, + const struct iomap_ops *ops) +{ + struct inode *inode = mapping->host; + loff_t pos = bno << inode->i_blkbits; + unsigned blocksize = i_blocksize(inode); + + if (filemap_write_and_wait(mapping)) + return 0; + + bno = 0; + iomap_apply(inode, pos, blocksize, 0, ops, &bno, iomap_bmap_actor); + return bno; +} +EXPORT_SYMBOL_GPL(iomap_bmap); diff --git a/fs/iomap/seek.c b/fs/iomap/seek.c new file mode 100644 index 000000000000..c04bad4b2b43 --- /dev/null +++ b/fs/iomap/seek.c @@ -0,0 +1,212 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2017 Red Hat, Inc. + * Copyright (c) 2018 Christoph Hellwig. 
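/*
 * Usage sketch for the two helpers above: a filesystem wires them into
 * its own ops tables together with its iomap_ops ("foofs" and
 * foofs_iomap_ops are hypothetical stand-ins):
 */
static sector_t foofs_bmap(struct address_space *mapping, sector_t block)
{
	return iomap_bmap(mapping, block, &foofs_iomap_ops);
}

static int foofs_fiemap(struct inode *inode, struct fiemap_extent_info *fi,
			u64 start, u64 len)
{
	return iomap_fiemap(inode, fi, start, len, &foofs_iomap_ops);
}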
+ */ +#include <linux/module.h> +#include <linux/compiler.h> +#include <linux/fs.h> +#include <linux/iomap.h> +#include <linux/pagemap.h> +#include <linux/pagevec.h> + +/* + * Seek for SEEK_DATA / SEEK_HOLE within @page, starting at @lastoff. + * Returns true if found and updates @lastoff to the offset in file. + */ +static bool +page_seek_hole_data(struct inode *inode, struct page *page, loff_t *lastoff, + int whence) +{ + const struct address_space_operations *ops = inode->i_mapping->a_ops; + unsigned int bsize = i_blocksize(inode), off; + bool seek_data = whence == SEEK_DATA; + loff_t poff = page_offset(page); + + if (WARN_ON_ONCE(*lastoff >= poff + PAGE_SIZE)) + return false; + + if (*lastoff < poff) { + /* + * Last offset smaller than the start of the page means we found + * a hole: + */ + if (whence == SEEK_HOLE) + return true; + *lastoff = poff; + } + + /* + * Just check the page unless we can and should check block ranges: + */ + if (bsize == PAGE_SIZE || !ops->is_partially_uptodate) + return PageUptodate(page) == seek_data; + + lock_page(page); + if (unlikely(page->mapping != inode->i_mapping)) + goto out_unlock_not_found; + + for (off = 0; off < PAGE_SIZE; off += bsize) { + if (offset_in_page(*lastoff) >= off + bsize) + continue; + if (ops->is_partially_uptodate(page, off, bsize) == seek_data) { + unlock_page(page); + return true; + } + *lastoff = poff + off + bsize; + } + +out_unlock_not_found: + unlock_page(page); + return false; +} + +/* + * Seek for SEEK_DATA / SEEK_HOLE in the page cache. + * + * Within unwritten extents, the page cache determines which parts are holes + * and which are data: uptodate buffer heads count as data; everything else + * counts as a hole. + * + * Returns the resulting offset on success, and -ENOENT otherwise. + */ +static loff_t +page_cache_seek_hole_data(struct inode *inode, loff_t offset, loff_t length, + int whence) +{ + pgoff_t index = offset >> PAGE_SHIFT; + pgoff_t end = DIV_ROUND_UP(offset + length, PAGE_SIZE); + loff_t lastoff = offset; + struct pagevec pvec; + + if (length <= 0) + return -ENOENT; + + pagevec_init(&pvec); + + do { + unsigned nr_pages, i; + + nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping, &index, + end - 1); + if (nr_pages == 0) + break; + + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + if (page_seek_hole_data(inode, page, &lastoff, whence)) + goto check_range; + lastoff = page_offset(page) + PAGE_SIZE; + } + pagevec_release(&pvec); + } while (index < end); + + /* When no page at lastoff and we are not done, we found a hole. */ + if (whence != SEEK_HOLE) + goto not_found; + +check_range: + if (lastoff < offset + length) + goto out; +not_found: + lastoff = -ENOENT; +out: + pagevec_release(&pvec); + return lastoff; +} + + +static loff_t +iomap_seek_hole_actor(struct inode *inode, loff_t offset, loff_t length, + void *data, struct iomap *iomap) +{ + switch (iomap->type) { + case IOMAP_UNWRITTEN: + offset = page_cache_seek_hole_data(inode, offset, length, + SEEK_HOLE); + if (offset < 0) + return length; + /* fall through */ + case IOMAP_HOLE: + *(loff_t *)data = offset; + return 0; + default: + return length; + } +} + +loff_t +iomap_seek_hole(struct inode *inode, loff_t offset, const struct iomap_ops *ops) +{ + loff_t size = i_size_read(inode); + loff_t length = size - offset; + loff_t ret; + + /* Nothing to be found before or beyond the end of the file.
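/*
 * Usage sketch: a filesystem's ->llseek routes SEEK_HOLE/SEEK_DATA to
 * iomap_seek_hole()/iomap_seek_data() and falls back to the generic
 * helper otherwise ("foofs" and foofs_iomap_ops are hypothetical):
 */
static loff_t foofs_llseek(struct file *file, loff_t offset, int whence)
{
	struct inode *inode = file_inode(file);

	switch (whence) {
	case SEEK_HOLE:
		offset = iomap_seek_hole(inode, offset, &foofs_iomap_ops);
		break;
	case SEEK_DATA:
		offset = iomap_seek_data(inode, offset, &foofs_iomap_ops);
		break;
	default:
		return generic_file_llseek(file, offset, whence);
	}
	if (offset < 0)
		return offset;
	return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
}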
*/ + if (offset < 0 || offset >= size) + return -ENXIO; + + while (length > 0) { + ret = iomap_apply(inode, offset, length, IOMAP_REPORT, ops, + &offset, iomap_seek_hole_actor); + if (ret < 0) + return ret; + if (ret == 0) + break; + + offset += ret; + length -= ret; + } + + return offset; +} +EXPORT_SYMBOL_GPL(iomap_seek_hole); + +static loff_t +iomap_seek_data_actor(struct inode *inode, loff_t offset, loff_t length, + void *data, struct iomap *iomap) +{ + switch (iomap->type) { + case IOMAP_HOLE: + return length; + case IOMAP_UNWRITTEN: + offset = page_cache_seek_hole_data(inode, offset, length, + SEEK_DATA); + if (offset < 0) + return length; + /*FALLTHRU*/ + default: + *(loff_t *)data = offset; + return 0; + } +} + +loff_t +iomap_seek_data(struct inode *inode, loff_t offset, const struct iomap_ops *ops) +{ + loff_t size = i_size_read(inode); + loff_t length = size - offset; + loff_t ret; + + /* Nothing to be found before or beyond the end of the file. */ + if (offset < 0 || offset >= size) + return -ENXIO; + + while (length > 0) { + ret = iomap_apply(inode, offset, length, IOMAP_REPORT, ops, + &offset, iomap_seek_data_actor); + if (ret < 0) + return ret; + if (ret == 0) + break; + + offset += ret; + length -= ret; + } + + if (length <= 0) + return -ENXIO; + return offset; +} +EXPORT_SYMBOL_GPL(iomap_seek_data); diff --git a/fs/iomap/swapfile.c b/fs/iomap/swapfile.c new file mode 100644 index 000000000000..152a230f668d --- /dev/null +++ b/fs/iomap/swapfile.c @@ -0,0 +1,178 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2018 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <darrick.wong@oracle.com> + */ +#include <linux/module.h> +#include <linux/compiler.h> +#include <linux/fs.h> +#include <linux/iomap.h> +#include <linux/swap.h> + +/* Swapfile activation */ + +struct iomap_swapfile_info { + struct iomap iomap; /* accumulated iomap */ + struct swap_info_struct *sis; + uint64_t lowest_ppage; /* lowest physical addr seen (pages) */ + uint64_t highest_ppage; /* highest physical addr seen (pages) */ + unsigned long nr_pages; /* number of pages collected */ + int nr_extents; /* extent count */ +}; + +/* + * Collect physical extents for this swap file. Physical extents reported to + * the swap code must be trimmed to align to a page boundary. The logical + * offset within the file is irrelevant since the swapfile code maps logical + * page numbers of the swap device to the physical page-aligned extents. + */ +static int iomap_swapfile_add_extent(struct iomap_swapfile_info *isi) +{ + struct iomap *iomap = &isi->iomap; + unsigned long nr_pages; + uint64_t first_ppage; + uint64_t first_ppage_reported; + uint64_t next_ppage; + int error; + + /* + * Round the start up and the end down so that the physical + * extent aligns to a page boundary. + */ + first_ppage = ALIGN(iomap->addr, PAGE_SIZE) >> PAGE_SHIFT; + next_ppage = ALIGN_DOWN(iomap->addr + iomap->length, PAGE_SIZE) >> + PAGE_SHIFT; + + /* Skip too-short physical extents. */ + if (first_ppage >= next_ppage) + return 0; + nr_pages = next_ppage - first_ppage; + + /* + * Calculate how much swap space we're adding; the first page contains + * the swap header and doesn't count. The mm still wants that first + * page fed to add_swap_extent, however. 
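/*
 * Worked example of the rounding above, assuming PAGE_SIZE == 4096:
 * an extent with addr == 0x1200 and length == 0x3600 spans
 * [0x1200, 0x4800), so
 *   first_ppage = ALIGN(0x1200, 4096) >> PAGE_SHIFT      == 2
 *   next_ppage  = ALIGN_DOWN(0x4800, 4096) >> PAGE_SHIFT == 4
 * and two whole pages (2 and 3) are usable; a sub-page extent would be
 * skipped entirely. If this extent sits at file offset 0, page 2 holds
 * the swap header, so first_ppage_reported becomes 3.
 */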
+ */ + first_ppage_reported = first_ppage; + if (iomap->offset == 0) + first_ppage_reported++; + if (isi->lowest_ppage > first_ppage_reported) + isi->lowest_ppage = first_ppage_reported; + if (isi->highest_ppage < (next_ppage - 1)) + isi->highest_ppage = next_ppage - 1; + + /* Add extent, set up for the next call. */ + error = add_swap_extent(isi->sis, isi->nr_pages, nr_pages, first_ppage); + if (error < 0) + return error; + isi->nr_extents += error; + isi->nr_pages += nr_pages; + return 0; +} + +/* + * Accumulate iomaps for this swap file. We have to accumulate iomaps because + * swap only cares about contiguous page-aligned physical extents and makes no + * distinction between written and unwritten extents. + */ +static loff_t iomap_swapfile_activate_actor(struct inode *inode, loff_t pos, + loff_t count, void *data, struct iomap *iomap) +{ + struct iomap_swapfile_info *isi = data; + int error; + + switch (iomap->type) { + case IOMAP_MAPPED: + case IOMAP_UNWRITTEN: + /* Only real or unwritten extents. */ + break; + case IOMAP_INLINE: + /* No inline data. */ + pr_err("swapon: file is inline\n"); + return -EINVAL; + default: + pr_err("swapon: file has unallocated extents\n"); + return -EINVAL; + } + + /* No uncommitted metadata or shared blocks. */ + if (iomap->flags & IOMAP_F_DIRTY) { + pr_err("swapon: file is not committed\n"); + return -EINVAL; + } + if (iomap->flags & IOMAP_F_SHARED) { + pr_err("swapon: file has shared extents\n"); + return -EINVAL; + } + + /* Only one bdev per swap file. */ + if (iomap->bdev != isi->sis->bdev) { + pr_err("swapon: file is on multiple devices\n"); + return -EINVAL; + } + + if (isi->iomap.length == 0) { + /* No accumulated extent, so just store it. */ + memcpy(&isi->iomap, iomap, sizeof(isi->iomap)); + } else if (isi->iomap.addr + isi->iomap.length == iomap->addr) { + /* Append this to the accumulated extent. */ + isi->iomap.length += iomap->length; + } else { + /* Otherwise, add the retained iomap and store this one. */ + error = iomap_swapfile_add_extent(isi); + if (error) + return error; + memcpy(&isi->iomap, iomap, sizeof(isi->iomap)); + } + return count; +} + +/* + * Iterate a swap file's iomaps to construct physical extents that can be + * passed to the swapfile subsystem. + */ +int iomap_swapfile_activate(struct swap_info_struct *sis, + struct file *swap_file, sector_t *pagespan, + const struct iomap_ops *ops) +{ + struct iomap_swapfile_info isi = { + .sis = sis, + .lowest_ppage = (sector_t)-1ULL, + }; + struct address_space *mapping = swap_file->f_mapping; + struct inode *inode = mapping->host; + loff_t pos = 0; + loff_t len = ALIGN_DOWN(i_size_read(inode), PAGE_SIZE); + loff_t ret; + + /* + * Persist all file mapping metadata so that we won't have any + * IOMAP_F_DIRTY iomaps. 
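/*
 * Usage sketch: a filesystem's ->swap_activate simply delegates to the
 * helper being defined here ("foofs" and foofs_iomap_ops are
 * hypothetical stand-ins):
 */
static int foofs_swap_activate(struct swap_info_struct *sis,
			       struct file *swap_file, sector_t *span)
{
	return iomap_swapfile_activate(sis, swap_file, span,
				       &foofs_iomap_ops);
}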
+ */ + ret = vfs_fsync(swap_file, 1); + if (ret) + return ret; + + while (len > 0) { + ret = iomap_apply(inode, pos, len, IOMAP_REPORT, + ops, &isi, iomap_swapfile_activate_actor); + if (ret <= 0) + return ret; + + pos += ret; + len -= ret; + } + + if (isi.iomap.length) { + ret = iomap_swapfile_add_extent(&isi); + if (ret) + return ret; + } + + *pagespan = 1 + isi.highest_ppage - isi.lowest_ppage; + sis->max = isi.nr_pages; + sis->pages = isi.nr_pages - 1; + sis->highest_bit = isi.nr_pages - 1; + return isi.nr_extents; +} +EXPORT_SYMBOL_GPL(iomap_swapfile_activate); diff --git a/fs/libfs.c b/fs/libfs.c index 7e52e77692ec..c9b2850c0f7c 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -17,6 +17,8 @@ #include <linux/exportfs.h> #include <linux/writeback.h> #include <linux/buffer_head.h> /* sync_mapping_buffers */ +#include <linux/fs_context.h> +#include <linux/pseudo_fs.h> #include <linux/uaccess.h> @@ -236,34 +238,22 @@ static const struct super_operations simple_super_operations = { .statfs = simple_statfs, }; -/* - * Common helper for pseudo-filesystems (sockfs, pipefs, bdev - stuff that - * will never be mountable) - */ -struct dentry *mount_pseudo_xattr(struct file_system_type *fs_type, char *name, - const struct super_operations *ops, const struct xattr_handler **xattr, - const struct dentry_operations *dops, unsigned long magic) +static int pseudo_fs_fill_super(struct super_block *s, struct fs_context *fc) { - struct super_block *s; - struct dentry *dentry; + struct pseudo_fs_context *ctx = fc->fs_private; struct inode *root; - struct qstr d_name = QSTR_INIT(name, strlen(name)); - - s = sget_userns(fs_type, NULL, set_anon_super, SB_KERNMOUNT|SB_NOUSER, - &init_user_ns, NULL); - if (IS_ERR(s)) - return ERR_CAST(s); s->s_maxbytes = MAX_LFS_FILESIZE; s->s_blocksize = PAGE_SIZE; s->s_blocksize_bits = PAGE_SHIFT; - s->s_magic = magic; - s->s_op = ops ? ops : &simple_super_operations; - s->s_xattr = xattr; + s->s_magic = ctx->magic; + s->s_op = ctx->ops ?: &simple_super_operations; + s->s_xattr = ctx->xattr; s->s_time_gran = 1; root = new_inode(s); if (!root) - goto Enomem; + return -ENOMEM; + /* * since this is the first inode, make it number 1. 
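/*
 * Usage sketch for the fs_context conversion in this hunk: with the
 * init_pseudo() helper added below, a pseudo-filesystem that used to
 * implement ->mount via mount_pseudo_xattr() now provides roughly this
 * (FOO_MAGIC, foo_super_ops and foo_dentry_ops are hypothetical):
 */
static int foo_init_fs_context(struct fs_context *fc)
{
	struct pseudo_fs_context *ctx = init_pseudo(fc, FOO_MAGIC);

	if (!ctx)
		return -ENOMEM;
	ctx->ops = &foo_super_ops;	/* optional, defaults kick in if NULL */
	ctx->dops = &foo_dentry_ops;	/* likewise optional */
	return 0;
}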
New inodes created * after this must take care not to collide with it (by passing @@ -272,22 +262,48 @@ struct dentry *mount_pseudo_xattr(struct file_system_type *fs_type, char *name, root->i_ino = 1; root->i_mode = S_IFDIR | S_IRUSR | S_IWUSR; root->i_atime = root->i_mtime = root->i_ctime = current_time(root); - dentry = __d_alloc(s, &d_name); - if (!dentry) { - iput(root); - goto Enomem; + s->s_root = d_make_root(root); + if (!s->s_root) + return -ENOMEM; + s->s_d_op = ctx->dops; + return 0; +} + +static int pseudo_fs_get_tree(struct fs_context *fc) +{ + return get_tree_nodev(fc, pseudo_fs_fill_super); +} + +static void pseudo_fs_free(struct fs_context *fc) +{ + kfree(fc->fs_private); +} + +static const struct fs_context_operations pseudo_fs_context_ops = { + .free = pseudo_fs_free, + .get_tree = pseudo_fs_get_tree, +}; + +/* + * Common helper for pseudo-filesystems (sockfs, pipefs, bdev - stuff that + * will never be mountable) + */ +struct pseudo_fs_context *init_pseudo(struct fs_context *fc, + unsigned long magic) +{ + struct pseudo_fs_context *ctx; + + ctx = kzalloc(sizeof(struct pseudo_fs_context), GFP_KERNEL); + if (likely(ctx)) { + ctx->magic = magic; + fc->fs_private = ctx; + fc->ops = &pseudo_fs_context_ops; + fc->sb_flags |= SB_NOUSER; + fc->global = true; } - d_instantiate(dentry, root); - s->s_root = dentry; - s->s_d_op = dops; - s->s_flags |= SB_ACTIVE; - return dget(s->s_root); - -Enomem: - deactivate_locked_super(s); - return ERR_PTR(-ENOMEM); + return ctx; } -EXPORT_SYMBOL(mount_pseudo_xattr); +EXPORT_SYMBOL(init_pseudo); int simple_open(struct inode *inode, struct file *file) { diff --git a/fs/mount.h b/fs/mount.h index 6250de544760..711a4093e475 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -58,7 +58,10 @@ struct mount { struct mount *mnt_master; /* slave is on master->mnt_slave_list */ struct mnt_namespace *mnt_ns; /* containing namespace */ struct mountpoint *mnt_mp; /* where is it mounted */ - struct hlist_node mnt_mp_list; /* list mounts with the same mountpoint */ + union { + struct hlist_node mnt_mp_list; /* list mounts with the same mountpoint */ + struct hlist_node mnt_umount; + }; struct list_head mnt_umounting; /* list entry for umount propagation */ #ifdef CONFIG_FSNOTIFY struct fsnotify_mark_connector __rcu *mnt_fsnotify_marks; @@ -68,8 +71,7 @@ struct mount { int mnt_group_id; /* peer group identifier */ int mnt_expiry_mark; /* true if marked for expiry */ struct hlist_head mnt_pins; - struct fs_pin mnt_umount; - struct dentry *mnt_ex_mountpoint; + struct hlist_head mnt_stuck_children; } __randomize_layout; #define MNT_NS_INTERNAL ERR_PTR(-EINVAL) /* distinct from any mnt_namespace */ diff --git a/fs/mpage.c b/fs/mpage.c index 436a85260394..a63620cdb73a 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -647,7 +647,7 @@ alloc_new: * the confused fail path above (OOM) will be very confused when * it finds all bh marked clean (i.e. 
it will not write anything) */ - wbc_account_io(wbc, page, PAGE_SIZE); + wbc_account_cgroup_owner(wbc, page, PAGE_SIZE); length = first_unmapped << blkbits; if (bio_add_page(bio, page, length, 0) < length) { bio = mpage_bio_submit(REQ_OP_WRITE, op_flags, bio); diff --git a/fs/namespace.c b/fs/namespace.c index 6fbc9126367a..6464ea4acba9 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -29,6 +29,7 @@ #include <linux/sched/task.h> #include <uapi/linux/mount.h> #include <linux/fs_context.h> +#include <linux/shmem_fs.h> #include "pnode.h" #include "internal.h" @@ -69,6 +70,8 @@ static struct hlist_head *mount_hashtable __read_mostly; static struct hlist_head *mountpoint_hashtable __read_mostly; static struct kmem_cache *mnt_cache __read_mostly; static DECLARE_RWSEM(namespace_sem); +static HLIST_HEAD(unmounted); /* protected by namespace_sem */ +static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */ /* /sys/fs */ struct kobject *fs_kobj; @@ -169,14 +172,6 @@ unsigned int mnt_get_count(struct mount *mnt) #endif } -static void drop_mountpoint(struct fs_pin *p) -{ - struct mount *m = container_of(p, struct mount, mnt_umount); - dput(m->mnt_ex_mountpoint); - pin_remove(p); - mntput(&m->mnt); -} - static struct mount *alloc_vfsmnt(const char *name) { struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL); @@ -214,7 +209,7 @@ static struct mount *alloc_vfsmnt(const char *name) INIT_LIST_HEAD(&mnt->mnt_slave); INIT_HLIST_NODE(&mnt->mnt_mp_list); INIT_LIST_HEAD(&mnt->mnt_umounting); - init_fs_pin(&mnt->mnt_umount, drop_mountpoint); + INIT_HLIST_HEAD(&mnt->mnt_stuck_children); } return mnt; @@ -739,7 +734,7 @@ mountpoint: /* Add the new mountpoint to the hash table */ read_seqlock_excl(&mount_lock); - new->m_dentry = dentry; + new->m_dentry = dget(dentry); new->m_count = 1; hlist_add_head(&new->m_hash, mp_hash(dentry)); INIT_HLIST_HEAD(&new->m_list); @@ -752,7 +747,11 @@ done: return mp; } -static void put_mountpoint(struct mountpoint *mp) +/* + * vfsmount lock must be held. Additionally, the caller is responsible + * for serializing calls for given disposal list. 
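/*
 * Sketch of the disposal-list pattern this comment sets up: dentries
 * that become droppable while the vfsmount lock is held are parked on a
 * caller-provided list and only released after the lock is dropped
 * (schematic; __put_mountpoint(), dput_to_list() and
 * shrink_dentry_list() are the calls used below):
 */
static void example_drop_mountpoint(struct mountpoint *mp)
{
	LIST_HEAD(list);		/* disposal list on our stack */

	lock_mount_hash();
	__put_mountpoint(mp, &list);	/* queues the dentry via dput_to_list() */
	unlock_mount_hash();

	shrink_dentry_list(&list);	/* the real dput work, lock no longer held */
}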
+ */ +static void __put_mountpoint(struct mountpoint *mp, struct list_head *list) { if (!--mp->m_count) { struct dentry *dentry = mp->m_dentry; @@ -760,11 +759,18 @@ static void put_mountpoint(struct mountpoint *mp) spin_lock(&dentry->d_lock); dentry->d_flags &= ~DCACHE_MOUNTED; spin_unlock(&dentry->d_lock); + dput_to_list(dentry, list); hlist_del(&mp->m_hash); kfree(mp); } } +/* called with namespace_lock and vfsmount lock */ +static void put_mountpoint(struct mountpoint *mp) +{ + __put_mountpoint(mp, &ex_mountpoints); +} + static inline int check_mnt(struct mount *mnt) { return mnt->mnt_ns == current->nsproxy->mnt_ns; @@ -795,25 +801,17 @@ static void __touch_mnt_namespace(struct mnt_namespace *ns) /* * vfsmount lock must be held for write */ -static void unhash_mnt(struct mount *mnt) +static struct mountpoint *unhash_mnt(struct mount *mnt) { + struct mountpoint *mp; mnt->mnt_parent = mnt; mnt->mnt_mountpoint = mnt->mnt.mnt_root; list_del_init(&mnt->mnt_child); hlist_del_init_rcu(&mnt->mnt_hash); hlist_del_init(&mnt->mnt_mp_list); - put_mountpoint(mnt->mnt_mp); + mp = mnt->mnt_mp; mnt->mnt_mp = NULL; -} - -/* - * vfsmount lock must be held for write - */ -static void detach_mnt(struct mount *mnt, struct path *old_path) -{ - old_path->dentry = mnt->mnt_mountpoint; - old_path->mnt = &mnt->mnt_parent->mnt; - unhash_mnt(mnt); + return mp; } /* @@ -821,9 +819,7 @@ static void detach_mnt(struct mount *mnt, struct path *old_path) */ static void umount_mnt(struct mount *mnt) { - /* old mountpoint will be dropped when we can do that */ - mnt->mnt_ex_mountpoint = mnt->mnt_mountpoint; - unhash_mnt(mnt); + put_mountpoint(unhash_mnt(mnt)); } /* @@ -835,7 +831,7 @@ void mnt_set_mountpoint(struct mount *mnt, { mp->m_count++; mnt_add_count(mnt, 1); /* essentially, that's mntget */ - child_mnt->mnt_mountpoint = dget(mp->m_dentry); + child_mnt->mnt_mountpoint = mp->m_dentry; child_mnt->mnt_parent = mnt; child_mnt->mnt_mp = mp; hlist_add_head(&child_mnt->mnt_mp_list, &mp->m_list); @@ -862,7 +858,6 @@ static void attach_mnt(struct mount *mnt, void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct mount *mnt) { struct mountpoint *old_mp = mnt->mnt_mp; - struct dentry *old_mountpoint = mnt->mnt_mountpoint; struct mount *old_parent = mnt->mnt_parent; list_del_init(&mnt->mnt_child); @@ -872,22 +867,6 @@ void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct m attach_mnt(mnt, parent, mp); put_mountpoint(old_mp); - - /* - * Safely avoid even the suggestion this code might sleep or - * lock the mount hash by taking advantage of the knowledge that - * mnt_change_mountpoint will not release the final reference - * to a mountpoint. - * - * During mounting, the mount passed in as the parent mount will - * continue to use the old mountpoint and during unmounting, the - * old mountpoint will continue to exist until namespace_unlock, - * which happens well after mnt_change_mountpoint. - */ - spin_lock(&old_mountpoint->d_lock); - old_mountpoint->d_lockref.count--; - spin_unlock(&old_mountpoint->d_lock); - mnt_add_count(old_parent, -1); } @@ -1102,19 +1081,22 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, static void cleanup_mnt(struct mount *mnt) { + struct hlist_node *p; + struct mount *m; /* - * This probably indicates that somebody messed - * up a mnt_want/drop_write() pair. If this - * happens, the filesystem was probably unable - * to make r/w->r/o transitions. 
- */ - /* + * The warning here probably indicates that somebody messed + * up a mnt_want/drop_write() pair. If this happens, the + * filesystem was probably unable to make r/w->r/o transitions. * The locking used to deal with mnt_count decrement provides barriers, * so mnt_get_writers() below is safe. */ WARN_ON(mnt_get_writers(mnt)); if (unlikely(mnt->mnt_pins.first)) mnt_pin_kill(mnt); + hlist_for_each_entry_safe(m, p, &mnt->mnt_stuck_children, mnt_umount) { + hlist_del(&m->mnt_umount); + mntput(&m->mnt); + } fsnotify_vfsmount_delete(&mnt->mnt); dput(mnt->mnt.mnt_root); deactivate_super(mnt->mnt.mnt_sb); @@ -1140,6 +1122,8 @@ static DECLARE_DELAYED_WORK(delayed_mntput_work, delayed_mntput); static void mntput_no_expire(struct mount *mnt) { + LIST_HEAD(list); + rcu_read_lock(); if (likely(READ_ONCE(mnt->mnt_ns))) { /* @@ -1180,10 +1164,12 @@ static void mntput_no_expire(struct mount *mnt) if (unlikely(!list_empty(&mnt->mnt_mounts))) { struct mount *p, *tmp; list_for_each_entry_safe(p, tmp, &mnt->mnt_mounts, mnt_child) { - umount_mnt(p); + __put_mountpoint(unhash_mnt(p), &list); + hlist_add_head(&p->mnt_umount, &mnt->mnt_stuck_children); } } unlock_mount_hash(); + shrink_dentry_list(&list); if (likely(!(mnt->mnt.mnt_flags & MNT_INTERNAL))) { struct task_struct *task = current; @@ -1369,22 +1355,29 @@ int may_umount(struct vfsmount *mnt) EXPORT_SYMBOL(may_umount); -static HLIST_HEAD(unmounted); /* protected by namespace_sem */ - static void namespace_unlock(void) { struct hlist_head head; + struct hlist_node *p; + struct mount *m; + LIST_HEAD(list); hlist_move_list(&unmounted, &head); + list_splice_init(&ex_mountpoints, &list); up_write(&namespace_sem); + shrink_dentry_list(&list); + if (likely(hlist_empty(&head))) return; synchronize_rcu_expedited(); - group_pin_kill(&head); + hlist_for_each_entry_safe(m, p, &head, mnt_umount) { + hlist_del(&m->mnt_umount); + mntput(&m->mnt); + } } static inline void namespace_lock(void) @@ -1471,8 +1464,6 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how) disconnect = disconnect_mount(p, how); - pin_insert_group(&p->mnt_umount, &p->mnt_parent->mnt, - disconnect ? &unmounted : NULL); if (mnt_has_parent(p)) { mnt_add_count(p->mnt_parent, -1); if (!disconnect) { @@ -1480,6 +1471,7 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how) list_add_tail(&p->mnt_child, &p->mnt_parent->mnt_mounts); } else { umount_mnt(p); + hlist_add_head(&p->mnt_umount, &unmounted); } } change_mnt_propagation(p, MS_PRIVATE); @@ -1625,15 +1617,15 @@ void __detach_mounts(struct dentry *dentry) namespace_lock(); lock_mount_hash(); mp = lookup_mountpoint(dentry); - if (IS_ERR_OR_NULL(mp)) + if (!mp) goto out_unlock; event++; while (!hlist_empty(&mp->m_list)) { mnt = hlist_entry(mp->m_list.first, struct mount, mnt_mp_list); if (mnt->mnt.mnt_flags & MNT_UMOUNT) { - hlist_add_head(&mnt->mnt_umount.s_list, &unmounted); umount_mnt(mnt); + hlist_add_head(&mnt->mnt_umount, &unmounted); } else umount_tree(mnt, UMOUNT_CONNECTED); } @@ -2045,7 +2037,7 @@ int count_mounts(struct mnt_namespace *ns, struct mount *mnt) static int attach_recursive_mnt(struct mount *source_mnt, struct mount *dest_mnt, struct mountpoint *dest_mp, - struct path *parent_path) + bool moving) { struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns; HLIST_HEAD(tree_list); @@ -2063,7 +2055,7 @@ static int attach_recursive_mnt(struct mount *source_mnt, return PTR_ERR(smp); /* Is there space to add these mounts to the mount namespace? 
*/ - if (!parent_path) { + if (!moving) { err = count_mounts(ns, source_mnt); if (err) goto out; @@ -2082,8 +2074,8 @@ static int attach_recursive_mnt(struct mount *source_mnt, } else { lock_mount_hash(); } - if (parent_path) { - detach_mnt(source_mnt, parent_path); + if (moving) { + unhash_mnt(source_mnt); attach_mnt(source_mnt, dest_mnt, dest_mp); touch_mnt_namespace(source_mnt->mnt_ns); } else { @@ -2181,7 +2173,7 @@ static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp) d_is_dir(mnt->mnt.mnt_root)) return -ENOTDIR; - return attach_recursive_mnt(mnt, p, mp, NULL); + return attach_recursive_mnt(mnt, p, mp, false); } /* @@ -2574,11 +2566,11 @@ out: static int do_move_mount(struct path *old_path, struct path *new_path) { - struct path parent_path = {.mnt = NULL, .dentry = NULL}; struct mnt_namespace *ns; struct mount *p; struct mount *old; - struct mountpoint *mp; + struct mount *parent; + struct mountpoint *mp, *old_mp; int err; bool attached; @@ -2588,7 +2580,9 @@ static int do_move_mount(struct path *old_path, struct path *new_path) old = real_mount(old_path->mnt); p = real_mount(new_path->mnt); + parent = old->mnt_parent; attached = mnt_has_parent(old); + old_mp = old->mnt_mp; ns = old->mnt_ns; err = -EINVAL; @@ -2616,7 +2610,7 @@ static int do_move_mount(struct path *old_path, struct path *new_path) /* * Don't move a mount residing in a shared parent. */ - if (attached && IS_MNT_SHARED(old->mnt_parent)) + if (attached && IS_MNT_SHARED(parent)) goto out; /* * Don't move a mount tree containing unbindable mounts to a destination @@ -2632,18 +2626,21 @@ static int do_move_mount(struct path *old_path, struct path *new_path) goto out; err = attach_recursive_mnt(old, real_mount(new_path->mnt), mp, - attached ? &parent_path : NULL); + attached); if (err) goto out; /* if the mount is moved, it should no longer be expire * automatically */ list_del_init(&old->mnt_expire); + if (attached) + put_mountpoint(old_mp); out: unlock_mount(mp); if (!err) { - path_put(&parent_path); - if (!attached) + if (attached) + mntput_no_expire(parent); + else free_mnt_ns(ns); } return err; @@ -2788,6 +2785,8 @@ static int do_new_mount(struct path *path, const char *fstype, int sb_flags, err = vfs_parse_fs_string(fc, "source", name, strlen(name)); if (!err) err = parse_monolithic_mount_data(fc, data); + if (!err && !mount_capable(fc)) + err = -EPERM; if (!err) err = vfs_get_tree(fc); if (!err) @@ -3295,8 +3294,8 @@ struct dentry *mount_subtree(struct vfsmount *m, const char *name) } EXPORT_SYMBOL(mount_subtree); -int ksys_mount(char __user *dev_name, char __user *dir_name, char __user *type, - unsigned long flags, void __user *data) +int ksys_mount(const char __user *dev_name, const char __user *dir_name, + const char __user *type, unsigned long flags, void __user *data) { int ret; char *kernel_type; @@ -3586,8 +3585,8 @@ EXPORT_SYMBOL(path_is_under); SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, const char __user *, put_old) { - struct path new, old, parent_path, root_parent, root; - struct mount *new_mnt, *root_mnt, *old_mnt; + struct path new, old, root; + struct mount *new_mnt, *root_mnt, *old_mnt, *root_parent, *ex_parent; struct mountpoint *old_mp, *root_mp; int error; @@ -3616,9 +3615,11 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, new_mnt = real_mount(new.mnt); root_mnt = real_mount(root.mnt); old_mnt = real_mount(old.mnt); + ex_parent = new_mnt->mnt_parent; + root_parent = root_mnt->mnt_parent; if (IS_MNT_SHARED(old_mnt) || - 
IS_MNT_SHARED(new_mnt->mnt_parent) || - IS_MNT_SHARED(root_mnt->mnt_parent)) + IS_MNT_SHARED(ex_parent) || + IS_MNT_SHARED(root_parent)) goto out4; if (!check_mnt(root_mnt) || !check_mnt(new_mnt)) goto out4; @@ -3635,7 +3636,6 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, goto out4; /* not a mountpoint */ if (!mnt_has_parent(root_mnt)) goto out4; /* not attached */ - root_mp = root_mnt->mnt_mp; if (new.mnt->mnt_root != new.dentry) goto out4; /* not a mountpoint */ if (!mnt_has_parent(new_mnt)) @@ -3646,10 +3646,9 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, /* make certain new is below the root */ if (!is_path_reachable(new_mnt, new.dentry, &root)) goto out4; - root_mp->m_count++; /* pin it so it won't go away */ lock_mount_hash(); - detach_mnt(new_mnt, &parent_path); - detach_mnt(root_mnt, &root_parent); + umount_mnt(new_mnt); + root_mp = unhash_mnt(root_mnt); /* we'll need its mountpoint */ if (root_mnt->mnt.mnt_flags & MNT_LOCKED) { new_mnt->mnt.mnt_flags |= MNT_LOCKED; root_mnt->mnt.mnt_flags &= ~MNT_LOCKED; @@ -3657,7 +3656,8 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, /* mount old root on put_old */ attach_mnt(root_mnt, old_mnt, old_mp); /* mount new_root on / */ - attach_mnt(new_mnt, real_mount(root_parent.mnt), root_mp); + attach_mnt(new_mnt, root_parent, root_mp); + mnt_add_count(root_parent, -1); touch_mnt_namespace(current->nsproxy->mnt_ns); /* A moved mount should not expire automatically */ list_del_init(&new_mnt->mnt_expire); @@ -3667,10 +3667,8 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, error = 0; out4: unlock_mount(old_mp); - if (!error) { - path_put(&root_parent); - path_put(&parent_path); - } + if (!error) + mntput_no_expire(ex_parent); out3: path_put(&root); out2: @@ -3687,13 +3685,8 @@ static void __init init_mount_tree(void) struct mount *m; struct mnt_namespace *ns; struct path root; - struct file_system_type *type; - type = get_fs_type("rootfs"); - if (!type) - panic("Can't find rootfs type"); - mnt = vfs_kern_mount(type, 0, "rootfs", NULL); - put_filesystem(type); + mnt = vfs_kern_mount(&rootfs_fs_type, 0, "rootfs", NULL); if (IS_ERR(mnt)) panic("Can't create rootfs"); @@ -3746,6 +3739,7 @@ void __init mnt_init(void) fs_kobj = kobject_create_and_add("fs", NULL); if (!fs_kobj) printk(KERN_WARNING "%s: kobj create error\n", __func__); + shmem_init(); init_rootfs(); init_mount_tree(); } diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile index c587e3c4c6a6..34cdeaecccf6 100644 --- a/fs/nfs/Makefile +++ b/fs/nfs/Makefile @@ -8,7 +8,8 @@ obj-$(CONFIG_NFS_FS) += nfs.o CFLAGS_nfstrace.o += -I$(src) nfs-y := client.o dir.o file.o getroot.o inode.o super.o \ io.o direct.o pagelist.o read.o symlink.o unlink.o \ - write.o namespace.o mount_clnt.o nfstrace.o export.o + write.o namespace.o mount_clnt.o nfstrace.o \ + export.o sysfs.o nfs-$(CONFIG_ROOT_NFS) += nfsroot.o nfs-$(CONFIG_SYSCTL) += sysctl.o nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index 315967354954..f39924ba050b 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -414,27 +414,39 @@ static __be32 validate_seqid(const struct nfs4_slot_table *tbl, const struct nfs4_slot *slot, const struct cb_sequenceargs * args) { + __be32 ret; + + ret = cpu_to_be32(NFS4ERR_BADSLOT); if (args->csa_slotid > tbl->server_highest_slotid) - return htonl(NFS4ERR_BADSLOT); + goto out_err; /* Replay */ if (args->csa_sequenceid == slot->seq_nr) { + ret = cpu_to_be32(NFS4ERR_DELAY); if 
(nfs4_test_locked_slot(tbl, slot->slot_nr)) - return htonl(NFS4ERR_DELAY); + goto out_err; + /* Signal process_op to set this error on next op */ + ret = cpu_to_be32(NFS4ERR_RETRY_UNCACHED_REP); if (args->csa_cachethis == 0) - return htonl(NFS4ERR_RETRY_UNCACHED_REP); + goto out_err; /* Liar! We never allowed you to set csa_cachethis != 0 */ - return htonl(NFS4ERR_SEQ_FALSE_RETRY); + ret = cpu_to_be32(NFS4ERR_SEQ_FALSE_RETRY); + goto out_err; } /* Note: wraparound relies on seq_nr being of type u32 */ - if (likely(args->csa_sequenceid == slot->seq_nr + 1)) - return htonl(NFS4_OK); - /* Misordered request */ - return htonl(NFS4ERR_SEQ_MISORDERED); + ret = cpu_to_be32(NFS4ERR_SEQ_MISORDERED); + if (args->csa_sequenceid != slot->seq_nr + 1) + goto out_err; + + return cpu_to_be32(NFS4_OK); + +out_err: + trace_nfs4_cb_seqid_err(args, ret); + return ret; } /* diff --git a/fs/nfs/client.c b/fs/nfs/client.c index d7e4f0848e28..30838304a0bf 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -49,6 +49,7 @@ #include "pnfs.h" #include "nfs.h" #include "netns.h" +#include "sysfs.h" #define NFSDBG_FACILITY NFSDBG_CLIENT @@ -175,6 +176,7 @@ struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init) clp->cl_rpcclient = ERR_PTR(-EINVAL); clp->cl_proto = cl_init->proto; + clp->cl_nconnect = cl_init->nconnect; clp->cl_net = get_net(cl_init->net); clp->cl_principal = "*"; @@ -192,7 +194,7 @@ error_0: EXPORT_SYMBOL_GPL(nfs_alloc_client); #if IS_ENABLED(CONFIG_NFS_V4) -void nfs_cleanup_cb_ident_idr(struct net *net) +static void nfs_cleanup_cb_ident_idr(struct net *net) { struct nfs_net *nn = net_generic(net, nfs_net_id); @@ -214,7 +216,7 @@ static void pnfs_init_server(struct nfs_server *server) } #else -void nfs_cleanup_cb_ident_idr(struct net *net) +static void nfs_cleanup_cb_ident_idr(struct net *net) { } @@ -406,10 +408,10 @@ struct nfs_client *nfs_get_client(const struct nfs_client_initdata *cl_init) clp = nfs_match_client(cl_init); if (clp) { spin_unlock(&nn->nfs_client_lock); - if (IS_ERR(clp)) - return clp; if (new) new->rpc_ops->free_client(new); + if (IS_ERR(clp)) + return clp; return nfs_found_client(cl_init, clp); } if (new) { @@ -493,6 +495,7 @@ int nfs_create_rpc_client(struct nfs_client *clp, struct rpc_create_args args = { .net = clp->cl_net, .protocol = clp->cl_proto, + .nconnect = clp->cl_nconnect, .address = (struct sockaddr *)&clp->cl_addr, .addrsize = clp->cl_addrlen, .timeout = cl_init->timeparms, @@ -658,6 +661,7 @@ static int nfs_init_server(struct nfs_server *server, .net = data->net, .timeparms = &timeparms, .cred = server->cred, + .nconnect = data->nfs_server.nconnect, }; struct nfs_client *clp; int error; @@ -1072,6 +1076,18 @@ void nfs_clients_init(struct net *net) #endif spin_lock_init(&nn->nfs_client_lock); nn->boot_time = ktime_get_real(); + + nfs_netns_sysfs_setup(nn, net); +} + +void nfs_clients_exit(struct net *net) +{ + struct nfs_net *nn = net_generic(net, nfs_net_id); + + nfs_netns_sysfs_destroy(nn); + nfs_cleanup_cb_ident_idr(net); + WARN_ON_ONCE(!list_empty(&nn->nfs_client_list)); + WARN_ON_ONCE(!list_empty(&nn->nfs_volume_list)); } #ifdef CONFIG_PROC_FS diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 57b6a45576ad..8d501093660f 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -80,6 +80,10 @@ static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct inode *dir ctx->dup_cookie = 0; ctx->cred = get_cred(cred); spin_lock(&dir->i_lock); + if (list_empty(&nfsi->open_files) && + (nfsi->cache_validity & NFS_INO_DATA_INVAL_DEFER)) + 
nfsi->cache_validity |= NFS_INO_INVALID_DATA | + NFS_INO_REVAL_FORCED; list_add(&ctx->list, &nfsi->open_files); spin_unlock(&dir->i_lock); return ctx; @@ -140,19 +144,12 @@ struct nfs_cache_array { struct nfs_cache_array_entry array[0]; }; -struct readdirvec { - unsigned long nr; - unsigned long index; - struct page *pages[NFS_MAX_READDIR_RAPAGES]; -}; - typedef int (*decode_dirent_t)(struct xdr_stream *, struct nfs_entry *, bool); typedef struct { struct file *file; struct page *page; struct dir_context *ctx; unsigned long page_index; - struct readdirvec pvec; u64 *dir_cookie; u64 last_cookie; loff_t current_index; @@ -532,10 +529,6 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en struct nfs_cache_array *array; unsigned int count = 0; int status; - int max_rapages = NFS_MAX_READDIR_RAPAGES; - - desc->pvec.index = desc->page_index; - desc->pvec.nr = 0; scratch = alloc_page(GFP_KERNEL); if (scratch == NULL) @@ -560,40 +553,20 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en if (desc->plus) nfs_prime_dcache(file_dentry(desc->file), entry); - status = nfs_readdir_add_to_array(entry, desc->pvec.pages[desc->pvec.nr]); - if (status == -ENOSPC) { - desc->pvec.nr++; - if (desc->pvec.nr == max_rapages) - break; - status = nfs_readdir_add_to_array(entry, desc->pvec.pages[desc->pvec.nr]); - } + status = nfs_readdir_add_to_array(entry, page); if (status != 0) break; } while (!entry->eof); - /* - * page and desc->pvec.pages[0] are valid, don't need to check - * whether or not to be NULL. - */ - copy_highpage(page, desc->pvec.pages[0]); - out_nopages: if (count == 0 || (status == -EBADCOOKIE && entry->eof != 0)) { - array = kmap_atomic(desc->pvec.pages[desc->pvec.nr]); + array = kmap(page); array->eof_index = array->size; status = 0; - kunmap_atomic(array); + kunmap(page); } put_page(scratch); - - /* - * desc->pvec.nr > 0 means at least one page was completely filled, - * we should return -ENOSPC. Otherwise function - * nfs_readdir_xdr_to_array will enter infinite loop. - */ - if (desc->pvec.nr > 0) - return -ENOSPC; return status; } @@ -627,24 +600,6 @@ out_freepages: return -ENOMEM; } -/* - * nfs_readdir_rapages_init initialize rapages by nfs_cache_array structure. - */ -static -void nfs_readdir_rapages_init(nfs_readdir_descriptor_t *desc) -{ - struct nfs_cache_array *array; - int max_rapages = NFS_MAX_READDIR_RAPAGES; - int index; - - for (index = 0; index < max_rapages; index++) { - array = kmap_atomic(desc->pvec.pages[index]); - memset(array, 0, sizeof(struct nfs_cache_array)); - array->eof_index = -1; - kunmap_atomic(array); - } -} - static int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, struct inode *inode) { @@ -655,12 +610,6 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, int status = -ENOMEM; unsigned int array_size = ARRAY_SIZE(pages); - /* - * This means we hit readdir rdpages miss, the preallocated rdpages - * are useless, the preallocate rdpages should be reinitialized. - */ - nfs_readdir_rapages_init(desc); - entry.prev_cookie = 0; entry.cookie = desc->last_cookie; entry.eof = 0; @@ -721,24 +670,9 @@ int nfs_readdir_filler(void *data, struct page* page) struct inode *inode = file_inode(desc->file); int ret; - /* - * If desc->page_index in range desc->pvec.index and - * desc->pvec.index + desc->pvec.nr, we get readdir cache hit. 
- */ - if (desc->page_index >= desc->pvec.index && - desc->page_index < (desc->pvec.index + desc->pvec.nr)) { - /* - * page and desc->pvec.pages[x] are valid, don't need to check - * whether or not to be NULL. - */ - copy_highpage(page, desc->pvec.pages[desc->page_index - desc->pvec.index]); - ret = 0; - } else { - ret = nfs_readdir_xdr_to_array(desc, page, inode); - if (ret < 0) - goto error; - } - + ret = nfs_readdir_xdr_to_array(desc, page, inode); + if (ret < 0) + goto error; SetPageUptodate(page); if (invalidate_inode_pages2_range(inode->i_mapping, page->index + 1, -1) < 0) { @@ -903,7 +837,6 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) *desc = &my_desc; struct nfs_open_dir_context *dir_ctx = file->private_data; int res = 0; - int max_rapages = NFS_MAX_READDIR_RAPAGES; dfprintk(FILE, "NFS: readdir(%pD2) starting at cookie %llu\n", file, (long long)ctx->pos); @@ -923,12 +856,6 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) desc->decode = NFS_PROTO(inode)->decode_dirent; desc->plus = nfs_use_readdirplus(inode, ctx); - res = nfs_readdir_alloc_pages(desc->pvec.pages, max_rapages); - if (res < 0) - return -ENOMEM; - - nfs_readdir_rapages_init(desc); - if (ctx->pos == 0 || nfs_attribute_cache_expired(inode)) res = nfs_revalidate_mapping(inode, file->f_mapping); if (res < 0) @@ -964,7 +891,6 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) break; } while (!desc->eof); out: - nfs_readdir_free_pages(desc->pvec.pages, max_rapages); if (res > 0) res = 0; dfprintk(FILE, "NFS: readdir(%pD2) returns %d\n", file, res); diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index bcff3bf5ae09..b04e20d28162 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -934,6 +934,10 @@ out_nolseg: if (pgio->pg_error < 0) return; out_mds: + trace_pnfs_mds_fallback_pg_init_read(pgio->pg_inode, + 0, NFS4_MAX_UINT64, IOMODE_READ, + NFS_I(pgio->pg_inode)->layout, + pgio->pg_lseg); pnfs_put_lseg(pgio->pg_lseg); pgio->pg_lseg = NULL; nfs_pageio_reset_read_mds(pgio); @@ -1000,6 +1004,10 @@ retry: return; out_mds: + trace_pnfs_mds_fallback_pg_init_write(pgio->pg_inode, + 0, NFS4_MAX_UINT64, IOMODE_RW, + NFS_I(pgio->pg_inode)->layout, + pgio->pg_lseg); pnfs_put_lseg(pgio->pg_lseg); pgio->pg_lseg = NULL; nfs_pageio_reset_write_mds(pgio); @@ -1026,6 +1034,10 @@ ff_layout_pg_get_mirror_count_write(struct nfs_pageio_descriptor *pgio, if (pgio->pg_lseg) return FF_LAYOUT_MIRROR_COUNT(pgio->pg_lseg); + trace_pnfs_mds_fallback_pg_get_mirror_count(pgio->pg_inode, + 0, NFS4_MAX_UINT64, IOMODE_RW, + NFS_I(pgio->pg_inode)->layout, + pgio->pg_lseg); /* no lseg means that pnfs is not in use, so no mirroring here */ nfs_pageio_reset_write_mds(pgio); out: @@ -1075,6 +1087,10 @@ static void ff_layout_reset_write(struct nfs_pgio_header *hdr, bool retry_pnfs) hdr->args.count, (unsigned long long)hdr->args.offset); + trace_pnfs_mds_fallback_write_done(hdr->inode, + hdr->args.offset, hdr->args.count, + IOMODE_RW, NFS_I(hdr->inode)->layout, + hdr->lseg); task->tk_status = pnfs_write_done_resend_to_mds(hdr); } } @@ -1094,6 +1110,10 @@ static void ff_layout_reset_read(struct nfs_pgio_header *hdr) hdr->args.count, (unsigned long long)hdr->args.offset); + trace_pnfs_mds_fallback_read_done(hdr->inode, + hdr->args.offset, hdr->args.count, + IOMODE_READ, NFS_I(hdr->inode)->layout, + hdr->lseg); task->tk_status = pnfs_read_done_resend_to_mds(hdr); } } @@ -1827,6 +1847,9 @@ 
ff_layout_read_pagelist(struct nfs_pgio_header *hdr) out_failed: if (ff_layout_avoid_mds_available_ds(lseg)) return PNFS_TRY_AGAIN; + trace_pnfs_mds_fallback_read_pagelist(hdr->inode, + hdr->args.offset, hdr->args.count, + IOMODE_READ, NFS_I(hdr->inode)->layout, lseg); return PNFS_NOT_ATTEMPTED; } @@ -1892,6 +1915,9 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync) out_failed: if (ff_layout_avoid_mds_available_ds(lseg)) return PNFS_TRY_AGAIN; + trace_pnfs_mds_fallback_write_pagelist(hdr->inode, + hdr->args.offset, hdr->args.count, + IOMODE_RW, NFS_I(hdr->inode)->layout, lseg); return PNFS_NOT_ATTEMPTED; } diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c index 19f856f45689..3eda40a320a5 100644 --- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c +++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c @@ -257,7 +257,7 @@ int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo, if (status == 0) return 0; - if (mirror->mirror_ds == NULL) + if (IS_ERR_OR_NULL(mirror->mirror_ds)) return -EINVAL; dserr = kmalloc(sizeof(*dserr), gfp_flags); diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 0b4a1a974411..8a1758200b57 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -51,6 +51,7 @@ #include "pnfs.h" #include "nfs.h" #include "netns.h" +#include "sysfs.h" #include "nfstrace.h" @@ -208,7 +209,7 @@ static void nfs_set_cache_invalid(struct inode *inode, unsigned long flags) } if (inode->i_mapping->nrpages == 0) - flags &= ~NFS_INO_INVALID_DATA; + flags &= ~(NFS_INO_INVALID_DATA|NFS_INO_DATA_INVAL_DEFER); nfsi->cache_validity |= flags; if (flags & NFS_INO_INVALID_DATA) nfs_fscache_invalidate(inode); @@ -652,7 +653,8 @@ static int nfs_vmtruncate(struct inode * inode, loff_t offset) i_size_write(inode, offset); /* Optimisation */ if (offset == 0) - NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_DATA; + NFS_I(inode)->cache_validity &= ~(NFS_INO_INVALID_DATA | + NFS_INO_DATA_INVAL_DEFER); NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_SIZE; spin_unlock(&inode->i_lock); @@ -1032,6 +1034,10 @@ void nfs_inode_attach_open_context(struct nfs_open_context *ctx) struct nfs_inode *nfsi = NFS_I(inode); spin_lock(&inode->i_lock); + if (list_empty(&nfsi->open_files) && + (nfsi->cache_validity & NFS_INO_DATA_INVAL_DEFER)) + nfsi->cache_validity |= NFS_INO_INVALID_DATA | + NFS_INO_REVAL_FORCED; list_add_tail_rcu(&ctx->list, &nfsi->open_files); spin_unlock(&inode->i_lock); } @@ -1100,6 +1106,7 @@ int nfs_open(struct inode *inode, struct file *filp) nfs_fscache_open_file(inode, filp); return 0; } +EXPORT_SYMBOL_GPL(nfs_open); /* * This function is called whenever some part of NFS notices that @@ -1312,7 +1319,8 @@ int nfs_revalidate_mapping(struct inode *inode, set_bit(NFS_INO_INVALIDATING, bitlock); smp_wmb(); - nfsi->cache_validity &= ~NFS_INO_INVALID_DATA; + nfsi->cache_validity &= ~(NFS_INO_INVALID_DATA| + NFS_INO_DATA_INVAL_DEFER); spin_unlock(&inode->i_lock); trace_nfs_invalidate_mapping_enter(inode); ret = nfs_invalidate_mapping(inode, mapping); @@ -1870,7 +1878,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) dprintk("NFS: change_attr change on server for file %s/%ld\n", inode->i_sb->s_id, inode->i_ino); - } + } else if (!have_delegation) + nfsi->cache_validity |= NFS_INO_DATA_INVAL_DEFER; inode_set_iversion_raw(inode, fattr->change_attr); attr_changed = true; } @@ -2159,12 +2168,8 @@ static int nfs_net_init(struct net *net) static void nfs_net_exit(struct net *net) { - struct nfs_net *nn = net_generic(net, nfs_net_id); 
- nfs_fs_proc_net_exit(net); - nfs_cleanup_cb_ident_idr(net); - WARN_ON_ONCE(!list_empty(&nn->nfs_client_list)); - WARN_ON_ONCE(!list_empty(&nn->nfs_volume_list)); + nfs_clients_exit(net); } static struct pernet_operations nfs_net_ops = { @@ -2181,6 +2186,10 @@ static int __init init_nfs_fs(void) { int err; + err = nfs_sysfs_init(); + if (err < 0) + goto out10; + err = register_pernet_subsys(&nfs_net_ops); if (err < 0) goto out9; @@ -2244,6 +2253,8 @@ out7: out8: unregister_pernet_subsys(&nfs_net_ops); out9: + nfs_sysfs_exit(); +out10: return err; } @@ -2260,6 +2271,7 @@ static void __exit exit_nfs_fs(void) unregister_nfs_fs(); nfs_fs_proc_exit(); nfsiod_stop(); + nfs_sysfs_exit(); } /* Not quite true; I just maintain it */ diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 498fab72f70b..a2346a2f8361 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -69,8 +69,7 @@ struct nfs_clone_mount { * Maximum number of pages that readdir can use for creating * a vmapped array of pages. */ -#define NFS_MAX_READDIR_PAGES 64 -#define NFS_MAX_READDIR_RAPAGES 8 +#define NFS_MAX_READDIR_PAGES 8 struct nfs_client_initdata { unsigned long init_flags; @@ -82,6 +81,7 @@ struct nfs_client_initdata { struct nfs_subversion *nfs_mod; int proto; u32 minorversion; + unsigned int nconnect; struct net *net; const struct rpc_timeout *timeparms; const struct cred *cred; @@ -123,6 +123,7 @@ struct nfs_parsed_mount_data { char *export_path; int port; unsigned short protocol; + unsigned short nconnect; } nfs_server; void *lsm_opts; @@ -158,6 +159,7 @@ extern void nfs_umount(const struct nfs_mount_request *info); /* client.c */ extern const struct rpc_program nfs_program; extern void nfs_clients_init(struct net *net); +extern void nfs_clients_exit(struct net *net); extern struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *); int nfs_create_rpc_client(struct nfs_client *, const struct nfs_client_initdata *, rpc_authflavor_t); struct nfs_client *nfs_get_client(const struct nfs_client_initdata *); @@ -170,7 +172,6 @@ int nfs_init_server_rpcclient(struct nfs_server *, const struct rpc_timeout *t, struct nfs_server *nfs_alloc_server(void); void nfs_server_copy_userdata(struct nfs_server *, struct nfs_server *); -extern void nfs_cleanup_cb_ident_idr(struct net *); extern void nfs_put_client(struct nfs_client *); extern void nfs_free_client(struct nfs_client *); extern struct nfs_client *nfs4_find_client_ident(struct net *, int); diff --git a/fs/nfs/netns.h b/fs/nfs/netns.h index fc9978c58265..c8374f74dce1 100644 --- a/fs/nfs/netns.h +++ b/fs/nfs/netns.h @@ -15,6 +15,8 @@ struct bl_dev_msg { uint32_t major, minor; }; +struct nfs_netns_client; + struct nfs_net { struct cache_detail *nfs_dns_resolve; struct rpc_pipe *bl_device_pipe; @@ -29,6 +31,7 @@ struct nfs_net { unsigned short nfs_callback_tcpport6; int cb_users[NFS4_MAX_MINOR_VERSION + 1]; #endif + struct nfs_netns_client *nfs_client; spinlock_t nfs_client_lock; ktime_t boot_time; #ifdef CONFIG_PROC_FS diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c index 572794dab4b1..cbc17a203248 100644 --- a/fs/nfs/nfs2xdr.c +++ b/fs/nfs/nfs2xdr.c @@ -151,7 +151,7 @@ static int decode_stat(struct xdr_stream *xdr, enum nfs_stat *status) return 0; out_status: *status = be32_to_cpup(p); - trace_nfs_xdr_status((int)*status); + trace_nfs_xdr_status(xdr, (int)*status); return 0; } diff --git a/fs/nfs/nfs3client.c b/fs/nfs/nfs3client.c index fb0c425b5d45..148ceb74d27c 100644 --- a/fs/nfs/nfs3client.c +++ b/fs/nfs/nfs3client.c @@ -102,6 +102,9 @@ struct nfs_client 
*nfs3_set_ds_client(struct nfs_server *mds_srv, return ERR_PTR(-EINVAL); cl_init.hostname = buf; + if (mds_clp->cl_nconnect > 1 && ds_proto == XPRT_TRANSPORT_TCP) + cl_init.nconnect = mds_clp->cl_nconnect; + if (mds_srv->flags & NFS_MOUNT_NORESVPORT) set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags); diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index abbbdde97e31..602767850b36 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -343,7 +343,7 @@ static int decode_nfsstat3(struct xdr_stream *xdr, enum nfs_stat *status) return 0; out_status: *status = be32_to_cpup(p); - trace_nfs_xdr_status((int)*status); + trace_nfs_xdr_status(xdr, (int)*status); return 0; } diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 8a38a254f516..d778dad9a75e 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -312,12 +312,12 @@ extern int nfs4_set_rw_stateid(nfs4_stateid *stateid, const struct nfs_lock_context *l_ctx, fmode_t fmode); +extern int nfs4_proc_get_lease_time(struct nfs_client *clp, + struct nfs_fsinfo *fsinfo); #if defined(CONFIG_NFS_V4_1) extern int nfs41_sequence_done(struct rpc_task *, struct nfs4_sequence_res *); extern int nfs4_proc_create_session(struct nfs_client *, const struct cred *); extern int nfs4_proc_destroy_session(struct nfs4_session *, const struct cred *); -extern int nfs4_proc_get_lease_time(struct nfs_client *clp, - struct nfs_fsinfo *fsinfo); extern int nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, bool sync); extern int nfs4_detect_session_trunking(struct nfs_client *clp, diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 81b9b6d7927a..616393a01c06 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -859,7 +859,8 @@ static int nfs4_set_client(struct nfs_server *server, const size_t addrlen, const char *ip_addr, int proto, const struct rpc_timeout *timeparms, - u32 minorversion, struct net *net) + u32 minorversion, unsigned int nconnect, + struct net *net) { struct nfs_client_initdata cl_init = { .hostname = hostname, @@ -875,6 +876,8 @@ static int nfs4_set_client(struct nfs_server *server, }; struct nfs_client *clp; + if (minorversion > 0 && proto == XPRT_TRANSPORT_TCP) + cl_init.nconnect = nconnect; if (server->flags & NFS_MOUNT_NORESVPORT) set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags); if (server->options & NFS_OPTION_MIGRATION) @@ -941,6 +944,9 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv, return ERR_PTR(-EINVAL); cl_init.hostname = buf; + if (mds_clp->cl_nconnect > 1 && ds_proto == XPRT_TRANSPORT_TCP) + cl_init.nconnect = mds_clp->cl_nconnect; + if (mds_srv->flags & NFS_MOUNT_NORESVPORT) __set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags); @@ -1074,6 +1080,7 @@ static int nfs4_init_server(struct nfs_server *server, data->nfs_server.protocol, &timeparms, data->minorversion, + data->nfs_server.nconnect, data->net); if (error < 0) return error; @@ -1163,6 +1170,7 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data, XPRT_TRANSPORT_RDMA, parent_server->client->cl_timeout, parent_client->cl_mvops->minor_version, + parent_client->cl_nconnect, parent_client->cl_net); if (!error) goto init_server; @@ -1176,6 +1184,7 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data, XPRT_TRANSPORT_TCP, parent_server->client->cl_timeout, parent_client->cl_mvops->minor_version, + parent_client->cl_nconnect, parent_client->cl_net); if (error < 0) goto error; @@ -1271,7 +1280,8 @@ int nfs4_update_server(struct nfs_server *server, const char *hostname, set_bit(NFS_MIG_TSM_POSSIBLE, 
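The hunks above plumb the new nconnect setting from the parsed mount data into nfs_client_initdata and, from there, into the client's cl_nconnect; pNFS data-server clients inherit the value from the MDS client. Note the common guard: extra connections are only requested over TCP, and the nfs4_set_client() path further restricts them to minorversion > 0. Condensed from the code above:

/* A DS client reuses the MDS client's connection count, but only when
 * the MDS actually multiplexes (> 1) and the DS transport is TCP. */
if (mds_clp->cl_nconnect > 1 && ds_proto == XPRT_TRANSPORT_TCP)
        cl_init.nconnect = mds_clp->cl_nconnect;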
&server->mig_status); error = nfs4_set_client(server, hostname, sap, salen, buf, clp->cl_proto, clnt->cl_timeout, - clp->cl_minorversion, net); + clp->cl_minorversion, + clp->cl_nconnect, net); clear_bit(NFS_MIG_TSM_POSSIBLE, &server->mig_status); if (error != 0) { nfs_server_insert_lists(server); diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index f4157eb1f69d..96db471ca2e5 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -49,7 +49,7 @@ nfs4_file_open(struct inode *inode, struct file *filp) return err; if ((openflags & O_ACCMODE) == 3) - openflags--; + return nfs_open(inode, filp); /* We can't create new files here */ openflags &= ~(O_CREAT|O_EXCL); @@ -204,7 +204,11 @@ static loff_t nfs42_remap_file_range(struct file *src_file, loff_t src_off, bool same_inode = false; int ret; - if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY)) + /* NFS does not support deduplication. */ + if (remap_flags & REMAP_FILE_DEDUP) + return -EOPNOTSUPP; + + if (remap_flags & ~REMAP_FILE_ADVISORY) return -EINVAL; /* check alignment w.r.t. clone_blksize */ diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 6418cb6c079b..39896afc6edf 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -428,6 +428,22 @@ static int nfs4_delay(long *timeout, bool interruptible) return nfs4_delay_killable(timeout); } +static const nfs4_stateid * +nfs4_recoverable_stateid(const nfs4_stateid *stateid) +{ + if (!stateid) + return NULL; + switch (stateid->type) { + case NFS4_OPEN_STATEID_TYPE: + case NFS4_LOCK_STATEID_TYPE: + case NFS4_DELEGATION_STATEID_TYPE: + return stateid; + default: + break; + } + return NULL; +} + /* This is the error handling routine for processes that are allowed * to sleep. */ @@ -436,7 +452,7 @@ static int nfs4_do_handle_exception(struct nfs_server *server, { struct nfs_client *clp = server->nfs_client; struct nfs4_state *state = exception->state; - const nfs4_stateid *stateid = exception->stateid; + const nfs4_stateid *stateid; struct inode *inode = exception->inode; int ret = errorcode; @@ -444,8 +460,9 @@ static int nfs4_do_handle_exception(struct nfs_server *server, exception->recovering = 0; exception->retry = 0; + stateid = nfs4_recoverable_stateid(exception->stateid); if (stateid == NULL && state != NULL) - stateid = &state->stateid; + stateid = nfs4_recoverable_stateid(&state->stateid); switch(errorcode) { case 0: @@ -1165,6 +1182,18 @@ static bool nfs4_clear_cap_atomic_open_v1(struct nfs_server *server, return true; } +static fmode_t _nfs4_ctx_to_accessmode(const struct nfs_open_context *ctx) +{ + return ctx->mode & (FMODE_READ|FMODE_WRITE|FMODE_EXEC); +} + +static fmode_t _nfs4_ctx_to_openmode(const struct nfs_open_context *ctx) +{ + fmode_t ret = ctx->mode & (FMODE_READ|FMODE_WRITE); + + return (ctx->mode & FMODE_EXEC) ? 
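The nfs4_file_open() hunk retires a long-standing hack. Linux reserves the nonstandard access mode 3 in the open(2) flags to mean "check read and write permission, but return a descriptor usable only for ioctl()". The old code quietly decremented it to O_RDWR (2) before the NFSv4 atomic open; the new code routes such opens through the generic nfs_open() path instead. The companion nfs42_remap_file_range() hunk makes the lack of dedup support explicit: REMAP_FILE_DEDUP now fails with -EOPNOTSUPP rather than being lumped in with invalid flags. A userspace illustration of mode 3 (the path is hypothetical):

#include <fcntl.h>

int main(void)
{
        /* access mode 3 (binary 11): permission check plus an
         * ioctl-only descriptor, per open(2); it is none of
         * O_RDONLY / O_WRONLY / O_RDWR */
        int fd = open("/mnt/nfs/some-device-node", 3);

        return fd < 0;
}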
FMODE_READ | ret : ret; +} + static u32 nfs4_map_atomic_open_share(struct nfs_server *server, fmode_t fmode, int openflags) @@ -2900,14 +2929,13 @@ static unsigned nfs4_exclusive_attrset(struct nfs4_opendata *opendata, } static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, - fmode_t fmode, - int flags, - struct nfs_open_context *ctx) + int flags, struct nfs_open_context *ctx) { struct nfs4_state_owner *sp = opendata->owner; struct nfs_server *server = sp->so_server; struct dentry *dentry; struct nfs4_state *state; + fmode_t acc_mode = _nfs4_ctx_to_accessmode(ctx); unsigned int seq; int ret; @@ -2946,7 +2974,8 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, /* Parse layoutget results before we check for access */ pnfs_parse_lgopen(state->inode, opendata->lgp, ctx); - ret = nfs4_opendata_access(sp->so_cred, opendata, state, fmode, flags); + ret = nfs4_opendata_access(sp->so_cred, opendata, state, + acc_mode, flags); if (ret != 0) goto out; @@ -2978,7 +3007,7 @@ static int _nfs4_do_open(struct inode *dir, struct dentry *dentry = ctx->dentry; const struct cred *cred = ctx->cred; struct nfs4_threshold **ctx_th = &ctx->mdsthreshold; - fmode_t fmode = ctx->mode & (FMODE_READ|FMODE_WRITE|FMODE_EXEC); + fmode_t fmode = _nfs4_ctx_to_openmode(ctx); enum open_claim_type4 claim = NFS4_OPEN_CLAIM_NULL; struct iattr *sattr = c->sattr; struct nfs4_label *label = c->label; @@ -3024,7 +3053,7 @@ static int _nfs4_do_open(struct inode *dir, if (d_really_is_positive(dentry)) opendata->state = nfs4_get_open_state(d_inode(dentry), sp); - status = _nfs4_open_and_get_state(opendata, fmode, flags, ctx); + status = _nfs4_open_and_get_state(opendata, flags, ctx); if (status != 0) goto err_free_label; state = ctx->state; @@ -3594,9 +3623,9 @@ static void nfs4_close_context(struct nfs_open_context *ctx, int is_sync) if (ctx->state == NULL) return; if (is_sync) - nfs4_close_sync(ctx->state, ctx->mode); + nfs4_close_sync(ctx->state, _nfs4_ctx_to_openmode(ctx)); else - nfs4_close_state(ctx->state, ctx->mode); + nfs4_close_state(ctx->state, _nfs4_ctx_to_openmode(ctx)); } #define FATTR4_WORD1_NFS40_MASK (2*FATTR4_WORD1_MOUNTED_ON_FILEID - 1UL) @@ -5980,7 +6009,7 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, .rpc_message = &msg, .callback_ops = &nfs4_setclientid_ops, .callback_data = &setclientid, - .flags = RPC_TASK_TIMEOUT, + .flags = RPC_TASK_TIMEOUT | RPC_TASK_NO_ROUND_ROBIN, }; int status; @@ -6046,7 +6075,8 @@ int nfs4_proc_setclientid_confirm(struct nfs_client *clp, dprintk("NFS call setclientid_confirm auth=%s, (client ID %llx)\n", clp->cl_rpcclient->cl_auth->au_ops->au_name, clp->cl_clientid); - status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); + status = rpc_call_sync(clp->cl_rpcclient, &msg, + RPC_TASK_TIMEOUT | RPC_TASK_NO_ROUND_ROBIN); trace_nfs4_setclientid_confirm(clp, status); dprintk("NFS reply setclientid_confirm: %d\n", status); return status; @@ -7627,7 +7657,7 @@ static int _nfs4_proc_secinfo(struct inode *dir, const struct qstr *name, struct NFS_SP4_MACH_CRED_SECINFO, &clnt, &msg); status = nfs4_call_sync(clnt, NFS_SERVER(dir), &msg, &args.seq_args, - &res.seq_res, 0); + &res.seq_res, RPC_TASK_NO_ROUND_ROBIN); dprintk("NFS reply secinfo: %d\n", status); put_cred(cred); @@ -7965,7 +7995,7 @@ nfs4_run_exchange_id(struct nfs_client *clp, const struct cred *cred, .rpc_client = clp->cl_rpcclient, .callback_ops = &nfs4_exchange_id_call_ops, .rpc_message = &msg, - .flags = RPC_TASK_TIMEOUT, + .flags = RPC_TASK_TIMEOUT | 
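The _nfs4_ctx_to_accessmode()/_nfs4_ctx_to_openmode() pair separates two uses of the open context's mode. The NFSv4 OPEN operation only carries READ/WRITE share-access bits, so an execute-only open must go on the wire as a READ open, while the local access check still tests FMODE_EXEC. Just as important, nfs4_close_context() now closes with the same translated open mode, so the CLOSE releases exactly the share mode the OPEN claimed. In miniature (ctx as in the helpers above):

/* what the client checks locally vs. what the server sees */
fmode_t acc_mode  = ctx->mode & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
fmode_t open_mode = ctx->mode & (FMODE_READ | FMODE_WRITE);

if (ctx->mode & FMODE_EXEC)     /* exec implies a READ open on the wire */
        open_mode |= FMODE_READ;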
RPC_TASK_NO_ROUND_ROBIN, }; struct nfs41_exchange_id_data *calldata; int status; @@ -8190,7 +8220,8 @@ static int _nfs4_proc_destroy_clientid(struct nfs_client *clp, }; int status; - status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); + status = rpc_call_sync(clp->cl_rpcclient, &msg, + RPC_TASK_TIMEOUT | RPC_TASK_NO_ROUND_ROBIN); trace_nfs4_destroy_clientid(clp, status); if (status) dprintk("NFS: Got error %d from the server %s on " @@ -8241,6 +8272,8 @@ out: return ret; } +#endif /* CONFIG_NFS_V4_1 */ + struct nfs4_get_lease_time_data { struct nfs4_get_lease_time_args *args; struct nfs4_get_lease_time_res *res; @@ -8273,7 +8306,7 @@ static void nfs4_get_lease_time_done(struct rpc_task *task, void *calldata) (struct nfs4_get_lease_time_data *)calldata; dprintk("--> %s\n", __func__); - if (!nfs41_sequence_done(task, &data->res->lr_seq_res)) + if (!nfs4_sequence_done(task, &data->res->lr_seq_res)) return; switch (task->tk_status) { case -NFS4ERR_DELAY: @@ -8331,6 +8364,8 @@ int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo) return status; } +#ifdef CONFIG_NFS_V4_1 + /* * Initialize the values to be used by the client in CREATE_SESSION * If nfs4_init_session set the fore channel request and response sizes, @@ -8345,6 +8380,7 @@ static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args, { unsigned int max_rqst_sz, max_resp_sz; unsigned int max_bc_payload = rpc_max_bc_payload(clnt); + unsigned int max_bc_slots = rpc_num_bc_slots(clnt); max_rqst_sz = NFS_MAX_FILE_IO_SIZE + nfs41_maxwrite_overhead; max_resp_sz = NFS_MAX_FILE_IO_SIZE + nfs41_maxread_overhead; @@ -8367,6 +8403,8 @@ static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args, args->bc_attrs.max_resp_sz_cached = 0; args->bc_attrs.max_ops = NFS4_MAX_BACK_CHANNEL_OPS; args->bc_attrs.max_reqs = max_t(unsigned short, max_session_cb_slots, 1); + if (args->bc_attrs.max_reqs > max_bc_slots) + args->bc_attrs.max_reqs = max_bc_slots; dprintk("%s: Back Channel : max_rqst_sz=%u max_resp_sz=%u " "max_resp_sz_cached=%u max_ops=%u max_reqs=%u\n", @@ -8469,7 +8507,8 @@ static int _nfs4_proc_create_session(struct nfs_client *clp, nfs4_init_channel_attrs(&args, clp->cl_rpcclient); args.flags = (SESSION4_PERSIST | SESSION4_BACK_CHAN); - status = rpc_call_sync(session->clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); + status = rpc_call_sync(session->clp->cl_rpcclient, &msg, + RPC_TASK_TIMEOUT | RPC_TASK_NO_ROUND_ROBIN); trace_nfs4_create_session(clp, status); switch (status) { @@ -8545,7 +8584,8 @@ int nfs4_proc_destroy_session(struct nfs4_session *session, if (!test_and_clear_bit(NFS4_SESSION_ESTABLISHED, &session->session_state)) return 0; - status = rpc_call_sync(session->clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); + status = rpc_call_sync(session->clp->cl_rpcclient, &msg, + RPC_TASK_TIMEOUT | RPC_TASK_NO_ROUND_ROBIN); trace_nfs4_destroy_session(session->clp, status); if (status) @@ -8799,7 +8839,7 @@ static int nfs41_proc_reclaim_complete(struct nfs_client *clp, .rpc_client = clp->cl_rpcclient, .rpc_message = &msg, .callback_ops = &nfs4_reclaim_complete_call_ops, - .flags = RPC_TASK_ASYNC, + .flags = RPC_TASK_ASYNC | RPC_TASK_NO_ROUND_ROBIN, }; int status = -ENOMEM; @@ -9318,7 +9358,7 @@ _nfs41_proc_secinfo_no_name(struct nfs_server *server, struct nfs_fh *fhandle, dprintk("--> %s\n", __func__); status = nfs4_call_sync(clnt, server, &msg, &args.seq_args, - &res.seq_res, 0); + &res.seq_res, RPC_TASK_NO_ROUND_ROBIN); dprintk("<-- %s status=%d\n", __func__, status); 
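The RPC_TASK_NO_ROUND_ROBIN additions are the flip side of nconnect: once a client owns several cloned transports, ordinary tasks are spread across them, but state-management operations (SETCLIENTID and its confirm, EXCHANGE_ID, CREATE_SESSION, DESTROY_SESSION, DESTROY_CLIENTID, RECLAIM_COMPLETE, the SECINFO variants) must stay on the principal transport, so each such call site gains the flag. The same CREATE_SESSION hunk also clamps the advertised backchannel slot count to rpc_num_bc_slots(clnt). The recurring pattern:

/* pin a state-management RPC to the main transport instead of the
 * round-robin transport pool */
status = rpc_call_sync(clp->cl_rpcclient, &msg,
                       RPC_TASK_TIMEOUT | RPC_TASK_NO_ROUND_ROBIN);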
put_cred(cred); diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index e2e3c4f04d3e..9afd051a4876 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -87,6 +87,27 @@ const nfs4_stateid current_stateid = { static DEFINE_MUTEX(nfs_clid_init_mutex); +static int nfs4_setup_state_renewal(struct nfs_client *clp) +{ + int status; + struct nfs_fsinfo fsinfo; + unsigned long now; + + if (!test_bit(NFS_CS_CHECK_LEASE_TIME, &clp->cl_res_state)) { + nfs4_schedule_state_renewal(clp); + return 0; + } + + now = jiffies; + status = nfs4_proc_get_lease_time(clp, &fsinfo); + if (status == 0) { + nfs4_set_lease_period(clp, fsinfo.lease_time * HZ, now); + nfs4_schedule_state_renewal(clp); + } + + return status; +} + int nfs4_init_clientid(struct nfs_client *clp, const struct cred *cred) { struct nfs4_setclientid_res clid = { @@ -114,7 +135,7 @@ do_confirm: if (status != 0) goto out; clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state); - nfs4_schedule_state_renewal(clp); + nfs4_setup_state_renewal(clp); out: return status; } @@ -286,34 +307,13 @@ static int nfs4_begin_drain_session(struct nfs_client *clp) #if defined(CONFIG_NFS_V4_1) -static int nfs41_setup_state_renewal(struct nfs_client *clp) -{ - int status; - struct nfs_fsinfo fsinfo; - unsigned long now; - - if (!test_bit(NFS_CS_CHECK_LEASE_TIME, &clp->cl_res_state)) { - nfs4_schedule_state_renewal(clp); - return 0; - } - - now = jiffies; - status = nfs4_proc_get_lease_time(clp, &fsinfo); - if (status == 0) { - nfs4_set_lease_period(clp, fsinfo.lease_time * HZ, now); - nfs4_schedule_state_renewal(clp); - } - - return status; -} - static void nfs41_finish_session_reset(struct nfs_client *clp) { clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state); clear_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state); /* create_session negotiated new slot table */ clear_bit(NFS4CLNT_BIND_CONN_TO_SESSION, &clp->cl_state); - nfs41_setup_state_renewal(clp); + nfs4_setup_state_renewal(clp); } int nfs41_init_clientid(struct nfs_client *clp, const struct cred *cred) @@ -1064,8 +1064,7 @@ int nfs4_select_rw_stateid(struct nfs4_state *state, * choose to use. */ goto out; - nfs4_copy_open_stateid(dst, state); - ret = 0; + ret = nfs4_copy_open_stateid(dst, state) ? 
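In the nfs4state.c hunks above, nfs41_setup_state_renewal() is hoisted out of the CONFIG_NFS_V4_1 block and renamed nfs4_setup_state_renewal(), so the NFSv4.0 path also probes the server's lease time (via GET_LEASE_TIME, reclassified later in this diff) instead of blindly arming the renewal timer with a default. The v4.0 call site after the change, condensed:

/* after SETCLIENTID_CONFIRM succeeds: fetch the lease time if it is
 * still unchecked, then schedule state renewal */
clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state);
nfs4_setup_state_renewal(clp);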
0 : -EAGAIN; out: if (nfs_server_capable(state->inode, NFS_CAP_STATEID_NFSV41)) dst->seqid = 0; diff --git a/fs/nfs/nfs4trace.c b/fs/nfs/nfs4trace.c index e9fb3e50a999..1a8f376b3f73 100644 --- a/fs/nfs/nfs4trace.c +++ b/fs/nfs/nfs4trace.c @@ -16,4 +16,12 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(nfs4_pnfs_read); EXPORT_TRACEPOINT_SYMBOL_GPL(nfs4_pnfs_write); EXPORT_TRACEPOINT_SYMBOL_GPL(nfs4_pnfs_commit_ds); + +EXPORT_TRACEPOINT_SYMBOL_GPL(pnfs_mds_fallback_pg_init_read); +EXPORT_TRACEPOINT_SYMBOL_GPL(pnfs_mds_fallback_pg_init_write); +EXPORT_TRACEPOINT_SYMBOL_GPL(pnfs_mds_fallback_pg_get_mirror_count); +EXPORT_TRACEPOINT_SYMBOL_GPL(pnfs_mds_fallback_read_done); +EXPORT_TRACEPOINT_SYMBOL_GPL(pnfs_mds_fallback_write_done); +EXPORT_TRACEPOINT_SYMBOL_GPL(pnfs_mds_fallback_read_pagelist); +EXPORT_TRACEPOINT_SYMBOL_GPL(pnfs_mds_fallback_write_pagelist); #endif diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h index cd1a5c08da9a..b2f395fa7350 100644 --- a/fs/nfs/nfs4trace.h +++ b/fs/nfs/nfs4trace.h @@ -156,7 +156,7 @@ TRACE_DEFINE_ENUM(NFS4ERR_WRONG_TYPE); TRACE_DEFINE_ENUM(NFS4ERR_XDEV); #define show_nfsv4_errors(error) \ - __print_symbolic(-(error), \ + __print_symbolic(error, \ { NFS4_OK, "OK" }, \ /* Mapped by nfs4_stat_to_errno() */ \ { EPERM, "EPERM" }, \ @@ -348,7 +348,7 @@ DECLARE_EVENT_CLASS(nfs4_clientid_event, TP_STRUCT__entry( __string(dstaddr, clp->cl_hostname) - __field(int, error) + __field(unsigned long, error) ), TP_fast_assign( @@ -357,8 +357,8 @@ DECLARE_EVENT_CLASS(nfs4_clientid_event, ), TP_printk( - "error=%d (%s) dstaddr=%s", - __entry->error, + "error=%ld (%s) dstaddr=%s", + -__entry->error, show_nfsv4_errors(__entry->error), __get_str(dstaddr) ) @@ -420,7 +420,7 @@ TRACE_EVENT(nfs4_sequence_done, __field(unsigned int, highest_slotid) __field(unsigned int, target_highest_slotid) __field(unsigned int, status_flags) - __field(int, error) + __field(unsigned long, error) ), TP_fast_assign( @@ -435,10 +435,10 @@ TRACE_EVENT(nfs4_sequence_done, __entry->error = res->sr_status; ), TP_printk( - "error=%d (%s) session=0x%08x slot_nr=%u seq_nr=%u " + "error=%ld (%s) session=0x%08x slot_nr=%u seq_nr=%u " "highest_slotid=%u target_highest_slotid=%u " "status_flags=%u (%s)", - __entry->error, + -__entry->error, show_nfsv4_errors(__entry->error), __entry->session, __entry->slot_nr, @@ -467,7 +467,7 @@ TRACE_EVENT(nfs4_cb_sequence, __field(unsigned int, seq_nr) __field(unsigned int, highest_slotid) __field(unsigned int, cachethis) - __field(int, error) + __field(unsigned long, error) ), TP_fast_assign( @@ -476,13 +476,13 @@ TRACE_EVENT(nfs4_cb_sequence, __entry->seq_nr = args->csa_sequenceid; __entry->highest_slotid = args->csa_highestslotid; __entry->cachethis = args->csa_cachethis; - __entry->error = -be32_to_cpu(status); + __entry->error = be32_to_cpu(status); ), TP_printk( - "error=%d (%s) session=0x%08x slot_nr=%u seq_nr=%u " + "error=%ld (%s) session=0x%08x slot_nr=%u seq_nr=%u " "highest_slotid=%u", - __entry->error, + -__entry->error, show_nfsv4_errors(__entry->error), __entry->session, __entry->slot_nr, @@ -490,6 +490,44 @@ TRACE_EVENT(nfs4_cb_sequence, __entry->highest_slotid ) ); + +TRACE_EVENT(nfs4_cb_seqid_err, + TP_PROTO( + const struct cb_sequenceargs *args, + __be32 status + ), + TP_ARGS(args, status), + + TP_STRUCT__entry( + __field(unsigned int, session) + __field(unsigned int, slot_nr) + __field(unsigned int, seq_nr) + __field(unsigned int, highest_slotid) + __field(unsigned int, cachethis) + __field(unsigned long, error) + ), + + TP_fast_assign( + __entry->session = 
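Most of the nfs4trace.h churn that follows is one mechanical convention change: error fields become unsigned long holding the positive NFS4ERR or errno value (matching __print_symbolic()/__print_flags(), which operate on unsigned long), and TP_printk() prints -__entry->error as %ld next to the symbolic name, so the rendered trace still shows the familiar negative errno. The recurring shape, excerpted (call sites differ in where the sign flips):

__field(unsigned long, error)      /* positive NFS4ERR or errno value */
/* ... */
__entry->error = -error;           /* 'error' arrives as a negative errno */
/* ... */
TP_printk("error=%ld (%s)",
          -__entry->error, show_nfsv4_errors(__entry->error))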
nfs_session_id_hash(&args->csa_sessionid); + __entry->slot_nr = args->csa_slotid; + __entry->seq_nr = args->csa_sequenceid; + __entry->highest_slotid = args->csa_highestslotid; + __entry->cachethis = args->csa_cachethis; + __entry->error = be32_to_cpu(status); + ), + + TP_printk( + "error=%ld (%s) session=0x%08x slot_nr=%u seq_nr=%u " + "highest_slotid=%u", + -__entry->error, + show_nfsv4_errors(__entry->error), + __entry->session, + __entry->slot_nr, + __entry->seq_nr, + __entry->highest_slotid + ) +); + #endif /* CONFIG_NFS_V4_1 */ TRACE_EVENT(nfs4_setup_sequence, @@ -526,26 +564,37 @@ TRACE_EVENT(nfs4_setup_sequence, TRACE_EVENT(nfs4_xdr_status, TP_PROTO( + const struct xdr_stream *xdr, u32 op, int error ), - TP_ARGS(op, error), + TP_ARGS(xdr, op, error), TP_STRUCT__entry( + __field(unsigned int, task_id) + __field(unsigned int, client_id) + __field(u32, xid) __field(u32, op) - __field(int, error) + __field(unsigned long, error) ), TP_fast_assign( + const struct rpc_rqst *rqstp = xdr->rqst; + const struct rpc_task *task = rqstp->rq_task; + + __entry->task_id = task->tk_pid; + __entry->client_id = task->tk_client->cl_clid; + __entry->xid = be32_to_cpu(rqstp->rq_xid); __entry->op = op; - __entry->error = -error; + __entry->error = error; ), TP_printk( - "operation %d: nfs status %d (%s)", - __entry->op, - __entry->error, show_nfsv4_errors(__entry->error) + "task:%u@%d xid=0x%08x error=%ld (%s) operation=%u", + __entry->task_id, __entry->client_id, __entry->xid, + -__entry->error, show_nfsv4_errors(__entry->error), + __entry->op ) ); @@ -559,7 +608,7 @@ DECLARE_EVENT_CLASS(nfs4_open_event, TP_ARGS(ctx, flags, error), TP_STRUCT__entry( - __field(int, error) + __field(unsigned long, error) __field(unsigned int, flags) __field(unsigned int, fmode) __field(dev_t, dev) @@ -577,7 +626,7 @@ DECLARE_EVENT_CLASS(nfs4_open_event, const struct nfs4_state *state = ctx->state; const struct inode *inode = NULL; - __entry->error = error; + __entry->error = -error; __entry->flags = flags; __entry->fmode = (__force unsigned int)ctx->mode; __entry->dev = ctx->dentry->d_sb->s_dev; @@ -609,11 +658,11 @@ DECLARE_EVENT_CLASS(nfs4_open_event, ), TP_printk( - "error=%d (%s) flags=%d (%s) fmode=%s " + "error=%ld (%s) flags=%d (%s) fmode=%s " "fileid=%02x:%02x:%llu fhandle=0x%08x " "name=%02x:%02x:%llu/%s stateid=%d:0x%08x " "openstateid=%d:0x%08x", - __entry->error, + -__entry->error, show_nfsv4_errors(__entry->error), __entry->flags, show_open_flags(__entry->flags), @@ -695,7 +744,7 @@ TRACE_EVENT(nfs4_close, __field(u32, fhandle) __field(u64, fileid) __field(unsigned int, fmode) - __field(int, error) + __field(unsigned long, error) __field(int, stateid_seq) __field(u32, stateid_hash) ), @@ -715,9 +764,9 @@ TRACE_EVENT(nfs4_close, ), TP_printk( - "error=%d (%s) fmode=%s fileid=%02x:%02x:%llu " + "error=%ld (%s) fmode=%s fileid=%02x:%02x:%llu " "fhandle=0x%08x openstateid=%d:0x%08x", - __entry->error, + -__entry->error, show_nfsv4_errors(__entry->error), __entry->fmode ? 
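nfs4_xdr_status (above) now takes the struct xdr_stream purely so the tracepoint can reach back through xdr->rqst to the owning rpc_task and record the task id, client id and XID; a decode failure in a trace log can then be matched to the exact RPC rather than just an operation number. With the event enabled under tracefs (events/nfs4/nfs4_xdr_status), a line would render roughly as follows; every value here is made up for illustration:

task:42@3 xid=0x2f08a1b4 error=-10008 (DELAY) operation=53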
show_fmode_flags(__entry->fmode) : "closed", @@ -757,7 +806,7 @@ DECLARE_EVENT_CLASS(nfs4_lock_event, TP_ARGS(request, state, cmd, error), TP_STRUCT__entry( - __field(int, error) + __field(unsigned long, error) __field(int, cmd) __field(char, type) __field(loff_t, start) @@ -787,10 +836,10 @@ DECLARE_EVENT_CLASS(nfs4_lock_event, ), TP_printk( - "error=%d (%s) cmd=%s:%s range=%lld:%lld " + "error=%ld (%s) cmd=%s:%s range=%lld:%lld " "fileid=%02x:%02x:%llu fhandle=0x%08x " "stateid=%d:0x%08x", - __entry->error, + -__entry->error, show_nfsv4_errors(__entry->error), show_lock_cmd(__entry->cmd), show_lock_type(__entry->type), @@ -827,7 +876,7 @@ TRACE_EVENT(nfs4_set_lock, TP_ARGS(request, state, lockstateid, cmd, error), TP_STRUCT__entry( - __field(int, error) + __field(unsigned long, error) __field(int, cmd) __field(char, type) __field(loff_t, start) @@ -863,10 +912,10 @@ TRACE_EVENT(nfs4_set_lock, ), TP_printk( - "error=%d (%s) cmd=%s:%s range=%lld:%lld " + "error=%ld (%s) cmd=%s:%s range=%lld:%lld " "fileid=%02x:%02x:%llu fhandle=0x%08x " "stateid=%d:0x%08x lockstateid=%d:0x%08x", - __entry->error, + -__entry->error, show_nfsv4_errors(__entry->error), show_lock_cmd(__entry->cmd), show_lock_type(__entry->type), @@ -932,7 +981,7 @@ TRACE_EVENT(nfs4_delegreturn_exit, TP_STRUCT__entry( __field(dev_t, dev) __field(u32, fhandle) - __field(int, error) + __field(unsigned long, error) __field(int, stateid_seq) __field(u32, stateid_hash) ), @@ -948,9 +997,9 @@ TRACE_EVENT(nfs4_delegreturn_exit, ), TP_printk( - "error=%d (%s) dev=%02x:%02x fhandle=0x%08x " + "error=%ld (%s) dev=%02x:%02x fhandle=0x%08x " "stateid=%d:0x%08x", - __entry->error, + -__entry->error, show_nfsv4_errors(__entry->error), MAJOR(__entry->dev), MINOR(__entry->dev), __entry->fhandle, @@ -969,7 +1018,7 @@ DECLARE_EVENT_CLASS(nfs4_test_stateid_event, TP_ARGS(state, lsp, error), TP_STRUCT__entry( - __field(int, error) + __field(unsigned long, error) __field(dev_t, dev) __field(u32, fhandle) __field(u64, fileid) @@ -991,9 +1040,9 @@ DECLARE_EVENT_CLASS(nfs4_test_stateid_event, ), TP_printk( - "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " + "error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " "stateid=%d:0x%08x", - __entry->error, + -__entry->error, show_nfsv4_errors(__entry->error), MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, @@ -1026,7 +1075,7 @@ DECLARE_EVENT_CLASS(nfs4_lookup_event, TP_STRUCT__entry( __field(dev_t, dev) - __field(int, error) + __field(unsigned long, error) __field(u64, dir) __string(name, name->name) ), @@ -1034,13 +1083,13 @@ DECLARE_EVENT_CLASS(nfs4_lookup_event, TP_fast_assign( __entry->dev = dir->i_sb->s_dev; __entry->dir = NFS_FILEID(dir); - __entry->error = error; + __entry->error = -error; __assign_str(name, name->name); ), TP_printk( - "error=%d (%s) name=%02x:%02x:%llu/%s", - __entry->error, + "error=%ld (%s) name=%02x:%02x:%llu/%s", + -__entry->error, show_nfsv4_errors(__entry->error), MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->dir, @@ -1076,7 +1125,7 @@ TRACE_EVENT(nfs4_lookupp, TP_STRUCT__entry( __field(dev_t, dev) __field(u64, ino) - __field(int, error) + __field(unsigned long, error) ), TP_fast_assign( @@ -1086,8 +1135,8 @@ TRACE_EVENT(nfs4_lookupp, ), TP_printk( - "error=%d (%s) inode=%02x:%02x:%llu", - __entry->error, + "error=%ld (%s) inode=%02x:%02x:%llu", + -__entry->error, show_nfsv4_errors(__entry->error), MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->ino @@ -1107,7 +1156,7 @@ 
TRACE_EVENT(nfs4_rename, TP_STRUCT__entry( __field(dev_t, dev) - __field(int, error) + __field(unsigned long, error) __field(u64, olddir) __string(oldname, oldname->name) __field(u64, newdir) @@ -1124,9 +1173,9 @@ TRACE_EVENT(nfs4_rename, ), TP_printk( - "error=%d (%s) oldname=%02x:%02x:%llu/%s " + "error=%ld (%s) oldname=%02x:%02x:%llu/%s " "newname=%02x:%02x:%llu/%s", - __entry->error, + -__entry->error, show_nfsv4_errors(__entry->error), MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->olddir, @@ -1149,19 +1198,19 @@ DECLARE_EVENT_CLASS(nfs4_inode_event, __field(dev_t, dev) __field(u32, fhandle) __field(u64, fileid) - __field(int, error) + __field(unsigned long, error) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->fileid = NFS_FILEID(inode); __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); - __entry->error = error; + __entry->error = error < 0 ? -error : 0; ), TP_printk( - "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x", - __entry->error, + "error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x", + -__entry->error, show_nfsv4_errors(__entry->error), MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, @@ -1200,7 +1249,7 @@ DECLARE_EVENT_CLASS(nfs4_inode_stateid_event, __field(dev_t, dev) __field(u32, fhandle) __field(u64, fileid) - __field(int, error) + __field(unsigned long, error) __field(int, stateid_seq) __field(u32, stateid_hash) ), @@ -1217,9 +1266,9 @@ DECLARE_EVENT_CLASS(nfs4_inode_stateid_event, ), TP_printk( - "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " + "error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " "stateid=%d:0x%08x", - __entry->error, + -__entry->error, show_nfsv4_errors(__entry->error), MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, @@ -1257,7 +1306,7 @@ DECLARE_EVENT_CLASS(nfs4_getattr_event, __field(u32, fhandle) __field(u64, fileid) __field(unsigned int, valid) - __field(int, error) + __field(unsigned long, error) ), TP_fast_assign( @@ -1269,9 +1318,9 @@ DECLARE_EVENT_CLASS(nfs4_getattr_event, ), TP_printk( - "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " + "error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " "valid=%s", - __entry->error, + -__entry->error, show_nfsv4_errors(__entry->error), MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, @@ -1304,7 +1353,7 @@ DECLARE_EVENT_CLASS(nfs4_inode_callback_event, TP_ARGS(clp, fhandle, inode, error), TP_STRUCT__entry( - __field(int, error) + __field(unsigned long, error) __field(dev_t, dev) __field(u32, fhandle) __field(u64, fileid) @@ -1325,9 +1374,9 @@ DECLARE_EVENT_CLASS(nfs4_inode_callback_event, ), TP_printk( - "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " + "error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " "dstaddr=%s", - __entry->error, + -__entry->error, show_nfsv4_errors(__entry->error), MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, @@ -1359,7 +1408,7 @@ DECLARE_EVENT_CLASS(nfs4_inode_stateid_callback_event, TP_ARGS(clp, fhandle, inode, stateid, error), TP_STRUCT__entry( - __field(int, error) + __field(unsigned long, error) __field(dev_t, dev) __field(u32, fhandle) __field(u64, fileid) @@ -1386,9 +1435,9 @@ DECLARE_EVENT_CLASS(nfs4_inode_stateid_callback_event, ), TP_printk( - "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " + "error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " "stateid=%d:0x%08x dstaddr=%s", - __entry->error, + -__entry->error, show_nfsv4_errors(__entry->error), MAJOR(__entry->dev), 
MINOR(__entry->dev), (unsigned long long)__entry->fileid, @@ -1422,7 +1471,7 @@ DECLARE_EVENT_CLASS(nfs4_idmap_event, TP_ARGS(name, len, id, error), TP_STRUCT__entry( - __field(int, error) + __field(unsigned long, error) __field(u32, id) __dynamic_array(char, name, len > 0 ? len + 1 : 1) ), @@ -1437,8 +1486,8 @@ DECLARE_EVENT_CLASS(nfs4_idmap_event, ), TP_printk( - "error=%d id=%u name=%s", - __entry->error, + "error=%ld (%s) id=%u name=%s", + -__entry->error, show_nfsv4_errors(__entry->error), __entry->id, __get_str(name) ) @@ -1471,7 +1520,7 @@ DECLARE_EVENT_CLASS(nfs4_read_event, __field(u64, fileid) __field(loff_t, offset) __field(size_t, count) - __field(int, error) + __field(unsigned long, error) __field(int, stateid_seq) __field(u32, stateid_hash) ), @@ -1485,7 +1534,7 @@ DECLARE_EVENT_CLASS(nfs4_read_event, __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); __entry->offset = hdr->args.offset; __entry->count = hdr->args.count; - __entry->error = error; + __entry->error = error < 0 ? -error : 0; __entry->stateid_seq = be32_to_cpu(state->stateid.seqid); __entry->stateid_hash = @@ -1493,9 +1542,9 @@ DECLARE_EVENT_CLASS(nfs4_read_event, ), TP_printk( - "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " + "error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " "offset=%lld count=%zu stateid=%d:0x%08x", - __entry->error, + -__entry->error, show_nfsv4_errors(__entry->error), MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, @@ -1531,7 +1580,7 @@ DECLARE_EVENT_CLASS(nfs4_write_event, __field(u64, fileid) __field(loff_t, offset) __field(size_t, count) - __field(int, error) + __field(unsigned long, error) __field(int, stateid_seq) __field(u32, stateid_hash) ), @@ -1545,7 +1594,7 @@ DECLARE_EVENT_CLASS(nfs4_write_event, __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); __entry->offset = hdr->args.offset; __entry->count = hdr->args.count; - __entry->error = error; + __entry->error = error < 0 ? 
-error : 0; __entry->stateid_seq = be32_to_cpu(state->stateid.seqid); __entry->stateid_hash = @@ -1553,9 +1602,9 @@ DECLARE_EVENT_CLASS(nfs4_write_event, ), TP_printk( - "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " + "error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " "offset=%lld count=%zu stateid=%d:0x%08x", - __entry->error, + -__entry->error, show_nfsv4_errors(__entry->error), MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, @@ -1592,7 +1641,7 @@ DECLARE_EVENT_CLASS(nfs4_commit_event, __field(u64, fileid) __field(loff_t, offset) __field(size_t, count) - __field(int, error) + __field(unsigned long, error) ), TP_fast_assign( @@ -1606,9 +1655,9 @@ DECLARE_EVENT_CLASS(nfs4_commit_event, ), TP_printk( - "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " + "error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " "offset=%lld count=%zu", - __entry->error, + -__entry->error, show_nfsv4_errors(__entry->error), MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, @@ -1656,7 +1705,7 @@ TRACE_EVENT(nfs4_layoutget, __field(u32, iomode) __field(u64, offset) __field(u64, count) - __field(int, error) + __field(unsigned long, error) __field(int, stateid_seq) __field(u32, stateid_hash) __field(int, layoutstateid_seq) @@ -1689,10 +1738,10 @@ TRACE_EVENT(nfs4_layoutget, ), TP_printk( - "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " + "error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " "iomode=%s offset=%llu count=%llu stateid=%d:0x%08x " "layoutstateid=%d:0x%08x", - __entry->error, + -__entry->error, show_nfsv4_errors(__entry->error), MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, @@ -1722,6 +1771,7 @@ TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_BLOCKED); TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_INVALID_OPEN); TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_RETRY); TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET); +TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_EXIT); #define show_pnfs_update_layout_reason(reason) \ __print_symbolic(reason, \ @@ -1737,7 +1787,8 @@ TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET); { PNFS_UPDATE_LAYOUT_BLOCKED, "layouts blocked" }, \ { PNFS_UPDATE_LAYOUT_INVALID_OPEN, "invalid open" }, \ { PNFS_UPDATE_LAYOUT_RETRY, "retrying" }, \ - { PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET, "sent layoutget" }) + { PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET, "sent layoutget" }, \ + { PNFS_UPDATE_LAYOUT_EXIT, "exit" }) TRACE_EVENT(pnfs_update_layout, TP_PROTO(struct inode *inode, @@ -1796,6 +1847,78 @@ TRACE_EVENT(pnfs_update_layout, ) ); +DECLARE_EVENT_CLASS(pnfs_layout_event, + TP_PROTO(struct inode *inode, + loff_t pos, + u64 count, + enum pnfs_iomode iomode, + struct pnfs_layout_hdr *lo, + struct pnfs_layout_segment *lseg + ), + TP_ARGS(inode, pos, count, iomode, lo, lseg), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u64, fileid) + __field(u32, fhandle) + __field(loff_t, pos) + __field(u64, count) + __field(enum pnfs_iomode, iomode) + __field(int, layoutstateid_seq) + __field(u32, layoutstateid_hash) + __field(long, lseg) + ), + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = NFS_FILEID(inode); + __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); + __entry->pos = pos; + __entry->count = count; + __entry->iomode = iomode; + if (lo != NULL) { + __entry->layoutstateid_seq = + be32_to_cpu(lo->plh_stateid.seqid); + __entry->layoutstateid_hash = + nfs_stateid_hash(&lo->plh_stateid); + } else { + __entry->layoutstateid_seq = 0; + __entry->layoutstateid_hash = 0; + } + __entry->lseg = 
(long)lseg; + ), + TP_printk( + "fileid=%02x:%02x:%llu fhandle=0x%08x " + "iomode=%s pos=%llu count=%llu " + "layoutstateid=%d:0x%08x lseg=0x%lx", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, + show_pnfs_iomode(__entry->iomode), + (unsigned long long)__entry->pos, + (unsigned long long)__entry->count, + __entry->layoutstateid_seq, __entry->layoutstateid_hash, + __entry->lseg + ) +); + +#define DEFINE_PNFS_LAYOUT_EVENT(name) \ + DEFINE_EVENT(pnfs_layout_event, name, \ + TP_PROTO(struct inode *inode, \ + loff_t pos, \ + u64 count, \ + enum pnfs_iomode iomode, \ + struct pnfs_layout_hdr *lo, \ + struct pnfs_layout_segment *lseg \ + ), \ + TP_ARGS(inode, pos, count, iomode, lo, lseg)) + +DEFINE_PNFS_LAYOUT_EVENT(pnfs_mds_fallback_pg_init_read); +DEFINE_PNFS_LAYOUT_EVENT(pnfs_mds_fallback_pg_init_write); +DEFINE_PNFS_LAYOUT_EVENT(pnfs_mds_fallback_pg_get_mirror_count); +DEFINE_PNFS_LAYOUT_EVENT(pnfs_mds_fallback_read_done); +DEFINE_PNFS_LAYOUT_EVENT(pnfs_mds_fallback_write_done); +DEFINE_PNFS_LAYOUT_EVENT(pnfs_mds_fallback_read_pagelist); +DEFINE_PNFS_LAYOUT_EVENT(pnfs_mds_fallback_write_pagelist); + #endif /* CONFIG_NFS_V4_1 */ #endif /* _TRACE_NFS4_H */ diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 602446158bfb..46a8d636d151 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -837,6 +837,7 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req, #define NFS4_dec_sequence_sz \ (compound_decode_hdr_maxsz + \ decode_sequence_maxsz) +#endif #define NFS4_enc_get_lease_time_sz (compound_encode_hdr_maxsz + \ encode_sequence_maxsz + \ encode_putrootfh_maxsz + \ @@ -845,6 +846,7 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req, decode_sequence_maxsz + \ decode_putrootfh_maxsz + \ decode_fsinfo_maxsz) +#if defined(CONFIG_NFS_V4_1) #define NFS4_enc_reclaim_complete_sz (compound_encode_hdr_maxsz + \ encode_sequence_maxsz + \ encode_reclaim_complete_maxsz) @@ -2957,6 +2959,8 @@ static void nfs4_xdr_enc_sequence(struct rpc_rqst *req, struct xdr_stream *xdr, encode_nops(&hdr); } +#endif + /* * a GET_LEASE_TIME request */ @@ -2977,6 +2981,8 @@ static void nfs4_xdr_enc_get_lease_time(struct rpc_rqst *req, encode_nops(&hdr); } +#ifdef CONFIG_NFS_V4_1 + /* * a RECLAIM_COMPLETE request */ @@ -3187,7 +3193,7 @@ static bool __decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected, return true; out_status: nfserr = be32_to_cpup(p); - trace_nfs4_xdr_status(opnum, nfserr); + trace_nfs4_xdr_status(xdr, opnum, nfserr); *nfs_retval = nfs4_stat_to_errno(nfserr); return true; out_bad_operation: @@ -3427,7 +3433,7 @@ static int decode_attr_lease_time(struct xdr_stream *xdr, uint32_t *bitmap, uint *res = be32_to_cpup(p); bitmap[0] &= ~FATTR4_WORD0_LEASE_TIME; } - dprintk("%s: file size=%u\n", __func__, (unsigned int)*res); + dprintk("%s: lease time=%u\n", __func__, (unsigned int)*res); return 0; } @@ -7122,6 +7128,8 @@ static int nfs4_xdr_dec_sequence(struct rpc_rqst *rqstp, return status; } +#endif + /* * Decode GET_LEASE_TIME response */ @@ -7143,6 +7151,8 @@ static int nfs4_xdr_dec_get_lease_time(struct rpc_rqst *rqstp, return status; } +#ifdef CONFIG_NFS_V4_1 + /* * Decode RECLAIM_COMPLETE response */ @@ -7551,7 +7561,7 @@ const struct rpc_procinfo nfs4_procedures[] = { PROC41(CREATE_SESSION, enc_create_session, dec_create_session), PROC41(DESTROY_SESSION, enc_destroy_session, dec_destroy_session), PROC41(SEQUENCE, enc_sequence, dec_sequence), - PROC41(GET_LEASE_TIME, enc_get_lease_time, 
dec_get_lease_time), + PROC(GET_LEASE_TIME, enc_get_lease_time, dec_get_lease_time), PROC41(RECLAIM_COMPLETE,enc_reclaim_complete, dec_reclaim_complete), PROC41(GETDEVICEINFO, enc_getdeviceinfo, dec_getdeviceinfo), PROC41(LAYOUTGET, enc_layoutget, dec_layoutget), diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h index a0d6910aa03a..976d4089e267 100644 --- a/fs/nfs/nfstrace.h +++ b/fs/nfs/nfstrace.h @@ -11,6 +11,16 @@ #include <linux/tracepoint.h> #include <linux/iversion.h> +TRACE_DEFINE_ENUM(DT_UNKNOWN); +TRACE_DEFINE_ENUM(DT_FIFO); +TRACE_DEFINE_ENUM(DT_CHR); +TRACE_DEFINE_ENUM(DT_DIR); +TRACE_DEFINE_ENUM(DT_BLK); +TRACE_DEFINE_ENUM(DT_REG); +TRACE_DEFINE_ENUM(DT_LNK); +TRACE_DEFINE_ENUM(DT_SOCK); +TRACE_DEFINE_ENUM(DT_WHT); + #define nfs_show_file_type(ftype) \ __print_symbolic(ftype, \ { DT_UNKNOWN, "UNKNOWN" }, \ @@ -23,25 +33,57 @@ { DT_SOCK, "SOCK" }, \ { DT_WHT, "WHT" }) +TRACE_DEFINE_ENUM(NFS_INO_INVALID_DATA); +TRACE_DEFINE_ENUM(NFS_INO_INVALID_ATIME); +TRACE_DEFINE_ENUM(NFS_INO_INVALID_ACCESS); +TRACE_DEFINE_ENUM(NFS_INO_INVALID_ACL); +TRACE_DEFINE_ENUM(NFS_INO_REVAL_PAGECACHE); +TRACE_DEFINE_ENUM(NFS_INO_REVAL_FORCED); +TRACE_DEFINE_ENUM(NFS_INO_INVALID_LABEL); +TRACE_DEFINE_ENUM(NFS_INO_INVALID_CHANGE); +TRACE_DEFINE_ENUM(NFS_INO_INVALID_CTIME); +TRACE_DEFINE_ENUM(NFS_INO_INVALID_MTIME); +TRACE_DEFINE_ENUM(NFS_INO_INVALID_SIZE); +TRACE_DEFINE_ENUM(NFS_INO_INVALID_OTHER); + #define nfs_show_cache_validity(v) \ __print_flags(v, "|", \ - { NFS_INO_INVALID_ATTR, "INVALID_ATTR" }, \ { NFS_INO_INVALID_DATA, "INVALID_DATA" }, \ { NFS_INO_INVALID_ATIME, "INVALID_ATIME" }, \ { NFS_INO_INVALID_ACCESS, "INVALID_ACCESS" }, \ { NFS_INO_INVALID_ACL, "INVALID_ACL" }, \ { NFS_INO_REVAL_PAGECACHE, "REVAL_PAGECACHE" }, \ { NFS_INO_REVAL_FORCED, "REVAL_FORCED" }, \ - { NFS_INO_INVALID_LABEL, "INVALID_LABEL" }) + { NFS_INO_INVALID_LABEL, "INVALID_LABEL" }, \ + { NFS_INO_INVALID_CHANGE, "INVALID_CHANGE" }, \ + { NFS_INO_INVALID_CTIME, "INVALID_CTIME" }, \ + { NFS_INO_INVALID_MTIME, "INVALID_MTIME" }, \ + { NFS_INO_INVALID_SIZE, "INVALID_SIZE" }, \ + { NFS_INO_INVALID_OTHER, "INVALID_OTHER" }) + +TRACE_DEFINE_ENUM(NFS_INO_ADVISE_RDPLUS); +TRACE_DEFINE_ENUM(NFS_INO_STALE); +TRACE_DEFINE_ENUM(NFS_INO_ACL_LRU_SET); +TRACE_DEFINE_ENUM(NFS_INO_INVALIDATING); +TRACE_DEFINE_ENUM(NFS_INO_FSCACHE); +TRACE_DEFINE_ENUM(NFS_INO_FSCACHE_LOCK); +TRACE_DEFINE_ENUM(NFS_INO_LAYOUTCOMMIT); +TRACE_DEFINE_ENUM(NFS_INO_LAYOUTCOMMITTING); +TRACE_DEFINE_ENUM(NFS_INO_LAYOUTSTATS); +TRACE_DEFINE_ENUM(NFS_INO_ODIRECT); #define nfs_show_nfsi_flags(v) \ __print_flags(v, "|", \ - { 1 << NFS_INO_ADVISE_RDPLUS, "ADVISE_RDPLUS" }, \ - { 1 << NFS_INO_STALE, "STALE" }, \ - { 1 << NFS_INO_INVALIDATING, "INVALIDATING" }, \ - { 1 << NFS_INO_FSCACHE, "FSCACHE" }, \ - { 1 << NFS_INO_LAYOUTCOMMIT, "NEED_LAYOUTCOMMIT" }, \ - { 1 << NFS_INO_LAYOUTCOMMITTING, "LAYOUTCOMMIT" }) + { BIT(NFS_INO_ADVISE_RDPLUS), "ADVISE_RDPLUS" }, \ + { BIT(NFS_INO_STALE), "STALE" }, \ + { BIT(NFS_INO_ACL_LRU_SET), "ACL_LRU_SET" }, \ + { BIT(NFS_INO_INVALIDATING), "INVALIDATING" }, \ + { BIT(NFS_INO_FSCACHE), "FSCACHE" }, \ + { BIT(NFS_INO_FSCACHE_LOCK), "FSCACHE_LOCK" }, \ + { BIT(NFS_INO_LAYOUTCOMMIT), "NEED_LAYOUTCOMMIT" }, \ + { BIT(NFS_INO_LAYOUTCOMMITTING), "LAYOUTCOMMIT" }, \ + { BIT(NFS_INO_LAYOUTSTATS), "LAYOUTSTATS" }, \ + { BIT(NFS_INO_ODIRECT), "ODIRECT" }) DECLARE_EVENT_CLASS(nfs_inode_event, TP_PROTO( @@ -83,7 +125,7 @@ DECLARE_EVENT_CLASS(nfs_inode_event_done, TP_ARGS(inode, error), TP_STRUCT__entry( - __field(int, error) + __field(unsigned 
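The nfs4xdr.c hunks above move the GET_LEASE_TIME encode/decode routines and their size definitions outside the CONFIG_NFS_V4_1 guards, and the procedure table entry drops from PROC41() to PROC(). Assuming the macro definitions earlier in nfs4xdr.c (not shown in these hunks), PROC41() entries degrade to stubs on builds without NFSv4.1 while PROC() entries are always compiled in, which is what lets the now-common nfs4_setup_state_renewal() issue GET_LEASE_TIME on an NFSv4.0 mount:

/* always compiled in, NFSv4.0 and v4.1 alike */
PROC(GET_LEASE_TIME,    enc_get_lease_time,     dec_get_lease_time),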
long, error) __field(dev_t, dev) __field(u32, fhandle) __field(unsigned char, type) @@ -96,7 +138,7 @@ DECLARE_EVENT_CLASS(nfs_inode_event_done, TP_fast_assign( const struct nfs_inode *nfsi = NFS_I(inode); - __entry->error = error; + __entry->error = error < 0 ? -error : 0; __entry->dev = inode->i_sb->s_dev; __entry->fileid = nfsi->fileid; __entry->fhandle = nfs_fhandle_hash(&nfsi->fh); @@ -108,10 +150,10 @@ DECLARE_EVENT_CLASS(nfs_inode_event_done, ), TP_printk( - "error=%d fileid=%02x:%02x:%llu fhandle=0x%08x " + "error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " "type=%u (%s) version=%llu size=%lld " - "cache_validity=%lu (%s) nfs_flags=%ld (%s)", - __entry->error, + "cache_validity=0x%lx (%s) nfs_flags=0x%lx (%s)", + -__entry->error, nfs_show_status(__entry->error), MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, __entry->fhandle, @@ -158,13 +200,41 @@ DEFINE_NFS_INODE_EVENT_DONE(nfs_fsync_exit); DEFINE_NFS_INODE_EVENT(nfs_access_enter); DEFINE_NFS_INODE_EVENT_DONE(nfs_access_exit); +TRACE_DEFINE_ENUM(LOOKUP_FOLLOW); +TRACE_DEFINE_ENUM(LOOKUP_DIRECTORY); +TRACE_DEFINE_ENUM(LOOKUP_AUTOMOUNT); +TRACE_DEFINE_ENUM(LOOKUP_PARENT); +TRACE_DEFINE_ENUM(LOOKUP_REVAL); +TRACE_DEFINE_ENUM(LOOKUP_RCU); +TRACE_DEFINE_ENUM(LOOKUP_NO_REVAL); +TRACE_DEFINE_ENUM(LOOKUP_NO_EVAL); +TRACE_DEFINE_ENUM(LOOKUP_OPEN); +TRACE_DEFINE_ENUM(LOOKUP_CREATE); +TRACE_DEFINE_ENUM(LOOKUP_EXCL); +TRACE_DEFINE_ENUM(LOOKUP_RENAME_TARGET); +TRACE_DEFINE_ENUM(LOOKUP_JUMPED); +TRACE_DEFINE_ENUM(LOOKUP_ROOT); +TRACE_DEFINE_ENUM(LOOKUP_EMPTY); +TRACE_DEFINE_ENUM(LOOKUP_DOWN); + #define show_lookup_flags(flags) \ - __print_flags((unsigned long)flags, "|", \ - { LOOKUP_AUTOMOUNT, "AUTOMOUNT" }, \ + __print_flags(flags, "|", \ + { LOOKUP_FOLLOW, "FOLLOW" }, \ { LOOKUP_DIRECTORY, "DIRECTORY" }, \ + { LOOKUP_AUTOMOUNT, "AUTOMOUNT" }, \ + { LOOKUP_PARENT, "PARENT" }, \ + { LOOKUP_REVAL, "REVAL" }, \ + { LOOKUP_RCU, "RCU" }, \ + { LOOKUP_NO_REVAL, "NO_REVAL" }, \ + { LOOKUP_NO_EVAL, "NO_EVAL" }, \ { LOOKUP_OPEN, "OPEN" }, \ { LOOKUP_CREATE, "CREATE" }, \ - { LOOKUP_EXCL, "EXCL" }) + { LOOKUP_EXCL, "EXCL" }, \ + { LOOKUP_RENAME_TARGET, "RENAME_TARGET" }, \ + { LOOKUP_JUMPED, "JUMPED" }, \ + { LOOKUP_ROOT, "ROOT" }, \ + { LOOKUP_EMPTY, "EMPTY" }, \ + { LOOKUP_DOWN, "DOWN" }) DECLARE_EVENT_CLASS(nfs_lookup_event, TP_PROTO( @@ -176,7 +246,7 @@ DECLARE_EVENT_CLASS(nfs_lookup_event, TP_ARGS(dir, dentry, flags), TP_STRUCT__entry( - __field(unsigned int, flags) + __field(unsigned long, flags) __field(dev_t, dev) __field(u64, dir) __string(name, dentry->d_name.name) @@ -190,7 +260,7 @@ DECLARE_EVENT_CLASS(nfs_lookup_event, ), TP_printk( - "flags=%u (%s) name=%02x:%02x:%llu/%s", + "flags=0x%lx (%s) name=%02x:%02x:%llu/%s", __entry->flags, show_lookup_flags(__entry->flags), MAJOR(__entry->dev), MINOR(__entry->dev), @@ -219,8 +289,8 @@ DECLARE_EVENT_CLASS(nfs_lookup_event_done, TP_ARGS(dir, dentry, flags, error), TP_STRUCT__entry( - __field(int, error) - __field(unsigned int, flags) + __field(unsigned long, error) + __field(unsigned long, flags) __field(dev_t, dev) __field(u64, dir) __string(name, dentry->d_name.name) @@ -229,14 +299,14 @@ DECLARE_EVENT_CLASS(nfs_lookup_event_done, TP_fast_assign( __entry->dev = dir->i_sb->s_dev; __entry->dir = NFS_FILEID(dir); - __entry->error = error; + __entry->error = error < 0 ? 
-error : 0; __entry->flags = flags; __assign_str(name, dentry->d_name.name); ), TP_printk( - "error=%d flags=%u (%s) name=%02x:%02x:%llu/%s", - __entry->error, + "error=%ld (%s) flags=0x%lx (%s) name=%02x:%02x:%llu/%s", + -__entry->error, nfs_show_status(__entry->error), __entry->flags, show_lookup_flags(__entry->flags), MAJOR(__entry->dev), MINOR(__entry->dev), @@ -260,15 +330,43 @@ DEFINE_NFS_LOOKUP_EVENT_DONE(nfs_lookup_exit); DEFINE_NFS_LOOKUP_EVENT(nfs_lookup_revalidate_enter); DEFINE_NFS_LOOKUP_EVENT_DONE(nfs_lookup_revalidate_exit); +TRACE_DEFINE_ENUM(O_WRONLY); +TRACE_DEFINE_ENUM(O_RDWR); +TRACE_DEFINE_ENUM(O_CREAT); +TRACE_DEFINE_ENUM(O_EXCL); +TRACE_DEFINE_ENUM(O_NOCTTY); +TRACE_DEFINE_ENUM(O_TRUNC); +TRACE_DEFINE_ENUM(O_APPEND); +TRACE_DEFINE_ENUM(O_NONBLOCK); +TRACE_DEFINE_ENUM(O_DSYNC); +TRACE_DEFINE_ENUM(O_DIRECT); +TRACE_DEFINE_ENUM(O_LARGEFILE); +TRACE_DEFINE_ENUM(O_DIRECTORY); +TRACE_DEFINE_ENUM(O_NOFOLLOW); +TRACE_DEFINE_ENUM(O_NOATIME); +TRACE_DEFINE_ENUM(O_CLOEXEC); + #define show_open_flags(flags) \ - __print_flags((unsigned long)flags, "|", \ + __print_flags(flags, "|", \ + { O_WRONLY, "O_WRONLY" }, \ + { O_RDWR, "O_RDWR" }, \ { O_CREAT, "O_CREAT" }, \ { O_EXCL, "O_EXCL" }, \ + { O_NOCTTY, "O_NOCTTY" }, \ { O_TRUNC, "O_TRUNC" }, \ { O_APPEND, "O_APPEND" }, \ + { O_NONBLOCK, "O_NONBLOCK" }, \ { O_DSYNC, "O_DSYNC" }, \ { O_DIRECT, "O_DIRECT" }, \ - { O_DIRECTORY, "O_DIRECTORY" }) + { O_LARGEFILE, "O_LARGEFILE" }, \ + { O_DIRECTORY, "O_DIRECTORY" }, \ + { O_NOFOLLOW, "O_NOFOLLOW" }, \ + { O_NOATIME, "O_NOATIME" }, \ + { O_CLOEXEC, "O_CLOEXEC" }) + +TRACE_DEFINE_ENUM(FMODE_READ); +TRACE_DEFINE_ENUM(FMODE_WRITE); +TRACE_DEFINE_ENUM(FMODE_EXEC); #define show_fmode_flags(mode) \ __print_flags(mode, "|", \ @@ -286,7 +384,7 @@ TRACE_EVENT(nfs_atomic_open_enter, TP_ARGS(dir, ctx, flags), TP_STRUCT__entry( - __field(unsigned int, flags) + __field(unsigned long, flags) __field(unsigned int, fmode) __field(dev_t, dev) __field(u64, dir) @@ -302,7 +400,7 @@ TRACE_EVENT(nfs_atomic_open_enter, ), TP_printk( - "flags=%u (%s) fmode=%s name=%02x:%02x:%llu/%s", + "flags=0x%lx (%s) fmode=%s name=%02x:%02x:%llu/%s", __entry->flags, show_open_flags(__entry->flags), show_fmode_flags(__entry->fmode), @@ -323,8 +421,8 @@ TRACE_EVENT(nfs_atomic_open_exit, TP_ARGS(dir, ctx, flags, error), TP_STRUCT__entry( - __field(int, error) - __field(unsigned int, flags) + __field(unsigned long, error) + __field(unsigned long, flags) __field(unsigned int, fmode) __field(dev_t, dev) __field(u64, dir) @@ -332,7 +430,7 @@ TRACE_EVENT(nfs_atomic_open_exit, ), TP_fast_assign( - __entry->error = error; + __entry->error = -error; __entry->dev = dir->i_sb->s_dev; __entry->dir = NFS_FILEID(dir); __entry->flags = flags; @@ -341,9 +439,9 @@ TRACE_EVENT(nfs_atomic_open_exit, ), TP_printk( - "error=%d flags=%u (%s) fmode=%s " + "error=%ld (%s) flags=0x%lx (%s) fmode=%s " "name=%02x:%02x:%llu/%s", - __entry->error, + -__entry->error, nfs_show_status(__entry->error), __entry->flags, show_open_flags(__entry->flags), show_fmode_flags(__entry->fmode), @@ -363,7 +461,7 @@ TRACE_EVENT(nfs_create_enter, TP_ARGS(dir, dentry, flags), TP_STRUCT__entry( - __field(unsigned int, flags) + __field(unsigned long, flags) __field(dev_t, dev) __field(u64, dir) __string(name, dentry->d_name.name) @@ -377,7 +475,7 @@ TRACE_EVENT(nfs_create_enter, ), TP_printk( - "flags=%u (%s) name=%02x:%02x:%llu/%s", + "flags=0x%lx (%s) name=%02x:%02x:%llu/%s", __entry->flags, show_open_flags(__entry->flags), MAJOR(__entry->dev), MINOR(__entry->dev), @@ 
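The long runs of TRACE_DEFINE_ENUM() above exist because __print_flags()/__print_symbolic() tables are exported verbatim in each event's format file, and symbolic constants in them are opaque to userspace tracers unless the kernel also records a name-to-value mapping; TRACE_DEFINE_ENUM() provides exactly that, letting tools such as trace-cmd substitute the numeric values. Hence every constant referenced by a show_* macro gains a matching definition. The minimal pairing, following the pattern above:

TRACE_DEFINE_ENUM(DT_DIR);              /* export the value to tracing */

#define show_file_type(t) \
        __print_symbolic(t, { DT_DIR, "DIR" })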
-397,15 +495,15 @@ TRACE_EVENT(nfs_create_exit, TP_ARGS(dir, dentry, flags, error), TP_STRUCT__entry( - __field(int, error) - __field(unsigned int, flags) + __field(unsigned long, error) + __field(unsigned long, flags) __field(dev_t, dev) __field(u64, dir) __string(name, dentry->d_name.name) ), TP_fast_assign( - __entry->error = error; + __entry->error = -error; __entry->dev = dir->i_sb->s_dev; __entry->dir = NFS_FILEID(dir); __entry->flags = flags; @@ -413,8 +511,8 @@ TRACE_EVENT(nfs_create_exit, ), TP_printk( - "error=%d flags=%u (%s) name=%02x:%02x:%llu/%s", - __entry->error, + "error=%ld (%s) flags=0x%lx (%s) name=%02x:%02x:%llu/%s", + -__entry->error, nfs_show_status(__entry->error), __entry->flags, show_open_flags(__entry->flags), MAJOR(__entry->dev), MINOR(__entry->dev), @@ -469,7 +567,7 @@ DECLARE_EVENT_CLASS(nfs_directory_event_done, TP_ARGS(dir, dentry, error), TP_STRUCT__entry( - __field(int, error) + __field(unsigned long, error) __field(dev_t, dev) __field(u64, dir) __string(name, dentry->d_name.name) @@ -478,13 +576,13 @@ DECLARE_EVENT_CLASS(nfs_directory_event_done, TP_fast_assign( __entry->dev = dir->i_sb->s_dev; __entry->dir = NFS_FILEID(dir); - __entry->error = error; + __entry->error = error < 0 ? -error : 0; __assign_str(name, dentry->d_name.name); ), TP_printk( - "error=%d name=%02x:%02x:%llu/%s", - __entry->error, + "error=%ld (%s) name=%02x:%02x:%llu/%s", + -__entry->error, nfs_show_status(__entry->error), MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->dir, __get_str(name) @@ -557,7 +655,7 @@ TRACE_EVENT(nfs_link_exit, TP_ARGS(inode, dir, dentry, error), TP_STRUCT__entry( - __field(int, error) + __field(unsigned long, error) __field(dev_t, dev) __field(u64, fileid) __field(u64, dir) @@ -568,13 +666,13 @@ TRACE_EVENT(nfs_link_exit, __entry->dev = inode->i_sb->s_dev; __entry->fileid = NFS_FILEID(inode); __entry->dir = NFS_FILEID(dir); - __entry->error = error; + __entry->error = error < 0 ? 
-error : 0; __assign_str(name, dentry->d_name.name); ), TP_printk( - "error=%d fileid=%02x:%02x:%llu name=%02x:%02x:%llu/%s", - __entry->error, + "error=%ld (%s) fileid=%02x:%02x:%llu name=%02x:%02x:%llu/%s", + -__entry->error, nfs_show_status(__entry->error), MAJOR(__entry->dev), MINOR(__entry->dev), __entry->fileid, MAJOR(__entry->dev), MINOR(__entry->dev), @@ -642,7 +740,7 @@ DECLARE_EVENT_CLASS(nfs_rename_event_done, TP_STRUCT__entry( __field(dev_t, dev) - __field(int, error) + __field(unsigned long, error) __field(u64, old_dir) __string(old_name, old_dentry->d_name.name) __field(u64, new_dir) @@ -651,17 +749,17 @@ DECLARE_EVENT_CLASS(nfs_rename_event_done, TP_fast_assign( __entry->dev = old_dir->i_sb->s_dev; + __entry->error = -error; __entry->old_dir = NFS_FILEID(old_dir); __entry->new_dir = NFS_FILEID(new_dir); - __entry->error = error; __assign_str(old_name, old_dentry->d_name.name); __assign_str(new_name, new_dentry->d_name.name); ), TP_printk( - "error=%d old_name=%02x:%02x:%llu/%s " + "error=%ld (%s) old_name=%02x:%02x:%llu/%s " "new_name=%02x:%02x:%llu/%s", - __entry->error, + -__entry->error, nfs_show_status(__entry->error), MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->old_dir, __get_str(old_name), @@ -697,7 +795,7 @@ TRACE_EVENT(nfs_sillyrename_unlink, TP_STRUCT__entry( __field(dev_t, dev) - __field(int, error) + __field(unsigned long, error) __field(u64, dir) __dynamic_array(char, name, data->args.name.len + 1) ), @@ -707,15 +805,15 @@ TRACE_EVENT(nfs_sillyrename_unlink, size_t len = data->args.name.len; __entry->dev = dir->i_sb->s_dev; __entry->dir = NFS_FILEID(dir); - __entry->error = error; + __entry->error = -error; memcpy(__get_str(name), data->args.name.name, len); __get_str(name)[len] = 0; ), TP_printk( - "error=%d name=%02x:%02x:%llu/%s", - __entry->error, + "error=%ld (%s) name=%02x:%02x:%llu/%s", + -__entry->error, nfs_show_status(__entry->error), MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->dir, __get_str(name) @@ -974,6 +1072,8 @@ TRACE_DEFINE_ENUM(NFSERR_PERM); TRACE_DEFINE_ENUM(NFSERR_NOENT); TRACE_DEFINE_ENUM(NFSERR_IO); TRACE_DEFINE_ENUM(NFSERR_NXIO); +TRACE_DEFINE_ENUM(ECHILD); +TRACE_DEFINE_ENUM(NFSERR_EAGAIN); TRACE_DEFINE_ENUM(NFSERR_ACCES); TRACE_DEFINE_ENUM(NFSERR_EXIST); TRACE_DEFINE_ENUM(NFSERR_XDEV); @@ -985,6 +1085,7 @@ TRACE_DEFINE_ENUM(NFSERR_FBIG); TRACE_DEFINE_ENUM(NFSERR_NOSPC); TRACE_DEFINE_ENUM(NFSERR_ROFS); TRACE_DEFINE_ENUM(NFSERR_MLINK); +TRACE_DEFINE_ENUM(NFSERR_OPNOTSUPP); TRACE_DEFINE_ENUM(NFSERR_NAMETOOLONG); TRACE_DEFINE_ENUM(NFSERR_NOTEMPTY); TRACE_DEFINE_ENUM(NFSERR_DQUOT); @@ -1007,6 +1108,8 @@ TRACE_DEFINE_ENUM(NFSERR_JUKEBOX); { NFSERR_NOENT, "NOENT" }, \ { NFSERR_IO, "IO" }, \ { NFSERR_NXIO, "NXIO" }, \ + { ECHILD, "CHILD" }, \ + { NFSERR_EAGAIN, "AGAIN" }, \ { NFSERR_ACCES, "ACCES" }, \ { NFSERR_EXIST, "EXIST" }, \ { NFSERR_XDEV, "XDEV" }, \ @@ -1018,6 +1121,7 @@ TRACE_DEFINE_ENUM(NFSERR_JUKEBOX); { NFSERR_NOSPC, "NOSPC" }, \ { NFSERR_ROFS, "ROFS" }, \ { NFSERR_MLINK, "MLINK" }, \ + { NFSERR_OPNOTSUPP, "OPNOTSUPP" }, \ { NFSERR_NAMETOOLONG, "NAMETOOLONG" }, \ { NFSERR_NOTEMPTY, "NOTEMPTY" }, \ { NFSERR_DQUOT, "DQUOT" }, \ @@ -1035,22 +1139,33 @@ TRACE_DEFINE_ENUM(NFSERR_JUKEBOX); TRACE_EVENT(nfs_xdr_status, TP_PROTO( + const struct xdr_stream *xdr, int error ), - TP_ARGS(error), + TP_ARGS(xdr, error), TP_STRUCT__entry( - __field(int, error) + __field(unsigned int, task_id) + __field(unsigned int, client_id) + __field(u32, xid) + __field(unsigned long, error) ), TP_fast_assign( 
+ const struct rpc_rqst *rqstp = xdr->rqst; + const struct rpc_task *task = rqstp->rq_task; + + __entry->task_id = task->tk_pid; + __entry->client_id = task->tk_client->cl_clid; + __entry->xid = be32_to_cpu(rqstp->rq_xid); __entry->error = error; ), TP_printk( - "error=%d (%s)", - __entry->error, nfs_show_status(__entry->error) + "task:%u@%d xid=0x%08x error=%ld (%s)", + __entry->task_id, __entry->client_id, __entry->xid, + -__entry->error, nfs_show_status(__entry->error) ) ); diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 6ef5278326b6..ed4e1b07447b 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -77,7 +77,7 @@ void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos) static inline struct nfs_page * nfs_page_alloc(void) { - struct nfs_page *p = kmem_cache_zalloc(nfs_page_cachep, GFP_NOIO); + struct nfs_page *p = kmem_cache_zalloc(nfs_page_cachep, GFP_KERNEL); if (p) INIT_LIST_HEAD(&p->wb_list); return p; @@ -775,8 +775,6 @@ int nfs_generic_pgio(struct nfs_pageio_descriptor *desc, if (pagecount <= ARRAY_SIZE(pg_array->page_array)) pg_array->pagevec = pg_array->page_array; else { - if (hdr->rw_mode == FMODE_WRITE) - gfp_flags = GFP_NOIO; pg_array->pagevec = kcalloc(pagecount, sizeof(struct page *), gfp_flags); if (!pg_array->pagevec) { pg_array->npages = 0; @@ -851,7 +849,7 @@ nfs_pageio_alloc_mirrors(struct nfs_pageio_descriptor *desc, desc->pg_mirrors_dynamic = NULL; if (mirror_count == 1) return desc->pg_mirrors_static; - ret = kmalloc_array(mirror_count, sizeof(*ret), GFP_NOFS); + ret = kmalloc_array(mirror_count, sizeof(*ret), GFP_KERNEL); if (ret != NULL) { for (i = 0; i < mirror_count; i++) nfs_pageio_mirror_init(&ret[i], desc->pg_bsize); diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 83722e936b4a..75bd5b552ba4 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1890,7 +1890,7 @@ lookup_again: spin_unlock(&ino->i_lock); lseg = ERR_PTR(wait_var_event_killable(&lo->plh_outstanding, !atomic_read(&lo->plh_outstanding))); - if (IS_ERR(lseg) || !list_empty(&lo->plh_segs)) + if (IS_ERR(lseg)) goto out_put_layout_hdr; pnfs_put_layout_hdr(lo); goto lookup_again; @@ -1915,6 +1915,7 @@ lookup_again: * stateid. */ if (test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags)) { + int status; /* * The first layoutget for the file. Need to serialize per @@ -1934,13 +1935,20 @@ lookup_again: } first = true; - if (nfs4_select_rw_stateid(ctx->state, + status = nfs4_select_rw_stateid(ctx->state, iomode == IOMODE_RW ? 
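The pagelist.c hunks above relax allocation constraints: nfs_page_alloc() and the mirror array move from GFP_NOIO/GFP_NOFS to GFP_KERNEL, and the special-cased GFP_NOIO for writes is dropped. The usual justification for such conversions is the scoped reclaim API, where a section that must not recurse into filesystem or I/O reclaim brackets itself once and plain GFP_KERNEL allocations inside inherit the restriction; whether these call sites rely on that is not visible in the hunk, so treat this as background rather than code from the diff:

#include <linux/sched/mm.h>
#include <linux/slab.h>

static void *alloc_in_fs_context(size_t size)
{
        unsigned int flags = memalloc_nofs_save();
        /* GFP_KERNEL behaves as GFP_NOFS while the scope is active */
        void *p = kmalloc(size, GFP_KERNEL);

        memalloc_nofs_restore(flags);
        return p;
}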
FMODE_WRITE : FMODE_READ, - NULL, &stateid, NULL) != 0) { + NULL, &stateid, NULL); + if (status != 0) { trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, PNFS_UPDATE_LAYOUT_INVALID_OPEN); - goto out_unlock; + if (status != -EAGAIN) + goto out_unlock; + spin_unlock(&ino->i_lock); + nfs4_schedule_stateid_recovery(server, ctx->state); + pnfs_clear_first_layoutget(lo); + pnfs_put_layout_hdr(lo); + goto lookup_again; } } else { nfs4_stateid_copy(&stateid, &lo->plh_stateid); @@ -2029,6 +2037,8 @@ lookup_again: out_put_layout_hdr: if (first) pnfs_clear_first_layoutget(lo); + trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, + PNFS_UPDATE_LAYOUT_EXIT); pnfs_put_layout_hdr(lo); out: dprintk("%s: inode %s/%llu pNFS layout segment %s for " @@ -2468,7 +2478,7 @@ pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, wb_size, IOMODE_RW, false, - GFP_NOFS); + GFP_KERNEL); if (IS_ERR(pgio->pg_lseg)) { pgio->pg_error = PTR_ERR(pgio->pg_lseg); pgio->pg_lseg = NULL; diff --git a/fs/nfs/super.c b/fs/nfs/super.c index f88ddac2dcdf..628631e2e34f 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -77,6 +77,8 @@ #define NFS_DEFAULT_VERSION 2 #endif +#define NFS_MAX_CONNECTIONS 16 + enum { /* Mount options that take no arguments */ Opt_soft, Opt_softerr, Opt_hard, @@ -108,6 +110,7 @@ enum { Opt_nfsvers, Opt_sec, Opt_proto, Opt_mountproto, Opt_mounthost, Opt_addr, Opt_mountaddr, Opt_clientaddr, + Opt_nconnect, Opt_lookupcache, Opt_fscache_uniq, Opt_local_lock, @@ -181,6 +184,8 @@ static const match_table_t nfs_mount_option_tokens = { { Opt_mounthost, "mounthost=%s" }, { Opt_mountaddr, "mountaddr=%s" }, + { Opt_nconnect, "nconnect=%s" }, + { Opt_lookupcache, "lookupcache=%s" }, { Opt_fscache_uniq, "fsc=%s" }, { Opt_local_lock, "local_lock=%s" }, @@ -452,10 +457,8 @@ int nfs_statfs(struct dentry *dentry, struct kstatfs *buf) struct dentry *pd_dentry; pd_dentry = dget_parent(dentry); - if (pd_dentry != NULL) { - nfs_zap_caches(d_inode(pd_dentry)); - dput(pd_dentry); - } + nfs_zap_caches(d_inode(pd_dentry)); + dput(pd_dentry); } nfs_free_fattr(res.fattr); if (error < 0) @@ -582,7 +585,7 @@ static void nfs_show_mountd_options(struct seq_file *m, struct nfs_server *nfss, } default: if (showdefaults) - seq_printf(m, ",mountaddr=unspecified"); + seq_puts(m, ",mountaddr=unspecified"); } if (nfss->mountd_version || showdefaults) @@ -673,6 +676,8 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss, seq_printf(m, ",proto=%s", rpc_peeraddr2str(nfss->client, RPC_DISPLAY_NETID)); rcu_read_unlock(); + if (clp->cl_nconnect > 0) + seq_printf(m, ",nconnect=%u", clp->cl_nconnect); if (version == 4) { if (nfss->port != NFS_PORT) seq_printf(m, ",port=%u", nfss->port); @@ -690,29 +695,29 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss, nfs_show_nfsv4_options(m, nfss, showdefaults); if (nfss->options & NFS_OPTION_FSCACHE) - seq_printf(m, ",fsc"); + seq_puts(m, ",fsc"); if (nfss->options & NFS_OPTION_MIGRATION) - seq_printf(m, ",migration"); + seq_puts(m, ",migration"); if (nfss->flags & NFS_MOUNT_LOOKUP_CACHE_NONEG) { if (nfss->flags & NFS_MOUNT_LOOKUP_CACHE_NONE) - seq_printf(m, ",lookupcache=none"); + seq_puts(m, ",lookupcache=none"); else - seq_printf(m, ",lookupcache=pos"); + seq_puts(m, ",lookupcache=pos"); } local_flock = nfss->flags & NFS_MOUNT_LOCAL_FLOCK; local_fcntl = nfss->flags & NFS_MOUNT_LOCAL_FCNTL; if (!local_flock && !local_fcntl) - seq_printf(m, ",local_lock=none"); + seq_puts(m, ",local_lock=none"); else if (local_flock && 
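The pnfs_update_layout() change above pairs with the earlier nfs4state.c hunk in which nfs4_select_rw_stateid() starts returning -EAGAIN when no valid open stateid can be copied. Instead of failing the layoutget, the lookup path now drops the inode lock, kicks off stateid recovery, releases the layout header and retries from the top. Skeleton of the new control flow (error paths trimmed; locals as in the function above):

lookup_again:
        /* ... take i_lock, find and reference the layout header ... */
        status = nfs4_select_rw_stateid(ctx->state, fmode,
                                        NULL, &stateid, NULL);
        if (status == -EAGAIN) {
                spin_unlock(&ino->i_lock);
                nfs4_schedule_stateid_recovery(server, ctx->state);
                pnfs_clear_first_layoutget(lo);
                pnfs_put_layout_hdr(lo);
                goto lookup_again;      /* retry with recovered state */
        }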
local_fcntl) - seq_printf(m, ",local_lock=all"); + seq_puts(m, ",local_lock=all"); else if (local_flock) - seq_printf(m, ",local_lock=flock"); + seq_puts(m, ",local_lock=flock"); else - seq_printf(m, ",local_lock=posix"); + seq_puts(m, ",local_lock=posix"); } /* @@ -735,11 +740,21 @@ int nfs_show_options(struct seq_file *m, struct dentry *root) EXPORT_SYMBOL_GPL(nfs_show_options); #if IS_ENABLED(CONFIG_NFS_V4) +static void show_lease(struct seq_file *m, struct nfs_server *server) +{ + struct nfs_client *clp = server->nfs_client; + unsigned long expire; + + seq_printf(m, ",lease_time=%ld", clp->cl_lease_time / HZ); + expire = clp->cl_last_renewal + clp->cl_lease_time; + seq_printf(m, ",lease_expired=%ld", + time_after(expire, jiffies) ? 0 : (jiffies - expire) / HZ); +} #ifdef CONFIG_NFS_V4_1 static void show_sessions(struct seq_file *m, struct nfs_server *server) { if (nfs4_has_session(server->nfs_client)) - seq_printf(m, ",sessions"); + seq_puts(m, ",sessions"); } #else static void show_sessions(struct seq_file *m, struct nfs_server *server) {} @@ -816,7 +831,7 @@ int nfs_show_stats(struct seq_file *m, struct dentry *root) /* * Display all mount option settings */ - seq_printf(m, "\n\topts:\t"); + seq_puts(m, "\n\topts:\t"); seq_puts(m, sb_rdonly(root->d_sb) ? "ro" : "rw"); seq_puts(m, root->d_sb->s_flags & SB_SYNCHRONOUS ? ",sync" : ""); seq_puts(m, root->d_sb->s_flags & SB_NOATIME ? ",noatime" : ""); @@ -827,7 +842,7 @@ int nfs_show_stats(struct seq_file *m, struct dentry *root) show_implementation_id(m, nfss); - seq_printf(m, "\n\tcaps:\t"); + seq_puts(m, "\n\tcaps:\t"); seq_printf(m, "caps=0x%x", nfss->caps); seq_printf(m, ",wtmult=%u", nfss->wtmult); seq_printf(m, ",dtsize=%u", nfss->dtsize); @@ -836,13 +851,14 @@ int nfs_show_stats(struct seq_file *m, struct dentry *root) #if IS_ENABLED(CONFIG_NFS_V4) if (nfss->nfs_client->rpc_ops->version == 4) { - seq_printf(m, "\n\tnfsv4:\t"); + seq_puts(m, "\n\tnfsv4:\t"); seq_printf(m, "bm0=0x%x", nfss->attr_bitmask[0]); seq_printf(m, ",bm1=0x%x", nfss->attr_bitmask[1]); seq_printf(m, ",bm2=0x%x", nfss->attr_bitmask[2]); seq_printf(m, ",acl=0x%x", nfss->acl_bitmask); show_sessions(m, nfss); show_pnfs(m, nfss); + show_lease(m, nfss); } #endif @@ -874,20 +890,20 @@ int nfs_show_stats(struct seq_file *m, struct dentry *root) preempt_enable(); } - seq_printf(m, "\n\tevents:\t"); + seq_puts(m, "\n\tevents:\t"); for (i = 0; i < __NFSIOS_COUNTSMAX; i++) seq_printf(m, "%lu ", totals.events[i]); - seq_printf(m, "\n\tbytes:\t"); + seq_puts(m, "\n\tbytes:\t"); for (i = 0; i < __NFSIOS_BYTESMAX; i++) seq_printf(m, "%Lu ", totals.bytes[i]); #ifdef CONFIG_NFS_FSCACHE if (nfss->options & NFS_OPTION_FSCACHE) { - seq_printf(m, "\n\tfsc:\t"); + seq_puts(m, "\n\tfsc:\t"); for (i = 0; i < __NFSIOS_FSCACHEMAX; i++) seq_printf(m, "%Lu ", totals.fscache[i]); } #endif - seq_printf(m, "\n"); + seq_putc(m, '\n'); rpc_clnt_show_stats(m, nfss->client); @@ -1549,6 +1565,11 @@ static int nfs_parse_mount_options(char *raw, if (mnt->mount_server.addrlen == 0) goto out_invalid_address; break; + case Opt_nconnect: + if (nfs_get_option_ul_bound(args, &option, 1, NFS_MAX_CONNECTIONS)) + goto out_invalid_value; + mnt->nfs_server.nconnect = option; + break; case Opt_lookupcache: string = match_strdup(args); if (string == NULL) diff --git a/fs/nfs/sysfs.c b/fs/nfs/sysfs.c new file mode 100644 index 000000000000..4f3390b20239 --- /dev/null +++ b/fs/nfs/sysfs.c @@ -0,0 +1,187 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2019 Hammerspace Inc + */ + +#include 
<linux/module.h> +#include <linux/kobject.h> +#include <linux/sysfs.h> +#include <linux/fs.h> +#include <linux/slab.h> +#include <linux/netdevice.h> +#include <linux/string.h> +#include <linux/nfs_fs.h> +#include <linux/rcupdate.h> + +#include "nfs4_fs.h" +#include "netns.h" +#include "sysfs.h" + +struct kobject *nfs_client_kobj; +static struct kset *nfs_client_kset; + +static void nfs_netns_object_release(struct kobject *kobj) +{ + kfree(kobj); +} + +static const struct kobj_ns_type_operations *nfs_netns_object_child_ns_type( + struct kobject *kobj) +{ + return &net_ns_type_operations; +} + +static struct kobj_type nfs_netns_object_type = { + .release = nfs_netns_object_release, + .sysfs_ops = &kobj_sysfs_ops, + .child_ns_type = nfs_netns_object_child_ns_type, +}; + +static struct kobject *nfs_netns_object_alloc(const char *name, + struct kset *kset, struct kobject *parent) +{ + struct kobject *kobj; + + kobj = kzalloc(sizeof(*kobj), GFP_KERNEL); + if (kobj) { + kobj->kset = kset; + if (kobject_init_and_add(kobj, &nfs_netns_object_type, + parent, "%s", name) == 0) + return kobj; + kobject_put(kobj); + } + return NULL; +} + +int nfs_sysfs_init(void) +{ + nfs_client_kset = kset_create_and_add("nfs", NULL, fs_kobj); + if (!nfs_client_kset) + return -ENOMEM; + nfs_client_kobj = nfs_netns_object_alloc("net", nfs_client_kset, NULL); + if (!nfs_client_kobj) { + kset_unregister(nfs_client_kset); + nfs_client_kset = NULL; + return -ENOMEM; + } + return 0; +} + +void nfs_sysfs_exit(void) +{ + kobject_put(nfs_client_kobj); + kset_unregister(nfs_client_kset); +} + +static ssize_t nfs_netns_identifier_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct nfs_netns_client *c = container_of(kobj, + struct nfs_netns_client, + kobject); + return scnprintf(buf, PAGE_SIZE, "%s\n", c->identifier); +} + +/* Strip trailing '\n' */ +static size_t nfs_string_strip(const char *c, size_t len) +{ + while (len > 0 && c[len-1] == '\n') + --len; + return len; +} + +static ssize_t nfs_netns_identifier_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct nfs_netns_client *c = container_of(kobj, + struct nfs_netns_client, + kobject); + const char *old; + char *p; + size_t len; + + len = nfs_string_strip(buf, min_t(size_t, count, CONTAINER_ID_MAXLEN)); + if (!len) + return 0; + p = kmemdup_nul(buf, len, GFP_KERNEL); + if (!p) + return -ENOMEM; + old = xchg(&c->identifier, p); + if (old) { + synchronize_rcu(); + kfree(old); + } + return count; +} + +static void nfs_netns_client_release(struct kobject *kobj) +{ + struct nfs_netns_client *c = container_of(kobj, + struct nfs_netns_client, + kobject); + + if (c->identifier) + kfree(c->identifier); + kfree(c); +} + +static const void *nfs_netns_client_namespace(struct kobject *kobj) +{ + return container_of(kobj, struct nfs_netns_client, kobject)->net; +} + +static struct kobj_attribute nfs_netns_client_id = __ATTR(identifier, + 0644, nfs_netns_identifier_show, nfs_netns_identifier_store); + +static struct attribute *nfs_netns_client_attrs[] = { + &nfs_netns_client_id.attr, + NULL, +}; + +static struct kobj_type nfs_netns_client_type = { + .release = nfs_netns_client_release, + .default_attrs = nfs_netns_client_attrs, + .sysfs_ops = &kobj_sysfs_ops, + .namespace = nfs_netns_client_namespace, +}; + +static struct nfs_netns_client *nfs_netns_client_alloc(struct kobject *parent, + struct net *net) +{ + struct nfs_netns_client *p; + + p = kzalloc(sizeof(*p), GFP_KERNEL); + if (p) { + p->net = net; + 
p->kobject.kset = nfs_client_kset; + if (kobject_init_and_add(&p->kobject, &nfs_netns_client_type, + parent, "nfs_client") == 0) + return p; + kobject_put(&p->kobject); + } + return NULL; +} + +void nfs_netns_sysfs_setup(struct nfs_net *netns, struct net *net) +{ + struct nfs_netns_client *clp; + + clp = nfs_netns_client_alloc(nfs_client_kobj, net); + if (clp) { + netns->nfs_client = clp; + kobject_uevent(&clp->kobject, KOBJ_ADD); + } +} + +void nfs_netns_sysfs_destroy(struct nfs_net *netns) +{ + struct nfs_netns_client *clp = netns->nfs_client; + + if (clp) { + kobject_uevent(&clp->kobject, KOBJ_REMOVE); + kobject_del(&clp->kobject); + kobject_put(&clp->kobject); + netns->nfs_client = NULL; + } +} diff --git a/fs/nfs/sysfs.h b/fs/nfs/sysfs.h new file mode 100644 index 000000000000..f1b27411dcc0 --- /dev/null +++ b/fs/nfs/sysfs.h @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2019 Hammerspace Inc + */ + +#ifndef __NFS_SYSFS_H +#define __NFS_SYSFS_H + +#define CONTAINER_ID_MAXLEN (64) + +struct nfs_netns_client { + struct kobject kobject; + struct net *net; + const char *identifier; +}; + +extern struct kobject *nfs_client_kobj; + +extern int nfs_sysfs_init(void); +extern void nfs_sysfs_exit(void); + +void nfs_netns_sysfs_setup(struct nfs_net *netns, struct net *net); +void nfs_netns_sysfs_destroy(struct nfs_net *netns); + +#endif diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 059a7c38bc4f..92d9cadc6102 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -103,7 +103,7 @@ EXPORT_SYMBOL_GPL(nfs_commit_free); static struct nfs_pgio_header *nfs_writehdr_alloc(void) { - struct nfs_pgio_header *p = mempool_alloc(nfs_wdata_mempool, GFP_NOIO); + struct nfs_pgio_header *p = mempool_alloc(nfs_wdata_mempool, GFP_KERNEL); memset(p, 0, sizeof(*p)); p->rw_mode = FMODE_WRITE; @@ -721,12 +721,11 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) struct inode *inode = mapping->host; struct nfs_pageio_descriptor pgio; struct nfs_io_completion *ioc; - unsigned int pflags = memalloc_nofs_save(); int err; nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES); - ioc = nfs_io_completion_alloc(GFP_NOFS); + ioc = nfs_io_completion_alloc(GFP_KERNEL); if (ioc) nfs_io_completion_init(ioc, nfs_io_completion_commit, inode); @@ -737,8 +736,6 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) nfs_pageio_complete(&pgio); nfs_io_completion_put(ioc); - memalloc_nofs_restore(pflags); - if (err < 0) goto out_err; err = pgio.pg_error; diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 0a9a49ded546..13c548733860 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -8,6 +8,7 @@ #include <linux/slab.h> #include <linux/namei.h> #include <linux/ctype.h> +#include <linux/fs_context.h> #include <linux/sunrpc/svcsock.h> #include <linux/lockd/lockd.h> @@ -1337,7 +1338,7 @@ void nfsd_client_rmdir(struct dentry *dentry) inode_unlock(dir); } -static int nfsd_fill_super(struct super_block * sb, void * data, int silent) +static int nfsd_fill_super(struct super_block *sb, struct fs_context *fc) { struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, nfsd_net_id); @@ -1372,7 +1373,7 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent) #endif /* last one */ {""} }; - get_net(sb->s_fs_info); + ret = simple_fill_super(sb, 0x6e667364, nfsd_files); if (ret) return ret; @@ -1381,14 +1382,31 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent) return PTR_ERR(dentry); nn->nfsd_client_dir = dentry; 
return 0; +} +static int nfsd_fs_get_tree(struct fs_context *fc) +{ + fc->s_fs_info = get_net(fc->net_ns); + return vfs_get_super(fc, vfs_get_keyed_super, nfsd_fill_super); } -static struct dentry *nfsd_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static void nfsd_fs_free_fc(struct fs_context *fc) { - struct net *net = current->nsproxy->net_ns; - return mount_ns(fs_type, flags, data, net, net->user_ns, nfsd_fill_super); + if (fc->s_fs_info) + put_net(fc->s_fs_info); +} + +static const struct fs_context_operations nfsd_fs_context_ops = { + .free = nfsd_fs_free_fc, + .get_tree = nfsd_fs_get_tree, +}; + +static int nfsd_init_fs_context(struct fs_context *fc) +{ + put_user_ns(fc->user_ns); + fc->user_ns = get_user_ns(fc->net_ns->user_ns); + fc->ops = &nfsd_fs_context_ops; + return 0; } static void nfsd_umount(struct super_block *sb) @@ -1402,7 +1420,7 @@ static void nfsd_umount(struct super_block *sb) static struct file_system_type nfsd_fs_type = { .owner = THIS_MODULE, .name = "nfsd", - .mount = nfsd_mount, + .init_fs_context = nfsd_init_fs_context, .kill_sb = nfsd_umount, }; MODULE_ALIAS_FS("nfsd"); diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index cce8de32779f..0b815178126e 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -45,8 +45,6 @@ struct kmem_cache *inotify_inode_mark_cachep __read_mostly; #include <linux/sysctl.h> -static int zero; - struct ctl_table inotify_table[] = { { .procname = "max_user_instances", @@ -54,7 +52,7 @@ struct ctl_table inotify_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, }, { .procname = "max_user_watches", @@ -62,7 +60,7 @@ struct ctl_table inotify_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, }, { .procname = "max_queued_events", @@ -70,7 +68,7 @@ struct ctl_table inotify_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero + .extra1 = SYSCTL_ZERO }, { } }; diff --git a/fs/nsfs.c b/fs/nsfs.c index e3bf08c5af41..a0431642c6b5 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/mount.h> +#include <linux/pseudo_fs.h> #include <linux/file.h> #include <linux/fs.h> #include <linux/proc_ns.h> @@ -258,15 +259,20 @@ static const struct super_operations nsfs_ops = { .evict_inode = nsfs_evict, .show_path = nsfs_show_path, }; -static struct dentry *nsfs_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) + +static int nsfs_init_fs_context(struct fs_context *fc) { - return mount_pseudo(fs_type, "nsfs:", &nsfs_ops, - &ns_dentry_operations, NSFS_MAGIC); + struct pseudo_fs_context *ctx = init_pseudo(fc, NSFS_MAGIC); + if (!ctx) + return -ENOMEM; + ctx->ops = &nsfs_ops; + ctx->dops = &ns_dentry_operations; + return 0; } + static struct file_system_type nsfs = { .name = "nsfs", - .mount = nsfs_mount, + .init_fs_context = nsfs_init_fs_context, .kill_sb = kill_anon_super, }; diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c index e6cb7689fec4..40c8c2e32fa3 100644 --- a/fs/openpromfs/inode.c +++ b/fs/openpromfs/inode.c @@ -9,6 +9,7 @@ #include <linux/types.h> #include <linux/string.h> #include <linux/fs.h> +#include <linux/fs_context.h> #include <linux/init.h> #include <linux/slab.h> #include <linux/seq_file.h> @@ -375,7 +376,7 @@ static const struct 
super_operations openprom_sops = { .remount_fs = openprom_remount, }; -static int openprom_fill_super(struct super_block *s, void *data, int silent) +static int openprom_fill_super(struct super_block *s, struct fs_context *fc) { struct inode *root_inode; struct op_inode_info *oi; @@ -409,16 +410,25 @@ out_no_root: return ret; } -static struct dentry *openprom_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int openpromfs_get_tree(struct fs_context *fc) { - return mount_single(fs_type, flags, data, openprom_fill_super); + return get_tree_single(fc, openprom_fill_super); +} + +static const struct fs_context_operations openpromfs_context_ops = { + .get_tree = openpromfs_get_tree, +}; + +static int openpromfs_init_fs_context(struct fs_context *fc) +{ + fc->ops = &openpromfs_context_ops; + return 0; } static struct file_system_type openprom_fs_type = { .owner = THIS_MODULE, .name = "openpromfs", - .mount = openprom_mount, + .init_fs_context = openpromfs_init_fs_context, .kill_sb = kill_anon_super, }; MODULE_ALIAS_FS("openpromfs"); diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c index 679a3c8e4fb3..960f9a3c012d 100644 --- a/fs/orangefs/file.c +++ b/fs/orangefs/file.c @@ -52,7 +52,7 @@ ssize_t wait_for_direct_io(enum ORANGEFS_io_type type, struct inode *inode, struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); struct orangefs_khandle *handle = &orangefs_inode->refn.khandle; struct orangefs_kernel_op_s *new_op = NULL; - int buffer_index = -1; + int buffer_index; ssize_t ret; size_t copy_amount; @@ -134,7 +134,6 @@ populate_shared_memory: */ if (ret == -EAGAIN && op_state_purged(new_op)) { orangefs_bufmap_put(buffer_index); - buffer_index = -1; if (type == ORANGEFS_IO_WRITE) iov_iter_revert(iter, total_size); gossip_debug(GOSSIP_FILE_DEBUG, @@ -262,7 +261,6 @@ out: "%s(%pU): PUT buffer_index %d\n", __func__, handle, buffer_index); } - buffer_index = -1; } op_release(new_op); return ret; diff --git a/fs/pipe.c b/fs/pipe.c index 41065901106b..8a2ab2f974bd 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -14,6 +14,7 @@ #include <linux/fs.h> #include <linux/log2.h> #include <linux/mount.h> +#include <linux/pseudo_fs.h> #include <linux/magic.h> #include <linux/pipe_fs_i.h> #include <linux/uio.h> @@ -1182,16 +1183,20 @@ static const struct super_operations pipefs_ops = { * any operations on the root directory. However, we need a non-trivial * d_name - pipe: will go nicely and kill the special-casing in procfs. */ -static struct dentry *pipefs_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) + +static int pipefs_init_fs_context(struct fs_context *fc) { - return mount_pseudo(fs_type, "pipe:", &pipefs_ops, - &pipefs_dentry_operations, PIPEFS_MAGIC); + struct pseudo_fs_context *ctx = init_pseudo(fc, PIPEFS_MAGIC); + if (!ctx) + return -ENOMEM; + ctx->ops = &pipefs_ops; + ctx->dops = &pipefs_dentry_operations; + return 0; } static struct file_system_type pipe_fs_type = { .name = "pipefs", - .mount = pipefs_mount, + .init_fs_context = pipefs_init_fs_context, .kill_sb = kill_anon_super, }; diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig index 4c3dcb718961..cb5629bd5fff 100644 --- a/fs/proc/Kconfig +++ b/fs/proc/Kconfig @@ -58,7 +58,8 @@ config PROC_VMCORE_DEVICE_DUMP snapshot. If you say Y here, the collected device dumps will be added - as ELF notes to /proc/vmcore. + as ELF notes to /proc/vmcore. You can still disable device + dump using the kernel command line option 'novmcoredd'. 
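The nsfs and pipefs hunks above apply the same mechanical conversion: the legacy .mount callback with mount_pseudo() is replaced by an init_fs_context hook that calls the new init_pseudo() helper. A minimal sketch of that pattern follows; the "examplefs" name, magic number, and super_operations are illustrative placeholders, not code from this commit.

#include <linux/fs.h>
#include <linux/fs_context.h>
#include <linux/pseudo_fs.h>

#define EXAMPLEFS_MAGIC	0x45584d50	/* hypothetical magic number */

static const struct super_operations examplefs_sops = {
	.statfs = simple_statfs,
};

static int examplefs_init_fs_context(struct fs_context *fc)
{
	/* init_pseudo() allocates a pseudo_fs_context and wires up
	 * fc->ops so that the later get_tree() step builds an internal
	 * superblock carrying the given magic number. */
	struct pseudo_fs_context *ctx = init_pseudo(fc, EXAMPLEFS_MAGIC);

	if (!ctx)
		return -ENOMEM;
	ctx->ops = &examplefs_sops;	/* optional: override the defaults */
	return 0;
}

static struct file_system_type examplefs_fs_type = {
	.name = "examplefs",
	.init_fs_context = examplefs_init_fs_context,
	.kill_sb = kill_anon_super,
};

Everything else (register_filesystem() at init time, kern_mount() for internal use) stays as before; only the superblock-creation step moves onto the fs_context path.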
config PROC_SYSCTL bool "Sysctl support (/proc/sys)" if EXPERT @@ -72,7 +73,7 @@ config PROC_SYSCTL interface is through /proc/sys. If you say Y here a tree of modifiable sysctl entries will be generated beneath the /proc/sys directory. They are explained in the files - in <file:Documentation/sysctl/>. Note that enabling this + in <file:Documentation/admin-guide/sysctl/>. Note that enabling this option will enlarge the kernel by at least 8 KB. As it is generally a good thing, you should say Y here unless diff --git a/fs/proc/base.c b/fs/proc/base.c index 77eb628ecc7f..ebea9501afb8 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -209,12 +209,53 @@ static int proc_root_link(struct dentry *dentry, struct path *path) return result; } +/* + * If the user used setproctitle(), we just get the string from + * user space at arg_start, and limit it to a maximum of one page. + */ +static ssize_t get_mm_proctitle(struct mm_struct *mm, char __user *buf, + size_t count, unsigned long pos, + unsigned long arg_start) +{ + char *page; + int ret, got; + + if (pos >= PAGE_SIZE) + return 0; + + page = (char *)__get_free_page(GFP_KERNEL); + if (!page) + return -ENOMEM; + + ret = 0; + got = access_remote_vm(mm, arg_start, page, PAGE_SIZE, FOLL_ANON); + if (got > 0) { + int len = strnlen(page, got); + + /* Include the NUL character if it was found */ + if (len < got) + len++; + + if (len > pos) { + len -= pos; + if (len > count) + len = count; + len -= copy_to_user(buf, page+pos, len); + if (!len) + len = -EFAULT; + ret = len; + } + } + free_page((unsigned long)page); + return ret; +} + static ssize_t get_mm_cmdline(struct mm_struct *mm, char __user *buf, size_t count, loff_t *ppos) { unsigned long arg_start, arg_end, env_start, env_end; unsigned long pos, len; - char *page; + char *page, c; /* Check if process spawned far enough to have cmdline. */ if (!mm->env_end) @@ -231,28 +272,42 @@ static ssize_t get_mm_cmdline(struct mm_struct *mm, char __user *buf, return 0; /* - * We have traditionally allowed the user to re-write - * the argument strings and overflow the end result - * into the environment section. But only do that if - * the environment area is contiguous to the arguments. + * We allow setproctitle() to overwrite the argument + * strings, and overflow past the original end. But + * only when it overflows into the environment area. */ - if (env_start != arg_end || env_start >= env_end) + if (env_start != arg_end || env_end < env_start) env_start = env_end = arg_end; - - /* .. and limit it to a maximum of one page of slop */ - if (env_end >= arg_end + PAGE_SIZE) - env_end = arg_end + PAGE_SIZE - 1; + len = env_end - arg_start; /* We're not going to care if "*ppos" has high bits set */ - pos = arg_start + *ppos; - - /* .. but we do check the result is in the proper range */ - if (pos < arg_start || pos >= env_end) + pos = *ppos; + if (pos >= len) return 0; + if (count > len - pos) + count = len - pos; + if (!count) + return 0; + + /* + * Magical special case: if the argv[] end byte is not + * zero, the user has overwritten it with setproctitle(3). + * + * Possible future enhancement: do this only once when + * pos is 0, and set a flag in the 'struct file'. + */ + if (access_remote_vm(mm, arg_end-1, &c, 1, FOLL_ANON) == 1 && c) + return get_mm_proctitle(mm, buf, count, pos, arg_start); - /* .. and we never go past env_end */ - if (env_end - pos < count) - count = env_end - pos; + /* + * For the non-setproctitle() case we limit things strictly + * to the [arg_start, arg_end[ range. 
+ */ + pos += arg_start; + if (pos < arg_start || pos >= arg_end) + return 0; + if (count > arg_end - pos) + count = arg_end - pos; page = (char *)__get_free_page(GFP_KERNEL); if (!page) @@ -262,48 +317,11 @@ static ssize_t get_mm_cmdline(struct mm_struct *mm, char __user *buf, while (count) { int got; size_t size = min_t(size_t, PAGE_SIZE, count); - long offset; - /* - * Are we already starting past the official end? - * We always include the last byte that is *supposed* - * to be NUL - */ - offset = (pos >= arg_end) ? pos - arg_end + 1 : 0; - - got = access_remote_vm(mm, pos - offset, page, size + offset, FOLL_ANON); - if (got <= offset) + got = access_remote_vm(mm, pos, page, size, FOLL_ANON); + if (got <= 0) break; - got -= offset; - - /* Don't walk past a NUL character once you hit arg_end */ - if (pos + got >= arg_end) { - int n = 0; - - /* - * If we started before 'arg_end' but ended up - * at or after it, we start the NUL character - * check at arg_end-1 (where we expect the normal - * EOF to be). - * - * NOTE! This is smaller than 'got', because - * pos + got >= arg_end - */ - if (pos < arg_end) - n = arg_end - pos - 1; - - /* Cut off at first NUL after 'n' */ - got = n + strnlen(page+n, offset+got-n); - if (got < offset) - break; - got -= offset; - - /* Include the NUL if it existed */ - if (got < size) - got++; - } - - got -= copy_to_user(buf, page+offset, got); + got -= copy_to_user(buf, page, got); if (unlikely(!got)) { if (!len) len = -EFAULT; diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 5f8d215b3fd0..dbe43a50caf2 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -200,7 +200,8 @@ static loff_t proc_reg_llseek(struct file *file, loff_t offset, int whence) struct proc_dir_entry *pde = PDE(file_inode(file)); loff_t rv = -EINVAL; if (use_pde(pde)) { - loff_t (*llseek)(struct file *, loff_t, int); + typeof_member(struct file_operations, llseek) llseek; + llseek = pde->proc_fops->llseek; if (!llseek) llseek = default_llseek; @@ -212,10 +213,11 @@ static loff_t proc_reg_llseek(struct file *file, loff_t offset, int whence) static ssize_t proc_reg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { - ssize_t (*read)(struct file *, char __user *, size_t, loff_t *); struct proc_dir_entry *pde = PDE(file_inode(file)); ssize_t rv = -EIO; if (use_pde(pde)) { + typeof_member(struct file_operations, read) read; + read = pde->proc_fops->read; if (read) rv = read(file, buf, count, ppos); @@ -226,10 +228,11 @@ static ssize_t proc_reg_read(struct file *file, char __user *buf, size_t count, static ssize_t proc_reg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { - ssize_t (*write)(struct file *, const char __user *, size_t, loff_t *); struct proc_dir_entry *pde = PDE(file_inode(file)); ssize_t rv = -EIO; if (use_pde(pde)) { + typeof_member(struct file_operations, write) write; + write = pde->proc_fops->write; if (write) rv = write(file, buf, count, ppos); @@ -242,8 +245,9 @@ static __poll_t proc_reg_poll(struct file *file, struct poll_table_struct *pts) { struct proc_dir_entry *pde = PDE(file_inode(file)); __poll_t rv = DEFAULT_POLLMASK; - __poll_t (*poll)(struct file *, struct poll_table_struct *); if (use_pde(pde)) { + typeof_member(struct file_operations, poll) poll; + poll = pde->proc_fops->poll; if (poll) rv = poll(file, pts); @@ -256,8 +260,9 @@ static long proc_reg_unlocked_ioctl(struct file *file, unsigned int cmd, unsigne { struct proc_dir_entry *pde = PDE(file_inode(file)); long rv = -ENOTTY; - long (*ioctl)(struct file *, 
unsigned int, unsigned long); if (use_pde(pde)) { + typeof_member(struct file_operations, unlocked_ioctl) ioctl; + ioctl = pde->proc_fops->unlocked_ioctl; if (ioctl) rv = ioctl(file, cmd, arg); @@ -271,8 +276,9 @@ static long proc_reg_compat_ioctl(struct file *file, unsigned int cmd, unsigned { struct proc_dir_entry *pde = PDE(file_inode(file)); long rv = -ENOTTY; - long (*compat_ioctl)(struct file *, unsigned int, unsigned long); if (use_pde(pde)) { + typeof_member(struct file_operations, compat_ioctl) compat_ioctl; + compat_ioctl = pde->proc_fops->compat_ioctl; if (compat_ioctl) rv = compat_ioctl(file, cmd, arg); @@ -286,8 +292,9 @@ static int proc_reg_mmap(struct file *file, struct vm_area_struct *vma) { struct proc_dir_entry *pde = PDE(file_inode(file)); int rv = -EIO; - int (*mmap)(struct file *, struct vm_area_struct *); if (use_pde(pde)) { + typeof_member(struct file_operations, mmap) mmap; + mmap = pde->proc_fops->mmap; if (mmap) rv = mmap(file, vma); @@ -305,7 +312,7 @@ proc_reg_get_unmapped_area(struct file *file, unsigned long orig_addr, unsigned long rv = -EIO; if (use_pde(pde)) { - typeof(proc_reg_get_unmapped_area) *get_area; + typeof_member(struct file_operations, get_unmapped_area) get_area; get_area = pde->proc_fops->get_unmapped_area; #ifdef CONFIG_MMU @@ -326,8 +333,8 @@ static int proc_reg_open(struct inode *inode, struct file *file) { struct proc_dir_entry *pde = PDE(inode); int rv = 0; - int (*open)(struct inode *, struct file *); - int (*release)(struct inode *, struct file *); + typeof_member(struct file_operations, open) open; + typeof_member(struct file_operations, release) release; struct pde_opener *pdeo; /* diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index c74570736b24..d80989b6c344 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -22,6 +22,10 @@ static const struct inode_operations proc_sys_inode_operations; static const struct file_operations proc_sys_dir_file_operations; static const struct inode_operations proc_sys_dir_operations; +/* shared constants to be used in various sysctls */ +const int sysctl_vals[] = { 0, 1, INT_MAX }; +EXPORT_SYMBOL(sysctl_vals); + /* Support for permanently empty directories */ struct ctl_table sysctl_mount_point[] = { @@ -499,6 +503,10 @@ static struct inode *proc_sys_make_inode(struct super_block *sb, if (root->set_ownership) root->set_ownership(head, table, &inode->i_uid, &inode->i_gid); + else { + inode->i_uid = GLOBAL_ROOT_UID; + inode->i_gid = GLOBAL_ROOT_GID; + } return inode; } diff --git a/fs/proc/root.c b/fs/proc/root.c index 522199e9525e..33f72d1b92cc 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -157,8 +157,6 @@ static int proc_get_tree(struct fs_context *fc) { struct proc_fs_context *ctx = fc->fs_private; - put_user_ns(fc->user_ns); - fc->user_ns = get_user_ns(ctx->pid_ns->user_ns); fc->s_fs_info = ctx->pid_ns; return vfs_get_super(fc, vfs_get_keyed_super, proc_fill_super); } @@ -167,8 +165,7 @@ static void proc_fs_context_free(struct fs_context *fc) { struct proc_fs_context *ctx = fc->fs_private; - if (ctx->pid_ns) - put_pid_ns(ctx->pid_ns); + put_pid_ns(ctx->pid_ns); kfree(ctx); } @@ -188,6 +185,8 @@ static int proc_init_fs_context(struct fs_context *fc) return -ENOMEM; ctx->pid_ns = get_pid_ns(task_active_pid_ns(current)); + put_user_ns(fc->user_ns); + fc->user_ns = get_user_ns(ctx->pid_ns->user_ns); fc->fs_private = ctx; fc->ops = &proc_fs_context_ops; return 0; diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index dedca3da428a..731642e0f5a0 100644 --- 
a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -832,7 +832,8 @@ static int show_smap(struct seq_file *m, void *v) __show_smap(m, &mss, false); - seq_printf(m, "THPeligible: %d\n", transparent_hugepage_enabled(vma)); + seq_printf(m, "THPeligible: %d\n", + transparent_hugepage_enabled(vma)); if (arch_pkeys_enabled()) seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma)); @@ -1322,7 +1323,7 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm, if (pm->show_pfn) frame = pte_pfn(pte); flags |= PM_PRESENT; - page = _vm_normal_page(vma, addr, pte, true); + page = vm_normal_page(vma, addr, pte); if (pte_soft_dirty(pte)) flags |= PM_SOFT_DIRTY; } else if (is_swap_pte(pte)) { diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 57957c91c6df..7bcc92add72c 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -21,6 +21,7 @@ #include <linux/init.h> #include <linux/crash_dump.h> #include <linux/list.h> +#include <linux/moduleparam.h> #include <linux/mutex.h> #include <linux/vmalloc.h> #include <linux/pagemap.h> @@ -54,6 +55,9 @@ static struct proc_dir_entry *proc_vmcore; /* Device Dump list and mutex to synchronize access to list */ static LIST_HEAD(vmcoredd_list); static DEFINE_MUTEX(vmcoredd_mutex); + +static bool vmcoredd_disabled; +core_param(novmcoredd, vmcoredd_disabled, bool, 0); #endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */ /* Device Dump Size */ @@ -1452,6 +1456,11 @@ int vmcore_add_device_dump(struct vmcoredd_data *data) size_t data_size; int ret; + if (vmcoredd_disabled) { + pr_err_once("Device dump is disabled\n"); + return -EINVAL; + } + if (!data || !strlen(data->dump_name) || !data->vmcoredd_callback || !data->size) return -EINVAL; diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index 11201b2d06b9..733c6b4193dc 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -266,12 +266,8 @@ static struct file_system_type ramfs_fs_type = { .fs_flags = FS_USERNS_MOUNT, }; -int __init init_ramfs_fs(void) +static int __init init_ramfs_fs(void) { - static unsigned long once; - - if (test_and_set_bit(0, &once)) - return 0; return register_filesystem(&ramfs_fs_type); } fs_initcall(init_ramfs_fs); diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 36346dc4cec0..4517a1394c6f 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -94,7 +94,7 @@ static int journal_join(struct reiserfs_transaction_handle *th, struct super_block *sb); static void release_journal_dev(struct super_block *super, struct reiserfs_journal *journal); -static int dirty_one_transaction(struct super_block *s, +static void dirty_one_transaction(struct super_block *s, struct reiserfs_journal_list *jl); static void flush_async_commits(struct work_struct *work); static void queue_log_writer(struct super_block *s); @@ -1682,12 +1682,11 @@ next: } /* used by flush_commit_list */ -static int dirty_one_transaction(struct super_block *s, +static void dirty_one_transaction(struct super_block *s, struct reiserfs_journal_list *jl) { struct reiserfs_journal_cnode *cn; struct reiserfs_journal_list *pjl; - int ret = 0; jl->j_state |= LIST_DIRTY; cn = jl->j_realblock; @@ -1716,7 +1715,6 @@ static int dirty_one_transaction(struct super_block *s, } cn = cn->next; } - return ret; } static int kupdate_transactions(struct super_block *s, diff --git a/fs/select.c b/fs/select.c index a4d8f6e8b63c..53a0c149f528 100644 --- a/fs/select.c +++ b/fs/select.c @@ -294,12 +294,14 @@ enum poll_time_type { PT_OLD_TIMESPEC = 3, }; -static int poll_select_copy_remaining(struct timespec64 *end_time, - void __user *p, - enum 
poll_time_type pt_type, int ret) +static int poll_select_finish(struct timespec64 *end_time, + void __user *p, + enum poll_time_type pt_type, int ret) { struct timespec64 rts; + restore_saved_sigmask_unless(ret == -ERESTARTNOHAND); + if (!p) return ret; @@ -714,9 +716,7 @@ static int kern_select(int n, fd_set __user *inp, fd_set __user *outp, } ret = core_sys_select(n, inp, outp, exp, to); - ret = poll_select_copy_remaining(&end_time, tvp, PT_TIMEVAL, ret); - - return ret; + return poll_select_finish(&end_time, tvp, PT_TIMEVAL, ret); } SYSCALL_DEFINE5(select, int, n, fd_set __user *, inp, fd_set __user *, outp, @@ -730,7 +730,6 @@ static long do_pselect(int n, fd_set __user *inp, fd_set __user *outp, const sigset_t __user *sigmask, size_t sigsetsize, enum poll_time_type type) { - sigset_t ksigmask, sigsaved; struct timespec64 ts, end_time, *to = NULL; int ret; @@ -753,15 +752,12 @@ static long do_pselect(int n, fd_set __user *inp, fd_set __user *outp, return -EINVAL; } - ret = set_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize); + ret = set_user_sigmask(sigmask, sigsetsize); if (ret) return ret; ret = core_sys_select(n, inp, outp, exp, to); - restore_user_sigmask(sigmask, &sigsaved, ret == -ERESTARTNOHAND); - ret = poll_select_copy_remaining(&end_time, tsp, type, ret); - - return ret; + return poll_select_finish(&end_time, tsp, type, ret); } /* @@ -926,7 +922,7 @@ static int do_poll(struct poll_list *list, struct poll_wqueues *wait, if (!count) { count = wait->error; if (signal_pending(current)) - count = -EINTR; + count = -ERESTARTNOHAND; } if (count || timed_out) break; @@ -965,7 +961,7 @@ static int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds, struct timespec64 *end_time) { struct poll_wqueues table; - int err = -EFAULT, fdcount, len, size; + int err = -EFAULT, fdcount, len; /* Allocate small arguments on the stack to save memory and be faster - use long to make sure the buffer is aligned properly on 64 bit archs to avoid unaligned access */ @@ -993,8 +989,8 @@ static int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds, break; len = min(todo, POLLFD_PER_PAGE); - size = sizeof(struct poll_list) + sizeof(struct pollfd) * len; - walk = walk->next = kmalloc(size, GFP_KERNEL); + walk = walk->next = kmalloc(struct_size(walk, entries, len), + GFP_KERNEL); if (!walk) { err = -ENOMEM; goto out_fds; @@ -1041,7 +1037,7 @@ static long do_restart_poll(struct restart_block *restart_block) ret = do_sys_poll(ufds, nfds, to); - if (ret == -EINTR) { + if (ret == -ERESTARTNOHAND) { restart_block->fn = do_restart_poll; ret = -ERESTART_RESTARTBLOCK; } @@ -1062,7 +1058,7 @@ SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds, ret = do_sys_poll(ufds, nfds, to); - if (ret == -EINTR) { + if (ret == -ERESTARTNOHAND) { struct restart_block *restart_block; restart_block = &current->restart_block; @@ -1086,7 +1082,6 @@ SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds, struct __kernel_timespec __user *, tsp, const sigset_t __user *, sigmask, size_t, sigsetsize) { - sigset_t ksigmask, sigsaved; struct timespec64 ts, end_time, *to = NULL; int ret; @@ -1099,20 +1094,12 @@ SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds, return -EINVAL; } - ret = set_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize); + ret = set_user_sigmask(sigmask, sigsetsize); if (ret) return ret; ret = do_sys_poll(ufds, nfds, to); - - restore_user_sigmask(sigmask, &sigsaved, ret == -EINTR); - /* We can restart this syscall, usually */ - if (ret == 
-EINTR) - ret = -ERESTARTNOHAND; - - ret = poll_select_copy_remaining(&end_time, tsp, PT_TIMESPEC, ret); - - return ret; + return poll_select_finish(&end_time, tsp, PT_TIMESPEC, ret); } #if defined(CONFIG_COMPAT_32BIT_TIME) && !defined(CONFIG_64BIT) @@ -1121,7 +1108,6 @@ SYSCALL_DEFINE5(ppoll_time32, struct pollfd __user *, ufds, unsigned int, nfds, struct old_timespec32 __user *, tsp, const sigset_t __user *, sigmask, size_t, sigsetsize) { - sigset_t ksigmask, sigsaved; struct timespec64 ts, end_time, *to = NULL; int ret; @@ -1134,20 +1120,12 @@ SYSCALL_DEFINE5(ppoll_time32, struct pollfd __user *, ufds, unsigned int, nfds, return -EINVAL; } - ret = set_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize); + ret = set_user_sigmask(sigmask, sigsetsize); if (ret) return ret; ret = do_sys_poll(ufds, nfds, to); - - restore_user_sigmask(sigmask, &sigsaved, ret == -EINTR); - /* We can restart this syscall, usually */ - if (ret == -EINTR) - ret = -ERESTARTNOHAND; - - ret = poll_select_copy_remaining(&end_time, tsp, PT_OLD_TIMESPEC, ret); - - return ret; + return poll_select_finish(&end_time, tsp, PT_OLD_TIMESPEC, ret); } #endif @@ -1284,9 +1262,7 @@ static int do_compat_select(int n, compat_ulong_t __user *inp, } ret = compat_core_sys_select(n, inp, outp, exp, to); - ret = poll_select_copy_remaining(&end_time, tvp, PT_OLD_TIMEVAL, ret); - - return ret; + return poll_select_finish(&end_time, tvp, PT_OLD_TIMEVAL, ret); } COMPAT_SYSCALL_DEFINE5(select, int, n, compat_ulong_t __user *, inp, @@ -1319,7 +1295,6 @@ static long do_compat_pselect(int n, compat_ulong_t __user *inp, void __user *tsp, compat_sigset_t __user *sigmask, compat_size_t sigsetsize, enum poll_time_type type) { - sigset_t ksigmask, sigsaved; struct timespec64 ts, end_time, *to = NULL; int ret; @@ -1342,15 +1317,12 @@ static long do_compat_pselect(int n, compat_ulong_t __user *inp, return -EINVAL; } - ret = set_compat_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize); + ret = set_compat_user_sigmask(sigmask, sigsetsize); if (ret) return ret; ret = compat_core_sys_select(n, inp, outp, exp, to); - restore_user_sigmask(sigmask, &sigsaved, ret == -ERESTARTNOHAND); - ret = poll_select_copy_remaining(&end_time, tsp, type, ret); - - return ret; + return poll_select_finish(&end_time, tsp, type, ret); } COMPAT_SYSCALL_DEFINE6(pselect6_time64, int, n, compat_ulong_t __user *, inp, @@ -1402,7 +1374,6 @@ COMPAT_SYSCALL_DEFINE5(ppoll_time32, struct pollfd __user *, ufds, unsigned int, nfds, struct old_timespec32 __user *, tsp, const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize) { - sigset_t ksigmask, sigsaved; struct timespec64 ts, end_time, *to = NULL; int ret; @@ -1415,20 +1386,12 @@ COMPAT_SYSCALL_DEFINE5(ppoll_time32, struct pollfd __user *, ufds, return -EINVAL; } - ret = set_compat_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize); + ret = set_compat_user_sigmask(sigmask, sigsetsize); if (ret) return ret; ret = do_sys_poll(ufds, nfds, to); - - restore_user_sigmask(sigmask, &sigsaved, ret == -EINTR); - /* We can restart this syscall, usually */ - if (ret == -EINTR) - ret = -ERESTARTNOHAND; - - ret = poll_select_copy_remaining(&end_time, tsp, PT_OLD_TIMESPEC, ret); - - return ret; + return poll_select_finish(&end_time, tsp, PT_OLD_TIMESPEC, ret); } #endif @@ -1437,7 +1400,6 @@ COMPAT_SYSCALL_DEFINE5(ppoll_time64, struct pollfd __user *, ufds, unsigned int, nfds, struct __kernel_timespec __user *, tsp, const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize) { - sigset_t ksigmask, sigsaved; struct 
timespec64 ts, end_time, *to = NULL; int ret; @@ -1450,20 +1412,12 @@ COMPAT_SYSCALL_DEFINE5(ppoll_time64, struct pollfd __user *, ufds, return -EINVAL; } - ret = set_compat_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize); + ret = set_compat_user_sigmask(sigmask, sigsetsize); if (ret) return ret; ret = do_sys_poll(ufds, nfds, to); - - restore_user_sigmask(sigmask, &sigsaved, ret == -EINTR); - /* We can restart this syscall, usually */ - if (ret == -EINTR) - ret = -ERESTARTNOHAND; - - ret = poll_select_copy_remaining(&end_time, tsp, PT_TIMESPEC, ret); - - return ret; + return poll_select_finish(&end_time, tsp, PT_TIMESPEC, ret); } #endif diff --git a/fs/splice.c b/fs/splice.c index 14cb602d9a2f..98412721f056 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1356,7 +1356,7 @@ SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, uiov, struct iovec iovstack[UIO_FASTIOV]; struct iovec *iov = iovstack; struct iov_iter iter; - long error; + ssize_t error; struct fd f; int type; @@ -1367,7 +1367,7 @@ SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, uiov, error = import_iovec(type, uiov, nr_segs, ARRAY_SIZE(iovstack), &iov, &iter); - if (!error) { + if (error >= 0) { error = do_vmsplice(f.file, &iter, flags); kfree(iov); } @@ -1382,7 +1382,7 @@ COMPAT_SYSCALL_DEFINE4(vmsplice, int, fd, const struct compat_iovec __user *, io struct iovec iovstack[UIO_FASTIOV]; struct iovec *iov = iovstack; struct iov_iter iter; - long error; + ssize_t error; struct fd f; int type; @@ -1393,7 +1393,7 @@ COMPAT_SYSCALL_DEFINE4(vmsplice, int, fd, const struct compat_iovec __user *, io error = compat_import_iovec(type, iov32, nr_segs, ARRAY_SIZE(iovstack), &iov, &iter); - if (!error) { + if (error >= 0) { error = do_vmsplice(f.file, &iter, flags); kfree(iov); } diff --git a/fs/super.c b/fs/super.c index 2739f57515f8..113c58f19425 100644 --- a/fs/super.c +++ b/fs/super.c @@ -476,6 +476,17 @@ void generic_shutdown_super(struct super_block *sb) EXPORT_SYMBOL(generic_shutdown_super); +bool mount_capable(struct fs_context *fc) +{ + struct user_namespace *user_ns = fc->global ? &init_user_ns + : fc->user_ns; + + if (!(fc->fs_type->fs_flags & FS_USERNS_MOUNT)) + return capable(CAP_SYS_ADMIN); + else + return ns_capable(user_ns, CAP_SYS_ADMIN); +} + /** * sget_fc - Find or create a superblock * @fc: Filesystem context. @@ -503,20 +514,6 @@ struct super_block *sget_fc(struct fs_context *fc, struct user_namespace *user_ns = fc->global ? &init_user_ns : fc->user_ns; int err; - if (!(fc->sb_flags & SB_KERNMOUNT) && - fc->purpose != FS_CONTEXT_FOR_SUBMOUNT) { - /* Don't allow mounting unless the caller has CAP_SYS_ADMIN - * over the namespace. 
- */ - if (!(fc->fs_type->fs_flags & FS_USERNS_MOUNT)) { - if (!capable(CAP_SYS_ADMIN)) - return ERR_PTR(-EPERM); - } else { - if (!ns_capable(fc->user_ns, CAP_SYS_ADMIN)) - return ERR_PTR(-EPERM); - } - } - retry: spin_lock(&sb_lock); if (test) { @@ -543,6 +540,7 @@ retry: } fc->s_fs_info = NULL; s->s_type = fc->fs_type; + s->s_iflags |= fc->s_iflags; strlcpy(s->s_id, s->s_type->name, sizeof(s->s_id)); list_add_tail(&s->s_list, &super_blocks); hlist_add_head(&s->s_instances, &s->s_type->fs_supers); @@ -565,28 +563,31 @@ share_extant_sb: EXPORT_SYMBOL(sget_fc); /** - * sget_userns - find or create a superblock - * @type: filesystem type superblock should belong to - * @test: comparison callback - * @set: setup callback - * @flags: mount flags - * @user_ns: User namespace for the super_block - * @data: argument to each of them + * sget - find or create a superblock + * @type: filesystem type superblock should belong to + * @test: comparison callback + * @set: setup callback + * @flags: mount flags + * @data: argument to each of them */ -struct super_block *sget_userns(struct file_system_type *type, +struct super_block *sget(struct file_system_type *type, int (*test)(struct super_block *,void *), int (*set)(struct super_block *,void *), - int flags, struct user_namespace *user_ns, + int flags, void *data) { + struct user_namespace *user_ns = current_user_ns(); struct super_block *s = NULL; struct super_block *old; int err; - if (!(flags & (SB_KERNMOUNT|SB_SUBMOUNT)) && - !(type->fs_flags & FS_USERNS_MOUNT) && - !capable(CAP_SYS_ADMIN)) - return ERR_PTR(-EPERM); + /* We don't yet pass the user namespace of the parent + * mount through to here so always use &init_user_ns + * until that changes. + */ + if (flags & SB_SUBMOUNT) + user_ns = &init_user_ns; + retry: spin_lock(&sb_lock); if (test) { @@ -627,39 +628,6 @@ retry: register_shrinker_prepared(&s->s_shrink); return s; } - -EXPORT_SYMBOL(sget_userns); - -/** - * sget - find or create a superblock - * @type: filesystem type superblock should belong to - * @test: comparison callback - * @set: setup callback - * @flags: mount flags - * @data: argument to each of them - */ -struct super_block *sget(struct file_system_type *type, - int (*test)(struct super_block *,void *), - int (*set)(struct super_block *,void *), - int flags, - void *data) -{ - struct user_namespace *user_ns = current_user_ns(); - - /* We don't yet pass the user namespace of the parent - * mount through to here so always use &init_user_ns - * until that changes. - */ - if (flags & SB_SUBMOUNT) - user_ns = &init_user_ns; - - /* Ensure the requestor has permissions over the target filesystem */ - if (!(flags & (SB_KERNMOUNT|SB_SUBMOUNT)) && !ns_capable(user_ns, CAP_SYS_ADMIN)) - return ERR_PTR(-EPERM); - - return sget_userns(type, test, set, flags, user_ns, data); -} - EXPORT_SYMBOL(sget); void drop_super(struct super_block *sb) @@ -1147,50 +1115,6 @@ void kill_litter_super(struct super_block *sb) } EXPORT_SYMBOL(kill_litter_super); -static int ns_test_super(struct super_block *sb, void *data) -{ - return sb->s_fs_info == data; -} - -static int ns_set_super(struct super_block *sb, void *data) -{ - sb->s_fs_info = data; - return set_anon_super(sb, NULL); -} - -struct dentry *mount_ns(struct file_system_type *fs_type, - int flags, void *data, void *ns, struct user_namespace *user_ns, - int (*fill_super)(struct super_block *, void *, int)) -{ - struct super_block *sb; - - /* Don't allow mounting unless the caller has CAP_SYS_ADMIN - * over the namespace. 
- */ - if (!(flags & SB_KERNMOUNT) && !ns_capable(user_ns, CAP_SYS_ADMIN)) - return ERR_PTR(-EPERM); - - sb = sget_userns(fs_type, ns_test_super, ns_set_super, flags, - user_ns, ns); - if (IS_ERR(sb)) - return ERR_CAST(sb); - - if (!sb->s_root) { - int err; - err = fill_super(sb, data, flags & SB_SILENT ? 1 : 0); - if (err) { - deactivate_locked_super(sb); - return ERR_PTR(err); - } - - sb->s_flags |= SB_ACTIVE; - } - - return dget(sb->s_root); -} - -EXPORT_SYMBOL(mount_ns); - int set_anon_super_fc(struct super_block *sb, struct fs_context *fc) { return set_anon_super(sb, NULL); @@ -1274,6 +1198,22 @@ int vfs_get_super(struct fs_context *fc, } EXPORT_SYMBOL(vfs_get_super); +int get_tree_nodev(struct fs_context *fc, + int (*fill_super)(struct super_block *sb, + struct fs_context *fc)) +{ + return vfs_get_super(fc, vfs_get_independent_super, fill_super); +} +EXPORT_SYMBOL(get_tree_nodev); + +int get_tree_single(struct fs_context *fc, + int (*fill_super)(struct super_block *sb, + struct fs_context *fc)) +{ + return vfs_get_super(fc, vfs_get_single_super, fill_super); +} +EXPORT_SYMBOL(get_tree_single); + #ifdef CONFIG_BLOCK static int set_bdev_super(struct super_block *s, void *data) { diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index 1b56686ab178..db81cfbab9d6 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -72,8 +72,7 @@ static int sysfs_init_fs_context(struct fs_context *fc) fc->fs_private = kfc; fc->ops = &sysfs_fs_context_ops; if (netns) { - if (fc->user_ns) - put_user_ns(fc->user_ns); + put_user_ns(fc->user_ns); fc->user_ns = get_user_ns(netns->user_ns); } fc->global = true; diff --git a/fs/ubifs/Kconfig b/fs/ubifs/Kconfig index 06c35c64162b..69932bcfa920 100644 --- a/fs/ubifs/Kconfig +++ b/fs/ubifs/Kconfig @@ -6,8 +6,10 @@ config UBIFS_FS select CRYPTO if UBIFS_FS_ADVANCED_COMPR select CRYPTO if UBIFS_FS_LZO select CRYPTO if UBIFS_FS_ZLIB + select CRYPTO if UBIFS_FS_ZSTD select CRYPTO_LZO if UBIFS_FS_LZO select CRYPTO_DEFLATE if UBIFS_FS_ZLIB + select CRYPTO_ZSTD if UBIFS_FS_ZSTD select CRYPTO_HASH_INFO select UBIFS_FS_XATTR if FS_ENCRYPTION depends on MTD_UBI @@ -38,6 +40,14 @@ config UBIFS_FS_ZLIB help Zlib compresses better than LZO but it is slower. Say 'Y' if unsure. +config UBIFS_FS_ZSTD + bool "ZSTD compression support" if UBIFS_FS_ADVANCED_COMPR + depends on UBIFS_FS + default y + help + ZSTD compresses is a big win in speed over Zlib and + in compression ratio over LZO. Say 'Y' if unsure. + config UBIFS_ATIME_SUPPORT bool "Access time support" default n @@ -77,8 +87,9 @@ config UBIFS_FS_SECURITY config UBIFS_FS_AUTHENTICATION bool "UBIFS authentication support" - depends on KEYS + select KEYS select CRYPTO_HMAC + select SYSTEM_DATA_VERIFICATION help Enable authentication support for UBIFS. This feature offers protection against offline changes for both data and metadata of the filesystem. 
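The fs/super.c hunk above also adds the get_tree_nodev() and get_tree_single() wrappers so converted filesystems no longer open-code vfs_get_super() calls, and it moves the CAP_SYS_ADMIN check out of sget() into the shared mount_capable() helper. A rough sketch of a consumer, again with hypothetical "examplefs" names and an elided fill_super body:

static int examplefs_fill_super(struct super_block *sb, struct fs_context *fc)
{
	/* set sb->s_op, create the root inode, set sb->s_root, ... */
	return 0;
}

static int examplefs_get_tree(struct fs_context *fc)
{
	/* get_tree_single(): one shared superblock system-wide;
	 * get_tree_nodev() would instead give each mount its own. */
	return get_tree_single(fc, examplefs_fill_super);
}

static const struct fs_context_operations examplefs_context_ops = {
	.get_tree = examplefs_get_tree,
};

static int examplefs_init_fs_context(struct fs_context *fc)
{
	fc->ops = &examplefs_context_ops;
	return 0;
}

This is exactly the shape of the openpromfs conversion earlier in the diff; nfsd differs only in keying the superblock by network namespace through vfs_get_keyed_super() and a get_net()/put_net() pair in get_tree/free.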
diff --git a/fs/ubifs/auth.c b/fs/ubifs/auth.c index 60f43b93d06e..d9af2de9084a 100644 --- a/fs/ubifs/auth.c +++ b/fs/ubifs/auth.c @@ -10,10 +10,12 @@ */ #include <linux/crypto.h> +#include <linux/verification.h> #include <crypto/hash.h> #include <crypto/sha.h> #include <crypto/algapi.h> #include <keys/user-type.h> +#include <keys/asymmetric-type.h> #include "ubifs.h" @@ -199,6 +201,77 @@ int __ubifs_node_check_hash(const struct ubifs_info *c, const void *node, } /** + * ubifs_sb_verify_signature - verify the signature of a superblock + * @c: UBIFS file-system description object + * @sup: The superblock node + * + * To support offline signed images the superblock can be signed with a + * PKCS#7 signature. The signature is placed directly behind the superblock + * node in an ubifs_sig_node. + * + * Returns 0 when the signature can be successfully verified or a negative + * error code if not. + */ +int ubifs_sb_verify_signature(struct ubifs_info *c, + const struct ubifs_sb_node *sup) +{ + int err; + struct ubifs_scan_leb *sleb; + struct ubifs_scan_node *snod; + const struct ubifs_sig_node *signode; + + sleb = ubifs_scan(c, UBIFS_SB_LNUM, UBIFS_SB_NODE_SZ, c->sbuf, 0); + if (IS_ERR(sleb)) { + err = PTR_ERR(sleb); + return err; + } + + if (sleb->nodes_cnt == 0) { + ubifs_err(c, "Unable to find signature node"); + err = -EINVAL; + goto out_destroy; + } + + snod = list_first_entry(&sleb->nodes, struct ubifs_scan_node, list); + + if (snod->type != UBIFS_SIG_NODE) { + ubifs_err(c, "Signature node is of wrong type"); + err = -EINVAL; + goto out_destroy; + } + + signode = snod->node; + + if (le32_to_cpu(signode->len) > snod->len + sizeof(struct ubifs_sig_node)) { + ubifs_err(c, "invalid signature len %d", le32_to_cpu(signode->len)); + err = -EINVAL; + goto out_destroy; + } + + if (le32_to_cpu(signode->type) != UBIFS_SIGNATURE_TYPE_PKCS7) { + ubifs_err(c, "Signature type %d is not supported\n", + le32_to_cpu(signode->type)); + err = -EINVAL; + goto out_destroy; + } + + err = verify_pkcs7_signature(sup, sizeof(struct ubifs_sb_node), + signode->sig, le32_to_cpu(signode->len), + NULL, VERIFYING_UNSPECIFIED_SIGNATURE, + NULL, NULL); + + if (err) + ubifs_err(c, "Failed to verify signature"); + else + ubifs_msg(c, "Successfully verified super block signature"); + +out_destroy: + ubifs_scan_destroy(sleb); + + return err; +} + +/** * ubifs_init_authentication - initialize UBIFS authentication support * @c: UBIFS file-system description object * @@ -478,3 +551,16 @@ int ubifs_hmac_wkm(struct ubifs_info *c, u8 *hmac) return err; return 0; } + +/* + * ubifs_hmac_zero - test if a HMAC is zero + * @c: UBIFS file-system description object + * @hmac: the HMAC to test + * + * This function tests if a HMAC is zero and returns true if it is + * and false otherwise. 
+ */ +bool ubifs_hmac_zero(struct ubifs_info *c, const u8 *hmac) +{ + return !memchr_inv(hmac, 0, c->hmac_desc_len); +} diff --git a/fs/ubifs/compress.c b/fs/ubifs/compress.c index 99c53ad11e93..3a92e6af69b2 100644 --- a/fs/ubifs/compress.c +++ b/fs/ubifs/compress.c @@ -59,6 +59,24 @@ static struct ubifs_compressor zlib_compr = { }; #endif +#ifdef CONFIG_UBIFS_FS_ZSTD +static DEFINE_MUTEX(zstd_enc_mutex); +static DEFINE_MUTEX(zstd_dec_mutex); + +static struct ubifs_compressor zstd_compr = { + .compr_type = UBIFS_COMPR_ZSTD, + .comp_mutex = &zstd_enc_mutex, + .decomp_mutex = &zstd_dec_mutex, + .name = "zstd", + .capi_name = "zstd", +}; +#else +static struct ubifs_compressor zstd_compr = { + .compr_type = UBIFS_COMPR_ZSTD, + .name = "zstd", +}; +#endif + /* All UBIFS compressors */ struct ubifs_compressor *ubifs_compressors[UBIFS_COMPR_TYPES_CNT]; @@ -216,13 +234,19 @@ int __init ubifs_compressors_init(void) if (err) return err; - err = compr_init(&zlib_compr); + err = compr_init(&zstd_compr); if (err) goto out_lzo; + err = compr_init(&zlib_compr); + if (err) + goto out_zstd; + ubifs_compressors[UBIFS_COMPR_NONE] = &none_compr; return 0; +out_zstd: + compr_exit(&zstd_compr); out_lzo: compr_exit(&lzo_compr); return err; @@ -235,4 +259,5 @@ void ubifs_compressors_exit(void) { compr_exit(&lzo_compr); compr_exit(&zlib_compr); + compr_exit(&zstd_compr); } diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index e5f8de62fc51..400970d740bb 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1470,7 +1470,7 @@ static int ubifs_migrate_page(struct address_space *mapping, { int rc; - rc = migrate_page_move_mapping(mapping, newpage, page, mode, 0); + rc = migrate_page_move_mapping(mapping, newpage, page, 0); if (rc != MIGRATEPAGE_SUCCESS) return rc; diff --git a/fs/ubifs/log.c b/fs/ubifs/log.c index cd85d7d4c515..b6ac9c4281ef 100644 --- a/fs/ubifs/log.c +++ b/fs/ubifs/log.c @@ -438,10 +438,7 @@ int ubifs_log_start_commit(struct ubifs_info *c, int *ltail_lnum) *ltail_lnum = c->lhead_lnum; c->lhead_offs += len; - if (c->lhead_offs == c->leb_size) { - c->lhead_lnum = ubifs_next_log_lnum(c, c->lhead_lnum); - c->lhead_offs = 0; - } + ubifs_assert(c, c->lhead_offs < c->leb_size); remove_buds(c); diff --git a/fs/ubifs/master.c b/fs/ubifs/master.c index b42a768709c0..52a85c01397e 100644 --- a/fs/ubifs/master.c +++ b/fs/ubifs/master.c @@ -48,6 +48,39 @@ int ubifs_compare_master_node(struct ubifs_info *c, void *m1, void *m2) return 0; } +/* mst_node_check_hash - Check hash of a master node + * @c: UBIFS file-system description object + * @mst: The master node + * @expected: The expected hash of the master node + * + * This checks the hash of a master node against a given expected hash. + * Note that we have two master nodes on a UBIFS image which have different + * sequence numbers and consequently different CRCs. To be able to match + * both master nodes we exclude the common node header containing the sequence + * number and CRC from the hash. + * + * Returns 0 if the hashes are equal, a negative error code otherwise. 
+ */ +static int mst_node_check_hash(const struct ubifs_info *c, + const struct ubifs_mst_node *mst, + const u8 *expected) +{ + u8 calc[UBIFS_MAX_HASH_LEN]; + const void *node = mst; + + SHASH_DESC_ON_STACK(shash, c->hash_tfm); + + shash->tfm = c->hash_tfm; + + crypto_shash_digest(shash, node + sizeof(struct ubifs_ch), + UBIFS_MST_NODE_SZ - sizeof(struct ubifs_ch), calc); + + if (ubifs_check_hash(c, expected, calc)) + return -EPERM; + + return 0; +} + /** * scan_for_master - search the valid master node. * @c: UBIFS file-system description object @@ -102,14 +135,22 @@ static int scan_for_master(struct ubifs_info *c) if (!ubifs_authenticated(c)) return 0; - err = ubifs_node_verify_hmac(c, c->mst_node, - sizeof(struct ubifs_mst_node), - offsetof(struct ubifs_mst_node, hmac)); - if (err) { - ubifs_err(c, "Failed to verify master node HMAC"); - return -EPERM; + if (ubifs_hmac_zero(c, c->mst_node->hmac)) { + err = mst_node_check_hash(c, c->mst_node, + c->sup_node->hash_mst); + if (err) + ubifs_err(c, "Failed to verify master node hash"); + } else { + err = ubifs_node_verify_hmac(c, c->mst_node, + sizeof(struct ubifs_mst_node), + offsetof(struct ubifs_mst_node, hmac)); + if (err) + ubifs_err(c, "Failed to verify master node HMAC"); } + if (err) + return -EPERM; + return 0; out: diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c index cb72688032cd..b52624e28fa1 100644 --- a/fs/ubifs/orphan.c +++ b/fs/ubifs/orphan.c @@ -126,25 +126,11 @@ static void __orphan_drop(struct ubifs_info *c, struct ubifs_orphan *o) kfree(o); } -static void orphan_delete(struct ubifs_info *c, ino_t inum) +static void orphan_delete(struct ubifs_info *c, struct ubifs_orphan *orph) { - struct ubifs_orphan *orph, *child_orph, *tmp_o; - - spin_lock(&c->orphan_lock); - - orph = lookup_orphan(c, inum); - if (!orph) { - spin_unlock(&c->orphan_lock); - ubifs_err(c, "missing orphan ino %lu", (unsigned long)inum); - dump_stack(); - - return; - } - if (orph->del) { spin_unlock(&c->orphan_lock); - dbg_gen("deleted twice ino %lu", - (unsigned long)inum); + dbg_gen("deleted twice ino %lu", orph->inum); return; } @@ -153,19 +139,11 @@ static void orphan_delete(struct ubifs_info *c, ino_t inum) orph->dnext = c->orph_dnext; c->orph_dnext = orph; spin_unlock(&c->orphan_lock); - dbg_gen("delete later ino %lu", - (unsigned long)inum); + dbg_gen("delete later ino %lu", orph->inum); return; } - list_for_each_entry_safe(child_orph, tmp_o, &orph->child_list, child_list) { - list_del(&child_orph->child_list); - __orphan_drop(c, child_orph); - } - __orphan_drop(c, orph); - - spin_unlock(&c->orphan_lock); } /** @@ -223,7 +201,27 @@ int ubifs_add_orphan(struct ubifs_info *c, ino_t inum) */ void ubifs_delete_orphan(struct ubifs_info *c, ino_t inum) { - orphan_delete(c, inum); + struct ubifs_orphan *orph, *child_orph, *tmp_o; + + spin_lock(&c->orphan_lock); + + orph = lookup_orphan(c, inum); + if (!orph) { + spin_unlock(&c->orphan_lock); + ubifs_err(c, "missing orphan ino %lu", (unsigned long)inum); + dump_stack(); + + return; + } + + list_for_each_entry_safe(child_orph, tmp_o, &orph->child_list, child_list) { + list_del(&child_orph->child_list); + orphan_delete(c, child_orph); + } + + orphan_delete(c, orph); + + spin_unlock(&c->orphan_lock); } /** @@ -630,6 +628,7 @@ static int do_kill_orphans(struct ubifs_info *c, struct ubifs_scan_leb *sleb, { struct ubifs_scan_node *snod; struct ubifs_orph_node *orph; + struct ubifs_ino_node *ino = NULL; unsigned long long cmt_no; ino_t inum; int i, n, err, first = 1; @@ -676,23 +675,40 @@ static int 
do_kill_orphans(struct ubifs_info *c, struct ubifs_scan_leb *sleb, if (first) first = 0; + ino = kmalloc(UBIFS_MAX_INO_NODE_SZ, GFP_NOFS); + if (!ino) + return -ENOMEM; + n = (le32_to_cpu(orph->ch.len) - UBIFS_ORPH_NODE_SZ) >> 3; for (i = 0; i < n; i++) { union ubifs_key key1, key2; inum = le64_to_cpu(orph->inos[i]); - dbg_rcvry("deleting orphaned inode %lu", - (unsigned long)inum); - - lowest_ino_key(c, &key1, inum); - highest_ino_key(c, &key2, inum); - err = ubifs_tnc_remove_range(c, &key1, &key2); + ino_key_init(c, &key1, inum); + err = ubifs_tnc_lookup(c, &key1, ino); if (err) - return err; + goto out_free; + + /* + * Check whether an inode can really get deleted: an inode that was + * created with O_TMPFILE starts out orphaned, but linkat() can give + * it a link again, so only remove inodes whose link count is still + * zero. + */ + if (ino->nlink == 0) { + dbg_rcvry("deleting orphaned inode %lu", + (unsigned long)inum); + + lowest_ino_key(c, &key1, inum); + highest_ino_key(c, &key2, inum); + + err = ubifs_tnc_remove_range(c, &key1, &key2); + if (err) + goto out_ro; + } + err = insert_dead_orphan(c, inum); if (err) - return err; + goto out_free; } *last_cmt_no = cmt_no; @@ -704,7 +720,15 @@ static int do_kill_orphans(struct ubifs_info *c, struct ubifs_scan_leb *sleb, *last_flagged = 0; } - return 0; + err = 0; +out_free: + kfree(ino); + return err; + +out_ro: + ubifs_ro_mode(c, err); + kfree(ino); + return err; } /** diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c index 3fc589881825..f116f7b3f9e5 100644 --- a/fs/ubifs/recovery.c +++ b/fs/ubifs/recovery.c @@ -818,7 +818,7 @@ static int get_cs_sqnum(struct ubifs_info *c, int lnum, int offs, goto out_err; } if (cs_node->ch.node_type != UBIFS_CS_NODE) { - ubifs_err(c, "Node a CS node, type is %d", cs_node->ch.node_type); + ubifs_err(c, "Not a CS node, type is %d", cs_node->ch.node_type); goto out_err; } if (le64_to_cpu(cs_node->cmt_no) != c->cmt_no) { diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c index 12c2afdb5804..a551eb3e9b89 100644 --- a/fs/ubifs/sb.c +++ b/fs/ubifs/sb.c @@ -578,17 +578,26 @@ static int authenticate_sb_node(struct ubifs_info *c, return -EINVAL; } - err = ubifs_hmac_wkm(c, hmac_wkm); - if (err) - return err; - - if (ubifs_check_hmac(c, hmac_wkm, sup->hmac_wkm)) { - ubifs_err(c, "provided key does not fit"); - return -ENOKEY; + /* + * The super block node can either be authenticated by an HMAC or + * by a signature in a ubifs_sig_node directly following the + * super block node to support offline image creation.
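+ *
+ * A zero HMAC serves as the marker for such offline-signed images:
+ * the host that builds the image does not hold the device's HMAC key,
+ * so the superblock carries a PKCS#7 signature node
+ * (UBIFS_SIGNATURE_TYPE_PKCS7) and is verified against that instead.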
+ */ + if (ubifs_hmac_zero(c, sup->hmac)) { + err = ubifs_sb_verify_signature(c, sup); + } else { + err = ubifs_hmac_wkm(c, hmac_wkm); + if (err) + return err; + if (ubifs_check_hmac(c, hmac_wkm, sup->hmac_wkm)) { + ubifs_err(c, "provided key does not fit"); + return -ENOKEY; + } + err = ubifs_node_verify_hmac(c, sup, sizeof(*sup), + offsetof(struct ubifs_sb_node, + hmac)); } - err = ubifs_node_verify_hmac(c, sup, sizeof(*sup), - offsetof(struct ubifs_sb_node, hmac)); if (err) ubifs_err(c, "Failed to authenticate superblock: %d", err); @@ -744,21 +753,16 @@ int ubifs_read_superblock(struct ubifs_info *c) } /* Automatically increase file system size to the maximum size */ - c->old_leb_cnt = c->leb_cnt; if (c->leb_cnt < c->vi.size && c->leb_cnt < c->max_leb_cnt) { + int old_leb_cnt = c->leb_cnt; + c->leb_cnt = min_t(int, c->max_leb_cnt, c->vi.size); - if (c->ro_mount) - dbg_mnt("Auto resizing (ro) from %d LEBs to %d LEBs", - c->old_leb_cnt, c->leb_cnt); - else { - dbg_mnt("Auto resizing (sb) from %d LEBs to %d LEBs", - c->old_leb_cnt, c->leb_cnt); - sup->leb_cnt = cpu_to_le32(c->leb_cnt); - err = ubifs_write_sb_node(c, sup); - if (err) - goto out; - c->old_leb_cnt = c->leb_cnt; - } + sup->leb_cnt = cpu_to_le32(c->leb_cnt); + + c->superblock_need_write = 1; + + dbg_mnt("Auto resizing from %d LEBs to %d LEBs", + old_leb_cnt, c->leb_cnt); } c->log_bytes = (long long)c->log_lebs * c->leb_size; @@ -916,9 +920,7 @@ int ubifs_fixup_free_space(struct ubifs_info *c) c->space_fixup = 0; sup->flags &= cpu_to_le32(~UBIFS_FLG_SPACE_FIXUP); - err = ubifs_write_sb_node(c, sup); - if (err) - return err; + c->superblock_need_write = 1; ubifs_msg(c, "free space fixup complete"); return err; diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index fd1977b568f0..2c0803b0ac3a 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -566,6 +566,8 @@ static int init_constants_early(struct ubifs_info *c) c->ranges[UBIFS_AUTH_NODE].min_len = UBIFS_AUTH_NODE_SZ; c->ranges[UBIFS_AUTH_NODE].max_len = UBIFS_AUTH_NODE_SZ + UBIFS_MAX_HMAC_LEN; + c->ranges[UBIFS_SIG_NODE].min_len = UBIFS_SIG_NODE_SZ; + c->ranges[UBIFS_SIG_NODE].max_len = c->leb_size - UBIFS_SB_NODE_SZ; c->ranges[UBIFS_INO_NODE].min_len = UBIFS_INO_NODE_SZ; c->ranges[UBIFS_INO_NODE].max_len = UBIFS_MAX_INO_NODE_SZ; @@ -1043,6 +1045,8 @@ static int ubifs_parse_options(struct ubifs_info *c, char *options, c->mount_opts.compr_type = UBIFS_COMPR_LZO; else if (!strcmp(name, "zlib")) c->mount_opts.compr_type = UBIFS_COMPR_ZLIB; + else if (!strcmp(name, "zstd")) + c->mount_opts.compr_type = UBIFS_COMPR_ZSTD; else { ubifs_err(c, "unknown compressor \"%s\"", name); //FIXME: is c ready? 
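+ /*
+  * The compressor names accepted above correspond to the -o compr=
+  * mount option; a hedged usage example (volume name illustrative):
+  *
+  *	mount -t ubifs -o compr=zstd ubi0:rootfs /mnt
+  */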
kfree(name); @@ -1296,8 +1300,7 @@ static int mount_ubifs(struct ubifs_info *c) if (err) goto out_free; - sz = ALIGN(c->max_idx_node_sz, c->min_io_size); - sz = ALIGN(sz + c->max_idx_node_sz, c->min_io_size); + sz = ALIGN(c->max_idx_node_sz, c->min_io_size) * 2; c->cbuf = kmalloc(sz, GFP_NOFS); if (!c->cbuf) { err = -ENOMEM; @@ -1360,6 +1363,26 @@ static int mount_ubifs(struct ubifs_info *c) goto out_lpt; } + /* + * Handle offline signed images: Now that the master node is + * written and its validation no longer depends on the hash + * in the superblock, we can update the offline signed + * superblock with an HMAC version. + */ + if (ubifs_authenticated(c) && ubifs_hmac_zero(c, c->sup_node->hmac)) { + err = ubifs_hmac_wkm(c, c->sup_node->hmac_wkm); + if (err) + goto out_lpt; + c->superblock_need_write = 1; + } + + if (!c->ro_mount && c->superblock_need_write) { + err = ubifs_write_sb_node(c, c->sup_node); + if (err) + goto out_lpt; + c->superblock_need_write = 0; + } + err = dbg_check_idx_size(c, c->bi.old_idx_sz); if (err) goto out_lpt; @@ -1642,15 +1665,6 @@ static int ubifs_remount_rw(struct ubifs_info *c) if (err) goto out; - if (c->old_leb_cnt != c->leb_cnt) { - struct ubifs_sb_node *sup = c->sup_node; - - sup->leb_cnt = cpu_to_le32(c->leb_cnt); - err = ubifs_write_sb_node(c, sup); - if (err) - goto out; - } - if (c->need_recovery) { ubifs_msg(c, "completing deferred recovery"); err = ubifs_write_rcvrd_mst_node(c); @@ -1682,6 +1696,16 @@ static int ubifs_remount_rw(struct ubifs_info *c) goto out; } + if (c->superblock_need_write) { + struct ubifs_sb_node *sup = c->sup_node; + + err = ubifs_write_sb_node(c, sup); + if (err) + goto out; + + c->superblock_need_write = 0; + } + c->ileb_buf = vmalloc(c->leb_size); if (!c->ileb_buf) { err = -ENOMEM; diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c index f5a823cb0e43..e8e7b0e9532e 100644 --- a/fs/ubifs/tnc.c +++ b/fs/ubifs/tnc.c @@ -1158,8 +1158,8 @@ static struct ubifs_znode *dirty_cow_bottom_up(struct ubifs_info *c, * o exact match, i.e. the found zero-level znode contains key @key, then %1 * is returned and slot number of the matched branch is stored in @n; * o not exact match, which means that zero-level znode does not contain * @key, then %0 is returned and slot number of the closest branch is stored * in @n; + * @key, then %0 is returned and slot number of the closest branch or %-1 + * is stored in @n; in this case calling tnc_next() first is mandatory (see the sketch below). * o @key is so small that it is even less than the lowest key of the * leftmost zero-level node, then %0 is returned and %0 is stored in @n.
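+ *
+ * A minimal sketch of that contract, mirroring the search_dh_cookie()
+ * change below: on a %0 return the caller first steps forward, since
+ * @n may still be %-1:
+ *
+ *	if (!exact) {
+ *		err = tnc_next(c, &znode, n);
+ *		if (err)
+ *			return err;
+ *	}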
* @@ -1882,13 +1882,19 @@ int ubifs_tnc_lookup_nm(struct ubifs_info *c, const union ubifs_key *key, static int search_dh_cookie(struct ubifs_info *c, const union ubifs_key *key, struct ubifs_dent_node *dent, uint32_t cookie, - struct ubifs_znode **zn, int *n) + struct ubifs_znode **zn, int *n, int exact) { int err; struct ubifs_znode *znode = *zn; struct ubifs_zbranch *zbr; union ubifs_key *dkey; + if (!exact) { + err = tnc_next(c, &znode, n); + if (err) + return err; + } + for (;;) { zbr = &znode->zbranch[*n]; dkey = &zbr->key; @@ -1930,7 +1936,7 @@ static int do_lookup_dh(struct ubifs_info *c, const union ubifs_key *key, if (unlikely(err < 0)) goto out_unlock; - err = search_dh_cookie(c, key, dent, cookie, &znode, &n); + err = search_dh_cookie(c, key, dent, cookie, &znode, &n, err); out_unlock: mutex_unlock(&c->tnc_mutex); @@ -2723,7 +2729,7 @@ int ubifs_tnc_remove_dh(struct ubifs_info *c, const union ubifs_key *key, if (unlikely(err < 0)) goto out_free; - err = search_dh_cookie(c, key, dent, cookie, &znode, &n); + err = search_dh_cookie(c, key, dent, cookie, &znode, &n, err); if (err) goto out_free; } diff --git a/fs/ubifs/ubifs-media.h b/fs/ubifs/ubifs-media.h index 86f0f2be116c..3c9792cbb6ff 100644 --- a/fs/ubifs/ubifs-media.h +++ b/fs/ubifs/ubifs-media.h @@ -275,6 +275,8 @@ enum { #define UBIFS_CS_NODE_SZ sizeof(struct ubifs_cs_node) #define UBIFS_ORPH_NODE_SZ sizeof(struct ubifs_orph_node) #define UBIFS_AUTH_NODE_SZ sizeof(struct ubifs_auth_node) +#define UBIFS_SIG_NODE_SZ sizeof(struct ubifs_sig_node) + /* Extended attribute entry nodes are identical to directory entry nodes */ #define UBIFS_XENT_NODE_SZ UBIFS_DENT_NODE_SZ /* Only this does not have to be multiple of 8 bytes */ @@ -301,6 +303,8 @@ enum { */ #define UBIFS_XATTR_NAME_ENCRYPTION_CONTEXT "c" +/* Type field in ubifs_sig_node */ +#define UBIFS_SIGNATURE_TYPE_PKCS7 1 /* * On-flash inode flags. @@ -336,12 +340,14 @@ enum { * UBIFS_COMPR_NONE: no compression * UBIFS_COMPR_LZO: LZO compression * UBIFS_COMPR_ZLIB: ZLIB compression + * UBIFS_COMPR_ZSTD: ZSTD compression * UBIFS_COMPR_TYPES_CNT: count of supported compression types */ enum { UBIFS_COMPR_NONE, UBIFS_COMPR_LZO, UBIFS_COMPR_ZLIB, + UBIFS_COMPR_ZSTD, UBIFS_COMPR_TYPES_CNT, }; @@ -361,6 +367,7 @@ enum { * UBIFS_CS_NODE: commit start node * UBIFS_ORPH_NODE: orphan node * UBIFS_AUTH_NODE: authentication node + * UBIFS_SIG_NODE: signature node * UBIFS_NODE_TYPES_CNT: count of supported node types * * Note, we index arrays by these numbers, so keep them low and contiguous. @@ -381,6 +388,7 @@ enum { UBIFS_CS_NODE, UBIFS_ORPH_NODE, UBIFS_AUTH_NODE, + UBIFS_SIG_NODE, UBIFS_NODE_TYPES_CNT, }; @@ -638,6 +646,8 @@ struct ubifs_pad_node { * @hmac_wkm: HMAC of a well known message (the string "UBIFS") as a convenience * to the user to check if the correct key is passed. 
* @hash_algo: The hash algo used for this filesystem (one of enum hash_algo) + * @hash_mst: hash of the master node, only valid for signed images in which the + * master node does not contain an HMAC */ struct ubifs_sb_node { struct ubifs_ch ch; @@ -668,7 +678,8 @@ struct ubifs_sb_node { __u8 hmac[UBIFS_MAX_HMAC_LEN]; __u8 hmac_wkm[UBIFS_MAX_HMAC_LEN]; __le16 hash_algo; - __u8 padding2[3838]; + __u8 hash_mst[UBIFS_MAX_HASH_LEN]; + __u8 padding2[3774]; } __packed; /** @@ -771,6 +782,23 @@ struct ubifs_auth_node { } __packed; /** + * struct ubifs_sig_node - node for signing other nodes + * @ch: common header + * @type: type of the signature, currently only UBIFS_SIGNATURE_TYPE_PKCS7 + * is supported + * @len: The length of the signature data + * @padding: reserved for future, zeroes + * @sig: The signature data + */ +struct ubifs_sig_node { + struct ubifs_ch ch; + __le32 type; + __le32 len; + __u8 padding[32]; + __u8 sig[]; +} __packed; + +/** * struct ubifs_branch - key/reference/length branch * @lnum: LEB number of the target node * @offs: offset within @lnum diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 745b23e5b406..c55f212dcb75 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -1104,7 +1104,6 @@ struct ubifs_debug_info; * used to store indexing nodes (@leb_size - @max_idx_node_sz) * @leb_cnt: count of logical eraseblocks * @max_leb_cnt: maximum count of logical eraseblocks - * @old_leb_cnt: count of logical eraseblocks before re-size * @ro_media: the underlying UBI volume is read-only * @ro_mount: the file-system was mounted as read-only * @ro_error: UBIFS switched to R/O mode because an error happened @@ -1295,6 +1294,7 @@ struct ubifs_info { unsigned int rw_incompat:1; unsigned int assert_action:2; unsigned int authenticated:1; + unsigned int superblock_need_write:1; struct mutex tnc_mutex; struct ubifs_zbranch zroot; @@ -1352,7 +1352,6 @@ struct ubifs_info { int idx_leb_size; int leb_cnt; int max_leb_cnt; - int old_leb_cnt; unsigned int ro_media:1; unsigned int ro_mount:1; unsigned int ro_error:1; @@ -1680,6 +1679,9 @@ static inline int ubifs_auth_node_sz(const struct ubifs_info *c) else return 0; } +int ubifs_sb_verify_signature(struct ubifs_info *c, + const struct ubifs_sb_node *sup); +bool ubifs_hmac_zero(struct ubifs_info *c, const u8 *hmac); int ubifs_hmac_wkm(struct ubifs_info *c, u8 *hmac); diff --git a/fs/ufs/super.c b/fs/ufs/super.c index 3d247c0d92aa..4ed0dca52ec8 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -1407,11 +1407,9 @@ static int ufs_statfs(struct dentry *dentry, struct kstatfs *buf) struct super_block *sb = dentry->d_sb; struct ufs_sb_private_info *uspi= UFS_SB(sb)->s_uspi; unsigned flags = UFS_SB(sb)->s_flags; - struct ufs_super_block_third *usb3; u64 id = huge_encode_dev(sb->s_bdev->bd_dev); mutex_lock(&UFS_SB(sb)->s_lock); - usb3 = ubh_get_usb_third(uspi); if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) buf->f_type = UFS2_MAGIC; diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index b74a47169297..06b68b6115bc 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -49,6 +49,7 @@ xfs-y += $(addprefix libxfs/, \ xfs_refcount_btree.o \ xfs_sb.o \ xfs_symlink_remote.o \ + xfs_trans_inode.o \ xfs_trans_resv.o \ xfs_types.o \ ) @@ -107,8 +108,7 @@ xfs-y += xfs_log.o \ xfs_rmap_item.o \ xfs_log_recover.o \ xfs_trans_ail.o \ - xfs_trans_buf.o \ - xfs_trans_inode.o + xfs_trans_buf.o # optional features xfs-$(CONFIG_XFS_QUOTA) += xfs_dquot.o \ diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/libxfs/xfs_trans_inode.c index 93d14e47269d..a9ad90926b87 100644 ---
a/fs/xfs/xfs_trans_inode.c +++ b/fs/xfs/libxfs/xfs_trans_inode.c @@ -66,6 +66,10 @@ xfs_trans_ichgtime( inode->i_mtime = tv; if (flags & XFS_ICHGTIME_CHG) inode->i_ctime = tv; + if (flags & XFS_ICHGTIME_CREATE) { + ip->i_d.di_crtime.t_sec = (int32_t)tv.tv_sec; + ip->i_d.di_crtime.t_nsec = (int32_t)tv.tv_nsec; + } } /* diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 761248ee2778..f16d5f196c6b 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -796,7 +796,7 @@ xfs_add_to_ioend( } wpc->ioend->io_size += len; - wbc_account_io(wbc, page, len); + wbc_account_cgroup_owner(wbc, page, len); } STATIC void diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index e93bacbd49ae..28101bbc0b78 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1197,11 +1197,14 @@ xfs_file_mmap( struct file *filp, struct vm_area_struct *vma) { + struct dax_device *dax_dev; + + dax_dev = xfs_find_daxdev_for_inode(file_inode(filp)); /* - * We don't support synchronous mappings for non-DAX files. At least - * until someone comes with a sensible use case. + * We don't support synchronous mappings for non-DAX files, nor + * for DAX files whose underlying dax_device is not synchronous. + */ - if (!IS_DAX(file_inode(filp)) && (vma->vm_flags & VM_SYNC)) + if (!daxdev_mapping_supported(vma, dax_dev)) return -EOPNOTSUPP; file_accessed(filp);
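/*
 * A hedged userspace sketch of the behaviour gated by the xfs_file_mmap()
 * hunk above: a MAP_SYNC mapping now requires both a DAX file and a
 * synchronous dax_device; otherwise mmap() fails with EOPNOTSUPP. The
 * file path is illustrative only, and MAP_SYNC may need <linux/mman.h>
 * with older C libraries.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/mnt/pmem/testfile", O_RDWR);	/* hypothetical DAX-backed file */
	void *p;

	if (fd < 0)
		return 1;

	p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
		 MAP_SHARED_VALIDATE | MAP_SYNC, fd, 0);
	if (p == MAP_FAILED)
		perror("mmap");	/* EOPNOTSUPP if the mapping is not supported */
	else
		munmap(p, 4096);

	close(fd);
	return 0;
}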