summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/befs/debug.c1
-rw-r--r--fs/buffer.c22
-rw-r--r--fs/direct-io.c2
-rw-r--r--fs/ext3/Kconfig19
-rw-r--r--fs/ext3/super.c8
-rw-r--r--fs/jbd/commit.c7
-rw-r--r--fs/jbd2/commit.c13
-rw-r--r--fs/jffs2/acl.c4
-rw-r--r--fs/jffs2/malloc.c6
-rw-r--r--fs/libfs.c16
-rw-r--r--fs/lockd/svclock.c13
-rw-r--r--fs/nfs/super.c2
-rw-r--r--fs/nfsd/Kconfig1
-rw-r--r--fs/nfsd/nfs3proc.c10
-rw-r--r--fs/nfsd/nfs4callback.c47
-rw-r--r--fs/nfsd/nfs4proc.c246
-rw-r--r--fs/nfsd/nfs4recover.c74
-rw-r--r--fs/nfsd/nfs4state.c1196
-rw-r--r--fs/nfsd/nfs4xdr.c633
-rw-r--r--fs/nfsd/nfsctl.c38
-rw-r--r--fs/nfsd/nfsproc.c3
-rw-r--r--fs/nfsd/nfssvc.c88
-rw-r--r--fs/nfsd/vfs.c37
-rw-r--r--fs/romfs/Kconfig48
-rw-r--r--fs/romfs/Makefile9
-rw-r--r--fs/romfs/inode.c665
-rw-r--r--fs/romfs/internal.h47
-rw-r--r--fs/romfs/mmap-nommu.c75
-rw-r--r--fs/romfs/storage.c261
-rw-r--r--fs/romfs/super.c648
-rw-r--r--fs/squashfs/export.c1
-rw-r--r--fs/ubifs/budget.c37
-rw-r--r--fs/ubifs/debug.c6
-rw-r--r--fs/ubifs/file.c16
-rw-r--r--fs/ubifs/find.c12
-rw-r--r--fs/ubifs/gc.c428
-rw-r--r--fs/ubifs/journal.c7
-rw-r--r--fs/ubifs/key.h6
-rw-r--r--fs/ubifs/log.c5
-rw-r--r--fs/ubifs/lpt_commit.c34
-rw-r--r--fs/ubifs/recovery.c70
-rw-r--r--fs/ubifs/replay.c2
-rw-r--r--fs/ubifs/sb.c36
-rw-r--r--fs/ubifs/shrinker.c6
-rw-r--r--fs/ubifs/super.c37
-rw-r--r--fs/ubifs/tnc.c2
-rw-r--r--fs/ubifs/ubifs-media.h30
-rw-r--r--fs/ubifs/ubifs.h13
48 files changed, 3656 insertions, 1331 deletions
diff --git a/fs/befs/debug.c b/fs/befs/debug.c
index b8e304a0661e..622e73775c83 100644
--- a/fs/befs/debug.c
+++ b/fs/befs/debug.c
@@ -17,6 +17,7 @@
#include <linux/spinlock.h>
#include <linux/kernel.h>
#include <linux/fs.h>
+#include <linux/slab.h>
#endif /* __KERNEL__ */
diff --git a/fs/buffer.c b/fs/buffer.c
index 5d55a896ff78..6e35762b6169 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -737,7 +737,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
{
struct buffer_head *bh;
struct list_head tmp;
- struct address_space *mapping;
+ struct address_space *mapping, *prev_mapping = NULL;
int err = 0, err2;
INIT_LIST_HEAD(&tmp);
@@ -762,7 +762,18 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
* contents - it is a noop if I/O is still in
* flight on potentially older contents.
*/
- ll_rw_block(SWRITE_SYNC, 1, &bh);
+ ll_rw_block(SWRITE_SYNC_PLUG, 1, &bh);
+
+ /*
+ * Kick off IO for the previous mapping. Note
+ * that we will not run the very last mapping,
+ * wait_on_buffer() will do that for us
+ * through sync_buffer().
+ */
+ if (prev_mapping && prev_mapping != mapping)
+ blk_run_address_space(prev_mapping);
+ prev_mapping = mapping;
+
brelse(bh);
spin_lock(lock);
}
@@ -2957,12 +2968,13 @@ void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
for (i = 0; i < nr; i++) {
struct buffer_head *bh = bhs[i];
- if (rw == SWRITE || rw == SWRITE_SYNC)
+ if (rw == SWRITE || rw == SWRITE_SYNC || rw == SWRITE_SYNC_PLUG)
lock_buffer(bh);
else if (!trylock_buffer(bh))
continue;
- if (rw == WRITE || rw == SWRITE || rw == SWRITE_SYNC) {
+ if (rw == WRITE || rw == SWRITE || rw == SWRITE_SYNC ||
+ rw == SWRITE_SYNC_PLUG) {
if (test_clear_buffer_dirty(bh)) {
bh->b_end_io = end_buffer_write_sync;
get_bh(bh);
@@ -2998,7 +3010,7 @@ int sync_dirty_buffer(struct buffer_head *bh)
if (test_clear_buffer_dirty(bh)) {
get_bh(bh);
bh->b_end_io = end_buffer_write_sync;
- ret = submit_bh(WRITE, bh);
+ ret = submit_bh(WRITE_SYNC, bh);
wait_on_buffer(bh);
if (buffer_eopnotsupp(bh)) {
clear_buffer_eopnotsupp(bh);
diff --git a/fs/direct-io.c b/fs/direct-io.c
index b6d43908ff7a..da258e7249cc 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -1126,7 +1126,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
int acquire_i_mutex = 0;
if (rw & WRITE)
- rw = WRITE_SYNC;
+ rw = WRITE_ODIRECT;
if (bdev)
bdev_blkbits = blksize_bits(bdev_hardsect_size(bdev));
diff --git a/fs/ext3/Kconfig b/fs/ext3/Kconfig
index 8e0cfe44b0fc..fb3c1a21b135 100644
--- a/fs/ext3/Kconfig
+++ b/fs/ext3/Kconfig
@@ -28,6 +28,25 @@ config EXT3_FS
To compile this file system support as a module, choose M here: the
module will be called ext3.
+config EXT3_DEFAULTS_TO_ORDERED
+ bool "Default to 'data=ordered' in ext3 (legacy option)"
+ depends on EXT3_FS
+ help
+ If a filesystem does not explicitly specify a data ordering
+ mode, and the journal capabilities allow it, ext3 historically
+ defaulted to 'data=ordered'.
+
+ That was a rather unfortunate choice, because it leads to all
+ kinds of latency problems, and the 'data=writeback' mode is more
+ appropriate these days.
+
+ You should probably always answer 'n' here, and if you really
+ want to use 'data=ordered' mode, set it in the filesystem itself
+ with 'tune2fs -o journal_data_ordered'.
+
+ But if you really want to enable the legacy default, you can do
+ so by answering 'y' to this question.
+
config EXT3_FS_XATTR
bool "Ext3 extended attributes"
depends on EXT3_FS
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 9e5b8e387e1e..599dbfe504c3 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -44,6 +44,12 @@
#include "acl.h"
#include "namei.h"
+#ifdef CONFIG_EXT3_DEFAULTS_TO_ORDERED
+ #define EXT3_MOUNT_DEFAULT_DATA_MODE EXT3_MOUNT_ORDERED_DATA
+#else
+ #define EXT3_MOUNT_DEFAULT_DATA_MODE EXT3_MOUNT_WRITEBACK_DATA
+#endif
+
static int ext3_load_journal(struct super_block *, struct ext3_super_block *,
unsigned long journal_devnum);
static int ext3_create_journal(struct super_block *, struct ext3_super_block *,
@@ -1919,7 +1925,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
cope, else JOURNAL_DATA */
if (journal_check_available_features
(sbi->s_journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE))
- set_opt(sbi->s_mount_opt, ORDERED_DATA);
+ set_opt(sbi->s_mount_opt, DEFAULT_DATA_MODE);
else
set_opt(sbi->s_mount_opt, JOURNAL_DATA);
break;
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index f8077b9c8981..a8e8513a78a9 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -351,8 +351,13 @@ void journal_commit_transaction(journal_t *journal)
spin_lock(&journal->j_state_lock);
commit_transaction->t_state = T_LOCKED;
+ /*
+ * Use plugged writes here, since we want to submit several before
+ * we unplug the device. We don't do explicit unplugging in here,
+ * instead we rely on sync_buffer() doing the unplug for us.
+ */
if (commit_transaction->t_synchronous_commit)
- write_op = WRITE_SYNC;
+ write_op = WRITE_SYNC_PLUG;
spin_lock(&commit_transaction->t_handle_lock);
while (commit_transaction->t_updates) {
DEFINE_WAIT(wait);
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 4ea72377c7a2..073c8c3df7cd 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -138,7 +138,7 @@ static int journal_submit_commit_record(journal_t *journal,
set_buffer_ordered(bh);
barrier_done = 1;
}
- ret = submit_bh(WRITE_SYNC, bh);
+ ret = submit_bh(WRITE_SYNC_PLUG, bh);
if (barrier_done)
clear_buffer_ordered(bh);
@@ -159,7 +159,7 @@ static int journal_submit_commit_record(journal_t *journal,
lock_buffer(bh);
set_buffer_uptodate(bh);
clear_buffer_dirty(bh);
- ret = submit_bh(WRITE_SYNC, bh);
+ ret = submit_bh(WRITE_SYNC_PLUG, bh);
}
*cbh = bh;
return ret;
@@ -190,7 +190,7 @@ retry:
set_buffer_uptodate(bh);
bh->b_end_io = journal_end_buffer_io_sync;
- ret = submit_bh(WRITE_SYNC, bh);
+ ret = submit_bh(WRITE_SYNC_PLUG, bh);
if (ret) {
unlock_buffer(bh);
return ret;
@@ -402,8 +402,13 @@ void jbd2_journal_commit_transaction(journal_t *journal)
spin_lock(&journal->j_state_lock);
commit_transaction->t_state = T_LOCKED;
+ /*
+ * Use plugged writes here, since we want to submit several before
+ * we unplug the device. We don't do explicit unplugging in here,
+ * instead we rely on sync_buffer() doing the unplug for us.
+ */
if (commit_transaction->t_synchronous_commit)
- write_op = WRITE_SYNC;
+ write_op = WRITE_SYNC_PLUG;
stats.u.run.rs_wait = commit_transaction->t_max_wait;
stats.u.run.rs_locked = jiffies;
stats.u.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
index 77ccf8cb0823..043740dde20c 100644
--- a/fs/jffs2/acl.c
+++ b/fs/jffs2/acl.c
@@ -38,12 +38,12 @@ static int jffs2_acl_count(size_t size)
size_t s;
size -= sizeof(struct jffs2_acl_header);
- s = size - 4 * sizeof(struct jffs2_acl_entry_short);
- if (s < 0) {
+ if (size < 4 * sizeof(struct jffs2_acl_entry_short)) {
if (size % sizeof(struct jffs2_acl_entry_short))
return -1;
return size / sizeof(struct jffs2_acl_entry_short);
} else {
+ s = size - 4 * sizeof(struct jffs2_acl_entry_short);
if (s % sizeof(struct jffs2_acl_entry))
return -1;
return s / sizeof(struct jffs2_acl_entry) + 4;
diff --git a/fs/jffs2/malloc.c b/fs/jffs2/malloc.c
index f9211252b5f1..9eff2bdae8a7 100644
--- a/fs/jffs2/malloc.c
+++ b/fs/jffs2/malloc.c
@@ -284,10 +284,9 @@ void jffs2_free_inode_cache(struct jffs2_inode_cache *x)
struct jffs2_xattr_datum *jffs2_alloc_xattr_datum(void)
{
struct jffs2_xattr_datum *xd;
- xd = kmem_cache_alloc(xattr_datum_cache, GFP_KERNEL);
+ xd = kmem_cache_zalloc(xattr_datum_cache, GFP_KERNEL);
dbg_memalloc("%p\n", xd);
- memset(xd, 0, sizeof(struct jffs2_xattr_datum));
xd->class = RAWNODE_CLASS_XATTR_DATUM;
xd->node = (void *)xd;
INIT_LIST_HEAD(&xd->xindex);
@@ -303,10 +302,9 @@ void jffs2_free_xattr_datum(struct jffs2_xattr_datum *xd)
struct jffs2_xattr_ref *jffs2_alloc_xattr_ref(void)
{
struct jffs2_xattr_ref *ref;
- ref = kmem_cache_alloc(xattr_ref_cache, GFP_KERNEL);
+ ref = kmem_cache_zalloc(xattr_ref_cache, GFP_KERNEL);
dbg_memalloc("%p\n", ref);
- memset(ref, 0, sizeof(struct jffs2_xattr_ref));
ref->class = RAWNODE_CLASS_XATTR_REF;
ref->node = (void *)ref;
return ref;
diff --git a/fs/libfs.c b/fs/libfs.c
index 4910a36f516e..cd223190c4e9 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -575,6 +575,21 @@ ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos,
* possibly a read which collects the result - which is stored in a
* file-local buffer.
*/
+
+void simple_transaction_set(struct file *file, size_t n)
+{
+ struct simple_transaction_argresp *ar = file->private_data;
+
+ BUG_ON(n > SIMPLE_TRANSACTION_LIMIT);
+
+ /*
+ * The barrier ensures that ar->size will really remain zero until
+ * ar->data is ready for reading.
+ */
+ smp_mb();
+ ar->size = n;
+}
+
char *simple_transaction_get(struct file *file, const char __user *buf, size_t size)
{
struct simple_transaction_argresp *ar;
@@ -820,6 +835,7 @@ EXPORT_SYMBOL(simple_sync_file);
EXPORT_SYMBOL(simple_unlink);
EXPORT_SYMBOL(simple_read_from_buffer);
EXPORT_SYMBOL(memory_read_from_buffer);
+EXPORT_SYMBOL(simple_transaction_set);
EXPORT_SYMBOL(simple_transaction_get);
EXPORT_SYMBOL(simple_transaction_read);
EXPORT_SYMBOL(simple_transaction_release);
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index 763b78a6e9de..83ee34203bd7 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -426,8 +426,15 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
ret = nlm_granted;
goto out;
case -EAGAIN:
+ /*
+ * If this is a blocking request for an
+ * already pending lock request then we need
+ * to put it back on lockd's block list
+ */
+ if (wait)
+ break;
ret = nlm_lck_denied;
- break;
+ goto out;
case FILE_LOCK_DEFERRED:
if (wait)
break;
@@ -443,10 +450,6 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
goto out;
}
- ret = nlm_lck_denied;
- if (!wait)
- goto out;
-
ret = nlm_lck_blocked;
/* Append to list of blocked */
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 82eaadbff408..6717200923fe 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1228,7 +1228,6 @@ static int nfs_parse_mount_options(char *raw,
goto out_nomem;
token = match_token(string,
nfs_xprt_protocol_tokens, args);
- kfree(string);
switch (token) {
case Opt_xprt_udp:
@@ -1258,6 +1257,7 @@ static int nfs_parse_mount_options(char *raw,
goto out_nomem;
token = match_token(string,
nfs_xprt_protocol_tokens, args);
+ kfree(string);
switch (token) {
case Opt_xprt_udp:
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index 44d7d04dab95..503b9da159a3 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -1,6 +1,7 @@
config NFSD
tristate "NFS server support"
depends on INET
+ depends on FILE_LOCKING
select LOCKD
select SUNRPC
select EXPORTFS
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 9dbd2eb91281..7c9fe838f038 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -18,6 +18,7 @@
#include <linux/unistd.h>
#include <linux/slab.h>
#include <linux/major.h>
+#include <linux/magic.h>
#include <linux/sunrpc/svc.h>
#include <linux/nfsd/nfsd.h>
@@ -202,6 +203,7 @@ nfsd3_proc_write(struct svc_rqst *rqstp, struct nfsd3_writeargs *argp,
struct nfsd3_writeres *resp)
{
__be32 nfserr;
+ unsigned long cnt = argp->len;
dprintk("nfsd: WRITE(3) %s %d bytes at %ld%s\n",
SVCFH_fmt(&argp->fh),
@@ -214,9 +216,9 @@ nfsd3_proc_write(struct svc_rqst *rqstp, struct nfsd3_writeargs *argp,
nfserr = nfsd_write(rqstp, &resp->fh, NULL,
argp->offset,
rqstp->rq_vec, argp->vlen,
- argp->len,
+ &cnt,
&resp->committed);
- resp->count = argp->count;
+ resp->count = cnt;
RETURN_STATUS(nfserr);
}
@@ -569,7 +571,7 @@ nfsd3_proc_fsinfo(struct svc_rqst * rqstp, struct nfsd_fhandle *argp,
struct super_block *sb = argp->fh.fh_dentry->d_inode->i_sb;
/* Note that we don't care for remote fs's here */
- if (sb->s_magic == 0x4d44 /* MSDOS_SUPER_MAGIC */) {
+ if (sb->s_magic == MSDOS_SUPER_MAGIC) {
resp->f_properties = NFS3_FSF_BILLYBOY;
}
resp->f_maxfilesize = sb->s_maxbytes;
@@ -610,7 +612,7 @@ nfsd3_proc_pathconf(struct svc_rqst * rqstp, struct nfsd_fhandle *argp,
resp->p_link_max = EXT2_LINK_MAX;
resp->p_name_max = EXT2_NAME_LEN;
break;
- case 0x4d44: /* MSDOS_SUPER_MAGIC */
+ case MSDOS_SUPER_MAGIC:
resp->p_case_insensitive = 1;
resp->p_case_preserving = 0;
break;
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index c464181b5994..290289bd44f7 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -218,7 +218,7 @@ static int
encode_cb_recall(struct xdr_stream *xdr, struct nfs4_cb_recall *cb_rec)
{
__be32 *p;
- int len = cb_rec->cbr_fhlen;
+ int len = cb_rec->cbr_fh.fh_size;
RESERVE_SPACE(12+sizeof(cb_rec->cbr_stateid) + len);
WRITE32(OP_CB_RECALL);
@@ -226,7 +226,7 @@ encode_cb_recall(struct xdr_stream *xdr, struct nfs4_cb_recall *cb_rec)
WRITEMEM(&cb_rec->cbr_stateid.si_opaque, sizeof(stateid_opaque_t));
WRITE32(cb_rec->cbr_trunc);
WRITE32(len);
- WRITEMEM(cb_rec->cbr_fhval, len);
+ WRITEMEM(&cb_rec->cbr_fh.fh_base, len);
return 0;
}
@@ -361,9 +361,8 @@ static struct rpc_program cb_program = {
/* Reference counting, callback cleanup, etc., all look racy as heck.
* And why is cb_set an atomic? */
-static int do_probe_callback(void *data)
+static struct rpc_clnt *setup_callback_client(struct nfs4_client *clp)
{
- struct nfs4_client *clp = data;
struct sockaddr_in addr;
struct nfs4_callback *cb = &clp->cl_callback;
struct rpc_timeout timeparms = {
@@ -384,17 +383,10 @@ static int do_probe_callback(void *data)
.flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET),
.client_name = clp->cl_principal,
};
- struct rpc_message msg = {
- .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL],
- .rpc_argp = clp,
- };
struct rpc_clnt *client;
- int status;
- if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5)) {
- status = nfserr_cb_path_down;
- goto out_err;
- }
+ if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5))
+ return ERR_PTR(-EINVAL);
/* Initialize address */
memset(&addr, 0, sizeof(addr));
@@ -404,9 +396,29 @@ static int do_probe_callback(void *data)
/* Create RPC client */
client = rpc_create(&args);
+ if (IS_ERR(client))
+ dprintk("NFSD: couldn't create callback client: %ld\n",
+ PTR_ERR(client));
+ return client;
+
+}
+
+static int do_probe_callback(void *data)
+{
+ struct nfs4_client *clp = data;
+ struct nfs4_callback *cb = &clp->cl_callback;
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL],
+ .rpc_argp = clp,
+ };
+ struct rpc_clnt *client;
+ int status;
+
+ client = setup_callback_client(clp);
if (IS_ERR(client)) {
- dprintk("NFSD: couldn't create callback client\n");
status = PTR_ERR(client);
+ dprintk("NFSD: couldn't create callback client: %d\n",
+ status);
goto out_err;
}
@@ -422,10 +434,10 @@ static int do_probe_callback(void *data)
out_release_client:
rpc_shutdown_client(client);
out_err:
- dprintk("NFSD: warning: no callback path to client %.*s\n",
- (int)clp->cl_name.len, clp->cl_name.data);
+ dprintk("NFSD: warning: no callback path to client %.*s: error %d\n",
+ (int)clp->cl_name.len, clp->cl_name.data, status);
put_nfs4_client(clp);
- return status;
+ return 0;
}
/*
@@ -451,7 +463,6 @@ nfsd4_probe_callback(struct nfs4_client *clp)
/*
* called with dp->dl_count inc'ed.
- * nfs4_lock_state() may or may not have been called.
*/
void
nfsd4_cb_recall(struct nfs4_delegation *dp)
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 9fa60a3ad48c..b2883e9c6381 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -93,6 +93,21 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o
open->op_truncate = 0;
if (open->op_create) {
+ /* FIXME: check session persistence and pnfs flags.
+ * The nfsv4.1 spec requires the following semantics:
+ *
+ * Persistent | pNFS | Server REQUIRED | Client Allowed
+ * Reply Cache | server | |
+ * -------------+--------+-----------------+--------------------
+ * no | no | EXCLUSIVE4_1 | EXCLUSIVE4_1
+ * | | | (SHOULD)
+ * | | and EXCLUSIVE4 | or EXCLUSIVE4
+ * | | | (SHOULD NOT)
+ * no | yes | EXCLUSIVE4_1 | EXCLUSIVE4_1
+ * yes | no | GUARDED4 | GUARDED4
+ * yes | yes | GUARDED4 | GUARDED4
+ */
+
/*
* Note: create modes (UNCHECKED,GUARDED...) are the same
* in NFSv4 as in v3.
@@ -103,11 +118,13 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o
(u32 *)open->op_verf.data,
&open->op_truncate, &created);
- /* If we ever decide to use different attrs to store the
- * verifier in nfsd_create_v3, then we'll need to change this
+ /*
+ * Following rfc 3530 14.2.16, use the returned bitmask
+ * to indicate which attributes we used to store the
+ * verifier:
*/
if (open->op_createmode == NFS4_CREATE_EXCLUSIVE && status == 0)
- open->op_bmval[1] |= (FATTR4_WORD1_TIME_ACCESS |
+ open->op_bmval[1] = (FATTR4_WORD1_TIME_ACCESS |
FATTR4_WORD1_TIME_MODIFY);
} else {
status = nfsd_lookup(rqstp, current_fh,
@@ -118,13 +135,11 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o
goto out;
set_change_info(&open->op_cinfo, current_fh);
-
- /* set reply cache */
fh_dup2(current_fh, &resfh);
- open->op_stateowner->so_replay.rp_openfh_len = resfh.fh_handle.fh_size;
- memcpy(open->op_stateowner->so_replay.rp_openfh,
- &resfh.fh_handle.fh_base, resfh.fh_handle.fh_size);
+ /* set reply cache */
+ fh_copy_shallow(&open->op_stateowner->so_replay.rp_openfh,
+ &resfh.fh_handle);
if (!created)
status = do_open_permission(rqstp, current_fh, open,
NFSD_MAY_NOP);
@@ -150,10 +165,8 @@ do_open_fhandle(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_
memset(&open->op_cinfo, 0, sizeof(struct nfsd4_change_info));
/* set replay cache */
- open->op_stateowner->so_replay.rp_openfh_len = current_fh->fh_handle.fh_size;
- memcpy(open->op_stateowner->so_replay.rp_openfh,
- &current_fh->fh_handle.fh_base,
- current_fh->fh_handle.fh_size);
+ fh_copy_shallow(&open->op_stateowner->so_replay.rp_openfh,
+ &current_fh->fh_handle);
open->op_truncate = (open->op_iattr.ia_valid & ATTR_SIZE) &&
(open->op_iattr.ia_size == 0);
@@ -164,12 +177,23 @@ do_open_fhandle(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_
return status;
}
+static void
+copy_clientid(clientid_t *clid, struct nfsd4_session *session)
+{
+ struct nfsd4_sessionid *sid =
+ (struct nfsd4_sessionid *)session->se_sessionid.data;
+
+ clid->cl_boot = sid->clientid.cl_boot;
+ clid->cl_id = sid->clientid.cl_id;
+}
static __be32
nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
struct nfsd4_open *open)
{
__be32 status;
+ struct nfsd4_compoundres *resp;
+
dprintk("NFSD: nfsd4_open filename %.*s op_stateowner %p\n",
(int)open->op_fname.len, open->op_fname.data,
open->op_stateowner);
@@ -178,16 +202,19 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if (open->op_create && open->op_claim_type != NFS4_OPEN_CLAIM_NULL)
return nfserr_inval;
+ if (nfsd4_has_session(cstate))
+ copy_clientid(&open->op_clientid, cstate->session);
+
nfs4_lock_state();
/* check seqid for replay. set nfs4_owner */
- status = nfsd4_process_open1(open);
+ resp = rqstp->rq_resp;
+ status = nfsd4_process_open1(&resp->cstate, open);
if (status == nfserr_replay_me) {
struct nfs4_replay *rp = &open->op_stateowner->so_replay;
fh_put(&cstate->current_fh);
- cstate->current_fh.fh_handle.fh_size = rp->rp_openfh_len;
- memcpy(&cstate->current_fh.fh_handle.fh_base, rp->rp_openfh,
- rp->rp_openfh_len);
+ fh_copy_shallow(&cstate->current_fh.fh_handle,
+ &rp->rp_openfh);
status = fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_NOP);
if (status)
dprintk("nfsd4_open: replay failed"
@@ -209,10 +236,6 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
switch (open->op_claim_type) {
case NFS4_OPEN_CLAIM_DELEGATE_CUR:
- status = nfserr_inval;
- if (open->op_create)
- goto out;
- /* fall through */
case NFS4_OPEN_CLAIM_NULL:
/*
* (1) set CURRENT_FH to the file being opened,
@@ -455,8 +478,9 @@ nfsd4_getattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if (getattr->ga_bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1)
return nfserr_inval;
- getattr->ga_bmval[0] &= NFSD_SUPPORTED_ATTRS_WORD0;
- getattr->ga_bmval[1] &= NFSD_SUPPORTED_ATTRS_WORD1;
+ getattr->ga_bmval[0] &= nfsd_suppattrs0(cstate->minorversion);
+ getattr->ga_bmval[1] &= nfsd_suppattrs1(cstate->minorversion);
+ getattr->ga_bmval[2] &= nfsd_suppattrs2(cstate->minorversion);
getattr->ga_fhp = &cstate->current_fh;
return nfs_ok;
@@ -520,9 +544,8 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
nfs4_lock_state();
/* check stateid */
- if ((status = nfs4_preprocess_stateid_op(&cstate->current_fh,
- &read->rd_stateid,
- CHECK_FH | RD_STATE, &read->rd_filp))) {
+ if ((status = nfs4_preprocess_stateid_op(cstate, &read->rd_stateid,
+ RD_STATE, &read->rd_filp))) {
dprintk("NFSD: nfsd4_read: couldn't process stateid!\n");
goto out;
}
@@ -548,8 +571,9 @@ nfsd4_readdir(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if (readdir->rd_bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1)
return nfserr_inval;
- readdir->rd_bmval[0] &= NFSD_SUPPORTED_ATTRS_WORD0;
- readdir->rd_bmval[1] &= NFSD_SUPPORTED_ATTRS_WORD1;
+ readdir->rd_bmval[0] &= nfsd_suppattrs0(cstate->minorversion);
+ readdir->rd_bmval[1] &= nfsd_suppattrs1(cstate->minorversion);
+ readdir->rd_bmval[2] &= nfsd_suppattrs2(cstate->minorversion);
if ((cookie > ~(u32)0) || (cookie == 1) || (cookie == 2) ||
(cookie == 0 && memcmp(readdir->rd_verf.data, zeroverf.data, NFS4_VERIFIER_SIZE)))
@@ -653,8 +677,8 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if (setattr->sa_iattr.ia_valid & ATTR_SIZE) {
nfs4_lock_state();
- status = nfs4_preprocess_stateid_op(&cstate->current_fh,
- &setattr->sa_stateid, CHECK_FH | WR_STATE, NULL);
+ status = nfs4_preprocess_stateid_op(cstate,
+ &setattr->sa_stateid, WR_STATE, NULL);
nfs4_unlock_state();
if (status) {
dprintk("NFSD: nfsd4_setattr: couldn't process stateid!\n");
@@ -685,6 +709,7 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
struct file *filp = NULL;
u32 *p;
__be32 status = nfs_ok;
+ unsigned long cnt;
/* no need to check permission - this will be done in nfsd_write() */
@@ -692,8 +717,7 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
return nfserr_inval;
nfs4_lock_state();
- status = nfs4_preprocess_stateid_op(&cstate->current_fh, stateid,
- CHECK_FH | WR_STATE, &filp);
+ status = nfs4_preprocess_stateid_op(cstate, stateid, WR_STATE, &filp);
if (filp)
get_file(filp);
nfs4_unlock_state();
@@ -703,7 +727,7 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
return status;
}
- write->wr_bytes_written = write->wr_buflen;
+ cnt = write->wr_buflen;
write->wr_how_written = write->wr_stable_how;
p = (u32 *)write->wr_verifier.data;
*p++ = nfssvc_boot.tv_sec;
@@ -711,10 +735,12 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
status = nfsd_write(rqstp, &cstate->current_fh, filp,
write->wr_offset, rqstp->rq_vec, write->wr_vlen,
- write->wr_buflen, &write->wr_how_written);
+ &cnt, &write->wr_how_written);
if (filp)
fput(filp);
+ write->wr_bytes_written = cnt;
+
if (status == nfserr_symlink)
status = nfserr_inval;
return status;
@@ -737,8 +763,9 @@ _nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if (status)
return status;
- if ((verify->ve_bmval[0] & ~NFSD_SUPPORTED_ATTRS_WORD0)
- || (verify->ve_bmval[1] & ~NFSD_SUPPORTED_ATTRS_WORD1))
+ if ((verify->ve_bmval[0] & ~nfsd_suppattrs0(cstate->minorversion))
+ || (verify->ve_bmval[1] & ~nfsd_suppattrs1(cstate->minorversion))
+ || (verify->ve_bmval[2] & ~nfsd_suppattrs2(cstate->minorversion)))
return nfserr_attrnotsupp;
if ((verify->ve_bmval[0] & FATTR4_WORD0_RDATTR_ERROR)
|| (verify->ve_bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1))
@@ -766,7 +793,8 @@ _nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if (status)
goto out_kfree;
- p = buf + 3;
+ /* skip bitmap */
+ p = buf + 1 + ntohl(buf[0]);
status = nfserr_not_same;
if (ntohl(*p++) != verify->ve_attrlen)
goto out_kfree;
@@ -813,39 +841,17 @@ static inline void nfsd4_increment_op_stats(u32 opnum)
nfsdstats.nfs4_opcount[opnum]++;
}
-static void cstate_free(struct nfsd4_compound_state *cstate)
-{
- if (cstate == NULL)
- return;
- fh_put(&cstate->current_fh);
- fh_put(&cstate->save_fh);
- BUG_ON(cstate->replay_owner);
- kfree(cstate);
-}
-
-static struct nfsd4_compound_state *cstate_alloc(void)
-{
- struct nfsd4_compound_state *cstate;
-
- cstate = kmalloc(sizeof(struct nfsd4_compound_state), GFP_KERNEL);
- if (cstate == NULL)
- return NULL;
- fh_init(&cstate->current_fh, NFS4_FHSIZE);
- fh_init(&cstate->save_fh, NFS4_FHSIZE);
- cstate->replay_owner = NULL;
- return cstate;
-}
-
typedef __be32(*nfsd4op_func)(struct svc_rqst *, struct nfsd4_compound_state *,
void *);
+enum nfsd4_op_flags { /* independent bits, OR-ed together in nfsd4_operation.op_flags */
+ ALLOWED_WITHOUT_FH = 1 << 0, /* No current filehandle required */
+ ALLOWED_ON_ABSENT_FS = 1 << 1, /* ops processed on absent fs */
+ ALLOWED_AS_FIRST_OP = 1 << 2, /* ops required first in compound; must not overlap the bits above */
+};
struct nfsd4_operation {
nfsd4op_func op_func;
u32 op_flags;
-/* Most ops require a valid current filehandle; a few don't: */
-#define ALLOWED_WITHOUT_FH 1
-/* GETATTR and ops not listed as returning NFS4ERR_MOVED: */
-#define ALLOWED_ON_ABSENT_FS 2
char *op_name;
};
@@ -854,6 +860,51 @@ static struct nfsd4_operation nfsd4_ops[];
static const char *nfsd4_op_name(unsigned opnum);
/*
+ * This is a replay of a compound for which no cache entry pages
+ * were used. Encode the sequence operation, and if cachethis is FALSE
+ * encode the nfserr_retry_uncached_rep error on the next operation.
+ */
+static __be32
+nfsd4_enc_uncached_replay(struct nfsd4_compoundargs *args,
+ struct nfsd4_compoundres *resp)
+{
+ struct nfsd4_op *op;
+
+ dprintk("--> %s resp->opcnt %d ce_cachethis %u \n", __func__,
+ resp->opcnt, resp->cstate.slot->sl_cache_entry.ce_cachethis);
+
+ /* Encode the replayed sequence operation */
+ BUG_ON(resp->opcnt != 1);
+ op = &args->ops[resp->opcnt - 1];
+ nfsd4_encode_operation(resp, op);
+
+ /*return nfserr_retry_uncached_rep in next operation. */
+ if (resp->cstate.slot->sl_cache_entry.ce_cachethis == 0) {
+ op = &args->ops[resp->opcnt++];
+ op->status = nfserr_retry_uncached_rep;
+ nfsd4_encode_operation(resp, op);
+ }
+ return op->status;
+}
+
+/*
+ * Enforce NFSv4.1 COMPOUND ordering rules.
+ *
+ * TODO:
+ * - enforce NFS4ERR_NOT_ONLY_OP,
+ * - DESTROY_SESSION MUST be the final operation in the COMPOUND request.
+ */
+static bool nfs41_op_ordering_ok(struct nfsd4_compoundargs *args)
+{
+ if (args->minorversion && args->opcnt > 0) {
+ struct nfsd4_op *op = &args->ops[0];
+ return (op->status == nfserr_op_illegal) ||
+ (nfsd4_ops[op->opnum].op_flags & ALLOWED_AS_FIRST_OP);
+ }
+ return true;
+}
+
+/*
* COMPOUND call.
*/
static __be32
@@ -863,12 +914,13 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
{
struct nfsd4_op *op;
struct nfsd4_operation *opdesc;
- struct nfsd4_compound_state *cstate = NULL;
+ struct nfsd4_compound_state *cstate = &resp->cstate;
int slack_bytes;
__be32 status;
resp->xbuf = &rqstp->rq_res;
- resp->p = rqstp->rq_res.head[0].iov_base + rqstp->rq_res.head[0].iov_len;
+ resp->p = rqstp->rq_res.head[0].iov_base +
+ rqstp->rq_res.head[0].iov_len;
resp->tagp = resp->p;
/* reserve space for: taglen, tag, and opcnt */
resp->p += 2 + XDR_QUADLEN(args->taglen);
@@ -877,18 +929,25 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
resp->tag = args->tag;
resp->opcnt = 0;
resp->rqstp = rqstp;
+ resp->cstate.minorversion = args->minorversion;
+ resp->cstate.replay_owner = NULL;
+ fh_init(&resp->cstate.current_fh, NFS4_FHSIZE);
+ fh_init(&resp->cstate.save_fh, NFS4_FHSIZE);
+ /* Use the deferral mechanism only for NFSv4.0 compounds */
+ rqstp->rq_usedeferral = (args->minorversion == 0);
/*
* According to RFC3010, this takes precedence over all other errors.
*/
status = nfserr_minor_vers_mismatch;
- if (args->minorversion > NFSD_SUPPORTED_MINOR_VERSION)
+ if (args->minorversion > nfsd_supported_minorversion)
goto out;
- status = nfserr_resource;
- cstate = cstate_alloc();
- if (cstate == NULL)
- goto out;
+ if (!nfs41_op_ordering_ok(args)) {
+ op = &args->ops[0];
+ op->status = nfserr_sequence_pos;
+ goto encode_op;
+ }
status = nfs_ok;
while (!status && resp->opcnt < args->opcnt) {
@@ -897,7 +956,6 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
dprintk("nfsv4 compound op #%d/%d: %d (%s)\n",
resp->opcnt, args->opcnt, op->opnum,
nfsd4_op_name(op->opnum));
-
/*
* The XDR decode routines may have pre-set op->status;
* for example, if there is a miscellaneous XDR error
@@ -938,6 +996,15 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
BUG_ON(op->status == nfs_ok);
encode_op:
+ /* Only from SEQUENCE or CREATE_SESSION */
+ if (resp->cstate.status == nfserr_replay_cache) {
+ dprintk("%s NFS4.1 replay from cache\n", __func__);
+ if (nfsd4_not_cached(resp))
+ status = nfsd4_enc_uncached_replay(args, resp);
+ else
+ status = op->status;
+ goto out;
+ }
if (op->status == nfserr_replay_me) {
op->replay = &cstate->replay_owner->so_replay;
nfsd4_encode_replay(resp, op);
@@ -961,15 +1028,24 @@ encode_op:
nfsd4_increment_op_stats(op->opnum);
}
+ if (!rqstp->rq_usedeferral && status == nfserr_dropit) {
+ dprintk("%s Dropit - send NFS4ERR_DELAY\n", __func__);
+ status = nfserr_jukebox;
+ }
- cstate_free(cstate);
+ resp->cstate.status = status;
+ fh_put(&resp->cstate.current_fh);
+ fh_put(&resp->cstate.save_fh);
+ BUG_ON(resp->cstate.replay_owner);
out:
nfsd4_release_compoundargs(args);
+ /* Reset deferral mechanism for RPC deferrals */
+ rqstp->rq_usedeferral = 1;
dprintk("nfsv4 compound returned %d\n", ntohl(status));
return status;
}
-static struct nfsd4_operation nfsd4_ops[OP_RELEASE_LOCKOWNER+1] = {
+static struct nfsd4_operation nfsd4_ops[] = {
[OP_ACCESS] = {
.op_func = (nfsd4op_func)nfsd4_access,
.op_name = "OP_ACCESS",
@@ -1045,7 +1121,7 @@ static struct nfsd4_operation nfsd4_ops[OP_RELEASE_LOCKOWNER+1] = {
.op_name = "OP_PUTFH",
},
[OP_PUTPUBFH] = {
- /* unsupported, just for future reference: */
+ .op_func = (nfsd4op_func)nfsd4_putrootfh,
.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS,
.op_name = "OP_PUTPUBFH",
},
@@ -1119,6 +1195,28 @@ static struct nfsd4_operation nfsd4_ops[OP_RELEASE_LOCKOWNER+1] = {
.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS,
.op_name = "OP_RELEASE_LOCKOWNER",
},
+
+ /* NFSv4.1 operations */
+ [OP_EXCHANGE_ID] = {
+ .op_func = (nfsd4op_func)nfsd4_exchange_id,
+ .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
+ .op_name = "OP_EXCHANGE_ID",
+ },
+ [OP_CREATE_SESSION] = {
+ .op_func = (nfsd4op_func)nfsd4_create_session,
+ .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
+ .op_name = "OP_CREATE_SESSION",
+ },
+ [OP_DESTROY_SESSION] = {
+ .op_func = (nfsd4op_func)nfsd4_destroy_session,
+ .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
+ .op_name = "OP_DESTROY_SESSION",
+ },
+ [OP_SEQUENCE] = {
+ .op_func = (nfsd4op_func)nfsd4_sequence,
+ .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
+ .op_name = "OP_SEQUENCE",
+ },
};
static const char *nfsd4_op_name(unsigned opnum)
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 74f7b67567fd..3444c0052a87 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -182,36 +182,26 @@ out_unlock:
typedef int (recdir_func)(struct dentry *, struct dentry *);
-struct dentry_list {
- struct dentry *dentry;
+struct name_list {
+ char name[HEXDIR_LEN];
struct list_head list;
};
-struct dentry_list_arg {
- struct list_head dentries;
- struct dentry *parent;
-};
-
static int
-nfsd4_build_dentrylist(void *arg, const char *name, int namlen,
+nfsd4_build_namelist(void *arg, const char *name, int namlen,
loff_t offset, u64 ino, unsigned int d_type)
{
- struct dentry_list_arg *dla = arg;
- struct list_head *dentries = &dla->dentries;
- struct dentry *parent = dla->parent;
- struct dentry *dentry;
- struct dentry_list *child;
+ struct list_head *names = arg;
+ struct name_list *entry;
- if (name && isdotent(name, namlen))
+ if (namlen != HEXDIR_LEN - 1)
return 0;
- dentry = lookup_one_len(name, parent, namlen);
- if (IS_ERR(dentry))
- return PTR_ERR(dentry);
- child = kmalloc(sizeof(*child), GFP_KERNEL);
- if (child == NULL)
+ entry = kmalloc(sizeof(struct name_list), GFP_KERNEL);
+ if (entry == NULL)
return -ENOMEM;
- child->dentry = dentry;
- list_add(&child->list, dentries);
+ memcpy(entry->name, name, HEXDIR_LEN - 1);
+ entry->name[HEXDIR_LEN - 1] = '\0';
+ list_add(&entry->list, names);
return 0;
}
@@ -220,11 +210,9 @@ nfsd4_list_rec_dir(struct dentry *dir, recdir_func *f)
{
const struct cred *original_cred;
struct file *filp;
- struct dentry_list_arg dla = {
- .parent = dir,
- };
- struct list_head *dentries = &dla.dentries;
- struct dentry_list *child;
+ LIST_HEAD(names);
+ struct name_list *entry;
+ struct dentry *dentry;
int status;
if (!rec_dir_init)
@@ -233,31 +221,34 @@ nfsd4_list_rec_dir(struct dentry *dir, recdir_func *f)
status = nfs4_save_creds(&original_cred);
if (status < 0)
return status;
- INIT_LIST_HEAD(dentries);
filp = dentry_open(dget(dir), mntget(rec_dir.mnt), O_RDONLY,
current_cred());
status = PTR_ERR(filp);
if (IS_ERR(filp))
goto out;
- INIT_LIST_HEAD(dentries);
- status = vfs_readdir(filp, nfsd4_build_dentrylist, &dla);
+ status = vfs_readdir(filp, nfsd4_build_namelist, &names);
fput(filp);
- while (!list_empty(dentries)) {
- child = list_entry(dentries->next, struct dentry_list, list);
- status = f(dir, child->dentry);
+ while (!list_empty(&names)) {
+ entry = list_entry(names.next, struct name_list, list);
+
+ dentry = lookup_one_len(entry->name, dir, HEXDIR_LEN-1);
+ if (IS_ERR(dentry)) {
+ status = PTR_ERR(dentry);
+ goto out;
+ }
+ status = f(dir, dentry);
+ dput(dentry);
if (status)
goto out;
- list_del(&child->list);
- dput(child->dentry);
- kfree(child);
+ list_del(&entry->list);
+ kfree(entry);
}
out:
- while (!list_empty(dentries)) {
- child = list_entry(dentries->next, struct dentry_list, list);
- list_del(&child->list);
- dput(child->dentry);
- kfree(child);
+ while (!list_empty(&names)) {
+ entry = list_entry(names.next, struct name_list, list);
+ list_del(&entry->list);
+ kfree(entry);
}
nfs4_reset_creds(original_cred);
return status;
@@ -353,7 +344,8 @@ purge_old(struct dentry *parent, struct dentry *child)
{
int status;
- if (nfs4_has_reclaimed_state(child->d_name.name))
+ /* note: we currently use this path only for minorversion 0 */
+ if (nfs4_has_reclaimed_state(child->d_name.name, false))
return 0;
status = nfsd4_clear_clid_dir(parent, child);
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index b6f60f48e94b..c65a27b76a9d 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -68,6 +68,7 @@ static u32 current_delegid = 1;
static u32 nfs4_init;
static stateid_t zerostateid; /* bits all 0 */
static stateid_t onestateid; /* bits all 1 */
+static u64 current_sessionid = 1;
#define ZERO_STATEID(stateid) (!memcmp((stateid), &zerostateid, sizeof(stateid_t)))
#define ONE_STATEID(stateid) (!memcmp((stateid), &onestateid, sizeof(stateid_t)))
@@ -75,18 +76,21 @@ static stateid_t onestateid; /* bits all 1 */
/* forward declarations */
static struct nfs4_stateid * find_stateid(stateid_t *stid, int flags);
static struct nfs4_delegation * find_delegation_stateid(struct inode *ino, stateid_t *stid);
-static void release_stateid_lockowners(struct nfs4_stateid *open_stp);
static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery";
static void nfs4_set_recdir(char *recdir);
-/* Locking:
- *
- * client_mutex:
- * protects clientid_hashtbl[], clientstr_hashtbl[],
- * unconfstr_hashtbl[], uncofid_hashtbl[].
- */
+/* Locking: */
+
+/* Currently used for almost all code touching nfsv4 state: */
static DEFINE_MUTEX(client_mutex);
+/*
+ * Currently used for the del_recall_lru and file hash table. In an
+ * effort to decrease the scope of the client_mutex, this spinlock may
+ * eventually cover more:
+ */
+static DEFINE_SPINLOCK(recall_lock);
+
static struct kmem_cache *stateowner_slab = NULL;
static struct kmem_cache *file_slab = NULL;
static struct kmem_cache *stateid_slab = NULL;
@@ -117,37 +121,23 @@ opaque_hashval(const void *ptr, int nbytes)
return x;
}
-/* forward declarations */
-static void release_stateowner(struct nfs4_stateowner *sop);
-static void release_stateid(struct nfs4_stateid *stp, int flags);
-
-/*
- * Delegation state
- */
-
-/* recall_lock protects the del_recall_lru */
-static DEFINE_SPINLOCK(recall_lock);
static struct list_head del_recall_lru;
-static void
-free_nfs4_file(struct kref *kref)
-{
- struct nfs4_file *fp = container_of(kref, struct nfs4_file, fi_ref);
- list_del(&fp->fi_hash);
- iput(fp->fi_inode);
- kmem_cache_free(file_slab, fp);
-}
-
static inline void
put_nfs4_file(struct nfs4_file *fi)
{
- kref_put(&fi->fi_ref, free_nfs4_file);
+ if (atomic_dec_and_lock(&fi->fi_ref, &recall_lock)) {
+ list_del(&fi->fi_hash);
+ spin_unlock(&recall_lock);
+ iput(fi->fi_inode);
+ kmem_cache_free(file_slab, fi);
+ }
}
static inline void
get_nfs4_file(struct nfs4_file *fi)
{
- kref_get(&fi->fi_ref);
+ atomic_inc(&fi->fi_ref);
}
static int num_delegations;
@@ -220,9 +210,7 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f
dp->dl_stateid.si_stateownerid = current_delegid++;
dp->dl_stateid.si_fileid = 0;
dp->dl_stateid.si_generation = 0;
- dp->dl_fhlen = current_fh->fh_handle.fh_size;
- memcpy(dp->dl_fhval, &current_fh->fh_handle.fh_base,
- current_fh->fh_handle.fh_size);
+ fh_copy_shallow(&dp->dl_fh, &current_fh->fh_handle);
dp->dl_time = 0;
atomic_set(&dp->dl_count, 1);
list_add(&dp->dl_perfile, &fp->fi_delegations);
@@ -311,6 +299,291 @@ static struct list_head unconf_id_hashtbl[CLIENT_HASH_SIZE];
static struct list_head client_lru;
static struct list_head close_lru;
+static void unhash_generic_stateid(struct nfs4_stateid *stp)
+{
+ list_del(&stp->st_hash);
+ list_del(&stp->st_perfile);
+ list_del(&stp->st_perstateowner);
+}
+
+static void free_generic_stateid(struct nfs4_stateid *stp)
+{
+ put_nfs4_file(stp->st_file);
+ kmem_cache_free(stateid_slab, stp);
+}
+
+static void release_lock_stateid(struct nfs4_stateid *stp)
+{
+ unhash_generic_stateid(stp);
+ locks_remove_posix(stp->st_vfs_file, (fl_owner_t)stp->st_stateowner);
+ free_generic_stateid(stp);
+}
+
+static void unhash_lockowner(struct nfs4_stateowner *sop)
+{
+ struct nfs4_stateid *stp;
+
+ list_del(&sop->so_idhash);
+ list_del(&sop->so_strhash);
+ list_del(&sop->so_perstateid);
+ while (!list_empty(&sop->so_stateids)) {
+ stp = list_first_entry(&sop->so_stateids,
+ struct nfs4_stateid, st_perstateowner);
+ release_lock_stateid(stp);
+ }
+}
+
+static void release_lockowner(struct nfs4_stateowner *sop)
+{
+ unhash_lockowner(sop);
+ nfs4_put_stateowner(sop);
+}
+
+static void
+release_stateid_lockowners(struct nfs4_stateid *open_stp)
+{
+ struct nfs4_stateowner *lock_sop;
+
+ while (!list_empty(&open_stp->st_lockowners)) {
+ lock_sop = list_entry(open_stp->st_lockowners.next,
+ struct nfs4_stateowner, so_perstateid);
+ /* list_del(&open_stp->st_lockowners); */
+ BUG_ON(lock_sop->so_is_open_owner);
+ release_lockowner(lock_sop);
+ }
+}
+
+static void release_open_stateid(struct nfs4_stateid *stp)
+{
+ unhash_generic_stateid(stp);
+ release_stateid_lockowners(stp);
+ nfsd_close(stp->st_vfs_file);
+ free_generic_stateid(stp);
+}
+
+static void unhash_openowner(struct nfs4_stateowner *sop)
+{
+ struct nfs4_stateid *stp;
+
+ list_del(&sop->so_idhash);
+ list_del(&sop->so_strhash);
+ list_del(&sop->so_perclient);
+ list_del(&sop->so_perstateid); /* XXX: necessary? */
+ while (!list_empty(&sop->so_stateids)) {
+ stp = list_first_entry(&sop->so_stateids,
+ struct nfs4_stateid, st_perstateowner);
+ release_open_stateid(stp);
+ }
+}
+
+static void release_openowner(struct nfs4_stateowner *sop)
+{
+ unhash_openowner(sop);
+ list_del(&sop->so_close_lru);
+ nfs4_put_stateowner(sop);
+}
+
+static DEFINE_SPINLOCK(sessionid_lock);
+#define SESSION_HASH_SIZE 512
+static struct list_head sessionid_hashtbl[SESSION_HASH_SIZE];
+
+static inline int
+hash_sessionid(struct nfs4_sessionid *sessionid)
+{
+ struct nfsd4_sessionid *sid = (struct nfsd4_sessionid *)sessionid;
+
+ return sid->sequence % SESSION_HASH_SIZE;
+}
+
+static inline void
+dump_sessionid(const char *fn, struct nfs4_sessionid *sessionid)
+{
+ u32 *ptr = (u32 *)(&sessionid->data[0]);
+ dprintk("%s: %u:%u:%u:%u\n", fn, ptr[0], ptr[1], ptr[2], ptr[3]);
+}
+
+static void
+gen_sessionid(struct nfsd4_session *ses)
+{
+ struct nfs4_client *clp = ses->se_client;
+ struct nfsd4_sessionid *sid;
+
+ sid = (struct nfsd4_sessionid *)ses->se_sessionid.data;
+ sid->clientid = clp->cl_clientid;
+ sid->sequence = current_sessionid++;
+ sid->reserved = 0;
+}
+
+/*
+ * Give the client the number of slots it requests bound by
+ * NFSD_MAX_SLOTS_PER_SESSION and by sv_drc_max_pages.
+ *
+ * If we run out of pages (sv_drc_pages_used == sv_drc_max_pages) we
+ * should (up to a point) re-negotiate active sessions and reduce their
+ * slot usage to make room for new connections. For now we just fail the
+ * create session.
+ *
+ * On entry fchan->maxreqs is the requested slot count; on return it is
+ * the granted slot count (0 on failure). Only the page bound is applied
+ * here. Returns 0 on success or nfserr_resource when no pages are left.
+ */
+static int set_forechannel_maxreqs(struct nfsd4_channel_attrs *fchan)
+{
+ int status = 0, np = fchan->maxreqs * NFSD_PAGES_PER_SLOT;
+
+ /* sv_lock covers both the check and the reservation of the pages */
+ spin_lock(&nfsd_serv->sv_lock);
+ if (np + nfsd_serv->sv_drc_pages_used > nfsd_serv->sv_drc_max_pages)
+ np = nfsd_serv->sv_drc_max_pages - nfsd_serv->sv_drc_pages_used;
+ /*
+ * NOTE(review): np is charged as is, i.e. it is not rounded down to a
+ * multiple of NFSD_PAGES_PER_SLOT before being added - confirm that
+ * this is intended when NFSD_PAGES_PER_SLOT is greater than 1.
+ */
+ nfsd_serv->sv_drc_pages_used += np;
+ spin_unlock(&nfsd_serv->sv_lock);
+
+ if (np <= 0) {
+ status = nfserr_resource;
+ fchan->maxreqs = 0;
+ } else
+ fchan->maxreqs = np / NFSD_PAGES_PER_SLOT;
+
+ return status;
+}
+
+/*
+ * fchan holds the client values on input, and the server values on output
+ */
+static int init_forechannel_attrs(struct svc_rqst *rqstp,
+ struct nfsd4_session *session,
+ struct nfsd4_channel_attrs *fchan)
+{
+ int status = 0;
+ __u32 maxcount = svc_max_payload(rqstp);
+
+ /* headerpadsz set to zero in encode routine */
+
+ /* Use the client's max request and max response size if possible */
+ if (fchan->maxreq_sz > maxcount)
+ fchan->maxreq_sz = maxcount;
+ session->se_fmaxreq_sz = fchan->maxreq_sz;
+
+ if (fchan->maxresp_sz > maxcount)
+ fchan->maxresp_sz = maxcount;
+ session->se_fmaxresp_sz = fchan->maxresp_sz;
+
+ /* Set the max response cached size to our default, which is
+ * a multiple of PAGE_SIZE and small */
+ session->se_fmaxresp_cached = NFSD_PAGES_PER_SLOT * PAGE_SIZE;
+ fchan->maxresp_cached = session->se_fmaxresp_cached;
+
+ /* Use the client's maxops if possible */
+ if (fchan->maxops > NFSD_MAX_OPS_PER_COMPOUND)
+ fchan->maxops = NFSD_MAX_OPS_PER_COMPOUND;
+ session->se_fmaxops = fchan->maxops;
+
+ /* try to use the client requested number of slots */
+ if (fchan->maxreqs > NFSD_MAX_SLOTS_PER_SESSION)
+ fchan->maxreqs = NFSD_MAX_SLOTS_PER_SESSION;
+
+ /* FIXME: Error means no more DRC pages so the server should
+ * recover pages from existing sessions. For now fail session
+ * creation.
+ */
+ status = set_forechannel_maxreqs(fchan);
+
+ session->se_fnumslots = fchan->maxreqs;
+ return status;
+}
+
+/*
+ * Create a session for @clp from the CREATE_SESSION arguments in @cses.
+ *
+ * The fore channel attributes in @cses are negotiated in place by
+ * init_forechannel_attrs(). The session and its slot table are then
+ * allocated as a single chunk, a session id is generated (and also copied
+ * to clp->cl_sessionid), and the session is added to the sessionid hash
+ * table and to the client's session list.
+ *
+ * Returns nfs_ok on success, the error from init_forechannel_attrs() if
+ * the fore channel cannot be set up, or nfserr_resource if the session
+ * cannot be allocated.
+ */
+static int
+alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp,
+ struct nfsd4_create_session *cses)
+{
+ struct nfsd4_session *new, tmp;
+ int idx, status, slotsize;
+
+ memset(&tmp, 0, sizeof(tmp));
+
+ /* FIXME: For now, we just accept the client back channel attributes. */
+ status = init_forechannel_attrs(rqstp, &tmp, &cses->fore_channel);
+ if (status)
+ goto out;
+
+ /* allocate struct nfsd4_session and slot table in one piece */
+ slotsize = tmp.se_fnumslots * sizeof(struct nfsd4_slot);
+ new = kzalloc(sizeof(*new) + slotsize, GFP_KERNEL);
+ if (!new) {
+ /*
+ * status is 0 at this point (fore channel setup succeeded), so
+ * it has to be set explicitly; otherwise the caller would see
+ * success although no session exists.
+ * NOTE(review): the DRC pages reserved by
+ * set_forechannel_maxreqs() are not given back on this path.
+ */
+ status = nfserr_resource;
+ goto out;
+ }
+
+ memcpy(new, &tmp, sizeof(*new));
+
+ new->se_client = clp;
+ gen_sessionid(new);
+ idx = hash_sessionid(&new->se_sessionid);
+ memcpy(clp->cl_sessionid.data, new->se_sessionid.data,
+ NFS4_MAX_SESSIONID_LEN);
+
+ new->se_flags = cses->flags;
+ kref_init(&new->se_ref);
+ spin_lock(&sessionid_lock);
+ list_add(&new->se_hash, &sessionid_hashtbl[idx]);
+ list_add(&new->se_perclnt, &clp->cl_sessions);
+ spin_unlock(&sessionid_lock);
+
+ status = nfs_ok;
+out:
+ return status;
+}
+
+/* caller must hold sessionid_lock
+ *
+ * Look up a session by its full session id: pick the hash chain with
+ * hash_sessionid() and compare all NFS4_MAX_SESSIONID_LEN bytes of each
+ * entry. Returns the matching session, or NULL if there is none. No
+ * reference is taken on the returned session.
+ */
+static struct nfsd4_session *
+find_in_sessionid_hashtbl(struct nfs4_sessionid *sessionid)
+{
+ struct nfsd4_session *elem;
+ int idx;
+
+ dump_sessionid(__func__, sessionid);
+ idx = hash_sessionid(sessionid);
+ dprintk("%s: idx is %d\n", __func__, idx);
+ /* Search in the appropriate list */
+ list_for_each_entry(elem, &sessionid_hashtbl[idx], se_hash) {
+ dump_sessionid("list traversal", &elem->se_sessionid);
+ if (!memcmp(elem->se_sessionid.data, sessionid->data,
+ NFS4_MAX_SESSIONID_LEN)) {
+ return elem;
+ }
+ }
+
+ dprintk("%s: session not found\n", __func__);
+ return NULL;
+}
+
+/* caller must hold sessionid_lock */
+static void
+unhash_session(struct nfsd4_session *ses)
+{
+ list_del(&ses->se_hash);
+ list_del(&ses->se_perclnt);
+}
+
+/*
+ * Remove the session from the sessionid hash table and from its client's
+ * session list (under sessionid_lock), then drop a reference to it with
+ * nfsd4_put_session().
+ */
+static void
+release_session(struct nfsd4_session *ses)
+{
+ spin_lock(&sessionid_lock);
+ unhash_session(ses);
+ spin_unlock(&sessionid_lock);
+ nfsd4_put_session(ses);
+}
+
+static void nfsd4_release_respages(struct page **respages, short resused);
+
+/*
+ * kref release function for a session: drop the cached reply pages held
+ * by each of the se_fnumslots fore channel slots, then free the session.
+ *
+ * The slot table is allocated in the same chunk as the session itself
+ * (see alloc_init_session()), so it is released by kfree(ses) and must
+ * not be handed to kfree() on its own.
+ */
+void
+free_session(struct kref *kref)
+{
+ struct nfsd4_session *ses;
+ int i;
+
+ ses = container_of(kref, struct nfsd4_session, se_ref);
+ for (i = 0; i < ses->se_fnumslots; i++) {
+ struct nfsd4_cache_entry *e = &ses->se_slots[i].sl_cache_entry;
+ nfsd4_release_respages(e->ce_respages, e->ce_resused);
+ }
+ kfree(ses);
+}
+
static inline void
renew_client(struct nfs4_client *clp)
{
@@ -330,8 +603,8 @@ STALE_CLIENTID(clientid_t *clid)
{
if (clid->cl_boot == boot_time)
return 0;
- dprintk("NFSD stale clientid (%08x/%08x)\n",
- clid->cl_boot, clid->cl_id);
+ dprintk("NFSD stale clientid (%08x/%08x) boot_time %08lx\n",
+ clid->cl_boot, clid->cl_id, boot_time);
return 1;
}
@@ -376,6 +649,8 @@ static inline void
free_client(struct nfs4_client *clp)
{
shutdown_callback_client(clp);
+ nfsd4_release_respages(clp->cl_slot.sl_cache_entry.ce_respages,
+ clp->cl_slot.sl_cache_entry.ce_resused);
if (clp->cl_cred.cr_group_info)
put_group_info(clp->cl_cred.cr_group_info);
kfree(clp->cl_principal);
@@ -420,7 +695,13 @@ expire_client(struct nfs4_client *clp)
list_del(&clp->cl_lru);
while (!list_empty(&clp->cl_openowners)) {
sop = list_entry(clp->cl_openowners.next, struct nfs4_stateowner, so_perclient);
- release_stateowner(sop);
+ release_openowner(sop);
+ }
+ while (!list_empty(&clp->cl_sessions)) {
+ struct nfsd4_session *ses;
+ ses = list_entry(clp->cl_sessions.next, struct nfsd4_session,
+ se_perclnt);
+ release_session(ses);
}
put_nfs4_client(clp);
}
@@ -439,6 +720,7 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir)
INIT_LIST_HEAD(&clp->cl_strhash);
INIT_LIST_HEAD(&clp->cl_openowners);
INIT_LIST_HEAD(&clp->cl_delegations);
+ INIT_LIST_HEAD(&clp->cl_sessions);
INIT_LIST_HEAD(&clp->cl_lru);
return clp;
}
@@ -568,25 +850,45 @@ find_unconfirmed_client(clientid_t *clid)
return NULL;
}
+/*
+ * Return 1 iff clp's clientid establishment method matches the use_exchange_id
+ * parameter. Matching is based on the fact that at least one of the
+ * EXCHGID4_FLAG_USE_{NON_PNFS,PNFS_MDS,PNFS_DS} flags must be set for v4.1
+ *
+ * FIXME: we need to unify the clientid namespaces for nfsv4.x
+ * and correctly deal with client upgrade/downgrade in EXCHANGE_ID
+ * and SET_CLIENTID{,_CONFIRM}
+ */
+static inline int
+match_clientid_establishment(struct nfs4_client *clp, bool use_exchange_id)
+{
+ bool has_exchange_flags = (clp->cl_exchange_flags != 0);
+ return use_exchange_id == has_exchange_flags;
+}
+
static struct nfs4_client *
-find_confirmed_client_by_str(const char *dname, unsigned int hashval)
+find_confirmed_client_by_str(const char *dname, unsigned int hashval,
+ bool use_exchange_id)
{
struct nfs4_client *clp;
list_for_each_entry(clp, &conf_str_hashtbl[hashval], cl_strhash) {
- if (same_name(clp->cl_recdir, dname))
+ if (same_name(clp->cl_recdir, dname) &&
+ match_clientid_establishment(clp, use_exchange_id))
return clp;
}
return NULL;
}
static struct nfs4_client *
-find_unconfirmed_client_by_str(const char *dname, unsigned int hashval)
+find_unconfirmed_client_by_str(const char *dname, unsigned int hashval,
+ bool use_exchange_id)
{
struct nfs4_client *clp;
list_for_each_entry(clp, &unconf_str_hashtbl[hashval], cl_strhash) {
- if (same_name(clp->cl_recdir, dname))
+ if (same_name(clp->cl_recdir, dname) &&
+ match_clientid_establishment(clp, use_exchange_id))
return clp;
}
return NULL;
@@ -685,6 +987,534 @@ out_err:
return;
}
+void
+nfsd4_set_statp(struct svc_rqst *rqstp, __be32 *statp)
+{
+ struct nfsd4_compoundres *resp = rqstp->rq_resp;
+
+ resp->cstate.statp = statp;
+}
+
+/*
+ * Dereference the result pages.
+ *
+ * Calls put_page() on each non-NULL entry among the first @resused
+ * entries of @respages and resets that entry to NULL, so calling this
+ * again on the same array does not drop the same page twice.
+ */
+static void
+nfsd4_release_respages(struct page **respages, short resused)
+{
+ int i;
+
+ dprintk("--> %s\n", __func__);
+ for (i = 0; i < resused; i++) {
+ if (!respages[i])
+ continue;
+ put_page(respages[i]);
+ respages[i] = NULL;
+ }
+}
+
+/*
+ * Copy the first @count page pointers from @frompages to @topages and
+ * take a reference (get_page()) on every non-NULL page copied. Previous
+ * contents of @topages are overwritten without being released.
+ */
+static void
+nfsd4_copy_pages(struct page **topages, struct page **frompages, short count)
+{
+ int i;
+
+ for (i = 0; i < count; i++) {
+ topages[i] = frompages[i];
+ if (!topages[i])
+ continue;
+ get_page(topages[i]);
+ }
+}
+
+/*
+ * Cache the reply pages up to NFSD_PAGES_PER_SLOT + 1, clearing the previous
+ * pages. We add a page to NFSD_PAGES_PER_SLOT for the case where the total
+ * length of the XDR response is less than se_fmaxresp_cached
+ * (NFSD_PAGES_PER_SLOT * PAGE_SIZE) but the xdr_buf pages are used for
+ * part of the reply (e.g. readdir).
+ *
+ * Store the base and length of the rq_res.head[0] page
+ * of the NFSv4.1 data, just past the rpc header.
+ */
+void
+nfsd4_store_cache_entry(struct nfsd4_compoundres *resp)
+{
+ struct nfsd4_cache_entry *entry = &resp->cstate.slot->sl_cache_entry;
+ struct svc_rqst *rqstp = resp->rqstp;
+ struct nfsd4_compoundargs *args = rqstp->rq_argp;
+ struct nfsd4_op *op = &args->ops[resp->opcnt];
+ struct kvec *resv = &rqstp->rq_res.head[0];
+
+ dprintk("--> %s entry %p\n", __func__, entry);
+
+ /* Don't cache a failed OP_SEQUENCE. */
+ if (resp->opcnt == 1 && op->opnum == OP_SEQUENCE && resp->cstate.status)
+ return;
+
+ nfsd4_release_respages(entry->ce_respages, entry->ce_resused);
+ entry->ce_opcnt = resp->opcnt;
+ entry->ce_status = resp->cstate.status;
+
+ /*
+ * Don't need a page to cache just the sequence operation - the slot
+ * does this for us!
+ */
+
+ if (nfsd4_not_cached(resp)) {
+ entry->ce_resused = 0;
+ entry->ce_rpchdrlen = 0;
+ dprintk("%s Just cache SEQUENCE. ce_cachethis %d\n", __func__,
+ resp->cstate.slot->sl_cache_entry.ce_cachethis);
+ return;
+ }
+ entry->ce_resused = rqstp->rq_resused;
+ if (entry->ce_resused > NFSD_PAGES_PER_SLOT + 1)
+ entry->ce_resused = NFSD_PAGES_PER_SLOT + 1;
+ nfsd4_copy_pages(entry->ce_respages, rqstp->rq_respages,
+ entry->ce_resused);
+ entry->ce_datav.iov_base = resp->cstate.statp;
+ entry->ce_datav.iov_len = resv->iov_len - ((char *)resp->cstate.statp -
+ (char *)page_address(rqstp->rq_respages[0]));
+ /* Current request rpc header length*/
+ entry->ce_rpchdrlen = (char *)resp->cstate.statp -
+ (char *)page_address(rqstp->rq_respages[0]);
+}
+
+/*
+ * We keep the rpc header, but take the nfs reply from the replycache.
+ */
+static int
+nfsd41_copy_replay_data(struct nfsd4_compoundres *resp,
+ struct nfsd4_cache_entry *entry)
+{
+ struct svc_rqst *rqstp = resp->rqstp;
+ struct kvec *resv = &resp->rqstp->rq_res.head[0];
+ int len;
+
+ /* Current request rpc header length*/
+ len = (char *)resp->cstate.statp -
+ (char *)page_address(rqstp->rq_respages[0]);
+ if (entry->ce_datav.iov_len + len > PAGE_SIZE) {
+ dprintk("%s v41 cached reply too large (%Zd).\n", __func__,
+ entry->ce_datav.iov_len);
+ return 0;
+ }
+ /* copy the cached reply nfsd data past the current rpc header */
+ memcpy((char *)resv->iov_base + len, entry->ce_datav.iov_base,
+ entry->ce_datav.iov_len);
+ resv->iov_len = len + entry->ce_datav.iov_len;
+ return 1;
+}
+
+/*
+ * Keep the first page of the replay. Copy the NFSv4.1 data from the first
+ * cached page. Replace any further replay pages from the cache.
+ */
+__be32
+nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp,
+ struct nfsd4_sequence *seq)
+{
+ struct nfsd4_cache_entry *entry = &resp->cstate.slot->sl_cache_entry;
+ __be32 status;
+
+ dprintk("--> %s entry %p\n", __func__, entry);
+
+ /*
+ * If this is just the sequence operation, we did not keep
+ * a page in the cache entry because we can just use the
+ * slot info stored in struct nfsd4_sequence that was checked
+ * against the slot in nfsd4_sequence().
+ *
+ * This occurs when seq->cachethis is FALSE, or when the client
+ * session inactivity timer fires and a solo sequence operation
+ * is sent (lease renewal).
+ */
+ if (seq && nfsd4_not_cached(resp)) {
+ seq->maxslots = resp->cstate.session->se_fnumslots;
+ return nfs_ok;
+ }
+
+ if (!nfsd41_copy_replay_data(resp, entry)) {
+ /*
+ * Not enough room to use the replay rpc header, send the
+ * cached header. Release all the allocated result pages.
+ */
+ svc_free_res_pages(resp->rqstp);
+ nfsd4_copy_pages(resp->rqstp->rq_respages, entry->ce_respages,
+ entry->ce_resused);
+ } else {
+ /* Release all but the first allocated result page */
+
+ resp->rqstp->rq_resused--;
+ svc_free_res_pages(resp->rqstp);
+
+ nfsd4_copy_pages(&resp->rqstp->rq_respages[1],
+ &entry->ce_respages[1],
+ entry->ce_resused - 1);
+ }
+
+ resp->rqstp->rq_resused = entry->ce_resused;
+ resp->opcnt = entry->ce_opcnt;
+ resp->cstate.iovlen = entry->ce_datav.iov_len + entry->ce_rpchdrlen;
+ status = entry->ce_status;
+
+ return status;
+}
+
+/*
+ * Set the exchange_id flags returned by the server.
+ */
+static void
+nfsd4_set_ex_flags(struct nfs4_client *new, struct nfsd4_exchange_id *clid)
+{
+ /* pNFS is not supported */
+ new->cl_exchange_flags |= EXCHGID4_FLAG_USE_NON_PNFS;
+
+ /* Referrals are supported, Migration is not. */
+ new->cl_exchange_flags |= EXCHGID4_FLAG_SUPP_MOVED_REFER;
+
+ /* set the wire flags to return to client. */
+ clid->flags = new->cl_exchange_flags;
+}
+
+__be32
+nfsd4_exchange_id(struct svc_rqst *rqstp,
+ struct nfsd4_compound_state *cstate,
+ struct nfsd4_exchange_id *exid)
+{
+ struct nfs4_client *unconf, *conf, *new;
+ int status;
+ unsigned int strhashval;
+ char dname[HEXDIR_LEN];
+ nfs4_verifier verf = exid->verifier;
+ u32 ip_addr = svc_addr_in(rqstp)->sin_addr.s_addr;
+
+ dprintk("%s rqstp=%p exid=%p clname.len=%u clname.data=%p "
+ " ip_addr=%u flags %x, spa_how %d\n",
+ __func__, rqstp, exid, exid->clname.len, exid->clname.data,
+ ip_addr, exid->flags, exid->spa_how);
+
+ if (!check_name(exid->clname) || (exid->flags & ~EXCHGID4_FLAG_MASK_A))
+ return nfserr_inval;
+
+ /* Currently only support SP4_NONE */
+ switch (exid->spa_how) {
+ case SP4_NONE:
+ break;
+ case SP4_SSV:
+ return nfserr_encr_alg_unsupp;
+ default:
+ BUG(); /* checked by xdr code */
+ case SP4_MACH_CRED:
+ return nfserr_serverfault; /* no excuse :-/ */
+ }
+
+ status = nfs4_make_rec_clidname(dname, &exid->clname);
+
+ if (status)
+ goto error;
+
+ strhashval = clientstr_hashval(dname);
+
+ nfs4_lock_state();
+ status = nfs_ok;
+
+ conf = find_confirmed_client_by_str(dname, strhashval, true);
+ if (conf) {
+ if (!same_verf(&verf, &conf->cl_verifier)) {
+ /* 18.35.4 case 8 */
+ if (exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A) {
+ status = nfserr_not_same;
+ goto out;
+ }
+ /* Client reboot: destroy old state */
+ expire_client(conf);
+ goto out_new;
+ }
+ if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)) {
+ /* 18.35.4 case 9 */
+ if (exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A) {
+ status = nfserr_perm;
+ goto out;
+ }
+ expire_client(conf);
+ goto out_new;
+ }
+ if (ip_addr != conf->cl_addr &&
+ !(exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A)) {
+ /* Client collision. 18.35.4 case 3 */
+ status = nfserr_clid_inuse;
+ goto out;
+ }
+ /*
+ * Set bit when the owner id and verifier map to an already
+ * confirmed client id (18.35.3).
+ */
+ exid->flags |= EXCHGID4_FLAG_CONFIRMED_R;
+
+ /*
+ * Falling into 18.35.4 case 2, possible router replay.
+ * Leave confirmed record intact and return same result.
+ */
+ copy_verf(conf, &verf);
+ new = conf;
+ goto out_copy;
+ } else {
+ /* 18.35.4 case 7 */
+ if (exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A) {
+ status = nfserr_noent;
+ goto out;
+ }
+ }
+
+ unconf = find_unconfirmed_client_by_str(dname, strhashval, true);
+ if (unconf) {
+ /*
+ * Possible retry or client restart. Per 18.35.4 case 4,
+ * a new unconfirmed record should be generated regardless
+ * of whether any properties have changed.
+ */
+ expire_client(unconf);
+ }
+
+out_new:
+ /* Normal case */
+ new = create_client(exid->clname, dname);
+ if (new == NULL) {
+ status = nfserr_resource;
+ goto out;
+ }
+
+ copy_verf(new, &verf);
+ copy_cred(&new->cl_cred, &rqstp->rq_cred);
+ new->cl_addr = ip_addr;
+ gen_clid(new);
+ gen_confirm(new);
+ add_to_unconfirmed(new, strhashval);
+out_copy:
+ exid->clientid.cl_boot = new->cl_clientid.cl_boot;
+ exid->clientid.cl_id = new->cl_clientid.cl_id;
+
+ new->cl_slot.sl_seqid = 0;
+ exid->seqid = 1;
+ nfsd4_set_ex_flags(new, exid);
+
+ dprintk("nfsd4_exchange_id seqid %d flags %x\n",
+ new->cl_slot.sl_seqid, new->cl_exchange_flags);
+ status = nfs_ok;
+
+out:
+ nfs4_unlock_state();
+error:
+ dprintk("nfsd4_exchange_id returns %d\n", ntohl(status));
+ return status;
+}
+
+/*
+ * Classify the sequence id of an incoming request against a slot.
+ *
+ * Returns:
+ *   nfs_ok                 - seqid is the next one expected
+ *                            (sl_seqid + 1, or 1 when sl_seqid + 1 is 0)
+ *   nfserr_replay_cache    - seqid equals sl_seqid and the slot is idle
+ *   nfserr_jukebox         - seqid equals sl_seqid but the slot is still
+ *                            in use
+ *   nfserr_seq_misordered  - anything else
+ *
+ * The slot itself is not modified here.
+ */
+static int
+check_slot_seqid(u32 seqid, struct nfsd4_slot *slot)
+{
+ dprintk("%s enter. seqid %d slot->sl_seqid %d\n", __func__, seqid,
+ slot->sl_seqid);
+
+ /* The slot is in use, and no response has been sent. */
+ if (slot->sl_inuse) {
+ if (seqid == slot->sl_seqid)
+ return nfserr_jukebox;
+ else
+ return nfserr_seq_misordered;
+ }
+ /* Normal */
+ if (likely(seqid == slot->sl_seqid + 1))
+ return nfs_ok;
+ /* Replay */
+ if (seqid == slot->sl_seqid)
+ return nfserr_replay_cache;
+ /* Wraparound */
+ if (seqid == 1 && (slot->sl_seqid + 1) == 0)
+ return nfs_ok;
+ /* Misordered replay or misordered new request */
+ return nfserr_seq_misordered;
+}
+
+__be32
+nfsd4_create_session(struct svc_rqst *rqstp,
+ struct nfsd4_compound_state *cstate,
+ struct nfsd4_create_session *cr_ses)
+{
+ u32 ip_addr = svc_addr_in(rqstp)->sin_addr.s_addr;
+ struct nfsd4_compoundres *resp = rqstp->rq_resp;
+ struct nfs4_client *conf, *unconf;
+ struct nfsd4_slot *slot = NULL;
+ int status = 0;
+
+ nfs4_lock_state();
+ unconf = find_unconfirmed_client(&cr_ses->clientid);
+ conf = find_confirmed_client(&cr_ses->clientid);
+
+ if (conf) {
+ slot = &conf->cl_slot;
+ status = check_slot_seqid(cr_ses->seqid, slot);
+ if (status == nfserr_replay_cache) {
+ dprintk("Got a create_session replay! seqid= %d\n",
+ slot->sl_seqid);
+ cstate->slot = slot;
+ cstate->status = status;
+ /* Return the cached reply status */
+ status = nfsd4_replay_cache_entry(resp, NULL);
+ goto out;
+ } else if (cr_ses->seqid != conf->cl_slot.sl_seqid + 1) {
+ status = nfserr_seq_misordered;
+ dprintk("Sequence misordered!\n");
+ dprintk("Expected seqid= %d but got seqid= %d\n",
+ slot->sl_seqid, cr_ses->seqid);
+ goto out;
+ }
+ conf->cl_slot.sl_seqid++;
+ } else if (unconf) {
+ if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred) ||
+ (ip_addr != unconf->cl_addr)) {
+ status = nfserr_clid_inuse;
+ goto out;
+ }
+
+ slot = &unconf->cl_slot;
+ status = check_slot_seqid(cr_ses->seqid, slot);
+ if (status) {
+ /* an unconfirmed replay returns misordered */
+ status = nfserr_seq_misordered;
+ goto out;
+ }
+
+ slot->sl_seqid++; /* from 0 to 1 */
+ move_to_confirmed(unconf);
+
+ /*
+ * We do not support RDMA or persistent sessions
+ */
+ cr_ses->flags &= ~SESSION4_PERSIST;
+ cr_ses->flags &= ~SESSION4_RDMA;
+
+ conf = unconf;
+ } else {
+ status = nfserr_stale_clientid;
+ goto out;
+ }
+
+ status = alloc_init_session(rqstp, conf, cr_ses);
+ if (status)
+ goto out;
+
+ memcpy(cr_ses->sessionid.data, conf->cl_sessionid.data,
+ NFS4_MAX_SESSIONID_LEN);
+ cr_ses->seqid = slot->sl_seqid;
+
+ slot->sl_inuse = true;
+ cstate->slot = slot;
+ /* Ensure a page is used for the cache */
+ slot->sl_cache_entry.ce_cachethis = 1;
+out:
+ nfs4_unlock_state();
+ dprintk("%s returns %d\n", __func__, ntohl(status));
+ return status;
+}
+
+__be32
+nfsd4_destroy_session(struct svc_rqst *r,
+ struct nfsd4_compound_state *cstate,
+ struct nfsd4_destroy_session *sessionid)
+{
+ struct nfsd4_session *ses;
+ u32 status = nfserr_badsession;
+
+ /* Notes:
+ * - The confirmed nfs4_client->cl_sessionid holds the destroyed sessionid
+ * - Should we return nfserr_back_chan_busy if waiting for
+ * callbacks on to-be-destroyed session?
+ * - Do we need to clear any callback info from previous session?
+ */
+
+ dump_sessionid(__func__, &sessionid->sessionid);
+ spin_lock(&sessionid_lock);
+ ses = find_in_sessionid_hashtbl(&sessionid->sessionid);
+ if (!ses) {
+ spin_unlock(&sessionid_lock);
+ goto out;
+ }
+
+ unhash_session(ses);
+ spin_unlock(&sessionid_lock);
+
+ /* wait for callbacks */
+ shutdown_callback_client(ses->se_client);
+ nfsd4_put_session(ses);
+ status = nfs_ok;
+out:
+ dprintk("%s returns %d\n", __func__, ntohl(status));
+ return status;
+}
+
+/*
+ * Process a SEQUENCE operation.
+ *
+ * SEQUENCE is only accepted as the first operation of a compound
+ * (nfserr_sequence_pos otherwise). The session is looked up by
+ * seq->sessionid under sessionid_lock (nfserr_badsession if not found),
+ * the slot id is range checked against se_fnumslots (nfserr_badslot) and
+ * the sequence id is classified by check_slot_seqid().
+ *
+ * For a new request the slot is marked in use and its seqid and
+ * cachethis setting are recorded. For a replay the cached reply is set
+ * up by nfsd4_replay_cache_entry() and cstate->status is set to
+ * nfserr_replay_cache. In both cases cstate->slot and cstate->session
+ * are set, the client is renewed and a session reference is taken.
+ */
+__be32
+nfsd4_sequence(struct svc_rqst *rqstp,
+ struct nfsd4_compound_state *cstate,
+ struct nfsd4_sequence *seq)
+{
+ struct nfsd4_compoundres *resp = rqstp->rq_resp;
+ struct nfsd4_session *session;
+ struct nfsd4_slot *slot;
+ int status;
+
+ if (resp->opcnt != 1)
+ return nfserr_sequence_pos;
+
+ spin_lock(&sessionid_lock);
+ status = nfserr_badsession;
+ session = find_in_sessionid_hashtbl(&seq->sessionid);
+ if (!session)
+ goto out;
+
+ status = nfserr_badslot;
+ if (seq->slotid >= session->se_fnumslots)
+ goto out;
+
+ slot = &session->se_slots[seq->slotid];
+ dprintk("%s: slotid %d\n", __func__, seq->slotid);
+
+ status = check_slot_seqid(seq->seqid, slot);
+ if (status == nfserr_replay_cache) {
+ cstate->slot = slot;
+ cstate->session = session;
+ /* Return the cached reply status and set cstate->status
+ * for nfsd4_svc_encode_compoundres processing */
+ status = nfsd4_replay_cache_entry(resp, seq);
+ cstate->status = nfserr_replay_cache;
+ goto replay_cache;
+ }
+ if (status)
+ goto out;
+
+ /* Success! bump slot seqid */
+ slot->sl_inuse = true;
+ slot->sl_seqid = seq->seqid;
+ slot->sl_cache_entry.ce_cachethis = seq->cachethis;
+ /* Always set the cache entry cachethis for solo sequence */
+ if (nfsd4_is_solo_sequence(resp))
+ slot->sl_cache_entry.ce_cachethis = 1;
+
+ cstate->slot = slot;
+ cstate->session = session;
+
+replay_cache:
+ /* Renew the clientid on success and on replay.
+ * Hold a session reference until done processing the compound:
+ * nfsd4_put_session called only if the cstate slot is set.
+ */
+ renew_client(session->se_client);
+ nfsd4_get_session(session);
+out:
+ spin_unlock(&sessionid_lock);
+ dprintk("%s: return %d\n", __func__, ntohl(status));
+ return status;
+}
+
__be32
nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
struct nfsd4_setclientid *setclid)
@@ -716,14 +1546,13 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
strhashval = clientstr_hashval(dname);
nfs4_lock_state();
- conf = find_confirmed_client_by_str(dname, strhashval);
+ conf = find_confirmed_client_by_str(dname, strhashval, false);
if (conf) {
/* RFC 3530 14.2.33 CASE 0: */
status = nfserr_clid_inuse;
- if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)
- || conf->cl_addr != sin->sin_addr.s_addr) {
- dprintk("NFSD: setclientid: string in use by clientat %pI4\n",
- &conf->cl_addr);
+ if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)) {
+ dprintk("NFSD: setclientid: string in use by client"
+ " at %pI4\n", &conf->cl_addr);
goto out;
}
}
@@ -732,7 +1561,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
* has a description of SETCLIENTID request processing consisting
* of 5 bullet points, labeled as CASE0 - CASE4 below.
*/
- unconf = find_unconfirmed_client_by_str(dname, strhashval);
+ unconf = find_unconfirmed_client_by_str(dname, strhashval, false);
status = nfserr_resource;
if (!conf) {
/*
@@ -887,7 +1716,7 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
unsigned int hash =
clientstr_hashval(unconf->cl_recdir);
conf = find_confirmed_client_by_str(unconf->cl_recdir,
- hash);
+ hash, false);
if (conf) {
nfsd4_remove_clid_dir(conf);
expire_client(conf);
@@ -923,11 +1752,13 @@ alloc_init_file(struct inode *ino)
fp = kmem_cache_alloc(file_slab, GFP_KERNEL);
if (fp) {
- kref_init(&fp->fi_ref);
+ atomic_set(&fp->fi_ref, 1);
INIT_LIST_HEAD(&fp->fi_hash);
INIT_LIST_HEAD(&fp->fi_stateids);
INIT_LIST_HEAD(&fp->fi_delegations);
+ spin_lock(&recall_lock);
list_add(&fp->fi_hash, &file_hashtbl[hashval]);
+ spin_unlock(&recall_lock);
fp->fi_inode = igrab(ino);
fp->fi_id = current_fileid++;
fp->fi_had_conflict = false;
@@ -1037,48 +1868,6 @@ alloc_init_open_stateowner(unsigned int strhashval, struct nfs4_client *clp, str
return sop;
}
-static void
-release_stateid_lockowners(struct nfs4_stateid *open_stp)
-{
- struct nfs4_stateowner *lock_sop;
-
- while (!list_empty(&open_stp->st_lockowners)) {
- lock_sop = list_entry(open_stp->st_lockowners.next,
- struct nfs4_stateowner, so_perstateid);
- /* list_del(&open_stp->st_lockowners); */
- BUG_ON(lock_sop->so_is_open_owner);
- release_stateowner(lock_sop);
- }
-}
-
-static void
-unhash_stateowner(struct nfs4_stateowner *sop)
-{
- struct nfs4_stateid *stp;
-
- list_del(&sop->so_idhash);
- list_del(&sop->so_strhash);
- if (sop->so_is_open_owner)
- list_del(&sop->so_perclient);
- list_del(&sop->so_perstateid);
- while (!list_empty(&sop->so_stateids)) {
- stp = list_entry(sop->so_stateids.next,
- struct nfs4_stateid, st_perstateowner);
- if (sop->so_is_open_owner)
- release_stateid(stp, OPEN_STATE);
- else
- release_stateid(stp, LOCK_STATE);
- }
-}
-
-static void
-release_stateowner(struct nfs4_stateowner *sop)
-{
- unhash_stateowner(sop);
- list_del(&sop->so_close_lru);
- nfs4_put_stateowner(sop);
-}
-
static inline void
init_stateid(struct nfs4_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *open) {
struct nfs4_stateowner *sop = open->op_stateowner;
@@ -1100,30 +1889,13 @@ init_stateid(struct nfs4_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *
stp->st_stateid.si_generation = 0;
stp->st_access_bmap = 0;
stp->st_deny_bmap = 0;
- __set_bit(open->op_share_access, &stp->st_access_bmap);
+ __set_bit(open->op_share_access & ~NFS4_SHARE_WANT_MASK,
+ &stp->st_access_bmap);
__set_bit(open->op_share_deny, &stp->st_deny_bmap);
stp->st_openstp = NULL;
}
static void
-release_stateid(struct nfs4_stateid *stp, int flags)
-{
- struct file *filp = stp->st_vfs_file;
-
- list_del(&stp->st_hash);
- list_del(&stp->st_perfile);
- list_del(&stp->st_perstateowner);
- if (flags & OPEN_STATE) {
- release_stateid_lockowners(stp);
- stp->st_vfs_file = NULL;
- nfsd_close(filp);
- } else if (flags & LOCK_STATE)
- locks_remove_posix(filp, (fl_owner_t) stp->st_stateowner);
- put_nfs4_file(stp->st_file);
- kmem_cache_free(stateid_slab, stp);
-}
-
-static void
move_to_close_lru(struct nfs4_stateowner *sop)
{
dprintk("NFSD: move_to_close_lru nfs4_stateowner %p\n", sop);
@@ -1160,20 +1932,33 @@ find_file(struct inode *ino)
unsigned int hashval = file_hashval(ino);
struct nfs4_file *fp;
+ spin_lock(&recall_lock);
list_for_each_entry(fp, &file_hashtbl[hashval], fi_hash) {
if (fp->fi_inode == ino) {
get_nfs4_file(fp);
+ spin_unlock(&recall_lock);
return fp;
}
}
+ spin_unlock(&recall_lock);
return NULL;
}
-static inline int access_valid(u32 x)
+/*
+ * Validate a share_access word.  Returns 1 when valid, 0 otherwise.
+ *
+ * The bits under NFS4_SHARE_ACCESS_MASK must lie between
+ * NFS4_SHARE_ACCESS_READ and NFS4_SHARE_ACCESS_BOTH.  When minorversion is
+ * non-zero the want field (NFS4_SHARE_WANT_MASK) and the when field
+ * (NFS4_SHARE_WHEN_MASK) may also be set, up to NFS4_SHARE_WANT_CANCEL and
+ * NFS4_SHARE_PUSH_DELEG_WHEN_UNCONTENDED respectively.  Any bit left over
+ * after those fields are masked off makes the word invalid.
+ */
+static inline int access_valid(u32 x, u32 minorversion)
{
- if (x < NFS4_SHARE_ACCESS_READ)
+ if ((x & NFS4_SHARE_ACCESS_MASK) < NFS4_SHARE_ACCESS_READ)
return 0;
- if (x > NFS4_SHARE_ACCESS_BOTH)
+ if ((x & NFS4_SHARE_ACCESS_MASK) > NFS4_SHARE_ACCESS_BOTH)
+ return 0;
+ /* From here on x holds only the bits outside the access field */
+ x &= ~NFS4_SHARE_ACCESS_MASK;
+ if (minorversion && x) {
+ if ((x & NFS4_SHARE_WANT_MASK) > NFS4_SHARE_WANT_CANCEL)
+ return 0;
+ if ((x & NFS4_SHARE_WHEN_MASK) > NFS4_SHARE_PUSH_DELEG_WHEN_UNCONTENDED)
+ return 0;
+ x &= ~(NFS4_SHARE_WANT_MASK | NFS4_SHARE_WHEN_MASK);
+ }
+ /* Anything still set is an unrecognised bit */
+ if (x)
return 0;
return 1;
}
@@ -1409,7 +2194,8 @@ static struct lock_manager_operations nfsd_lease_mng_ops = {
__be32
-nfsd4_process_open1(struct nfsd4_open *open)
+nfsd4_process_open1(struct nfsd4_compound_state *cstate,
+ struct nfsd4_open *open)
{
clientid_t *clientid = &open->op_clientid;
struct nfs4_client *clp = NULL;
@@ -1432,10 +2218,13 @@ nfsd4_process_open1(struct nfsd4_open *open)
return nfserr_expired;
goto renew;
}
+ /* When sessions are used, skip open sequenceid processing */
+ if (nfsd4_has_session(cstate))
+ goto renew;
if (!sop->so_confirmed) {
/* Replace unconfirmed owners without checking for replay. */
clp = sop->so_client;
- release_stateowner(sop);
+ release_openowner(sop);
open->op_stateowner = NULL;
goto renew;
}
@@ -1709,6 +2498,7 @@ out:
__be32
nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open)
{
+ struct nfsd4_compoundres *resp = rqstp->rq_resp;
struct nfs4_file *fp = NULL;
struct inode *ino = current_fh->fh_dentry->d_inode;
struct nfs4_stateid *stp = NULL;
@@ -1716,7 +2506,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
__be32 status;
status = nfserr_inval;
- if (!access_valid(open->op_share_access)
+ if (!access_valid(open->op_share_access, resp->cstate.minorversion)
|| !deny_valid(open->op_share_deny))
goto out;
/*
@@ -1764,12 +2554,17 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
init_stateid(stp, fp, open);
status = nfsd4_truncate(rqstp, current_fh, open);
if (status) {
- release_stateid(stp, OPEN_STATE);
+ release_open_stateid(stp);
goto out;
}
+ if (nfsd4_has_session(&resp->cstate))
+ update_stateid(&stp->st_stateid);
}
memcpy(&open->op_stateid, &stp->st_stateid, sizeof(stateid_t));
+ if (nfsd4_has_session(&resp->cstate))
+ open->op_stateowner->so_confirmed = 1;
+
/*
* Attempt to hand out a delegation. No error return, because the
* OPEN succeeds even if we fail.
@@ -1790,7 +2585,8 @@ out:
* To finish the open response, we just need to set the rflags.
*/
open->op_rflags = NFS4_OPEN_RESULT_LOCKTYPE_POSIX;
- if (!open->op_stateowner->so_confirmed)
+ if (!open->op_stateowner->so_confirmed &&
+ !nfsd4_has_session(&resp->cstate))
open->op_rflags |= NFS4_OPEN_RESULT_CONFIRM;
return status;
@@ -1898,7 +2694,7 @@ nfs4_laundromat(void)
}
dprintk("NFSD: purging unused open stateowner (so_id %d)\n",
sop->so_id);
- release_stateowner(sop);
+ release_openowner(sop);
}
if (clientid_val < NFSD_LAUNDROMAT_MINTIMEOUT)
clientid_val = NFSD_LAUNDROMAT_MINTIMEOUT;
@@ -1983,10 +2779,7 @@ out:
static inline __be32
check_special_stateids(svc_fh *current_fh, stateid_t *stateid, int flags)
{
- /* Trying to call delegreturn with a special stateid? Yuch: */
- if (!(flags & (RD_STATE | WR_STATE)))
- return nfserr_bad_stateid;
- else if (ONE_STATEID(stateid) && (flags & RD_STATE))
+ if (ONE_STATEID(stateid) && (flags & RD_STATE))
return nfs_ok;
else if (locks_in_grace()) {
/* Answer in remaining cases depends on existance of
@@ -2005,14 +2798,20 @@ check_special_stateids(svc_fh *current_fh, stateid_t *stateid, int flags)
* that are not able to provide mandatory locking.
*/
static inline int
-io_during_grace_disallowed(struct inode *inode, int flags)
+grace_disallows_io(struct inode *inode)
{
- return locks_in_grace() && (flags & (RD_STATE | WR_STATE))
- && mandatory_lock(inode);
+ return locks_in_grace() && mandatory_lock(inode);
}
-static int check_stateid_generation(stateid_t *in, stateid_t *ref)
+static int check_stateid_generation(stateid_t *in, stateid_t *ref, int flags)
{
+ /*
+ * When sessions are used the stateid generation number is ignored
+ * when it is zero.
+ */
+ if ((flags & HAS_SESSION) && in->si_generation == 0)
+ goto out;
+
/* If the client sends us a stateid from the future, it's buggy: */
if (in->si_generation > ref->si_generation)
return nfserr_bad_stateid;
@@ -2028,74 +2827,77 @@ static int check_stateid_generation(stateid_t *in, stateid_t *ref)
*/
if (in->si_generation < ref->si_generation)
return nfserr_old_stateid;
+out:
return nfs_ok;
}
+/* Return non-zero when the stateid's si_fileid field is zero. */
+static int is_delegation_stateid(stateid_t *stateid)
+{
+ return stateid->si_fileid == 0;
+}
+
/*
* Checks for stateid operations
*/
__be32
-nfs4_preprocess_stateid_op(struct svc_fh *current_fh, stateid_t *stateid, int flags, struct file **filpp)
+nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate,
+ stateid_t *stateid, int flags, struct file **filpp)
{
struct nfs4_stateid *stp = NULL;
struct nfs4_delegation *dp = NULL;
- stateid_t *stidp;
+ struct svc_fh *current_fh = &cstate->current_fh;
struct inode *ino = current_fh->fh_dentry->d_inode;
__be32 status;
- dprintk("NFSD: preprocess_stateid_op: stateid = (%08x/%08x/%08x/%08x)\n",
- stateid->si_boot, stateid->si_stateownerid,
- stateid->si_fileid, stateid->si_generation);
if (filpp)
*filpp = NULL;
- if (io_during_grace_disallowed(ino, flags))
+ if (grace_disallows_io(ino))
return nfserr_grace;
+ if (nfsd4_has_session(cstate))
+ flags |= HAS_SESSION;
+
if (ZERO_STATEID(stateid) || ONE_STATEID(stateid))
return check_special_stateids(current_fh, stateid, flags);
- /* STALE STATEID */
status = nfserr_stale_stateid;
if (STALE_STATEID(stateid))
goto out;
- /* BAD STATEID */
status = nfserr_bad_stateid;
- if (!stateid->si_fileid) { /* delegation stateid */
- if(!(dp = find_delegation_stateid(ino, stateid))) {
- dprintk("NFSD: delegation stateid not found\n");
+ if (is_delegation_stateid(stateid)) {
+ dp = find_delegation_stateid(ino, stateid);
+ if (!dp)
goto out;
- }
- stidp = &dp->dl_stateid;
+ status = check_stateid_generation(stateid, &dp->dl_stateid,
+ flags);
+ if (status)
+ goto out;
+ status = nfs4_check_delegmode(dp, flags);
+ if (status)
+ goto out;
+ renew_client(dp->dl_client);
+ if (filpp)
+ *filpp = dp->dl_vfs_file;
} else { /* open or lock stateid */
- if (!(stp = find_stateid(stateid, flags))) {
- dprintk("NFSD: open or lock stateid not found\n");
+ stp = find_stateid(stateid, flags);
+ if (!stp)
goto out;
- }
- if ((flags & CHECK_FH) && nfs4_check_fh(current_fh, stp))
+ if (nfs4_check_fh(current_fh, stp))
goto out;
if (!stp->st_stateowner->so_confirmed)
goto out;
- stidp = &stp->st_stateid;
- }
- status = check_stateid_generation(stateid, stidp);
- if (status)
- goto out;
- if (stp) {
- if ((status = nfs4_check_openmode(stp,flags)))
+ status = check_stateid_generation(stateid, &stp->st_stateid,
+ flags);
+ if (status)
+ goto out;
+ status = nfs4_check_openmode(stp, flags);
+ if (status)
goto out;
renew_client(stp->st_stateowner->so_client);
if (filpp)
*filpp = stp->st_vfs_file;
- } else {
- if ((status = nfs4_check_delegmode(dp, flags)))
- goto out;
- renew_client(dp->dl_client);
- if (flags & DELEG_RET)
- unhash_delegation(dp);
- if (filpp)
- *filpp = dp->dl_vfs_file;
}
status = nfs_ok;
out:
@@ -2113,10 +2915,14 @@ setlkflg (int type)
* Checks for sequence id mutating operations.
*/
static __be32
-nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *stateid, int flags, struct nfs4_stateowner **sopp, struct nfs4_stateid **stpp, struct nfsd4_lock *lock)
+nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
+ stateid_t *stateid, int flags,
+ struct nfs4_stateowner **sopp,
+ struct nfs4_stateid **stpp, struct nfsd4_lock *lock)
{
struct nfs4_stateid *stp;
struct nfs4_stateowner *sop;
+ struct svc_fh *current_fh = &cstate->current_fh;
__be32 status;
dprintk("NFSD: preprocess_seqid_op: seqid=%d "
@@ -2134,6 +2940,10 @@ nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *statei
if (STALE_STATEID(stateid))
return nfserr_stale_stateid;
+
+ if (nfsd4_has_session(cstate))
+ flags |= HAS_SESSION;
+
/*
* We return BAD_STATEID if filehandle doesn't match stateid,
* the confirmed flag is incorrecly set, or the generation
@@ -2166,8 +2976,9 @@ nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *statei
if (lock->lk_is_new) {
if (!sop->so_is_open_owner)
return nfserr_bad_stateid;
- if (!same_clid(&clp->cl_clientid, lockclid))
- return nfserr_bad_stateid;
+ if (!(flags & HAS_SESSION) &&
+ !same_clid(&clp->cl_clientid, lockclid))
+ return nfserr_bad_stateid;
/* stp is the open stateid */
status = nfs4_check_openmode(stp, lkflg);
if (status)
@@ -2190,7 +3001,7 @@ nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *statei
* For the moment, we ignore the possibility of
* generation number wraparound.
*/
- if (seqid != sop->so_seqid)
+ if (!(flags & HAS_SESSION) && seqid != sop->so_seqid)
goto check_replay;
if (sop->so_confirmed && flags & CONFIRM) {
@@ -2203,7 +3014,7 @@ nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *statei
" confirmed yet!\n");
return nfserr_bad_stateid;
}
- status = check_stateid_generation(stateid, &stp->st_stateid);
+ status = check_stateid_generation(stateid, &stp->st_stateid, flags);
if (status)
return status;
renew_client(sop->so_client);
@@ -2239,7 +3050,7 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
nfs4_lock_state();
- if ((status = nfs4_preprocess_seqid_op(&cstate->current_fh,
+ if ((status = nfs4_preprocess_seqid_op(cstate,
oc->oc_seqid, &oc->oc_req_stateid,
CONFIRM | OPEN_STATE,
&oc->oc_stateowner, &stp, NULL)))
@@ -2304,12 +3115,12 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp,
(int)cstate->current_fh.fh_dentry->d_name.len,
cstate->current_fh.fh_dentry->d_name.name);
- if (!access_valid(od->od_share_access)
+ if (!access_valid(od->od_share_access, cstate->minorversion)
|| !deny_valid(od->od_share_deny))
return nfserr_inval;
nfs4_lock_state();
- if ((status = nfs4_preprocess_seqid_op(&cstate->current_fh,
+ if ((status = nfs4_preprocess_seqid_op(cstate,
od->od_seqid,
&od->od_stateid,
OPEN_STATE,
@@ -2362,7 +3173,7 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
nfs4_lock_state();
/* check close_lru for replay */
- if ((status = nfs4_preprocess_seqid_op(&cstate->current_fh,
+ if ((status = nfs4_preprocess_seqid_op(cstate,
close->cl_seqid,
&close->cl_stateid,
OPEN_STATE | CLOSE_STATE,
@@ -2373,7 +3184,7 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
memcpy(&close->cl_stateid, &stp->st_stateid, sizeof(stateid_t));
/* release_stateid() calls nfsd_close() if needed */
- release_stateid(stp, OPEN_STATE);
+ release_open_stateid(stp);
/* place unused nfs4_stateowners on so_close_lru list to be
* released by the laundromat service after the lease period
@@ -2394,16 +3205,40 @@ __be32
nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
struct nfsd4_delegreturn *dr)
{
+ struct nfs4_delegation *dp;
+ stateid_t *stateid = &dr->dr_stateid;
+ struct inode *inode;
__be32 status;
+ int flags = 0;
if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0)))
- goto out;
+ return status;
+ /* The state lock is not held yet, so return directly above */
+ inode = cstate->current_fh.fh_dentry->d_inode;
+ if (nfsd4_has_session(cstate))
+ flags |= HAS_SESSION;
nfs4_lock_state();
- status = nfs4_preprocess_stateid_op(&cstate->current_fh,
- &dr->dr_stateid, DELEG_RET, NULL);
- nfs4_unlock_state();
+ /* The all-zero and all-one special stateids are rejected */
+ status = nfserr_bad_stateid;
+ if (ZERO_STATEID(stateid) || ONE_STATEID(stateid))
+ goto out;
+ status = nfserr_stale_stateid;
+ if (STALE_STATEID(stateid))
+ goto out;
+ /* Only a delegation stateid known for this inode is accepted */
+ status = nfserr_bad_stateid;
+ if (!is_delegation_stateid(stateid))
+ goto out;
+ dp = find_delegation_stateid(inode, stateid);
+ if (!dp)
+ goto out;
+ status = check_stateid_generation(stateid, &dp->dl_stateid, flags);
+ if (status)
+ goto out;
+ renew_client(dp->dl_client);
+
+ /* status is zero here, left by the generation check above */
+ unhash_delegation(dp);
out:
+ nfs4_unlock_state();
+
return status;
}
@@ -2684,11 +3519,12 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
struct nfs4_file *fp;
status = nfserr_stale_clientid;
- if (STALE_CLIENTID(&lock->lk_new_clientid))
+ if (!nfsd4_has_session(cstate) &&
+ STALE_CLIENTID(&lock->lk_new_clientid))
goto out;
/* validate and update open stateid and open seqid */
- status = nfs4_preprocess_seqid_op(&cstate->current_fh,
+ status = nfs4_preprocess_seqid_op(cstate,
lock->lk_new_open_seqid,
&lock->lk_new_open_stateid,
OPEN_STATE,
@@ -2715,7 +3551,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
goto out;
} else {
/* lock (lock owner + lock stateid) already exists */
- status = nfs4_preprocess_seqid_op(&cstate->current_fh,
+ status = nfs4_preprocess_seqid_op(cstate,
lock->lk_old_lock_seqid,
&lock->lk_old_lock_stateid,
LOCK_STATE,
@@ -2788,7 +3624,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
}
out:
if (status && lock->lk_is_new && lock_sop)
- release_stateowner(lock_sop);
+ release_lockowner(lock_sop);
if (lock->lk_replay_owner) {
nfs4_get_stateowner(lock->lk_replay_owner);
cstate->replay_owner = lock->lk_replay_owner;
@@ -2838,7 +3674,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
nfs4_lock_state();
status = nfserr_stale_clientid;
- if (STALE_CLIENTID(&lockt->lt_clientid))
+ if (!nfsd4_has_session(cstate) && STALE_CLIENTID(&lockt->lt_clientid))
goto out;
if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0))) {
@@ -2911,7 +3747,7 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
nfs4_lock_state();
- if ((status = nfs4_preprocess_seqid_op(&cstate->current_fh,
+ if ((status = nfs4_preprocess_seqid_op(cstate,
locku->lu_seqid,
&locku->lu_stateid,
LOCK_STATE,
@@ -3037,7 +3873,7 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp,
/* unhash_stateowner deletes so_perclient only
* for openowners. */
list_del(&sop->so_perclient);
- release_stateowner(sop);
+ release_lockowner(sop);
}
out:
nfs4_unlock_state();
@@ -3051,12 +3887,12 @@ alloc_reclaim(void)
}
int
-nfs4_has_reclaimed_state(const char *name)
+nfs4_has_reclaimed_state(const char *name, bool use_exchange_id)
{
unsigned int strhashval = clientstr_hashval(name);
struct nfs4_client *clp;
- clp = find_confirmed_client_by_str(name, strhashval);
+ clp = find_confirmed_client_by_str(name, strhashval, use_exchange_id);
return clp ? 1 : 0;
}
@@ -3153,6 +3989,8 @@ nfs4_state_init(void)
INIT_LIST_HEAD(&unconf_str_hashtbl[i]);
INIT_LIST_HEAD(&unconf_id_hashtbl[i]);
}
+ for (i = 0; i < SESSION_HASH_SIZE; i++)
+ INIT_LIST_HEAD(&sessionid_hashtbl[i]);
for (i = 0; i < FILE_HASH_SIZE; i++) {
INIT_LIST_HEAD(&file_hashtbl[i]);
}
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 9250067943d8..b820c311931c 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -45,6 +45,7 @@
#include <linux/fs.h>
#include <linux/namei.h>
#include <linux/vfs.h>
+#include <linux/utsname.h>
#include <linux/sunrpc/xdr.h>
#include <linux/sunrpc/svc.h>
#include <linux/sunrpc/clnt.h>
@@ -188,6 +189,11 @@ static __be32 *read_buf(struct nfsd4_compoundargs *argp, u32 nbytes)
return p;
}
+/* Return non-zero when both cl_boot and cl_id of the clientid are zero. */
+static int zero_clientid(clientid_t *clid)
+{
+ return (clid->cl_boot == 0) && (clid->cl_id == 0);
+}
+
static int
defer_free(struct nfsd4_compoundargs *argp,
void (*release)(const void *), void *p)
@@ -230,6 +236,7 @@ nfsd4_decode_bitmap(struct nfsd4_compoundargs *argp, u32 *bmval)
bmval[0] = 0;
bmval[1] = 0;
+ bmval[2] = 0;
READ_BUF(4);
READ32(bmlen);
@@ -241,13 +248,27 @@ nfsd4_decode_bitmap(struct nfsd4_compoundargs *argp, u32 *bmval)
READ32(bmval[0]);
if (bmlen > 1)
READ32(bmval[1]);
+ if (bmlen > 2)
+ READ32(bmval[2]);
DECODE_TAIL;
}
+/*
+ * Three-word mask of writeable attributes, one entry per bitmap word.
+ * Handed to nfsd4_decode_fattr() as its "writable" argument.
+ */
+static u32 nfsd_attrmask[] = {
+ NFSD_WRITEABLE_ATTRS_WORD0,
+ NFSD_WRITEABLE_ATTRS_WORD1,
+ NFSD_WRITEABLE_ATTRS_WORD2
+};
+
+/*
+ * Three-word attribute mask built from the NFSD_SUPPATTR_EXCLCREAT_WORD*
+ * constants, one entry per bitmap word.  Handed to nfsd4_decode_fattr() as
+ * its "writable" argument for the NFS4_CREATE_EXCLUSIVE4_1 create mode.
+ */
+static u32 nfsd41_ex_attrmask[] = {
+ NFSD_SUPPATTR_EXCLCREAT_WORD0,
+ NFSD_SUPPATTR_EXCLCREAT_WORD1,
+ NFSD_SUPPATTR_EXCLCREAT_WORD2
+};
+
static __be32
-nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct iattr *iattr,
- struct nfs4_acl **acl)
+nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, u32 *writable,
+ struct iattr *iattr, struct nfs4_acl **acl)
{
int expected_len, len = 0;
u32 dummy32;
@@ -263,9 +284,12 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct iattr *ia
* According to spec, unsupported attributes return ERR_ATTRNOTSUPP;
* read-only attributes return ERR_INVAL.
*/
- if ((bmval[0] & ~NFSD_SUPPORTED_ATTRS_WORD0) || (bmval[1] & ~NFSD_SUPPORTED_ATTRS_WORD1))
+ if ((bmval[0] & ~nfsd_suppattrs0(argp->minorversion)) ||
+ (bmval[1] & ~nfsd_suppattrs1(argp->minorversion)) ||
+ (bmval[2] & ~nfsd_suppattrs2(argp->minorversion)))
return nfserr_attrnotsupp;
- if ((bmval[0] & ~NFSD_WRITEABLE_ATTRS_WORD0) || (bmval[1] & ~NFSD_WRITEABLE_ATTRS_WORD1))
+ if ((bmval[0] & ~writable[0]) || (bmval[1] & ~writable[1]) ||
+ (bmval[2] & ~writable[2]))
return nfserr_inval;
READ_BUF(4);
@@ -400,6 +424,7 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct iattr *ia
goto xdr_error;
}
}
+ BUG_ON(bmval[2]); /* no such writeable attr supported yet */
if (len != expected_len)
goto xdr_error;
@@ -493,7 +518,9 @@ nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create
if ((status = check_filename(create->cr_name, create->cr_namelen, nfserr_inval)))
return status;
- if ((status = nfsd4_decode_fattr(argp, create->cr_bmval, &create->cr_iattr, &create->cr_acl)))
+ status = nfsd4_decode_fattr(argp, create->cr_bmval, nfsd_attrmask,
+ &create->cr_iattr, &create->cr_acl);
+ if (status)
goto out;
DECODE_TAIL;
@@ -583,6 +610,8 @@ nfsd4_decode_lockt(struct nfsd4_compoundargs *argp, struct nfsd4_lockt *lockt)
READ_BUF(lockt->lt_owner.len);
READMEM(lockt->lt_owner.data, lockt->lt_owner.len);
+ if (argp->minorversion && !zero_clientid(&lockt->lt_clientid))
+ return nfserr_inval;
DECODE_TAIL;
}
@@ -652,13 +681,26 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
switch (open->op_createmode) {
case NFS4_CREATE_UNCHECKED:
case NFS4_CREATE_GUARDED:
- if ((status = nfsd4_decode_fattr(argp, open->op_bmval, &open->op_iattr, &open->op_acl)))
+ status = nfsd4_decode_fattr(argp, open->op_bmval,
+ nfsd_attrmask, &open->op_iattr, &open->op_acl);
+ if (status)
goto out;
break;
case NFS4_CREATE_EXCLUSIVE:
READ_BUF(8);
COPYMEM(open->op_verf.data, 8);
break;
+ case NFS4_CREATE_EXCLUSIVE4_1:
+ if (argp->minorversion < 1)
+ goto xdr_error;
+ READ_BUF(8);
+ COPYMEM(open->op_verf.data, 8);
+ status = nfsd4_decode_fattr(argp, open->op_bmval,
+ nfsd41_ex_attrmask, &open->op_iattr,
+ &open->op_acl);
+ if (status)
+ goto out;
+ break;
default:
goto xdr_error;
}
@@ -851,7 +893,7 @@ nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *seta
status = nfsd4_decode_stateid(argp, &setattr->sa_stateid);
if (status)
return status;
- return nfsd4_decode_fattr(argp, setattr->sa_bmval,
+ return nfsd4_decode_fattr(argp, setattr->sa_bmval, nfsd_attrmask,
&setattr->sa_iattr, &setattr->sa_acl);
}
@@ -993,6 +1035,241 @@ nfsd4_decode_release_lockowner(struct nfsd4_compoundargs *argp, struct nfsd4_rel
READ_BUF(rlockowner->rl_owner.len);
READMEM(rlockowner->rl_owner.data, rlockowner->rl_owner.len);
+ if (argp->minorversion && !zero_clientid(&rlockowner->rl_clientid))
+ return nfserr_inval;
+ DECODE_TAIL;
+}
+
+/*
+ * Decode the EXCHANGE_ID arguments from @argp into @exid.
+ *
+ * Kept in @exid: the verifier, the client name (length and data), the flags
+ * and the state-protection selector spa_how.  The bodies of the SP4_MACH_CRED
+ * and SP4_SSV state-protection arms and the optional implementation id are
+ * read and discarded.  An unknown spa_how, or an implementation-id array
+ * longer than one element, jumps to xdr_error.
+ *
+ * NOTE(review): dummy is a signed int that receives lengths and counts taken
+ * from the request and is used in "dummy * 4" and "p += dummy" -- confirm
+ * READ_BUF rejects negative or oversized values before they are used.
+ */
+static __be32
+nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp,
+ struct nfsd4_exchange_id *exid)
+{
+ int dummy;
+ DECODE_HEAD;
+
+ READ_BUF(NFS4_VERIFIER_SIZE);
+ COPYMEM(exid->verifier.data, NFS4_VERIFIER_SIZE);
+
+ READ_BUF(4);
+ READ32(exid->clname.len);
+
+ READ_BUF(exid->clname.len);
+ SAVEMEM(exid->clname.data, exid->clname.len);
+
+ READ_BUF(4);
+ READ32(exid->flags);
+
+ /* Ignore state_protect4_a */
+ READ_BUF(4);
+ READ32(exid->spa_how);
+ switch (exid->spa_how) {
+ case SP4_NONE:
+ break;
+ case SP4_MACH_CRED:
+ /* spo_must_enforce */
+ READ_BUF(4);
+ READ32(dummy);
+ READ_BUF(dummy * 4);
+ p += dummy;
+
+ /* spo_must_allow */
+ READ_BUF(4);
+ READ32(dummy);
+ READ_BUF(dummy * 4);
+ p += dummy;
+ break;
+ case SP4_SSV:
+ /* ssp_ops */
+ READ_BUF(4);
+ READ32(dummy);
+ READ_BUF(dummy * 4);
+ p += dummy;
+
+ READ_BUF(4);
+ READ32(dummy);
+ READ_BUF(dummy * 4);
+ p += dummy;
+
+ /* ssp_hash_algs<> */
+ READ_BUF(4);
+ READ32(dummy);
+ READ_BUF(dummy);
+ p += XDR_QUADLEN(dummy);
+
+ /* ssp_encr_algs<> */
+ READ_BUF(4);
+ READ32(dummy);
+ READ_BUF(dummy);
+ p += XDR_QUADLEN(dummy);
+
+ /* ssp_window and ssp_num_gss_handles */
+ READ_BUF(8);
+ READ32(dummy);
+ READ32(dummy);
+ break;
+ default:
+ goto xdr_error;
+ }
+
+ /* Ignore Implementation ID */
+ READ_BUF(4); /* nfs_impl_id4 array length */
+ READ32(dummy);
+
+ /* The implementation id array holds at most one element */
+ if (dummy > 1)
+ goto xdr_error;
+
+ if (dummy == 1) {
+ /* nii_domain */
+ READ_BUF(4);
+ READ32(dummy);
+ READ_BUF(dummy);
+ p += XDR_QUADLEN(dummy);
+
+ /* nii_name */
+ READ_BUF(4);
+ READ32(dummy);
+ READ_BUF(dummy);
+ p += XDR_QUADLEN(dummy);
+
+ /* nii_date */
+ READ_BUF(12);
+ p += 3;
+ }
+ DECODE_TAIL;
+}
+
+/*
+ * Decode the CREATE_SESSION arguments from @argp into @sess.
+ *
+ * Reads the client id, sequence id and flags, then the fore- and
+ * back-channel attributes (at most one rdma attribute word each), the
+ * callback program number and the list of callback security parameters.
+ * Of the security parameters only the AUTH_UNIX uid and gid are stored in
+ * @sess; the rest is read and discarded.
+ *
+ * Returns nfserr_inval for an unknown callback security flavor.  Malformed
+ * input jumps to the xdr_error label (not defined in this function;
+ * presumably supplied by the DECODE_* macros).
+ */
+static __be32
+nfsd4_decode_create_session(struct nfsd4_compoundargs *argp,
+			    struct nfsd4_create_session *sess)
+{
+	DECODE_HEAD;
+
+	u32 dummy;
+	u32 ngids;		/* AUTH_UNIX supplementary gid count */
+	u32 gid_idx;		/* index used only to skip those gids */
+	char *machine_name;
+	int i;
+	int nr_secflavs;
+
+	READ_BUF(16);
+	COPYMEM(&sess->clientid, 8);
+	READ32(sess->seqid);
+	READ32(sess->flags);
+
+	/* Fore channel attrs */
+	READ_BUF(28);
+	READ32(dummy); /* headerpadsz is always 0 */
+	READ32(sess->fore_channel.maxreq_sz);
+	READ32(sess->fore_channel.maxresp_sz);
+	READ32(sess->fore_channel.maxresp_cached);
+	READ32(sess->fore_channel.maxops);
+	READ32(sess->fore_channel.maxreqs);
+	READ32(sess->fore_channel.nr_rdma_attrs);
+	if (sess->fore_channel.nr_rdma_attrs == 1) {
+		READ_BUF(4);
+		READ32(sess->fore_channel.rdma_attrs);
+	} else if (sess->fore_channel.nr_rdma_attrs > 1) {
+		dprintk("Too many fore channel attr bitmaps!\n");
+		goto xdr_error;
+	}
+
+	/* Back channel attrs */
+	READ_BUF(28);
+	READ32(dummy); /* headerpadsz is always 0 */
+	READ32(sess->back_channel.maxreq_sz);
+	READ32(sess->back_channel.maxresp_sz);
+	READ32(sess->back_channel.maxresp_cached);
+	READ32(sess->back_channel.maxops);
+	READ32(sess->back_channel.maxreqs);
+	READ32(sess->back_channel.nr_rdma_attrs);
+	if (sess->back_channel.nr_rdma_attrs == 1) {
+		READ_BUF(4);
+		READ32(sess->back_channel.rdma_attrs);
+	} else if (sess->back_channel.nr_rdma_attrs > 1) {
+		dprintk("Too many back channel attr bitmaps!\n");
+		goto xdr_error;
+	}
+
+	READ_BUF(8);
+	READ32(sess->callback_prog);
+
+	/* callback_sec_params4 */
+	READ32(nr_secflavs);
+	for (i = 0; i < nr_secflavs; ++i) {
+		READ_BUF(4);
+		READ32(dummy);
+		switch (dummy) {
+		case RPC_AUTH_NULL:
+			/* Nothing to read */
+			break;
+		case RPC_AUTH_UNIX:
+			READ_BUF(8);
+			/* stamp */
+			READ32(dummy);
+
+			/* machine name */
+			READ32(dummy);
+			READ_BUF(dummy);
+			SAVEMEM(machine_name, dummy);
+
+			/* uid, gid */
+			READ_BUF(8);
+			READ32(sess->uid);
+			READ32(sess->gid);
+
+			/* more gids */
+			READ_BUF(4);
+			READ32(ngids);
+			/* Reject counts whose byte size would wrap in u32 */
+			if (ngids > ~(u32)0 / 4)
+				goto xdr_error;
+			READ_BUF(ngids * 4);
+			/*
+			 * Skip the gids with a dedicated count and index.
+			 * Reusing "i" would corrupt the enclosing secflavor
+			 * loop, and reading each gid into the loop bound
+			 * would change the bound on every iteration.
+			 */
+			for (gid_idx = 0; gid_idx < ngids; ++gid_idx)
+				READ32(dummy);
+			break;
+		case RPC_AUTH_GSS:
+			dprintk("RPC_AUTH_GSS callback secflavor "
+				"not supported!\n");
+			READ_BUF(8);
+			/* gcbp_service */
+			READ32(dummy);
+			/* gcbp_handle_from_server */
+			READ32(dummy);
+			READ_BUF(dummy);
+			p += XDR_QUADLEN(dummy);
+			/* gcbp_handle_from_client */
+			READ_BUF(4);
+			READ32(dummy);
+			READ_BUF(dummy);
+			p += XDR_QUADLEN(dummy);
+			break;
+		default:
+			dprintk("Illegal callback secflavor\n");
+			return nfserr_inval;
+		}
+	}
+	DECODE_TAIL;
+}
+
+/*
+ * Decode the DESTROY_SESSION arguments: a single session id of
+ * NFS4_MAX_SESSIONID_LEN bytes, copied into destroy_session->sessionid.
+ */
+static __be32
+nfsd4_decode_destroy_session(struct nfsd4_compoundargs *argp,
+ struct nfsd4_destroy_session *destroy_session)
+{
+ DECODE_HEAD;
+ READ_BUF(NFS4_MAX_SESSIONID_LEN);
+ COPYMEM(destroy_session->sessionid.data, NFS4_MAX_SESSIONID_LEN);
+
+ DECODE_TAIL;
+}
+
+/*
+ * Decode the SEQUENCE arguments into @seq: a session id of
+ * NFS4_MAX_SESSIONID_LEN bytes followed by four 32-bit words read in the
+ * order seqid, slotid, maxslots, cachethis.
+ */
+static __be32
+nfsd4_decode_sequence(struct nfsd4_compoundargs *argp,
+ struct nfsd4_sequence *seq)
+{
+ DECODE_HEAD;
+
+ /* One reservation covers the session id plus the four words (16 bytes) */
+ READ_BUF(NFS4_MAX_SESSIONID_LEN + 16);
+ COPYMEM(seq->sessionid.data, NFS4_MAX_SESSIONID_LEN);
+ READ32(seq->seqid);
+ READ32(seq->slotid);
+ READ32(seq->maxslots);
+ READ32(seq->cachethis);
+
DECODE_TAIL;
}
@@ -1005,7 +1282,7 @@ nfsd4_decode_noop(struct nfsd4_compoundargs *argp, void *p)
static __be32
nfsd4_decode_notsupp(struct nfsd4_compoundargs *argp, void *p)
{
- return nfserr_opnotsupp;
+ return nfserr_notsupp;
}
typedef __be32(*nfsd4_dec)(struct nfsd4_compoundargs *argp, void *);
@@ -1031,7 +1308,7 @@ static nfsd4_dec nfsd4_dec_ops[] = {
[OP_OPEN_CONFIRM] = (nfsd4_dec)nfsd4_decode_open_confirm,
[OP_OPEN_DOWNGRADE] = (nfsd4_dec)nfsd4_decode_open_downgrade,
[OP_PUTFH] = (nfsd4_dec)nfsd4_decode_putfh,
- [OP_PUTPUBFH] = (nfsd4_dec)nfsd4_decode_notsupp,
+ [OP_PUTPUBFH] = (nfsd4_dec)nfsd4_decode_noop,
[OP_PUTROOTFH] = (nfsd4_dec)nfsd4_decode_noop,
[OP_READ] = (nfsd4_dec)nfsd4_decode_read,
[OP_READDIR] = (nfsd4_dec)nfsd4_decode_readdir,
@@ -1050,6 +1327,67 @@ static nfsd4_dec nfsd4_dec_ops[] = {
[OP_RELEASE_LOCKOWNER] = (nfsd4_dec)nfsd4_decode_release_lockowner,
};
+/*
+ * Argument decoders for minorversion 1, indexed by operation number.
+ * Operations mapped to nfsd4_decode_notsupp are not decoded.
+ *
+ * Entries use the standard "[index] = value" designated-initializer form,
+ * matching nfsd4_dec_ops, rather than the obsolete GNU "[index] value" form.
+ */
+static nfsd4_dec nfsd41_dec_ops[] = {
+	[OP_ACCESS] = (nfsd4_dec)nfsd4_decode_access,
+	[OP_CLOSE] = (nfsd4_dec)nfsd4_decode_close,
+	[OP_COMMIT] = (nfsd4_dec)nfsd4_decode_commit,
+	[OP_CREATE] = (nfsd4_dec)nfsd4_decode_create,
+	[OP_DELEGPURGE] = (nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_DELEGRETURN] = (nfsd4_dec)nfsd4_decode_delegreturn,
+	[OP_GETATTR] = (nfsd4_dec)nfsd4_decode_getattr,
+	[OP_GETFH] = (nfsd4_dec)nfsd4_decode_noop,
+	[OP_LINK] = (nfsd4_dec)nfsd4_decode_link,
+	[OP_LOCK] = (nfsd4_dec)nfsd4_decode_lock,
+	[OP_LOCKT] = (nfsd4_dec)nfsd4_decode_lockt,
+	[OP_LOCKU] = (nfsd4_dec)nfsd4_decode_locku,
+	[OP_LOOKUP] = (nfsd4_dec)nfsd4_decode_lookup,
+	[OP_LOOKUPP] = (nfsd4_dec)nfsd4_decode_noop,
+	[OP_NVERIFY] = (nfsd4_dec)nfsd4_decode_verify,
+	[OP_OPEN] = (nfsd4_dec)nfsd4_decode_open,
+	[OP_OPENATTR] = (nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_OPEN_CONFIRM] = (nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_OPEN_DOWNGRADE] = (nfsd4_dec)nfsd4_decode_open_downgrade,
+	[OP_PUTFH] = (nfsd4_dec)nfsd4_decode_putfh,
+	[OP_PUTPUBFH] = (nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_PUTROOTFH] = (nfsd4_dec)nfsd4_decode_noop,
+	[OP_READ] = (nfsd4_dec)nfsd4_decode_read,
+	[OP_READDIR] = (nfsd4_dec)nfsd4_decode_readdir,
+	[OP_READLINK] = (nfsd4_dec)nfsd4_decode_noop,
+	[OP_REMOVE] = (nfsd4_dec)nfsd4_decode_remove,
+	[OP_RENAME] = (nfsd4_dec)nfsd4_decode_rename,
+	[OP_RENEW] = (nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_RESTOREFH] = (nfsd4_dec)nfsd4_decode_noop,
+	[OP_SAVEFH] = (nfsd4_dec)nfsd4_decode_noop,
+	[OP_SECINFO] = (nfsd4_dec)nfsd4_decode_secinfo,
+	[OP_SETATTR] = (nfsd4_dec)nfsd4_decode_setattr,
+	[OP_SETCLIENTID] = (nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_SETCLIENTID_CONFIRM] = (nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_VERIFY] = (nfsd4_dec)nfsd4_decode_verify,
+	[OP_WRITE] = (nfsd4_dec)nfsd4_decode_write,
+	[OP_RELEASE_LOCKOWNER] = (nfsd4_dec)nfsd4_decode_notsupp,
+
+	/* new operations for NFSv4.1 */
+	[OP_BACKCHANNEL_CTL] = (nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_BIND_CONN_TO_SESSION] = (nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_EXCHANGE_ID] = (nfsd4_dec)nfsd4_decode_exchange_id,
+	[OP_CREATE_SESSION] = (nfsd4_dec)nfsd4_decode_create_session,
+	[OP_DESTROY_SESSION] = (nfsd4_dec)nfsd4_decode_destroy_session,
+	[OP_FREE_STATEID] = (nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_GET_DIR_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_GETDEVICEINFO] = (nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_GETDEVICELIST] = (nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_SECINFO_NO_NAME] = (nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_SEQUENCE] = (nfsd4_dec)nfsd4_decode_sequence,
+	[OP_SET_SSV] = (nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_TEST_STATEID] = (nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_WANT_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_DESTROY_CLIENTID] = (nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_RECLAIM_COMPLETE] = (nfsd4_dec)nfsd4_decode_notsupp,
+};
+
struct nfsd4_minorversion_ops {
nfsd4_dec *decoders;
int nops;
@@ -1057,6 +1395,7 @@ struct nfsd4_minorversion_ops {
static struct nfsd4_minorversion_ops nfsd4_minorversion[] = {
[0] = { nfsd4_dec_ops, ARRAY_SIZE(nfsd4_dec_ops) },
+ [1] = { nfsd41_dec_ops, ARRAY_SIZE(nfsd41_dec_ops) },
};
static __be32
@@ -1412,6 +1751,7 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
{
u32 bmval0 = bmval[0];
u32 bmval1 = bmval[1];
+ u32 bmval2 = bmval[2];
struct kstat stat;
struct svc_fh tempfh;
struct kstatfs statfs;
@@ -1425,12 +1765,16 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
int err;
int aclsupport = 0;
struct nfs4_acl *acl = NULL;
+ struct nfsd4_compoundres *resp = rqstp->rq_resp;
+ u32 minorversion = resp->cstate.minorversion;
BUG_ON(bmval1 & NFSD_WRITEONLY_ATTRS_WORD1);
- BUG_ON(bmval0 & ~NFSD_SUPPORTED_ATTRS_WORD0);
- BUG_ON(bmval1 & ~NFSD_SUPPORTED_ATTRS_WORD1);
+ BUG_ON(bmval0 & ~nfsd_suppattrs0(minorversion));
+ BUG_ON(bmval1 & ~nfsd_suppattrs1(minorversion));
+ BUG_ON(bmval2 & ~nfsd_suppattrs2(minorversion));
if (exp->ex_fslocs.migrated) {
+ BUG_ON(bmval[2]);
status = fattr_handle_absent_fs(&bmval0, &bmval1, &rdattr_err);
if (status)
goto out;
@@ -1476,22 +1820,42 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
if ((buflen -= 16) < 0)
goto out_resource;
- WRITE32(2);
- WRITE32(bmval0);
- WRITE32(bmval1);
+ if (unlikely(bmval2)) {
+ WRITE32(3);
+ WRITE32(bmval0);
+ WRITE32(bmval1);
+ WRITE32(bmval2);
+ } else if (likely(bmval1)) {
+ WRITE32(2);
+ WRITE32(bmval0);
+ WRITE32(bmval1);
+ } else {
+ WRITE32(1);
+ WRITE32(bmval0);
+ }
attrlenp = p++; /* to be backfilled later */
if (bmval0 & FATTR4_WORD0_SUPPORTED_ATTRS) {
- u32 word0 = NFSD_SUPPORTED_ATTRS_WORD0;
+ u32 word0 = nfsd_suppattrs0(minorversion);
+ u32 word1 = nfsd_suppattrs1(minorversion);
+ u32 word2 = nfsd_suppattrs2(minorversion);
+
if ((buflen -= 12) < 0)
goto out_resource;
if (!aclsupport)
word0 &= ~FATTR4_WORD0_ACL;
if (!exp->ex_fslocs.locations)
word0 &= ~FATTR4_WORD0_FS_LOCATIONS;
- WRITE32(2);
- WRITE32(word0);
- WRITE32(NFSD_SUPPORTED_ATTRS_WORD1);
+ if (!word2) {
+ WRITE32(2);
+ WRITE32(word0);
+ WRITE32(word1);
+ } else {
+ WRITE32(3);
+ WRITE32(word0);
+ WRITE32(word1);
+ WRITE32(word2);
+ }
}
if (bmval0 & FATTR4_WORD0_TYPE) {
if ((buflen -= 4) < 0)
@@ -1801,6 +2165,13 @@ out_acl:
}
WRITE64(stat.ino);
}
+ if (bmval2 & FATTR4_WORD2_SUPPATTR_EXCLCREAT) {
+ WRITE32(3);
+ WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD0);
+ WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD1);
+ WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD2);
+ }
+
*attrlenp = htonl((char *)p - (char *)attrlenp - 4);
*countp = p - buffer;
status = nfs_ok;
@@ -2572,6 +2943,143 @@ nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_w
}
+/*
+ * Encode the result of an EXCHANGE_ID operation.
+ *
+ * A non-zero nfserr is returned unchanged and nothing is encoded.  Otherwise
+ * the reply is written in this order: client id, sequence id, flags, the
+ * state protection type, the server owner (minor id, then major id), the
+ * server scope and an empty implementation id array; 0 is then returned.
+ *
+ * NOTE(review): nfserr is declared int here, while the nfsd4_enc dispatch
+ * type this function is cast to passes a __be32 status -- confirm whether
+ * the parameter should be __be32.
+ */
static __be32
+nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, int nfserr,
+ struct nfsd4_exchange_id *exid)
+{
+ ENCODE_HEAD;
+ char *major_id;
+ char *server_scope;
+ int major_id_sz;
+ int server_scope_sz;
+ uint64_t minor_id = 0;
+
+ if (nfserr)
+ return nfserr;
+
+ /* The major id and the server scope are both the host's nodename. */
+ major_id = utsname()->nodename;
+ major_id_sz = strlen(major_id);
+ server_scope = utsname()->nodename;
+ server_scope_sz = strlen(server_scope);
+
+ /* One reservation covering every field written below. */
+ RESERVE_SPACE(
+ 8 /* eir_clientid */ +
+ 4 /* eir_sequenceid */ +
+ 4 /* eir_flags */ +
+ 4 /* spr_how (SP4_NONE) */ +
+ 8 /* so_minor_id */ +
+ 4 /* so_major_id.len */ +
+ (XDR_QUADLEN(major_id_sz) * 4) +
+ 4 /* eir_server_scope.len */ +
+ (XDR_QUADLEN(server_scope_sz) * 4) +
+ 4 /* eir_server_impl_id.count (0) */);
+
+ WRITEMEM(&exid->clientid, 8);
+ WRITE32(exid->seqid);
+ WRITE32(exid->flags);
+
+ /* state_protect4_r. Currently only support SP4_NONE */
+ BUG_ON(exid->spa_how != SP4_NONE);
+ WRITE32(exid->spa_how);
+
+ /* The server_owner struct */
+ WRITE64(minor_id); /* Minor id */
+ /* major id */
+ WRITE32(major_id_sz);
+ WRITEMEM(major_id, major_id_sz);
+
+ /* Server scope */
+ WRITE32(server_scope_sz);
+ WRITEMEM(server_scope, server_scope_sz);
+
+ /* Implementation id */
+ WRITE32(0); /* zero length nfs_impl_id4 array */
+ ADJUST_ARGS();
+ return 0;
+}
+
+/*
+ * Encode the result of a CREATE_SESSION operation.
+ *
+ * A non-zero nfserr is returned unchanged and nothing is encoded.  Otherwise
+ * the session id, sequence id and flags are written, followed by the fore
+ * channel attributes and then the back channel attributes; 0 is returned.
+ * For each channel the rdma_attrs word is emitted only when nr_rdma_attrs is
+ * non-zero.
+ */
+static __be32
+nfsd4_encode_create_session(struct nfsd4_compoundres *resp, int nfserr,
+ struct nfsd4_create_session *sess)
+{
+ ENCODE_HEAD;
+
+ if (nfserr)
+ return nfserr;
+
+ /*
+  * Session id plus two 32-bit words.
+  * NOTE(review): 24 assumes NFS4_MAX_SESSIONID_LEN is 16 -- confirm.
+  */
+ RESERVE_SPACE(24);
+ WRITEMEM(sess->sessionid.data, NFS4_MAX_SESSIONID_LEN);
+ WRITE32(sess->seqid);
+ WRITE32(sess->flags);
+ ADJUST_ARGS();
+
+ /* Fore channel attributes: seven 32-bit words. */
+ RESERVE_SPACE(28);
+ WRITE32(0); /* headerpadsz */
+ WRITE32(sess->fore_channel.maxreq_sz);
+ WRITE32(sess->fore_channel.maxresp_sz);
+ WRITE32(sess->fore_channel.maxresp_cached);
+ WRITE32(sess->fore_channel.maxops);
+ WRITE32(sess->fore_channel.maxreqs);
+ WRITE32(sess->fore_channel.nr_rdma_attrs);
+ ADJUST_ARGS();
+
+ if (sess->fore_channel.nr_rdma_attrs) {
+ RESERVE_SPACE(4);
+ WRITE32(sess->fore_channel.rdma_attrs);
+ ADJUST_ARGS();
+ }
+
+ /* Back channel attributes: same layout as the fore channel. */
+ RESERVE_SPACE(28);
+ WRITE32(0); /* headerpadsz */
+ WRITE32(sess->back_channel.maxreq_sz);
+ WRITE32(sess->back_channel.maxresp_sz);
+ WRITE32(sess->back_channel.maxresp_cached);
+ WRITE32(sess->back_channel.maxops);
+ WRITE32(sess->back_channel.maxreqs);
+ WRITE32(sess->back_channel.nr_rdma_attrs);
+ ADJUST_ARGS();
+
+ if (sess->back_channel.nr_rdma_attrs) {
+ RESERVE_SPACE(4);
+ WRITE32(sess->back_channel.rdma_attrs);
+ ADJUST_ARGS();
+ }
+ return 0;
+}
+
+/*
+ * Encode the result of a DESTROY_SESSION operation.
+ *
+ * Nothing is written to the reply here; the incoming status is simply
+ * handed back to the caller.
+ */
+static __be32
+nfsd4_encode_destroy_session(struct nfsd4_compoundres *resp, int nfserr,
+ struct nfsd4_destroy_session *destroy_session)
+{
+ return nfserr;
+}
+
+/*
+ * Encode the result of a SEQUENCE operation.
+ *
+ * A non-zero nfserr is returned unchanged and nothing is encoded.  Otherwise
+ * the session id is written, followed by five 32-bit words: seqid, slotid,
+ * maxslots, maxslots again (standing in for the target value) and a zero
+ * status-flags word; 0 is then returned.
+ *
+ * Unlike the neighbouring encoders this function is not static.
+ */
+__be32
+nfsd4_encode_sequence(struct nfsd4_compoundres *resp, int nfserr,
+ struct nfsd4_sequence *seq)
+{
+ ENCODE_HEAD;
+
+ if (nfserr)
+ return nfserr;
+
+ /* Session id plus the five 32-bit words written below. */
+ RESERVE_SPACE(NFS4_MAX_SESSIONID_LEN + 20);
+ WRITEMEM(seq->sessionid.data, NFS4_MAX_SESSIONID_LEN);
+ WRITE32(seq->seqid);
+ WRITE32(seq->slotid);
+ WRITE32(seq->maxslots);
+ /*
+ * FIXME: for now:
+ * target_maxslots = maxslots
+ * status_flags = 0
+ */
+ WRITE32(seq->maxslots);
+ WRITE32(0);
+
+ ADJUST_ARGS();
+ return 0;
+}
+
+static __be32
nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p)
{
return nfserr;
@@ -2579,6 +3087,11 @@ nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p)
typedef __be32(* nfsd4_enc)(struct nfsd4_compoundres *, __be32, void *);
+/*
+ * Note: nfsd4_enc_ops vector is shared for v4.0 and v4.1
+ * since we don't need to filter out obsolete ops as this is
+ * done in the decoding phase.
+ */
static nfsd4_enc nfsd4_enc_ops[] = {
[OP_ACCESS] = (nfsd4_enc)nfsd4_encode_access,
[OP_CLOSE] = (nfsd4_enc)nfsd4_encode_close,
@@ -2617,8 +3130,77 @@ static nfsd4_enc nfsd4_enc_ops[] = {
[OP_VERIFY] = (nfsd4_enc)nfsd4_encode_noop,
[OP_WRITE] = (nfsd4_enc)nfsd4_encode_write,
[OP_RELEASE_LOCKOWNER] = (nfsd4_enc)nfsd4_encode_noop,
+
+ /* NFSv4.1 operations */
+ [OP_BACKCHANNEL_CTL] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_BIND_CONN_TO_SESSION] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_EXCHANGE_ID] = (nfsd4_enc)nfsd4_encode_exchange_id,
+ [OP_CREATE_SESSION] = (nfsd4_enc)nfsd4_encode_create_session,
+ [OP_DESTROY_SESSION] = (nfsd4_enc)nfsd4_encode_destroy_session,
+ [OP_FREE_STATEID] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_GET_DIR_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_SECINFO_NO_NAME] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_SEQUENCE] = (nfsd4_enc)nfsd4_encode_sequence,
+ [OP_SET_SSV] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_TEST_STATEID] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_WANT_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_DESTROY_CLIENTID] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_RECLAIM_COMPLETE] = (nfsd4_enc)nfsd4_encode_noop,
};
+/*
+ * Calculate the total amount of memory that the compound response has taken
+ * after encoding the current operation, and compare it with the session's
+ * se_fmaxresp_cached.
+ *
+ * pad: add on 8 bytes for the next operation's op_code and status so that
+ * there is room to cache a failure on the next operation.  No pad is added
+ * when the current operation is the last one in the compound.
+ *
+ * Our se_fmaxresp_cached will always be a multiple of PAGE_SIZE, and so
+ * will be at least a page and will therefore hold the xdr_buf head.
+ *
+ * Returns nfs_ok when the compound has no session, the slot did not ask for
+ * the reply to be cached, or the reply fits; otherwise returns
+ * nfserr_rep_too_big_to_cache.  The result is an NFS status, so it is typed
+ * __be32 rather than int.
+ */
+static __be32 nfsd4_check_drc_limit(struct nfsd4_compoundres *resp)
+{
+ struct xdr_buf *xb = &resp->rqstp->rq_res;
+ struct nfsd4_compoundargs *args = resp->rqstp->rq_argp;
+ struct nfsd4_session *session = NULL;
+ struct nfsd4_slot *slot = resp->cstate.slot;
+ u32 length, tlen = 0, pad = 8;
+
+ if (!nfsd4_has_session(&resp->cstate))
+ return nfs_ok;
+
+ session = resp->cstate.session;
+ if (session == NULL || slot->sl_cache_entry.ce_cachethis == 0)
+ return nfs_ok;
+
+ if (resp->opcnt >= args->opcnt)
+ pad = 0; /* this is the last operation */
+
+ if (xb->page_len == 0) {
+ /* Everything encoded so far lives in the head iovec. */
+ length = (char *)resp->p - (char *)xb->head[0].iov_base + pad;
+ } else {
+ /* Head and pages are counted whole; only the used part of the tail. */
+ if (xb->tail[0].iov_base && xb->tail[0].iov_len > 0)
+ tlen = (char *)resp->p - (char *)xb->tail[0].iov_base;
+
+ length = xb->head[0].iov_len + xb->page_len + tlen + pad;
+ }
+ dprintk("%s length %u, xb->page_len %u tlen %u pad %u\n", __func__,
+ length, xb->page_len, tlen, pad);
+
+ if (length <= session->se_fmaxresp_cached)
+ return nfs_ok;
+
+ return nfserr_rep_too_big_to_cache;
+}
+
void
nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op)
{
@@ -2635,6 +3217,9 @@ nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op)
BUG_ON(op->opnum < 0 || op->opnum >= ARRAY_SIZE(nfsd4_enc_ops) ||
!nfsd4_enc_ops[op->opnum]);
op->status = nfsd4_enc_ops[op->opnum](resp, op->status, &op->u);
+ /* nfsd4_check_drc_limit guarantees enough room for error status */
+ if (!op->status && nfsd4_check_drc_limit(resp))
+ op->status = nfserr_rep_too_big_to_cache;
status:
/*
* Note: We write the status directly, instead of using WRITE32(),
@@ -2735,6 +3320,18 @@ nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compo
iov = &rqstp->rq_res.head[0];
iov->iov_len = ((char*)resp->p) - (char*)iov->iov_base;
BUG_ON(iov->iov_len > PAGE_SIZE);
+ if (nfsd4_has_session(&resp->cstate)) {
+ if (resp->cstate.status == nfserr_replay_cache &&
+ !nfsd4_not_cached(resp)) {
+ iov->iov_len = resp->cstate.iovlen;
+ } else {
+ nfsd4_store_cache_entry(resp);
+ dprintk("%s: SET SLOT STATE TO AVAILABLE\n", __func__);
+ resp->cstate.slot->sl_inuse = 0;
+ }
+ if (resp->cstate.session)
+ nfsd4_put_session(resp->cstate.session);
+ }
return 1;
}
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index a4ed8644d69c..af16849d243a 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -60,6 +60,7 @@ enum {
NFSD_FO_UnlockFS,
NFSD_Threads,
NFSD_Pool_Threads,
+ NFSD_Pool_Stats,
NFSD_Versions,
NFSD_Ports,
NFSD_MaxBlkSize,
@@ -172,6 +173,16 @@ static const struct file_operations exports_operations = {
.owner = THIS_MODULE,
};
+extern int nfsd_pool_stats_open(struct inode *inode, struct file *file);
+
+/*
+ * File operations for the "pool_stats" entry: opened through
+ * nfsd_pool_stats_open() and then served by the generic seq_file helpers.
+ * The table is never modified, so it is const like exports_operations.
+ */
+static const struct file_operations pool_stats_operations = {
+ .open = nfsd_pool_stats_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+ .owner = THIS_MODULE,
+};
+
/*----------------------------------------------------------------------------*/
/*
* payload - write methods
@@ -781,8 +792,9 @@ out_free:
static ssize_t __write_versions(struct file *file, char *buf, size_t size)
{
char *mesg = buf;
- char *vers, sign;
+ char *vers, *minorp, sign;
int len, num;
+ unsigned minor;
ssize_t tlen = 0;
char *sep;
@@ -803,9 +815,20 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size)
do {
sign = *vers;
if (sign == '+' || sign == '-')
- num = simple_strtol((vers+1), NULL, 0);
+ num = simple_strtol((vers+1), &minorp, 0);
else
- num = simple_strtol(vers, NULL, 0);
+ num = simple_strtol(vers, &minorp, 0);
+ if (*minorp == '.') {
+ if (num < 4)
+ return -EINVAL;
+ minor = simple_strtoul(minorp+1, NULL, 0);
+ if (minor == 0)
+ return -EINVAL;
+ if (nfsd_minorversion(minor, sign == '-' ?
+ NFSD_CLEAR : NFSD_SET) < 0)
+ return -EINVAL;
+ goto next;
+ }
switch(num) {
case 2:
case 3:
@@ -815,6 +838,7 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size)
default:
return -EINVAL;
}
+ next:
vers += len + 1;
tlen += len;
} while ((len = qword_get(&mesg, vers, size)) > 0);
@@ -833,6 +857,13 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size)
num);
sep = " ";
}
+ if (nfsd_vers(4, NFSD_AVAIL))
+ for (minor = 1; minor <= NFSD_SUPPORTED_MINOR_VERSION; minor++)
+ len += sprintf(buf+len, " %c4.%u",
+ (nfsd_vers(4, NFSD_TEST) &&
+ nfsd_minorversion(minor, NFSD_TEST)) ?
+ '+' : '-',
+ minor);
len += sprintf(buf+len, "\n");
return len;
}
@@ -1248,6 +1279,7 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
[NFSD_Fh] = {"filehandle", &transaction_ops, S_IWUSR|S_IRUSR},
[NFSD_Threads] = {"threads", &transaction_ops, S_IWUSR|S_IRUSR},
[NFSD_Pool_Threads] = {"pool_threads", &transaction_ops, S_IWUSR|S_IRUSR},
+ [NFSD_Pool_Stats] = {"pool_stats", &pool_stats_operations, S_IRUGO},
[NFSD_Versions] = {"versions", &transaction_ops, S_IWUSR|S_IRUSR},
[NFSD_Ports] = {"portlist", &transaction_ops, S_IWUSR|S_IRUGO},
[NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO},
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index 6f7f26351227..e298e260b5f1 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -180,6 +180,7 @@ nfsd_proc_write(struct svc_rqst *rqstp, struct nfsd_writeargs *argp,
{
__be32 nfserr;
int stable = 1;
+ unsigned long cnt = argp->len;
dprintk("nfsd: WRITE %s %d bytes at %d\n",
SVCFH_fmt(&argp->fh),
@@ -188,7 +189,7 @@ nfsd_proc_write(struct svc_rqst *rqstp, struct nfsd_writeargs *argp,
nfserr = nfsd_write(rqstp, fh_copy(&resp->fh, &argp->fh), NULL,
argp->offset,
rqstp->rq_vec, argp->vlen,
- argp->len,
+ &cnt,
&stable);
return nfsd_return_attrs(nfserr, resp);
}
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 7c09852be713..cbba4a935786 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -22,6 +22,7 @@
#include <linux/freezer.h>
#include <linux/fs_struct.h>
#include <linux/kthread.h>
+#include <linux/swap.h>
#include <linux/sunrpc/types.h>
#include <linux/sunrpc/stats.h>
@@ -40,9 +41,6 @@
extern struct svc_program nfsd_program;
static int nfsd(void *vrqstp);
struct timeval nfssvc_boot;
-static atomic_t nfsd_busy;
-static unsigned long nfsd_last_call;
-static DEFINE_SPINLOCK(nfsd_call_lock);
/*
* nfsd_mutex protects nfsd_serv -- both the pointer itself and the members
@@ -123,6 +121,8 @@ struct svc_program nfsd_program = {
};
+u32 nfsd_supported_minorversion;
+
int nfsd_vers(int vers, enum vers_op change)
{
if (vers < NFSD_MINVERS || vers >= NFSD_NRVERS)
@@ -149,6 +149,28 @@ int nfsd_vers(int vers, enum vers_op change)
}
return 0;
}
+
+/*
+ * Query or change the highest enabled NFSv4 minor version.
+ *
+ * Any minorversion above NFSD_SUPPORTED_MINOR_VERSION is rejected with -1.
+ *   NFSD_TEST:  returns whether minorversion is currently enabled.
+ *   NFSD_AVAIL: returns whether minorversion is within the supported range.
+ *   NFSD_SET:   enables everything up to and including minorversion.
+ *   NFSD_CLEAR: enables everything below minorversion; minor version 0
+ *               cannot be cleared and yields -1.
+ * NFSD_SET and NFSD_CLEAR return 0 on success.
+ */
+int nfsd_minorversion(u32 minorversion, enum vers_op change)
+{
+ if (minorversion > NFSD_SUPPORTED_MINOR_VERSION)
+ return -1;
+
+ /* Pure queries first: they never touch the global state. */
+ if (change == NFSD_TEST)
+ return minorversion <= nfsd_supported_minorversion;
+ if (change == NFSD_AVAIL)
+ return minorversion <= NFSD_SUPPORTED_MINOR_VERSION;
+
+ if (change == NFSD_SET) {
+ nfsd_supported_minorversion = minorversion;
+ } else if (change == NFSD_CLEAR) {
+ if (minorversion == 0)
+ return -1;
+ nfsd_supported_minorversion = minorversion - 1;
+ }
+ return 0;
+}
+
/*
* Maximum number of nfsd processes
*/
@@ -200,6 +222,28 @@ void nfsd_reset_versions(void)
}
}
+/*
+ * Each session guarantees a negotiated per slot memory cache for replies
+ * which in turn consumes memory beyond the v2/v3/v4.0 server. A dedicated
+ * NFSv4.1 server might want to use more memory for a DRC than a machine
+ * with multiple services.
+ *
+ * Impose a hard limit on the number of pages for the DRC which varies
+ * according to the machine's free pages. This is of course only a default.
+ *
+ * For now this is a #defined shift which could be under admin control
+ * in the future.
+ */
+static void set_max_drc(void)
+{
+ /* Right shift applied to nr_free_buffer_pages() to size the V4.1 server DRC */
+ #define NFSD_DRC_SIZE_SHIFT 7
+ nfsd_serv->sv_drc_max_pages = nr_free_buffer_pages()
+ >> NFSD_DRC_SIZE_SHIFT;
+ nfsd_serv->sv_drc_pages_used = 0;
+ dprintk("%s svc_drc_max_pages %u\n", __func__,
+ nfsd_serv->sv_drc_max_pages);
+}
int nfsd_create_serv(void)
{
@@ -227,11 +271,12 @@ int nfsd_create_serv(void)
nfsd_max_blksize /= 2;
}
- atomic_set(&nfsd_busy, 0);
nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize,
nfsd_last_thread, nfsd, THIS_MODULE);
if (nfsd_serv == NULL)
err = -ENOMEM;
+ else
+ set_max_drc();
do_gettimeofday(&nfssvc_boot); /* record boot time */
return err;
@@ -375,26 +420,6 @@ nfsd_svc(unsigned short port, int nrservs)
return error;
}
-static inline void
-update_thread_usage(int busy_threads)
-{
- unsigned long prev_call;
- unsigned long diff;
- int decile;
-
- spin_lock(&nfsd_call_lock);
- prev_call = nfsd_last_call;
- nfsd_last_call = jiffies;
- decile = busy_threads*10/nfsdstats.th_cnt;
- if (decile>0 && decile <= 10) {
- diff = nfsd_last_call - prev_call;
- if ( (nfsdstats.th_usage[decile-1] += diff) >= NFSD_USAGE_WRAP)
- nfsdstats.th_usage[decile-1] -= NFSD_USAGE_WRAP;
- if (decile == 10)
- nfsdstats.th_fullcnt++;
- }
- spin_unlock(&nfsd_call_lock);
-}
/*
* This is the NFS server kernel thread
@@ -460,8 +485,6 @@ nfsd(void *vrqstp)
continue;
}
- update_thread_usage(atomic_read(&nfsd_busy));
- atomic_inc(&nfsd_busy);
/* Lock the export hash tables for reading. */
exp_readlock();
@@ -470,8 +493,6 @@ nfsd(void *vrqstp)
/* Unlock export hash tables */
exp_readunlock();
- update_thread_usage(atomic_read(&nfsd_busy));
- atomic_dec(&nfsd_busy);
}
/* Clear signals before calling svc_exit_thread() */
@@ -539,6 +560,10 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
+ rqstp->rq_res.head[0].iov_len;
rqstp->rq_res.head[0].iov_len += sizeof(__be32);
+ /* NFSv4.1 DRC requires statp */
+ if (rqstp->rq_vers == 4)
+ nfsd4_set_statp(rqstp, statp);
+
/* Now call the procedure handler, and encode NFS status. */
nfserr = proc->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp);
nfserr = map_new_errors(rqstp->rq_vers, nfserr);
@@ -570,3 +595,10 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
nfsd_cache_update(rqstp, proc->pc_cachetype, statp + 1);
return 1;
}
+
+/*
+ * Open handler for the "pool_stats" file.
+ *
+ * Returns -ENODEV when no nfsd service exists, otherwise the result of
+ * svc_pool_stats_open().
+ *
+ * nfsd_serv is protected by nfsd_mutex, so the NULL check and the hand-off
+ * to svc_pool_stats_open() are done with the mutex held; otherwise the
+ * pointer could change between the test and the use.
+ *
+ * NOTE(review): the mutex is dropped before the file is read, so this does
+ * not by itself keep the service alive for the lifetime of the open file --
+ * confirm whether a reference should also be taken here.
+ */
+int nfsd_pool_stats_open(struct inode *inode, struct file *file)
+{
+ int ret;
+
+ mutex_lock(&nfsd_mutex);
+ if (nfsd_serv == NULL)
+ ret = -ENODEV;
+ else
+ ret = svc_pool_stats_open(nfsd_serv, file);
+ mutex_unlock(&nfsd_mutex);
+
+ return ret;
+}
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 78376b6c0236..ab93fcfef254 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -366,8 +366,9 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
}
/* Revoke setuid/setgid on chown */
- if (((iap->ia_valid & ATTR_UID) && iap->ia_uid != inode->i_uid) ||
- ((iap->ia_valid & ATTR_GID) && iap->ia_gid != inode->i_gid)) {
+ if (!S_ISDIR(inode->i_mode) &&
+ (((iap->ia_valid & ATTR_UID) && iap->ia_uid != inode->i_uid) ||
+ ((iap->ia_valid & ATTR_GID) && iap->ia_gid != inode->i_gid))) {
iap->ia_valid |= ATTR_KILL_PRIV;
if (iap->ia_valid & ATTR_MODE) {
/* we're setting mode too, just clear the s*id bits */
@@ -960,7 +961,7 @@ static void kill_suid(struct dentry *dentry)
static __be32
nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
loff_t offset, struct kvec *vec, int vlen,
- unsigned long cnt, int *stablep)
+ unsigned long *cnt, int *stablep)
{
struct svc_export *exp;
struct dentry *dentry;
@@ -974,7 +975,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
err = nfserr_perm;
if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) &&
- (!lock_may_write(file->f_path.dentry->d_inode, offset, cnt)))
+ (!lock_may_write(file->f_path.dentry->d_inode, offset, *cnt)))
goto out;
#endif
@@ -1009,7 +1010,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &offset);
set_fs(oldfs);
if (host_err >= 0) {
- nfsdstats.io_write += cnt;
+ nfsdstats.io_write += host_err;
fsnotify_modify(file->f_path.dentry);
}
@@ -1054,9 +1055,10 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
}
dprintk("nfsd: write complete host_err=%d\n", host_err);
- if (host_err >= 0)
+ if (host_err >= 0) {
err = 0;
- else
+ *cnt = host_err;
+ } else
err = nfserrno(host_err);
out:
return err;
@@ -1098,7 +1100,7 @@ out:
*/
__be32
nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
- loff_t offset, struct kvec *vec, int vlen, unsigned long cnt,
+ loff_t offset, struct kvec *vec, int vlen, unsigned long *cnt,
int *stablep)
{
__be32 err = 0;
@@ -1179,6 +1181,21 @@ nfsd_create_setattr(struct svc_rqst *rqstp, struct svc_fh *resfhp,
return 0;
}
+/*
+ * Drop a redundant "set size to 0" request for a file that was just created.
+ *
+ * The HPUX client sometimes creates a file with mode 000 and also asks for
+ * its size to be set to 0.  On some file systems that resize fails, because
+ * the permission check wants WRITE access and the mode grants none.  A newly
+ * created file already has size 0, so the request can safely be ignored.
+ *
+ * Only call this after vfs_create() has been called.
+ */
+static void
+nfsd_check_ignore_resizing(struct iattr *iap)
+{
+ if (!(iap->ia_valid & ATTR_SIZE))
+ return;
+ if (iap->ia_size != 0)
+ return;
+
+ iap->ia_valid &= ~ATTR_SIZE;
+}
+
/*
* Create a file (regular, directory, device, fifo); UNIX sockets
* not yet implemented.
@@ -1274,6 +1291,8 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
switch (type) {
case S_IFREG:
host_err = vfs_create(dirp, dchild, iap->ia_mode, NULL);
+ if (!host_err)
+ nfsd_check_ignore_resizing(iap);
break;
case S_IFDIR:
host_err = vfs_mkdir(dirp, dchild, iap->ia_mode);
@@ -1427,6 +1446,8 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
/* setattr will sync the child (or not) */
}
+ nfsd_check_ignore_resizing(iap);
+
if (createmode == NFS3_CREATE_EXCLUSIVE) {
/* Cram the verifier into atime/mtime */
iap->ia_valid = ATTR_MTIME|ATTR_ATIME
diff --git a/fs/romfs/Kconfig b/fs/romfs/Kconfig
index 1a17020f9faf..ce2d6bcc6266 100644
--- a/fs/romfs/Kconfig
+++ b/fs/romfs/Kconfig
@@ -1,6 +1,6 @@
config ROMFS_FS
tristate "ROM file system support"
- depends on BLOCK
+ depends on BLOCK || MTD
---help---
This is a very small read-only file system mainly intended for
initial ram disks of installation disks, but it could be used for
@@ -14,3 +14,49 @@ config ROMFS_FS
If you don't know whether you need it, then you don't need it:
answer N.
+
+#
+# Select the backing stores to be supported
+#
+choice
+ prompt "RomFS backing stores"
+ depends on ROMFS_FS
+ default ROMFS_BACKED_BY_BLOCK
+ help
+ Select the backing stores to be supported.
+
+config ROMFS_BACKED_BY_BLOCK
+ bool "Block device-backed ROM file system support"
+ depends on BLOCK
+ help
+ This permits ROMFS to use block devices buffered through the page
+ cache as the medium from which to retrieve data. It does not allow
+ direct mapping of the medium.
+
+ If unsure, answer Y.
+
+config ROMFS_BACKED_BY_MTD
+ bool "MTD-backed ROM file system support"
+ depends on MTD=y || (ROMFS_FS=m && MTD)
+ help
+ This permits ROMFS to use MTD based devices directly, without the
+ intercession of the block layer (which may have been disabled). It
+ also allows direct mapping of MTD devices through romfs files under
+ NOMMU conditions if the underlying device is directly addressable by
+ the CPU.
+
+ If unsure, answer Y.
+
+config ROMFS_BACKED_BY_BOTH
+ bool "Both the above"
+ depends on BLOCK && (MTD=y || (ROMFS_FS=m && MTD))
+endchoice
+
+
+config ROMFS_ON_BLOCK
+ bool
+ default y if ROMFS_BACKED_BY_BLOCK || ROMFS_BACKED_BY_BOTH
+
+config ROMFS_ON_MTD
+ bool
+ default y if ROMFS_BACKED_BY_MTD || ROMFS_BACKED_BY_BOTH
diff --git a/fs/romfs/Makefile b/fs/romfs/Makefile
index c95b21cf49a3..420beb7d495c 100644
--- a/fs/romfs/Makefile
+++ b/fs/romfs/Makefile
@@ -1,7 +1,12 @@
#
-# Makefile for the linux romfs filesystem routines.
+# Makefile for the linux RomFS filesystem routines.
#
obj-$(CONFIG_ROMFS_FS) += romfs.o
-romfs-objs := inode.o
+romfs-y := storage.o super.o
+
+ifneq ($(CONFIG_MMU),y)
+romfs-$(CONFIG_ROMFS_ON_MTD) += mmap-nommu.o
+endif
+
diff --git a/fs/romfs/inode.c b/fs/romfs/inode.c
deleted file mode 100644
index 98a232f7196b..000000000000
--- a/fs/romfs/inode.c
+++ /dev/null
@@ -1,665 +0,0 @@
-/*
- * ROMFS file system, Linux implementation
- *
- * Copyright (C) 1997-1999 Janos Farkas <chexum@shadow.banki.hu>
- *
- * Using parts of the minix filesystem
- * Copyright (C) 1991, 1992 Linus Torvalds
- *
- * and parts of the affs filesystem additionally
- * Copyright (C) 1993 Ray Burr
- * Copyright (C) 1996 Hans-Joachim Widmaier
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- * Changes
- * Changed for 2.1.19 modules
- * Jan 1997 Initial release
- * Jun 1997 2.1.43+ changes
- * Proper page locking in readpage
- * Changed to work with 2.1.45+ fs
- * Jul 1997 Fixed follow_link
- * 2.1.47
- * lookup shouldn't return -ENOENT
- * from Horst von Brand:
- * fail on wrong checksum
- * double unlock_super was possible
- * correct namelen for statfs
- * spotted by Bill Hawes:
- * readlink shouldn't iput()
- * Jun 1998 2.1.106 from Avery Pennarun: glibc scandir()
- * exposed a problem in readdir
- * 2.1.107 code-freeze spellchecker run
- * Aug 1998 2.1.118+ VFS changes
- * Sep 1998 2.1.122 another VFS change (follow_link)
- * Apr 1999 2.2.7 no more EBADF checking in
- * lookup/readdir, use ERR_PTR
- * Jun 1999 2.3.6 d_alloc_root use changed
- * 2.3.9 clean up usage of ENOENT/negative
- * dentries in lookup
- * clean up page flags setting
- * (error, uptodate, locking) in
- * in readpage
- * use init_special_inode for
- * fifos/sockets (and streamline) in
- * read_inode, fix _ops table order
- * Aug 1999 2.3.16 __initfunc() => __init change
- * Oct 1999 2.3.24 page->owner hack obsoleted
- * Nov 1999 2.3.27 2.3.25+ page->offset => index change
- */
-
-/* todo:
- * - see Documentation/filesystems/romfs.txt
- * - use allocated, not stack memory for file names?
- * - considering write access...
- * - network (tftp) files?
- * - merge back some _op tables
- */
-
-/*
- * Sorry about some optimizations and for some goto's. I just wanted
- * to squeeze some more bytes out of this code.. :)
- */
-
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/errno.h>
-#include <linux/slab.h>
-#include <linux/romfs_fs.h>
-#include <linux/fs.h>
-#include <linux/init.h>
-#include <linux/pagemap.h>
-#include <linux/smp_lock.h>
-#include <linux/buffer_head.h>
-#include <linux/vfs.h>
-
-#include <asm/uaccess.h>
-
-struct romfs_inode_info {
- unsigned long i_metasize; /* size of non-data area */
- unsigned long i_dataoffset; /* from the start of fs */
- struct inode vfs_inode;
-};
-
-static struct inode *romfs_iget(struct super_block *, unsigned long);
-
-/* instead of private superblock data */
-static inline unsigned long romfs_maxsize(struct super_block *sb)
-{
- return (unsigned long)sb->s_fs_info;
-}
-
-static inline struct romfs_inode_info *ROMFS_I(struct inode *inode)
-{
- return container_of(inode, struct romfs_inode_info, vfs_inode);
-}
-
-static __u32
-romfs_checksum(void *data, int size)
-{
- __u32 sum;
- __be32 *ptr;
-
- sum = 0; ptr = data;
- size>>=2;
- while (size>0) {
- sum += be32_to_cpu(*ptr++);
- size--;
- }
- return sum;
-}
-
-static const struct super_operations romfs_ops;
-
-static int romfs_fill_super(struct super_block *s, void *data, int silent)
-{
- struct buffer_head *bh;
- struct romfs_super_block *rsb;
- struct inode *root;
- int sz, ret = -EINVAL;
-
- /* I would parse the options here, but there are none.. :) */
-
- sb_set_blocksize(s, ROMBSIZE);
- s->s_maxbytes = 0xFFFFFFFF;
-
- bh = sb_bread(s, 0);
- if (!bh) {
- /* XXX merge with other printk? */
- printk ("romfs: unable to read superblock\n");
- goto outnobh;
- }
-
- rsb = (struct romfs_super_block *)bh->b_data;
- sz = be32_to_cpu(rsb->size);
- if (rsb->word0 != ROMSB_WORD0 || rsb->word1 != ROMSB_WORD1
- || sz < ROMFH_SIZE) {
- if (!silent)
- printk ("VFS: Can't find a romfs filesystem on dev "
- "%s.\n", s->s_id);
- goto out;
- }
- if (romfs_checksum(rsb, min_t(int, sz, 512))) {
- printk ("romfs: bad initial checksum on dev "
- "%s.\n", s->s_id);
- goto out;
- }
-
- s->s_magic = ROMFS_MAGIC;
- s->s_fs_info = (void *)(long)sz;
-
- s->s_flags |= MS_RDONLY;
-
- /* Find the start of the fs */
- sz = (ROMFH_SIZE +
- strnlen(rsb->name, ROMFS_MAXFN) + 1 + ROMFH_PAD)
- & ROMFH_MASK;
-
- s->s_op = &romfs_ops;
- root = romfs_iget(s, sz);
- if (IS_ERR(root)) {
- ret = PTR_ERR(root);
- goto out;
- }
-
- ret = -ENOMEM;
- s->s_root = d_alloc_root(root);
- if (!s->s_root)
- goto outiput;
-
- brelse(bh);
- return 0;
-
-outiput:
- iput(root);
-out:
- brelse(bh);
-outnobh:
- return ret;
-}
-
-/* That's simple too. */
-
-static int
-romfs_statfs(struct dentry *dentry, struct kstatfs *buf)
-{
- buf->f_type = ROMFS_MAGIC;
- buf->f_bsize = ROMBSIZE;
- buf->f_bfree = buf->f_bavail = buf->f_ffree;
- buf->f_blocks = (romfs_maxsize(dentry->d_sb)+ROMBSIZE-1)>>ROMBSBITS;
- buf->f_namelen = ROMFS_MAXFN;
- return 0;
-}
-
-/* some helper routines */
-
-static int
-romfs_strnlen(struct inode *i, unsigned long offset, unsigned long count)
-{
- struct buffer_head *bh;
- unsigned long avail, maxsize, res;
-
- maxsize = romfs_maxsize(i->i_sb);
- if (offset >= maxsize)
- return -1;
-
- /* strnlen is almost always valid */
- if (count > maxsize || offset+count > maxsize)
- count = maxsize-offset;
-
- bh = sb_bread(i->i_sb, offset>>ROMBSBITS);
- if (!bh)
- return -1; /* error */
-
- avail = ROMBSIZE - (offset & ROMBMASK);
- maxsize = min_t(unsigned long, count, avail);
- res = strnlen(((char *)bh->b_data)+(offset&ROMBMASK), maxsize);
- brelse(bh);
-
- if (res < maxsize)
- return res; /* found all of it */
-
- while (res < count) {
- offset += maxsize;
-
- bh = sb_bread(i->i_sb, offset>>ROMBSBITS);
- if (!bh)
- return -1;
- maxsize = min_t(unsigned long, count - res, ROMBSIZE);
- avail = strnlen(bh->b_data, maxsize);
- res += avail;
- brelse(bh);
- if (avail < maxsize)
- return res;
- }
- return res;
-}
-
-static int
-romfs_copyfrom(struct inode *i, void *dest, unsigned long offset, unsigned long count)
-{
- struct buffer_head *bh;
- unsigned long avail, maxsize, res;
-
- maxsize = romfs_maxsize(i->i_sb);
- if (offset >= maxsize || count > maxsize || offset+count>maxsize)
- return -1;
-
- bh = sb_bread(i->i_sb, offset>>ROMBSBITS);
- if (!bh)
- return -1; /* error */
-
- avail = ROMBSIZE - (offset & ROMBMASK);
- maxsize = min_t(unsigned long, count, avail);
- memcpy(dest, ((char *)bh->b_data) + (offset & ROMBMASK), maxsize);
- brelse(bh);
-
- res = maxsize; /* all of it */
-
- while (res < count) {
- offset += maxsize;
- dest += maxsize;
-
- bh = sb_bread(i->i_sb, offset>>ROMBSBITS);
- if (!bh)
- return -1;
- maxsize = min_t(unsigned long, count - res, ROMBSIZE);
- memcpy(dest, bh->b_data, maxsize);
- brelse(bh);
- res += maxsize;
- }
- return res;
-}
-
-static unsigned char romfs_dtype_table[] = {
- DT_UNKNOWN, DT_DIR, DT_REG, DT_LNK, DT_BLK, DT_CHR, DT_SOCK, DT_FIFO
-};
-
-static int
-romfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
-{
- struct inode *i = filp->f_path.dentry->d_inode;
- struct romfs_inode ri;
- unsigned long offset, maxoff;
- int j, ino, nextfh;
- int stored = 0;
- char fsname[ROMFS_MAXFN]; /* XXX dynamic? */
-
- lock_kernel();
-
- maxoff = romfs_maxsize(i->i_sb);
-
- offset = filp->f_pos;
- if (!offset) {
- offset = i->i_ino & ROMFH_MASK;
- if (romfs_copyfrom(i, &ri, offset, ROMFH_SIZE) <= 0)
- goto out;
- offset = be32_to_cpu(ri.spec) & ROMFH_MASK;
- }
-
- /* Not really failsafe, but we are read-only... */
- for(;;) {
- if (!offset || offset >= maxoff) {
- offset = maxoff;
- filp->f_pos = offset;
- goto out;
- }
- filp->f_pos = offset;
-
- /* Fetch inode info */
- if (romfs_copyfrom(i, &ri, offset, ROMFH_SIZE) <= 0)
- goto out;
-
- j = romfs_strnlen(i, offset+ROMFH_SIZE, sizeof(fsname)-1);
- if (j < 0)
- goto out;
-
- fsname[j]=0;
- romfs_copyfrom(i, fsname, offset+ROMFH_SIZE, j);
-
- ino = offset;
- nextfh = be32_to_cpu(ri.next);
- if ((nextfh & ROMFH_TYPE) == ROMFH_HRD)
- ino = be32_to_cpu(ri.spec);
- if (filldir(dirent, fsname, j, offset, ino,
- romfs_dtype_table[nextfh & ROMFH_TYPE]) < 0) {
- goto out;
- }
- stored++;
- offset = nextfh & ROMFH_MASK;
- }
-out:
- unlock_kernel();
- return stored;
-}
-
-static struct dentry *
-romfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
-{
- unsigned long offset, maxoff;
- long res;
- int fslen;
- struct inode *inode = NULL;
- char fsname[ROMFS_MAXFN]; /* XXX dynamic? */
- struct romfs_inode ri;
- const char *name; /* got from dentry */
- int len;
-
- res = -EACCES; /* placeholder for "no data here" */
- offset = dir->i_ino & ROMFH_MASK;
- lock_kernel();
- if (romfs_copyfrom(dir, &ri, offset, ROMFH_SIZE) <= 0)
- goto error;
-
- maxoff = romfs_maxsize(dir->i_sb);
- offset = be32_to_cpu(ri.spec) & ROMFH_MASK;
-
- /* OK, now find the file whose name is in "dentry" in the
- * directory specified by "dir". */
-
- name = dentry->d_name.name;
- len = dentry->d_name.len;
-
- for(;;) {
- if (!offset || offset >= maxoff)
- goto success; /* negative success */
- if (romfs_copyfrom(dir, &ri, offset, ROMFH_SIZE) <= 0)
- goto error;
-
- /* try to match the first 16 bytes of name */
- fslen = romfs_strnlen(dir, offset+ROMFH_SIZE, ROMFH_SIZE);
- if (len < ROMFH_SIZE) {
- if (len == fslen) {
- /* both are shorter, and same size */
- romfs_copyfrom(dir, fsname, offset+ROMFH_SIZE, len+1);
- if (strncmp (name, fsname, len) == 0)
- break;
- }
- } else if (fslen >= ROMFH_SIZE) {
- /* both are longer; XXX optimize max size */
- fslen = romfs_strnlen(dir, offset+ROMFH_SIZE, sizeof(fsname)-1);
- if (len == fslen) {
- romfs_copyfrom(dir, fsname, offset+ROMFH_SIZE, len+1);
- if (strncmp(name, fsname, len) == 0)
- break;
- }
- }
- /* next entry */
- offset = be32_to_cpu(ri.next) & ROMFH_MASK;
- }
-
- /* Hard link handling */
- if ((be32_to_cpu(ri.next) & ROMFH_TYPE) == ROMFH_HRD)
- offset = be32_to_cpu(ri.spec) & ROMFH_MASK;
-
- inode = romfs_iget(dir->i_sb, offset);
- if (IS_ERR(inode)) {
- res = PTR_ERR(inode);
- goto error;
- }
-
-success:
- d_add(dentry, inode);
- res = 0;
-error:
- unlock_kernel();
- return ERR_PTR(res);
-}
-
-/*
- * Ok, we do readpage, to be able to execute programs. Unfortunately,
- * we can't use bmap, since we may have looser alignments.
- */
-
-static int
-romfs_readpage(struct file *file, struct page * page)
-{
- struct inode *inode = page->mapping->host;
- loff_t offset, size;
- unsigned long filled;
- void *buf;
- int result = -EIO;
-
- page_cache_get(page);
- lock_kernel();
- buf = kmap(page);
- if (!buf)
- goto err_out;
-
- /* 32 bit warning -- but not for us :) */
- offset = page_offset(page);
- size = i_size_read(inode);
- filled = 0;
- result = 0;
- if (offset < size) {
- unsigned long readlen;
-
- size -= offset;
- readlen = size > PAGE_SIZE ? PAGE_SIZE : size;
-
- filled = romfs_copyfrom(inode, buf, ROMFS_I(inode)->i_dataoffset+offset, readlen);
-
- if (filled != readlen) {
- SetPageError(page);
- filled = 0;
- result = -EIO;
- }
- }
-
- if (filled < PAGE_SIZE)
- memset(buf + filled, 0, PAGE_SIZE-filled);
-
- if (!result)
- SetPageUptodate(page);
- flush_dcache_page(page);
-
- unlock_page(page);
-
- kunmap(page);
-err_out:
- page_cache_release(page);
- unlock_kernel();
-
- return result;
-}
-
-/* Mapping from our types to the kernel */
-
-static const struct address_space_operations romfs_aops = {
- .readpage = romfs_readpage
-};
-
-static const struct file_operations romfs_dir_operations = {
- .read = generic_read_dir,
- .readdir = romfs_readdir,
-};
-
-static const struct inode_operations romfs_dir_inode_operations = {
- .lookup = romfs_lookup,
-};
-
-static mode_t romfs_modemap[] =
-{
- 0, S_IFDIR+0644, S_IFREG+0644, S_IFLNK+0777,
- S_IFBLK+0600, S_IFCHR+0600, S_IFSOCK+0644, S_IFIFO+0644
-};
-
-static struct inode *
-romfs_iget(struct super_block *sb, unsigned long ino)
-{
- int nextfh, ret;
- struct romfs_inode ri;
- struct inode *i;
-
- ino &= ROMFH_MASK;
- i = iget_locked(sb, ino);
- if (!i)
- return ERR_PTR(-ENOMEM);
- if (!(i->i_state & I_NEW))
- return i;
-
- i->i_mode = 0;
-
- /* Loop for finding the real hard link */
- for(;;) {
- if (romfs_copyfrom(i, &ri, ino, ROMFH_SIZE) <= 0) {
- printk(KERN_ERR "romfs: read error for inode 0x%lx\n",
- ino);
- iget_failed(i);
- return ERR_PTR(-EIO);
- }
- /* XXX: do romfs_checksum here too (with name) */
-
- nextfh = be32_to_cpu(ri.next);
- if ((nextfh & ROMFH_TYPE) != ROMFH_HRD)
- break;
-
- ino = be32_to_cpu(ri.spec) & ROMFH_MASK;
- }
-
- i->i_nlink = 1; /* Hard to decide.. */
- i->i_size = be32_to_cpu(ri.size);
- i->i_mtime.tv_sec = i->i_atime.tv_sec = i->i_ctime.tv_sec = 0;
- i->i_mtime.tv_nsec = i->i_atime.tv_nsec = i->i_ctime.tv_nsec = 0;
-
- /* Precalculate the data offset */
- ret = romfs_strnlen(i, ino + ROMFH_SIZE, ROMFS_MAXFN);
- if (ret >= 0)
- ino = (ROMFH_SIZE + ret + 1 + ROMFH_PAD) & ROMFH_MASK;
- else
- ino = 0;
-
- ROMFS_I(i)->i_metasize = ino;
- ROMFS_I(i)->i_dataoffset = ino+(i->i_ino&ROMFH_MASK);
-
- /* Compute permissions */
- ino = romfs_modemap[nextfh & ROMFH_TYPE];
- /* only "normal" files have ops */
- switch (nextfh & ROMFH_TYPE) {
- case 1:
- i->i_size = ROMFS_I(i)->i_metasize;
- i->i_op = &romfs_dir_inode_operations;
- i->i_fop = &romfs_dir_operations;
- if (nextfh & ROMFH_EXEC)
- ino |= S_IXUGO;
- i->i_mode = ino;
- break;
- case 2:
- i->i_fop = &generic_ro_fops;
- i->i_data.a_ops = &romfs_aops;
- if (nextfh & ROMFH_EXEC)
- ino |= S_IXUGO;
- i->i_mode = ino;
- break;
- case 3:
- i->i_op = &page_symlink_inode_operations;
- i->i_data.a_ops = &romfs_aops;
- i->i_mode = ino | S_IRWXUGO;
- break;
- default:
- /* depending on MBZ for sock/fifos */
- nextfh = be32_to_cpu(ri.spec);
- init_special_inode(i, ino,
- MKDEV(nextfh>>16,nextfh&0xffff));
- }
- unlock_new_inode(i);
- return i;
-}
-
-static struct kmem_cache * romfs_inode_cachep;
-
-static struct inode *romfs_alloc_inode(struct super_block *sb)
-{
- struct romfs_inode_info *ei;
- ei = kmem_cache_alloc(romfs_inode_cachep, GFP_KERNEL);
- if (!ei)
- return NULL;
- return &ei->vfs_inode;
-}
-
-static void romfs_destroy_inode(struct inode *inode)
-{
- kmem_cache_free(romfs_inode_cachep, ROMFS_I(inode));
-}
-
-static void init_once(void *foo)
-{
- struct romfs_inode_info *ei = foo;
-
- inode_init_once(&ei->vfs_inode);
-}
-
-static int init_inodecache(void)
-{
- romfs_inode_cachep = kmem_cache_create("romfs_inode_cache",
- sizeof(struct romfs_inode_info),
- 0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
- init_once);
- if (romfs_inode_cachep == NULL)
- return -ENOMEM;
- return 0;
-}
-
-static void destroy_inodecache(void)
-{
- kmem_cache_destroy(romfs_inode_cachep);
-}
-
-static int romfs_remount(struct super_block *sb, int *flags, char *data)
-{
- *flags |= MS_RDONLY;
- return 0;
-}
-
-static const struct super_operations romfs_ops = {
- .alloc_inode = romfs_alloc_inode,
- .destroy_inode = romfs_destroy_inode,
- .statfs = romfs_statfs,
- .remount_fs = romfs_remount,
-};
-
-static int romfs_get_sb(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data, struct vfsmount *mnt)
-{
- return get_sb_bdev(fs_type, flags, dev_name, data, romfs_fill_super,
- mnt);
-}
-
-static struct file_system_type romfs_fs_type = {
- .owner = THIS_MODULE,
- .name = "romfs",
- .get_sb = romfs_get_sb,
- .kill_sb = kill_block_super,
- .fs_flags = FS_REQUIRES_DEV,
-};
-
-static int __init init_romfs_fs(void)
-{
- int err = init_inodecache();
- if (err)
- goto out1;
- err = register_filesystem(&romfs_fs_type);
- if (err)
- goto out;
- return 0;
-out:
- destroy_inodecache();
-out1:
- return err;
-}
-
-static void __exit exit_romfs_fs(void)
-{
- unregister_filesystem(&romfs_fs_type);
- destroy_inodecache();
-}
-
-/* Yes, works even as a module... :) */
-
-module_init(init_romfs_fs)
-module_exit(exit_romfs_fs)
-MODULE_LICENSE("GPL");
diff --git a/fs/romfs/internal.h b/fs/romfs/internal.h
new file mode 100644
index 000000000000..06044a9dc62d
--- /dev/null
+++ b/fs/romfs/internal.h
@@ -0,0 +1,47 @@
+/* RomFS internal definitions
+ *
+ * Copyright © 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/romfs_fs.h>
+
+/*
+ * romfs-specific inode record; embeds the VFS inode so that ROMFS_I() can
+ * recover it with container_of()
+ */
+struct romfs_inode_info {
+ struct inode vfs_inode;
+ unsigned long i_metasize; /* size of non-data area */
+ unsigned long i_dataoffset; /* from the start of fs */
+};
+
+/*
+ * return the limit value stored (as an integer cast to a pointer) in
+ * sb->s_fs_info
+ */
+static inline size_t romfs_maxsize(struct super_block *sb)
+{
+ return (size_t) (unsigned long) sb->s_fs_info;
+}
+
+/*
+ * convert a VFS inode pointer into a pointer to the romfs_inode_info that
+ * contains it
+ */
+static inline struct romfs_inode_info *ROMFS_I(struct inode *inode)
+{
+ return container_of(inode, struct romfs_inode_info, vfs_inode);
+}
+
+/*
+ * mmap-nommu.c
+ */
+#if !defined(CONFIG_MMU) && defined(CONFIG_ROMFS_ON_MTD)
+extern const struct file_operations romfs_ro_fops;
+#else
+#define romfs_ro_fops generic_ro_fops
+#endif
+
+/*
+ * storage.c
+ */
+extern int romfs_dev_read(struct super_block *sb, unsigned long pos,
+ void *buf, size_t buflen);
+extern ssize_t romfs_dev_strnlen(struct super_block *sb,
+ unsigned long pos, size_t maxlen);
+extern int romfs_dev_strncmp(struct super_block *sb, unsigned long pos,
+ const char *str, size_t size);
diff --git a/fs/romfs/mmap-nommu.c b/fs/romfs/mmap-nommu.c
new file mode 100644
index 000000000000..f0511e816967
--- /dev/null
+++ b/fs/romfs/mmap-nommu.c
@@ -0,0 +1,75 @@
+/* NOMMU mmap support for RomFS on MTD devices
+ *
+ * Copyright © 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/mm.h>
+#include <linux/mtd/super.h>
+#include "internal.h"
+
+/*
+ * try to determine where a shared mapping can be made
+ * - only supported for NOMMU at the moment (MMU doesn't copy private
+ * mappings)
+ * - attempts to map through to the underlying MTD device
+ * - returns whatever the MTD device's get_unmapped_area() op returns, or:
+ * -ENOSYS if the superblock has no MTD device or the device has no
+ * get_unmapped_area() op
+ * -EINVAL if the requested range lies outside the file or the device, or
+ * if a non-zero address hint was given
+ */
+static unsigned long romfs_get_unmapped_area(struct file *file,
+ unsigned long addr,
+ unsigned long len,
+ unsigned long pgoff,
+ unsigned long flags)
+{
+ struct inode *inode = file->f_mapping->host;
+ struct mtd_info *mtd = inode->i_sb->s_mtd;
+ unsigned long isize, offset;
+
+ if (!mtd)
+ goto cant_map_directly;
+
+ /* the mapping must lie entirely within the file */
+ isize = i_size_read(inode);
+ offset = pgoff << PAGE_SHIFT;
+ if (offset > isize || len > isize || offset > isize - len)
+ return (unsigned long) -EINVAL;
+
+ /* we need to call down to the MTD layer to do the actual mapping */
+ if (mtd->get_unmapped_area) {
+ if (addr != 0)
+ return (unsigned long) -EINVAL;
+
+ if (len > mtd->size || pgoff >= (mtd->size >> PAGE_SHIFT))
+ return (unsigned long) -EINVAL;
+
+ /* convert the file offset into an offset on the device */
+ offset += ROMFS_I(inode)->i_dataoffset;
+ if (offset > mtd->size - len)
+ return (unsigned long) -EINVAL;
+
+ return mtd->get_unmapped_area(mtd, len, offset, flags);
+ }
+
+cant_map_directly:
+ return (unsigned long) -ENOSYS;
+}
+
+/*
+ * allow a mapping only when it is shared (or may become shared), in which
+ * case it can go straight through to the MTD device; anything else is
+ * refused with -ENOSYS
+ */
+static int romfs_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE))
+		return 0;
+
+	return -ENOSYS;
+}
+
+const struct file_operations romfs_ro_fops = {
+ .llseek = generic_file_llseek,
+ .read = do_sync_read,
+ .aio_read = generic_file_aio_read,
+ .splice_read = generic_file_splice_read,
+ .mmap = romfs_mmap,
+ .get_unmapped_area = romfs_get_unmapped_area,
+};
diff --git a/fs/romfs/storage.c b/fs/romfs/storage.c
new file mode 100644
index 000000000000..7e3e1e12a081
--- /dev/null
+++ b/fs/romfs/storage.c
@@ -0,0 +1,261 @@
+/* RomFS storage access routines
+ *
+ * Copyright © 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/fs.h>
+#include <linux/mtd/super.h>
+#include <linux/buffer_head.h>
+#include "internal.h"
+
+#if !defined(CONFIG_ROMFS_ON_MTD) && !defined(CONFIG_ROMFS_ON_BLOCK)
+#error no ROMFS backing store interface configured
+#endif
+
+#ifdef CONFIG_ROMFS_ON_MTD
+#define ROMFS_MTD_READ(sb, ...) ((sb)->s_mtd->read((sb)->s_mtd, ##__VA_ARGS__))
+
+/*
+ * fetch buflen bytes at image offset pos from the MTD device behind sb
+ * - returns 0 only if the device supplied the full amount, -EIO otherwise
+ */
+static int romfs_mtd_read(struct super_block *sb, unsigned long pos,
+			  void *buf, size_t buflen)
+{
+	size_t got;
+	int err;
+
+	err = ROMFS_MTD_READ(sb, pos, buflen, &got, buf);
+	if (err < 0 || got != buflen)
+		return -EIO;
+
+	return 0;
+}
+
+/*
+ * determine the length of a string in a romfs image on an MTD device
+ * - scans at most maxlen bytes starting at image offset pos
+ * - returns the number of bytes preceding the first NUL found, the number of
+ * bytes scanned if no NUL was found, or the negative error from the MTD read
+ */
+static ssize_t romfs_mtd_strnlen(struct super_block *sb,
+ unsigned long pos, size_t maxlen)
+{
+ ssize_t n = 0;
+ size_t segment;
+ u_char buf[16], *p;
+ size_t len;
+ int ret;
+
+ /* scan the string up to 16 bytes at a time */
+ while (maxlen > 0) {
+ segment = min_t(size_t, maxlen, 16);
+ ret = ROMFS_MTD_READ(sb, pos, segment, &len, buf);
+ if (ret < 0)
+ return ret;
+ /* only the len bytes actually returned are examined */
+ p = memchr(buf, 0, len);
+ if (p)
+ return n + (p - buf);
+ /* NOTE(review): progress relies on the read reporting len > 0 on
+ * success - confirm the MTD read op guarantees that */
+ maxlen -= len;
+ pos += len;
+ n += len;
+ }
+
+ return n;
+}
+
+/*
+ * compare a string to one in a romfs image on MTD
+ * - return 1 if matched, 0 if differ, -ve if error
+ * - exactly size bytes are compared; whether the on-image string ends after
+ * those bytes is not checked here
+ */
+static int romfs_mtd_strncmp(struct super_block *sb, unsigned long pos,
+ const char *str, size_t size)
+{
+ u_char buf[16];
+ size_t len, segment;
+ int ret;
+
+ /* scan the string up to 16 bytes at a time */
+ while (size > 0) {
+ segment = min_t(size_t, size, 16);
+ ret = ROMFS_MTD_READ(sb, pos, segment, &len, buf);
+ if (ret < 0)
+ return ret;
+ if (memcmp(buf, str, len) != 0)
+ return 0;
+ /* NOTE(review): progress relies on the read reporting len > 0 on
+ * success - confirm the MTD read op guarantees that */
+ size -= len;
+ pos += len;
+ str += len;
+ }
+
+ return 1;
+}
+#endif /* CONFIG_ROMFS_ON_MTD */
+
+#ifdef CONFIG_ROMFS_ON_BLOCK
+/*
+ * read data from an romfs image on a block device
+ * - copies buflen bytes starting at image offset pos into buf, one block (or
+ *   part of a block) at a time
+ * - returns 0 on success or -EIO if a block could not be read
+ */
+static int romfs_blk_read(struct super_block *sb, unsigned long pos,
+			  void *buf, size_t buflen)
+{
+	struct buffer_head *bh;
+	unsigned long offset;
+	size_t segment;
+
+	/* copy the data up to blocksize bytes at a time */
+	while (buflen > 0) {
+		offset = pos & (ROMBSIZE - 1);
+		segment = min_t(size_t, buflen, ROMBSIZE - offset);
+		bh = sb_bread(sb, pos >> ROMBSBITS);
+		if (!bh)
+			return -EIO;
+		memcpy(buf, bh->b_data + offset, segment);
+		brelse(bh);
+		/* the destination has to advance along with the source, or
+		 * every block after the first would be copied over the start
+		 * of the caller's buffer */
+		buf += segment;
+		buflen -= segment;
+		pos += segment;
+	}
+
+	return 0;
+}
+
+/*
+ * determine the length of a string in romfs on a block device
+ * - scans at most limit bytes starting at image offset pos
+ * - returns the number of bytes preceding the first NUL found, limit if no
+ * NUL was found, or -EIO if a block could not be read
+ */
+static ssize_t romfs_blk_strnlen(struct super_block *sb,
+ unsigned long pos, size_t limit)
+{
+ struct buffer_head *bh;
+ unsigned long offset;
+ ssize_t n = 0;
+ size_t segment;
+ u_char *buf, *p;
+
+ /* scan the string up to blocksize bytes at a time */
+ while (limit > 0) {
+ offset = pos & (ROMBSIZE - 1);
+ segment = min_t(size_t, limit, ROMBSIZE - offset);
+ bh = sb_bread(sb, pos >> ROMBSBITS);
+ if (!bh)
+ return -EIO;
+ buf = bh->b_data + offset;
+ p = memchr(buf, 0, segment);
+ brelse(bh);
+ /* p and buf are only subtracted from here on, never dereferenced */
+ if (p)
+ return n + (p - buf);
+ limit -= segment;
+ pos += segment;
+ n += segment;
+ }
+
+ return n;
+}
+
+/*
+ * compare a string to one in a romfs image on a block device
+ * - return 1 if matched, 0 if differ, -ve if error
+ * - exactly size bytes are compared; whether the on-image string ends after
+ * those bytes is not checked here
+ */
+static int romfs_blk_strncmp(struct super_block *sb, unsigned long pos,
+ const char *str, size_t size)
+{
+ struct buffer_head *bh;
+ unsigned long offset;
+ size_t segment;
+ bool x;
+
+ /* compare the string up to blocksize bytes at a time */
+ while (size > 0) {
+ offset = pos & (ROMBSIZE - 1);
+ segment = min_t(size_t, size, ROMBSIZE - offset);
+ bh = sb_bread(sb, pos >> ROMBSBITS);
+ if (!bh)
+ return -EIO;
+ /* record the result before the buffer is released */
+ x = (memcmp(bh->b_data + offset, str, segment) != 0);
+ brelse(bh);
+ if (x)
+ return 0;
+ size -= segment;
+ pos += segment;
+ str += segment;
+ }
+
+ return 1;
+}
+#endif /* CONFIG_ROMFS_ON_BLOCK */
+
+/*
+ * read data from the romfs image
+ * - reads buflen bytes at image offset pos into buf through whichever backing
+ *   store (MTD or block device) the superblock has
+ * - returns 0 if the whole request was satisfied, or -EIO if any part of it
+ *   lies outside the image or the backing store could not supply the data
+ */
+int romfs_dev_read(struct super_block *sb, unsigned long pos,
+		   void *buf, size_t buflen)
+{
+	size_t limit;
+
+	/* a request running off the end of the image is an error: shortening
+	 * it and still returning 0 would leave the tail of the caller's
+	 * buffer unwritten while telling the caller it had been filled */
+	limit = romfs_maxsize(sb);
+	if (pos >= limit || buflen > limit - pos)
+		return -EIO;
+
+#ifdef CONFIG_ROMFS_ON_MTD
+	if (sb->s_mtd)
+		return romfs_mtd_read(sb, pos, buf, buflen);
+#endif
+#ifdef CONFIG_ROMFS_ON_BLOCK
+	if (sb->s_bdev)
+		return romfs_blk_read(sb, pos, buf, buflen);
+#endif
+	return -EIO;
+}
+
+/*
+ * determine the length of a string in romfs
+ * - scans at most maxlen bytes starting at image offset pos, and never beyond
+ *   the end of the image
+ * - returns the number of bytes preceding the terminating NUL, the number of
+ *   bytes scanned if no NUL was found, or a negative error code
+ */
+ssize_t romfs_dev_strnlen(struct super_block *sb,
+			  unsigned long pos, size_t maxlen)
+{
+	size_t limit;
+
+	limit = romfs_maxsize(sb);
+	if (pos >= limit)
+		return -EIO;
+	if (maxlen > limit - pos)
+		maxlen = limit - pos;
+
+	/* pass on the clamped maxlen, not the image size: callers size their
+	 * buffers from the bound they asked for */
+#ifdef CONFIG_ROMFS_ON_MTD
+	if (sb->s_mtd)
+		return romfs_mtd_strnlen(sb, pos, maxlen);
+#endif
+#ifdef CONFIG_ROMFS_ON_BLOCK
+	if (sb->s_bdev)
+		return romfs_blk_strnlen(sb, pos, maxlen);
+#endif
+	return -EIO;
+}
+
+/*
+ * compare a string to one in romfs
+ * - return 1 if matched, 0 if differ, -ve if error
+ * - fails with -EIO if pos, or the size bytes starting at pos, lie outside
+ * the image, and with -ENAMETOOLONG if size exceeds ROMFS_MAXFN
+ * - the comparison covers exactly size bytes and is delegated to the MTD or
+ * block backend
+ */
+int romfs_dev_strncmp(struct super_block *sb, unsigned long pos,
+ const char *str, size_t size)
+{
+ size_t limit;
+
+ limit = romfs_maxsize(sb);
+ if (pos >= limit)
+ return -EIO;
+ if (size > ROMFS_MAXFN)
+ return -ENAMETOOLONG;
+ if (size > limit - pos)
+ return -EIO;
+
+#ifdef CONFIG_ROMFS_ON_MTD
+ if (sb->s_mtd)
+ return romfs_mtd_strncmp(sb, pos, str, size);
+#endif
+#ifdef CONFIG_ROMFS_ON_BLOCK
+ if (sb->s_bdev)
+ return romfs_blk_strncmp(sb, pos, str, size);
+#endif
+ return -EIO;
+}
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
new file mode 100644
index 000000000000..1e548a4975ba
--- /dev/null
+++ b/fs/romfs/super.c
@@ -0,0 +1,648 @@
+/* Block- or MTD-based romfs
+ *
+ * Copyright © 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * Derived from: ROMFS file system, Linux implementation
+ *
+ * Copyright © 1997-1999 Janos Farkas <chexum@shadow.banki.hu>
+ *
+ * Using parts of the minix filesystem
+ * Copyright © 1991, 1992 Linus Torvalds
+ *
+ * and parts of the affs filesystem additionally
+ * Copyright © 1993 Ray Burr
+ * Copyright © 1996 Hans-Joachim Widmaier
+ *
+ * Changes
+ * Changed for 2.1.19 modules
+ * Jan 1997 Initial release
+ * Jun 1997 2.1.43+ changes
+ * Proper page locking in readpage
+ * Changed to work with 2.1.45+ fs
+ * Jul 1997 Fixed follow_link
+ * 2.1.47
+ * lookup shouldn't return -ENOENT
+ * from Horst von Brand:
+ * fail on wrong checksum
+ * double unlock_super was possible
+ * correct namelen for statfs
+ * spotted by Bill Hawes:
+ * readlink shouldn't iput()
+ * Jun 1998 2.1.106 from Avery Pennarun: glibc scandir()
+ * exposed a problem in readdir
+ * 2.1.107 code-freeze spellchecker run
+ * Aug 1998 2.1.118+ VFS changes
+ * Sep 1998 2.1.122 another VFS change (follow_link)
+ * Apr 1999 2.2.7 no more EBADF checking in
+ * lookup/readdir, use ERR_PTR
+ * Jun 1999 2.3.6 d_alloc_root use changed
+ * 2.3.9 clean up usage of ENOENT/negative
+ * dentries in lookup
+ * clean up page flags setting
+ * (error, uptodate, locking) in
+ * readpage
+ * use init_special_inode for
+ * fifos/sockets (and streamline) in
+ * read_inode, fix _ops table order
+ * Aug 1999 2.3.16 __initfunc() => __init change
+ * Oct 1999 2.3.24 page->owner hack obsoleted
+ * Nov 1999 2.3.27 2.3.25+ page->offset => index change
+ *
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/fs.h>
+#include <linux/time.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/blkdev.h>
+#include <linux/parser.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/statfs.h>
+#include <linux/mtd/super.h>
+#include <linux/ctype.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/uaccess.h>
+#include "internal.h"
+
+static struct kmem_cache *romfs_inode_cachep;
+
+static const umode_t romfs_modemap[8] = {
+ 0, /* hard link */
+ S_IFDIR | 0644, /* directory */
+ S_IFREG | 0644, /* regular file */
+ S_IFLNK | 0777, /* symlink */
+ S_IFBLK | 0600, /* blockdev */
+ S_IFCHR | 0600, /* chardev */
+ S_IFSOCK | 0644, /* socket */
+ S_IFIFO | 0644 /* FIFO */
+};
+
+static const unsigned char romfs_dtype_table[] = {
+ DT_UNKNOWN, DT_DIR, DT_REG, DT_LNK, DT_BLK, DT_CHR, DT_SOCK, DT_FIFO
+};
+
+static struct inode *romfs_iget(struct super_block *sb, unsigned long pos);
+
+/*
+ * read a page worth of data from the image
+ * - fills the page with file data starting at the page's offset in the file
+ * and zeroes whatever part of the page lies beyond the end of the file
+ * - marks the page up to date on success, or in error (and entirely zeroed)
+ * if the read failed
+ * - the page is unlocked before returning on every path except the kmap()
+ * failure
+ * - returns 0 on success, -EIO on read failure or -ENOMEM if kmap() fails
+ */
+static int romfs_readpage(struct file *file, struct page *page)
+{
+ struct inode *inode = page->mapping->host;
+ loff_t offset, size;
+ unsigned long fillsize, pos;
+ void *buf;
+ int ret;
+
+ buf = kmap(page);
+ if (!buf)
+ return -ENOMEM;
+
+ /* 32 bit warning -- but not for us :) */
+ offset = page_offset(page);
+ size = i_size_read(inode);
+ fillsize = 0;
+ ret = 0;
+ if (offset < size) {
+ /* read at most one page of whatever remains of the file */
+ size -= offset;
+ fillsize = size > PAGE_SIZE ? PAGE_SIZE : size;
+
+ pos = ROMFS_I(inode)->i_dataoffset + offset;
+
+ ret = romfs_dev_read(inode->i_sb, pos, buf, fillsize);
+ if (ret < 0) {
+ SetPageError(page);
+ fillsize = 0;
+ ret = -EIO;
+ }
+ }
+
+ /* clear the part of the page that was not filled from the image */
+ if (fillsize < PAGE_SIZE)
+ memset(buf + fillsize, 0, PAGE_SIZE - fillsize);
+ if (ret == 0)
+ SetPageUptodate(page);
+
+ flush_dcache_page(page);
+ kunmap(page);
+ unlock_page(page);
+ return ret;
+}
+
+static const struct address_space_operations romfs_aops = {
+ .readpage = romfs_readpage
+};
+
+/*
+ * read the entries from a directory
+ * - filp->f_pos holds the image offset of the next file entry to report; 0
+ *   means start from the first entry, found through the directory's own
+ *   header
+ * - each entry's name and type are passed to filldir() until the chain of
+ *   entries ends, filldir() refuses an entry, or a read fails
+ * - returns the number of entries handed to filldir()
+ */
+static int romfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+{
+	struct inode *i = filp->f_dentry->d_inode;
+	struct romfs_inode ri;
+	unsigned long offset, maxoff;
+	int j, ino, nextfh;
+	int stored = 0;
+	char fsname[ROMFS_MAXFN];	/* XXX dynamic? */
+	int ret;
+
+	maxoff = romfs_maxsize(i->i_sb);
+
+	offset = filp->f_pos;
+	if (!offset) {
+		offset = i->i_ino & ROMFH_MASK;
+		ret = romfs_dev_read(i->i_sb, offset, &ri, ROMFH_SIZE);
+		if (ret < 0)
+			goto out;
+		offset = be32_to_cpu(ri.spec) & ROMFH_MASK;
+	}
+
+	/* Not really failsafe, but we are read-only... */
+	for (;;) {
+		if (!offset || offset >= maxoff) {
+			offset = maxoff;
+			filp->f_pos = offset;
+			goto out;
+		}
+		filp->f_pos = offset;
+
+		/* Fetch inode info */
+		ret = romfs_dev_read(i->i_sb, offset, &ri, ROMFH_SIZE);
+		if (ret < 0)
+			goto out;
+
+		j = romfs_dev_strnlen(i->i_sb, offset + ROMFH_SIZE,
+				      sizeof(fsname) - 1);
+		if (j < 0)
+			goto out;
+
+		/* fsname[] is on the stack: never copy or terminate beyond
+		 * it, whatever length the storage layer reported */
+		if (j > (int) sizeof(fsname) - 1)
+			j = sizeof(fsname) - 1;
+
+		ret = romfs_dev_read(i->i_sb, offset + ROMFH_SIZE, fsname, j);
+		if (ret < 0)
+			goto out;
+		fsname[j] = '\0';
+
+		/* a hard link reports the inode number of its target */
+		ino = offset;
+		nextfh = be32_to_cpu(ri.next);
+		if ((nextfh & ROMFH_TYPE) == ROMFH_HRD)
+			ino = be32_to_cpu(ri.spec);
+		if (filldir(dirent, fsname, j, offset, ino,
+			    romfs_dtype_table[nextfh & ROMFH_TYPE]) < 0)
+			goto out;
+
+		stored++;
+		offset = nextfh & ROMFH_MASK;
+	}
+
+out:
+	return stored;
+}
+
+/*
+ * look up an entry in a directory
+ * - walks the directory's list of file entries looking for one whose name is
+ *   exactly the name in dentry
+ * - on a match the inode is attached to the dentry; if nothing matches, a
+ *   negative dentry is added instead
+ * - returns NULL (as ERR_PTR(0)) in both of those cases, or an ERR_PTR()
+ *   error if the image could not be read or the inode could not be set up
+ */
+static struct dentry *romfs_lookup(struct inode *dir, struct dentry *dentry,
+				   struct nameidata *nd)
+{
+	unsigned long offset, maxoff;
+	struct inode *inode;
+	struct romfs_inode ri;
+	const char *name;		/* got from dentry */
+	ssize_t nlen;
+	int len, ret;
+
+	offset = dir->i_ino & ROMFH_MASK;
+	ret = romfs_dev_read(dir->i_sb, offset, &ri, ROMFH_SIZE);
+	if (ret < 0)
+		goto error;
+
+	/* search all the file entries in the list starting from the one
+	 * pointed to by the directory's special data */
+	maxoff = romfs_maxsize(dir->i_sb);
+	offset = be32_to_cpu(ri.spec) & ROMFH_MASK;
+
+	name = dentry->d_name.name;
+	len = dentry->d_name.len;
+
+	for (;;) {
+		if (!offset || offset >= maxoff)
+			goto out0;
+
+		ret = romfs_dev_read(dir->i_sb, offset, &ri, sizeof(ri));
+		if (ret < 0)
+			goto error;
+
+		/* compare the first len bytes of the stored name... */
+		ret = romfs_dev_strncmp(dir->i_sb, offset + ROMFH_SIZE, name,
+					len);
+		if (ret < 0)
+			goto error;
+		if (ret == 1) {
+			/* ...and make sure the stored name ends right there,
+			 * otherwise "foo" would match an entry "foobar" */
+			nlen = romfs_dev_strnlen(dir->i_sb,
+						 offset + ROMFH_SIZE, len + 1);
+			if (nlen < 0) {
+				ret = nlen;
+				goto error;
+			}
+			if (nlen == len)
+				break;
+		}
+
+		/* next entry */
+		offset = be32_to_cpu(ri.next) & ROMFH_MASK;
+	}
+
+	/* Hard link handling */
+	if ((be32_to_cpu(ri.next) & ROMFH_TYPE) == ROMFH_HRD)
+		offset = be32_to_cpu(ri.spec) & ROMFH_MASK;
+
+	inode = romfs_iget(dir->i_sb, offset);
+	if (IS_ERR(inode)) {
+		ret = PTR_ERR(inode);
+		goto error;
+	}
+	goto outi;
+
+	/*
+	 * it's a bit funky, _lookup needs to return an error code
+	 * (negative) or a NULL, both as a dentry.  ENOENT should not
+	 * be returned, instead we need to create a negative dentry by
+	 * d_add(dentry, NULL); and return 0 as no error.
+	 * (Although as I see, it only matters on writable file
+	 * systems).
+	 */
+out0:
+	inode = NULL;
+outi:
+	d_add(dentry, inode);
+	ret = 0;
+error:
+	return ERR_PTR(ret);
+}
+
+static const struct file_operations romfs_dir_operations = {
+ .read = generic_read_dir,
+ .readdir = romfs_readdir,
+};
+
+static struct inode_operations romfs_dir_inode_operations = {
+ .lookup = romfs_lookup,
+};
+
+/*
+ * get a romfs inode based on its position in the image (which doubles as the
+ * inode number)
+ * - any chain of hard link entries is followed to the real file entry first
+ * - an inode that is already cached is returned as is; a new one has its
+ *   size, mode and operations set from the file header
+ * - returns the inode, or an ERR_PTR() error if the header or name could not
+ *   be read or no inode could be allocated
+ */
+static struct inode *romfs_iget(struct super_block *sb, unsigned long pos)
+{
+	struct romfs_inode_info *inode;
+	struct romfs_inode ri;
+	struct inode *i;
+	unsigned long nlen;
+	unsigned nextfh;
+	int ret;	/* signed: carries negative error codes */
+	umode_t mode;
+
+	/* we might have to traverse a chain of "hard link" file entries to get
+	 * to the actual file */
+	for (;;) {
+		ret = romfs_dev_read(sb, pos, &ri, sizeof(ri));
+		if (ret < 0)
+			goto error;
+
+		/* XXX: do romfs_checksum here too (with name) */
+
+		nextfh = be32_to_cpu(ri.next);
+		if ((nextfh & ROMFH_TYPE) != ROMFH_HRD)
+			break;
+
+		pos = be32_to_cpu(ri.spec) & ROMFH_MASK;
+	}
+
+	/* determine the length of the filename */
+	nlen = romfs_dev_strnlen(sb, pos + ROMFH_SIZE, ROMFS_MAXFN);
+	if (IS_ERR_VALUE(nlen))
+		goto eio;
+
+	/* get an inode for this image position */
+	i = iget_locked(sb, pos);
+	if (!i)
+		return ERR_PTR(-ENOMEM);
+
+	if (!(i->i_state & I_NEW))
+		return i;
+
+	/* precalculate the data offset */
+	inode = ROMFS_I(i);
+	inode->i_metasize = (ROMFH_SIZE + nlen + 1 + ROMFH_PAD) & ROMFH_MASK;
+	inode->i_dataoffset = pos + inode->i_metasize;
+
+	i->i_nlink = 1;		/* Hard to decide.. */
+	i->i_size = be32_to_cpu(ri.size);
+	i->i_mtime.tv_sec = i->i_atime.tv_sec = i->i_ctime.tv_sec = 0;
+	i->i_mtime.tv_nsec = i->i_atime.tv_nsec = i->i_ctime.tv_nsec = 0;
+
+	/* set up mode and ops */
+	mode = romfs_modemap[nextfh & ROMFH_TYPE];
+
+	switch (nextfh & ROMFH_TYPE) {
+	case ROMFH_DIR:
+		i->i_size = ROMFS_I(i)->i_metasize;
+		i->i_op = &romfs_dir_inode_operations;
+		i->i_fop = &romfs_dir_operations;
+		if (nextfh & ROMFH_EXEC)
+			mode |= S_IXUGO;
+		break;
+	case ROMFH_REG:
+		i->i_fop = &romfs_ro_fops;
+		i->i_data.a_ops = &romfs_aops;
+		if (i->i_sb->s_mtd)
+			i->i_data.backing_dev_info =
+				i->i_sb->s_mtd->backing_dev_info;
+		if (nextfh & ROMFH_EXEC)
+			mode |= S_IXUGO;
+		break;
+	case ROMFH_SYM:
+		i->i_op = &page_symlink_inode_operations;
+		i->i_data.a_ops = &romfs_aops;
+		mode |= S_IRWXUGO;
+		break;
+	default:
+		/* depending on MBZ for sock/fifos */
+		nextfh = be32_to_cpu(ri.spec);
+		init_special_inode(i, mode, MKDEV(nextfh >> 16,
+						  nextfh & 0xffff));
+		break;
+	}
+
+	i->i_mode = mode;
+
+	unlock_new_inode(i);
+	return i;
+
+eio:
+	ret = -EIO;
+error:
+	printk(KERN_ERR "ROMFS: read error for inode 0x%lx\n", pos);
+	return ERR_PTR(ret);
+}
+
+/*
+ * obtain a romfs inode record from the slab cache and hand back the VFS inode
+ * embedded in it, or NULL if the allocation failed
+ */
+static struct inode *romfs_alloc_inode(struct super_block *sb)
+{
+	struct romfs_inode_info *ri;
+
+	ri = kmem_cache_alloc(romfs_inode_cachep, GFP_KERNEL);
+	if (!ri)
+		return NULL;
+
+	return &ri->vfs_inode;
+}
+
+/*
+ * return a spent inode to the slab cache
+ * - what is freed is the romfs_inode_info containing the VFS inode
+ */
+static void romfs_destroy_inode(struct inode *inode)
+{
+ kmem_cache_free(romfs_inode_cachep, ROMFS_I(inode));
+}
+
+/*
+ * get filesystem statistics
+ * - reports the romfs magic number, the maximum name length, the block size
+ * and the image size rounded up to whole blocks
+ * - always returns 0
+ */
+static int romfs_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+ buf->f_type = ROMFS_MAGIC;
+ buf->f_namelen = ROMFS_MAXFN;
+ buf->f_bsize = ROMBSIZE;
+ /* NOTE(review): f_ffree is not set in this function - presumably the
+ * caller hands in a zeroed kstatfs so the free counts come out as 0;
+ * confirm */
+ buf->f_bfree = buf->f_bavail = buf->f_ffree;
+ buf->f_blocks =
+ (romfs_maxsize(dentry->d_sb) + ROMBSIZE - 1) >> ROMBSBITS;
+ return 0;
+}
+
+/*
+ * remounting must involve read-only
+ * - forces MS_RDONLY into the requested flags and always succeeds
+ */
+static int romfs_remount(struct super_block *sb, int *flags, char *data)
+{
+ *flags |= MS_RDONLY;
+ return 0;
+}
+
+static const struct super_operations romfs_super_ops = {
+ .alloc_inode = romfs_alloc_inode,
+ .destroy_inode = romfs_destroy_inode,
+ .statfs = romfs_statfs,
+ .remount_fs = romfs_remount,
+};
+
+/*
+ * sum the big-endian 32-bit words in part of a romfs filesystem
+ * - size is in bytes; only whole words are included
+ * - fill_super treats a non-zero sum as a bad checksum
+ */
+static __u32 romfs_checksum(const void *data, int size)
+{
+	const __be32 *word = data;
+	__u32 total = 0;
+	int nwords;
+
+	for (nwords = size >> 2; nwords > 0; nwords--)
+		total += be32_to_cpu(*word++);
+
+	return total;
+}
+
+/*
+ * fill in the superblock
+ * - reads and validates the first 512 bytes of the image (magic words, size
+ *   and checksum), records the image size in sb->s_fs_info and sets up the
+ *   root directory
+ * - returns 0 on success, -ENOMEM if the temporary buffer cannot be
+ *   allocated, the read error if the image cannot be read, or -EINVAL if the
+ *   image is not a valid romfs or the root cannot be set up
+ */
+static int romfs_fill_super(struct super_block *sb, void *data, int silent)
+{
+	struct romfs_super_block *rsb;
+	struct inode *root;
+	unsigned long pos, img_size;
+	const char *storage;
+	size_t len;
+	int ret;
+
+#ifdef CONFIG_BLOCK
+	if (!sb->s_mtd) {
+		sb_set_blocksize(sb, ROMBSIZE);
+	} else {
+		sb->s_blocksize = ROMBSIZE;
+		sb->s_blocksize_bits = blksize_bits(ROMBSIZE);
+	}
+#endif
+
+	sb->s_maxbytes = 0xFFFFFFFF;
+	sb->s_magic = ROMFS_MAGIC;
+	sb->s_flags |= MS_RDONLY | MS_NOATIME;
+	sb->s_op = &romfs_super_ops;
+
+	/* read the image superblock and check it */
+	rsb = kmalloc(512, GFP_KERNEL);
+	if (!rsb)
+		return -ENOMEM;
+
+	/* the real image size isn't known yet, so allow just enough for the
+	 * superblock to be read */
+	sb->s_fs_info = (void *) 512;
+	ret = romfs_dev_read(sb, 0, rsb, 512);
+	if (ret < 0)
+		goto error_rsb;
+
+	img_size = be32_to_cpu(rsb->size);
+
+	if (sb->s_mtd && img_size > sb->s_mtd->size)
+		goto error_rsb_inval;
+
+	sb->s_fs_info = (void *) img_size;
+
+	if (rsb->word0 != ROMSB_WORD0 || rsb->word1 != ROMSB_WORD1 ||
+	    img_size < ROMFH_SIZE) {
+		if (!silent)
+			printk(KERN_WARNING "VFS:"
+			       " Can't find a romfs filesystem on dev %s.\n",
+			       sb->s_id);
+		goto error_rsb_inval;
+	}
+
+	if (romfs_checksum(rsb, min_t(size_t, img_size, 512))) {
+		printk(KERN_ERR "ROMFS: bad initial checksum on dev %s.\n",
+		       sb->s_id);
+		goto error_rsb_inval;
+	}
+
+	storage = sb->s_mtd ? "MTD" : "the block layer";
+
+	len = strnlen(rsb->name, ROMFS_MAXFN);
+	if (!silent)
+		printk(KERN_NOTICE "ROMFS: Mounting image '%*.*s' through %s\n",
+		       (unsigned) len, (unsigned) len, rsb->name, storage);
+
+	kfree(rsb);
+	rsb = NULL;
+
+	/* find the root directory */
+	pos = (ROMFH_SIZE + len + 1 + ROMFH_PAD) & ROMFH_MASK;
+
+	/* romfs_iget() reports failure with ERR_PTR(), never NULL */
+	root = romfs_iget(sb, pos);
+	if (IS_ERR(root))
+		goto error;
+
+	sb->s_root = d_alloc_root(root);
+	if (!sb->s_root)
+		goto error_i;
+
+	return 0;
+
+error_i:
+	iput(root);
+error:
+	return -EINVAL;
+error_rsb_inval:
+	ret = -EINVAL;
+error_rsb:
+	/* the superblock buffer is still held on these paths */
+	kfree(rsb);
+	return ret;
+}
+
+/*
+ * get a superblock for mounting
+ * - if MTD support is configured, an MTD mount is tried first
+ * - if block support is configured, a block device mount is tried when the
+ * result so far is -EINVAL (either the initial value or what the MTD
+ * attempt returned)
+ */
+static int romfs_get_sb(struct file_system_type *fs_type,
+ int flags, const char *dev_name,
+ void *data, struct vfsmount *mnt)
+{
+ int ret = -EINVAL;
+
+#ifdef CONFIG_ROMFS_ON_MTD
+ ret = get_sb_mtd(fs_type, flags, dev_name, data, romfs_fill_super,
+ mnt);
+#endif
+#ifdef CONFIG_ROMFS_ON_BLOCK
+ if (ret == -EINVAL)
+ ret = get_sb_bdev(fs_type, flags, dev_name, data,
+ romfs_fill_super, mnt);
+#endif
+ return ret;
+}
+
+/*
+ * destroy a romfs superblock in the appropriate manner
+ * - an MTD-backed superblock goes to kill_mtd_super(), a block-backed one to
+ * kill_block_super(); a superblock with neither is left untouched
+ */
+static void romfs_kill_sb(struct super_block *sb)
+{
+#ifdef CONFIG_ROMFS_ON_MTD
+ if (sb->s_mtd) {
+ kill_mtd_super(sb);
+ return;
+ }
+#endif
+#ifdef CONFIG_ROMFS_ON_BLOCK
+ if (sb->s_bdev) {
+ kill_block_super(sb);
+ return;
+ }
+#endif
+}
+
+static struct file_system_type romfs_fs_type = {
+ .owner = THIS_MODULE,
+ .name = "romfs",
+ .get_sb = romfs_get_sb,
+ .kill_sb = romfs_kill_sb,
+ .fs_flags = FS_REQUIRES_DEV,
+};
+
+/*
+ * inode storage initialiser
+ * - passed to kmem_cache_create() as the constructor for the romfs inode
+ * cache; initialises the VFS inode embedded in each object
+ */
+static void romfs_i_init_once(void *_inode)
+{
+ struct romfs_inode_info *inode = _inode;
+
+ inode_init_once(&inode->vfs_inode);
+}
+
+/*
+ * romfs module initialisation
+ * - creates the inode slab cache, then registers the filesystem type; the
+ *   cache is torn down again if registration fails
+ * - returns 0 on success, -ENOMEM if the cache cannot be created, or the
+ *   error from register_filesystem()
+ */
+static int __init init_romfs_fs(void)
+{
+	int ret;
+
+	printk(KERN_INFO "ROMFS MTD (C) 2007 Red Hat, Inc.\n");
+
+	romfs_inode_cachep =
+		kmem_cache_create("romfs_i",
+				  sizeof(struct romfs_inode_info), 0,
+				  SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
+				  romfs_i_init_once);
+	if (romfs_inode_cachep == NULL) {
+		printk(KERN_ERR
+		       "ROMFS error: Failed to initialise inode cache\n");
+		return -ENOMEM;
+	}
+
+	ret = register_filesystem(&romfs_fs_type);
+	if (ret == 0)
+		return 0;
+
+	/* registration failed: undo the cache creation */
+	printk(KERN_ERR "ROMFS error: Failed to register filesystem\n");
+	kmem_cache_destroy(romfs_inode_cachep);
+	return ret;
+}
+
+/*
+ * romfs module removal
+ * - undoes init_romfs_fs() in reverse order: unregister the filesystem type,
+ * then destroy the inode cache
+ */
+static void __exit exit_romfs_fs(void)
+{
+ unregister_filesystem(&romfs_fs_type);
+ kmem_cache_destroy(romfs_inode_cachep);
+}
+
+module_init(init_romfs_fs);
+module_exit(exit_romfs_fs);
+
+MODULE_DESCRIPTION("Direct-MTD Capable RomFS");
+MODULE_AUTHOR("Red Hat, Inc.");
+MODULE_LICENSE("GPL"); /* Actually dual-licensed, but it doesn't matter for */
diff --git a/fs/squashfs/export.c b/fs/squashfs/export.c
index 69e971d5ddc1..2b1b8fe5e037 100644
--- a/fs/squashfs/export.c
+++ b/fs/squashfs/export.c
@@ -40,6 +40,7 @@
#include <linux/dcache.h>
#include <linux/exportfs.h>
#include <linux/zlib.h>
+#include <linux/slab.h>
#include "squashfs_fs.h"
#include "squashfs_fs_sb.h"
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index f393620890ee..af1914462f02 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -194,29 +194,26 @@ static int make_free_space(struct ubifs_info *c)
}
/**
- * ubifs_calc_min_idx_lebs - calculate amount of eraseblocks for the index.
+ * ubifs_calc_min_idx_lebs - calculate amount of LEBs for the index.
* @c: UBIFS file-system description object
*
- * This function calculates and returns the number of eraseblocks which should
- * be kept for index usage.
+ * This function calculates and returns the number of LEBs which should be kept
+ * for index usage.
*/
int ubifs_calc_min_idx_lebs(struct ubifs_info *c)
{
- int idx_lebs, eff_leb_size = c->leb_size - c->max_idx_node_sz;
+ int idx_lebs;
long long idx_size;
idx_size = c->old_idx_sz + c->budg_idx_growth + c->budg_uncommitted_idx;
-
/* And make sure we have thrice the index size of space reserved */
- idx_size = idx_size + (idx_size << 1);
-
+ idx_size += idx_size << 1;
/*
* We do not maintain 'old_idx_size' as 'old_idx_lebs'/'old_idx_bytes'
* pair, nor similarly the two variables for the new index size, so we
* have to do this costly 64-bit division on fast-path.
*/
- idx_size += eff_leb_size - 1;
- idx_lebs = div_u64(idx_size, eff_leb_size);
+ idx_lebs = div_u64(idx_size + c->idx_leb_size - 1, c->idx_leb_size);
/*
* The index head is not available for the in-the-gaps method, so add an
* extra LEB to compensate.
@@ -310,23 +307,23 @@ static int can_use_rp(struct ubifs_info *c)
* do_budget_space - reserve flash space for index and data growth.
* @c: UBIFS file-system description object
*
- * This function makes sure UBIFS has enough free eraseblocks for index growth
- * and data.
+ * This function makes sure UBIFS has enough free LEBs for index growth and
+ * data.
*
* When budgeting index space, UBIFS reserves thrice as many LEBs as the index
* would take if it was consolidated and written to the flash. This guarantees
* that the "in-the-gaps" commit method always succeeds and UBIFS will always
* be able to commit dirty index. So this function basically adds amount of
* budgeted index space to the size of the current index, multiplies this by 3,
- * and makes sure this does not exceed the amount of free eraseblocks.
+ * and makes sure this does not exceed the amount of free LEBs.
*
* Notes about @c->min_idx_lebs and @c->lst.idx_lebs variables:
* o @c->lst.idx_lebs is the number of LEBs the index currently uses. It might
* be large, because UBIFS does not do any index consolidation as long as
* there is free space. IOW, the index may take a lot of LEBs, but the LEBs
* will contain a lot of dirt.
- * o @c->min_idx_lebs is the the index presumably takes. IOW, the index may be
- * consolidated to take up to @c->min_idx_lebs LEBs.
+ * o @c->min_idx_lebs is the number of LEBs the index presumably takes. IOW,
+ * the index may be consolidated to take up to @c->min_idx_lebs LEBs.
*
* This function returns zero in case of success, and %-ENOSPC in case of
* failure.
@@ -695,12 +692,12 @@ long long ubifs_reported_space(const struct ubifs_info *c, long long free)
* This function calculates amount of free space to report to user-space.
*
* Because UBIFS may introduce substantial overhead (the index, node headers,
- * alignment, wastage at the end of eraseblocks, etc), it cannot report real
- * amount of free flash space it has (well, because not all dirty space is
- * reclaimable, UBIFS does not actually know the real amount). If UBIFS did so,
- * it would bread user expectations about what free space is. Users seem to
- * accustomed to assume that if the file-system reports N bytes of free space,
- * they would be able to fit a file of N bytes to the FS. This almost works for
+ * alignment, wastage at the end of LEBs, etc), it cannot report real amount of
+ * free flash space it has (well, because not all dirty space is reclaimable,
+ * UBIFS does not actually know the real amount). If UBIFS did so, it would
+ * break user expectations about what free space is. Users seem accustomed
+ * to assume that if the file-system reports N bytes of free space, they would
+ * be able to fit a file of N bytes to the FS. This almost works for
* traditional file-systems, because they have way less overhead than UBIFS.
* So, to keep users happy, UBIFS tries to take the overhead into account.
*/
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index e975bd82f38b..ce2cd8343618 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -479,9 +479,9 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
"bad or corrupted node)");
else {
for (i = 0; i < nlen && dent->name[i]; i++)
- printk("%c", dent->name[i]);
+ printk(KERN_CONT "%c", dent->name[i]);
}
- printk("\n");
+ printk(KERN_CONT "\n");
break;
}
@@ -1214,7 +1214,7 @@ static int dbg_check_znode(struct ubifs_info *c, struct ubifs_zbranch *zbr)
/*
* Make sure the last key in our znode is less or
- * equivalent than the the key in zbranch which goes
+ * equivalent than the key in the zbranch which goes
* after our pointing zbranch.
*/
cmp = keys_cmp(c, max,
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 0ff89fe71e51..6d34dc7e33e1 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -430,6 +430,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
struct ubifs_inode *ui = ubifs_inode(inode);
pgoff_t index = pos >> PAGE_CACHE_SHIFT;
int uninitialized_var(err), appending = !!(pos + len > inode->i_size);
+ int skipped_read = 0;
struct page *page;
ubifs_assert(ubifs_inode(inode)->ui_size == inode->i_size);
@@ -444,7 +445,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
if (!PageUptodate(page)) {
/* The page is not loaded from the flash */
- if (!(pos & ~PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE)
+ if (!(pos & ~PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE) {
/*
* We change whole page so no need to load it. But we
* have to set the @PG_checked flag to make the further
@@ -453,7 +454,8 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
* the media.
*/
SetPageChecked(page);
- else {
+ skipped_read = 1;
+ } else {
err = do_readpage(page);
if (err) {
unlock_page(page);
@@ -470,6 +472,14 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
if (unlikely(err)) {
ubifs_assert(err == -ENOSPC);
/*
+ * If we skipped reading the page because we were going to
+ * write all of it, then it is not up to date.
+ */
+ if (skipped_read) {
+ ClearPageChecked(page);
+ ClearPageUptodate(page);
+ }
+ /*
* Budgeting failed which means it would have to force
* write-back but didn't, because we set the @fast flag in the
* request. Write-back cannot be done now, while we have the
@@ -949,7 +959,7 @@ static int do_writepage(struct page *page, int len)
* whole index and correct all inode sizes, which is long an unacceptable.
*
* To prevent situations like this, UBIFS writes pages back only if they are
- * within last synchronized inode size, i.e. the the size which has been
+ * within the last synchronized inode size, i.e. the size which has been
* written to the flash media last time. Otherwise, UBIFS forces inode
* write-back, thus making sure the on-flash inode contains current inode size,
* and then keeps writing pages back.
diff --git a/fs/ubifs/find.c b/fs/ubifs/find.c
index 717d79c97c5e..1d54383d1269 100644
--- a/fs/ubifs/find.c
+++ b/fs/ubifs/find.c
@@ -478,7 +478,7 @@ const struct ubifs_lprops *do_find_free_space(struct ubifs_info *c,
* ubifs_find_free_space - find a data LEB with free space.
* @c: the UBIFS file-system description object
* @min_space: minimum amount of required free space
- * @free: contains amount of free space in the LEB on exit
+ * @offs: contains offset of where free space starts on exit
* @squeeze: whether to try to find space in a non-empty LEB first
*
* This function looks for an LEB with at least @min_space bytes of free space.
@@ -490,7 +490,7 @@ const struct ubifs_lprops *do_find_free_space(struct ubifs_info *c,
* failed to find a LEB with @min_space bytes of free space and other a negative
* error codes in case of failure.
*/
-int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *free,
+int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *offs,
int squeeze)
{
const struct ubifs_lprops *lprops;
@@ -558,10 +558,10 @@ int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *free,
spin_unlock(&c->space_lock);
}
- *free = lprops->free;
+ *offs = c->leb_size - lprops->free;
ubifs_release_lprops(c);
- if (*free == c->leb_size) {
+ if (*offs == 0) {
/*
* Ensure that empty LEBs have been unmapped. They may not have
* been, for example, because of an unclean unmount. Also
@@ -573,8 +573,8 @@ int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *free,
return err;
}
- dbg_find("found LEB %d, free %d", lnum, *free);
- ubifs_assert(*free >= min_space);
+ dbg_find("found LEB %d, free %d", lnum, c->leb_size - *offs);
+ ubifs_assert(*offs <= c->leb_size - min_space);
return lnum;
out:
diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index a711d33b3d3e..f0f5f15d384e 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -47,7 +47,7 @@
* have to waste large pieces of free space at the end of LEB B, because nodes
* from LEB A would not fit. And the worst situation is when all nodes are of
* maximum size. So dark watermark is the amount of free + dirty space in LEB
- * which are guaranteed to be reclaimable. If LEB has less space, the GC migh
+ * which are guaranteed to be reclaimable. If LEB has less space, the GC might
* be unable to reclaim it. So, LEBs with free + dirty greater than dark
* watermark are "good" LEBs from GC's point of few. The other LEBs are not so
* good, and GC takes extra care when moving them.
@@ -57,14 +57,6 @@
#include "ubifs.h"
/*
- * GC tries to optimize the way it fit nodes to available space, and it sorts
- * nodes a little. The below constants are watermarks which define "large",
- * "medium", and "small" nodes.
- */
-#define MEDIUM_NODE_WM (UBIFS_BLOCK_SIZE / 4)
-#define SMALL_NODE_WM UBIFS_MAX_DENT_NODE_SZ
-
-/*
* GC may need to move more than one LEB to make progress. The below constants
* define "soft" and "hard" limits on the number of LEBs the garbage collector
* may move.
@@ -116,83 +108,222 @@ static int switch_gc_head(struct ubifs_info *c)
}
/**
- * joinup - bring data nodes for an inode together.
- * @c: UBIFS file-system description object
- * @sleb: describes scanned LEB
- * @inum: inode number
- * @blk: block number
- * @data: list to which to add data nodes
+ * list_sort - sort a list.
+ * @priv: private data, passed to @cmp
+ * @head: the list to sort
+ * @cmp: the elements comparison function
*
- * This function looks at the first few nodes in the scanned LEB @sleb and adds
- * them to @data if they are data nodes from @inum and have a larger block
- * number than @blk. This function returns %0 on success and a negative error
- * code on failure.
+ * This function has been implemented by Mark J Roberts <mjr@znex.org>. It
+ * implements "merge sort" which has O(nlog(n)) complexity. The list is sorted
+ * in ascending order.
+ *
+ * The comparison function @cmp is supposed to return a negative value if @a is
+ * less than @b, and a positive value if @a is greater than @b. If @a and @b are
+ * equivalent, then it does not matter what this function returns.
*/
-static int joinup(struct ubifs_info *c, struct ubifs_scan_leb *sleb, ino_t inum,
- unsigned int blk, struct list_head *data)
+static void list_sort(void *priv, struct list_head *head,
+ int (*cmp)(void *priv, struct list_head *a,
+ struct list_head *b))
{
- int err, cnt = 6, lnum = sleb->lnum, offs;
- struct ubifs_scan_node *snod, *tmp;
- union ubifs_key *key;
+ struct list_head *p, *q, *e, *list, *tail, *oldhead;
+ int insize, nmerges, psize, qsize, i;
+
+ if (list_empty(head))
+ return;
+
+ list = head->next;
+ list_del(head);
+ insize = 1;
+ for (;;) {
+ p = oldhead = list;
+ list = tail = NULL;
+ nmerges = 0;
+
+ while (p) {
+ nmerges++;
+ q = p;
+ psize = 0;
+ for (i = 0; i < insize; i++) {
+ psize++;
+ q = q->next == oldhead ? NULL : q->next;
+ if (!q)
+ break;
+ }
- list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) {
- key = &snod->key;
- if (key_inum(c, key) == inum &&
- key_type(c, key) == UBIFS_DATA_KEY &&
- key_block(c, key) > blk) {
- offs = snod->offs;
- err = ubifs_tnc_has_node(c, key, 0, lnum, offs, 0);
- if (err < 0)
- return err;
- list_del(&snod->list);
- if (err) {
- list_add_tail(&snod->list, data);
- blk = key_block(c, key);
- } else
- kfree(snod);
- cnt = 6;
- } else if (--cnt == 0)
+ qsize = insize;
+ while (psize > 0 || (qsize > 0 && q)) {
+ if (!psize) {
+ e = q;
+ q = q->next;
+ qsize--;
+ if (q == oldhead)
+ q = NULL;
+ } else if (!qsize || !q) {
+ e = p;
+ p = p->next;
+ psize--;
+ if (p == oldhead)
+ p = NULL;
+ } else if (cmp(priv, p, q) <= 0) {
+ e = p;
+ p = p->next;
+ psize--;
+ if (p == oldhead)
+ p = NULL;
+ } else {
+ e = q;
+ q = q->next;
+ qsize--;
+ if (q == oldhead)
+ q = NULL;
+ }
+ if (tail)
+ tail->next = e;
+ else
+ list = e;
+ e->prev = tail;
+ tail = e;
+ }
+ p = q;
+ }
+
+ tail->next = list;
+ list->prev = tail;
+
+ if (nmerges <= 1)
break;
+
+ insize *= 2;
}
- return 0;
+
+ head->next = list;
+ head->prev = list->prev;
+ list->prev->next = head;
+ list->prev = head;
}
/**
- * move_nodes - move nodes.
+ * data_nodes_cmp - compare 2 data nodes.
+ * @priv: UBIFS file-system description object
+ * @a: first data node
+ * @b: second data node
+ *
+ * This function compares data nodes @a and @b. Returns %1 if @a has greater
+ * inode or block number, and %-1 otherwise.
+ */
+static int data_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
+{
+ ino_t inuma, inumb;
+ struct ubifs_info *c = priv; /* list_sort() hands back its @priv argument */
+ struct ubifs_scan_node *sa, *sb;
+
+ cond_resched(); /* called once per comparison while sorting */
+ sa = list_entry(a, struct ubifs_scan_node, list);
+ sb = list_entry(b, struct ubifs_scan_node, list);
+ ubifs_assert(key_type(c, &sa->key) == UBIFS_DATA_KEY);
+ ubifs_assert(key_type(c, &sb->key) == UBIFS_DATA_KEY);
+
+ inuma = key_inum(c, &sa->key);
+ inumb = key_inum(c, &sb->key);
+
+ if (inuma == inumb) { /* same inode: order by block number */
+ unsigned int blka = key_block(c, &sa->key);
+ unsigned int blkb = key_block(c, &sb->key);
+
+ if (blka <= blkb) /* equal blocks count as "not greater" */
+ return -1;
+ } else if (inuma <= inumb) /* different inodes: order by inode number */
+ return -1;
+
+ return 1;
+}
+
+/**
+ * nondata_nodes_cmp - compare 2 non-data nodes.
+ * @priv: UBIFS file-system description object
+ * @a: first node
+ * @b: second node
+ *
+ * This function compares nodes @a and @b. It makes sure that inode nodes go
+ * first and sorted by length in descending order. Directory entry nodes go
+ * after inode nodes, sorted by parent inode number, then ascending name hash.
+ */
+static int nondata_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
+{
+ int typea, typeb;
+ ino_t inuma, inumb;
+ struct ubifs_info *c = priv; /* list_sort() hands back its @priv argument */
+ struct ubifs_scan_node *sa, *sb;
+
+ cond_resched(); /* called once per comparison while sorting */
+ sa = list_entry(a, struct ubifs_scan_node, list);
+ sb = list_entry(b, struct ubifs_scan_node, list);
+ typea = key_type(c, &sa->key);
+ typeb = key_type(c, &sb->key);
+ ubifs_assert(typea != UBIFS_DATA_KEY && typeb != UBIFS_DATA_KEY);
+
+ /* Inodes go before directory entries */
+ if (typea == UBIFS_INO_KEY) {
+ if (typeb == UBIFS_INO_KEY)
+ return sb->len - sa->len; /* longer inode node sorts first */
+ return -1;
+ }
+ if (typeb == UBIFS_INO_KEY)
+ return 1;
+
+ /* NOTE(review): any other non-data key type reaching here trips this - confirm */
+ ubifs_assert(typea == UBIFS_DENT_KEY && typeb == UBIFS_DENT_KEY);
+ inuma = key_inum(c, &sa->key);
+ inumb = key_inum(c, &sb->key);
+
+ if (inuma == inumb) { /* same parent inode: order by name hash */
+ uint32_t hasha = key_hash(c, &sa->key);
+ uint32_t hashb = key_hash(c, &sb->key);
+
+ if (hasha <= hashb) /* equal hashes count as "not greater" */
+ return -1;
+ } else if (inuma <= inumb) /* different parents: order by inode number */
+ return -1;
+
+ return 1;
+}
+
+/**
+ * sort_nodes - sort nodes for GC.
* @c: UBIFS file-system description object
- * @sleb: describes nodes to move
+ * @sleb: describes nodes to sort and contains the result on exit
+ * @nondata: contains non-data nodes on exit
+ * @min: minimum node size is returned here
*
- * This function moves valid nodes from data LEB described by @sleb to the GC
- * journal head. The obsolete nodes are dropped.
+ * This function sorts the list of nodes to garbage collect. First of all, it
+ * kills obsolete nodes and separates data and non-data nodes to the
+ * @sleb->nodes and @nondata lists correspondingly.
+ *
+ * Data nodes are then sorted in block number order - this is important for
+ * bulk-read; data nodes with lower inode number go before data nodes with
+ * higher inode number, and data nodes with lower block number go before data
+ * nodes with higher block number;
*
- * When moving nodes we have to deal with classical bin-packing problem: the
- * space in the current GC journal head LEB and in @c->gc_lnum are the "bins",
- * where the nodes in the @sleb->nodes list are the elements which should be
- * fit optimally to the bins. This function uses the "first fit decreasing"
- * strategy, although it does not really sort the nodes but just split them on
- * 3 classes - large, medium, and small, so they are roughly sorted.
+ * Non-data nodes are sorted as follows.
+ * o First go inode nodes - they are sorted in descending length order.
+ * o Then go directory entry nodes - they are sorted in hash order, which
+ * should supposedly optimize 'readdir()'. Direntry nodes with lower parent
+ * inode number go before direntry nodes with higher parent inode number,
+ * and direntry nodes with lower name hash values go before direntry nodes
+ * with higher name hash values.
*
- * This function returns zero in case of success, %-EAGAIN if commit is
- * required, and other negative error codes in case of other failures.
+ * This function returns zero in case of success and a negative error code in
+ * case of failure.
*/
-static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb)
+static int sort_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
+ struct list_head *nondata, int *min)
{
struct ubifs_scan_node *snod, *tmp;
- struct list_head data, large, medium, small;
- struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;
- int avail, err, min = INT_MAX;
- unsigned int blk = 0;
- ino_t inum = 0;
- INIT_LIST_HEAD(&data);
- INIT_LIST_HEAD(&large);
- INIT_LIST_HEAD(&medium);
- INIT_LIST_HEAD(&small);
+ *min = INT_MAX;
- while (!list_empty(&sleb->nodes)) {
- struct list_head *lst = sleb->nodes.next;
-
- snod = list_entry(lst, struct ubifs_scan_node, list);
+ /* Separate data nodes and non-data nodes */
+ list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) {
+ int err;
ubifs_assert(snod->type != UBIFS_IDX_NODE);
ubifs_assert(snod->type != UBIFS_REF_NODE);
@@ -201,53 +332,72 @@ static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb)
err = ubifs_tnc_has_node(c, &snod->key, 0, sleb->lnum,
snod->offs, 0);
if (err < 0)
- goto out;
+ return err;
- list_del(lst);
if (!err) {
/* The node is obsolete, remove it from the list */
+ list_del(&snod->list);
kfree(snod);
continue;
}
- /*
- * Sort the list of nodes so that data nodes go first, large
- * nodes go second, and small nodes go last.
- */
- if (key_type(c, &snod->key) == UBIFS_DATA_KEY) {
- if (inum != key_inum(c, &snod->key)) {
- if (inum) {
- /*
- * Try to move data nodes from the same
- * inode together.
- */
- err = joinup(c, sleb, inum, blk, &data);
- if (err)
- goto out;
- }
- inum = key_inum(c, &snod->key);
- blk = key_block(c, &snod->key);
- }
- list_add_tail(lst, &data);
- } else if (snod->len > MEDIUM_NODE_WM)
- list_add_tail(lst, &large);
- else if (snod->len > SMALL_NODE_WM)
- list_add_tail(lst, &medium);
- else
- list_add_tail(lst, &small);
-
- /* And find the smallest node */
- if (snod->len < min)
- min = snod->len;
+ if (snod->len < *min)
+ *min = snod->len;
+
+ if (key_type(c, &snod->key) != UBIFS_DATA_KEY)
+ list_move_tail(&snod->list, nondata);
}
- /*
- * Join the tree lists so that we'd have one roughly sorted list
- * ('large' will be the head of the joined list).
- */
- list_splice(&data, &large);
- list_splice(&medium, large.prev);
- list_splice(&small, large.prev);
+ /* Sort data and non-data nodes */
+ list_sort(c, &sleb->nodes, &data_nodes_cmp);
+ list_sort(c, nondata, &nondata_nodes_cmp);
+ return 0;
+}
+
+/**
+ * move_node - move a node.
+ * @c: UBIFS file-system description object
+ * @sleb: describes the LEB to move nodes from
+ * @snod: the node to move
+ * @wbuf: write-buffer to move node to
+ *
+ * This function moves node @snod to @wbuf, changes TNC correspondingly, and
+ * destroys @snod (unless the write itself fails, then @snod stays listed).
+ * Returns zero in case of success and a negative error code otherwise.
+ */
+static int move_node(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
+ struct ubifs_scan_node *snod, struct ubifs_wbuf *wbuf)
+{
+ int err, new_lnum = wbuf->lnum, new_offs = wbuf->offs + wbuf->used;
+
+ cond_resched();
+ err = ubifs_wbuf_write_nolock(wbuf, snod->node, snod->len); /* new_lnum/new_offs were sampled before this call */
+ if (err)
+ return err; /* @snod is still on its list; the caller cleans up */
+
+ err = ubifs_tnc_replace(c, &snod->key, sleb->lnum,
+ snod->offs, new_lnum, new_offs,
+ snod->len);
+ list_del(&snod->list); /* unlinked and freed whether or not the TNC update succeeded */
+ kfree(snod);
+ return err;
+}
+
+/**
+ * move_nodes - move nodes.
+ * @c: UBIFS file-system description object
+ * @sleb: describes the LEB to move nodes from
+ *
+ * This function moves valid nodes from data LEB described by @sleb to the GC
+ * journal head. This function returns zero in case of success, %-EAGAIN if
+ * commit is required, and other negative error codes in case of other
+ * failures.
+ */
+static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb)
+{
+ int err, min;
+ LIST_HEAD(nondata);
+ struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;
if (wbuf->lnum == -1) {
/*
@@ -256,42 +406,59 @@ static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb)
*/
err = switch_gc_head(c);
if (err)
- goto out;
+ return err;
}
+ err = sort_nodes(c, sleb, &nondata, &min);
+ if (err)
+ goto out;
+
/* Write nodes to their new location. Use the first-fit strategy */
while (1) {
- avail = c->leb_size - wbuf->offs - wbuf->used;
- list_for_each_entry_safe(snod, tmp, &large, list) {
- int new_lnum, new_offs;
+ int avail;
+ struct ubifs_scan_node *snod, *tmp;
+
+ /* Move data nodes */
+ list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) {
+ avail = c->leb_size - wbuf->offs - wbuf->used;
+ if (snod->len > avail)
+ /*
+ * Do not skip data nodes in order to optimize
+ * bulk-read.
+ */
+ break;
+
+ err = move_node(c, sleb, snod, wbuf);
+ if (err)
+ goto out;
+ }
+ /* Move non-data nodes */
+ list_for_each_entry_safe(snod, tmp, &nondata, list) {
+ avail = c->leb_size - wbuf->offs - wbuf->used;
if (avail < min)
break;
- if (snod->len > avail)
- /* This node does not fit */
+ if (snod->len > avail) {
+ /*
+ * Keep going only if this is an inode with
+ * some data. Otherwise stop and switch the GC
+ * head. IOW, we assume that data-less inode
+ * nodes and direntry nodes are roughly of the
+ * same size.
+ */
+ if (key_type(c, &snod->key) == UBIFS_DENT_KEY ||
+ snod->len == UBIFS_INO_NODE_SZ)
+ break;
continue;
+ }
- cond_resched();
-
- new_lnum = wbuf->lnum;
- new_offs = wbuf->offs + wbuf->used;
- err = ubifs_wbuf_write_nolock(wbuf, snod->node,
- snod->len);
+ err = move_node(c, sleb, snod, wbuf);
if (err)
goto out;
- err = ubifs_tnc_replace(c, &snod->key, sleb->lnum,
- snod->offs, new_lnum, new_offs,
- snod->len);
- if (err)
- goto out;
-
- avail = c->leb_size - wbuf->offs - wbuf->used;
- list_del(&snod->list);
- kfree(snod);
}
- if (list_empty(&large))
+ if (list_empty(&sleb->nodes) && list_empty(&nondata))
break;
/*
@@ -306,10 +473,7 @@ static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb)
return 0;
out:
- list_for_each_entry_safe(snod, tmp, &large, list) {
- list_del(&snod->list);
- kfree(snod);
- }
+ list_splice_tail(&nondata, &sleb->nodes);
return err;
}
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index a11ca0958a23..64b5f3a309f5 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -114,7 +114,7 @@ static inline void zero_trun_node_unused(struct ubifs_trun_node *trun)
*/
static int reserve_space(struct ubifs_info *c, int jhead, int len)
{
- int err = 0, err1, retries = 0, avail, lnum, offs, free, squeeze;
+ int err = 0, err1, retries = 0, avail, lnum, offs, squeeze;
struct ubifs_wbuf *wbuf = &c->jheads[jhead].wbuf;
/*
@@ -139,10 +139,9 @@ again:
* Write buffer wasn't seek'ed or there is no enough space - look for an
* LEB with some empty space.
*/
- lnum = ubifs_find_free_space(c, len, &free, squeeze);
+ lnum = ubifs_find_free_space(c, len, &offs, squeeze);
if (lnum >= 0) {
/* Found an LEB, add it to the journal head */
- offs = c->leb_size - free;
err = ubifs_add_bud_to_log(c, jhead, lnum, offs);
if (err)
goto out_return;
@@ -1366,7 +1365,7 @@ out_ro:
* @host: host inode
*
* This function writes the updated version of an extended attribute inode and
- * the host inode tho the journal (to the base head). The host inode is written
+ * the host inode to the journal (to the base head). The host inode is written
* after the extended attribute inode in order to guarantee that the extended
* attribute will be flushed when the inode is synchronized by 'fsync()' and
* consequently, the write-buffer is synchronized. This function returns zero
diff --git a/fs/ubifs/key.h b/fs/ubifs/key.h
index efb3430a2581..5fa27ea031ba 100644
--- a/fs/ubifs/key.h
+++ b/fs/ubifs/key.h
@@ -381,8 +381,8 @@ static inline ino_t key_inum_flash(const struct ubifs_info *c, const void *k)
* @c: UBIFS file-system description object
* @key: the key to get hash from
*/
-static inline int key_hash(const struct ubifs_info *c,
- const union ubifs_key *key)
+static inline uint32_t key_hash(const struct ubifs_info *c,
+ const union ubifs_key *key)
{
return key->u32[1] & UBIFS_S_KEY_HASH_MASK;
}
@@ -392,7 +392,7 @@ static inline int key_hash(const struct ubifs_info *c,
* @c: UBIFS file-system description object
* @k: the key to get hash from
*/
-static inline int key_hash_flash(const struct ubifs_info *c, const void *k)
+static inline uint32_t key_hash_flash(const struct ubifs_info *c, const void *k)
{
const union ubifs_key *key = k;
diff --git a/fs/ubifs/log.c b/fs/ubifs/log.c
index 3e0aa7367556..56e33772a1ee 100644
--- a/fs/ubifs/log.c
+++ b/fs/ubifs/log.c
@@ -239,7 +239,7 @@ int ubifs_add_bud_to_log(struct ubifs_info *c, int jhead, int lnum, int offs)
}
/*
- * Make sure the the amount of space in buds will not exceed
+ * Make sure the amount of space in buds will not exceed the
* 'c->max_bud_bytes' limit, because we want to guarantee mount time
* limits.
*
@@ -367,7 +367,6 @@ static void remove_buds(struct ubifs_info *c)
bud->jhead, c->leb_size - bud->start,
c->cmt_bud_bytes);
rb_erase(p1, &c->buds);
- list_del(&bud->list);
/*
* If the commit does not finish, the recovery will need
* to replay the journal, in which case the old buds
@@ -375,7 +374,7 @@ static void remove_buds(struct ubifs_info *c)
* commit i.e. do not allow them to be garbage
* collected.
*/
- list_add(&bud->list, &c->old_buds);
+ list_move(&bud->list, &c->old_buds);
}
}
spin_unlock(&c->buds_lock);
diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index 3216a1f277f8..8cbfb8248025 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -229,7 +229,7 @@ static int layout_cnodes(struct ubifs_info *c)
while (offs + len > c->leb_size) {
alen = ALIGN(offs, c->min_io_size);
upd_ltab(c, lnum, c->leb_size - alen, alen - offs);
- dbg_chk_lpt_sz(c, 2, alen - offs);
+ dbg_chk_lpt_sz(c, 2, c->leb_size - offs);
err = alloc_lpt_leb(c, &lnum);
if (err)
goto no_space;
@@ -272,7 +272,7 @@ static int layout_cnodes(struct ubifs_info *c)
if (offs + c->lsave_sz > c->leb_size) {
alen = ALIGN(offs, c->min_io_size);
upd_ltab(c, lnum, c->leb_size - alen, alen - offs);
- dbg_chk_lpt_sz(c, 2, alen - offs);
+ dbg_chk_lpt_sz(c, 2, c->leb_size - offs);
err = alloc_lpt_leb(c, &lnum);
if (err)
goto no_space;
@@ -292,7 +292,7 @@ static int layout_cnodes(struct ubifs_info *c)
if (offs + c->ltab_sz > c->leb_size) {
alen = ALIGN(offs, c->min_io_size);
upd_ltab(c, lnum, c->leb_size - alen, alen - offs);
- dbg_chk_lpt_sz(c, 2, alen - offs);
+ dbg_chk_lpt_sz(c, 2, c->leb_size - offs);
err = alloc_lpt_leb(c, &lnum);
if (err)
goto no_space;
@@ -416,14 +416,12 @@ static int write_cnodes(struct ubifs_info *c)
alen, UBI_SHORTTERM);
if (err)
return err;
- dbg_chk_lpt_sz(c, 4, alen - wlen);
}
- dbg_chk_lpt_sz(c, 2, 0);
+ dbg_chk_lpt_sz(c, 2, c->leb_size - offs);
err = realloc_lpt_leb(c, &lnum);
if (err)
goto no_space;
- offs = 0;
- from = 0;
+ offs = from = 0;
ubifs_assert(lnum >= c->lpt_first &&
lnum <= c->lpt_last);
err = ubifs_leb_unmap(c, lnum);
@@ -477,11 +475,11 @@ static int write_cnodes(struct ubifs_info *c)
UBI_SHORTTERM);
if (err)
return err;
- dbg_chk_lpt_sz(c, 2, alen - wlen);
+ dbg_chk_lpt_sz(c, 2, c->leb_size - offs);
err = realloc_lpt_leb(c, &lnum);
if (err)
goto no_space;
- offs = 0;
+ offs = from = 0;
ubifs_assert(lnum >= c->lpt_first &&
lnum <= c->lpt_last);
err = ubifs_leb_unmap(c, lnum);
@@ -504,11 +502,11 @@ static int write_cnodes(struct ubifs_info *c)
UBI_SHORTTERM);
if (err)
return err;
- dbg_chk_lpt_sz(c, 2, alen - wlen);
+ dbg_chk_lpt_sz(c, 2, c->leb_size - offs);
err = realloc_lpt_leb(c, &lnum);
if (err)
goto no_space;
- offs = 0;
+ offs = from = 0;
ubifs_assert(lnum >= c->lpt_first &&
lnum <= c->lpt_last);
err = ubifs_leb_unmap(c, lnum);
@@ -1756,10 +1754,16 @@ int dbg_chk_lpt_free_spc(struct ubifs_info *c)
/**
* dbg_chk_lpt_sz - check LPT does not write more than LPT size.
* @c: the UBIFS file-system description object
- * @action: action
+ * @action: what to do
* @len: length written
*
* This function returns %0 on success and a negative error code on failure.
+ * The @action argument may be one of:
+ * o %0 - LPT debugging checking starts, initialize debugging variables;
+ * o %1 - wrote an LPT node, increase LPT size by @len bytes;
+ * o %2 - switched to a different LEB and wasted @len bytes;
+ * o %3 - check that we've written the right number of bytes;
+ * o %4 - wasted @len bytes.
*/
int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len)
{
@@ -1917,12 +1921,12 @@ static void dump_lpt_leb(const struct ubifs_info *c, int lnum)
lnum, offs);
err = ubifs_unpack_nnode(c, buf, &nnode);
for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
- printk("%d:%d", nnode.nbranch[i].lnum,
+ printk(KERN_CONT "%d:%d", nnode.nbranch[i].lnum,
nnode.nbranch[i].offs);
if (i != UBIFS_LPT_FANOUT - 1)
- printk(", ");
+ printk(KERN_CONT ", ");
}
- printk("\n");
+ printk(KERN_CONT "\n");
break;
}
case UBIFS_LPT_LTAB:
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index 90acac603e63..10662975d2ef 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -425,59 +425,35 @@ static void clean_buf(const struct ubifs_info *c, void **buf, int lnum,
* @lnum: LEB number of the LEB from which @buf was read
* @offs: offset from which @buf was read
*
- * This function scans @buf for more nodes and returns %0 is a node is found and
- * %1 if no more nodes are found.
+ * This function ensures that the corrupted node at @offs is the last thing
+ * written to a LEB. This function returns %1 if more data is not found and
+ * %0 if more data is found.
*/
static int no_more_nodes(const struct ubifs_info *c, void *buf, int len,
int lnum, int offs)
{
- int skip, next_offs = 0;
+ struct ubifs_ch *ch = buf;
+ int skip, dlen = le32_to_cpu(ch->len);
- if (len > UBIFS_DATA_NODE_SZ) {
- struct ubifs_ch *ch = buf;
- int dlen = le32_to_cpu(ch->len);
-
- if (ch->node_type == UBIFS_DATA_NODE && dlen >= UBIFS_CH_SZ &&
- dlen <= UBIFS_MAX_DATA_NODE_SZ)
- /* The corrupt node looks like a data node */
- next_offs = ALIGN(offs + dlen, 8);
- }
-
- if (c->min_io_size == 1)
- skip = 8;
- else
- skip = ALIGN(offs + 1, c->min_io_size) - offs;
-
- offs += skip;
- buf += skip;
- len -= skip;
- while (len > 8) {
- struct ubifs_ch *ch = buf;
- uint32_t magic = le32_to_cpu(ch->magic);
- int ret;
-
- if (magic == UBIFS_NODE_MAGIC) {
- ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 1);
- if (ret == SCANNED_A_NODE || ret > 0) {
- /*
- * There is a small chance this is just data in
- * a data node, so check that possibility. e.g.
- * this is part of a file that itself contains
- * a UBIFS image.
- */
- if (next_offs && offs + le32_to_cpu(ch->len) <=
- next_offs)
- continue;
- dbg_rcvry("unexpected node at %d:%d", lnum,
- offs);
- return 0;
- }
- }
- offs += 8;
- buf += 8;
- len -= 8;
+ /* Check for empty space after the corrupt node's common header */
+ skip = ALIGN(offs + UBIFS_CH_SZ, c->min_io_size) - offs;
+ if (is_empty(buf + skip, len - skip))
+ return 1;
+ /*
+ * The area after the common header size is not empty, so the common
+ * header must be intact. Check it.
+ */
+ if (ubifs_check_node(c, buf, lnum, offs, 1, 0) != -EUCLEAN) {
+ dbg_rcvry("unexpected bad common header at %d:%d", lnum, offs);
+ return 0;
}
- return 1;
+ /* Now we know the corrupt node's length we can skip over it */
+ skip = ALIGN(offs + dlen, c->min_io_size) - offs;
+ /* After which there should be empty space */
+ if (is_empty(buf + skip, len - skip))
+ return 1;
+ dbg_rcvry("unexpected data at %d:%d", lnum, offs + skip);
+ return 0;
}
/**
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
index ce42a7b0ca5a..11cc80125a49 100644
--- a/fs/ubifs/replay.c
+++ b/fs/ubifs/replay.c
@@ -143,7 +143,7 @@ static int set_bud_lprops(struct ubifs_info *c, struct replay_entry *r)
dirty -= c->leb_size - lp->free;
/*
* If the replay order was perfect the dirty space would now be
- * zero. The order is not perfect because the the journal heads
+ * zero. The order is not perfect because the journal heads
* race with each other. This is not a problem but is does mean
* that the dirty space may temporarily exceed c->leb_size
* during the replay.
diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c
index e070c643d1bb..57085e43320f 100644
--- a/fs/ubifs/sb.c
+++ b/fs/ubifs/sb.c
@@ -193,6 +193,7 @@ static int create_default_filesystem(struct ubifs_info *c)
if (tmp64 > DEFAULT_MAX_RP_SIZE)
tmp64 = DEFAULT_MAX_RP_SIZE;
sup->rp_size = cpu_to_le64(tmp64);
+ sup->ro_compat_version = cpu_to_le32(UBIFS_RO_COMPAT_VERSION);
err = ubifs_write_node(c, sup, UBIFS_SB_NODE_SZ, 0, 0, UBI_LONGTERM);
kfree(sup);
@@ -532,17 +533,39 @@ int ubifs_read_superblock(struct ubifs_info *c)
if (IS_ERR(sup))
return PTR_ERR(sup);
+ c->fmt_version = le32_to_cpu(sup->fmt_version);
+ c->ro_compat_version = le32_to_cpu(sup->ro_compat_version);
+
/*
* The software supports all previous versions but not future versions,
* due to the unavailability of time-travelling equipment.
*/
- c->fmt_version = le32_to_cpu(sup->fmt_version);
if (c->fmt_version > UBIFS_FORMAT_VERSION) {
- ubifs_err("on-flash format version is %d, but software only "
- "supports up to version %d", c->fmt_version,
- UBIFS_FORMAT_VERSION);
- err = -EINVAL;
- goto out;
+ struct super_block *sb = c->vfs_sb;
+ int mounting_ro = sb->s_flags & MS_RDONLY;
+
+ ubifs_assert(!c->ro_media || mounting_ro);
+ if (!mounting_ro ||
+ c->ro_compat_version > UBIFS_RO_COMPAT_VERSION) {
+ ubifs_err("on-flash format version is w%d/r%d, but "
+ "software only supports up to version "
+ "w%d/r%d", c->fmt_version,
+ c->ro_compat_version, UBIFS_FORMAT_VERSION,
+ UBIFS_RO_COMPAT_VERSION);
+ if (c->ro_compat_version <= UBIFS_RO_COMPAT_VERSION) {
+ ubifs_msg("only R/O mounting is possible");
+ err = -EROFS;
+ } else
+ err = -EINVAL;
+ goto out;
+ }
+
+ /*
+ * The FS is mounted R/O, and the media format is
+ * R/O-compatible with the UBIFS implementation, so we can
+ * mount.
+ */
+ c->rw_incompat = 1;
}
if (c->fmt_version < 3) {
@@ -623,7 +646,6 @@ int ubifs_read_superblock(struct ubifs_info *c)
c->main_lebs = c->leb_cnt - UBIFS_SB_LEBS - UBIFS_MST_LEBS;
c->main_lebs -= c->log_lebs + c->lpt_lebs + c->orph_lebs;
c->main_first = c->leb_cnt - c->main_lebs;
- c->report_rp_size = ubifs_reported_space(c, c->rp_size);
err = validate_sb(c, sup);
out:
diff --git a/fs/ubifs/shrinker.c b/fs/ubifs/shrinker.c
index e7bab52a1410..02feb59cefca 100644
--- a/fs/ubifs/shrinker.c
+++ b/fs/ubifs/shrinker.c
@@ -206,8 +206,7 @@ static int shrink_tnc_trees(int nr, int age, int *contention)
* Move this one to the end of the list to provide some
* fairness.
*/
- list_del(&c->infos_list);
- list_add_tail(&c->infos_list, &ubifs_infos);
+ list_move_tail(&c->infos_list, &ubifs_infos);
mutex_unlock(&c->umount_mutex);
if (freed >= nr)
break;
@@ -263,8 +262,7 @@ static int kick_a_thread(void)
}
if (i == 1) {
- list_del(&c->infos_list);
- list_add_tail(&c->infos_list, &ubifs_infos);
+ list_move_tail(&c->infos_list, &ubifs_infos);
spin_unlock(&ubifs_infos_lock);
ubifs_request_bg_commit(c);
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index c5c98355459a..faa44f90608a 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -421,8 +421,8 @@ static int ubifs_show_options(struct seq_file *s, struct vfsmount *mnt)
seq_printf(s, ",no_chk_data_crc");
if (c->mount_opts.override_compr) {
- seq_printf(s, ",compr=");
- seq_printf(s, ubifs_compr_name(c->mount_opts.compr_type));
+ seq_printf(s, ",compr=%s",
+ ubifs_compr_name(c->mount_opts.compr_type));
}
return 0;
@@ -700,6 +700,8 @@ static int init_constants_sb(struct ubifs_info *c)
if (err)
return err;
+ /* Initialize effective LEB size used in budgeting calculations */
+ c->idx_leb_size = c->leb_size - c->max_idx_node_sz;
return 0;
}
@@ -716,6 +718,7 @@ static void init_constants_master(struct ubifs_info *c)
long long tmp64;
c->min_idx_lebs = ubifs_calc_min_idx_lebs(c);
+ c->report_rp_size = ubifs_reported_space(c, c->rp_size);
/*
* Calculate total amount of FS blocks. This number is not used
@@ -1201,7 +1204,7 @@ static int mount_ubifs(struct ubifs_info *c)
goto out_cbuf;
/* Create background thread */
- c->bgt = kthread_create(ubifs_bg_thread, c, c->bgt_name);
+ c->bgt = kthread_create(ubifs_bg_thread, c, "%s", c->bgt_name);
if (IS_ERR(c->bgt)) {
err = PTR_ERR(c->bgt);
c->bgt = NULL;
@@ -1318,11 +1321,15 @@ static int mount_ubifs(struct ubifs_info *c)
else {
c->need_recovery = 0;
ubifs_msg("recovery completed");
- /* GC LEB has to be empty and taken at this point */
- ubifs_assert(c->lst.taken_empty_lebs == 1);
+ /*
+ * GC LEB has to be empty and taken at this point. But
+ * the journal head LEBs may also be accounted as
+ * "empty taken" if they are empty.
+ */
+ ubifs_assert(c->lst.taken_empty_lebs > 0);
}
} else
- ubifs_assert(c->lst.taken_empty_lebs == 1);
+ ubifs_assert(c->lst.taken_empty_lebs > 0);
err = dbg_check_filesystem(c);
if (err)
@@ -1344,8 +1351,9 @@ static int mount_ubifs(struct ubifs_info *c)
x = (long long)c->log_lebs * c->leb_size + c->max_bud_bytes;
ubifs_msg("journal size: %lld bytes (%lld KiB, %lld MiB, %d "
"LEBs)", x, x >> 10, x >> 20, c->log_lebs + c->max_bud_cnt);
- ubifs_msg("media format: %d (latest is %d)",
- c->fmt_version, UBIFS_FORMAT_VERSION);
+ ubifs_msg("media format: w%d/r%d (latest is w%d/r%d)",
+ c->fmt_version, c->ro_compat_version,
+ UBIFS_FORMAT_VERSION, UBIFS_RO_COMPAT_VERSION);
ubifs_msg("default compressor: %s", ubifs_compr_name(c->default_compr));
ubifs_msg("reserved for root: %llu bytes (%llu KiB)",
c->report_rp_size, c->report_rp_size >> 10);
@@ -1485,6 +1493,15 @@ static int ubifs_remount_rw(struct ubifs_info *c)
{
int err, lnum;
+ if (c->rw_incompat) {
+ ubifs_err("the file-system is not R/W-compatible");
+ ubifs_msg("on-flash format version is w%d/r%d, but software "
+ "only supports up to version w%d/r%d", c->fmt_version,
+ c->ro_compat_version, UBIFS_FORMAT_VERSION,
+ UBIFS_RO_COMPAT_VERSION);
+ return -EROFS;
+ }
+
mutex_lock(&c->umount_mutex);
dbg_save_space_info(c);
c->remounting_rw = 1;
@@ -1554,7 +1571,7 @@ static int ubifs_remount_rw(struct ubifs_info *c)
ubifs_create_buds_lists(c);
/* Create background thread */
- c->bgt = kthread_create(ubifs_bg_thread, c, c->bgt_name);
+ c->bgt = kthread_create(ubifs_bg_thread, c, "%s", c->bgt_name);
if (IS_ERR(c->bgt)) {
err = PTR_ERR(c->bgt);
c->bgt = NULL;
@@ -1775,7 +1792,7 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
c->bu.buf = NULL;
}
- ubifs_assert(c->lst.taken_empty_lebs == 1);
+ ubifs_assert(c->lst.taken_empty_lebs > 0);
return 0;
}
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index fa28a84c6a1b..f249f7b0d656 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -1252,7 +1252,7 @@ int ubifs_lookup_level0(struct ubifs_info *c, const union ubifs_key *key,
* splitting in the middle of the colliding sequence. Also, when
* removing the leftmost key, we would have to correct the key of the
* parent node, which would introduce additional complications. Namely,
- * if we changed the the leftmost key of the parent znode, the garbage
+ * if we changed the leftmost key of the parent znode, the garbage
* collector would be unable to find it (GC is doing this when GC'ing
* indexing LEBs). Although we already have an additional RB-tree where
* we save such changed znodes (see 'ins_clr_old_idx_znode()') until
diff --git a/fs/ubifs/ubifs-media.h b/fs/ubifs/ubifs-media.h
index b25fc36cf72f..3eee07e0c495 100644
--- a/fs/ubifs/ubifs-media.h
+++ b/fs/ubifs/ubifs-media.h
@@ -36,9 +36,31 @@
/* UBIFS node magic number (must not have the padding byte first or last) */
#define UBIFS_NODE_MAGIC 0x06101831
-/* UBIFS on-flash format version */
+/*
+ * UBIFS on-flash format version. This version is increased when the on-flash
+ * format is changing. If this happens, UBIFS will support older versions as
+ * well. But older UBIFS code will not support newer formats. Format changes
+ * will be rare and only when absolutely necessary, e.g. to fix a bug or to add
+ * a new feature.
+ *
+ * UBIFS went into mainline kernel with format version 4. The older formats
+ * were development formats.
+ */
#define UBIFS_FORMAT_VERSION 4
+/*
+ * Read-only compatibility version. If the UBIFS format is changed, older UBIFS
+ * implementations will not be able to mount newer formats in read-write mode.
+ * However, depending on the change, it may be possible to mount newer formats
+ * in R/O mode. This is indicated by the R/O compatibility version which is
+ * stored in the super-block.
+ *
+ * This is needed to support boot-loaders which only need R/O mounting. With
+ * this flag it is possible to do UBIFS format changes without a need to update
+ * boot-loaders.
+ */
+#define UBIFS_RO_COMPAT_VERSION 0
+
/* Minimum logical eraseblock size in bytes */
#define UBIFS_MIN_LEB_SZ (15*1024)
@@ -53,7 +75,7 @@
/*
* If compressed data length is less than %UBIFS_MIN_COMPRESS_DIFF bytes
- * shorter than uncompressed data length, UBIFS preferes to leave this data
+ * shorter than uncompressed data length, UBIFS prefers to leave this data
* node uncompress, because it'll be read faster.
*/
#define UBIFS_MIN_COMPRESS_DIFF 64
@@ -586,6 +608,7 @@ struct ubifs_pad_node {
* @padding2: reserved for future, zeroes
* @time_gran: time granularity in nanoseconds
* @uuid: UUID generated when the file system image was created
+ * @ro_compat_version: UBIFS R/O compatibility version
*/
struct ubifs_sb_node {
struct ubifs_ch ch;
@@ -612,7 +635,8 @@ struct ubifs_sb_node {
__le64 rp_size;
__le32 time_gran;
__u8 uuid[16];
- __u8 padding2[3972];
+ __le32 ro_compat_version;
+ __u8 padding2[3968];
} __attribute__ ((packed));
/**
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 039a68bee29a..0a8341e14088 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -934,6 +934,7 @@ struct ubifs_debug_info;
* by @commit_sem
* @cnt_lock: protects @highest_inum and @max_sqnum counters
* @fmt_version: UBIFS on-flash format version
+ * @ro_compat_version: R/O compatibility version
* @uuid: UUID from super block
*
* @lhead_lnum: log head logical eraseblock number
@@ -966,6 +967,7 @@ struct ubifs_debug_info;
* recovery)
* @bulk_read: enable bulk-reads
* @default_compr: default compression algorithm (%UBIFS_COMPR_LZO, etc)
+ * @rw_incompat: the media is not R/W compatible
*
* @tnc_mutex: protects the Tree Node Cache (TNC), @zroot, @cnext, @enext, and
* @calc_idx_sz
@@ -1015,6 +1017,8 @@ struct ubifs_debug_info;
* @min_io_shift: number of bits in @min_io_size minus one
* @leb_size: logical eraseblock size in bytes
* @half_leb_size: half LEB size
+ * @idx_leb_size: how many bytes of an LEB are effectively available when it is
+ * used to store indexing nodes (@leb_size - @max_idx_node_sz)
* @leb_cnt: count of logical eraseblocks
* @max_leb_cnt: maximum count of logical eraseblocks
* @old_leb_cnt: count of logical eraseblocks before re-size
@@ -1132,8 +1136,8 @@ struct ubifs_debug_info;
* previous commit start
* @uncat_list: list of un-categorized LEBs
* @empty_list: list of empty LEBs
- * @freeable_list: list of freeable non-index LEBs (free + dirty == leb_size)
- * @frdi_idx_list: list of freeable index LEBs (free + dirty == leb_size)
+ * @freeable_list: list of freeable non-index LEBs (free + dirty == @leb_size)
+ * @frdi_idx_list: list of freeable index LEBs (free + dirty == @leb_size)
* @freeable_cnt: number of freeable LEBs in @freeable_list
*
* @ltab_lnum: LEB number of LPT's own lprops table
@@ -1177,6 +1181,7 @@ struct ubifs_info {
unsigned long long cmt_no;
spinlock_t cnt_lock;
int fmt_version;
+ int ro_compat_version;
unsigned char uuid[16];
int lhead_lnum;
@@ -1205,6 +1210,7 @@ struct ubifs_info {
unsigned int no_chk_data_crc:1;
unsigned int bulk_read:1;
unsigned int default_compr:2;
+ unsigned int rw_incompat:1;
struct mutex tnc_mutex;
struct ubifs_zbranch zroot;
@@ -1253,6 +1259,7 @@ struct ubifs_info {
int min_io_shift;
int leb_size;
int half_leb_size;
+ int idx_leb_size;
int leb_cnt;
int max_leb_cnt;
int old_leb_cnt;
@@ -1500,7 +1507,7 @@ long long ubifs_reported_space(const struct ubifs_info *c, long long free);
long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs);
/* find.c */
-int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *free,
+int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *offs,
int squeeze);
int ubifs_find_free_leb_for_idx(struct ubifs_info *c);
int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp,