diff options
Diffstat (limited to 'fs')
303 files changed, 44215 insertions, 4198 deletions
diff --git a/fs/Kconfig b/fs/Kconfig index cef8b18ceaa3..9f7270f36b2a 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -66,6 +66,13 @@ config GENERIC_ACL bool select FS_POSIX_ACL +menu "Caches" + +source "fs/fscache/Kconfig" +source "fs/cachefiles/Kconfig" + +endmenu + if BLOCK menu "CD-ROM/DVD Filesystems" @@ -168,6 +175,33 @@ source "fs/qnx4/Kconfig" source "fs/romfs/Kconfig" source "fs/sysv/Kconfig" source "fs/ufs/Kconfig" +source "fs/exofs/Kconfig" + +config NILFS2_FS + tristate "NILFS2 file system support (EXPERIMENTAL)" + depends on BLOCK && EXPERIMENTAL + select CRC32 + help + NILFS2 is a log-structured file system (LFS) supporting continuous + snapshotting. In addition to versioning capability of the entire + file system, users can even restore files mistakenly overwritten or + destroyed just a few seconds ago. Since this file system can keep + consistency like conventional LFS, it achieves quick recovery after + system crashes. + + NILFS2 creates a number of checkpoints every few seconds or per + synchronous write basis (unless there is no change). Users can + select significant versions among continuously created checkpoints, + and can change them into snapshots which will be preserved for long + periods until they are changed back to checkpoints. Each + snapshot is mountable as a read-only file system concurrently with + its writable mount, and this feature is convenient for online backup. + + Some features including atime, extended attributes, and POSIX ACLs, + are not supported yet. + + To compile this file system support as a module, choose M here: the + module will be called nilfs2. If unsure, say N. endif # MISC_FILESYSTEMS diff --git a/fs/Makefile b/fs/Makefile index 6e82a307bcd4..af6d04700d9c 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -11,7 +11,7 @@ obj-y := open.o read_write.o file_table.o super.o \ attr.o bad_inode.o file.o filesystems.o namespace.o \ seq_file.o xattr.o libfs.o fs-writeback.o \ pnode.o drop_caches.o splice.o sync.o utimes.o \ - stack.o + stack.o fs_struct.o ifeq ($(CONFIG_BLOCK),y) obj-y += buffer.o bio.o block_dev.o direct-io.o mpage.o ioprio.o @@ -63,6 +63,7 @@ obj-$(CONFIG_PROFILING) += dcookies.o obj-$(CONFIG_DLM) += dlm/ # Do not add any filesystems before this line +obj-$(CONFIG_FSCACHE) += fscache/ obj-$(CONFIG_REISERFS_FS) += reiserfs/ obj-$(CONFIG_EXT3_FS) += ext3/ # Before ext2 so root fs can be ext3 obj-$(CONFIG_EXT2_FS) += ext2/ @@ -113,10 +114,13 @@ obj-$(CONFIG_JFS_FS) += jfs/ obj-$(CONFIG_XFS_FS) += xfs/ obj-$(CONFIG_9P_FS) += 9p/ obj-$(CONFIG_AFS_FS) += afs/ +obj-$(CONFIG_NILFS2_FS) += nilfs2/ obj-$(CONFIG_BEFS_FS) += befs/ obj-$(CONFIG_HOSTFS) += hostfs/ obj-$(CONFIG_HPPFS) += hppfs/ +obj-$(CONFIG_CACHEFILES) += cachefiles/ obj-$(CONFIG_DEBUG_FS) += debugfs/ obj-$(CONFIG_OCFS2_FS) += ocfs2/ obj-$(CONFIG_BTRFS_FS) += btrfs/ obj-$(CONFIG_GFS2_FS) += gfs2/ +obj-$(CONFIG_EXOFS_FS) += exofs/ diff --git a/fs/adfs/super.c b/fs/adfs/super.c index 7f83a46f2b7e..dd9becca4241 100644 --- a/fs/adfs/super.c +++ b/fs/adfs/super.c @@ -219,16 +219,20 @@ static int adfs_remount(struct super_block *sb, int *flags, char *data) static int adfs_statfs(struct dentry *dentry, struct kstatfs *buf) { - struct adfs_sb_info *asb = ADFS_SB(dentry->d_sb); + struct super_block *sb = dentry->d_sb; + struct adfs_sb_info *sbi = ADFS_SB(sb); + u64 id = huge_encode_dev(sb->s_bdev->bd_dev); buf->f_type = ADFS_SUPER_MAGIC; - buf->f_namelen = asb->s_namelen; - buf->f_bsize = dentry->d_sb->s_blocksize; - buf->f_blocks = asb->s_size; - buf->f_files = asb->s_ids_per_zone * asb->s_map_size; + buf->f_namelen = sbi->s_namelen; + buf->f_bsize = sb->s_blocksize; + buf->f_blocks = sbi->s_size; + buf->f_files = sbi->s_ids_per_zone * sbi->s_map_size; buf->f_bavail = - buf->f_bfree = adfs_map_free(dentry->d_sb); + buf->f_bfree = adfs_map_free(sb); buf->f_ffree = (long)(buf->f_bfree * buf->f_files) / (long)buf->f_blocks; + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); return 0; } diff --git a/fs/affs/super.c b/fs/affs/super.c index a19d64b582aa..5ce695e707fe 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -533,6 +533,7 @@ affs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; int free; + u64 id = huge_encode_dev(sb->s_bdev->bd_dev); pr_debug("AFFS: statfs() partsize=%d, reserved=%d\n",AFFS_SB(sb)->s_partition_size, AFFS_SB(sb)->s_reserved); @@ -543,6 +544,9 @@ affs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_blocks = AFFS_SB(sb)->s_partition_size - AFFS_SB(sb)->s_reserved; buf->f_bfree = free; buf->f_bavail = free; + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); + buf->f_namelen = 30; return 0; } diff --git a/fs/afs/Kconfig b/fs/afs/Kconfig index e7b522fe15e1..5c4e61d3c772 100644 --- a/fs/afs/Kconfig +++ b/fs/afs/Kconfig @@ -19,3 +19,11 @@ config AFS_DEBUG See <file:Documentation/filesystems/afs.txt> for more information. If unsure, say N. + +config AFS_FSCACHE + bool "Provide AFS client caching support (EXPERIMENTAL)" + depends on EXPERIMENTAL + depends on AFS_FS=m && FSCACHE || AFS_FS=y && FSCACHE=y + help + Say Y here if you want AFS data to be cached locally on disk through + the generic filesystem cache manager diff --git a/fs/afs/Makefile b/fs/afs/Makefile index a66671082cfb..4f64b95d57bd 100644 --- a/fs/afs/Makefile +++ b/fs/afs/Makefile @@ -2,7 +2,10 @@ # Makefile for Red Hat Linux AFS client. # +afs-cache-$(CONFIG_AFS_FSCACHE) := cache.o + kafs-objs := \ + $(afs-cache-y) \ callback.o \ cell.o \ cmservice.o \ diff --git a/fs/afs/cache.c b/fs/afs/cache.c index de0d7de69edc..e2b1d3f16519 100644 --- a/fs/afs/cache.c +++ b/fs/afs/cache.c @@ -1,6 +1,6 @@ /* AFS caching stuff * - * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * * This program is free software; you can redistribute it and/or @@ -9,248 +9,395 @@ * 2 of the License, or (at your option) any later version. */ -#ifdef AFS_CACHING_SUPPORT -static cachefs_match_val_t afs_cell_cache_match(void *target, - const void *entry); -static void afs_cell_cache_update(void *source, void *entry); - -struct cachefs_index_def afs_cache_cell_index_def = { - .name = "cell_ix", - .data_size = sizeof(struct afs_cache_cell), - .keys[0] = { CACHEFS_INDEX_KEYS_ASCIIZ, 64 }, - .match = afs_cell_cache_match, - .update = afs_cell_cache_update, +#include <linux/slab.h> +#include <linux/sched.h> +#include "internal.h" + +static uint16_t afs_cell_cache_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t buflen); +static uint16_t afs_cell_cache_get_aux(const void *cookie_netfs_data, + void *buffer, uint16_t buflen); +static enum fscache_checkaux afs_cell_cache_check_aux(void *cookie_netfs_data, + const void *buffer, + uint16_t buflen); + +static uint16_t afs_vlocation_cache_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t buflen); +static uint16_t afs_vlocation_cache_get_aux(const void *cookie_netfs_data, + void *buffer, uint16_t buflen); +static enum fscache_checkaux afs_vlocation_cache_check_aux( + void *cookie_netfs_data, const void *buffer, uint16_t buflen); + +static uint16_t afs_volume_cache_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t buflen); + +static uint16_t afs_vnode_cache_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t buflen); +static void afs_vnode_cache_get_attr(const void *cookie_netfs_data, + uint64_t *size); +static uint16_t afs_vnode_cache_get_aux(const void *cookie_netfs_data, + void *buffer, uint16_t buflen); +static enum fscache_checkaux afs_vnode_cache_check_aux(void *cookie_netfs_data, + const void *buffer, + uint16_t buflen); +static void afs_vnode_cache_now_uncached(void *cookie_netfs_data); + +struct fscache_netfs afs_cache_netfs = { + .name = "afs", + .version = 0, +}; + +struct fscache_cookie_def afs_cell_cache_index_def = { + .name = "AFS.cell", + .type = FSCACHE_COOKIE_TYPE_INDEX, + .get_key = afs_cell_cache_get_key, + .get_aux = afs_cell_cache_get_aux, + .check_aux = afs_cell_cache_check_aux, +}; + +struct fscache_cookie_def afs_vlocation_cache_index_def = { + .name = "AFS.vldb", + .type = FSCACHE_COOKIE_TYPE_INDEX, + .get_key = afs_vlocation_cache_get_key, + .get_aux = afs_vlocation_cache_get_aux, + .check_aux = afs_vlocation_cache_check_aux, +}; + +struct fscache_cookie_def afs_volume_cache_index_def = { + .name = "AFS.volume", + .type = FSCACHE_COOKIE_TYPE_INDEX, + .get_key = afs_volume_cache_get_key, +}; + +struct fscache_cookie_def afs_vnode_cache_index_def = { + .name = "AFS.vnode", + .type = FSCACHE_COOKIE_TYPE_DATAFILE, + .get_key = afs_vnode_cache_get_key, + .get_attr = afs_vnode_cache_get_attr, + .get_aux = afs_vnode_cache_get_aux, + .check_aux = afs_vnode_cache_check_aux, + .now_uncached = afs_vnode_cache_now_uncached, }; -#endif /* - * match a cell record obtained from the cache + * set the key for the index entry */ -#ifdef AFS_CACHING_SUPPORT -static cachefs_match_val_t afs_cell_cache_match(void *target, - const void *entry) +static uint16_t afs_cell_cache_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t bufmax) { - const struct afs_cache_cell *ccell = entry; - struct afs_cell *cell = target; + const struct afs_cell *cell = cookie_netfs_data; + uint16_t klen; - _enter("{%s},{%s}", ccell->name, cell->name); + _enter("%p,%p,%u", cell, buffer, bufmax); - if (strncmp(ccell->name, cell->name, sizeof(ccell->name)) == 0) { - _leave(" = SUCCESS"); - return CACHEFS_MATCH_SUCCESS; - } + klen = strlen(cell->name); + if (klen > bufmax) + return 0; - _leave(" = FAILED"); - return CACHEFS_MATCH_FAILED; + memcpy(buffer, cell->name, klen); + return klen; } -#endif /* - * update a cell record in the cache + * provide new auxilliary cache data */ -#ifdef AFS_CACHING_SUPPORT -static void afs_cell_cache_update(void *source, void *entry) +static uint16_t afs_cell_cache_get_aux(const void *cookie_netfs_data, + void *buffer, uint16_t bufmax) { - struct afs_cache_cell *ccell = entry; - struct afs_cell *cell = source; + const struct afs_cell *cell = cookie_netfs_data; + uint16_t dlen; - _enter("%p,%p", source, entry); + _enter("%p,%p,%u", cell, buffer, bufmax); - strncpy(ccell->name, cell->name, sizeof(ccell->name)); + dlen = cell->vl_naddrs * sizeof(cell->vl_addrs[0]); + dlen = min(dlen, bufmax); + dlen &= ~(sizeof(cell->vl_addrs[0]) - 1); - memcpy(ccell->vl_servers, - cell->vl_addrs, - min(sizeof(ccell->vl_servers), sizeof(cell->vl_addrs))); + memcpy(buffer, cell->vl_addrs, dlen); + return dlen; +} +/* + * check that the auxilliary data indicates that the entry is still valid + */ +static enum fscache_checkaux afs_cell_cache_check_aux(void *cookie_netfs_data, + const void *buffer, + uint16_t buflen) +{ + _leave(" = OKAY"); + return FSCACHE_CHECKAUX_OKAY; } -#endif - -#ifdef AFS_CACHING_SUPPORT -static cachefs_match_val_t afs_vlocation_cache_match(void *target, - const void *entry); -static void afs_vlocation_cache_update(void *source, void *entry); - -struct cachefs_index_def afs_vlocation_cache_index_def = { - .name = "vldb", - .data_size = sizeof(struct afs_cache_vlocation), - .keys[0] = { CACHEFS_INDEX_KEYS_ASCIIZ, 64 }, - .match = afs_vlocation_cache_match, - .update = afs_vlocation_cache_update, -}; -#endif +/*****************************************************************************/ /* - * match a VLDB record stored in the cache - * - may also load target from entry + * set the key for the index entry */ -#ifdef AFS_CACHING_SUPPORT -static cachefs_match_val_t afs_vlocation_cache_match(void *target, - const void *entry) +static uint16_t afs_vlocation_cache_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t bufmax) { - const struct afs_cache_vlocation *vldb = entry; - struct afs_vlocation *vlocation = target; + const struct afs_vlocation *vlocation = cookie_netfs_data; + uint16_t klen; + + _enter("{%s},%p,%u", vlocation->vldb.name, buffer, bufmax); + + klen = strnlen(vlocation->vldb.name, sizeof(vlocation->vldb.name)); + if (klen > bufmax) + return 0; - _enter("{%s},{%s}", vlocation->vldb.name, vldb->name); + memcpy(buffer, vlocation->vldb.name, klen); - if (strncmp(vlocation->vldb.name, vldb->name, sizeof(vldb->name)) == 0 - ) { - if (!vlocation->valid || - vlocation->vldb.rtime == vldb->rtime + _leave(" = %u", klen); + return klen; +} + +/* + * provide new auxilliary cache data + */ +static uint16_t afs_vlocation_cache_get_aux(const void *cookie_netfs_data, + void *buffer, uint16_t bufmax) +{ + const struct afs_vlocation *vlocation = cookie_netfs_data; + uint16_t dlen; + + _enter("{%s},%p,%u", vlocation->vldb.name, buffer, bufmax); + + dlen = sizeof(struct afs_cache_vlocation); + dlen -= offsetof(struct afs_cache_vlocation, nservers); + if (dlen > bufmax) + return 0; + + memcpy(buffer, (uint8_t *)&vlocation->vldb.nservers, dlen); + + _leave(" = %u", dlen); + return dlen; +} + +/* + * check that the auxilliary data indicates that the entry is still valid + */ +static +enum fscache_checkaux afs_vlocation_cache_check_aux(void *cookie_netfs_data, + const void *buffer, + uint16_t buflen) +{ + const struct afs_cache_vlocation *cvldb; + struct afs_vlocation *vlocation = cookie_netfs_data; + uint16_t dlen; + + _enter("{%s},%p,%u", vlocation->vldb.name, buffer, buflen); + + /* check the size of the data is what we're expecting */ + dlen = sizeof(struct afs_cache_vlocation); + dlen -= offsetof(struct afs_cache_vlocation, nservers); + if (dlen != buflen) + return FSCACHE_CHECKAUX_OBSOLETE; + + cvldb = container_of(buffer, struct afs_cache_vlocation, nservers); + + /* if what's on disk is more valid than what's in memory, then use the + * VL record from the cache */ + if (!vlocation->valid || vlocation->vldb.rtime == cvldb->rtime) { + memcpy((uint8_t *)&vlocation->vldb.nservers, buffer, dlen); + vlocation->valid = 1; + _leave(" = SUCCESS [c->m]"); + return FSCACHE_CHECKAUX_OKAY; + } + + /* need to update the cache if the cached info differs */ + if (memcmp(&vlocation->vldb, buffer, dlen) != 0) { + /* delete if the volume IDs for this name differ */ + if (memcmp(&vlocation->vldb.vid, &cvldb->vid, + sizeof(cvldb->vid)) != 0 ) { - vlocation->vldb = *vldb; - vlocation->valid = 1; - _leave(" = SUCCESS [c->m]"); - return CACHEFS_MATCH_SUCCESS; - } else if (memcmp(&vlocation->vldb, vldb, sizeof(*vldb)) != 0) { - /* delete if VIDs for this name differ */ - if (memcmp(&vlocation->vldb.vid, - &vldb->vid, - sizeof(vldb->vid)) != 0) { - _leave(" = DELETE"); - return CACHEFS_MATCH_SUCCESS_DELETE; - } - - _leave(" = UPDATE"); - return CACHEFS_MATCH_SUCCESS_UPDATE; - } else { - _leave(" = SUCCESS"); - return CACHEFS_MATCH_SUCCESS; + _leave(" = OBSOLETE"); + return FSCACHE_CHECKAUX_OBSOLETE; } + + _leave(" = UPDATE"); + return FSCACHE_CHECKAUX_NEEDS_UPDATE; } - _leave(" = FAILED"); - return CACHEFS_MATCH_FAILED; + _leave(" = OKAY"); + return FSCACHE_CHECKAUX_OKAY; } -#endif +/*****************************************************************************/ /* - * update a VLDB record stored in the cache + * set the key for the volume index entry */ -#ifdef AFS_CACHING_SUPPORT -static void afs_vlocation_cache_update(void *source, void *entry) +static uint16_t afs_volume_cache_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t bufmax) { - struct afs_cache_vlocation *vldb = entry; - struct afs_vlocation *vlocation = source; + const struct afs_volume *volume = cookie_netfs_data; + uint16_t klen; + + _enter("{%u},%p,%u", volume->type, buffer, bufmax); + + klen = sizeof(volume->type); + if (klen > bufmax) + return 0; - _enter(""); + memcpy(buffer, &volume->type, sizeof(volume->type)); + + _leave(" = %u", klen); + return klen; - *vldb = vlocation->vldb; } -#endif - -#ifdef AFS_CACHING_SUPPORT -static cachefs_match_val_t afs_volume_cache_match(void *target, - const void *entry); -static void afs_volume_cache_update(void *source, void *entry); - -struct cachefs_index_def afs_volume_cache_index_def = { - .name = "volume", - .data_size = sizeof(struct afs_cache_vhash), - .keys[0] = { CACHEFS_INDEX_KEYS_BIN, 1 }, - .keys[1] = { CACHEFS_INDEX_KEYS_BIN, 1 }, - .match = afs_volume_cache_match, - .update = afs_volume_cache_update, -}; -#endif +/*****************************************************************************/ /* - * match a volume hash record stored in the cache + * set the key for the index entry */ -#ifdef AFS_CACHING_SUPPORT -static cachefs_match_val_t afs_volume_cache_match(void *target, - const void *entry) +static uint16_t afs_vnode_cache_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t bufmax) { - const struct afs_cache_vhash *vhash = entry; - struct afs_volume *volume = target; + const struct afs_vnode *vnode = cookie_netfs_data; + uint16_t klen; - _enter("{%u},{%u}", volume->type, vhash->vtype); + _enter("{%x,%x,%llx},%p,%u", + vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version, + buffer, bufmax); - if (volume->type == vhash->vtype) { - _leave(" = SUCCESS"); - return CACHEFS_MATCH_SUCCESS; - } + klen = sizeof(vnode->fid.vnode); + if (klen > bufmax) + return 0; + + memcpy(buffer, &vnode->fid.vnode, sizeof(vnode->fid.vnode)); - _leave(" = FAILED"); - return CACHEFS_MATCH_FAILED; + _leave(" = %u", klen); + return klen; } -#endif /* - * update a volume hash record stored in the cache + * provide updated file attributes */ -#ifdef AFS_CACHING_SUPPORT -static void afs_volume_cache_update(void *source, void *entry) +static void afs_vnode_cache_get_attr(const void *cookie_netfs_data, + uint64_t *size) { - struct afs_cache_vhash *vhash = entry; - struct afs_volume *volume = source; + const struct afs_vnode *vnode = cookie_netfs_data; - _enter(""); + _enter("{%x,%x,%llx},", + vnode->fid.vnode, vnode->fid.unique, + vnode->status.data_version); - vhash->vtype = volume->type; + *size = vnode->status.size; } -#endif - -#ifdef AFS_CACHING_SUPPORT -static cachefs_match_val_t afs_vnode_cache_match(void *target, - const void *entry); -static void afs_vnode_cache_update(void *source, void *entry); - -struct cachefs_index_def afs_vnode_cache_index_def = { - .name = "vnode", - .data_size = sizeof(struct afs_cache_vnode), - .keys[0] = { CACHEFS_INDEX_KEYS_BIN, 4 }, - .match = afs_vnode_cache_match, - .update = afs_vnode_cache_update, -}; -#endif /* - * match a vnode record stored in the cache + * provide new auxilliary cache data + */ +static uint16_t afs_vnode_cache_get_aux(const void *cookie_netfs_data, + void *buffer, uint16_t bufmax) +{ + const struct afs_vnode *vnode = cookie_netfs_data; + uint16_t dlen; + + _enter("{%x,%x,%Lx},%p,%u", + vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version, + buffer, bufmax); + + dlen = sizeof(vnode->fid.unique) + sizeof(vnode->status.data_version); + if (dlen > bufmax) + return 0; + + memcpy(buffer, &vnode->fid.unique, sizeof(vnode->fid.unique)); + buffer += sizeof(vnode->fid.unique); + memcpy(buffer, &vnode->status.data_version, + sizeof(vnode->status.data_version)); + + _leave(" = %u", dlen); + return dlen; +} + +/* + * check that the auxilliary data indicates that the entry is still valid */ -#ifdef AFS_CACHING_SUPPORT -static cachefs_match_val_t afs_vnode_cache_match(void *target, - const void *entry) +static enum fscache_checkaux afs_vnode_cache_check_aux(void *cookie_netfs_data, + const void *buffer, + uint16_t buflen) { - const struct afs_cache_vnode *cvnode = entry; - struct afs_vnode *vnode = target; - - _enter("{%x,%x,%Lx},{%x,%x,%Lx}", - vnode->fid.vnode, - vnode->fid.unique, - vnode->status.version, - cvnode->vnode_id, - cvnode->vnode_unique, - cvnode->data_version); - - if (vnode->fid.vnode != cvnode->vnode_id) { - _leave(" = FAILED"); - return CACHEFS_MATCH_FAILED; + struct afs_vnode *vnode = cookie_netfs_data; + uint16_t dlen; + + _enter("{%x,%x,%llx},%p,%u", + vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version, + buffer, buflen); + + /* check the size of the data is what we're expecting */ + dlen = sizeof(vnode->fid.unique) + sizeof(vnode->status.data_version); + if (dlen != buflen) { + _leave(" = OBSOLETE [len %hx != %hx]", dlen, buflen); + return FSCACHE_CHECKAUX_OBSOLETE; } - if (vnode->fid.unique != cvnode->vnode_unique || - vnode->status.version != cvnode->data_version) { - _leave(" = DELETE"); - return CACHEFS_MATCH_SUCCESS_DELETE; + if (memcmp(buffer, + &vnode->fid.unique, + sizeof(vnode->fid.unique) + ) != 0) { + unsigned unique; + + memcpy(&unique, buffer, sizeof(unique)); + + _leave(" = OBSOLETE [uniq %x != %x]", + unique, vnode->fid.unique); + return FSCACHE_CHECKAUX_OBSOLETE; + } + + if (memcmp(buffer + sizeof(vnode->fid.unique), + &vnode->status.data_version, + sizeof(vnode->status.data_version) + ) != 0) { + afs_dataversion_t version; + + memcpy(&version, buffer + sizeof(vnode->fid.unique), + sizeof(version)); + + _leave(" = OBSOLETE [vers %llx != %llx]", + version, vnode->status.data_version); + return FSCACHE_CHECKAUX_OBSOLETE; } _leave(" = SUCCESS"); - return CACHEFS_MATCH_SUCCESS; + return FSCACHE_CHECKAUX_OKAY; } -#endif /* - * update a vnode record stored in the cache + * indication the cookie is no longer uncached + * - this function is called when the backing store currently caching a cookie + * is removed + * - the netfs should use this to clean up any markers indicating cached pages + * - this is mandatory for any object that may have data */ -#ifdef AFS_CACHING_SUPPORT -static void afs_vnode_cache_update(void *source, void *entry) +static void afs_vnode_cache_now_uncached(void *cookie_netfs_data) { - struct afs_cache_vnode *cvnode = entry; - struct afs_vnode *vnode = source; + struct afs_vnode *vnode = cookie_netfs_data; + struct pagevec pvec; + pgoff_t first; + int loop, nr_pages; + + _enter("{%x,%x,%Lx}", + vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version); + + pagevec_init(&pvec, 0); + first = 0; + + for (;;) { + /* grab a bunch of pages to clean */ + nr_pages = pagevec_lookup(&pvec, vnode->vfs_inode.i_mapping, + first, + PAGEVEC_SIZE - pagevec_count(&pvec)); + if (!nr_pages) + break; - _enter(""); + for (loop = 0; loop < nr_pages; loop++) + ClearPageFsCache(pvec.pages[loop]); + + first = pvec.pages[nr_pages - 1]->index + 1; + + pvec.nr = nr_pages; + pagevec_release(&pvec); + cond_resched(); + } - cvnode->vnode_id = vnode->fid.vnode; - cvnode->vnode_unique = vnode->fid.unique; - cvnode->data_version = vnode->status.version; + _leave(""); } -#endif diff --git a/fs/afs/cache.h b/fs/afs/cache.h index 36a3642cf90e..5c4f6b499e90 100644 --- a/fs/afs/cache.h +++ b/fs/afs/cache.h @@ -1,6 +1,6 @@ /* AFS local cache management interface * - * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved. + * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * * This program is free software; you can redistribute it and/or @@ -9,15 +9,4 @@ * 2 of the License, or (at your option) any later version. */ -#ifndef AFS_CACHE_H -#define AFS_CACHE_H - -#undef AFS_CACHING_SUPPORT - -#include <linux/mm.h> -#ifdef AFS_CACHING_SUPPORT -#include <linux/cachefs.h> -#endif -#include "types.h" - -#endif /* AFS_CACHE_H */ +#include <linux/fscache.h> diff --git a/fs/afs/cell.c b/fs/afs/cell.c index 5e1df14e16b1..e19c13f059ed 100644 --- a/fs/afs/cell.c +++ b/fs/afs/cell.c @@ -147,12 +147,11 @@ struct afs_cell *afs_cell_create(const char *name, char *vllist) if (ret < 0) goto error; -#ifdef AFS_CACHING_SUPPORT - /* put it up for caching */ - cachefs_acquire_cookie(afs_cache_netfs.primary_index, - &afs_vlocation_cache_index_def, - cell, - &cell->cache); +#ifdef CONFIG_AFS_FSCACHE + /* put it up for caching (this never returns an error) */ + cell->cache = fscache_acquire_cookie(afs_cache_netfs.primary_index, + &afs_cell_cache_index_def, + cell); #endif /* add to the cell lists */ @@ -362,10 +361,9 @@ static void afs_cell_destroy(struct afs_cell *cell) list_del_init(&cell->proc_link); up_write(&afs_proc_cells_sem); -#ifdef AFS_CACHING_SUPPORT - cachefs_relinquish_cookie(cell->cache, 0); +#ifdef CONFIG_AFS_FSCACHE + fscache_relinquish_cookie(cell->cache, 0); #endif - key_put(cell->anonymous_key); kfree(cell); diff --git a/fs/afs/file.c b/fs/afs/file.c index a3901769a96c..7a1d942ef68d 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -23,6 +23,9 @@ static void afs_invalidatepage(struct page *page, unsigned long offset); static int afs_releasepage(struct page *page, gfp_t gfp_flags); static int afs_launder_page(struct page *page); +static int afs_readpages(struct file *filp, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages); + const struct file_operations afs_file_operations = { .open = afs_open, .release = afs_release, @@ -46,6 +49,7 @@ const struct inode_operations afs_file_inode_operations = { const struct address_space_operations afs_fs_aops = { .readpage = afs_readpage, + .readpages = afs_readpages, .set_page_dirty = afs_set_page_dirty, .launder_page = afs_launder_page, .releasepage = afs_releasepage, @@ -101,37 +105,18 @@ int afs_release(struct inode *inode, struct file *file) /* * deal with notification that a page was read from the cache */ -#ifdef AFS_CACHING_SUPPORT -static void afs_readpage_read_complete(void *cookie_data, - struct page *page, - void *data, - int error) +static void afs_file_readpage_read_complete(struct page *page, + void *data, + int error) { - _enter("%p,%p,%p,%d", cookie_data, page, data, error); + _enter("%p,%p,%d", page, data, error); - if (error) - SetPageError(page); - else + /* if the read completes with an error, we just unlock the page and let + * the VM reissue the readpage */ + if (!error) SetPageUptodate(page); unlock_page(page); - } -#endif - -/* - * deal with notification that a page was written to the cache - */ -#ifdef AFS_CACHING_SUPPORT -static void afs_readpage_write_complete(void *cookie_data, - struct page *page, - void *data, - int error) -{ - _enter("%p,%p,%p,%d", cookie_data, page, data, error); - - unlock_page(page); -} -#endif /* * AFS read page from file, directory or symlink @@ -161,9 +146,9 @@ static int afs_readpage(struct file *file, struct page *page) if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) goto error; -#ifdef AFS_CACHING_SUPPORT /* is it cached? */ - ret = cachefs_read_or_alloc_page(vnode->cache, +#ifdef CONFIG_AFS_FSCACHE + ret = fscache_read_or_alloc_page(vnode->cache, page, afs_file_readpage_read_complete, NULL, @@ -171,20 +156,21 @@ static int afs_readpage(struct file *file, struct page *page) #else ret = -ENOBUFS; #endif - switch (ret) { - /* read BIO submitted and wb-journal entry found */ - case 1: - BUG(); // TODO - handle wb-journal match - /* read BIO submitted (page in cache) */ case 0: break; - /* no page available in cache */ - case -ENOBUFS: + /* page not yet cached */ case -ENODATA: + _debug("cache said ENODATA"); + goto go_on; + + /* page will not be cached */ + case -ENOBUFS: + _debug("cache said ENOBUFS"); default: + go_on: offset = page->index << PAGE_CACHE_SHIFT; len = min_t(size_t, i_size_read(inode) - offset, PAGE_SIZE); @@ -198,27 +184,25 @@ static int afs_readpage(struct file *file, struct page *page) set_bit(AFS_VNODE_DELETED, &vnode->flags); ret = -ESTALE; } -#ifdef AFS_CACHING_SUPPORT - cachefs_uncache_page(vnode->cache, page); + +#ifdef CONFIG_AFS_FSCACHE + fscache_uncache_page(vnode->cache, page); #endif + BUG_ON(PageFsCache(page)); goto error; } SetPageUptodate(page); -#ifdef AFS_CACHING_SUPPORT - if (cachefs_write_page(vnode->cache, - page, - afs_file_readpage_write_complete, - NULL, - GFP_KERNEL) != 0 - ) { - cachefs_uncache_page(vnode->cache, page); - unlock_page(page); + /* send the page to the cache */ +#ifdef CONFIG_AFS_FSCACHE + if (PageFsCache(page) && + fscache_write_page(vnode->cache, page, GFP_KERNEL) != 0) { + fscache_uncache_page(vnode->cache, page); + BUG_ON(PageFsCache(page)); } -#else - unlock_page(page); #endif + unlock_page(page); } _leave(" = 0"); @@ -232,34 +216,59 @@ error: } /* - * invalidate part or all of a page + * read a set of pages */ -static void afs_invalidatepage(struct page *page, unsigned long offset) +static int afs_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) { - int ret = 1; + struct afs_vnode *vnode; + int ret = 0; - _enter("{%lu},%lu", page->index, offset); + _enter(",{%lu},,%d", mapping->host->i_ino, nr_pages); - BUG_ON(!PageLocked(page)); + vnode = AFS_FS_I(mapping->host); + if (vnode->flags & AFS_VNODE_DELETED) { + _leave(" = -ESTALE"); + return -ESTALE; + } - if (PagePrivate(page)) { - /* We release buffers only if the entire page is being - * invalidated. - * The get_block cached value has been unconditionally - * invalidated, so real IO is not possible anymore. - */ - if (offset == 0) { - BUG_ON(!PageLocked(page)); - - ret = 0; - if (!PageWriteback(page)) - ret = page->mapping->a_ops->releasepage(page, - 0); - /* possibly should BUG_ON(!ret); - neilb */ - } + /* attempt to read as many of the pages as possible */ +#ifdef CONFIG_AFS_FSCACHE + ret = fscache_read_or_alloc_pages(vnode->cache, + mapping, + pages, + &nr_pages, + afs_file_readpage_read_complete, + NULL, + mapping_gfp_mask(mapping)); +#else + ret = -ENOBUFS; +#endif + + switch (ret) { + /* all pages are being read from the cache */ + case 0: + BUG_ON(!list_empty(pages)); + BUG_ON(nr_pages != 0); + _leave(" = 0 [reading all]"); + return 0; + + /* there were pages that couldn't be read from the cache */ + case -ENODATA: + case -ENOBUFS: + break; + + /* other error */ + default: + _leave(" = %d", ret); + return ret; } - _leave(" = %d", ret); + /* load the missing pages from the network */ + ret = read_cache_pages(mapping, pages, (void *) afs_readpage, file); + + _leave(" = %d [netting]", ret); + return ret; } /* @@ -273,25 +282,82 @@ static int afs_launder_page(struct page *page) } /* - * release a page and cleanup its private data + * invalidate part or all of a page + * - release a page and clean up its private data if offset is 0 (indicating + * the entire page) + */ +static void afs_invalidatepage(struct page *page, unsigned long offset) +{ + struct afs_writeback *wb = (struct afs_writeback *) page_private(page); + + _enter("{%lu},%lu", page->index, offset); + + BUG_ON(!PageLocked(page)); + + /* we clean up only if the entire page is being invalidated */ + if (offset == 0) { +#ifdef CONFIG_AFS_FSCACHE + if (PageFsCache(page)) { + struct afs_vnode *vnode = AFS_FS_I(page->mapping->host); + fscache_wait_on_page_write(vnode->cache, page); + fscache_uncache_page(vnode->cache, page); + ClearPageFsCache(page); + } +#endif + + if (PagePrivate(page)) { + if (wb && !PageWriteback(page)) { + set_page_private(page, 0); + afs_put_writeback(wb); + } + + if (!page_private(page)) + ClearPagePrivate(page); + } + } + + _leave(""); +} + +/* + * release a page and clean up its private state if it's not busy + * - return true if the page can now be released, false if not */ static int afs_releasepage(struct page *page, gfp_t gfp_flags) { + struct afs_writeback *wb = (struct afs_writeback *) page_private(page); struct afs_vnode *vnode = AFS_FS_I(page->mapping->host); - struct afs_writeback *wb; _enter("{{%x:%u}[%lu],%lx},%x", vnode->fid.vid, vnode->fid.vnode, page->index, page->flags, gfp_flags); + /* deny if page is being written to the cache and the caller hasn't + * elected to wait */ +#ifdef CONFIG_AFS_FSCACHE + if (PageFsCache(page)) { + if (fscache_check_page_write(vnode->cache, page)) { + if (!(gfp_flags & __GFP_WAIT)) { + _leave(" = F [cache busy]"); + return 0; + } + fscache_wait_on_page_write(vnode->cache, page); + } + + fscache_uncache_page(vnode->cache, page); + ClearPageFsCache(page); + } +#endif + if (PagePrivate(page)) { - wb = (struct afs_writeback *) page_private(page); - ASSERT(wb != NULL); - set_page_private(page, 0); + if (wb) { + set_page_private(page, 0); + afs_put_writeback(wb); + } ClearPagePrivate(page); - afs_put_writeback(wb); } - _leave(" = 0"); - return 0; + /* indicate that the page can be released */ + _leave(" = T"); + return 1; } diff --git a/fs/afs/inode.c b/fs/afs/inode.c index bb47217f6a18..c048f0658751 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -61,6 +61,11 @@ static int afs_inode_map_status(struct afs_vnode *vnode, struct key *key) return -EBADMSG; } +#ifdef CONFIG_AFS_FSCACHE + if (vnode->status.size != inode->i_size) + fscache_attr_changed(vnode->cache); +#endif + inode->i_nlink = vnode->status.nlink; inode->i_uid = vnode->status.owner; inode->i_gid = 0; @@ -149,15 +154,6 @@ struct inode *afs_iget(struct super_block *sb, struct key *key, return inode; } -#ifdef AFS_CACHING_SUPPORT - /* set up caching before reading the status, as fetch-status reads the - * first page of symlinks to see if they're really mntpts */ - cachefs_acquire_cookie(vnode->volume->cache, - NULL, - vnode, - &vnode->cache); -#endif - if (!status) { /* it's a remotely extant inode */ set_bit(AFS_VNODE_CB_BROKEN, &vnode->flags); @@ -183,6 +179,15 @@ struct inode *afs_iget(struct super_block *sb, struct key *key, } } + /* set up caching before mapping the status, as map-status reads the + * first page of symlinks to see if they're really mountpoints */ + inode->i_size = vnode->status.size; +#ifdef CONFIG_AFS_FSCACHE + vnode->cache = fscache_acquire_cookie(vnode->volume->cache, + &afs_vnode_cache_index_def, + vnode); +#endif + ret = afs_inode_map_status(vnode, key); if (ret < 0) goto bad_inode; @@ -196,6 +201,10 @@ struct inode *afs_iget(struct super_block *sb, struct key *key, /* failure */ bad_inode: +#ifdef CONFIG_AFS_FSCACHE + fscache_relinquish_cookie(vnode->cache, 0); + vnode->cache = NULL; +#endif iget_failed(inode); _leave(" = %d [bad]", ret); return ERR_PTR(ret); @@ -340,8 +349,8 @@ void afs_clear_inode(struct inode *inode) ASSERT(list_empty(&vnode->writebacks)); ASSERT(!vnode->cb_promised); -#ifdef AFS_CACHING_SUPPORT - cachefs_relinquish_cookie(vnode->cache, 0); +#ifdef CONFIG_AFS_FSCACHE + fscache_relinquish_cookie(vnode->cache, 0); vnode->cache = NULL; #endif diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 67f259d99cd6..106be66dafd2 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -21,6 +21,7 @@ #include "afs.h" #include "afs_vl.h" +#include "cache.h" #define AFS_CELL_MAX_ADDRS 15 @@ -193,8 +194,8 @@ struct afs_cell { struct key *anonymous_key; /* anonymous user key for this cell */ struct list_head proc_link; /* /proc cell list link */ struct proc_dir_entry *proc_dir; /* /proc dir for this cell */ -#ifdef AFS_CACHING_SUPPORT - struct cachefs_cookie *cache; /* caching cookie */ +#ifdef CONFIG_AFS_FSCACHE + struct fscache_cookie *cache; /* caching cookie */ #endif /* server record management */ @@ -249,8 +250,8 @@ struct afs_vlocation { struct list_head grave; /* link in master graveyard list */ struct list_head update; /* link in master update list */ struct afs_cell *cell; /* cell to which volume belongs */ -#ifdef AFS_CACHING_SUPPORT - struct cachefs_cookie *cache; /* caching cookie */ +#ifdef CONFIG_AFS_FSCACHE + struct fscache_cookie *cache; /* caching cookie */ #endif struct afs_cache_vlocation vldb; /* volume information DB record */ struct afs_volume *vols[3]; /* volume access record pointer (index by type) */ @@ -302,8 +303,8 @@ struct afs_volume { atomic_t usage; struct afs_cell *cell; /* cell to which belongs (unrefd ptr) */ struct afs_vlocation *vlocation; /* volume location */ -#ifdef AFS_CACHING_SUPPORT - struct cachefs_cookie *cache; /* caching cookie */ +#ifdef CONFIG_AFS_FSCACHE + struct fscache_cookie *cache; /* caching cookie */ #endif afs_volid_t vid; /* volume ID */ afs_voltype_t type; /* type of volume */ @@ -333,8 +334,8 @@ struct afs_vnode { struct afs_server *server; /* server currently supplying this file */ struct afs_fid fid; /* the file identifier for this inode */ struct afs_file_status status; /* AFS status info for this file */ -#ifdef AFS_CACHING_SUPPORT - struct cachefs_cookie *cache; /* caching cookie */ +#ifdef CONFIG_AFS_FSCACHE + struct fscache_cookie *cache; /* caching cookie */ #endif struct afs_permits *permits; /* cache of permits so far obtained */ struct mutex permits_lock; /* lock for altering permits list */ @@ -428,6 +429,22 @@ struct afs_uuid { /*****************************************************************************/ /* + * cache.c + */ +#ifdef CONFIG_AFS_FSCACHE +extern struct fscache_netfs afs_cache_netfs; +extern struct fscache_cookie_def afs_cell_cache_index_def; +extern struct fscache_cookie_def afs_vlocation_cache_index_def; +extern struct fscache_cookie_def afs_volume_cache_index_def; +extern struct fscache_cookie_def afs_vnode_cache_index_def; +#else +#define afs_cell_cache_index_def (*(struct fscache_cookie_def *) NULL) +#define afs_vlocation_cache_index_def (*(struct fscache_cookie_def *) NULL) +#define afs_volume_cache_index_def (*(struct fscache_cookie_def *) NULL) +#define afs_vnode_cache_index_def (*(struct fscache_cookie_def *) NULL) +#endif + +/* * callback.c */ extern void afs_init_callback_state(struct afs_server *); @@ -446,9 +463,6 @@ extern void afs_callback_update_kill(void); */ extern struct rw_semaphore afs_proc_cells_sem; extern struct list_head afs_proc_cells; -#ifdef AFS_CACHING_SUPPORT -extern struct cachefs_index_def afs_cache_cell_index_def; -#endif #define afs_get_cell(C) do { atomic_inc(&(C)->usage); } while(0) extern int afs_cell_init(char *); @@ -554,9 +568,6 @@ extern void afs_clear_inode(struct inode *); * main.c */ extern struct afs_uuid afs_uuid; -#ifdef AFS_CACHING_SUPPORT -extern struct cachefs_netfs afs_cache_netfs; -#endif /* * misc.c @@ -637,10 +648,6 @@ extern int afs_get_MAC_address(u8 *, size_t); /* * vlclient.c */ -#ifdef AFS_CACHING_SUPPORT -extern struct cachefs_index_def afs_vlocation_cache_index_def; -#endif - extern int afs_vl_get_entry_by_name(struct in_addr *, struct key *, const char *, struct afs_cache_vlocation *, const struct afs_wait_mode *); @@ -664,12 +671,6 @@ extern void afs_vlocation_purge(void); /* * vnode.c */ -#ifdef AFS_CACHING_SUPPORT -extern struct cachefs_index_def afs_vnode_cache_index_def; -#endif - -extern struct afs_timer_ops afs_vnode_cb_timed_out_ops; - static inline struct afs_vnode *AFS_FS_I(struct inode *inode) { return container_of(inode, struct afs_vnode, vfs_inode); @@ -711,10 +712,6 @@ extern int afs_vnode_release_lock(struct afs_vnode *, struct key *); /* * volume.c */ -#ifdef AFS_CACHING_SUPPORT -extern struct cachefs_index_def afs_volume_cache_index_def; -#endif - #define afs_get_volume(V) do { atomic_inc(&(V)->usage); } while(0) extern void afs_put_volume(struct afs_volume *); diff --git a/fs/afs/main.c b/fs/afs/main.c index 2d3e5d4fb9f7..66d54d348c55 100644 --- a/fs/afs/main.c +++ b/fs/afs/main.c @@ -1,6 +1,6 @@ /* AFS client file system * - * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved. + * Copyright (C) 2002,5 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * * This program is free software; you can redistribute it and/or @@ -29,18 +29,6 @@ static char *rootcell; module_param(rootcell, charp, 0); MODULE_PARM_DESC(rootcell, "root AFS cell name and VL server IP addr list"); -#ifdef AFS_CACHING_SUPPORT -static struct cachefs_netfs_operations afs_cache_ops = { - .get_page_cookie = afs_cache_get_page_cookie, -}; - -struct cachefs_netfs afs_cache_netfs = { - .name = "afs", - .version = 0, - .ops = &afs_cache_ops, -}; -#endif - struct afs_uuid afs_uuid; /* @@ -104,10 +92,9 @@ static int __init afs_init(void) if (ret < 0) return ret; -#ifdef AFS_CACHING_SUPPORT +#ifdef CONFIG_AFS_FSCACHE /* we want to be able to cache */ - ret = cachefs_register_netfs(&afs_cache_netfs, - &afs_cache_cell_index_def); + ret = fscache_register_netfs(&afs_cache_netfs); if (ret < 0) goto error_cache; #endif @@ -142,8 +129,8 @@ error_fs: error_open_socket: error_vl_update_init: error_cell_init: -#ifdef AFS_CACHING_SUPPORT - cachefs_unregister_netfs(&afs_cache_netfs); +#ifdef CONFIG_AFS_FSCACHE + fscache_unregister_netfs(&afs_cache_netfs); error_cache: #endif afs_callback_update_kill(); @@ -175,8 +162,8 @@ static void __exit afs_exit(void) afs_vlocation_purge(); flush_scheduled_work(); afs_cell_purge(); -#ifdef AFS_CACHING_SUPPORT - cachefs_unregister_netfs(&afs_cache_netfs); +#ifdef CONFIG_AFS_FSCACHE + fscache_unregister_netfs(&afs_cache_netfs); #endif afs_proc_cleanup(); rcu_barrier(); diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c index 78db4953a800..2b9e2d03a390 100644 --- a/fs/afs/mntpt.c +++ b/fs/afs/mntpt.c @@ -173,9 +173,9 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt) if (PageError(page)) goto error; - buf = kmap(page); + buf = kmap_atomic(page, KM_USER0); memcpy(devname, buf, size); - kunmap(page); + kunmap_atomic(buf, KM_USER0); page_cache_release(page); page = NULL; diff --git a/fs/afs/netdevices.c b/fs/afs/netdevices.c index 49f189423063..7ad36506c256 100644 --- a/fs/afs/netdevices.c +++ b/fs/afs/netdevices.c @@ -20,8 +20,7 @@ int afs_get_MAC_address(u8 *mac, size_t maclen) struct net_device *dev; int ret = -ENODEV; - if (maclen != ETH_ALEN) - BUG(); + BUG_ON(maclen != ETH_ALEN); rtnl_lock(); dev = __dev_getfirstbyhwtype(&init_net, ARPHRD_ETHER); diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c index 849fc3160cb5..ec2a7431e458 100644 --- a/fs/afs/vlocation.c +++ b/fs/afs/vlocation.c @@ -281,9 +281,8 @@ static void afs_vlocation_apply_update(struct afs_vlocation *vl, vl->vldb = *vldb; -#ifdef AFS_CACHING_SUPPORT - /* update volume entry in local cache */ - cachefs_update_cookie(vl->cache); +#ifdef CONFIG_AFS_FSCACHE + fscache_update_cookie(vl->cache); #endif } @@ -304,11 +303,9 @@ static int afs_vlocation_fill_in_record(struct afs_vlocation *vl, memset(&vldb, 0, sizeof(vldb)); /* see if we have an in-cache copy (will set vl->valid if there is) */ -#ifdef AFS_CACHING_SUPPORT - cachefs_acquire_cookie(cell->cache, - &afs_volume_cache_index_def, - vlocation, - &vl->cache); +#ifdef CONFIG_AFS_FSCACHE + vl->cache = fscache_acquire_cookie(vl->cell->cache, + &afs_vlocation_cache_index_def, vl); #endif if (vl->valid) { @@ -420,6 +417,11 @@ fill_in_record: spin_unlock(&vl->lock); wake_up(&vl->waitq); + /* update volume entry in local cache */ +#ifdef CONFIG_AFS_FSCACHE + fscache_update_cookie(vl->cache); +#endif + /* schedule for regular updates */ afs_vlocation_queue_for_updates(vl); goto success; @@ -465,7 +467,7 @@ found_in_memory: spin_unlock(&vl->lock); success: - _leave(" = %p",vl); + _leave(" = %p", vl); return vl; error_abandon: @@ -523,10 +525,9 @@ static void afs_vlocation_destroy(struct afs_vlocation *vl) { _enter("%p", vl); -#ifdef AFS_CACHING_SUPPORT - cachefs_relinquish_cookie(vl->cache, 0); +#ifdef CONFIG_AFS_FSCACHE + fscache_relinquish_cookie(vl->cache, 0); #endif - afs_put_cell(vl->cell); kfree(vl); } diff --git a/fs/afs/volume.c b/fs/afs/volume.c index 8bab0e3437f9..a353e69e2391 100644 --- a/fs/afs/volume.c +++ b/fs/afs/volume.c @@ -124,13 +124,11 @@ struct afs_volume *afs_volume_lookup(struct afs_mount_params *params) } /* attach the cache and volume location */ -#ifdef AFS_CACHING_SUPPORT - cachefs_acquire_cookie(vlocation->cache, - &afs_vnode_cache_index_def, - volume, - &volume->cache); +#ifdef CONFIG_AFS_FSCACHE + volume->cache = fscache_acquire_cookie(vlocation->cache, + &afs_volume_cache_index_def, + volume); #endif - afs_get_vlocation(vlocation); volume->vlocation = vlocation; @@ -194,8 +192,8 @@ void afs_put_volume(struct afs_volume *volume) up_write(&vlocation->cell->vl_sem); /* finish cleaning up the volume */ -#ifdef AFS_CACHING_SUPPORT - cachefs_relinquish_cookie(volume->cache, 0); +#ifdef CONFIG_AFS_FSCACHE + fscache_relinquish_cookie(volume->cache, 0); #endif afs_put_vlocation(vlocation); diff --git a/fs/afs/write.c b/fs/afs/write.c index 3fb36d433621..c2e7a7ff0080 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -780,3 +780,24 @@ int afs_fsync(struct file *file, struct dentry *dentry, int datasync) _leave(" = %d", ret); return ret; } + +/* + * notification that a previously read-only page is about to become writable + * - if it returns an error, the caller will deliver a bus error signal + */ +int afs_page_mkwrite(struct vm_area_struct *vma, struct page *page) +{ + struct afs_vnode *vnode = AFS_FS_I(vma->vm_file->f_mapping->host); + + _enter("{{%x:%u}},{%lx}", + vnode->fid.vid, vnode->fid.vnode, page->index); + + /* wait for the page to be written to the cache before we allow it to + * be modified */ +#ifdef CONFIG_AFS_FSCACHE + fscache_wait_on_page_write(vnode->cache, page); +#endif + + _leave(" = 0"); + return 0; +} diff --git a/fs/befs/debug.c b/fs/befs/debug.c index b8e304a0661e..622e73775c83 100644 --- a/fs/befs/debug.c +++ b/fs/befs/debug.c @@ -17,6 +17,7 @@ #include <linux/spinlock.h> #include <linux/kernel.h> #include <linux/fs.h> +#include <linux/slab.h> #endif /* __KERNEL__ */ diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index d06cb023ad02..76afd0d6b86c 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -900,6 +900,7 @@ static int befs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; + u64 id = huge_encode_dev(sb->s_bdev->bd_dev); befs_debug(sb, "---> befs_statfs()"); @@ -910,6 +911,8 @@ befs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_bavail = buf->f_bfree; buf->f_files = 0; /* UNKNOWN */ buf->f_ffree = 0; /* UNKNOWN */ + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); buf->f_namelen = BEFS_NAME_LEN; befs_debug(sb, "<--- befs_statfs()"); diff --git a/fs/befs/super.c b/fs/befs/super.c index 41f2b4d0093e..ca40f828f64d 100644 --- a/fs/befs/super.c +++ b/fs/befs/super.c @@ -8,6 +8,7 @@ */ #include <linux/fs.h> +#include <asm/page.h> /* for PAGE_SIZE */ #include "befs.h" #include "super.h" diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 33b7235f853b..40381df34869 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -12,8 +12,6 @@ #include <linux/module.h> #include <linux/kernel.h> #include <linux/fs.h> -#include <linux/stat.h> -#include <linux/time.h> #include <linux/mm.h> #include <linux/mman.h> #include <linux/errno.h> @@ -21,20 +19,15 @@ #include <linux/binfmts.h> #include <linux/string.h> #include <linux/file.h> -#include <linux/fcntl.h> -#include <linux/ptrace.h> #include <linux/slab.h> -#include <linux/shm.h> #include <linux/personality.h> #include <linux/elfcore.h> #include <linux/init.h> #include <linux/highuid.h> -#include <linux/smp.h> #include <linux/compiler.h> #include <linux/highmem.h> #include <linux/pagemap.h> #include <linux/security.h> -#include <linux/syscalls.h> #include <linux/random.h> #include <linux/elf.h> #include <linux/utsname.h> @@ -576,7 +569,6 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) unsigned long error; struct elf_phdr *elf_ppnt, *elf_phdata; unsigned long elf_bss, elf_brk; - int elf_exec_fileno; int retval, i; unsigned int size; unsigned long elf_entry; @@ -631,12 +623,6 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) goto out_free_ph; } - retval = get_unused_fd(); - if (retval < 0) - goto out_free_ph; - get_file(bprm->file); - fd_install(elf_exec_fileno = retval, bprm->file); - elf_ppnt = elf_phdata; elf_bss = 0; elf_brk = 0; @@ -655,13 +641,13 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) retval = -ENOEXEC; if (elf_ppnt->p_filesz > PATH_MAX || elf_ppnt->p_filesz < 2) - goto out_free_file; + goto out_free_ph; retval = -ENOMEM; elf_interpreter = kmalloc(elf_ppnt->p_filesz, GFP_KERNEL); if (!elf_interpreter) - goto out_free_file; + goto out_free_ph; retval = kernel_read(bprm->file, elf_ppnt->p_offset, elf_interpreter, @@ -956,8 +942,6 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) kfree(elf_phdata); - sys_close(elf_exec_fileno); - set_binfmt(&elf_format); #ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES @@ -1028,8 +1012,6 @@ out_free_dentry: fput(interpreter); out_free_interp: kfree(elf_interpreter); -out_free_file: - sys_close(elf_exec_fileno); out_free_ph: kfree(elf_phdata); goto out; diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index f3e72c5c19f5..70cfc4b84ae0 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -972,9 +972,12 @@ static int elf_fdpic_map_file_constdisp_on_uclinux( params->elfhdr_addr = seg->addr; /* clear any space allocated but not loaded */ - if (phdr->p_filesz < phdr->p_memsz) - clear_user((void *) (seg->addr + phdr->p_filesz), - phdr->p_memsz - phdr->p_filesz); + if (phdr->p_filesz < phdr->p_memsz) { + ret = clear_user((void *) (seg->addr + phdr->p_filesz), + phdr->p_memsz - phdr->p_filesz); + if (ret) + return ret; + } if (mm) { if (phdr->p_flags & PF_X) { @@ -1014,7 +1017,7 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params, struct elf32_fdpic_loadseg *seg; struct elf32_phdr *phdr; unsigned long load_addr, delta_vaddr; - int loop, dvset; + int loop, dvset, ret; load_addr = params->load_addr; delta_vaddr = 0; @@ -1114,7 +1117,9 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params, * PT_LOAD */ if (prot & PROT_WRITE && disp > 0) { kdebug("clear[%d] ad=%lx sz=%lx", loop, maddr, disp); - clear_user((void __user *) maddr, disp); + ret = clear_user((void __user *) maddr, disp); + if (ret) + return ret; maddr += disp; } @@ -1149,15 +1154,19 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params, if (prot & PROT_WRITE && excess1 > 0) { kdebug("clear[%d] ad=%lx sz=%lx", loop, maddr + phdr->p_filesz, excess1); - clear_user((void __user *) maddr + phdr->p_filesz, - excess1); + ret = clear_user((void __user *) maddr + phdr->p_filesz, + excess1); + if (ret) + return ret; } #else if (excess > 0) { kdebug("clear[%d] ad=%lx sz=%lx", loop, maddr + phdr->p_filesz, excess); - clear_user((void *) maddr + phdr->p_filesz, excess); + ret = clear_user((void *) maddr + phdr->p_filesz, excess); + if (ret) + return ret; } #endif diff --git a/fs/binfmt_som.c b/fs/binfmt_som.c index 08644a61616e..eff74b9c9e77 100644 --- a/fs/binfmt_som.c +++ b/fs/binfmt_som.c @@ -188,7 +188,6 @@ out: static int load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs) { - int som_exec_fileno; int retval; unsigned int size; unsigned long som_entry; @@ -220,12 +219,6 @@ load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs) goto out_free; } - retval = get_unused_fd(); - if (retval < 0) - goto out_free; - get_file(bprm->file); - fd_install(som_exec_fileno = retval, bprm->file); - /* Flush all traces of the currently running executable */ retval = flush_old_exec(bprm); if (retval) @@ -1420,8 +1420,7 @@ static void bio_pair_end_2(struct bio *bi, int err) } /* - * split a bio - only worry about a bio with a single page - * in it's iovec + * split a bio - only worry about a bio with a single page in its iovec */ struct bio_pair *bio_split(struct bio *bi, int first_sectors) { diff --git a/fs/block_dev.c b/fs/block_dev.c index 8c3c6899ccf3..f45dbc18dd17 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -204,6 +204,7 @@ int fsync_bdev(struct block_device *bdev) } return sync_blockdev(bdev); } +EXPORT_SYMBOL(fsync_bdev); /** * freeze_bdev -- lock a filesystem and force it into a consistent state diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 1d53b62dbba5..7fdd184a528d 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -256,7 +256,7 @@ int btrfs_init_acl(struct inode *inode, struct inode *dir) } if (!acl) - inode->i_mode &= ~current->fs->umask; + inode->i_mode &= ~current_umask(); } if (IS_POSIXACL(dir) && acl) { diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index c84ca1f5259a..51bfdfc8fcda 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -20,7 +20,6 @@ #include <linux/list.h> #include <linux/spinlock.h> #include <linux/freezer.h> -#include <linux/ftrace.h> #include "async-thread.h" #define WORK_QUEUED_BIT 0 @@ -195,6 +194,9 @@ again_locked: if (!list_empty(&worker->pending)) continue; + if (kthread_should_stop()) + break; + /* still no more work?, sleep for real */ spin_lock_irq(&worker->lock); set_current_state(TASK_INTERRUPTIBLE); @@ -208,7 +210,8 @@ again_locked: worker->working = 0; spin_unlock_irq(&worker->lock); - schedule(); + if (!kthread_should_stop()) + schedule(); } __set_current_state(TASK_RUNNING); } diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index dbb724124633..e5b2533b691a 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1244,9 +1244,9 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, * readahead one full node of leaves, finding things that are close * to the block in 'slot', and triggering ra on them. */ -static noinline void reada_for_search(struct btrfs_root *root, - struct btrfs_path *path, - int level, int slot, u64 objectid) +static void reada_for_search(struct btrfs_root *root, + struct btrfs_path *path, + int level, int slot, u64 objectid) { struct extent_buffer *node; struct btrfs_disk_key disk_key; @@ -1447,6 +1447,117 @@ noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level) } /* + * helper function for btrfs_search_slot. The goal is to find a block + * in cache without setting the path to blocking. If we find the block + * we return zero and the path is unchanged. + * + * If we can't find the block, we set the path blocking and do some + * reada. -EAGAIN is returned and the search must be repeated. + */ +static int +read_block_for_search(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *p, + struct extent_buffer **eb_ret, int level, int slot, + struct btrfs_key *key) +{ + u64 blocknr; + u64 gen; + u32 blocksize; + struct extent_buffer *b = *eb_ret; + struct extent_buffer *tmp; + + blocknr = btrfs_node_blockptr(b, slot); + gen = btrfs_node_ptr_generation(b, slot); + blocksize = btrfs_level_size(root, level - 1); + + tmp = btrfs_find_tree_block(root, blocknr, blocksize); + if (tmp && btrfs_buffer_uptodate(tmp, gen)) { + *eb_ret = tmp; + return 0; + } + + /* + * reduce lock contention at high levels + * of the btree by dropping locks before + * we read. + */ + btrfs_release_path(NULL, p); + if (tmp) + free_extent_buffer(tmp); + if (p->reada) + reada_for_search(root, p, level, slot, key->objectid); + + tmp = read_tree_block(root, blocknr, blocksize, gen); + if (tmp) + free_extent_buffer(tmp); + return -EAGAIN; +} + +/* + * helper function for btrfs_search_slot. This does all of the checks + * for node-level blocks and does any balancing required based on + * the ins_len. + * + * If no extra work was required, zero is returned. If we had to + * drop the path, -EAGAIN is returned and btrfs_search_slot must + * start over + */ +static int +setup_nodes_for_search(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *p, + struct extent_buffer *b, int level, int ins_len) +{ + int ret; + if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >= + BTRFS_NODEPTRS_PER_BLOCK(root) - 3) { + int sret; + + sret = reada_for_balance(root, p, level); + if (sret) + goto again; + + btrfs_set_path_blocking(p); + sret = split_node(trans, root, p, level); + btrfs_clear_path_blocking(p, NULL); + + BUG_ON(sret > 0); + if (sret) { + ret = sret; + goto done; + } + b = p->nodes[level]; + } else if (ins_len < 0 && btrfs_header_nritems(b) < + BTRFS_NODEPTRS_PER_BLOCK(root) / 4) { + int sret; + + sret = reada_for_balance(root, p, level); + if (sret) + goto again; + + btrfs_set_path_blocking(p); + sret = balance_level(trans, root, p, level); + btrfs_clear_path_blocking(p, NULL); + + if (sret) { + ret = sret; + goto done; + } + b = p->nodes[level]; + if (!b) { + btrfs_release_path(NULL, p); + goto again; + } + BUG_ON(btrfs_header_nritems(b) == 1); + } + return 0; + +again: + ret = -EAGAIN; +done: + return ret; +} + +/* * look for key in the tree. path is filled in with nodes along the way * if key is found, we return zero and you can find the item in the leaf * level of the path (level 0) @@ -1464,16 +1575,11 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root ins_len, int cow) { struct extent_buffer *b; - struct extent_buffer *tmp; int slot; int ret; int level; - int should_reada = p->reada; int lowest_unlock = 1; - int blocksize; u8 lowest_level = 0; - u64 blocknr; - u64 gen; lowest_level = p->lowest_level; WARN_ON(lowest_level && ins_len > 0); @@ -1502,7 +1608,11 @@ again: if (cow) { int wret; - /* is a cow on this block not required */ + /* + * if we don't really need to cow this block + * then we don't want to set the path blocking, + * so we test it here + */ if (btrfs_header_generation(b) == trans->transid && btrfs_header_owner(b) == root->root_key.objectid && !btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) { @@ -1557,51 +1667,15 @@ cow_done: if (ret && slot > 0) slot -= 1; p->slots[level] = slot; - if ((p->search_for_split || ins_len > 0) && - btrfs_header_nritems(b) >= - BTRFS_NODEPTRS_PER_BLOCK(root) - 3) { - int sret; - - sret = reada_for_balance(root, p, level); - if (sret) - goto again; - - btrfs_set_path_blocking(p); - sret = split_node(trans, root, p, level); - btrfs_clear_path_blocking(p, NULL); - - BUG_ON(sret > 0); - if (sret) { - ret = sret; - goto done; - } - b = p->nodes[level]; - slot = p->slots[level]; - } else if (ins_len < 0 && - btrfs_header_nritems(b) < - BTRFS_NODEPTRS_PER_BLOCK(root) / 4) { - int sret; - - sret = reada_for_balance(root, p, level); - if (sret) - goto again; - - btrfs_set_path_blocking(p); - sret = balance_level(trans, root, p, level); - btrfs_clear_path_blocking(p, NULL); + ret = setup_nodes_for_search(trans, root, p, b, level, + ins_len); + if (ret == -EAGAIN) + goto again; + else if (ret) + goto done; + b = p->nodes[level]; + slot = p->slots[level]; - if (sret) { - ret = sret; - goto done; - } - b = p->nodes[level]; - if (!b) { - btrfs_release_path(NULL, p); - goto again; - } - slot = p->slots[level]; - BUG_ON(btrfs_header_nritems(b) == 1); - } unlock_up(p, level, lowest_unlock); /* this is only true while dropping a snapshot */ @@ -1610,44 +1684,11 @@ cow_done: goto done; } - blocknr = btrfs_node_blockptr(b, slot); - gen = btrfs_node_ptr_generation(b, slot); - blocksize = btrfs_level_size(root, level - 1); + ret = read_block_for_search(trans, root, p, + &b, level, slot, key); + if (ret == -EAGAIN) + goto again; - tmp = btrfs_find_tree_block(root, blocknr, blocksize); - if (tmp && btrfs_buffer_uptodate(tmp, gen)) { - b = tmp; - } else { - /* - * reduce lock contention at high levels - * of the btree by dropping locks before - * we read. - */ - if (level > 0) { - btrfs_release_path(NULL, p); - if (tmp) - free_extent_buffer(tmp); - if (should_reada) - reada_for_search(root, p, - level, slot, - key->objectid); - - tmp = read_tree_block(root, blocknr, - blocksize, gen); - if (tmp) - free_extent_buffer(tmp); - goto again; - } else { - btrfs_set_path_blocking(p); - if (tmp) - free_extent_buffer(tmp); - if (should_reada) - reada_for_search(root, p, - level, slot, - key->objectid); - b = read_node_slot(root, b, slot); - } - } if (!p->skip_locking) { int lret; @@ -2116,8 +2157,7 @@ static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root BUG_ON(!path->nodes[level]); lower = path->nodes[level]; nritems = btrfs_header_nritems(lower); - if (slot > nritems) - BUG(); + BUG_ON(slot > nritems); if (nritems == BTRFS_NODEPTRS_PER_BLOCK(root)) BUG(); if (slot != nritems) { @@ -4086,28 +4126,44 @@ next: int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) { int slot; - int level = 1; + int level; struct extent_buffer *c; - struct extent_buffer *next = NULL; + struct extent_buffer *next; struct btrfs_key key; u32 nritems; int ret; + int old_spinning = path->leave_spinning; + int force_blocking = 0; nritems = btrfs_header_nritems(path->nodes[0]); if (nritems == 0) return 1; - btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1); + /* + * we take the blocks in an order that upsets lockdep. Using + * blocking mode is the only way around it. + */ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + force_blocking = 1; +#endif + btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1); +again: + level = 1; + next = NULL; btrfs_release_path(root, path); + path->keep_locks = 1; + + if (!force_blocking) + path->leave_spinning = 1; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); path->keep_locks = 0; if (ret < 0) return ret; - btrfs_set_path_blocking(path); nritems = btrfs_header_nritems(path->nodes[0]); /* * by releasing the path above we dropped all our locks. A balance @@ -4117,19 +4173,24 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) */ if (nritems > 0 && path->slots[0] < nritems - 1) { path->slots[0]++; + ret = 0; goto done; } while (level < BTRFS_MAX_LEVEL) { - if (!path->nodes[level]) - return 1; + if (!path->nodes[level]) { + ret = 1; + goto done; + } slot = path->slots[level] + 1; c = path->nodes[level]; if (slot >= btrfs_header_nritems(c)) { level++; - if (level == BTRFS_MAX_LEVEL) - return 1; + if (level == BTRFS_MAX_LEVEL) { + ret = 1; + goto done; + } continue; } @@ -4138,16 +4199,22 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) free_extent_buffer(next); } - /* the path was set to blocking above */ - if (level == 1 && (path->locks[1] || path->skip_locking) && - path->reada) - reada_for_search(root, path, level, slot, 0); + next = c; + ret = read_block_for_search(NULL, root, path, &next, level, + slot, &key); + if (ret == -EAGAIN) + goto again; - next = read_node_slot(root, c, slot); if (!path->skip_locking) { - btrfs_assert_tree_locked(c); - btrfs_tree_lock(next); - btrfs_set_lock_blocking(next); + ret = btrfs_try_spin_lock(next); + if (!ret) { + btrfs_set_path_blocking(path); + btrfs_tree_lock(next); + if (!force_blocking) + btrfs_clear_path_blocking(path, next); + } + if (force_blocking) + btrfs_set_lock_blocking(next); } break; } @@ -4157,27 +4224,42 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) c = path->nodes[level]; if (path->locks[level]) btrfs_tree_unlock(c); + free_extent_buffer(c); path->nodes[level] = next; path->slots[level] = 0; if (!path->skip_locking) path->locks[level] = 1; + if (!level) break; - btrfs_set_path_blocking(path); - if (level == 1 && path->locks[1] && path->reada) - reada_for_search(root, path, level, slot, 0); - next = read_node_slot(root, next, 0); + ret = read_block_for_search(NULL, root, path, &next, level, + 0, &key); + if (ret == -EAGAIN) + goto again; + if (!path->skip_locking) { btrfs_assert_tree_locked(path->nodes[level]); - btrfs_tree_lock(next); - btrfs_set_lock_blocking(next); + ret = btrfs_try_spin_lock(next); + if (!ret) { + btrfs_set_path_blocking(path); + btrfs_tree_lock(next); + if (!force_blocking) + btrfs_clear_path_blocking(path, next); + } + if (force_blocking) + btrfs_set_lock_blocking(next); } } + ret = 0; done: unlock_up(path, 0, 1); - return 0; + path->leave_spinning = old_spinning; + if (!old_spinning) + btrfs_set_path_blocking(path); + + return ret; } /* diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 9417713542a2..ad96495dedc5 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -143,12 +143,15 @@ static int btrfs_csum_sizes[] = { 4, 0 }; #define BTRFS_FT_MAX 9 /* - * the key defines the order in the tree, and so it also defines (optimal) - * block layout. objectid corresonds to the inode number. The flags - * tells us things about the object, and is a kind of stream selector. - * so for a given inode, keys with flags of 1 might refer to the inode - * data, flags of 2 may point to file data in the btree and flags == 3 - * may point to extents. + * The key defines the order in the tree, and so it also defines (optimal) + * block layout. + * + * objectid corresponds to the inode number. + * + * type tells us things about the object, and is a kind of stream selector. + * so for a given inode, keys with type of 1 might refer to the inode data, + * type of 2 may point to file data in the btree and type == 3 may point to + * extents. * * offset is the starting byte offset for this key in the stream. * @@ -200,7 +203,7 @@ struct btrfs_dev_item { /* * starting byte of this partition on the device, - * to allowr for stripe alignment in the future + * to allow for stripe alignment in the future */ __le64 start_offset; @@ -633,18 +636,35 @@ struct btrfs_space_info { struct rw_semaphore groups_sem; }; -struct btrfs_free_space { - struct rb_node bytes_index; - struct rb_node offset_index; - u64 offset; - u64 bytes; +/* + * free clusters are used to claim free space in relatively large chunks, + * allowing us to do less seeky writes. They are used for all metadata + * allocations and data allocations in ssd mode. + */ +struct btrfs_free_cluster { + spinlock_t lock; + spinlock_t refill_lock; + struct rb_root root; + + /* largest extent in this cluster */ + u64 max_size; + + /* first extent starting offset */ + u64 window_start; + + struct btrfs_block_group_cache *block_group; + /* + * when a cluster is allocated from a block group, we put the + * cluster onto a list in the block group so that it can + * be freed before the block group is freed. + */ + struct list_head block_group_list; }; struct btrfs_block_group_cache { struct btrfs_key key; struct btrfs_block_group_item item; spinlock_t lock; - struct mutex alloc_mutex; struct mutex cache_mutex; u64 pinned; u64 reserved; @@ -656,6 +676,7 @@ struct btrfs_block_group_cache { struct btrfs_space_info *space_info; /* free space cache stuff */ + spinlock_t tree_lock; struct rb_root free_space_bytes; struct rb_root free_space_offset; @@ -667,6 +688,11 @@ struct btrfs_block_group_cache { /* usage count */ atomic_t count; + + /* List of struct btrfs_free_clusters for this block group. + * Today it will only have one thing on it, but that may change + */ + struct list_head cluster_list; }; struct btrfs_leaf_ref_tree { @@ -728,7 +754,6 @@ struct btrfs_fs_info { struct mutex tree_log_mutex; struct mutex transaction_kthread_mutex; struct mutex cleaner_mutex; - struct mutex pinned_mutex; struct mutex chunk_mutex; struct mutex drop_mutex; struct mutex volume_mutex; @@ -839,8 +864,12 @@ struct btrfs_fs_info { spinlock_t delalloc_lock; spinlock_t new_trans_lock; u64 delalloc_bytes; - u64 last_alloc; - u64 last_data_alloc; + + /* data_alloc_cluster is only used in ssd mode */ + struct btrfs_free_cluster data_alloc_cluster; + + /* all metadata allocations go through this cluster */ + struct btrfs_free_cluster meta_alloc_cluster; spinlock_t ref_cache_lock; u64 total_ref_cache_size; @@ -932,7 +961,6 @@ struct btrfs_root { }; /* - * inode items have the data typically returned from stat and store other * info about object characteristics. There is one for every file and dir in * the FS @@ -963,7 +991,7 @@ struct btrfs_root { #define BTRFS_EXTENT_CSUM_KEY 128 /* - * root items point to tree roots. There are typically in the root + * root items point to tree roots. They are typically in the root * tree used by the super block to find all the other trees */ #define BTRFS_ROOT_ITEM_KEY 132 @@ -1010,6 +1038,8 @@ struct btrfs_root { #define BTRFS_MOUNT_SSD (1 << 3) #define BTRFS_MOUNT_DEGRADED (1 << 4) #define BTRFS_MOUNT_COMPRESS (1 << 5) +#define BTRFS_MOUNT_NOTREELOG (1 << 6) +#define BTRFS_MOUNT_FLUSHONCOMMIT (1 << 7) #define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) #define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) @@ -1748,6 +1778,7 @@ static inline struct dentry *fdentry(struct file *file) } /* extent-tree.c */ +void btrfs_put_block_group(struct btrfs_block_group_cache *cache); int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_root *root, unsigned long count); int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len); @@ -2174,21 +2205,4 @@ int btrfs_check_acl(struct inode *inode, int mask); int btrfs_init_acl(struct inode *inode, struct inode *dir); int btrfs_acl_chmod(struct inode *inode); -/* free-space-cache.c */ -int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, - u64 bytenr, u64 size); -int btrfs_add_free_space_lock(struct btrfs_block_group_cache *block_group, - u64 offset, u64 bytes); -int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, - u64 bytenr, u64 size); -int btrfs_remove_free_space_lock(struct btrfs_block_group_cache *block_group, - u64 offset, u64 bytes); -void btrfs_remove_free_space_cache(struct btrfs_block_group_cache - *block_group); -struct btrfs_free_space *btrfs_find_free_space(struct btrfs_block_group_cache - *block_group, u64 offset, - u64 bytes); -void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, - u64 bytes); -u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group); #endif diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index cbf7dc8ae3ec..d6c01c096a40 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -18,7 +18,6 @@ #include <linux/sched.h> #include <linux/sort.h> -#include <linux/ftrace.h> #include "ctree.h" #include "delayed-ref.h" #include "transaction.h" diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 92d73929d381..92caa8035f36 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -38,6 +38,7 @@ #include "locking.h" #include "ref-cache.h" #include "tree-log.h" +#include "free-space-cache.h" static struct extent_io_ops btree_extent_io_ops; static void end_workqueue_fn(struct btrfs_work *work); @@ -1412,8 +1413,6 @@ static int bio_ready_for_csum(struct bio *bio) ret = extent_range_uptodate(io_tree, start + length, start + buf_len - 1); - if (ret == 1) - return ret; return ret; } @@ -1647,12 +1646,15 @@ struct btrfs_root *open_ctree(struct super_block *sb, mutex_init(&fs_info->ordered_operations_mutex); mutex_init(&fs_info->tree_log_mutex); mutex_init(&fs_info->drop_mutex); - mutex_init(&fs_info->pinned_mutex); mutex_init(&fs_info->chunk_mutex); mutex_init(&fs_info->transaction_kthread_mutex); mutex_init(&fs_info->cleaner_mutex); mutex_init(&fs_info->volume_mutex); mutex_init(&fs_info->tree_reloc_mutex); + + btrfs_init_free_cluster(&fs_info->meta_alloc_cluster); + btrfs_init_free_cluster(&fs_info->data_alloc_cluster); + init_waitqueue_head(&fs_info->transaction_throttle); init_waitqueue_head(&fs_info->transaction_wait); init_waitqueue_head(&fs_info->async_submit_wait); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index f5e7cae63d80..178df4c67de4 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -31,6 +31,7 @@ #include "volumes.h" #include "locking.h" #include "ref-cache.h" +#include "free-space-cache.h" #define PENDING_EXTENT_INSERT 0 #define PENDING_EXTENT_DELETE 1 @@ -166,7 +167,6 @@ static int add_new_free_space(struct btrfs_block_group_cache *block_group, u64 extent_start, extent_end, size; int ret; - mutex_lock(&info->pinned_mutex); while (start < end) { ret = find_first_extent_bit(&info->pinned_extents, start, &extent_start, &extent_end, @@ -192,7 +192,6 @@ static int add_new_free_space(struct btrfs_block_group_cache *block_group, ret = btrfs_add_free_space(block_group, start, size); BUG_ON(ret); } - mutex_unlock(&info->pinned_mutex); return 0; } @@ -291,8 +290,8 @@ next: block_group->key.objectid + block_group->key.offset); - remove_sb_from_cache(root, block_group); block_group->cached = 1; + remove_sb_from_cache(root, block_group); ret = 0; err: btrfs_free_path(path); @@ -326,7 +325,7 @@ struct btrfs_block_group_cache *btrfs_lookup_block_group( return cache; } -static inline void put_block_group(struct btrfs_block_group_cache *cache) +void btrfs_put_block_group(struct btrfs_block_group_cache *cache) { if (atomic_dec_and_test(&cache->count)) kfree(cache); @@ -399,12 +398,12 @@ again: div_factor(cache->key.offset, factor)) { group_start = cache->key.objectid; spin_unlock(&cache->lock); - put_block_group(cache); + btrfs_put_block_group(cache); goto found; } } spin_unlock(&cache->lock); - put_block_group(cache); + btrfs_put_block_group(cache); cond_resched(); } if (!wrapped) { @@ -1594,7 +1593,7 @@ int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr) if (!block_group || block_group->ro) readonly = 1; if (block_group) - put_block_group(block_group); + btrfs_put_block_group(block_group); return readonly; } @@ -2018,7 +2017,7 @@ static int update_block_group(struct btrfs_trans_handle *trans, WARN_ON(ret); } } - put_block_group(cache); + btrfs_put_block_group(cache); total -= num_bytes; bytenr += num_bytes; } @@ -2035,7 +2034,7 @@ static u64 first_logical_byte(struct btrfs_root *root, u64 search_start) return 0; bytenr = cache->key.objectid; - put_block_group(cache); + btrfs_put_block_group(cache); return bytenr; } @@ -2047,7 +2046,6 @@ int btrfs_update_pinned_extents(struct btrfs_root *root, struct btrfs_block_group_cache *cache; struct btrfs_fs_info *fs_info = root->fs_info; - WARN_ON(!mutex_is_locked(&root->fs_info->pinned_mutex)); if (pin) { set_extent_dirty(&fs_info->pinned_extents, bytenr, bytenr + num - 1, GFP_NOFS); @@ -2055,7 +2053,6 @@ int btrfs_update_pinned_extents(struct btrfs_root *root, clear_extent_dirty(&fs_info->pinned_extents, bytenr, bytenr + num - 1, GFP_NOFS); } - mutex_unlock(&root->fs_info->pinned_mutex); while (num > 0) { cache = btrfs_lookup_block_group(fs_info, bytenr); @@ -2081,7 +2078,7 @@ int btrfs_update_pinned_extents(struct btrfs_root *root, if (cache->cached) btrfs_add_free_space(cache, bytenr, len); } - put_block_group(cache); + btrfs_put_block_group(cache); bytenr += len; num -= len; } @@ -2112,7 +2109,7 @@ static int update_reserved_extents(struct btrfs_root *root, } spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); - put_block_group(cache); + btrfs_put_block_group(cache); bytenr += len; num -= len; } @@ -2127,7 +2124,6 @@ int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy) struct extent_io_tree *pinned_extents = &root->fs_info->pinned_extents; int ret; - mutex_lock(&root->fs_info->pinned_mutex); while (1) { ret = find_first_extent_bit(pinned_extents, last, &start, &end, EXTENT_DIRTY); @@ -2136,7 +2132,6 @@ int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy) set_extent_dirty(copy, start, end, GFP_NOFS); last = end + 1; } - mutex_unlock(&root->fs_info->pinned_mutex); return 0; } @@ -2149,7 +2144,6 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, int ret; while (1) { - mutex_lock(&root->fs_info->pinned_mutex); ret = find_first_extent_bit(unpin, 0, &start, &end, EXTENT_DIRTY); if (ret) @@ -2163,7 +2157,6 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, cond_resched(); } - mutex_unlock(&root->fs_info->pinned_mutex); return ret; } @@ -2205,7 +2198,6 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans, free_extent_buffer(buf); pinit: btrfs_set_path_blocking(path); - mutex_lock(&root->fs_info->pinned_mutex); /* unlocks the pinned mutex */ btrfs_update_pinned_extents(root, bytenr, num_bytes, 1); @@ -2511,8 +2503,6 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, */ if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID && owner_objectid < BTRFS_FIRST_FREE_OBJECTID) { - mutex_lock(&root->fs_info->pinned_mutex); - /* unlocks the pinned mutex */ btrfs_update_pinned_extents(root, bytenr, num_bytes, 1); update_reserved_extents(root, bytenr, num_bytes, 0); @@ -2554,228 +2544,237 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, { int ret = 0; struct btrfs_root *root = orig_root->fs_info->extent_root; - u64 total_needed = num_bytes; - u64 *last_ptr = NULL; - u64 last_wanted = 0; + struct btrfs_free_cluster *last_ptr = NULL; struct btrfs_block_group_cache *block_group = NULL; - int chunk_alloc_done = 0; int empty_cluster = 2 * 1024 * 1024; int allowed_chunk_alloc = 0; - struct list_head *head = NULL, *cur = NULL; - int loop = 0; - int extra_loop = 0; struct btrfs_space_info *space_info; + int last_ptr_loop = 0; + int loop = 0; WARN_ON(num_bytes < root->sectorsize); btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY); ins->objectid = 0; ins->offset = 0; + space_info = __find_space_info(root->fs_info, data); + if (orig_root->ref_cows || empty_size) allowed_chunk_alloc = 1; if (data & BTRFS_BLOCK_GROUP_METADATA) { - last_ptr = &root->fs_info->last_alloc; + last_ptr = &root->fs_info->meta_alloc_cluster; if (!btrfs_test_opt(root, SSD)) empty_cluster = 64 * 1024; } - if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) - last_ptr = &root->fs_info->last_data_alloc; + if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) { + last_ptr = &root->fs_info->data_alloc_cluster; + } if (last_ptr) { - if (*last_ptr) { - hint_byte = *last_ptr; - last_wanted = *last_ptr; - } else - empty_size += empty_cluster; - } else { - empty_cluster = 0; + spin_lock(&last_ptr->lock); + if (last_ptr->block_group) + hint_byte = last_ptr->window_start; + spin_unlock(&last_ptr->lock); } + search_start = max(search_start, first_logical_byte(root, 0)); search_start = max(search_start, hint_byte); - if (last_wanted && search_start != last_wanted) { - last_wanted = 0; - empty_size += empty_cluster; + if (!last_ptr) { + empty_cluster = 0; + loop = 1; } - total_needed += empty_size; - block_group = btrfs_lookup_block_group(root->fs_info, search_start); - if (!block_group) - block_group = btrfs_lookup_first_block_group(root->fs_info, - search_start); - space_info = __find_space_info(root->fs_info, data); + if (search_start == hint_byte) { + block_group = btrfs_lookup_block_group(root->fs_info, + search_start); + if (block_group && block_group_bits(block_group, data)) { + down_read(&space_info->groups_sem); + goto have_block_group; + } else if (block_group) { + btrfs_put_block_group(block_group); + } + } +search: down_read(&space_info->groups_sem); - while (1) { - struct btrfs_free_space *free_space; - /* - * the only way this happens if our hint points to a block - * group thats not of the proper type, while looping this - * should never happen - */ - if (empty_size) - extra_loop = 1; + list_for_each_entry(block_group, &space_info->block_groups, list) { + u64 offset; - if (!block_group) - goto new_group_no_lock; + atomic_inc(&block_group->count); + search_start = block_group->key.objectid; +have_block_group: if (unlikely(!block_group->cached)) { mutex_lock(&block_group->cache_mutex); ret = cache_block_group(root, block_group); mutex_unlock(&block_group->cache_mutex); - if (ret) + if (ret) { + btrfs_put_block_group(block_group); break; + } } - mutex_lock(&block_group->alloc_mutex); - if (unlikely(!block_group_bits(block_group, data))) - goto new_group; - if (unlikely(block_group->ro)) - goto new_group; + goto loop; - free_space = btrfs_find_free_space(block_group, search_start, - total_needed); - if (free_space) { - u64 start = block_group->key.objectid; - u64 end = block_group->key.objectid + - block_group->key.offset; + if (last_ptr) { + /* + * the refill lock keeps out other + * people trying to start a new cluster + */ + spin_lock(&last_ptr->refill_lock); + offset = btrfs_alloc_from_cluster(block_group, last_ptr, + num_bytes, search_start); + if (offset) { + /* we have a block, we're done */ + spin_unlock(&last_ptr->refill_lock); + goto checks; + } - search_start = stripe_align(root, free_space->offset); + spin_lock(&last_ptr->lock); + /* + * whoops, this cluster doesn't actually point to + * this block group. Get a ref on the block + * group is does point to and try again + */ + if (!last_ptr_loop && last_ptr->block_group && + last_ptr->block_group != block_group) { + + btrfs_put_block_group(block_group); + block_group = last_ptr->block_group; + atomic_inc(&block_group->count); + spin_unlock(&last_ptr->lock); + spin_unlock(&last_ptr->refill_lock); + + last_ptr_loop = 1; + search_start = block_group->key.objectid; + goto have_block_group; + } + spin_unlock(&last_ptr->lock); - /* move on to the next group */ - if (search_start + num_bytes >= search_end) - goto new_group; + /* + * this cluster didn't work out, free it and + * start over + */ + btrfs_return_cluster_to_free_space(NULL, last_ptr); - /* move on to the next group */ - if (search_start + num_bytes > end) - goto new_group; + last_ptr_loop = 0; - if (last_wanted && search_start != last_wanted) { - total_needed += empty_cluster; - empty_size += empty_cluster; - last_wanted = 0; + /* allocate a cluster in this block group */ + ret = btrfs_find_space_cluster(trans, + block_group, last_ptr, + offset, num_bytes, + empty_cluster + empty_size); + if (ret == 0) { /* - * if search_start is still in this block group - * then we just re-search this block group + * now pull our allocation out of this + * cluster */ - if (search_start >= start && - search_start < end) { - mutex_unlock(&block_group->alloc_mutex); - continue; + offset = btrfs_alloc_from_cluster(block_group, + last_ptr, num_bytes, + search_start); + if (offset) { + /* we found one, proceed */ + spin_unlock(&last_ptr->refill_lock); + goto checks; } - - /* else we go to the next block group */ - goto new_group; } - - if (exclude_nr > 0 && - (search_start + num_bytes > exclude_start && - search_start < exclude_start + exclude_nr)) { - search_start = exclude_start + exclude_nr; - /* - * if search_start is still in this block group - * then we just re-search this block group - */ - if (search_start >= start && - search_start < end) { - mutex_unlock(&block_group->alloc_mutex); - last_wanted = 0; - continue; - } - - /* else we go to the next block group */ - goto new_group; + /* + * at this point we either didn't find a cluster + * or we weren't able to allocate a block from our + * cluster. Free the cluster we've been trying + * to use, and go to the next block group + */ + if (loop < 2) { + btrfs_return_cluster_to_free_space(NULL, + last_ptr); + spin_unlock(&last_ptr->refill_lock); + goto loop; } + spin_unlock(&last_ptr->refill_lock); + } - ins->objectid = search_start; - ins->offset = num_bytes; + offset = btrfs_find_space_for_alloc(block_group, search_start, + num_bytes, empty_size); + if (!offset) + goto loop; +checks: + search_start = stripe_align(root, offset); + + /* move on to the next group */ + if (search_start + num_bytes >= search_end) { + btrfs_add_free_space(block_group, offset, num_bytes); + goto loop; + } - btrfs_remove_free_space_lock(block_group, search_start, - num_bytes); - /* we are all good, lets return */ - mutex_unlock(&block_group->alloc_mutex); - break; + /* move on to the next group */ + if (search_start + num_bytes > + block_group->key.objectid + block_group->key.offset) { + btrfs_add_free_space(block_group, offset, num_bytes); + goto loop; } -new_group: - mutex_unlock(&block_group->alloc_mutex); - put_block_group(block_group); - block_group = NULL; -new_group_no_lock: - /* don't try to compare new allocations against the - * last allocation any more - */ - last_wanted = 0; - /* - * Here's how this works. - * loop == 0: we were searching a block group via a hint - * and didn't find anything, so we start at - * the head of the block groups and keep searching - * loop == 1: we're searching through all of the block groups - * if we hit the head again we have searched - * all of the block groups for this space and we - * need to try and allocate, if we cant error out. - * loop == 2: we allocated more space and are looping through - * all of the block groups again. - */ - if (loop == 0) { - head = &space_info->block_groups; - cur = head->next; - loop++; - } else if (loop == 1 && cur == head) { - int keep_going; - - /* at this point we give up on the empty_size - * allocations and just try to allocate the min - * space. - * - * The extra_loop field was set if an empty_size - * allocation was attempted above, and if this - * is try we need to try the loop again without - * the additional empty_size. + if (exclude_nr > 0 && + (search_start + num_bytes > exclude_start && + search_start < exclude_start + exclude_nr)) { + search_start = exclude_start + exclude_nr; + + btrfs_add_free_space(block_group, offset, num_bytes); + /* + * if search_start is still in this block group + * then we just re-search this block group */ - total_needed -= empty_size; - empty_size = 0; - keep_going = extra_loop; - loop++; + if (search_start >= block_group->key.objectid && + search_start < (block_group->key.objectid + + block_group->key.offset)) + goto have_block_group; + goto loop; + } - if (allowed_chunk_alloc && !chunk_alloc_done) { - up_read(&space_info->groups_sem); - ret = do_chunk_alloc(trans, root, num_bytes + - 2 * 1024 * 1024, data, 1); - down_read(&space_info->groups_sem); - if (ret < 0) - goto loop_check; - head = &space_info->block_groups; - /* - * we've allocated a new chunk, keep - * trying - */ - keep_going = 1; - chunk_alloc_done = 1; - } else if (!allowed_chunk_alloc) { - space_info->force_alloc = 1; - } -loop_check: - if (keep_going) { - cur = head->next; - extra_loop = 0; - } else { - break; - } - } else if (cur == head) { - break; + ins->objectid = search_start; + ins->offset = num_bytes; + + if (offset < search_start) + btrfs_add_free_space(block_group, offset, + search_start - offset); + BUG_ON(offset > search_start); + + /* we are all good, lets return */ + break; +loop: + btrfs_put_block_group(block_group); + } + up_read(&space_info->groups_sem); + + /* loop == 0, try to find a clustered alloc in every block group + * loop == 1, try again after forcing a chunk allocation + * loop == 2, set empty_size and empty_cluster to 0 and try again + */ + if (!ins->objectid && loop < 3 && + (empty_size || empty_cluster || allowed_chunk_alloc)) { + if (loop >= 2) { + empty_size = 0; + empty_cluster = 0; } - block_group = list_entry(cur, struct btrfs_block_group_cache, - list); - atomic_inc(&block_group->count); + if (allowed_chunk_alloc) { + ret = do_chunk_alloc(trans, root, num_bytes + + 2 * 1024 * 1024, data, 1); + allowed_chunk_alloc = 0; + } else { + space_info->force_alloc = 1; + } - search_start = block_group->key.objectid; - cur = cur->next; + if (loop < 3) { + loop++; + goto search; + } + ret = -ENOSPC; + } else if (!ins->objectid) { + ret = -ENOSPC; } /* we found what we needed */ @@ -2783,21 +2782,10 @@ loop_check: if (!(data & BTRFS_BLOCK_GROUP_DATA)) trans->block_group = block_group->key.objectid; - if (last_ptr) - *last_ptr = ins->objectid + ins->offset; + btrfs_put_block_group(block_group); ret = 0; - } else if (!ret) { - printk(KERN_ERR "btrfs searching for %llu bytes, " - "num_bytes %llu, loop %d, allowed_alloc %d\n", - (unsigned long long)total_needed, - (unsigned long long)num_bytes, - loop, allowed_chunk_alloc); - ret = -ENOSPC; } - if (block_group) - put_block_group(block_group); - up_read(&space_info->groups_sem); return ret; } @@ -2902,7 +2890,7 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len) ret = btrfs_discard_extent(root, start, len); btrfs_add_free_space(cache, start, len); - put_block_group(cache); + btrfs_put_block_group(cache); update_reserved_extents(root, start, len, 0); return ret; @@ -3040,7 +3028,7 @@ int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans, ret = btrfs_remove_free_space(block_group, ins->objectid, ins->offset); BUG_ON(ret); - put_block_group(block_group); + btrfs_put_block_group(block_group); ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid, ref_generation, owner, ins, 1); return ret; @@ -5729,7 +5717,7 @@ next: WARN_ON(block_group->reserved > 0); WARN_ON(btrfs_block_group_used(&block_group->item) > 0); spin_unlock(&block_group->lock); - put_block_group(block_group); + btrfs_put_block_group(block_group); ret = 0; out: btrfs_free_path(path); @@ -5856,9 +5844,10 @@ int btrfs_read_block_groups(struct btrfs_root *root) atomic_set(&cache->count, 1); spin_lock_init(&cache->lock); - mutex_init(&cache->alloc_mutex); + spin_lock_init(&cache->tree_lock); mutex_init(&cache->cache_mutex); INIT_LIST_HEAD(&cache->list); + INIT_LIST_HEAD(&cache->cluster_list); read_extent_buffer(leaf, &cache->item, btrfs_item_ptr_offset(leaf, path->slots[0]), sizeof(cache->item)); @@ -5912,9 +5901,10 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; atomic_set(&cache->count, 1); spin_lock_init(&cache->lock); - mutex_init(&cache->alloc_mutex); + spin_lock_init(&cache->tree_lock); mutex_init(&cache->cache_mutex); INIT_LIST_HEAD(&cache->list); + INIT_LIST_HEAD(&cache->cluster_list); btrfs_set_block_group_used(&cache->item, bytes_used); btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid); @@ -5974,8 +5964,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, spin_unlock(&block_group->space_info->lock); block_group->space_info->full = 0; - put_block_group(block_group); - put_block_group(block_group); + btrfs_put_block_group(block_group); + btrfs_put_block_group(block_group); ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret > 0) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 08085af089e2..eb2bee8b7fbf 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2884,25 +2884,19 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, disko = 0; flags = 0; - switch (em->block_start) { - case EXTENT_MAP_LAST_BYTE: + if (em->block_start == EXTENT_MAP_LAST_BYTE) { end = 1; flags |= FIEMAP_EXTENT_LAST; - break; - case EXTENT_MAP_HOLE: + } else if (em->block_start == EXTENT_MAP_HOLE) { flags |= FIEMAP_EXTENT_UNWRITTEN; - break; - case EXTENT_MAP_INLINE: + } else if (em->block_start == EXTENT_MAP_INLINE) { flags |= (FIEMAP_EXTENT_DATA_INLINE | FIEMAP_EXTENT_NOT_ALIGNED); - break; - case EXTENT_MAP_DELALLOC: + } else if (em->block_start == EXTENT_MAP_DELALLOC) { flags |= (FIEMAP_EXTENT_DELALLOC | FIEMAP_EXTENT_UNKNOWN); - break; - default: + } else { disko = em->block_start; - break; } if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) flags |= FIEMAP_EXTENT_ENCODED; diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 50da69da20ce..b187917b36fa 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -234,7 +234,6 @@ int add_extent_mapping(struct extent_map_tree *tree, rb = tree_insert(&tree->map, em->start, &em->rb_node); if (rb) { ret = -EEXIST; - free_extent_map(merge); goto out; } atomic_inc(&em->refs); diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index d1e5f0e84c58..768b9523662d 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -18,6 +18,15 @@ #include <linux/sched.h> #include "ctree.h" +#include "free-space-cache.h" +#include "transaction.h" + +struct btrfs_free_space { + struct rb_node bytes_index; + struct rb_node offset_index; + u64 offset; + u64 bytes; +}; static int tree_insert_offset(struct rb_root *root, u64 offset, struct rb_node *node) @@ -68,14 +77,24 @@ static int tree_insert_bytes(struct rb_root *root, u64 bytes, } /* - * searches the tree for the given offset. If contains is set we will return - * the free space that contains the given offset. If contains is not set we - * will return the free space that starts at or after the given offset and is - * at least bytes long. + * searches the tree for the given offset. + * + * fuzzy == 1: this is used for allocations where we are given a hint of where + * to look for free space. Because the hint may not be completely on an offset + * mark, or the hint may no longer point to free space we need to fudge our + * results a bit. So we look for free space starting at or after offset with at + * least bytes size. We prefer to find as close to the given offset as we can. + * Also if the offset is within a free space range, then we will return the free + * space that contains the given offset, which means we can return a free space + * chunk with an offset before the provided offset. + * + * fuzzy == 0: this is just a normal tree search. Give us the free space that + * starts at the given offset which is at least bytes size, and if its not there + * return NULL. */ static struct btrfs_free_space *tree_search_offset(struct rb_root *root, u64 offset, u64 bytes, - int contains) + int fuzzy) { struct rb_node *n = root->rb_node; struct btrfs_free_space *entry, *ret = NULL; @@ -84,13 +103,14 @@ static struct btrfs_free_space *tree_search_offset(struct rb_root *root, entry = rb_entry(n, struct btrfs_free_space, offset_index); if (offset < entry->offset) { - if (!contains && + if (fuzzy && (!ret || entry->offset < ret->offset) && (bytes <= entry->bytes)) ret = entry; n = n->rb_left; } else if (offset > entry->offset) { - if ((entry->offset + entry->bytes - 1) >= offset && + if (fuzzy && + (entry->offset + entry->bytes - 1) >= offset && bytes <= entry->bytes) { ret = entry; break; @@ -171,6 +191,7 @@ static int link_free_space(struct btrfs_block_group_cache *block_group, int ret = 0; + BUG_ON(!info->bytes); ret = tree_insert_offset(&block_group->free_space_offset, info->offset, &info->offset_index); if (ret) @@ -184,108 +205,70 @@ static int link_free_space(struct btrfs_block_group_cache *block_group, return ret; } -static int __btrfs_add_free_space(struct btrfs_block_group_cache *block_group, - u64 offset, u64 bytes) +int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, + u64 offset, u64 bytes) { struct btrfs_free_space *right_info; struct btrfs_free_space *left_info; struct btrfs_free_space *info = NULL; - struct btrfs_free_space *alloc_info; int ret = 0; - alloc_info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS); - if (!alloc_info) + info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS); + if (!info) return -ENOMEM; + info->offset = offset; + info->bytes = bytes; + + spin_lock(&block_group->tree_lock); + /* * first we want to see if there is free space adjacent to the range we * are adding, if there is remove that struct and add a new one to * cover the entire range */ right_info = tree_search_offset(&block_group->free_space_offset, - offset+bytes, 0, 1); + offset+bytes, 0, 0); left_info = tree_search_offset(&block_group->free_space_offset, offset-1, 0, 1); - if (right_info && right_info->offset == offset+bytes) { + if (right_info) { unlink_free_space(block_group, right_info); - info = right_info; - info->offset = offset; - info->bytes += bytes; - } else if (right_info && right_info->offset != offset+bytes) { - printk(KERN_ERR "btrfs adding space in the middle of an " - "existing free space area. existing: " - "offset=%llu, bytes=%llu. new: offset=%llu, " - "bytes=%llu\n", (unsigned long long)right_info->offset, - (unsigned long long)right_info->bytes, - (unsigned long long)offset, - (unsigned long long)bytes); - BUG(); + info->bytes += right_info->bytes; + kfree(right_info); } - if (left_info) { + if (left_info && left_info->offset + left_info->bytes == offset) { unlink_free_space(block_group, left_info); - - if (unlikely((left_info->offset + left_info->bytes) != - offset)) { - printk(KERN_ERR "btrfs free space to the left " - "of new free space isn't " - "quite right. existing: offset=%llu, " - "bytes=%llu. new: offset=%llu, bytes=%llu\n", - (unsigned long long)left_info->offset, - (unsigned long long)left_info->bytes, - (unsigned long long)offset, - (unsigned long long)bytes); - BUG(); - } - - if (info) { - info->offset = left_info->offset; - info->bytes += left_info->bytes; - kfree(left_info); - } else { - info = left_info; - info->bytes += bytes; - } + info->offset = left_info->offset; + info->bytes += left_info->bytes; + kfree(left_info); } - if (info) { - ret = link_free_space(block_group, info); - if (!ret) - info = NULL; - goto out; - } - - info = alloc_info; - alloc_info = NULL; - info->offset = offset; - info->bytes = bytes; - ret = link_free_space(block_group, info); if (ret) kfree(info); -out: + + spin_unlock(&block_group->tree_lock); + if (ret) { printk(KERN_ERR "btrfs: unable to add free space :%d\n", ret); - if (ret == -EEXIST) - BUG(); + BUG_ON(ret == -EEXIST); } - kfree(alloc_info); - return ret; } -static int -__btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, - u64 offset, u64 bytes) +int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, + u64 offset, u64 bytes) { struct btrfs_free_space *info; int ret = 0; + spin_lock(&block_group->tree_lock); + info = tree_search_offset(&block_group->free_space_offset, offset, 0, 1); - if (info && info->offset == offset) { if (info->bytes < bytes) { printk(KERN_ERR "Found free space at %llu, size %llu," @@ -295,12 +278,14 @@ __btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, (unsigned long long)bytes); WARN_ON(1); ret = -EINVAL; + spin_unlock(&block_group->tree_lock); goto out; } unlink_free_space(block_group, info); if (info->bytes == bytes) { kfree(info); + spin_unlock(&block_group->tree_lock); goto out; } @@ -308,6 +293,7 @@ __btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, info->bytes -= bytes; ret = link_free_space(block_group, info); + spin_unlock(&block_group->tree_lock); BUG_ON(ret); } else if (info && info->offset < offset && info->offset + info->bytes >= offset + bytes) { @@ -333,70 +319,33 @@ __btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, */ kfree(info); } - + spin_unlock(&block_group->tree_lock); /* step two, insert a new info struct to cover anything * before the hole */ - ret = __btrfs_add_free_space(block_group, old_start, - offset - old_start); + ret = btrfs_add_free_space(block_group, old_start, + offset - old_start); BUG_ON(ret); } else { + spin_unlock(&block_group->tree_lock); + if (!info) { + printk(KERN_ERR "couldn't find space %llu to free\n", + (unsigned long long)offset); + printk(KERN_ERR "cached is %d, offset %llu bytes %llu\n", + block_group->cached, block_group->key.objectid, + block_group->key.offset); + btrfs_dump_free_space(block_group, bytes); + } else if (info) { + printk(KERN_ERR "hmm, found offset=%llu bytes=%llu, " + "but wanted offset=%llu bytes=%llu\n", + info->offset, info->bytes, offset, bytes); + } WARN_ON(1); } out: return ret; } -int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, - u64 offset, u64 bytes) -{ - int ret; - struct btrfs_free_space *sp; - - mutex_lock(&block_group->alloc_mutex); - ret = __btrfs_add_free_space(block_group, offset, bytes); - sp = tree_search_offset(&block_group->free_space_offset, offset, 0, 1); - BUG_ON(!sp); - mutex_unlock(&block_group->alloc_mutex); - - return ret; -} - -int btrfs_add_free_space_lock(struct btrfs_block_group_cache *block_group, - u64 offset, u64 bytes) -{ - int ret; - struct btrfs_free_space *sp; - - ret = __btrfs_add_free_space(block_group, offset, bytes); - sp = tree_search_offset(&block_group->free_space_offset, offset, 0, 1); - BUG_ON(!sp); - - return ret; -} - -int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, - u64 offset, u64 bytes) -{ - int ret = 0; - - mutex_lock(&block_group->alloc_mutex); - ret = __btrfs_remove_free_space(block_group, offset, bytes); - mutex_unlock(&block_group->alloc_mutex); - - return ret; -} - -int btrfs_remove_free_space_lock(struct btrfs_block_group_cache *block_group, - u64 offset, u64 bytes) -{ - int ret; - - ret = __btrfs_remove_free_space(block_group, offset, bytes); - - return ret; -} - void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, u64 bytes) { @@ -408,6 +357,8 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, info = rb_entry(n, struct btrfs_free_space, offset_index); if (info->bytes >= bytes) count++; + printk(KERN_ERR "entry offset %llu, bytes %llu\n", info->offset, + info->bytes); } printk(KERN_INFO "%d blocks of free space at or bigger than bytes is" "\n", count); @@ -428,68 +379,337 @@ u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group) return ret; } +/* + * for a given cluster, put all of its extents back into the free + * space cache. If the block group passed doesn't match the block group + * pointed to by the cluster, someone else raced in and freed the + * cluster already. In that case, we just return without changing anything + */ +static int +__btrfs_return_cluster_to_free_space( + struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster) +{ + struct btrfs_free_space *entry; + struct rb_node *node; + + spin_lock(&cluster->lock); + if (cluster->block_group != block_group) + goto out; + + cluster->window_start = 0; + node = rb_first(&cluster->root); + while(node) { + entry = rb_entry(node, struct btrfs_free_space, offset_index); + node = rb_next(&entry->offset_index); + rb_erase(&entry->offset_index, &cluster->root); + link_free_space(block_group, entry); + } + list_del_init(&cluster->block_group_list); + + btrfs_put_block_group(cluster->block_group); + cluster->block_group = NULL; + cluster->root.rb_node = NULL; +out: + spin_unlock(&cluster->lock); + return 0; +} + void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group) { struct btrfs_free_space *info; struct rb_node *node; + struct btrfs_free_cluster *cluster; + struct btrfs_free_cluster *safe; + + spin_lock(&block_group->tree_lock); + + list_for_each_entry_safe(cluster, safe, &block_group->cluster_list, + block_group_list) { + + WARN_ON(cluster->block_group != block_group); + __btrfs_return_cluster_to_free_space(block_group, cluster); + } - mutex_lock(&block_group->alloc_mutex); while ((node = rb_last(&block_group->free_space_bytes)) != NULL) { info = rb_entry(node, struct btrfs_free_space, bytes_index); unlink_free_space(block_group, info); kfree(info); if (need_resched()) { - mutex_unlock(&block_group->alloc_mutex); + spin_unlock(&block_group->tree_lock); cond_resched(); - mutex_lock(&block_group->alloc_mutex); + spin_lock(&block_group->tree_lock); } } - mutex_unlock(&block_group->alloc_mutex); + spin_unlock(&block_group->tree_lock); } -#if 0 -static struct btrfs_free_space *btrfs_find_free_space_offset(struct - btrfs_block_group_cache - *block_group, u64 offset, - u64 bytes) +u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group, + u64 offset, u64 bytes, u64 empty_size) { - struct btrfs_free_space *ret; + struct btrfs_free_space *entry = NULL; + u64 ret = 0; - mutex_lock(&block_group->alloc_mutex); - ret = tree_search_offset(&block_group->free_space_offset, offset, - bytes, 0); - mutex_unlock(&block_group->alloc_mutex); + spin_lock(&block_group->tree_lock); + entry = tree_search_offset(&block_group->free_space_offset, offset, + bytes + empty_size, 1); + if (!entry) + entry = tree_search_bytes(&block_group->free_space_bytes, + offset, bytes + empty_size); + if (entry) { + unlink_free_space(block_group, entry); + ret = entry->offset; + entry->offset += bytes; + entry->bytes -= bytes; + + if (!entry->bytes) + kfree(entry); + else + link_free_space(block_group, entry); + } + spin_unlock(&block_group->tree_lock); return ret; } -static struct btrfs_free_space *btrfs_find_free_space_bytes(struct - btrfs_block_group_cache - *block_group, u64 offset, - u64 bytes) +/* + * given a cluster, put all of its extents back into the free space + * cache. If a block group is passed, this function will only free + * a cluster that belongs to the passed block group. + * + * Otherwise, it'll get a reference on the block group pointed to by the + * cluster and remove the cluster from it. + */ +int btrfs_return_cluster_to_free_space( + struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster) { - struct btrfs_free_space *ret; + int ret; - mutex_lock(&block_group->alloc_mutex); + /* first, get a safe pointer to the block group */ + spin_lock(&cluster->lock); + if (!block_group) { + block_group = cluster->block_group; + if (!block_group) { + spin_unlock(&cluster->lock); + return 0; + } + } else if (cluster->block_group != block_group) { + /* someone else has already freed it don't redo their work */ + spin_unlock(&cluster->lock); + return 0; + } + atomic_inc(&block_group->count); + spin_unlock(&cluster->lock); - ret = tree_search_bytes(&block_group->free_space_bytes, offset, bytes); - mutex_unlock(&block_group->alloc_mutex); + /* now return any extents the cluster had on it */ + spin_lock(&block_group->tree_lock); + ret = __btrfs_return_cluster_to_free_space(block_group, cluster); + spin_unlock(&block_group->tree_lock); + /* finally drop our ref */ + btrfs_put_block_group(block_group); return ret; } -#endif -struct btrfs_free_space *btrfs_find_free_space(struct btrfs_block_group_cache - *block_group, u64 offset, - u64 bytes) +/* + * given a cluster, try to allocate 'bytes' from it, returns 0 + * if it couldn't find anything suitably large, or a logical disk offset + * if things worked out + */ +u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster, u64 bytes, + u64 min_start) +{ + struct btrfs_free_space *entry = NULL; + struct rb_node *node; + u64 ret = 0; + + spin_lock(&cluster->lock); + if (bytes > cluster->max_size) + goto out; + + if (cluster->block_group != block_group) + goto out; + + node = rb_first(&cluster->root); + if (!node) + goto out; + + entry = rb_entry(node, struct btrfs_free_space, offset_index); + + while(1) { + if (entry->bytes < bytes || entry->offset < min_start) { + struct rb_node *node; + + node = rb_next(&entry->offset_index); + if (!node) + break; + entry = rb_entry(node, struct btrfs_free_space, + offset_index); + continue; + } + ret = entry->offset; + + entry->offset += bytes; + entry->bytes -= bytes; + + if (entry->bytes == 0) { + rb_erase(&entry->offset_index, &cluster->root); + kfree(entry); + } + break; + } +out: + spin_unlock(&cluster->lock); + return ret; +} + +/* + * here we try to find a cluster of blocks in a block group. The goal + * is to find at least bytes free and up to empty_size + bytes free. + * We might not find them all in one contiguous area. + * + * returns zero and sets up cluster if things worked out, otherwise + * it returns -enospc + */ +int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, + struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster, + u64 offset, u64 bytes, u64 empty_size) { - struct btrfs_free_space *ret = NULL; + struct btrfs_free_space *entry = NULL; + struct rb_node *node; + struct btrfs_free_space *next; + struct btrfs_free_space *last; + u64 min_bytes; + u64 window_start; + u64 window_free; + u64 max_extent = 0; + int total_retries = 0; + int ret; + + /* for metadata, allow allocates with more holes */ + if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) { + /* + * we want to do larger allocations when we are + * flushing out the delayed refs, it helps prevent + * making more work as we go along. + */ + if (trans->transaction->delayed_refs.flushing) + min_bytes = max(bytes, (bytes + empty_size) >> 1); + else + min_bytes = max(bytes, (bytes + empty_size) >> 4); + } else + min_bytes = max(bytes, (bytes + empty_size) >> 2); + + spin_lock(&block_group->tree_lock); + spin_lock(&cluster->lock); + + /* someone already found a cluster, hooray */ + if (cluster->block_group) { + ret = 0; + goto out; + } +again: + min_bytes = min(min_bytes, bytes + empty_size); + entry = tree_search_bytes(&block_group->free_space_bytes, + offset, min_bytes); + if (!entry) { + ret = -ENOSPC; + goto out; + } + window_start = entry->offset; + window_free = entry->bytes; + last = entry; + max_extent = entry->bytes; + + while(1) { + /* out window is just right, lets fill it */ + if (window_free >= bytes + empty_size) + break; - ret = tree_search_offset(&block_group->free_space_offset, offset, - bytes, 0); - if (!ret) - ret = tree_search_bytes(&block_group->free_space_bytes, - offset, bytes); + node = rb_next(&last->offset_index); + if (!node) { + ret = -ENOSPC; + goto out; + } + next = rb_entry(node, struct btrfs_free_space, offset_index); + + /* + * we haven't filled the empty size and the window is + * very large. reset and try again + */ + if (next->offset - window_start > (bytes + empty_size) * 2) { + entry = next; + window_start = entry->offset; + window_free = entry->bytes; + last = entry; + max_extent = 0; + total_retries++; + if (total_retries % 256 == 0) { + if (min_bytes >= (bytes + empty_size)) { + ret = -ENOSPC; + goto out; + } + /* + * grow our allocation a bit, we're not having + * much luck + */ + min_bytes *= 2; + goto again; + } + } else { + last = next; + window_free += next->bytes; + if (entry->bytes > max_extent) + max_extent = entry->bytes; + } + } + + cluster->window_start = entry->offset; + + /* + * now we've found our entries, pull them out of the free space + * cache and put them into the cluster rbtree + * + * The cluster includes an rbtree, but only uses the offset index + * of each free space cache entry. + */ + while(1) { + node = rb_next(&entry->offset_index); + unlink_free_space(block_group, entry); + ret = tree_insert_offset(&cluster->root, entry->offset, + &entry->offset_index); + BUG_ON(ret); + + if (!node || entry == last) + break; + + entry = rb_entry(node, struct btrfs_free_space, offset_index); + } + ret = 0; + cluster->max_size = max_extent; + atomic_inc(&block_group->count); + list_add_tail(&cluster->block_group_list, &block_group->cluster_list); + cluster->block_group = block_group; +out: + spin_unlock(&cluster->lock); + spin_unlock(&block_group->tree_lock); return ret; } + +/* + * simple code to zero out a cluster + */ +void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster) +{ + spin_lock_init(&cluster->lock); + spin_lock_init(&cluster->refill_lock); + cluster->root.rb_node = NULL; + cluster->max_size = 0; + INIT_LIST_HEAD(&cluster->block_group_list); + cluster->block_group = NULL; +} + diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h new file mode 100644 index 000000000000..ab0bdc0a63ce --- /dev/null +++ b/fs/btrfs/free-space-cache.h @@ -0,0 +1,44 @@ +/* + * Copyright (C) 2009 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_FREE_SPACE_CACHE +#define __BTRFS_FREE_SPACE_CACHE + +int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, + u64 bytenr, u64 size); +int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, + u64 bytenr, u64 size); +void btrfs_remove_free_space_cache(struct btrfs_block_group_cache + *block_group); +u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group, + u64 offset, u64 bytes, u64 empty_size); +void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, + u64 bytes); +u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group); +int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, + struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster, + u64 offset, u64 bytes, u64 empty_size); +void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster); +u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster, u64 bytes, + u64 min_start); +int btrfs_return_cluster_to_free_space( + struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster); +#endif diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 06d8db5afb08..a0d1dd492a58 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3481,8 +3481,10 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, if (dir) { ret = btrfs_set_inode_index(dir, index); - if (ret) + if (ret) { + iput(inode); return ERR_PTR(ret); + } } /* * index_cnt is ignored for everything but a dir, @@ -3565,6 +3567,7 @@ fail: if (dir) BTRFS_I(dir)->index_cnt--; btrfs_free_path(path); + iput(inode); return ERR_PTR(ret); } diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index bca729fc80c8..7594bec1be10 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -267,7 +267,7 @@ static noinline int btrfs_mksubvol(struct path *parent, char *name, goto out_dput; if (!IS_POSIXACL(parent->dentry->d_inode)) - mode &= ~current->fs->umask; + mode &= ~current_umask(); error = mnt_want_write(parent->mnt); if (error) diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index a5310c0f41e2..1c36e5cd8f55 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -60,8 +60,8 @@ void btrfs_clear_lock_blocking(struct extent_buffer *eb) /* * unfortunately, many of the places that currently set a lock to blocking - * don't end up blocking for every long, and often they don't block - * at all. For a dbench 50 run, if we don't spin one the blocking bit + * don't end up blocking for very long, and often they don't block + * at all. For a dbench 50 run, if we don't spin on the blocking bit * at all, the context switch rate can jump up to 400,000/sec or more. * * So, we're still stuck with this crummy spin on the blocking bit, diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 19a4daf03ccb..9744af9d71e9 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -24,6 +24,7 @@ #include <linux/highmem.h> #include <linux/time.h> #include <linux/init.h> +#include <linux/seq_file.h> #include <linux/string.h> #include <linux/smp_lock.h> #include <linux/backing-dev.h> @@ -66,7 +67,8 @@ static void btrfs_put_super(struct super_block *sb) enum { Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow, Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, - Opt_ssd, Opt_thread_pool, Opt_noacl, Opt_compress, Opt_err, + Opt_ssd, Opt_thread_pool, Opt_noacl, Opt_compress, Opt_notreelog, + Opt_flushoncommit, Opt_err, }; static match_table_t tokens = { @@ -83,6 +85,8 @@ static match_table_t tokens = { {Opt_compress, "compress"}, {Opt_ssd, "ssd"}, {Opt_noacl, "noacl"}, + {Opt_notreelog, "notreelog"}, + {Opt_flushoncommit, "flushoncommit"}, {Opt_err, NULL}, }; @@ -222,6 +226,14 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) case Opt_noacl: root->fs_info->sb->s_flags &= ~MS_POSIXACL; break; + case Opt_notreelog: + printk(KERN_INFO "btrfs: disabling tree log\n"); + btrfs_set_opt(info->mount_opt, NOTREELOG); + break; + case Opt_flushoncommit: + printk(KERN_INFO "btrfs: turning on flush-on-commit\n"); + btrfs_set_opt(info->mount_opt, FLUSHONCOMMIT); + break; default: break; } @@ -363,9 +375,8 @@ fail_close: int btrfs_sync_fs(struct super_block *sb, int wait) { struct btrfs_trans_handle *trans; - struct btrfs_root *root; + struct btrfs_root *root = btrfs_sb(sb); int ret; - root = btrfs_sb(sb); if (sb->s_flags & MS_RDONLY) return 0; @@ -385,6 +396,41 @@ int btrfs_sync_fs(struct super_block *sb, int wait) return ret; } +static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs) +{ + struct btrfs_root *root = btrfs_sb(vfs->mnt_sb); + struct btrfs_fs_info *info = root->fs_info; + + if (btrfs_test_opt(root, DEGRADED)) + seq_puts(seq, ",degraded"); + if (btrfs_test_opt(root, NODATASUM)) + seq_puts(seq, ",nodatasum"); + if (btrfs_test_opt(root, NODATACOW)) + seq_puts(seq, ",nodatacow"); + if (btrfs_test_opt(root, NOBARRIER)) + seq_puts(seq, ",nobarrier"); + if (info->max_extent != (u64)-1) + seq_printf(seq, ",max_extent=%llu", info->max_extent); + if (info->max_inline != 8192 * 1024) + seq_printf(seq, ",max_inline=%llu", info->max_inline); + if (info->alloc_start != 0) + seq_printf(seq, ",alloc_start=%llu", info->alloc_start); + if (info->thread_pool_size != min_t(unsigned long, + num_online_cpus() + 2, 8)) + seq_printf(seq, ",thread_pool=%d", info->thread_pool_size); + if (btrfs_test_opt(root, COMPRESS)) + seq_puts(seq, ",compress"); + if (btrfs_test_opt(root, SSD)) + seq_puts(seq, ",ssd"); + if (btrfs_test_opt(root, NOTREELOG)) + seq_puts(seq, ",no-treelog"); + if (btrfs_test_opt(root, FLUSHONCOMMIT)) + seq_puts(seq, ",flush-on-commit"); + if (!(root->fs_info->sb->s_flags & MS_POSIXACL)) + seq_puts(seq, ",noacl"); + return 0; +} + static void btrfs_write_super(struct super_block *sb) { sb->s_dirt = 0; @@ -630,7 +676,7 @@ static struct super_operations btrfs_super_ops = { .put_super = btrfs_put_super, .write_super = btrfs_write_super, .sync_fs = btrfs_sync_fs, - .show_options = generic_show_options, + .show_options = btrfs_show_options, .write_inode = btrfs_write_inode, .dirty_inode = btrfs_dirty_inode, .alloc_inode = btrfs_alloc_inode, diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 664782c6a2df..2869b3361eb6 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -53,8 +53,6 @@ static noinline int join_transaction(struct btrfs_root *root) GFP_NOFS); BUG_ON(!cur_trans); root->fs_info->generation++; - root->fs_info->last_alloc = 0; - root->fs_info->last_data_alloc = 0; cur_trans->num_writers = 1; cur_trans->num_joined = 0; cur_trans->transid = root->fs_info->generation; @@ -974,6 +972,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, int ret; int should_grow = 0; unsigned long now = get_seconds(); + int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT); btrfs_run_ordered_operations(root, 0); @@ -1053,7 +1052,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, mutex_unlock(&root->fs_info->trans_mutex); - if (snap_pending) { + if (flush_on_commit || snap_pending) { + if (flush_on_commit) + btrfs_start_delalloc_inodes(root); ret = btrfs_wait_ordered_extents(root, 1); BUG_ON(ret); } diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index fc9b87a7975b..25f20ea11f27 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -262,11 +262,9 @@ static int process_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, struct walk_control *wc, u64 gen) { - if (wc->pin) { - mutex_lock(&log->fs_info->pinned_mutex); + if (wc->pin) btrfs_update_pinned_extents(log->fs_info->extent_root, eb->start, eb->len, 1); - } if (btrfs_buffer_uptodate(eb, gen)) { if (wc->write) @@ -1224,8 +1222,7 @@ insert: ret = insert_one_name(trans, root, path, key->objectid, key->offset, name, name_len, log_type, &log_key); - if (ret && ret != -ENOENT) - BUG(); + BUG_ON(ret && ret != -ENOENT); goto out; } @@ -2900,6 +2897,11 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, sb = inode->i_sb; + if (btrfs_test_opt(root, NOTREELOG)) { + ret = 1; + goto end_no_trans; + } + if (root->fs_info->last_trans_log_full_commit > root->fs_info->last_trans_committed) { ret = 1; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index dd06e18e5aac..e0913e469728 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -20,6 +20,7 @@ #include <linux/buffer_head.h> #include <linux/blkdev.h> #include <linux/random.h> +#include <linux/iocontext.h> #include <asm/div64.h> #include "compat.h" #include "ctree.h" @@ -145,8 +146,9 @@ static noinline int run_scheduled_bios(struct btrfs_device *device) int again = 0; unsigned long num_run = 0; unsigned long limit; + unsigned long last_waited = 0; - bdi = device->bdev->bd_inode->i_mapping->backing_dev_info; + bdi = blk_get_backing_dev_info(device->bdev); fs_info = device->dev_root->fs_info; limit = btrfs_async_submit_limit(fs_info); limit = limit * 2 / 3; @@ -207,7 +209,32 @@ loop_lock: if (pending && bdi_write_congested(bdi) && num_run > 16 && fs_info->fs_devices->open_devices > 1) { struct bio *old_head; + struct io_context *ioc; + ioc = current->io_context; + + /* + * the main goal here is that we don't want to + * block if we're going to be able to submit + * more requests without blocking. + * + * This code does two great things, it pokes into + * the elevator code from a filesystem _and_ + * it makes assumptions about how batching works. + */ + if (ioc && ioc->nr_batch_requests > 0 && + time_before(jiffies, ioc->last_waited + HZ/50UL) && + (last_waited == 0 || + ioc->last_waited == last_waited)) { + /* + * we want to go through our batch of + * requests and stop. So, we copy out + * the ioc->last_waited time and test + * against it before looping + */ + last_waited = ioc->last_waited; + continue; + } spin_lock(&device->io_lock); old_head = device->pending_bios; @@ -231,6 +258,18 @@ loop_lock: if (device->pending_bios) goto loop_lock; spin_unlock(&device->io_lock); + + /* + * IO has already been through a long path to get here. Checksumming, + * async helper threads, perhaps compression. We've done a pretty + * good job of collecting a batch of IO and should just unplug + * the device right away. + * + * This will help anyone who is waiting on the IO, they might have + * already unplugged, but managed to do so before the bio they + * cared about found its way down here. + */ + blk_run_backing_dev(bdi, NULL); done: return 0; } diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 86c44e9ae110..2185de72ff7d 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -76,7 +76,7 @@ struct btrfs_device { struct btrfs_fs_devices { u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */ - /* the device with this id has the most recent coyp of the super */ + /* the device with this id has the most recent copy of the super */ u64 latest_devid; u64 latest_trans; u64 num_devices; diff --git a/fs/buffer.c b/fs/buffer.c index f5f8b15a6e40..13edf7ad3ff1 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -199,13 +199,13 @@ __find_get_block_slow(struct block_device *bdev, sector_t block) head = page_buffers(page); bh = head; do { - if (bh->b_blocknr == block) { + if (!buffer_mapped(bh)) + all_mapped = 0; + else if (bh->b_blocknr == block) { ret = bh; get_bh(bh); goto out_unlock; } - if (!buffer_mapped(bh)) - all_mapped = 0; bh = bh->b_this_page; } while (bh != head); @@ -737,7 +737,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list) { struct buffer_head *bh; struct list_head tmp; - struct address_space *mapping; + struct address_space *mapping, *prev_mapping = NULL; int err = 0, err2; INIT_LIST_HEAD(&tmp); @@ -762,7 +762,18 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list) * contents - it is a noop if I/O is still in * flight on potentially older contents. */ - ll_rw_block(SWRITE_SYNC, 1, &bh); + ll_rw_block(SWRITE_SYNC_PLUG, 1, &bh); + + /* + * Kick off IO for the previous mapping. Note + * that we will not run the very last mapping, + * wait_on_buffer() will do that for us + * through sync_buffer(). + */ + if (prev_mapping && prev_mapping != mapping) + blk_run_address_space(prev_mapping); + prev_mapping = mapping; + brelse(bh); spin_lock(lock); } @@ -1585,6 +1596,16 @@ EXPORT_SYMBOL(unmap_underlying_metadata); * locked buffer. This only can happen if someone has written the buffer * directly, with submit_bh(). At the address_space level PageWriteback * prevents this contention from occurring. + * + * If block_write_full_page() is called with wbc->sync_mode == + * WB_SYNC_ALL, the writes are posted using WRITE_SYNC_PLUG; this + * causes the writes to be flagged as synchronous writes, but the + * block device queue will NOT be unplugged, since usually many pages + * will be pushed to the out before the higher-level caller actually + * waits for the writes to be completed. The various wait functions, + * such as wait_on_writeback_range() will ultimately call sync_page() + * which will ultimately call blk_run_backing_dev(), which will end up + * unplugging the device queue. */ static int __block_write_full_page(struct inode *inode, struct page *page, get_block_t *get_block, struct writeback_control *wbc) @@ -1595,6 +1616,8 @@ static int __block_write_full_page(struct inode *inode, struct page *page, struct buffer_head *bh, *head; const unsigned blocksize = 1 << inode->i_blkbits; int nr_underway = 0; + int write_op = (wbc->sync_mode == WB_SYNC_ALL ? + WRITE_SYNC_PLUG : WRITE); BUG_ON(!PageLocked(page)); @@ -1686,7 +1709,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page, do { struct buffer_head *next = bh->b_this_page; if (buffer_async_write(bh)) { - submit_bh(WRITE, bh); + submit_bh(write_op, bh); nr_underway++; } bh = next; @@ -1740,7 +1763,7 @@ recover: struct buffer_head *next = bh->b_this_page; if (buffer_async_write(bh)) { clear_buffer_dirty(bh); - submit_bh(WRITE, bh); + submit_bh(write_op, bh); nr_underway++; } bh = next; @@ -2956,12 +2979,13 @@ void ll_rw_block(int rw, int nr, struct buffer_head *bhs[]) for (i = 0; i < nr; i++) { struct buffer_head *bh = bhs[i]; - if (rw == SWRITE || rw == SWRITE_SYNC) + if (rw == SWRITE || rw == SWRITE_SYNC || rw == SWRITE_SYNC_PLUG) lock_buffer(bh); else if (!trylock_buffer(bh)) continue; - if (rw == WRITE || rw == SWRITE || rw == SWRITE_SYNC) { + if (rw == WRITE || rw == SWRITE || rw == SWRITE_SYNC || + rw == SWRITE_SYNC_PLUG) { if (test_clear_buffer_dirty(bh)) { bh->b_end_io = end_buffer_write_sync; get_bh(bh); @@ -2997,7 +3021,7 @@ int sync_dirty_buffer(struct buffer_head *bh) if (test_clear_buffer_dirty(bh)) { get_bh(bh); bh->b_end_io = end_buffer_write_sync; - ret = submit_bh(WRITE, bh); + ret = submit_bh(WRITE_SYNC, bh); wait_on_buffer(bh); if (buffer_eopnotsupp(bh)) { clear_buffer_eopnotsupp(bh); @@ -3315,7 +3339,6 @@ EXPORT_SYMBOL(cont_write_begin); EXPORT_SYMBOL(end_buffer_read_sync); EXPORT_SYMBOL(end_buffer_write_sync); EXPORT_SYMBOL(file_fsync); -EXPORT_SYMBOL(fsync_bdev); EXPORT_SYMBOL(generic_block_bmap); EXPORT_SYMBOL(generic_cont_expand_simple); EXPORT_SYMBOL(init_buffer); diff --git a/fs/cachefiles/Kconfig b/fs/cachefiles/Kconfig new file mode 100644 index 000000000000..80e9c6167f0b --- /dev/null +++ b/fs/cachefiles/Kconfig @@ -0,0 +1,39 @@ + +config CACHEFILES + tristate "Filesystem caching on files" + depends on FSCACHE && BLOCK + help + This permits use of a mounted filesystem as a cache for other + filesystems - primarily networking filesystems - thus allowing fast + local disk to enhance the speed of slower devices. + + See Documentation/filesystems/caching/cachefiles.txt for more + information. + +config CACHEFILES_DEBUG + bool "Debug CacheFiles" + depends on CACHEFILES + help + This permits debugging to be dynamically enabled in the filesystem + caching on files module. If this is set, the debugging output may be + enabled by setting bits in /sys/modules/cachefiles/parameter/debug or + by including a debugging specifier in /etc/cachefilesd.conf. + +config CACHEFILES_HISTOGRAM + bool "Gather latency information on CacheFiles" + depends on CACHEFILES && PROC_FS + help + + This option causes latency information to be gathered on CacheFiles + operation and exported through file: + + /proc/fs/cachefiles/histogram + + The generation of this histogram adds a certain amount of overhead to + execution as there are a number of points at which data is gathered, + and on a multi-CPU system these may be on cachelines that keep + bouncing between CPUs. On the other hand, the histogram may be + useful for debugging purposes. Saying 'N' here is recommended. + + See Documentation/filesystems/caching/cachefiles.txt for more + information. diff --git a/fs/cachefiles/Makefile b/fs/cachefiles/Makefile new file mode 100644 index 000000000000..32cbab0ffce3 --- /dev/null +++ b/fs/cachefiles/Makefile @@ -0,0 +1,18 @@ +# +# Makefile for caching in a mounted filesystem +# + +cachefiles-y := \ + bind.o \ + daemon.o \ + interface.o \ + key.o \ + main.o \ + namei.o \ + rdwr.o \ + security.o \ + xattr.o + +cachefiles-$(CONFIG_CACHEFILES_HISTOGRAM) += proc.o + +obj-$(CONFIG_CACHEFILES) := cachefiles.o diff --git a/fs/cachefiles/bind.c b/fs/cachefiles/bind.c new file mode 100644 index 000000000000..3797e0077b35 --- /dev/null +++ b/fs/cachefiles/bind.c @@ -0,0 +1,286 @@ +/* Bind and unbind a cache from the filesystem backing it + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/sched.h> +#include <linux/completion.h> +#include <linux/slab.h> +#include <linux/fs.h> +#include <linux/file.h> +#include <linux/namei.h> +#include <linux/mount.h> +#include <linux/statfs.h> +#include <linux/ctype.h> +#include "internal.h" + +static int cachefiles_daemon_add_cache(struct cachefiles_cache *caches); + +/* + * bind a directory as a cache + */ +int cachefiles_daemon_bind(struct cachefiles_cache *cache, char *args) +{ + _enter("{%u,%u,%u,%u,%u,%u},%s", + cache->frun_percent, + cache->fcull_percent, + cache->fstop_percent, + cache->brun_percent, + cache->bcull_percent, + cache->bstop_percent, + args); + + /* start by checking things over */ + ASSERT(cache->fstop_percent >= 0 && + cache->fstop_percent < cache->fcull_percent && + cache->fcull_percent < cache->frun_percent && + cache->frun_percent < 100); + + ASSERT(cache->bstop_percent >= 0 && + cache->bstop_percent < cache->bcull_percent && + cache->bcull_percent < cache->brun_percent && + cache->brun_percent < 100); + + if (*args) { + kerror("'bind' command doesn't take an argument"); + return -EINVAL; + } + + if (!cache->rootdirname) { + kerror("No cache directory specified"); + return -EINVAL; + } + + /* don't permit already bound caches to be re-bound */ + if (test_bit(CACHEFILES_READY, &cache->flags)) { + kerror("Cache already bound"); + return -EBUSY; + } + + /* make sure we have copies of the tag and dirname strings */ + if (!cache->tag) { + /* the tag string is released by the fops->release() + * function, so we don't release it on error here */ + cache->tag = kstrdup("CacheFiles", GFP_KERNEL); + if (!cache->tag) + return -ENOMEM; + } + + /* add the cache */ + return cachefiles_daemon_add_cache(cache); +} + +/* + * add a cache + */ +static int cachefiles_daemon_add_cache(struct cachefiles_cache *cache) +{ + struct cachefiles_object *fsdef; + struct nameidata nd; + struct kstatfs stats; + struct dentry *graveyard, *cachedir, *root; + const struct cred *saved_cred; + int ret; + + _enter(""); + + /* we want to work under the module's security ID */ + ret = cachefiles_get_security_ID(cache); + if (ret < 0) + return ret; + + cachefiles_begin_secure(cache, &saved_cred); + + /* allocate the root index object */ + ret = -ENOMEM; + + fsdef = kmem_cache_alloc(cachefiles_object_jar, GFP_KERNEL); + if (!fsdef) + goto error_root_object; + + ASSERTCMP(fsdef->backer, ==, NULL); + + atomic_set(&fsdef->usage, 1); + fsdef->type = FSCACHE_COOKIE_TYPE_INDEX; + + _debug("- fsdef %p", fsdef); + + /* look up the directory at the root of the cache */ + memset(&nd, 0, sizeof(nd)); + + ret = path_lookup(cache->rootdirname, LOOKUP_DIRECTORY, &nd); + if (ret < 0) + goto error_open_root; + + cache->mnt = mntget(nd.path.mnt); + root = dget(nd.path.dentry); + path_put(&nd.path); + + /* check parameters */ + ret = -EOPNOTSUPP; + if (!root->d_inode || + !root->d_inode->i_op || + !root->d_inode->i_op->lookup || + !root->d_inode->i_op->mkdir || + !root->d_inode->i_op->setxattr || + !root->d_inode->i_op->getxattr || + !root->d_sb || + !root->d_sb->s_op || + !root->d_sb->s_op->statfs || + !root->d_sb->s_op->sync_fs) + goto error_unsupported; + + ret = -EROFS; + if (root->d_sb->s_flags & MS_RDONLY) + goto error_unsupported; + + /* determine the security of the on-disk cache as this governs + * security ID of files we create */ + ret = cachefiles_determine_cache_security(cache, root, &saved_cred); + if (ret < 0) + goto error_unsupported; + + /* get the cache size and blocksize */ + ret = vfs_statfs(root, &stats); + if (ret < 0) + goto error_unsupported; + + ret = -ERANGE; + if (stats.f_bsize <= 0) + goto error_unsupported; + + ret = -EOPNOTSUPP; + if (stats.f_bsize > PAGE_SIZE) + goto error_unsupported; + + cache->bsize = stats.f_bsize; + cache->bshift = 0; + if (stats.f_bsize < PAGE_SIZE) + cache->bshift = PAGE_SHIFT - ilog2(stats.f_bsize); + + _debug("blksize %u (shift %u)", + cache->bsize, cache->bshift); + + _debug("size %llu, avail %llu", + (unsigned long long) stats.f_blocks, + (unsigned long long) stats.f_bavail); + + /* set up caching limits */ + do_div(stats.f_files, 100); + cache->fstop = stats.f_files * cache->fstop_percent; + cache->fcull = stats.f_files * cache->fcull_percent; + cache->frun = stats.f_files * cache->frun_percent; + + _debug("limits {%llu,%llu,%llu} files", + (unsigned long long) cache->frun, + (unsigned long long) cache->fcull, + (unsigned long long) cache->fstop); + + stats.f_blocks >>= cache->bshift; + do_div(stats.f_blocks, 100); + cache->bstop = stats.f_blocks * cache->bstop_percent; + cache->bcull = stats.f_blocks * cache->bcull_percent; + cache->brun = stats.f_blocks * cache->brun_percent; + + _debug("limits {%llu,%llu,%llu} blocks", + (unsigned long long) cache->brun, + (unsigned long long) cache->bcull, + (unsigned long long) cache->bstop); + + /* get the cache directory and check its type */ + cachedir = cachefiles_get_directory(cache, root, "cache"); + if (IS_ERR(cachedir)) { + ret = PTR_ERR(cachedir); + goto error_unsupported; + } + + fsdef->dentry = cachedir; + fsdef->fscache.cookie = NULL; + + ret = cachefiles_check_object_type(fsdef); + if (ret < 0) + goto error_unsupported; + + /* get the graveyard directory */ + graveyard = cachefiles_get_directory(cache, root, "graveyard"); + if (IS_ERR(graveyard)) { + ret = PTR_ERR(graveyard); + goto error_unsupported; + } + + cache->graveyard = graveyard; + + /* publish the cache */ + fscache_init_cache(&cache->cache, + &cachefiles_cache_ops, + "%s", + fsdef->dentry->d_sb->s_id); + + fscache_object_init(&fsdef->fscache, NULL, &cache->cache); + + ret = fscache_add_cache(&cache->cache, &fsdef->fscache, cache->tag); + if (ret < 0) + goto error_add_cache; + + /* done */ + set_bit(CACHEFILES_READY, &cache->flags); + dput(root); + + printk(KERN_INFO "CacheFiles:" + " File cache on %s registered\n", + cache->cache.identifier); + + /* check how much space the cache has */ + cachefiles_has_space(cache, 0, 0); + cachefiles_end_secure(cache, saved_cred); + return 0; + +error_add_cache: + dput(cache->graveyard); + cache->graveyard = NULL; +error_unsupported: + mntput(cache->mnt); + cache->mnt = NULL; + dput(fsdef->dentry); + fsdef->dentry = NULL; + dput(root); +error_open_root: + kmem_cache_free(cachefiles_object_jar, fsdef); +error_root_object: + cachefiles_end_secure(cache, saved_cred); + kerror("Failed to register: %d", ret); + return ret; +} + +/* + * unbind a cache on fd release + */ +void cachefiles_daemon_unbind(struct cachefiles_cache *cache) +{ + _enter(""); + + if (test_bit(CACHEFILES_READY, &cache->flags)) { + printk(KERN_INFO "CacheFiles:" + " File cache on %s unregistering\n", + cache->cache.identifier); + + fscache_withdraw_cache(&cache->cache); + } + + dput(cache->graveyard); + mntput(cache->mnt); + + kfree(cache->rootdirname); + kfree(cache->secctx); + kfree(cache->tag); + + _leave(""); +} diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c new file mode 100644 index 000000000000..4618516dd994 --- /dev/null +++ b/fs/cachefiles/daemon.c @@ -0,0 +1,755 @@ +/* Daemon interface + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/sched.h> +#include <linux/completion.h> +#include <linux/slab.h> +#include <linux/fs.h> +#include <linux/file.h> +#include <linux/namei.h> +#include <linux/poll.h> +#include <linux/mount.h> +#include <linux/statfs.h> +#include <linux/ctype.h> +#include <linux/fs_struct.h> +#include "internal.h" + +static int cachefiles_daemon_open(struct inode *, struct file *); +static int cachefiles_daemon_release(struct inode *, struct file *); +static ssize_t cachefiles_daemon_read(struct file *, char __user *, size_t, + loff_t *); +static ssize_t cachefiles_daemon_write(struct file *, const char __user *, + size_t, loff_t *); +static unsigned int cachefiles_daemon_poll(struct file *, + struct poll_table_struct *); +static int cachefiles_daemon_frun(struct cachefiles_cache *, char *); +static int cachefiles_daemon_fcull(struct cachefiles_cache *, char *); +static int cachefiles_daemon_fstop(struct cachefiles_cache *, char *); +static int cachefiles_daemon_brun(struct cachefiles_cache *, char *); +static int cachefiles_daemon_bcull(struct cachefiles_cache *, char *); +static int cachefiles_daemon_bstop(struct cachefiles_cache *, char *); +static int cachefiles_daemon_cull(struct cachefiles_cache *, char *); +static int cachefiles_daemon_debug(struct cachefiles_cache *, char *); +static int cachefiles_daemon_dir(struct cachefiles_cache *, char *); +static int cachefiles_daemon_inuse(struct cachefiles_cache *, char *); +static int cachefiles_daemon_secctx(struct cachefiles_cache *, char *); +static int cachefiles_daemon_tag(struct cachefiles_cache *, char *); + +static unsigned long cachefiles_open; + +const struct file_operations cachefiles_daemon_fops = { + .owner = THIS_MODULE, + .open = cachefiles_daemon_open, + .release = cachefiles_daemon_release, + .read = cachefiles_daemon_read, + .write = cachefiles_daemon_write, + .poll = cachefiles_daemon_poll, +}; + +struct cachefiles_daemon_cmd { + char name[8]; + int (*handler)(struct cachefiles_cache *cache, char *args); +}; + +static const struct cachefiles_daemon_cmd cachefiles_daemon_cmds[] = { + { "bind", cachefiles_daemon_bind }, + { "brun", cachefiles_daemon_brun }, + { "bcull", cachefiles_daemon_bcull }, + { "bstop", cachefiles_daemon_bstop }, + { "cull", cachefiles_daemon_cull }, + { "debug", cachefiles_daemon_debug }, + { "dir", cachefiles_daemon_dir }, + { "frun", cachefiles_daemon_frun }, + { "fcull", cachefiles_daemon_fcull }, + { "fstop", cachefiles_daemon_fstop }, + { "inuse", cachefiles_daemon_inuse }, + { "secctx", cachefiles_daemon_secctx }, + { "tag", cachefiles_daemon_tag }, + { "", NULL } +}; + + +/* + * do various checks + */ +static int cachefiles_daemon_open(struct inode *inode, struct file *file) +{ + struct cachefiles_cache *cache; + + _enter(""); + + /* only the superuser may do this */ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + /* the cachefiles device may only be open once at a time */ + if (xchg(&cachefiles_open, 1) == 1) + return -EBUSY; + + /* allocate a cache record */ + cache = kzalloc(sizeof(struct cachefiles_cache), GFP_KERNEL); + if (!cache) { + cachefiles_open = 0; + return -ENOMEM; + } + + mutex_init(&cache->daemon_mutex); + cache->active_nodes = RB_ROOT; + rwlock_init(&cache->active_lock); + init_waitqueue_head(&cache->daemon_pollwq); + + /* set default caching limits + * - limit at 1% free space and/or free files + * - cull below 5% free space and/or free files + * - cease culling above 7% free space and/or free files + */ + cache->frun_percent = 7; + cache->fcull_percent = 5; + cache->fstop_percent = 1; + cache->brun_percent = 7; + cache->bcull_percent = 5; + cache->bstop_percent = 1; + + file->private_data = cache; + cache->cachefilesd = file; + return 0; +} + +/* + * release a cache + */ +static int cachefiles_daemon_release(struct inode *inode, struct file *file) +{ + struct cachefiles_cache *cache = file->private_data; + + _enter(""); + + ASSERT(cache); + + set_bit(CACHEFILES_DEAD, &cache->flags); + + cachefiles_daemon_unbind(cache); + + ASSERT(!cache->active_nodes.rb_node); + + /* clean up the control file interface */ + cache->cachefilesd = NULL; + file->private_data = NULL; + cachefiles_open = 0; + + kfree(cache); + + _leave(""); + return 0; +} + +/* + * read the cache state + */ +static ssize_t cachefiles_daemon_read(struct file *file, char __user *_buffer, + size_t buflen, loff_t *pos) +{ + struct cachefiles_cache *cache = file->private_data; + char buffer[256]; + int n; + + //_enter(",,%zu,", buflen); + + if (!test_bit(CACHEFILES_READY, &cache->flags)) + return 0; + + /* check how much space the cache has */ + cachefiles_has_space(cache, 0, 0); + + /* summarise */ + clear_bit(CACHEFILES_STATE_CHANGED, &cache->flags); + + n = snprintf(buffer, sizeof(buffer), + "cull=%c" + " frun=%llx" + " fcull=%llx" + " fstop=%llx" + " brun=%llx" + " bcull=%llx" + " bstop=%llx", + test_bit(CACHEFILES_CULLING, &cache->flags) ? '1' : '0', + (unsigned long long) cache->frun, + (unsigned long long) cache->fcull, + (unsigned long long) cache->fstop, + (unsigned long long) cache->brun, + (unsigned long long) cache->bcull, + (unsigned long long) cache->bstop + ); + + if (n > buflen) + return -EMSGSIZE; + + if (copy_to_user(_buffer, buffer, n) != 0) + return -EFAULT; + + return n; +} + +/* + * command the cache + */ +static ssize_t cachefiles_daemon_write(struct file *file, + const char __user *_data, + size_t datalen, + loff_t *pos) +{ + const struct cachefiles_daemon_cmd *cmd; + struct cachefiles_cache *cache = file->private_data; + ssize_t ret; + char *data, *args, *cp; + + //_enter(",,%zu,", datalen); + + ASSERT(cache); + + if (test_bit(CACHEFILES_DEAD, &cache->flags)) + return -EIO; + + if (datalen < 0 || datalen > PAGE_SIZE - 1) + return -EOPNOTSUPP; + + /* drag the command string into the kernel so we can parse it */ + data = kmalloc(datalen + 1, GFP_KERNEL); + if (!data) + return -ENOMEM; + + ret = -EFAULT; + if (copy_from_user(data, _data, datalen) != 0) + goto error; + + data[datalen] = '\0'; + + ret = -EINVAL; + if (memchr(data, '\0', datalen)) + goto error; + + /* strip any newline */ + cp = memchr(data, '\n', datalen); + if (cp) { + if (cp == data) + goto error; + + *cp = '\0'; + } + + /* parse the command */ + ret = -EOPNOTSUPP; + + for (args = data; *args; args++) + if (isspace(*args)) + break; + if (*args) { + if (args == data) + goto error; + *args = '\0'; + for (args++; isspace(*args); args++) + continue; + } + + /* run the appropriate command handler */ + for (cmd = cachefiles_daemon_cmds; cmd->name[0]; cmd++) + if (strcmp(cmd->name, data) == 0) + goto found_command; + +error: + kfree(data); + //_leave(" = %zd", ret); + return ret; + +found_command: + mutex_lock(&cache->daemon_mutex); + + ret = -EIO; + if (!test_bit(CACHEFILES_DEAD, &cache->flags)) + ret = cmd->handler(cache, args); + + mutex_unlock(&cache->daemon_mutex); + + if (ret == 0) + ret = datalen; + goto error; +} + +/* + * poll for culling state + * - use POLLOUT to indicate culling state + */ +static unsigned int cachefiles_daemon_poll(struct file *file, + struct poll_table_struct *poll) +{ + struct cachefiles_cache *cache = file->private_data; + unsigned int mask; + + poll_wait(file, &cache->daemon_pollwq, poll); + mask = 0; + + if (test_bit(CACHEFILES_STATE_CHANGED, &cache->flags)) + mask |= POLLIN; + + if (test_bit(CACHEFILES_CULLING, &cache->flags)) + mask |= POLLOUT; + + return mask; +} + +/* + * give a range error for cache space constraints + * - can be tail-called + */ +static int cachefiles_daemon_range_error(struct cachefiles_cache *cache, + char *args) +{ + kerror("Free space limits must be in range" + " 0%%<=stop<cull<run<100%%"); + + return -EINVAL; +} + +/* + * set the percentage of files at which to stop culling + * - command: "frun <N>%" + */ +static int cachefiles_daemon_frun(struct cachefiles_cache *cache, char *args) +{ + unsigned long frun; + + _enter(",%s", args); + + if (!*args) + return -EINVAL; + + frun = simple_strtoul(args, &args, 10); + if (args[0] != '%' || args[1] != '\0') + return -EINVAL; + + if (frun <= cache->fcull_percent || frun >= 100) + return cachefiles_daemon_range_error(cache, args); + + cache->frun_percent = frun; + return 0; +} + +/* + * set the percentage of files at which to start culling + * - command: "fcull <N>%" + */ +static int cachefiles_daemon_fcull(struct cachefiles_cache *cache, char *args) +{ + unsigned long fcull; + + _enter(",%s", args); + + if (!*args) + return -EINVAL; + + fcull = simple_strtoul(args, &args, 10); + if (args[0] != '%' || args[1] != '\0') + return -EINVAL; + + if (fcull <= cache->fstop_percent || fcull >= cache->frun_percent) + return cachefiles_daemon_range_error(cache, args); + + cache->fcull_percent = fcull; + return 0; +} + +/* + * set the percentage of files at which to stop allocating + * - command: "fstop <N>%" + */ +static int cachefiles_daemon_fstop(struct cachefiles_cache *cache, char *args) +{ + unsigned long fstop; + + _enter(",%s", args); + + if (!*args) + return -EINVAL; + + fstop = simple_strtoul(args, &args, 10); + if (args[0] != '%' || args[1] != '\0') + return -EINVAL; + + if (fstop < 0 || fstop >= cache->fcull_percent) + return cachefiles_daemon_range_error(cache, args); + + cache->fstop_percent = fstop; + return 0; +} + +/* + * set the percentage of blocks at which to stop culling + * - command: "brun <N>%" + */ +static int cachefiles_daemon_brun(struct cachefiles_cache *cache, char *args) +{ + unsigned long brun; + + _enter(",%s", args); + + if (!*args) + return -EINVAL; + + brun = simple_strtoul(args, &args, 10); + if (args[0] != '%' || args[1] != '\0') + return -EINVAL; + + if (brun <= cache->bcull_percent || brun >= 100) + return cachefiles_daemon_range_error(cache, args); + + cache->brun_percent = brun; + return 0; +} + +/* + * set the percentage of blocks at which to start culling + * - command: "bcull <N>%" + */ +static int cachefiles_daemon_bcull(struct cachefiles_cache *cache, char *args) +{ + unsigned long bcull; + + _enter(",%s", args); + + if (!*args) + return -EINVAL; + + bcull = simple_strtoul(args, &args, 10); + if (args[0] != '%' || args[1] != '\0') + return -EINVAL; + + if (bcull <= cache->bstop_percent || bcull >= cache->brun_percent) + return cachefiles_daemon_range_error(cache, args); + + cache->bcull_percent = bcull; + return 0; +} + +/* + * set the percentage of blocks at which to stop allocating + * - command: "bstop <N>%" + */ +static int cachefiles_daemon_bstop(struct cachefiles_cache *cache, char *args) +{ + unsigned long bstop; + + _enter(",%s", args); + + if (!*args) + return -EINVAL; + + bstop = simple_strtoul(args, &args, 10); + if (args[0] != '%' || args[1] != '\0') + return -EINVAL; + + if (bstop < 0 || bstop >= cache->bcull_percent) + return cachefiles_daemon_range_error(cache, args); + + cache->bstop_percent = bstop; + return 0; +} + +/* + * set the cache directory + * - command: "dir <name>" + */ +static int cachefiles_daemon_dir(struct cachefiles_cache *cache, char *args) +{ + char *dir; + + _enter(",%s", args); + + if (!*args) { + kerror("Empty directory specified"); + return -EINVAL; + } + + if (cache->rootdirname) { + kerror("Second cache directory specified"); + return -EEXIST; + } + + dir = kstrdup(args, GFP_KERNEL); + if (!dir) + return -ENOMEM; + + cache->rootdirname = dir; + return 0; +} + +/* + * set the cache security context + * - command: "secctx <ctx>" + */ +static int cachefiles_daemon_secctx(struct cachefiles_cache *cache, char *args) +{ + char *secctx; + + _enter(",%s", args); + + if (!*args) { + kerror("Empty security context specified"); + return -EINVAL; + } + + if (cache->secctx) { + kerror("Second security context specified"); + return -EINVAL; + } + + secctx = kstrdup(args, GFP_KERNEL); + if (!secctx) + return -ENOMEM; + + cache->secctx = secctx; + return 0; +} + +/* + * set the cache tag + * - command: "tag <name>" + */ +static int cachefiles_daemon_tag(struct cachefiles_cache *cache, char *args) +{ + char *tag; + + _enter(",%s", args); + + if (!*args) { + kerror("Empty tag specified"); + return -EINVAL; + } + + if (cache->tag) + return -EEXIST; + + tag = kstrdup(args, GFP_KERNEL); + if (!tag) + return -ENOMEM; + + cache->tag = tag; + return 0; +} + +/* + * request a node in the cache be culled from the current working directory + * - command: "cull <name>" + */ +static int cachefiles_daemon_cull(struct cachefiles_cache *cache, char *args) +{ + struct fs_struct *fs; + struct dentry *dir; + const struct cred *saved_cred; + int ret; + + _enter(",%s", args); + + if (strchr(args, '/')) + goto inval; + + if (!test_bit(CACHEFILES_READY, &cache->flags)) { + kerror("cull applied to unready cache"); + return -EIO; + } + + if (test_bit(CACHEFILES_DEAD, &cache->flags)) { + kerror("cull applied to dead cache"); + return -EIO; + } + + /* extract the directory dentry from the cwd */ + fs = current->fs; + read_lock(&fs->lock); + dir = dget(fs->pwd.dentry); + read_unlock(&fs->lock); + + if (!S_ISDIR(dir->d_inode->i_mode)) + goto notdir; + + cachefiles_begin_secure(cache, &saved_cred); + ret = cachefiles_cull(cache, dir, args); + cachefiles_end_secure(cache, saved_cred); + + dput(dir); + _leave(" = %d", ret); + return ret; + +notdir: + dput(dir); + kerror("cull command requires dirfd to be a directory"); + return -ENOTDIR; + +inval: + kerror("cull command requires dirfd and filename"); + return -EINVAL; +} + +/* + * set debugging mode + * - command: "debug <mask>" + */ +static int cachefiles_daemon_debug(struct cachefiles_cache *cache, char *args) +{ + unsigned long mask; + + _enter(",%s", args); + + mask = simple_strtoul(args, &args, 0); + if (args[0] != '\0') + goto inval; + + cachefiles_debug = mask; + _leave(" = 0"); + return 0; + +inval: + kerror("debug command requires mask"); + return -EINVAL; +} + +/* + * find out whether an object in the current working directory is in use or not + * - command: "inuse <name>" + */ +static int cachefiles_daemon_inuse(struct cachefiles_cache *cache, char *args) +{ + struct fs_struct *fs; + struct dentry *dir; + const struct cred *saved_cred; + int ret; + + //_enter(",%s", args); + + if (strchr(args, '/')) + goto inval; + + if (!test_bit(CACHEFILES_READY, &cache->flags)) { + kerror("inuse applied to unready cache"); + return -EIO; + } + + if (test_bit(CACHEFILES_DEAD, &cache->flags)) { + kerror("inuse applied to dead cache"); + return -EIO; + } + + /* extract the directory dentry from the cwd */ + fs = current->fs; + read_lock(&fs->lock); + dir = dget(fs->pwd.dentry); + read_unlock(&fs->lock); + + if (!S_ISDIR(dir->d_inode->i_mode)) + goto notdir; + + cachefiles_begin_secure(cache, &saved_cred); + ret = cachefiles_check_in_use(cache, dir, args); + cachefiles_end_secure(cache, saved_cred); + + dput(dir); + //_leave(" = %d", ret); + return ret; + +notdir: + dput(dir); + kerror("inuse command requires dirfd to be a directory"); + return -ENOTDIR; + +inval: + kerror("inuse command requires dirfd and filename"); + return -EINVAL; +} + +/* + * see if we have space for a number of pages and/or a number of files in the + * cache + */ +int cachefiles_has_space(struct cachefiles_cache *cache, + unsigned fnr, unsigned bnr) +{ + struct kstatfs stats; + int ret; + + //_enter("{%llu,%llu,%llu,%llu,%llu,%llu},%u,%u", + // (unsigned long long) cache->frun, + // (unsigned long long) cache->fcull, + // (unsigned long long) cache->fstop, + // (unsigned long long) cache->brun, + // (unsigned long long) cache->bcull, + // (unsigned long long) cache->bstop, + // fnr, bnr); + + /* find out how many pages of blockdev are available */ + memset(&stats, 0, sizeof(stats)); + + ret = vfs_statfs(cache->mnt->mnt_root, &stats); + if (ret < 0) { + if (ret == -EIO) + cachefiles_io_error(cache, "statfs failed"); + _leave(" = %d", ret); + return ret; + } + + stats.f_bavail >>= cache->bshift; + + //_debug("avail %llu,%llu", + // (unsigned long long) stats.f_ffree, + // (unsigned long long) stats.f_bavail); + + /* see if there is sufficient space */ + if (stats.f_ffree > fnr) + stats.f_ffree -= fnr; + else + stats.f_ffree = 0; + + if (stats.f_bavail > bnr) + stats.f_bavail -= bnr; + else + stats.f_bavail = 0; + + ret = -ENOBUFS; + if (stats.f_ffree < cache->fstop || + stats.f_bavail < cache->bstop) + goto begin_cull; + + ret = 0; + if (stats.f_ffree < cache->fcull || + stats.f_bavail < cache->bcull) + goto begin_cull; + + if (test_bit(CACHEFILES_CULLING, &cache->flags) && + stats.f_ffree >= cache->frun && + stats.f_bavail >= cache->brun && + test_and_clear_bit(CACHEFILES_CULLING, &cache->flags) + ) { + _debug("cease culling"); + cachefiles_state_changed(cache); + } + + //_leave(" = 0"); + return 0; + +begin_cull: + if (!test_and_set_bit(CACHEFILES_CULLING, &cache->flags)) { + _debug("### CULL CACHE ###"); + cachefiles_state_changed(cache); + } + + _leave(" = %d", ret); + return ret; +} diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c new file mode 100644 index 000000000000..1e962348d111 --- /dev/null +++ b/fs/cachefiles/interface.c @@ -0,0 +1,449 @@ +/* FS-Cache interface to CacheFiles + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include <linux/mount.h> +#include <linux/buffer_head.h> +#include "internal.h" + +#define list_to_page(head) (list_entry((head)->prev, struct page, lru)) + +struct cachefiles_lookup_data { + struct cachefiles_xattr *auxdata; /* auxiliary data */ + char *key; /* key path */ +}; + +static int cachefiles_attr_changed(struct fscache_object *_object); + +/* + * allocate an object record for a cookie lookup and prepare the lookup data + */ +static struct fscache_object *cachefiles_alloc_object( + struct fscache_cache *_cache, + struct fscache_cookie *cookie) +{ + struct cachefiles_lookup_data *lookup_data; + struct cachefiles_object *object; + struct cachefiles_cache *cache; + struct cachefiles_xattr *auxdata; + unsigned keylen, auxlen; + void *buffer; + char *key; + + cache = container_of(_cache, struct cachefiles_cache, cache); + + _enter("{%s},%p,", cache->cache.identifier, cookie); + + lookup_data = kmalloc(sizeof(*lookup_data), GFP_KERNEL); + if (!lookup_data) + goto nomem_lookup_data; + + /* create a new object record and a temporary leaf image */ + object = kmem_cache_alloc(cachefiles_object_jar, GFP_KERNEL); + if (!object) + goto nomem_object; + + ASSERTCMP(object->backer, ==, NULL); + + BUG_ON(test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags)); + atomic_set(&object->usage, 1); + + fscache_object_init(&object->fscache, cookie, &cache->cache); + + object->type = cookie->def->type; + + /* get hold of the raw key + * - stick the length on the front and leave space on the back for the + * encoder + */ + buffer = kmalloc((2 + 512) + 3, GFP_KERNEL); + if (!buffer) + goto nomem_buffer; + + keylen = cookie->def->get_key(cookie->netfs_data, buffer + 2, 512); + ASSERTCMP(keylen, <, 512); + + *(uint16_t *)buffer = keylen; + ((char *)buffer)[keylen + 2] = 0; + ((char *)buffer)[keylen + 3] = 0; + ((char *)buffer)[keylen + 4] = 0; + + /* turn the raw key into something that can work with as a filename */ + key = cachefiles_cook_key(buffer, keylen + 2, object->type); + if (!key) + goto nomem_key; + + /* get hold of the auxiliary data and prepend the object type */ + auxdata = buffer; + auxlen = 0; + if (cookie->def->get_aux) { + auxlen = cookie->def->get_aux(cookie->netfs_data, + auxdata->data, 511); + ASSERTCMP(auxlen, <, 511); + } + + auxdata->len = auxlen + 1; + auxdata->type = cookie->def->type; + + lookup_data->auxdata = auxdata; + lookup_data->key = key; + object->lookup_data = lookup_data; + + _leave(" = %p [%p]", &object->fscache, lookup_data); + return &object->fscache; + +nomem_key: + kfree(buffer); +nomem_buffer: + BUG_ON(test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags)); + kmem_cache_free(cachefiles_object_jar, object); + fscache_object_destroyed(&cache->cache); +nomem_object: + kfree(lookup_data); +nomem_lookup_data: + _leave(" = -ENOMEM"); + return ERR_PTR(-ENOMEM); +} + +/* + * attempt to look up the nominated node in this cache + */ +static void cachefiles_lookup_object(struct fscache_object *_object) +{ + struct cachefiles_lookup_data *lookup_data; + struct cachefiles_object *parent, *object; + struct cachefiles_cache *cache; + const struct cred *saved_cred; + int ret; + + _enter("{OBJ%x}", _object->debug_id); + + cache = container_of(_object->cache, struct cachefiles_cache, cache); + parent = container_of(_object->parent, + struct cachefiles_object, fscache); + object = container_of(_object, struct cachefiles_object, fscache); + lookup_data = object->lookup_data; + + ASSERTCMP(lookup_data, !=, NULL); + + /* look up the key, creating any missing bits */ + cachefiles_begin_secure(cache, &saved_cred); + ret = cachefiles_walk_to_object(parent, object, + lookup_data->key, + lookup_data->auxdata); + cachefiles_end_secure(cache, saved_cred); + + /* polish off by setting the attributes of non-index files */ + if (ret == 0 && + object->fscache.cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX) + cachefiles_attr_changed(&object->fscache); + + if (ret < 0) { + printk(KERN_WARNING "CacheFiles: Lookup failed error %d\n", + ret); + fscache_object_lookup_error(&object->fscache); + } + + _leave(" [%d]", ret); +} + +/* + * indication of lookup completion + */ +static void cachefiles_lookup_complete(struct fscache_object *_object) +{ + struct cachefiles_object *object; + + object = container_of(_object, struct cachefiles_object, fscache); + + _enter("{OBJ%x,%p}", object->fscache.debug_id, object->lookup_data); + + if (object->lookup_data) { + kfree(object->lookup_data->key); + kfree(object->lookup_data->auxdata); + kfree(object->lookup_data); + object->lookup_data = NULL; + } +} + +/* + * increment the usage count on an inode object (may fail if unmounting) + */ +static +struct fscache_object *cachefiles_grab_object(struct fscache_object *_object) +{ + struct cachefiles_object *object = + container_of(_object, struct cachefiles_object, fscache); + + _enter("{OBJ%x,%d}", _object->debug_id, atomic_read(&object->usage)); + +#ifdef CACHEFILES_DEBUG_SLAB + ASSERT((atomic_read(&object->usage) & 0xffff0000) != 0x6b6b0000); +#endif + + atomic_inc(&object->usage); + return &object->fscache; +} + +/* + * update the auxilliary data for an object object on disk + */ +static void cachefiles_update_object(struct fscache_object *_object) +{ + struct cachefiles_object *object; + struct cachefiles_xattr *auxdata; + struct cachefiles_cache *cache; + struct fscache_cookie *cookie; + const struct cred *saved_cred; + unsigned auxlen; + + _enter("{OBJ%x}", _object->debug_id); + + object = container_of(_object, struct cachefiles_object, fscache); + cache = container_of(object->fscache.cache, struct cachefiles_cache, + cache); + cookie = object->fscache.cookie; + + if (!cookie->def->get_aux) { + _leave(" [no aux]"); + return; + } + + auxdata = kmalloc(2 + 512 + 3, GFP_KERNEL); + if (!auxdata) { + _leave(" [nomem]"); + return; + } + + auxlen = cookie->def->get_aux(cookie->netfs_data, auxdata->data, 511); + ASSERTCMP(auxlen, <, 511); + + auxdata->len = auxlen + 1; + auxdata->type = cookie->def->type; + + cachefiles_begin_secure(cache, &saved_cred); + cachefiles_update_object_xattr(object, auxdata); + cachefiles_end_secure(cache, saved_cred); + kfree(auxdata); + _leave(""); +} + +/* + * discard the resources pinned by an object and effect retirement if + * requested + */ +static void cachefiles_drop_object(struct fscache_object *_object) +{ + struct cachefiles_object *object; + struct cachefiles_cache *cache; + const struct cred *saved_cred; + + ASSERT(_object); + + object = container_of(_object, struct cachefiles_object, fscache); + + _enter("{OBJ%x,%d}", + object->fscache.debug_id, atomic_read(&object->usage)); + + cache = container_of(object->fscache.cache, + struct cachefiles_cache, cache); + +#ifdef CACHEFILES_DEBUG_SLAB + ASSERT((atomic_read(&object->usage) & 0xffff0000) != 0x6b6b0000); +#endif + + /* delete retired objects */ + if (object->fscache.state == FSCACHE_OBJECT_RECYCLING && + _object != cache->cache.fsdef + ) { + _debug("- retire object OBJ%x", object->fscache.debug_id); + cachefiles_begin_secure(cache, &saved_cred); + cachefiles_delete_object(cache, object); + cachefiles_end_secure(cache, saved_cred); + } + + /* close the filesystem stuff attached to the object */ + if (object->backer != object->dentry) + dput(object->backer); + object->backer = NULL; + + /* note that the object is now inactive */ + if (test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags)) { + write_lock(&cache->active_lock); + if (!test_and_clear_bit(CACHEFILES_OBJECT_ACTIVE, + &object->flags)) + BUG(); + rb_erase(&object->active_node, &cache->active_nodes); + wake_up_bit(&object->flags, CACHEFILES_OBJECT_ACTIVE); + write_unlock(&cache->active_lock); + } + + dput(object->dentry); + object->dentry = NULL; + + _leave(""); +} + +/* + * dispose of a reference to an object + */ +static void cachefiles_put_object(struct fscache_object *_object) +{ + struct cachefiles_object *object; + struct fscache_cache *cache; + + ASSERT(_object); + + object = container_of(_object, struct cachefiles_object, fscache); + + _enter("{OBJ%x,%d}", + object->fscache.debug_id, atomic_read(&object->usage)); + +#ifdef CACHEFILES_DEBUG_SLAB + ASSERT((atomic_read(&object->usage) & 0xffff0000) != 0x6b6b0000); +#endif + + ASSERTIFCMP(object->fscache.parent, + object->fscache.parent->n_children, >, 0); + + if (atomic_dec_and_test(&object->usage)) { + _debug("- kill object OBJ%x", object->fscache.debug_id); + + ASSERT(!test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags)); + ASSERTCMP(object->fscache.parent, ==, NULL); + ASSERTCMP(object->backer, ==, NULL); + ASSERTCMP(object->dentry, ==, NULL); + ASSERTCMP(object->fscache.n_ops, ==, 0); + ASSERTCMP(object->fscache.n_children, ==, 0); + + if (object->lookup_data) { + kfree(object->lookup_data->key); + kfree(object->lookup_data->auxdata); + kfree(object->lookup_data); + object->lookup_data = NULL; + } + + cache = object->fscache.cache; + kmem_cache_free(cachefiles_object_jar, object); + fscache_object_destroyed(cache); + } + + _leave(""); +} + +/* + * sync a cache + */ +static void cachefiles_sync_cache(struct fscache_cache *_cache) +{ + struct cachefiles_cache *cache; + const struct cred *saved_cred; + int ret; + + _enter("%p", _cache); + + cache = container_of(_cache, struct cachefiles_cache, cache); + + /* make sure all pages pinned by operations on behalf of the netfs are + * written to disc */ + cachefiles_begin_secure(cache, &saved_cred); + ret = fsync_super(cache->mnt->mnt_sb); + cachefiles_end_secure(cache, saved_cred); + + if (ret == -EIO) + cachefiles_io_error(cache, + "Attempt to sync backing fs superblock" + " returned error %d", + ret); +} + +/* + * notification the attributes on an object have changed + * - called with reads/writes excluded by FS-Cache + */ +static int cachefiles_attr_changed(struct fscache_object *_object) +{ + struct cachefiles_object *object; + struct cachefiles_cache *cache; + const struct cred *saved_cred; + struct iattr newattrs; + uint64_t ni_size; + loff_t oi_size; + int ret; + + _object->cookie->def->get_attr(_object->cookie->netfs_data, &ni_size); + + _enter("{OBJ%x},[%llu]", + _object->debug_id, (unsigned long long) ni_size); + + object = container_of(_object, struct cachefiles_object, fscache); + cache = container_of(object->fscache.cache, + struct cachefiles_cache, cache); + + if (ni_size == object->i_size) + return 0; + + if (!object->backer) + return -ENOBUFS; + + ASSERT(S_ISREG(object->backer->d_inode->i_mode)); + + fscache_set_store_limit(&object->fscache, ni_size); + + oi_size = i_size_read(object->backer->d_inode); + if (oi_size == ni_size) + return 0; + + newattrs.ia_size = ni_size; + newattrs.ia_valid = ATTR_SIZE; + + cachefiles_begin_secure(cache, &saved_cred); + mutex_lock(&object->backer->d_inode->i_mutex); + ret = notify_change(object->backer, &newattrs); + mutex_unlock(&object->backer->d_inode->i_mutex); + cachefiles_end_secure(cache, saved_cred); + + if (ret == -EIO) { + fscache_set_store_limit(&object->fscache, 0); + cachefiles_io_error_obj(object, "Size set failed"); + ret = -ENOBUFS; + } + + _leave(" = %d", ret); + return ret; +} + +/* + * dissociate a cache from all the pages it was backing + */ +static void cachefiles_dissociate_pages(struct fscache_cache *cache) +{ + _enter(""); +} + +const struct fscache_cache_ops cachefiles_cache_ops = { + .name = "cachefiles", + .alloc_object = cachefiles_alloc_object, + .lookup_object = cachefiles_lookup_object, + .lookup_complete = cachefiles_lookup_complete, + .grab_object = cachefiles_grab_object, + .update_object = cachefiles_update_object, + .drop_object = cachefiles_drop_object, + .put_object = cachefiles_put_object, + .sync_cache = cachefiles_sync_cache, + .attr_changed = cachefiles_attr_changed, + .read_or_alloc_page = cachefiles_read_or_alloc_page, + .read_or_alloc_pages = cachefiles_read_or_alloc_pages, + .allocate_page = cachefiles_allocate_page, + .allocate_pages = cachefiles_allocate_pages, + .write_page = cachefiles_write_page, + .uncache_page = cachefiles_uncache_page, + .dissociate_pages = cachefiles_dissociate_pages, +}; diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h new file mode 100644 index 000000000000..19218e1463d6 --- /dev/null +++ b/fs/cachefiles/internal.h @@ -0,0 +1,360 @@ +/* General netfs cache on cache files internal defs + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include <linux/fscache-cache.h> +#include <linux/timer.h> +#include <linux/wait.h> +#include <linux/workqueue.h> +#include <linux/security.h> + +struct cachefiles_cache; +struct cachefiles_object; + +extern unsigned cachefiles_debug; +#define CACHEFILES_DEBUG_KENTER 1 +#define CACHEFILES_DEBUG_KLEAVE 2 +#define CACHEFILES_DEBUG_KDEBUG 4 + +/* + * node records + */ +struct cachefiles_object { + struct fscache_object fscache; /* fscache handle */ + struct cachefiles_lookup_data *lookup_data; /* cached lookup data */ + struct dentry *dentry; /* the file/dir representing this object */ + struct dentry *backer; /* backing file */ + loff_t i_size; /* object size */ + unsigned long flags; +#define CACHEFILES_OBJECT_ACTIVE 0 /* T if marked active */ + atomic_t usage; /* object usage count */ + uint8_t type; /* object type */ + uint8_t new; /* T if object new */ + spinlock_t work_lock; + struct rb_node active_node; /* link in active tree (dentry is key) */ +}; + +extern struct kmem_cache *cachefiles_object_jar; + +/* + * Cache files cache definition + */ +struct cachefiles_cache { + struct fscache_cache cache; /* FS-Cache record */ + struct vfsmount *mnt; /* mountpoint holding the cache */ + struct dentry *graveyard; /* directory into which dead objects go */ + struct file *cachefilesd; /* manager daemon handle */ + const struct cred *cache_cred; /* security override for accessing cache */ + struct mutex daemon_mutex; /* command serialisation mutex */ + wait_queue_head_t daemon_pollwq; /* poll waitqueue for daemon */ + struct rb_root active_nodes; /* active nodes (can't be culled) */ + rwlock_t active_lock; /* lock for active_nodes */ + atomic_t gravecounter; /* graveyard uniquifier */ + unsigned frun_percent; /* when to stop culling (% files) */ + unsigned fcull_percent; /* when to start culling (% files) */ + unsigned fstop_percent; /* when to stop allocating (% files) */ + unsigned brun_percent; /* when to stop culling (% blocks) */ + unsigned bcull_percent; /* when to start culling (% blocks) */ + unsigned bstop_percent; /* when to stop allocating (% blocks) */ + unsigned bsize; /* cache's block size */ + unsigned bshift; /* min(ilog2(PAGE_SIZE / bsize), 0) */ + uint64_t frun; /* when to stop culling */ + uint64_t fcull; /* when to start culling */ + uint64_t fstop; /* when to stop allocating */ + sector_t brun; /* when to stop culling */ + sector_t bcull; /* when to start culling */ + sector_t bstop; /* when to stop allocating */ + unsigned long flags; +#define CACHEFILES_READY 0 /* T if cache prepared */ +#define CACHEFILES_DEAD 1 /* T if cache dead */ +#define CACHEFILES_CULLING 2 /* T if cull engaged */ +#define CACHEFILES_STATE_CHANGED 3 /* T if state changed (poll trigger) */ + char *rootdirname; /* name of cache root directory */ + char *secctx; /* LSM security context */ + char *tag; /* cache binding tag */ +}; + +/* + * backing file read tracking + */ +struct cachefiles_one_read { + wait_queue_t monitor; /* link into monitored waitqueue */ + struct page *back_page; /* backing file page we're waiting for */ + struct page *netfs_page; /* netfs page we're going to fill */ + struct fscache_retrieval *op; /* retrieval op covering this */ + struct list_head op_link; /* link in op's todo list */ +}; + +/* + * backing file write tracking + */ +struct cachefiles_one_write { + struct page *netfs_page; /* netfs page to copy */ + struct cachefiles_object *object; + struct list_head obj_link; /* link in object's lists */ + fscache_rw_complete_t end_io_func; + void *context; +}; + +/* + * auxiliary data xattr buffer + */ +struct cachefiles_xattr { + uint16_t len; + uint8_t type; + uint8_t data[]; +}; + +/* + * note change of state for daemon + */ +static inline void cachefiles_state_changed(struct cachefiles_cache *cache) +{ + set_bit(CACHEFILES_STATE_CHANGED, &cache->flags); + wake_up_all(&cache->daemon_pollwq); +} + +/* + * cf-bind.c + */ +extern int cachefiles_daemon_bind(struct cachefiles_cache *cache, char *args); +extern void cachefiles_daemon_unbind(struct cachefiles_cache *cache); + +/* + * cf-daemon.c + */ +extern const struct file_operations cachefiles_daemon_fops; + +extern int cachefiles_has_space(struct cachefiles_cache *cache, + unsigned fnr, unsigned bnr); + +/* + * cf-interface.c + */ +extern const struct fscache_cache_ops cachefiles_cache_ops; + +/* + * cf-key.c + */ +extern char *cachefiles_cook_key(const u8 *raw, int keylen, uint8_t type); + +/* + * cf-namei.c + */ +extern int cachefiles_delete_object(struct cachefiles_cache *cache, + struct cachefiles_object *object); +extern int cachefiles_walk_to_object(struct cachefiles_object *parent, + struct cachefiles_object *object, + const char *key, + struct cachefiles_xattr *auxdata); +extern struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache, + struct dentry *dir, + const char *name); + +extern int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir, + char *filename); + +extern int cachefiles_check_in_use(struct cachefiles_cache *cache, + struct dentry *dir, char *filename); + +/* + * cf-proc.c + */ +#ifdef CONFIG_CACHEFILES_HISTOGRAM +extern atomic_t cachefiles_lookup_histogram[HZ]; +extern atomic_t cachefiles_mkdir_histogram[HZ]; +extern atomic_t cachefiles_create_histogram[HZ]; + +extern int __init cachefiles_proc_init(void); +extern void cachefiles_proc_cleanup(void); +static inline +void cachefiles_hist(atomic_t histogram[], unsigned long start_jif) +{ + unsigned long jif = jiffies - start_jif; + if (jif >= HZ) + jif = HZ - 1; + atomic_inc(&histogram[jif]); +} + +#else +#define cachefiles_proc_init() (0) +#define cachefiles_proc_cleanup() do {} while (0) +#define cachefiles_hist(hist, start_jif) do {} while (0) +#endif + +/* + * cf-rdwr.c + */ +extern int cachefiles_read_or_alloc_page(struct fscache_retrieval *, + struct page *, gfp_t); +extern int cachefiles_read_or_alloc_pages(struct fscache_retrieval *, + struct list_head *, unsigned *, + gfp_t); +extern int cachefiles_allocate_page(struct fscache_retrieval *, struct page *, + gfp_t); +extern int cachefiles_allocate_pages(struct fscache_retrieval *, + struct list_head *, unsigned *, gfp_t); +extern int cachefiles_write_page(struct fscache_storage *, struct page *); +extern void cachefiles_uncache_page(struct fscache_object *, struct page *); + +/* + * cf-security.c + */ +extern int cachefiles_get_security_ID(struct cachefiles_cache *cache); +extern int cachefiles_determine_cache_security(struct cachefiles_cache *cache, + struct dentry *root, + const struct cred **_saved_cred); + +static inline void cachefiles_begin_secure(struct cachefiles_cache *cache, + const struct cred **_saved_cred) +{ + *_saved_cred = override_creds(cache->cache_cred); +} + +static inline void cachefiles_end_secure(struct cachefiles_cache *cache, + const struct cred *saved_cred) +{ + revert_creds(saved_cred); +} + +/* + * cf-xattr.c + */ +extern int cachefiles_check_object_type(struct cachefiles_object *object); +extern int cachefiles_set_object_xattr(struct cachefiles_object *object, + struct cachefiles_xattr *auxdata); +extern int cachefiles_update_object_xattr(struct cachefiles_object *object, + struct cachefiles_xattr *auxdata); +extern int cachefiles_check_object_xattr(struct cachefiles_object *object, + struct cachefiles_xattr *auxdata); +extern int cachefiles_remove_object_xattr(struct cachefiles_cache *cache, + struct dentry *dentry); + + +/* + * error handling + */ +#define kerror(FMT, ...) printk(KERN_ERR "CacheFiles: "FMT"\n", ##__VA_ARGS__) + +#define cachefiles_io_error(___cache, FMT, ...) \ +do { \ + kerror("I/O Error: " FMT, ##__VA_ARGS__); \ + fscache_io_error(&(___cache)->cache); \ + set_bit(CACHEFILES_DEAD, &(___cache)->flags); \ +} while (0) + +#define cachefiles_io_error_obj(object, FMT, ...) \ +do { \ + struct cachefiles_cache *___cache; \ + \ + ___cache = container_of((object)->fscache.cache, \ + struct cachefiles_cache, cache); \ + cachefiles_io_error(___cache, FMT, ##__VA_ARGS__); \ +} while (0) + + +/* + * debug tracing + */ +#define dbgprintk(FMT, ...) \ + printk(KERN_DEBUG "[%-6.6s] "FMT"\n", current->comm, ##__VA_ARGS__) + +/* make sure we maintain the format strings, even when debugging is disabled */ +static inline void _dbprintk(const char *fmt, ...) + __attribute__((format(printf, 1, 2))); +static inline void _dbprintk(const char *fmt, ...) +{ +} + +#define kenter(FMT, ...) dbgprintk("==> %s("FMT")", __func__, ##__VA_ARGS__) +#define kleave(FMT, ...) dbgprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__) +#define kdebug(FMT, ...) dbgprintk(FMT, ##__VA_ARGS__) + + +#if defined(__KDEBUG) +#define _enter(FMT, ...) kenter(FMT, ##__VA_ARGS__) +#define _leave(FMT, ...) kleave(FMT, ##__VA_ARGS__) +#define _debug(FMT, ...) kdebug(FMT, ##__VA_ARGS__) + +#elif defined(CONFIG_CACHEFILES_DEBUG) +#define _enter(FMT, ...) \ +do { \ + if (cachefiles_debug & CACHEFILES_DEBUG_KENTER) \ + kenter(FMT, ##__VA_ARGS__); \ +} while (0) + +#define _leave(FMT, ...) \ +do { \ + if (cachefiles_debug & CACHEFILES_DEBUG_KLEAVE) \ + kleave(FMT, ##__VA_ARGS__); \ +} while (0) + +#define _debug(FMT, ...) \ +do { \ + if (cachefiles_debug & CACHEFILES_DEBUG_KDEBUG) \ + kdebug(FMT, ##__VA_ARGS__); \ +} while (0) + +#else +#define _enter(FMT, ...) _dbprintk("==> %s("FMT")", __func__, ##__VA_ARGS__) +#define _leave(FMT, ...) _dbprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__) +#define _debug(FMT, ...) _dbprintk(FMT, ##__VA_ARGS__) +#endif + +#if 1 /* defined(__KDEBUGALL) */ + +#define ASSERT(X) \ +do { \ + if (unlikely(!(X))) { \ + printk(KERN_ERR "\n"); \ + printk(KERN_ERR "CacheFiles: Assertion failed\n"); \ + BUG(); \ + } \ +} while (0) + +#define ASSERTCMP(X, OP, Y) \ +do { \ + if (unlikely(!((X) OP (Y)))) { \ + printk(KERN_ERR "\n"); \ + printk(KERN_ERR "CacheFiles: Assertion failed\n"); \ + printk(KERN_ERR "%lx " #OP " %lx is false\n", \ + (unsigned long)(X), (unsigned long)(Y)); \ + BUG(); \ + } \ +} while (0) + +#define ASSERTIF(C, X) \ +do { \ + if (unlikely((C) && !(X))) { \ + printk(KERN_ERR "\n"); \ + printk(KERN_ERR "CacheFiles: Assertion failed\n"); \ + BUG(); \ + } \ +} while (0) + +#define ASSERTIFCMP(C, X, OP, Y) \ +do { \ + if (unlikely((C) && !((X) OP (Y)))) { \ + printk(KERN_ERR "\n"); \ + printk(KERN_ERR "CacheFiles: Assertion failed\n"); \ + printk(KERN_ERR "%lx " #OP " %lx is false\n", \ + (unsigned long)(X), (unsigned long)(Y)); \ + BUG(); \ + } \ +} while (0) + +#else + +#define ASSERT(X) do {} while (0) +#define ASSERTCMP(X, OP, Y) do {} while (0) +#define ASSERTIF(C, X) do {} while (0) +#define ASSERTIFCMP(C, X, OP, Y) do {} while (0) + +#endif diff --git a/fs/cachefiles/key.c b/fs/cachefiles/key.c new file mode 100644 index 000000000000..81b8b2b3a674 --- /dev/null +++ b/fs/cachefiles/key.c @@ -0,0 +1,159 @@ +/* Key to pathname encoder + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include <linux/slab.h> +#include "internal.h" + +static const char cachefiles_charmap[64] = + "0123456789" /* 0 - 9 */ + "abcdefghijklmnopqrstuvwxyz" /* 10 - 35 */ + "ABCDEFGHIJKLMNOPQRSTUVWXYZ" /* 36 - 61 */ + "_-" /* 62 - 63 */ + ; + +static const char cachefiles_filecharmap[256] = { + /* we skip space and tab and control chars */ + [33 ... 46] = 1, /* '!' -> '.' */ + /* we skip '/' as it's significant to pathwalk */ + [48 ... 127] = 1, /* '0' -> '~' */ +}; + +/* + * turn the raw key into something cooked + * - the raw key should include the length in the two bytes at the front + * - the key may be up to 514 bytes in length (including the length word) + * - "base64" encode the strange keys, mapping 3 bytes of raw to four of + * cooked + * - need to cut the cooked key into 252 char lengths (189 raw bytes) + */ +char *cachefiles_cook_key(const u8 *raw, int keylen, uint8_t type) +{ + unsigned char csum, ch; + unsigned int acc; + char *key; + int loop, len, max, seg, mark, print; + + _enter(",%d", keylen); + + BUG_ON(keylen < 2 || keylen > 514); + + csum = raw[0] + raw[1]; + print = 1; + for (loop = 2; loop < keylen; loop++) { + ch = raw[loop]; + csum += ch; + print &= cachefiles_filecharmap[ch]; + } + + if (print) { + /* if the path is usable ASCII, then we render it directly */ + max = keylen - 2; + max += 2; /* two base64'd length chars on the front */ + max += 5; /* @checksum/M */ + max += 3 * 2; /* maximum number of segment dividers (".../M") + * is ((514 + 251) / 252) = 3 + */ + max += 1; /* NUL on end */ + } else { + /* calculate the maximum length of the cooked key */ + keylen = (keylen + 2) / 3; + + max = keylen * 4; + max += 5; /* @checksum/M */ + max += 3 * 2; /* maximum number of segment dividers (".../M") + * is ((514 + 188) / 189) = 3 + */ + max += 1; /* NUL on end */ + } + + max += 1; /* 2nd NUL on end */ + + _debug("max: %d", max); + + key = kmalloc(max, GFP_KERNEL); + if (!key) + return NULL; + + len = 0; + + /* build the cooked key */ + sprintf(key, "@%02x%c+", (unsigned) csum, 0); + len = 5; + mark = len - 1; + + if (print) { + acc = *(uint16_t *) raw; + raw += 2; + + key[len + 1] = cachefiles_charmap[acc & 63]; + acc >>= 6; + key[len] = cachefiles_charmap[acc & 63]; + len += 2; + + seg = 250; + for (loop = keylen; loop > 0; loop--) { + if (seg <= 0) { + key[len++] = '\0'; + mark = len; + key[len++] = '+'; + seg = 252; + } + + key[len++] = *raw++; + ASSERT(len < max); + } + + switch (type) { + case FSCACHE_COOKIE_TYPE_INDEX: type = 'I'; break; + case FSCACHE_COOKIE_TYPE_DATAFILE: type = 'D'; break; + default: type = 'S'; break; + } + } else { + seg = 252; + for (loop = keylen; loop > 0; loop--) { + if (seg <= 0) { + key[len++] = '\0'; + mark = len; + key[len++] = '+'; + seg = 252; + } + + acc = *raw++; + acc |= *raw++ << 8; + acc |= *raw++ << 16; + + _debug("acc: %06x", acc); + + key[len++] = cachefiles_charmap[acc & 63]; + acc >>= 6; + key[len++] = cachefiles_charmap[acc & 63]; + acc >>= 6; + key[len++] = cachefiles_charmap[acc & 63]; + acc >>= 6; + key[len++] = cachefiles_charmap[acc & 63]; + + ASSERT(len < max); + } + + switch (type) { + case FSCACHE_COOKIE_TYPE_INDEX: type = 'J'; break; + case FSCACHE_COOKIE_TYPE_DATAFILE: type = 'E'; break; + default: type = 'T'; break; + } + } + + key[mark] = type; + key[len++] = 0; + key[len] = 0; + + _leave(" = %p %d", key, len); + return key; +} diff --git a/fs/cachefiles/main.c b/fs/cachefiles/main.c new file mode 100644 index 000000000000..4bfa8cf43bf5 --- /dev/null +++ b/fs/cachefiles/main.c @@ -0,0 +1,106 @@ +/* Network filesystem caching backend to use cache files on a premounted + * filesystem + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/sched.h> +#include <linux/completion.h> +#include <linux/slab.h> +#include <linux/fs.h> +#include <linux/file.h> +#include <linux/namei.h> +#include <linux/mount.h> +#include <linux/statfs.h> +#include <linux/sysctl.h> +#include <linux/miscdevice.h> +#include "internal.h" + +unsigned cachefiles_debug; +module_param_named(debug, cachefiles_debug, uint, S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(cachefiles_debug, "CacheFiles debugging mask"); + +MODULE_DESCRIPTION("Mounted-filesystem based cache"); +MODULE_AUTHOR("Red Hat, Inc."); +MODULE_LICENSE("GPL"); + +struct kmem_cache *cachefiles_object_jar; + +static struct miscdevice cachefiles_dev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "cachefiles", + .fops = &cachefiles_daemon_fops, +}; + +static void cachefiles_object_init_once(void *_object) +{ + struct cachefiles_object *object = _object; + + memset(object, 0, sizeof(*object)); + spin_lock_init(&object->work_lock); +} + +/* + * initialise the fs caching module + */ +static int __init cachefiles_init(void) +{ + int ret; + + ret = misc_register(&cachefiles_dev); + if (ret < 0) + goto error_dev; + + /* create an object jar */ + ret = -ENOMEM; + cachefiles_object_jar = + kmem_cache_create("cachefiles_object_jar", + sizeof(struct cachefiles_object), + 0, + SLAB_HWCACHE_ALIGN, + cachefiles_object_init_once); + if (!cachefiles_object_jar) { + printk(KERN_NOTICE + "CacheFiles: Failed to allocate an object jar\n"); + goto error_object_jar; + } + + ret = cachefiles_proc_init(); + if (ret < 0) + goto error_proc; + + printk(KERN_INFO "CacheFiles: Loaded\n"); + return 0; + +error_proc: + kmem_cache_destroy(cachefiles_object_jar); +error_object_jar: + misc_deregister(&cachefiles_dev); +error_dev: + kerror("failed to register: %d", ret); + return ret; +} + +fs_initcall(cachefiles_init); + +/* + * clean up on module removal + */ +static void __exit cachefiles_exit(void) +{ + printk(KERN_INFO "CacheFiles: Unloading\n"); + + cachefiles_proc_cleanup(); + kmem_cache_destroy(cachefiles_object_jar); + misc_deregister(&cachefiles_dev); +} + +module_exit(cachefiles_exit); diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c new file mode 100644 index 000000000000..4ce818ae39ea --- /dev/null +++ b/fs/cachefiles/namei.c @@ -0,0 +1,771 @@ +/* CacheFiles path walking and related routines + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/file.h> +#include <linux/fs.h> +#include <linux/fsnotify.h> +#include <linux/quotaops.h> +#include <linux/xattr.h> +#include <linux/mount.h> +#include <linux/namei.h> +#include <linux/security.h> +#include "internal.h" + +static int cachefiles_wait_bit(void *flags) +{ + schedule(); + return 0; +} + +/* + * record the fact that an object is now active + */ +static void cachefiles_mark_object_active(struct cachefiles_cache *cache, + struct cachefiles_object *object) +{ + struct cachefiles_object *xobject; + struct rb_node **_p, *_parent = NULL; + struct dentry *dentry; + + _enter(",%p", object); + +try_again: + write_lock(&cache->active_lock); + + if (test_and_set_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags)) + BUG(); + + dentry = object->dentry; + _p = &cache->active_nodes.rb_node; + while (*_p) { + _parent = *_p; + xobject = rb_entry(_parent, + struct cachefiles_object, active_node); + + ASSERT(xobject != object); + + if (xobject->dentry > dentry) + _p = &(*_p)->rb_left; + else if (xobject->dentry < dentry) + _p = &(*_p)->rb_right; + else + goto wait_for_old_object; + } + + rb_link_node(&object->active_node, _parent, _p); + rb_insert_color(&object->active_node, &cache->active_nodes); + + write_unlock(&cache->active_lock); + _leave(""); + return; + + /* an old object from a previous incarnation is hogging the slot - we + * need to wait for it to be destroyed */ +wait_for_old_object: + if (xobject->fscache.state < FSCACHE_OBJECT_DYING) { + printk(KERN_ERR "\n"); + printk(KERN_ERR "CacheFiles: Error:" + " Unexpected object collision\n"); + printk(KERN_ERR "xobject: OBJ%x\n", + xobject->fscache.debug_id); + printk(KERN_ERR "xobjstate=%s\n", + fscache_object_states[xobject->fscache.state]); + printk(KERN_ERR "xobjflags=%lx\n", xobject->fscache.flags); + printk(KERN_ERR "xobjevent=%lx [%lx]\n", + xobject->fscache.events, xobject->fscache.event_mask); + printk(KERN_ERR "xops=%u inp=%u exc=%u\n", + xobject->fscache.n_ops, xobject->fscache.n_in_progress, + xobject->fscache.n_exclusive); + printk(KERN_ERR "xcookie=%p [pr=%p nd=%p fl=%lx]\n", + xobject->fscache.cookie, + xobject->fscache.cookie->parent, + xobject->fscache.cookie->netfs_data, + xobject->fscache.cookie->flags); + printk(KERN_ERR "xparent=%p\n", + xobject->fscache.parent); + printk(KERN_ERR "object: OBJ%x\n", + object->fscache.debug_id); + printk(KERN_ERR "cookie=%p [pr=%p nd=%p fl=%lx]\n", + object->fscache.cookie, + object->fscache.cookie->parent, + object->fscache.cookie->netfs_data, + object->fscache.cookie->flags); + printk(KERN_ERR "parent=%p\n", + object->fscache.parent); + BUG(); + } + atomic_inc(&xobject->usage); + write_unlock(&cache->active_lock); + + _debug(">>> wait"); + wait_on_bit(&xobject->flags, CACHEFILES_OBJECT_ACTIVE, + cachefiles_wait_bit, TASK_UNINTERRUPTIBLE); + _debug("<<< waited"); + + cache->cache.ops->put_object(&xobject->fscache); + goto try_again; +} + +/* + * delete an object representation from the cache + * - file backed objects are unlinked + * - directory backed objects are stuffed into the graveyard for userspace to + * delete + * - unlocks the directory mutex + */ +static int cachefiles_bury_object(struct cachefiles_cache *cache, + struct dentry *dir, + struct dentry *rep) +{ + struct dentry *grave, *trap; + char nbuffer[8 + 8 + 1]; + int ret; + + _enter(",'%*.*s','%*.*s'", + dir->d_name.len, dir->d_name.len, dir->d_name.name, + rep->d_name.len, rep->d_name.len, rep->d_name.name); + + /* non-directories can just be unlinked */ + if (!S_ISDIR(rep->d_inode->i_mode)) { + _debug("unlink stale object"); + ret = vfs_unlink(dir->d_inode, rep); + + mutex_unlock(&dir->d_inode->i_mutex); + + if (ret == -EIO) + cachefiles_io_error(cache, "Unlink failed"); + + _leave(" = %d", ret); + return ret; + } + + /* directories have to be moved to the graveyard */ + _debug("move stale object to graveyard"); + mutex_unlock(&dir->d_inode->i_mutex); + +try_again: + /* first step is to make up a grave dentry in the graveyard */ + sprintf(nbuffer, "%08x%08x", + (uint32_t) get_seconds(), + (uint32_t) atomic_inc_return(&cache->gravecounter)); + + /* do the multiway lock magic */ + trap = lock_rename(cache->graveyard, dir); + + /* do some checks before getting the grave dentry */ + if (rep->d_parent != dir) { + /* the entry was probably culled when we dropped the parent dir + * lock */ + unlock_rename(cache->graveyard, dir); + _leave(" = 0 [culled?]"); + return 0; + } + + if (!S_ISDIR(cache->graveyard->d_inode->i_mode)) { + unlock_rename(cache->graveyard, dir); + cachefiles_io_error(cache, "Graveyard no longer a directory"); + return -EIO; + } + + if (trap == rep) { + unlock_rename(cache->graveyard, dir); + cachefiles_io_error(cache, "May not make directory loop"); + return -EIO; + } + + if (d_mountpoint(rep)) { + unlock_rename(cache->graveyard, dir); + cachefiles_io_error(cache, "Mountpoint in cache"); + return -EIO; + } + + grave = lookup_one_len(nbuffer, cache->graveyard, strlen(nbuffer)); + if (IS_ERR(grave)) { + unlock_rename(cache->graveyard, dir); + + if (PTR_ERR(grave) == -ENOMEM) { + _leave(" = -ENOMEM"); + return -ENOMEM; + } + + cachefiles_io_error(cache, "Lookup error %ld", + PTR_ERR(grave)); + return -EIO; + } + + if (grave->d_inode) { + unlock_rename(cache->graveyard, dir); + dput(grave); + grave = NULL; + cond_resched(); + goto try_again; + } + + if (d_mountpoint(grave)) { + unlock_rename(cache->graveyard, dir); + dput(grave); + cachefiles_io_error(cache, "Mountpoint in graveyard"); + return -EIO; + } + + /* target should not be an ancestor of source */ + if (trap == grave) { + unlock_rename(cache->graveyard, dir); + dput(grave); + cachefiles_io_error(cache, "May not make directory loop"); + return -EIO; + } + + /* attempt the rename */ + ret = vfs_rename(dir->d_inode, rep, cache->graveyard->d_inode, grave); + if (ret != 0 && ret != -ENOMEM) + cachefiles_io_error(cache, "Rename failed with error %d", ret); + + unlock_rename(cache->graveyard, dir); + dput(grave); + _leave(" = 0"); + return 0; +} + +/* + * delete an object representation from the cache + */ +int cachefiles_delete_object(struct cachefiles_cache *cache, + struct cachefiles_object *object) +{ + struct dentry *dir; + int ret; + + _enter(",{%p}", object->dentry); + + ASSERT(object->dentry); + ASSERT(object->dentry->d_inode); + ASSERT(object->dentry->d_parent); + + dir = dget_parent(object->dentry); + + mutex_lock(&dir->d_inode->i_mutex); + ret = cachefiles_bury_object(cache, dir, object->dentry); + + dput(dir); + _leave(" = %d", ret); + return ret; +} + +/* + * walk from the parent object to the child object through the backing + * filesystem, creating directories as we go + */ +int cachefiles_walk_to_object(struct cachefiles_object *parent, + struct cachefiles_object *object, + const char *key, + struct cachefiles_xattr *auxdata) +{ + struct cachefiles_cache *cache; + struct dentry *dir, *next = NULL; + unsigned long start; + const char *name; + int ret, nlen; + + _enter("{%p},,%s,", parent->dentry, key); + + cache = container_of(parent->fscache.cache, + struct cachefiles_cache, cache); + + ASSERT(parent->dentry); + ASSERT(parent->dentry->d_inode); + + if (!(S_ISDIR(parent->dentry->d_inode->i_mode))) { + // TODO: convert file to dir + _leave("looking up in none directory"); + return -ENOBUFS; + } + + dir = dget(parent->dentry); + +advance: + /* attempt to transit the first directory component */ + name = key; + nlen = strlen(key); + + /* key ends in a double NUL */ + key = key + nlen + 1; + if (!*key) + key = NULL; + +lookup_again: + /* search the current directory for the element name */ + _debug("lookup '%s'", name); + + mutex_lock(&dir->d_inode->i_mutex); + + start = jiffies; + next = lookup_one_len(name, dir, nlen); + cachefiles_hist(cachefiles_lookup_histogram, start); + if (IS_ERR(next)) + goto lookup_error; + + _debug("next -> %p %s", next, next->d_inode ? "positive" : "negative"); + + if (!key) + object->new = !next->d_inode; + + /* if this element of the path doesn't exist, then the lookup phase + * failed, and we can release any readers in the certain knowledge that + * there's nothing for them to actually read */ + if (!next->d_inode) + fscache_object_lookup_negative(&object->fscache); + + /* we need to create the object if it's negative */ + if (key || object->type == FSCACHE_COOKIE_TYPE_INDEX) { + /* index objects and intervening tree levels must be subdirs */ + if (!next->d_inode) { + ret = cachefiles_has_space(cache, 1, 0); + if (ret < 0) + goto create_error; + + start = jiffies; + ret = vfs_mkdir(dir->d_inode, next, 0); + cachefiles_hist(cachefiles_mkdir_histogram, start); + if (ret < 0) + goto create_error; + + ASSERT(next->d_inode); + + _debug("mkdir -> %p{%p{ino=%lu}}", + next, next->d_inode, next->d_inode->i_ino); + + } else if (!S_ISDIR(next->d_inode->i_mode)) { + kerror("inode %lu is not a directory", + next->d_inode->i_ino); + ret = -ENOBUFS; + goto error; + } + + } else { + /* non-index objects start out life as files */ + if (!next->d_inode) { + ret = cachefiles_has_space(cache, 1, 0); + if (ret < 0) + goto create_error; + + start = jiffies; + ret = vfs_create(dir->d_inode, next, S_IFREG, NULL); + cachefiles_hist(cachefiles_create_histogram, start); + if (ret < 0) + goto create_error; + + ASSERT(next->d_inode); + + _debug("create -> %p{%p{ino=%lu}}", + next, next->d_inode, next->d_inode->i_ino); + + } else if (!S_ISDIR(next->d_inode->i_mode) && + !S_ISREG(next->d_inode->i_mode) + ) { + kerror("inode %lu is not a file or directory", + next->d_inode->i_ino); + ret = -ENOBUFS; + goto error; + } + } + + /* process the next component */ + if (key) { + _debug("advance"); + mutex_unlock(&dir->d_inode->i_mutex); + dput(dir); + dir = next; + next = NULL; + goto advance; + } + + /* we've found the object we were looking for */ + object->dentry = next; + + /* if we've found that the terminal object exists, then we need to + * check its attributes and delete it if it's out of date */ + if (!object->new) { + _debug("validate '%*.*s'", + next->d_name.len, next->d_name.len, next->d_name.name); + + ret = cachefiles_check_object_xattr(object, auxdata); + if (ret == -ESTALE) { + /* delete the object (the deleter drops the directory + * mutex) */ + object->dentry = NULL; + + ret = cachefiles_bury_object(cache, dir, next); + dput(next); + next = NULL; + + if (ret < 0) + goto delete_error; + + _debug("redo lookup"); + goto lookup_again; + } + } + + /* note that we're now using this object */ + cachefiles_mark_object_active(cache, object); + + mutex_unlock(&dir->d_inode->i_mutex); + dput(dir); + dir = NULL; + + _debug("=== OBTAINED_OBJECT ==="); + + if (object->new) { + /* attach data to a newly constructed terminal object */ + ret = cachefiles_set_object_xattr(object, auxdata); + if (ret < 0) + goto check_error; + } else { + /* always update the atime on an object we've just looked up + * (this is used to keep track of culling, and atimes are only + * updated by read, write and readdir but not lookup or + * open) */ + touch_atime(cache->mnt, next); + } + + /* open a file interface onto a data file */ + if (object->type != FSCACHE_COOKIE_TYPE_INDEX) { + if (S_ISREG(object->dentry->d_inode->i_mode)) { + const struct address_space_operations *aops; + + ret = -EPERM; + aops = object->dentry->d_inode->i_mapping->a_ops; + if (!aops->bmap) + goto check_error; + + object->backer = object->dentry; + } else { + BUG(); // TODO: open file in data-class subdir + } + } + + object->new = 0; + fscache_obtained_object(&object->fscache); + + _leave(" = 0 [%lu]", object->dentry->d_inode->i_ino); + return 0; + +create_error: + _debug("create error %d", ret); + if (ret == -EIO) + cachefiles_io_error(cache, "Create/mkdir failed"); + goto error; + +check_error: + _debug("check error %d", ret); + write_lock(&cache->active_lock); + rb_erase(&object->active_node, &cache->active_nodes); + clear_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags); + wake_up_bit(&object->flags, CACHEFILES_OBJECT_ACTIVE); + write_unlock(&cache->active_lock); + + dput(object->dentry); + object->dentry = NULL; + goto error_out; + +delete_error: + _debug("delete error %d", ret); + goto error_out2; + +lookup_error: + _debug("lookup error %ld", PTR_ERR(next)); + ret = PTR_ERR(next); + if (ret == -EIO) + cachefiles_io_error(cache, "Lookup failed"); + next = NULL; +error: + mutex_unlock(&dir->d_inode->i_mutex); + dput(next); +error_out2: + dput(dir); +error_out: + if (ret == -ENOSPC) + ret = -ENOBUFS; + + _leave(" = error %d", -ret); + return ret; +} + +/* + * get a subdirectory + */ +struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache, + struct dentry *dir, + const char *dirname) +{ + struct dentry *subdir; + unsigned long start; + int ret; + + _enter(",,%s", dirname); + + /* search the current directory for the element name */ + mutex_lock(&dir->d_inode->i_mutex); + + start = jiffies; + subdir = lookup_one_len(dirname, dir, strlen(dirname)); + cachefiles_hist(cachefiles_lookup_histogram, start); + if (IS_ERR(subdir)) { + if (PTR_ERR(subdir) == -ENOMEM) + goto nomem_d_alloc; + goto lookup_error; + } + + _debug("subdir -> %p %s", + subdir, subdir->d_inode ? "positive" : "negative"); + + /* we need to create the subdir if it doesn't exist yet */ + if (!subdir->d_inode) { + ret = cachefiles_has_space(cache, 1, 0); + if (ret < 0) + goto mkdir_error; + + _debug("attempt mkdir"); + + ret = vfs_mkdir(dir->d_inode, subdir, 0700); + if (ret < 0) + goto mkdir_error; + + ASSERT(subdir->d_inode); + + _debug("mkdir -> %p{%p{ino=%lu}}", + subdir, + subdir->d_inode, + subdir->d_inode->i_ino); + } + + mutex_unlock(&dir->d_inode->i_mutex); + + /* we need to make sure the subdir is a directory */ + ASSERT(subdir->d_inode); + + if (!S_ISDIR(subdir->d_inode->i_mode)) { + kerror("%s is not a directory", dirname); + ret = -EIO; + goto check_error; + } + + ret = -EPERM; + if (!subdir->d_inode->i_op || + !subdir->d_inode->i_op->setxattr || + !subdir->d_inode->i_op->getxattr || + !subdir->d_inode->i_op->lookup || + !subdir->d_inode->i_op->mkdir || + !subdir->d_inode->i_op->create || + !subdir->d_inode->i_op->rename || + !subdir->d_inode->i_op->rmdir || + !subdir->d_inode->i_op->unlink) + goto check_error; + + _leave(" = [%lu]", subdir->d_inode->i_ino); + return subdir; + +check_error: + dput(subdir); + _leave(" = %d [check]", ret); + return ERR_PTR(ret); + +mkdir_error: + mutex_unlock(&dir->d_inode->i_mutex); + dput(subdir); + kerror("mkdir %s failed with error %d", dirname, ret); + return ERR_PTR(ret); + +lookup_error: + mutex_unlock(&dir->d_inode->i_mutex); + ret = PTR_ERR(subdir); + kerror("Lookup %s failed with error %d", dirname, ret); + return ERR_PTR(ret); + +nomem_d_alloc: + mutex_unlock(&dir->d_inode->i_mutex); + _leave(" = -ENOMEM"); + return ERR_PTR(-ENOMEM); +} + +/* + * find out if an object is in use or not + * - if finds object and it's not in use: + * - returns a pointer to the object and a reference on it + * - returns with the directory locked + */ +static struct dentry *cachefiles_check_active(struct cachefiles_cache *cache, + struct dentry *dir, + char *filename) +{ + struct cachefiles_object *object; + struct rb_node *_n; + struct dentry *victim; + unsigned long start; + int ret; + + //_enter(",%*.*s/,%s", + // dir->d_name.len, dir->d_name.len, dir->d_name.name, filename); + + /* look up the victim */ + mutex_lock_nested(&dir->d_inode->i_mutex, 1); + + start = jiffies; + victim = lookup_one_len(filename, dir, strlen(filename)); + cachefiles_hist(cachefiles_lookup_histogram, start); + if (IS_ERR(victim)) + goto lookup_error; + + //_debug("victim -> %p %s", + // victim, victim->d_inode ? "positive" : "negative"); + + /* if the object is no longer there then we probably retired the object + * at the netfs's request whilst the cull was in progress + */ + if (!victim->d_inode) { + mutex_unlock(&dir->d_inode->i_mutex); + dput(victim); + _leave(" = -ENOENT [absent]"); + return ERR_PTR(-ENOENT); + } + + /* check to see if we're using this object */ + read_lock(&cache->active_lock); + + _n = cache->active_nodes.rb_node; + + while (_n) { + object = rb_entry(_n, struct cachefiles_object, active_node); + + if (object->dentry > victim) + _n = _n->rb_left; + else if (object->dentry < victim) + _n = _n->rb_right; + else + goto object_in_use; + } + + read_unlock(&cache->active_lock); + + //_leave(" = %p", victim); + return victim; + +object_in_use: + read_unlock(&cache->active_lock); + mutex_unlock(&dir->d_inode->i_mutex); + dput(victim); + //_leave(" = -EBUSY [in use]"); + return ERR_PTR(-EBUSY); + +lookup_error: + mutex_unlock(&dir->d_inode->i_mutex); + ret = PTR_ERR(victim); + if (ret == -ENOENT) { + /* file or dir now absent - probably retired by netfs */ + _leave(" = -ESTALE [absent]"); + return ERR_PTR(-ESTALE); + } + + if (ret == -EIO) { + cachefiles_io_error(cache, "Lookup failed"); + } else if (ret != -ENOMEM) { + kerror("Internal error: %d", ret); + ret = -EIO; + } + + _leave(" = %d", ret); + return ERR_PTR(ret); +} + +/* + * cull an object if it's not in use + * - called only by cache manager daemon + */ +int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir, + char *filename) +{ + struct dentry *victim; + int ret; + + _enter(",%*.*s/,%s", + dir->d_name.len, dir->d_name.len, dir->d_name.name, filename); + + victim = cachefiles_check_active(cache, dir, filename); + if (IS_ERR(victim)) + return PTR_ERR(victim); + + _debug("victim -> %p %s", + victim, victim->d_inode ? "positive" : "negative"); + + /* okay... the victim is not being used so we can cull it + * - start by marking it as stale + */ + _debug("victim is cullable"); + + ret = cachefiles_remove_object_xattr(cache, victim); + if (ret < 0) + goto error_unlock; + + /* actually remove the victim (drops the dir mutex) */ + _debug("bury"); + + ret = cachefiles_bury_object(cache, dir, victim); + if (ret < 0) + goto error; + + dput(victim); + _leave(" = 0"); + return 0; + +error_unlock: + mutex_unlock(&dir->d_inode->i_mutex); +error: + dput(victim); + if (ret == -ENOENT) { + /* file or dir now absent - probably retired by netfs */ + _leave(" = -ESTALE [absent]"); + return -ESTALE; + } + + if (ret != -ENOMEM) { + kerror("Internal error: %d", ret); + ret = -EIO; + } + + _leave(" = %d", ret); + return ret; +} + +/* + * find out if an object is in use or not + * - called only by cache manager daemon + * - returns -EBUSY or 0 to indicate whether an object is in use or not + */ +int cachefiles_check_in_use(struct cachefiles_cache *cache, struct dentry *dir, + char *filename) +{ + struct dentry *victim; + + //_enter(",%*.*s/,%s", + // dir->d_name.len, dir->d_name.len, dir->d_name.name, filename); + + victim = cachefiles_check_active(cache, dir, filename); + if (IS_ERR(victim)) + return PTR_ERR(victim); + + mutex_unlock(&dir->d_inode->i_mutex); + dput(victim); + //_leave(" = 0"); + return 0; +} diff --git a/fs/cachefiles/proc.c b/fs/cachefiles/proc.c new file mode 100644 index 000000000000..eccd33941199 --- /dev/null +++ b/fs/cachefiles/proc.c @@ -0,0 +1,134 @@ +/* CacheFiles statistics + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include <linux/module.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include "internal.h" + +atomic_t cachefiles_lookup_histogram[HZ]; +atomic_t cachefiles_mkdir_histogram[HZ]; +atomic_t cachefiles_create_histogram[HZ]; + +/* + * display the latency histogram + */ +static int cachefiles_histogram_show(struct seq_file *m, void *v) +{ + unsigned long index; + unsigned x, y, z, t; + + switch ((unsigned long) v) { + case 1: + seq_puts(m, "JIFS SECS LOOKUPS MKDIRS CREATES\n"); + return 0; + case 2: + seq_puts(m, "===== ===== ========= ========= =========\n"); + return 0; + default: + index = (unsigned long) v - 3; + x = atomic_read(&cachefiles_lookup_histogram[index]); + y = atomic_read(&cachefiles_mkdir_histogram[index]); + z = atomic_read(&cachefiles_create_histogram[index]); + if (x == 0 && y == 0 && z == 0) + return 0; + + t = (index * 1000) / HZ; + + seq_printf(m, "%4lu 0.%03u %9u %9u %9u\n", index, t, x, y, z); + return 0; + } +} + +/* + * set up the iterator to start reading from the first line + */ +static void *cachefiles_histogram_start(struct seq_file *m, loff_t *_pos) +{ + if ((unsigned long long)*_pos >= HZ + 2) + return NULL; + if (*_pos == 0) + *_pos = 1; + return (void *)(unsigned long) *_pos; +} + +/* + * move to the next line + */ +static void *cachefiles_histogram_next(struct seq_file *m, void *v, loff_t *pos) +{ + (*pos)++; + return (unsigned long long)*pos > HZ + 2 ? + NULL : (void *)(unsigned long) *pos; +} + +/* + * clean up after reading + */ +static void cachefiles_histogram_stop(struct seq_file *m, void *v) +{ +} + +static const struct seq_operations cachefiles_histogram_ops = { + .start = cachefiles_histogram_start, + .stop = cachefiles_histogram_stop, + .next = cachefiles_histogram_next, + .show = cachefiles_histogram_show, +}; + +/* + * open "/proc/fs/cachefiles/XXX" which provide statistics summaries + */ +static int cachefiles_histogram_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &cachefiles_histogram_ops); +} + +static const struct file_operations cachefiles_histogram_fops = { + .owner = THIS_MODULE, + .open = cachefiles_histogram_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +/* + * initialise the /proc/fs/cachefiles/ directory + */ +int __init cachefiles_proc_init(void) +{ + _enter(""); + + if (!proc_mkdir("fs/cachefiles", NULL)) + goto error_dir; + + if (!proc_create("fs/cachefiles/histogram", S_IFREG | 0444, NULL, + &cachefiles_histogram_fops)) + goto error_histogram; + + _leave(" = 0"); + return 0; + +error_histogram: + remove_proc_entry("fs/cachefiles", NULL); +error_dir: + _leave(" = -ENOMEM"); + return -ENOMEM; +} + +/* + * clean up the /proc/fs/cachefiles/ directory + */ +void cachefiles_proc_cleanup(void) +{ + remove_proc_entry("fs/cachefiles/histogram", NULL); + remove_proc_entry("fs/cachefiles", NULL); +} diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c new file mode 100644 index 000000000000..a69787e7dd96 --- /dev/null +++ b/fs/cachefiles/rdwr.c @@ -0,0 +1,879 @@ +/* Storage object read/write + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include <linux/mount.h> +#include <linux/file.h> +#include "internal.h" + +/* + * detect wake up events generated by the unlocking of pages in which we're + * interested + * - we use this to detect read completion of backing pages + * - the caller holds the waitqueue lock + */ +static int cachefiles_read_waiter(wait_queue_t *wait, unsigned mode, + int sync, void *_key) +{ + struct cachefiles_one_read *monitor = + container_of(wait, struct cachefiles_one_read, monitor); + struct cachefiles_object *object; + struct wait_bit_key *key = _key; + struct page *page = wait->private; + + ASSERT(key); + + _enter("{%lu},%u,%d,{%p,%u}", + monitor->netfs_page->index, mode, sync, + key->flags, key->bit_nr); + + if (key->flags != &page->flags || + key->bit_nr != PG_locked) + return 0; + + _debug("--- monitor %p %lx ---", page, page->flags); + + if (!PageUptodate(page) && !PageError(page)) + dump_stack(); + + /* remove from the waitqueue */ + list_del(&wait->task_list); + + /* move onto the action list and queue for FS-Cache thread pool */ + ASSERT(monitor->op); + + object = container_of(monitor->op->op.object, + struct cachefiles_object, fscache); + + spin_lock(&object->work_lock); + list_add_tail(&monitor->op_link, &monitor->op->to_do); + spin_unlock(&object->work_lock); + + fscache_enqueue_retrieval(monitor->op); + return 0; +} + +/* + * copy data from backing pages to netfs pages to complete a read operation + * - driven by FS-Cache's thread pool + */ +static void cachefiles_read_copier(struct fscache_operation *_op) +{ + struct cachefiles_one_read *monitor; + struct cachefiles_object *object; + struct fscache_retrieval *op; + struct pagevec pagevec; + int error, max; + + op = container_of(_op, struct fscache_retrieval, op); + object = container_of(op->op.object, + struct cachefiles_object, fscache); + + _enter("{ino=%lu}", object->backer->d_inode->i_ino); + + pagevec_init(&pagevec, 0); + + max = 8; + spin_lock_irq(&object->work_lock); + + while (!list_empty(&op->to_do)) { + monitor = list_entry(op->to_do.next, + struct cachefiles_one_read, op_link); + list_del(&monitor->op_link); + + spin_unlock_irq(&object->work_lock); + + _debug("- copy {%lu}", monitor->back_page->index); + + error = -EIO; + if (PageUptodate(monitor->back_page)) { + copy_highpage(monitor->netfs_page, monitor->back_page); + + pagevec_add(&pagevec, monitor->netfs_page); + fscache_mark_pages_cached(monitor->op, &pagevec); + error = 0; + } + + if (error) + cachefiles_io_error_obj( + object, + "Readpage failed on backing file %lx", + (unsigned long) monitor->back_page->flags); + + page_cache_release(monitor->back_page); + + fscache_end_io(op, monitor->netfs_page, error); + page_cache_release(monitor->netfs_page); + fscache_put_retrieval(op); + kfree(monitor); + + /* let the thread pool have some air occasionally */ + max--; + if (max < 0 || need_resched()) { + if (!list_empty(&op->to_do)) + fscache_enqueue_retrieval(op); + _leave(" [maxed out]"); + return; + } + + spin_lock_irq(&object->work_lock); + } + + spin_unlock_irq(&object->work_lock); + _leave(""); +} + +/* + * read the corresponding page to the given set from the backing file + * - an uncertain page is simply discarded, to be tried again another time + */ +static int cachefiles_read_backing_file_one(struct cachefiles_object *object, + struct fscache_retrieval *op, + struct page *netpage, + struct pagevec *pagevec) +{ + struct cachefiles_one_read *monitor; + struct address_space *bmapping; + struct page *newpage, *backpage; + int ret; + + _enter(""); + + pagevec_reinit(pagevec); + + _debug("read back %p{%lu,%d}", + netpage, netpage->index, page_count(netpage)); + + monitor = kzalloc(sizeof(*monitor), GFP_KERNEL); + if (!monitor) + goto nomem; + + monitor->netfs_page = netpage; + monitor->op = fscache_get_retrieval(op); + + init_waitqueue_func_entry(&monitor->monitor, cachefiles_read_waiter); + + /* attempt to get hold of the backing page */ + bmapping = object->backer->d_inode->i_mapping; + newpage = NULL; + + for (;;) { + backpage = find_get_page(bmapping, netpage->index); + if (backpage) + goto backing_page_already_present; + + if (!newpage) { + newpage = page_cache_alloc_cold(bmapping); + if (!newpage) + goto nomem_monitor; + } + + ret = add_to_page_cache(newpage, bmapping, + netpage->index, GFP_KERNEL); + if (ret == 0) + goto installed_new_backing_page; + if (ret != -EEXIST) + goto nomem_page; + } + + /* we've installed a new backing page, so now we need to add it + * to the LRU list and start it reading */ +installed_new_backing_page: + _debug("- new %p", newpage); + + backpage = newpage; + newpage = NULL; + + page_cache_get(backpage); + pagevec_add(pagevec, backpage); + __pagevec_lru_add_file(pagevec); + +read_backing_page: + ret = bmapping->a_ops->readpage(NULL, backpage); + if (ret < 0) + goto read_error; + + /* set the monitor to transfer the data across */ +monitor_backing_page: + _debug("- monitor add"); + + /* install the monitor */ + page_cache_get(monitor->netfs_page); + page_cache_get(backpage); + monitor->back_page = backpage; + monitor->monitor.private = backpage; + add_page_wait_queue(backpage, &monitor->monitor); + monitor = NULL; + + /* but the page may have been read before the monitor was installed, so + * the monitor may miss the event - so we have to ensure that we do get + * one in such a case */ + if (trylock_page(backpage)) { + _debug("jumpstart %p {%lx}", backpage, backpage->flags); + unlock_page(backpage); + } + goto success; + + /* if the backing page is already present, it can be in one of + * three states: read in progress, read failed or read okay */ +backing_page_already_present: + _debug("- present"); + + if (newpage) { + page_cache_release(newpage); + newpage = NULL; + } + + if (PageError(backpage)) + goto io_error; + + if (PageUptodate(backpage)) + goto backing_page_already_uptodate; + + if (!trylock_page(backpage)) + goto monitor_backing_page; + _debug("read %p {%lx}", backpage, backpage->flags); + goto read_backing_page; + + /* the backing page is already up to date, attach the netfs + * page to the pagecache and LRU and copy the data across */ +backing_page_already_uptodate: + _debug("- uptodate"); + + pagevec_add(pagevec, netpage); + fscache_mark_pages_cached(op, pagevec); + + copy_highpage(netpage, backpage); + fscache_end_io(op, netpage, 0); + +success: + _debug("success"); + ret = 0; + +out: + if (backpage) + page_cache_release(backpage); + if (monitor) { + fscache_put_retrieval(monitor->op); + kfree(monitor); + } + _leave(" = %d", ret); + return ret; + +read_error: + _debug("read error %d", ret); + if (ret == -ENOMEM) + goto out; +io_error: + cachefiles_io_error_obj(object, "Page read error on backing file"); + ret = -ENOBUFS; + goto out; + +nomem_page: + page_cache_release(newpage); +nomem_monitor: + fscache_put_retrieval(monitor->op); + kfree(monitor); +nomem: + _leave(" = -ENOMEM"); + return -ENOMEM; +} + +/* + * read a page from the cache or allocate a block in which to store it + * - cache withdrawal is prevented by the caller + * - returns -EINTR if interrupted + * - returns -ENOMEM if ran out of memory + * - returns -ENOBUFS if no buffers can be made available + * - returns -ENOBUFS if page is beyond EOF + * - if the page is backed by a block in the cache: + * - a read will be started which will call the callback on completion + * - 0 will be returned + * - else if the page is unbacked: + * - the metadata will be retained + * - -ENODATA will be returned + */ +int cachefiles_read_or_alloc_page(struct fscache_retrieval *op, + struct page *page, + gfp_t gfp) +{ + struct cachefiles_object *object; + struct cachefiles_cache *cache; + struct pagevec pagevec; + struct inode *inode; + sector_t block0, block; + unsigned shift; + int ret; + + object = container_of(op->op.object, + struct cachefiles_object, fscache); + cache = container_of(object->fscache.cache, + struct cachefiles_cache, cache); + + _enter("{%p},{%lx},,,", object, page->index); + + if (!object->backer) + return -ENOBUFS; + + inode = object->backer->d_inode; + ASSERT(S_ISREG(inode->i_mode)); + ASSERT(inode->i_mapping->a_ops->bmap); + ASSERT(inode->i_mapping->a_ops->readpages); + + /* calculate the shift required to use bmap */ + if (inode->i_sb->s_blocksize > PAGE_SIZE) + return -ENOBUFS; + + shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits; + + op->op.flags = FSCACHE_OP_FAST; + op->op.processor = cachefiles_read_copier; + + pagevec_init(&pagevec, 0); + + /* we assume the absence or presence of the first block is a good + * enough indication for the page as a whole + * - TODO: don't use bmap() for this as it is _not_ actually good + * enough for this as it doesn't indicate errors, but it's all we've + * got for the moment + */ + block0 = page->index; + block0 <<= shift; + + block = inode->i_mapping->a_ops->bmap(inode->i_mapping, block0); + _debug("%llx -> %llx", + (unsigned long long) block0, + (unsigned long long) block); + + if (block) { + /* submit the apparently valid page to the backing fs to be + * read from disk */ + ret = cachefiles_read_backing_file_one(object, op, page, + &pagevec); + } else if (cachefiles_has_space(cache, 0, 1) == 0) { + /* there's space in the cache we can use */ + pagevec_add(&pagevec, page); + fscache_mark_pages_cached(op, &pagevec); + ret = -ENODATA; + } else { + ret = -ENOBUFS; + } + + _leave(" = %d", ret); + return ret; +} + +/* + * read the corresponding pages to the given set from the backing file + * - any uncertain pages are simply discarded, to be tried again another time + */ +static int cachefiles_read_backing_file(struct cachefiles_object *object, + struct fscache_retrieval *op, + struct list_head *list, + struct pagevec *mark_pvec) +{ + struct cachefiles_one_read *monitor = NULL; + struct address_space *bmapping = object->backer->d_inode->i_mapping; + struct pagevec lru_pvec; + struct page *newpage = NULL, *netpage, *_n, *backpage = NULL; + int ret = 0; + + _enter(""); + + pagevec_init(&lru_pvec, 0); + + list_for_each_entry_safe(netpage, _n, list, lru) { + list_del(&netpage->lru); + + _debug("read back %p{%lu,%d}", + netpage, netpage->index, page_count(netpage)); + + if (!monitor) { + monitor = kzalloc(sizeof(*monitor), GFP_KERNEL); + if (!monitor) + goto nomem; + + monitor->op = fscache_get_retrieval(op); + init_waitqueue_func_entry(&monitor->monitor, + cachefiles_read_waiter); + } + + for (;;) { + backpage = find_get_page(bmapping, netpage->index); + if (backpage) + goto backing_page_already_present; + + if (!newpage) { + newpage = page_cache_alloc_cold(bmapping); + if (!newpage) + goto nomem; + } + + ret = add_to_page_cache(newpage, bmapping, + netpage->index, GFP_KERNEL); + if (ret == 0) + goto installed_new_backing_page; + if (ret != -EEXIST) + goto nomem; + } + + /* we've installed a new backing page, so now we need to add it + * to the LRU list and start it reading */ + installed_new_backing_page: + _debug("- new %p", newpage); + + backpage = newpage; + newpage = NULL; + + page_cache_get(backpage); + if (!pagevec_add(&lru_pvec, backpage)) + __pagevec_lru_add_file(&lru_pvec); + + reread_backing_page: + ret = bmapping->a_ops->readpage(NULL, backpage); + if (ret < 0) + goto read_error; + + /* add the netfs page to the pagecache and LRU, and set the + * monitor to transfer the data across */ + monitor_backing_page: + _debug("- monitor add"); + + ret = add_to_page_cache(netpage, op->mapping, netpage->index, + GFP_KERNEL); + if (ret < 0) { + if (ret == -EEXIST) { + page_cache_release(netpage); + continue; + } + goto nomem; + } + + page_cache_get(netpage); + if (!pagevec_add(&lru_pvec, netpage)) + __pagevec_lru_add_file(&lru_pvec); + + /* install a monitor */ + page_cache_get(netpage); + monitor->netfs_page = netpage; + + page_cache_get(backpage); + monitor->back_page = backpage; + monitor->monitor.private = backpage; + add_page_wait_queue(backpage, &monitor->monitor); + monitor = NULL; + + /* but the page may have been read before the monitor was + * installed, so the monitor may miss the event - so we have to + * ensure that we do get one in such a case */ + if (trylock_page(backpage)) { + _debug("2unlock %p {%lx}", backpage, backpage->flags); + unlock_page(backpage); + } + + page_cache_release(backpage); + backpage = NULL; + + page_cache_release(netpage); + netpage = NULL; + continue; + + /* if the backing page is already present, it can be in one of + * three states: read in progress, read failed or read okay */ + backing_page_already_present: + _debug("- present %p", backpage); + + if (PageError(backpage)) + goto io_error; + + if (PageUptodate(backpage)) + goto backing_page_already_uptodate; + + _debug("- not ready %p{%lx}", backpage, backpage->flags); + + if (!trylock_page(backpage)) + goto monitor_backing_page; + + if (PageError(backpage)) { + _debug("error %lx", backpage->flags); + unlock_page(backpage); + goto io_error; + } + + if (PageUptodate(backpage)) + goto backing_page_already_uptodate_unlock; + + /* we've locked a page that's neither up to date nor erroneous, + * so we need to attempt to read it again */ + goto reread_backing_page; + + /* the backing page is already up to date, attach the netfs + * page to the pagecache and LRU and copy the data across */ + backing_page_already_uptodate_unlock: + _debug("uptodate %lx", backpage->flags); + unlock_page(backpage); + backing_page_already_uptodate: + _debug("- uptodate"); + + ret = add_to_page_cache(netpage, op->mapping, netpage->index, + GFP_KERNEL); + if (ret < 0) { + if (ret == -EEXIST) { + page_cache_release(netpage); + continue; + } + goto nomem; + } + + copy_highpage(netpage, backpage); + + page_cache_release(backpage); + backpage = NULL; + + if (!pagevec_add(mark_pvec, netpage)) + fscache_mark_pages_cached(op, mark_pvec); + + page_cache_get(netpage); + if (!pagevec_add(&lru_pvec, netpage)) + __pagevec_lru_add_file(&lru_pvec); + + fscache_end_io(op, netpage, 0); + page_cache_release(netpage); + netpage = NULL; + continue; + } + + netpage = NULL; + + _debug("out"); + +out: + /* tidy up */ + pagevec_lru_add_file(&lru_pvec); + + if (newpage) + page_cache_release(newpage); + if (netpage) + page_cache_release(netpage); + if (backpage) + page_cache_release(backpage); + if (monitor) { + fscache_put_retrieval(op); + kfree(monitor); + } + + list_for_each_entry_safe(netpage, _n, list, lru) { + list_del(&netpage->lru); + page_cache_release(netpage); + } + + _leave(" = %d", ret); + return ret; + +nomem: + _debug("nomem"); + ret = -ENOMEM; + goto out; + +read_error: + _debug("read error %d", ret); + if (ret == -ENOMEM) + goto out; +io_error: + cachefiles_io_error_obj(object, "Page read error on backing file"); + ret = -ENOBUFS; + goto out; +} + +/* + * read a list of pages from the cache or allocate blocks in which to store + * them + */ +int cachefiles_read_or_alloc_pages(struct fscache_retrieval *op, + struct list_head *pages, + unsigned *nr_pages, + gfp_t gfp) +{ + struct cachefiles_object *object; + struct cachefiles_cache *cache; + struct list_head backpages; + struct pagevec pagevec; + struct inode *inode; + struct page *page, *_n; + unsigned shift, nrbackpages; + int ret, ret2, space; + + object = container_of(op->op.object, + struct cachefiles_object, fscache); + cache = container_of(object->fscache.cache, + struct cachefiles_cache, cache); + + _enter("{OBJ%x,%d},,%d,,", + object->fscache.debug_id, atomic_read(&op->op.usage), + *nr_pages); + + if (!object->backer) + return -ENOBUFS; + + space = 1; + if (cachefiles_has_space(cache, 0, *nr_pages) < 0) + space = 0; + + inode = object->backer->d_inode; + ASSERT(S_ISREG(inode->i_mode)); + ASSERT(inode->i_mapping->a_ops->bmap); + ASSERT(inode->i_mapping->a_ops->readpages); + + /* calculate the shift required to use bmap */ + if (inode->i_sb->s_blocksize > PAGE_SIZE) + return -ENOBUFS; + + shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits; + + pagevec_init(&pagevec, 0); + + op->op.flags = FSCACHE_OP_FAST; + op->op.processor = cachefiles_read_copier; + + INIT_LIST_HEAD(&backpages); + nrbackpages = 0; + + ret = space ? -ENODATA : -ENOBUFS; + list_for_each_entry_safe(page, _n, pages, lru) { + sector_t block0, block; + + /* we assume the absence or presence of the first block is a + * good enough indication for the page as a whole + * - TODO: don't use bmap() for this as it is _not_ actually + * good enough for this as it doesn't indicate errors, but + * it's all we've got for the moment + */ + block0 = page->index; + block0 <<= shift; + + block = inode->i_mapping->a_ops->bmap(inode->i_mapping, + block0); + _debug("%llx -> %llx", + (unsigned long long) block0, + (unsigned long long) block); + + if (block) { + /* we have data - add it to the list to give to the + * backing fs */ + list_move(&page->lru, &backpages); + (*nr_pages)--; + nrbackpages++; + } else if (space && pagevec_add(&pagevec, page) == 0) { + fscache_mark_pages_cached(op, &pagevec); + ret = -ENODATA; + } + } + + if (pagevec_count(&pagevec) > 0) + fscache_mark_pages_cached(op, &pagevec); + + if (list_empty(pages)) + ret = 0; + + /* submit the apparently valid pages to the backing fs to be read from + * disk */ + if (nrbackpages > 0) { + ret2 = cachefiles_read_backing_file(object, op, &backpages, + &pagevec); + if (ret2 == -ENOMEM || ret2 == -EINTR) + ret = ret2; + } + + if (pagevec_count(&pagevec) > 0) + fscache_mark_pages_cached(op, &pagevec); + + _leave(" = %d [nr=%u%s]", + ret, *nr_pages, list_empty(pages) ? " empty" : ""); + return ret; +} + +/* + * allocate a block in the cache in which to store a page + * - cache withdrawal is prevented by the caller + * - returns -EINTR if interrupted + * - returns -ENOMEM if ran out of memory + * - returns -ENOBUFS if no buffers can be made available + * - returns -ENOBUFS if page is beyond EOF + * - otherwise: + * - the metadata will be retained + * - 0 will be returned + */ +int cachefiles_allocate_page(struct fscache_retrieval *op, + struct page *page, + gfp_t gfp) +{ + struct cachefiles_object *object; + struct cachefiles_cache *cache; + struct pagevec pagevec; + int ret; + + object = container_of(op->op.object, + struct cachefiles_object, fscache); + cache = container_of(object->fscache.cache, + struct cachefiles_cache, cache); + + _enter("%p,{%lx},", object, page->index); + + ret = cachefiles_has_space(cache, 0, 1); + if (ret == 0) { + pagevec_init(&pagevec, 0); + pagevec_add(&pagevec, page); + fscache_mark_pages_cached(op, &pagevec); + } else { + ret = -ENOBUFS; + } + + _leave(" = %d", ret); + return ret; +} + +/* + * allocate blocks in the cache in which to store a set of pages + * - cache withdrawal is prevented by the caller + * - returns -EINTR if interrupted + * - returns -ENOMEM if ran out of memory + * - returns -ENOBUFS if some buffers couldn't be made available + * - returns -ENOBUFS if some pages are beyond EOF + * - otherwise: + * - -ENODATA will be returned + * - metadata will be retained for any page marked + */ +int cachefiles_allocate_pages(struct fscache_retrieval *op, + struct list_head *pages, + unsigned *nr_pages, + gfp_t gfp) +{ + struct cachefiles_object *object; + struct cachefiles_cache *cache; + struct pagevec pagevec; + struct page *page; + int ret; + + object = container_of(op->op.object, + struct cachefiles_object, fscache); + cache = container_of(object->fscache.cache, + struct cachefiles_cache, cache); + + _enter("%p,,,%d,", object, *nr_pages); + + ret = cachefiles_has_space(cache, 0, *nr_pages); + if (ret == 0) { + pagevec_init(&pagevec, 0); + + list_for_each_entry(page, pages, lru) { + if (pagevec_add(&pagevec, page) == 0) + fscache_mark_pages_cached(op, &pagevec); + } + + if (pagevec_count(&pagevec) > 0) + fscache_mark_pages_cached(op, &pagevec); + ret = -ENODATA; + } else { + ret = -ENOBUFS; + } + + _leave(" = %d", ret); + return ret; +} + +/* + * request a page be stored in the cache + * - cache withdrawal is prevented by the caller + * - this request may be ignored if there's no cache block available, in which + * case -ENOBUFS will be returned + * - if the op is in progress, 0 will be returned + */ +int cachefiles_write_page(struct fscache_storage *op, struct page *page) +{ + struct cachefiles_object *object; + struct cachefiles_cache *cache; + mm_segment_t old_fs; + struct file *file; + loff_t pos; + void *data; + int ret; + + ASSERT(op != NULL); + ASSERT(page != NULL); + + object = container_of(op->op.object, + struct cachefiles_object, fscache); + + _enter("%p,%p{%lx},,,", object, page, page->index); + + if (!object->backer) { + _leave(" = -ENOBUFS"); + return -ENOBUFS; + } + + ASSERT(S_ISREG(object->backer->d_inode->i_mode)); + + cache = container_of(object->fscache.cache, + struct cachefiles_cache, cache); + + /* write the page to the backing filesystem and let it store it in its + * own time */ + dget(object->backer); + mntget(cache->mnt); + file = dentry_open(object->backer, cache->mnt, O_RDWR, + cache->cache_cred); + if (IS_ERR(file)) { + ret = PTR_ERR(file); + } else { + ret = -EIO; + if (file->f_op->write) { + pos = (loff_t) page->index << PAGE_SHIFT; + data = kmap(page); + old_fs = get_fs(); + set_fs(KERNEL_DS); + ret = file->f_op->write( + file, (const void __user *) data, PAGE_SIZE, + &pos); + set_fs(old_fs); + kunmap(page); + if (ret != PAGE_SIZE) + ret = -EIO; + } + fput(file); + } + + if (ret < 0) { + if (ret == -EIO) + cachefiles_io_error_obj( + object, "Write page to backing file failed"); + ret = -ENOBUFS; + } + + _leave(" = %d", ret); + return ret; +} + +/* + * detach a backing block from a page + * - cache withdrawal is prevented by the caller + */ +void cachefiles_uncache_page(struct fscache_object *_object, struct page *page) +{ + struct cachefiles_object *object; + struct cachefiles_cache *cache; + + object = container_of(_object, struct cachefiles_object, fscache); + cache = container_of(object->fscache.cache, + struct cachefiles_cache, cache); + + _enter("%p,{%lu}", object, page->index); + + spin_unlock(&object->fscache.cookie->lock); +} diff --git a/fs/cachefiles/security.c b/fs/cachefiles/security.c new file mode 100644 index 000000000000..b5808cdb2232 --- /dev/null +++ b/fs/cachefiles/security.c @@ -0,0 +1,116 @@ +/* CacheFiles security management + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include <linux/fs.h> +#include <linux/cred.h> +#include "internal.h" + +/* + * determine the security context within which we access the cache from within + * the kernel + */ +int cachefiles_get_security_ID(struct cachefiles_cache *cache) +{ + struct cred *new; + int ret; + + _enter("{%s}", cache->secctx); + + new = prepare_kernel_cred(current); + if (!new) { + ret = -ENOMEM; + goto error; + } + + if (cache->secctx) { + ret = set_security_override_from_ctx(new, cache->secctx); + if (ret < 0) { + put_cred(new); + printk(KERN_ERR "CacheFiles:" + " Security denies permission to nominate" + " security context: error %d\n", + ret); + goto error; + } + } + + cache->cache_cred = new; + ret = 0; +error: + _leave(" = %d", ret); + return ret; +} + +/* + * see if mkdir and create can be performed in the root directory + */ +static int cachefiles_check_cache_dir(struct cachefiles_cache *cache, + struct dentry *root) +{ + int ret; + + ret = security_inode_mkdir(root->d_inode, root, 0); + if (ret < 0) { + printk(KERN_ERR "CacheFiles:" + " Security denies permission to make dirs: error %d", + ret); + return ret; + } + + ret = security_inode_create(root->d_inode, root, 0); + if (ret < 0) + printk(KERN_ERR "CacheFiles:" + " Security denies permission to create files: error %d", + ret); + + return ret; +} + +/* + * check the security details of the on-disk cache + * - must be called with security override in force + */ +int cachefiles_determine_cache_security(struct cachefiles_cache *cache, + struct dentry *root, + const struct cred **_saved_cred) +{ + struct cred *new; + int ret; + + _enter(""); + + /* duplicate the cache creds for COW (the override is currently in + * force, so we can use prepare_creds() to do this) */ + new = prepare_creds(); + if (!new) + return -ENOMEM; + + cachefiles_end_secure(cache, *_saved_cred); + + /* use the cache root dir's security context as the basis with + * which create files */ + ret = set_create_files_as(new, root->d_inode); + if (ret < 0) { + _leave(" = %d [cfa]", ret); + return ret; + } + + put_cred(cache->cache_cred); + cache->cache_cred = new; + + cachefiles_begin_secure(cache, _saved_cred); + ret = cachefiles_check_cache_dir(cache, root); + + if (ret == -EOPNOTSUPP) + ret = 0; + _leave(" = %d", ret); + return ret; +} diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c new file mode 100644 index 000000000000..f3e7a0bf068b --- /dev/null +++ b/fs/cachefiles/xattr.c @@ -0,0 +1,291 @@ +/* CacheFiles extended attribute management + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/file.h> +#include <linux/fs.h> +#include <linux/fsnotify.h> +#include <linux/quotaops.h> +#include <linux/xattr.h> +#include "internal.h" + +static const char cachefiles_xattr_cache[] = + XATTR_USER_PREFIX "CacheFiles.cache"; + +/* + * check the type label on an object + * - done using xattrs + */ +int cachefiles_check_object_type(struct cachefiles_object *object) +{ + struct dentry *dentry = object->dentry; + char type[3], xtype[3]; + int ret; + + ASSERT(dentry); + ASSERT(dentry->d_inode); + + if (!object->fscache.cookie) + strcpy(type, "C3"); + else + snprintf(type, 3, "%02x", object->fscache.cookie->def->type); + + _enter("%p{%s}", object, type); + + /* attempt to install a type label directly */ + ret = vfs_setxattr(dentry, cachefiles_xattr_cache, type, 2, + XATTR_CREATE); + if (ret == 0) { + _debug("SET"); /* we succeeded */ + goto error; + } + + if (ret != -EEXIST) { + kerror("Can't set xattr on %*.*s [%lu] (err %d)", + dentry->d_name.len, dentry->d_name.len, + dentry->d_name.name, dentry->d_inode->i_ino, + -ret); + goto error; + } + + /* read the current type label */ + ret = vfs_getxattr(dentry, cachefiles_xattr_cache, xtype, 3); + if (ret < 0) { + if (ret == -ERANGE) + goto bad_type_length; + + kerror("Can't read xattr on %*.*s [%lu] (err %d)", + dentry->d_name.len, dentry->d_name.len, + dentry->d_name.name, dentry->d_inode->i_ino, + -ret); + goto error; + } + + /* check the type is what we're expecting */ + if (ret != 2) + goto bad_type_length; + + if (xtype[0] != type[0] || xtype[1] != type[1]) + goto bad_type; + + ret = 0; + +error: + _leave(" = %d", ret); + return ret; + +bad_type_length: + kerror("Cache object %lu type xattr length incorrect", + dentry->d_inode->i_ino); + ret = -EIO; + goto error; + +bad_type: + xtype[2] = 0; + kerror("Cache object %*.*s [%lu] type %s not %s", + dentry->d_name.len, dentry->d_name.len, + dentry->d_name.name, dentry->d_inode->i_ino, + xtype, type); + ret = -EIO; + goto error; +} + +/* + * set the state xattr on a cache file + */ +int cachefiles_set_object_xattr(struct cachefiles_object *object, + struct cachefiles_xattr *auxdata) +{ + struct dentry *dentry = object->dentry; + int ret; + + ASSERT(object->fscache.cookie); + ASSERT(dentry); + + _enter("%p,#%d", object, auxdata->len); + + /* attempt to install the cache metadata directly */ + _debug("SET %s #%u", object->fscache.cookie->def->name, auxdata->len); + + ret = vfs_setxattr(dentry, cachefiles_xattr_cache, + &auxdata->type, auxdata->len, + XATTR_CREATE); + if (ret < 0 && ret != -ENOMEM) + cachefiles_io_error_obj( + object, + "Failed to set xattr with error %d", ret); + + _leave(" = %d", ret); + return ret; +} + +/* + * update the state xattr on a cache file + */ +int cachefiles_update_object_xattr(struct cachefiles_object *object, + struct cachefiles_xattr *auxdata) +{ + struct dentry *dentry = object->dentry; + int ret; + + ASSERT(object->fscache.cookie); + ASSERT(dentry); + + _enter("%p,#%d", object, auxdata->len); + + /* attempt to install the cache metadata directly */ + _debug("SET %s #%u", object->fscache.cookie->def->name, auxdata->len); + + ret = vfs_setxattr(dentry, cachefiles_xattr_cache, + &auxdata->type, auxdata->len, + XATTR_REPLACE); + if (ret < 0 && ret != -ENOMEM) + cachefiles_io_error_obj( + object, + "Failed to update xattr with error %d", ret); + + _leave(" = %d", ret); + return ret; +} + +/* + * check the state xattr on a cache file + * - return -ESTALE if the object should be deleted + */ +int cachefiles_check_object_xattr(struct cachefiles_object *object, + struct cachefiles_xattr *auxdata) +{ + struct cachefiles_xattr *auxbuf; + struct dentry *dentry = object->dentry; + int ret; + + _enter("%p,#%d", object, auxdata->len); + + ASSERT(dentry); + ASSERT(dentry->d_inode); + + auxbuf = kmalloc(sizeof(struct cachefiles_xattr) + 512, GFP_KERNEL); + if (!auxbuf) { + _leave(" = -ENOMEM"); + return -ENOMEM; + } + + /* read the current type label */ + ret = vfs_getxattr(dentry, cachefiles_xattr_cache, + &auxbuf->type, 512 + 1); + if (ret < 0) { + if (ret == -ENODATA) + goto stale; /* no attribute - power went off + * mid-cull? */ + + if (ret == -ERANGE) + goto bad_type_length; + + cachefiles_io_error_obj(object, + "Can't read xattr on %lu (err %d)", + dentry->d_inode->i_ino, -ret); + goto error; + } + + /* check the on-disk object */ + if (ret < 1) + goto bad_type_length; + + if (auxbuf->type != auxdata->type) + goto stale; + + auxbuf->len = ret; + + /* consult the netfs */ + if (object->fscache.cookie->def->check_aux) { + enum fscache_checkaux result; + unsigned int dlen; + + dlen = auxbuf->len - 1; + + _debug("checkaux %s #%u", + object->fscache.cookie->def->name, dlen); + + result = fscache_check_aux(&object->fscache, + &auxbuf->data, dlen); + + switch (result) { + /* entry okay as is */ + case FSCACHE_CHECKAUX_OKAY: + goto okay; + + /* entry requires update */ + case FSCACHE_CHECKAUX_NEEDS_UPDATE: + break; + + /* entry requires deletion */ + case FSCACHE_CHECKAUX_OBSOLETE: + goto stale; + + default: + BUG(); + } + + /* update the current label */ + ret = vfs_setxattr(dentry, cachefiles_xattr_cache, + &auxdata->type, auxdata->len, + XATTR_REPLACE); + if (ret < 0) { + cachefiles_io_error_obj(object, + "Can't update xattr on %lu" + " (error %d)", + dentry->d_inode->i_ino, -ret); + goto error; + } + } + +okay: + ret = 0; + +error: + kfree(auxbuf); + _leave(" = %d", ret); + return ret; + +bad_type_length: + kerror("Cache object %lu xattr length incorrect", + dentry->d_inode->i_ino); + ret = -EIO; + goto error; + +stale: + ret = -ESTALE; + goto error; +} + +/* + * remove the object's xattr to mark it stale + */ +int cachefiles_remove_object_xattr(struct cachefiles_cache *cache, + struct dentry *dentry) +{ + int ret; + + ret = vfs_removexattr(dentry, cachefiles_xattr_cache); + if (ret < 0) { + if (ret == -ENOENT || ret == -ENODATA) + ret = 0; + else if (ret != -ENOMEM) + cachefiles_io_error(cache, + "Can't remove xattr from %lu" + " (error %d)", + dentry->d_inode->i_ino, -ret); + } + + _leave(" = %d", ret); + return ret; +} diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 2f35cccfcd8d..54dce78fbb73 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -254,7 +254,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode, return -ENOMEM; } - mode &= ~current->fs->umask; + mode &= ~current_umask(); if (oplockEnabled) oplock = REQ_OPLOCK; @@ -479,7 +479,7 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode, rc = -ENOMEM; else if (pTcon->unix_ext) { struct cifs_unix_set_info_args args = { - .mode = mode & ~current->fs->umask, + .mode = mode & ~current_umask(), .ctime = NO_CHANGE_64, .atime = NO_CHANGE_64, .mtime = NO_CHANGE_64, diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index a8797cc60805..f121a80fdd6f 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1125,7 +1125,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode) goto mkdir_out; } - mode &= ~current->fs->umask; + mode &= ~current_umask(); rc = CIFSPOSIXCreate(xid, pTcon, SMB_O_DIRECTORY | SMB_O_CREAT, mode, NULL /* netfid */, pInfo, &oplock, full_path, cifs_sb->local_nls, @@ -1204,7 +1204,7 @@ mkdir_get_info: if ((direntry->d_inode) && (direntry->d_inode->i_nlink < 2)) direntry->d_inode->i_nlink = 2; - mode &= ~current->fs->umask; + mode &= ~current_umask(); /* must turn on setgid bit if parent dir has it */ if (inode->i_mode & S_ISGID) mode |= S_ISGID; diff --git a/fs/compat.c b/fs/compat.c index 55efdfebdf5a..3f84d5f15889 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -51,6 +51,7 @@ #include <linux/poll.h> #include <linux/mm.h> #include <linux/eventpoll.h> +#include <linux/fs_struct.h> #include <asm/uaccess.h> #include <asm/mmu_context.h> @@ -1195,16 +1196,12 @@ out: return ret; } -asmlinkage ssize_t -compat_sys_readv(unsigned long fd, const struct compat_iovec __user *vec, unsigned long vlen) +static size_t compat_readv(struct file *file, + const struct compat_iovec __user *vec, + unsigned long vlen, loff_t *pos) { - struct file *file; ssize_t ret = -EBADF; - file = fget(fd); - if (!file) - return -EBADF; - if (!(file->f_mode & FMODE_READ)) goto out; @@ -1212,25 +1209,56 @@ compat_sys_readv(unsigned long fd, const struct compat_iovec __user *vec, unsign if (!file->f_op || (!file->f_op->aio_read && !file->f_op->read)) goto out; - ret = compat_do_readv_writev(READ, file, vec, vlen, &file->f_pos); + ret = compat_do_readv_writev(READ, file, vec, vlen, pos); out: if (ret > 0) add_rchar(current, ret); inc_syscr(current); - fput(file); return ret; } asmlinkage ssize_t -compat_sys_writev(unsigned long fd, const struct compat_iovec __user *vec, unsigned long vlen) +compat_sys_readv(unsigned long fd, const struct compat_iovec __user *vec, + unsigned long vlen) { struct file *file; - ssize_t ret = -EBADF; + int fput_needed; + ssize_t ret; - file = fget(fd); + file = fget_light(fd, &fput_needed); if (!file) return -EBADF; + ret = compat_readv(file, vec, vlen, &file->f_pos); + fput_light(file, fput_needed); + return ret; +} + +asmlinkage ssize_t +compat_sys_preadv(unsigned long fd, const struct compat_iovec __user *vec, + unsigned long vlen, u32 pos_low, u32 pos_high) +{ + loff_t pos = ((loff_t)pos_high << 32) | pos_low; + struct file *file; + int fput_needed; + ssize_t ret; + + if (pos < 0) + return -EINVAL; + file = fget_light(fd, &fput_needed); + if (!file) + return -EBADF; + ret = compat_readv(file, vec, vlen, &pos); + fput_light(file, fput_needed); + return ret; +} + +static size_t compat_writev(struct file *file, + const struct compat_iovec __user *vec, + unsigned long vlen, loff_t *pos) +{ + ssize_t ret = -EBADF; + if (!(file->f_mode & FMODE_WRITE)) goto out; @@ -1238,13 +1266,47 @@ compat_sys_writev(unsigned long fd, const struct compat_iovec __user *vec, unsig if (!file->f_op || (!file->f_op->aio_write && !file->f_op->write)) goto out; - ret = compat_do_readv_writev(WRITE, file, vec, vlen, &file->f_pos); + ret = compat_do_readv_writev(WRITE, file, vec, vlen, pos); out: if (ret > 0) add_wchar(current, ret); inc_syscw(current); - fput(file); + return ret; +} + +asmlinkage ssize_t +compat_sys_writev(unsigned long fd, const struct compat_iovec __user *vec, + unsigned long vlen) +{ + struct file *file; + int fput_needed; + ssize_t ret; + + file = fget_light(fd, &fput_needed); + if (!file) + return -EBADF; + ret = compat_writev(file, vec, vlen, &file->f_pos); + fput_light(file, fput_needed); + return ret; +} + +asmlinkage ssize_t +compat_sys_pwritev(unsigned long fd, const struct compat_iovec __user *vec, + unsigned long vlen, u32 pos_low, u32 pos_high) +{ + loff_t pos = ((loff_t)pos_high << 32) | pos_low; + struct file *file; + int fput_needed; + ssize_t ret; + + if (pos < 0) + return -EINVAL; + file = fget_light(fd, &fput_needed); + if (!file) + return -EBADF; + ret = compat_writev(file, vec, vlen, &pos); + fput_light(file, fput_needed); return ret; } @@ -1441,12 +1503,15 @@ int compat_do_execve(char * filename, bprm->cred = prepare_exec_creds(); if (!bprm->cred) goto out_unlock; - check_unsafe_exec(bprm); + + retval = check_unsafe_exec(bprm); + if (retval) + goto out_unlock; file = open_exec(filename); retval = PTR_ERR(file); if (IS_ERR(file)) - goto out_unlock; + goto out_unmark; sched_exec(); @@ -1488,6 +1553,9 @@ int compat_do_execve(char * filename, goto out; /* execve succeeded */ + write_lock(¤t->fs->lock); + current->fs->in_exec = 0; + write_unlock(¤t->fs->lock); current->in_execve = 0; mutex_unlock(¤t->cred_exec_mutex); acct_update_integrals(current); @@ -1506,6 +1574,11 @@ out_file: fput(bprm->file); } +out_unmark: + write_lock(¤t->fs->lock); + current->fs->in_exec = 0; + write_unlock(¤t->fs->lock); + out_unlock: current->in_execve = 0; mutex_unlock(¤t->cred_exec_mutex); diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index ff786687e93b..3e87ce443ea2 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -23,7 +23,7 @@ #include <linux/if.h> #include <linux/if_bridge.h> #include <linux/slab.h> -#include <linux/raid/md.h> +#include <linux/raid/md_u.h> #include <linux/kd.h> #include <linux/route.h> #include <linux/in6.h> diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index a07338d2d140..dd3634e4c967 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -318,6 +318,7 @@ out: static int cramfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; + u64 id = huge_encode_dev(sb->s_bdev->bd_dev); buf->f_type = CRAMFS_MAGIC; buf->f_bsize = PAGE_CACHE_SIZE; @@ -326,6 +327,8 @@ static int cramfs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_bavail = 0; buf->f_files = CRAMFS_SB(sb)->files; buf->f_ffree = 0; + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); buf->f_namelen = CRAMFS_MAXPATHLEN; return 0; } @@ -459,11 +462,14 @@ static struct dentry * cramfs_lookup(struct inode *dir, struct dentry *dentry, s static int cramfs_readpage(struct file *file, struct page * page) { struct inode *inode = page->mapping->host; - u32 maxblock, bytes_filled; + u32 maxblock; + int bytes_filled; void *pgdata; maxblock = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; bytes_filled = 0; + pgdata = kmap(page); + if (page->index < maxblock) { struct super_block *sb = inode->i_sb; u32 blkptr_offset = OFFSET(inode) + page->index*4; @@ -472,30 +478,43 @@ static int cramfs_readpage(struct file *file, struct page * page) start_offset = OFFSET(inode) + maxblock*4; mutex_lock(&read_mutex); if (page->index) - start_offset = *(u32 *) cramfs_read(sb, blkptr_offset-4, 4); - compr_len = (*(u32 *) cramfs_read(sb, blkptr_offset, 4) - start_offset); + start_offset = *(u32 *) cramfs_read(sb, blkptr_offset-4, + 4); + compr_len = (*(u32 *) cramfs_read(sb, blkptr_offset, 4) - + start_offset); mutex_unlock(&read_mutex); - pgdata = kmap(page); + if (compr_len == 0) ; /* hole */ - else if (compr_len > (PAGE_CACHE_SIZE << 1)) - printk(KERN_ERR "cramfs: bad compressed blocksize %u\n", compr_len); - else { + else if (unlikely(compr_len > (PAGE_CACHE_SIZE << 1))) { + pr_err("cramfs: bad compressed blocksize %u\n", + compr_len); + goto err; + } else { mutex_lock(&read_mutex); bytes_filled = cramfs_uncompress_block(pgdata, PAGE_CACHE_SIZE, cramfs_read(sb, start_offset, compr_len), compr_len); mutex_unlock(&read_mutex); + if (unlikely(bytes_filled < 0)) + goto err; } - } else - pgdata = kmap(page); + } + memset(pgdata + bytes_filled, 0, PAGE_CACHE_SIZE - bytes_filled); - kunmap(page); flush_dcache_page(page); + kunmap(page); SetPageUptodate(page); unlock_page(page); return 0; + +err: + kunmap(page); + ClearPageUptodate(page); + SetPageError(page); + unlock_page(page); + return 0; } static const struct address_space_operations cramfs_aops = { diff --git a/fs/cramfs/uncompress.c b/fs/cramfs/uncompress.c index fc3ccb74626f..023329800d2e 100644 --- a/fs/cramfs/uncompress.c +++ b/fs/cramfs/uncompress.c @@ -50,7 +50,7 @@ int cramfs_uncompress_block(void *dst, int dstlen, void *src, int srclen) err: printk("Error %d while decompressing!\n", err); printk("%p(%d)->%p(%d)\n", src, srclen, dst, dstlen); - return 0; + return -EIO; } int cramfs_uncompress_init(void) diff --git a/fs/dcache.c b/fs/dcache.c index 90bbd7e1b116..761d30be2683 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -17,7 +17,6 @@ #include <linux/syscalls.h> #include <linux/string.h> #include <linux/mm.h> -#include <linux/fdtable.h> #include <linux/fs.h> #include <linux/fsnotify.h> #include <linux/slab.h> @@ -32,6 +31,7 @@ #include <linux/seqlock.h> #include <linux/swap.h> #include <linux/bootmem.h> +#include <linux/fs_struct.h> #include "internal.h" int sysctl_vfs_cache_pressure __read_mostly = 100; diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 81ae9ea3c6e1..0662ba6de85a 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -30,6 +30,7 @@ static struct vfsmount *debugfs_mount; static int debugfs_mount_count; +static bool debugfs_registered; static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t dev) { @@ -496,6 +497,16 @@ exit: } EXPORT_SYMBOL_GPL(debugfs_rename); +/** + * debugfs_initialized - Tells whether debugfs has been registered + */ +bool debugfs_initialized(void) +{ + return debugfs_registered; +} +EXPORT_SYMBOL_GPL(debugfs_initialized); + + static struct kobject *debug_kobj; static int __init debugfs_init(void) @@ -509,11 +520,16 @@ static int __init debugfs_init(void) retval = register_filesystem(&debug_fs_type); if (retval) kobject_put(debug_kobj); + else + debugfs_registered = true; + return retval; } static void __exit debugfs_exit(void) { + debugfs_registered = false; + simple_release_fs(&debugfs_mount, &debugfs_mount_count); unregister_filesystem(&debug_fs_type); kobject_put(debug_kobj); diff --git a/fs/direct-io.c b/fs/direct-io.c index b6d43908ff7a..da258e7249cc 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -1126,7 +1126,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, int acquire_i_mutex = 0; if (rw & WRITE) - rw = WRITE_SYNC; + rw = WRITE_ODIRECT; if (bdev) bdev_blkbits = blksize_bits(bdev_hardsect_size(bdev)); diff --git a/fs/drop_caches.c b/fs/drop_caches.c index 44d725f612cf..b6a719a909f8 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c @@ -18,7 +18,7 @@ static void drop_pagecache_sb(struct super_block *sb) spin_lock(&inode_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { - if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) + if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW)) continue; if (inode->i_mapping->nrpages == 0) continue; diff --git a/fs/efs/super.c b/fs/efs/super.c index 73b19cfc91fc..f04942810818 100644 --- a/fs/efs/super.c +++ b/fs/efs/super.c @@ -329,18 +329,22 @@ out_no_fs: } static int efs_statfs(struct dentry *dentry, struct kstatfs *buf) { - struct efs_sb_info *sb = SUPER_INFO(dentry->d_sb); + struct super_block *sb = dentry->d_sb; + struct efs_sb_info *sbi = SUPER_INFO(sb); + u64 id = huge_encode_dev(sb->s_bdev->bd_dev); buf->f_type = EFS_SUPER_MAGIC; /* efs magic number */ buf->f_bsize = EFS_BLOCKSIZE; /* blocksize */ - buf->f_blocks = sb->total_groups * /* total data blocks */ - (sb->group_size - sb->inode_blocks); - buf->f_bfree = sb->data_free; /* free data blocks */ - buf->f_bavail = sb->data_free; /* free blocks for non-root */ - buf->f_files = sb->total_groups * /* total inodes */ - sb->inode_blocks * + buf->f_blocks = sbi->total_groups * /* total data blocks */ + (sbi->group_size - sbi->inode_blocks); + buf->f_bfree = sbi->data_free; /* free data blocks */ + buf->f_bavail = sbi->data_free; /* free blocks for non-root */ + buf->f_files = sbi->total_groups * /* total inodes */ + sbi->inode_blocks * (EFS_BLOCKSIZE / sizeof(struct efs_dinode)); - buf->f_ffree = sb->inode_free; /* free inodes */ + buf->f_ffree = sbi->inode_free; /* free inodes */ + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); buf->f_namelen = EFS_MAXNAMELEN; /* max filename length */ return 0; diff --git a/fs/exec.c b/fs/exec.c index c5128fbc9165..052a961e41aa 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -53,6 +53,7 @@ #include <linux/tracehook.h> #include <linux/kmod.h> #include <linux/fsnotify.h> +#include <linux/fs_struct.h> #include <asm/uaccess.h> #include <asm/mmu_context.h> @@ -1056,28 +1057,35 @@ EXPORT_SYMBOL(install_exec_creds); * - the caller must hold current->cred_exec_mutex to protect against * PTRACE_ATTACH */ -void check_unsafe_exec(struct linux_binprm *bprm) +int check_unsafe_exec(struct linux_binprm *bprm) { struct task_struct *p = current, *t; unsigned long flags; - unsigned n_fs, n_sighand; + unsigned n_fs; + int res = 0; bprm->unsafe = tracehook_unsafe_exec(p); n_fs = 1; - n_sighand = 1; + write_lock(&p->fs->lock); lock_task_sighand(p, &flags); for (t = next_thread(p); t != p; t = next_thread(t)) { if (t->fs == p->fs) n_fs++; - n_sighand++; } - if (atomic_read(&p->fs->count) > n_fs || - atomic_read(&p->sighand->count) > n_sighand) + if (p->fs->users > n_fs) { bprm->unsafe |= LSM_UNSAFE_SHARE; + } else { + if (p->fs->in_exec) + res = -EAGAIN; + p->fs->in_exec = 1; + } unlock_task_sighand(p, &flags); + write_unlock(&p->fs->lock); + + return res; } /* @@ -1296,12 +1304,15 @@ int do_execve(char * filename, bprm->cred = prepare_exec_creds(); if (!bprm->cred) goto out_unlock; - check_unsafe_exec(bprm); + + retval = check_unsafe_exec(bprm); + if (retval) + goto out_unlock; file = open_exec(filename); retval = PTR_ERR(file); if (IS_ERR(file)) - goto out_unlock; + goto out_unmark; sched_exec(); @@ -1344,6 +1355,9 @@ int do_execve(char * filename, goto out; /* execve succeeded */ + write_lock(¤t->fs->lock); + current->fs->in_exec = 0; + write_unlock(¤t->fs->lock); current->in_execve = 0; mutex_unlock(¤t->cred_exec_mutex); acct_update_integrals(current); @@ -1362,6 +1376,11 @@ out_file: fput(bprm->file); } +out_unmark: + write_lock(¤t->fs->lock); + current->fs->in_exec = 0; + write_unlock(¤t->fs->lock); + out_unlock: current->in_execve = 0; mutex_unlock(¤t->cred_exec_mutex); diff --git a/fs/exofs/BUGS b/fs/exofs/BUGS new file mode 100644 index 000000000000..1b2d4c63a579 --- /dev/null +++ b/fs/exofs/BUGS @@ -0,0 +1,3 @@ +- Out-of-space may cause a severe problem if the object (and directory entry) + were written, but the inode attributes failed. Then if the filesystem was + unmounted and mounted the kernel can get into an endless loop doing a readdir. diff --git a/fs/exofs/Kbuild b/fs/exofs/Kbuild new file mode 100644 index 000000000000..cc2d22db119c --- /dev/null +++ b/fs/exofs/Kbuild @@ -0,0 +1,16 @@ +# +# Kbuild for the EXOFS module +# +# Copyright (C) 2008 Panasas Inc. All rights reserved. +# +# Authors: +# Boaz Harrosh <bharrosh@panasas.com> +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 +# +# Kbuild - Gets included from the Kernels Makefile and build system +# + +exofs-y := osd.o inode.o file.o symlink.o namei.o dir.o super.o +obj-$(CONFIG_EXOFS_FS) += exofs.o diff --git a/fs/exofs/Kconfig b/fs/exofs/Kconfig new file mode 100644 index 000000000000..86194b2f799d --- /dev/null +++ b/fs/exofs/Kconfig @@ -0,0 +1,13 @@ +config EXOFS_FS + tristate "exofs: OSD based file system support" + depends on SCSI_OSD_ULD + help + EXOFS is a file system that uses an OSD storage device, + as its backing storage. + +# Debugging-related stuff +config EXOFS_DEBUG + bool "Enable debugging" + depends on EXOFS_FS + help + This option enables EXOFS debug prints. diff --git a/fs/exofs/common.h b/fs/exofs/common.h new file mode 100644 index 000000000000..b1512c4bb8c7 --- /dev/null +++ b/fs/exofs/common.h @@ -0,0 +1,184 @@ +/* + * common.h - Common definitions for both Kernel and user-mode utilities + * + * Copyright (C) 2005, 2006 + * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com) + * Copyright (C) 2005, 2006 + * International Business Machines + * Copyright (C) 2008, 2009 + * Boaz Harrosh <bharrosh@panasas.com> + * + * Copyrights for code taken from ext2: + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * from + * linux/fs/minix/inode.c + * Copyright (C) 1991, 1992 Linus Torvalds + * + * This file is part of exofs. + * + * exofs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation. Since it is based on ext2, and the only + * valid version of GPL for the Linux kernel is version 2, the only valid + * version of GPL for exofs is version 2. + * + * exofs is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with exofs; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef __EXOFS_COM_H__ +#define __EXOFS_COM_H__ + +#include <linux/types.h> + +#include <scsi/osd_attributes.h> +#include <scsi/osd_initiator.h> +#include <scsi/osd_sec.h> + +/**************************************************************************** + * Object ID related defines + * NOTE: inode# = object ID - EXOFS_OBJ_OFF + ****************************************************************************/ +#define EXOFS_MIN_PID 0x10000 /* Smallest partition ID */ +#define EXOFS_OBJ_OFF 0x10000 /* offset for objects */ +#define EXOFS_SUPER_ID 0x10000 /* object ID for on-disk superblock */ +#define EXOFS_ROOT_ID 0x10002 /* object ID for root directory */ + +/* exofs Application specific page/attribute */ +# define EXOFS_APAGE_FS_DATA (OSD_APAGE_APP_DEFINED_FIRST + 3) +# define EXOFS_ATTR_INODE_DATA 1 + +/* + * The maximum number of files we can have is limited by the size of the + * inode number. This is the largest object ID that the file system supports. + * Object IDs 0, 1, and 2 are always in use (see above defines). + */ +enum { + EXOFS_MAX_INO_ID = (sizeof(ino_t) * 8 == 64) ? ULLONG_MAX : + (1ULL << (sizeof(ino_t) * 8ULL - 1ULL)), + EXOFS_MAX_ID = (EXOFS_MAX_INO_ID - 1 - EXOFS_OBJ_OFF), +}; + +/**************************************************************************** + * Misc. + ****************************************************************************/ +#define EXOFS_BLKSHIFT 12 +#define EXOFS_BLKSIZE (1UL << EXOFS_BLKSHIFT) + +/**************************************************************************** + * superblock-related things + ****************************************************************************/ +#define EXOFS_SUPER_MAGIC 0x5DF5 + +/* + * The file system control block - stored in an object's data (mainly, the one + * with ID EXOFS_SUPER_ID). This is where the in-memory superblock is stored + * on disk. Right now it just has a magic value, which is basically a sanity + * check on our ability to communicate with the object store. + */ +struct exofs_fscb { + __le64 s_nextid; /* Highest object ID used */ + __le32 s_numfiles; /* Number of files on fs */ + __le16 s_magic; /* Magic signature */ + __le16 s_newfs; /* Non-zero if this is a new fs */ +}; + +/**************************************************************************** + * inode-related things + ****************************************************************************/ +#define EXOFS_IDATA 5 + +/* + * The file control block - stored in an object's attributes. This is where + * the in-memory inode is stored on disk. + */ +struct exofs_fcb { + __le64 i_size; /* Size of the file */ + __le16 i_mode; /* File mode */ + __le16 i_links_count; /* Links count */ + __le32 i_uid; /* Owner Uid */ + __le32 i_gid; /* Group Id */ + __le32 i_atime; /* Access time */ + __le32 i_ctime; /* Creation time */ + __le32 i_mtime; /* Modification time */ + __le32 i_flags; /* File flags (unused for now)*/ + __le32 i_generation; /* File version (for NFS) */ + __le32 i_data[EXOFS_IDATA]; /* Short symlink names and device #s */ +}; + +#define EXOFS_INO_ATTR_SIZE sizeof(struct exofs_fcb) + +/* This is the Attribute the fcb is stored in */ +static const struct __weak osd_attr g_attr_inode_data = ATTR_DEF( + EXOFS_APAGE_FS_DATA, + EXOFS_ATTR_INODE_DATA, + EXOFS_INO_ATTR_SIZE); + +/**************************************************************************** + * dentry-related things + ****************************************************************************/ +#define EXOFS_NAME_LEN 255 + +/* + * The on-disk directory entry + */ +struct exofs_dir_entry { + __le64 inode_no; /* inode number */ + __le16 rec_len; /* directory entry length */ + u8 name_len; /* name length */ + u8 file_type; /* umm...file type */ + char name[EXOFS_NAME_LEN]; /* file name */ +}; + +enum { + EXOFS_FT_UNKNOWN, + EXOFS_FT_REG_FILE, + EXOFS_FT_DIR, + EXOFS_FT_CHRDEV, + EXOFS_FT_BLKDEV, + EXOFS_FT_FIFO, + EXOFS_FT_SOCK, + EXOFS_FT_SYMLINK, + EXOFS_FT_MAX +}; + +#define EXOFS_DIR_PAD 4 +#define EXOFS_DIR_ROUND (EXOFS_DIR_PAD - 1) +#define EXOFS_DIR_REC_LEN(name_len) \ + (((name_len) + offsetof(struct exofs_dir_entry, name) + \ + EXOFS_DIR_ROUND) & ~EXOFS_DIR_ROUND) + +/************************* + * function declarations * + *************************/ +/* osd.c */ +void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], + const struct osd_obj_id *obj); + +int exofs_check_ok_resid(struct osd_request *or, u64 *in_resid, u64 *out_resid); +static inline int exofs_check_ok(struct osd_request *or) +{ + return exofs_check_ok_resid(or, NULL, NULL); +} +int exofs_sync_op(struct osd_request *or, int timeout, u8 *cred); +int exofs_async_op(struct osd_request *or, + osd_req_done_fn *async_done, void *caller_context, u8 *cred); + +int extract_attr_from_req(struct osd_request *or, struct osd_attr *attr); + +int osd_req_read_kern(struct osd_request *or, + const struct osd_obj_id *obj, u64 offset, void *buff, u64 len); + +int osd_req_write_kern(struct osd_request *or, + const struct osd_obj_id *obj, u64 offset, void *buff, u64 len); + +#endif /*ifndef __EXOFS_COM_H__*/ diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c new file mode 100644 index 000000000000..65b0c8c776a1 --- /dev/null +++ b/fs/exofs/dir.c @@ -0,0 +1,672 @@ +/* + * Copyright (C) 2005, 2006 + * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com) + * Copyright (C) 2005, 2006 + * International Business Machines + * Copyright (C) 2008, 2009 + * Boaz Harrosh <bharrosh@panasas.com> + * + * Copyrights for code taken from ext2: + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * from + * linux/fs/minix/inode.c + * Copyright (C) 1991, 1992 Linus Torvalds + * + * This file is part of exofs. + * + * exofs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation. Since it is based on ext2, and the only + * valid version of GPL for the Linux kernel is version 2, the only valid + * version of GPL for exofs is version 2. + * + * exofs is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with exofs; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "exofs.h" + +static inline unsigned exofs_chunk_size(struct inode *inode) +{ + return inode->i_sb->s_blocksize; +} + +static inline void exofs_put_page(struct page *page) +{ + kunmap(page); + page_cache_release(page); +} + +/* Accesses dir's inode->i_size must be called under inode lock */ +static inline unsigned long dir_pages(struct inode *inode) +{ + return (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; +} + +static unsigned exofs_last_byte(struct inode *inode, unsigned long page_nr) +{ + loff_t last_byte = inode->i_size; + + last_byte -= page_nr << PAGE_CACHE_SHIFT; + if (last_byte > PAGE_CACHE_SIZE) + last_byte = PAGE_CACHE_SIZE; + return last_byte; +} + +static int exofs_commit_chunk(struct page *page, loff_t pos, unsigned len) +{ + struct address_space *mapping = page->mapping; + struct inode *dir = mapping->host; + int err = 0; + + dir->i_version++; + + if (!PageUptodate(page)) + SetPageUptodate(page); + + if (pos+len > dir->i_size) { + i_size_write(dir, pos+len); + mark_inode_dirty(dir); + } + set_page_dirty(page); + + if (IS_DIRSYNC(dir)) + err = write_one_page(page, 1); + else + unlock_page(page); + + return err; +} + +static void exofs_check_page(struct page *page) +{ + struct inode *dir = page->mapping->host; + unsigned chunk_size = exofs_chunk_size(dir); + char *kaddr = page_address(page); + unsigned offs, rec_len; + unsigned limit = PAGE_CACHE_SIZE; + struct exofs_dir_entry *p; + char *error; + + /* if the page is the last one in the directory */ + if ((dir->i_size >> PAGE_CACHE_SHIFT) == page->index) { + limit = dir->i_size & ~PAGE_CACHE_MASK; + if (limit & (chunk_size - 1)) + goto Ebadsize; + if (!limit) + goto out; + } + for (offs = 0; offs <= limit - EXOFS_DIR_REC_LEN(1); offs += rec_len) { + p = (struct exofs_dir_entry *)(kaddr + offs); + rec_len = le16_to_cpu(p->rec_len); + + if (rec_len < EXOFS_DIR_REC_LEN(1)) + goto Eshort; + if (rec_len & 3) + goto Ealign; + if (rec_len < EXOFS_DIR_REC_LEN(p->name_len)) + goto Enamelen; + if (((offs + rec_len - 1) ^ offs) & ~(chunk_size-1)) + goto Espan; + } + if (offs != limit) + goto Eend; +out: + SetPageChecked(page); + return; + +Ebadsize: + EXOFS_ERR("ERROR [exofs_check_page]: " + "size of directory #%lu is not a multiple of chunk size", + dir->i_ino + ); + goto fail; +Eshort: + error = "rec_len is smaller than minimal"; + goto bad_entry; +Ealign: + error = "unaligned directory entry"; + goto bad_entry; +Enamelen: + error = "rec_len is too small for name_len"; + goto bad_entry; +Espan: + error = "directory entry across blocks"; + goto bad_entry; +bad_entry: + EXOFS_ERR( + "ERROR [exofs_check_page]: bad entry in directory #%lu: %s - " + "offset=%lu, inode=%llu, rec_len=%d, name_len=%d", + dir->i_ino, error, (page->index<<PAGE_CACHE_SHIFT)+offs, + _LLU(le64_to_cpu(p->inode_no)), + rec_len, p->name_len); + goto fail; +Eend: + p = (struct exofs_dir_entry *)(kaddr + offs); + EXOFS_ERR("ERROR [exofs_check_page]: " + "entry in directory #%lu spans the page boundary" + "offset=%lu, inode=%llu", + dir->i_ino, (page->index<<PAGE_CACHE_SHIFT)+offs, + _LLU(le64_to_cpu(p->inode_no))); +fail: + SetPageChecked(page); + SetPageError(page); +} + +static struct page *exofs_get_page(struct inode *dir, unsigned long n) +{ + struct address_space *mapping = dir->i_mapping; + struct page *page = read_mapping_page(mapping, n, NULL); + + if (!IS_ERR(page)) { + kmap(page); + if (!PageChecked(page)) + exofs_check_page(page); + if (PageError(page)) + goto fail; + } + return page; + +fail: + exofs_put_page(page); + return ERR_PTR(-EIO); +} + +static inline int exofs_match(int len, const unsigned char *name, + struct exofs_dir_entry *de) +{ + if (len != de->name_len) + return 0; + if (!de->inode_no) + return 0; + return !memcmp(name, de->name, len); +} + +static inline +struct exofs_dir_entry *exofs_next_entry(struct exofs_dir_entry *p) +{ + return (struct exofs_dir_entry *)((char *)p + le16_to_cpu(p->rec_len)); +} + +static inline unsigned +exofs_validate_entry(char *base, unsigned offset, unsigned mask) +{ + struct exofs_dir_entry *de = (struct exofs_dir_entry *)(base + offset); + struct exofs_dir_entry *p = + (struct exofs_dir_entry *)(base + (offset&mask)); + while ((char *)p < (char *)de) { + if (p->rec_len == 0) + break; + p = exofs_next_entry(p); + } + return (char *)p - base; +} + +static unsigned char exofs_filetype_table[EXOFS_FT_MAX] = { + [EXOFS_FT_UNKNOWN] = DT_UNKNOWN, + [EXOFS_FT_REG_FILE] = DT_REG, + [EXOFS_FT_DIR] = DT_DIR, + [EXOFS_FT_CHRDEV] = DT_CHR, + [EXOFS_FT_BLKDEV] = DT_BLK, + [EXOFS_FT_FIFO] = DT_FIFO, + [EXOFS_FT_SOCK] = DT_SOCK, + [EXOFS_FT_SYMLINK] = DT_LNK, +}; + +#define S_SHIFT 12 +static unsigned char exofs_type_by_mode[S_IFMT >> S_SHIFT] = { + [S_IFREG >> S_SHIFT] = EXOFS_FT_REG_FILE, + [S_IFDIR >> S_SHIFT] = EXOFS_FT_DIR, + [S_IFCHR >> S_SHIFT] = EXOFS_FT_CHRDEV, + [S_IFBLK >> S_SHIFT] = EXOFS_FT_BLKDEV, + [S_IFIFO >> S_SHIFT] = EXOFS_FT_FIFO, + [S_IFSOCK >> S_SHIFT] = EXOFS_FT_SOCK, + [S_IFLNK >> S_SHIFT] = EXOFS_FT_SYMLINK, +}; + +static inline +void exofs_set_de_type(struct exofs_dir_entry *de, struct inode *inode) +{ + mode_t mode = inode->i_mode; + de->file_type = exofs_type_by_mode[(mode & S_IFMT) >> S_SHIFT]; +} + +static int +exofs_readdir(struct file *filp, void *dirent, filldir_t filldir) +{ + loff_t pos = filp->f_pos; + struct inode *inode = filp->f_path.dentry->d_inode; + unsigned int offset = pos & ~PAGE_CACHE_MASK; + unsigned long n = pos >> PAGE_CACHE_SHIFT; + unsigned long npages = dir_pages(inode); + unsigned chunk_mask = ~(exofs_chunk_size(inode)-1); + unsigned char *types = NULL; + int need_revalidate = (filp->f_version != inode->i_version); + + if (pos > inode->i_size - EXOFS_DIR_REC_LEN(1)) + return 0; + + types = exofs_filetype_table; + + for ( ; n < npages; n++, offset = 0) { + char *kaddr, *limit; + struct exofs_dir_entry *de; + struct page *page = exofs_get_page(inode, n); + + if (IS_ERR(page)) { + EXOFS_ERR("ERROR: " + "bad page in #%lu", + inode->i_ino); + filp->f_pos += PAGE_CACHE_SIZE - offset; + return PTR_ERR(page); + } + kaddr = page_address(page); + if (unlikely(need_revalidate)) { + if (offset) { + offset = exofs_validate_entry(kaddr, offset, + chunk_mask); + filp->f_pos = (n<<PAGE_CACHE_SHIFT) + offset; + } + filp->f_version = inode->i_version; + need_revalidate = 0; + } + de = (struct exofs_dir_entry *)(kaddr + offset); + limit = kaddr + exofs_last_byte(inode, n) - + EXOFS_DIR_REC_LEN(1); + for (; (char *)de <= limit; de = exofs_next_entry(de)) { + if (de->rec_len == 0) { + EXOFS_ERR("ERROR: " + "zero-length directory entry"); + exofs_put_page(page); + return -EIO; + } + if (de->inode_no) { + int over; + unsigned char d_type = DT_UNKNOWN; + + if (types && de->file_type < EXOFS_FT_MAX) + d_type = types[de->file_type]; + + offset = (char *)de - kaddr; + over = filldir(dirent, de->name, de->name_len, + (n<<PAGE_CACHE_SHIFT) | offset, + le64_to_cpu(de->inode_no), + d_type); + if (over) { + exofs_put_page(page); + return 0; + } + } + filp->f_pos += le16_to_cpu(de->rec_len); + } + exofs_put_page(page); + } + + return 0; +} + +struct exofs_dir_entry *exofs_find_entry(struct inode *dir, + struct dentry *dentry, struct page **res_page) +{ + const unsigned char *name = dentry->d_name.name; + int namelen = dentry->d_name.len; + unsigned reclen = EXOFS_DIR_REC_LEN(namelen); + unsigned long start, n; + unsigned long npages = dir_pages(dir); + struct page *page = NULL; + struct exofs_i_info *oi = exofs_i(dir); + struct exofs_dir_entry *de; + + if (npages == 0) + goto out; + + *res_page = NULL; + + start = oi->i_dir_start_lookup; + if (start >= npages) + start = 0; + n = start; + do { + char *kaddr; + page = exofs_get_page(dir, n); + if (!IS_ERR(page)) { + kaddr = page_address(page); + de = (struct exofs_dir_entry *) kaddr; + kaddr += exofs_last_byte(dir, n) - reclen; + while ((char *) de <= kaddr) { + if (de->rec_len == 0) { + EXOFS_ERR( + "ERROR: exofs_find_entry: " + "zero-length directory entry"); + exofs_put_page(page); + goto out; + } + if (exofs_match(namelen, name, de)) + goto found; + de = exofs_next_entry(de); + } + exofs_put_page(page); + } + if (++n >= npages) + n = 0; + } while (n != start); +out: + return NULL; + +found: + *res_page = page; + oi->i_dir_start_lookup = n; + return de; +} + +struct exofs_dir_entry *exofs_dotdot(struct inode *dir, struct page **p) +{ + struct page *page = exofs_get_page(dir, 0); + struct exofs_dir_entry *de = NULL; + + if (!IS_ERR(page)) { + de = exofs_next_entry( + (struct exofs_dir_entry *)page_address(page)); + *p = page; + } + return de; +} + +ino_t exofs_parent_ino(struct dentry *child) +{ + struct page *page; + struct exofs_dir_entry *de; + ino_t ino; + + de = exofs_dotdot(child->d_inode, &page); + if (!de) + return 0; + + ino = le64_to_cpu(de->inode_no); + exofs_put_page(page); + return ino; +} + +ino_t exofs_inode_by_name(struct inode *dir, struct dentry *dentry) +{ + ino_t res = 0; + struct exofs_dir_entry *de; + struct page *page; + + de = exofs_find_entry(dir, dentry, &page); + if (de) { + res = le64_to_cpu(de->inode_no); + exofs_put_page(page); + } + return res; +} + +int exofs_set_link(struct inode *dir, struct exofs_dir_entry *de, + struct page *page, struct inode *inode) +{ + loff_t pos = page_offset(page) + + (char *) de - (char *) page_address(page); + unsigned len = le16_to_cpu(de->rec_len); + int err; + + lock_page(page); + err = exofs_write_begin(NULL, page->mapping, pos, len, + AOP_FLAG_UNINTERRUPTIBLE, &page, NULL); + if (err) + EXOFS_ERR("exofs_set_link: exofs_write_begin FAILD => %d\n", + err); + + de->inode_no = cpu_to_le64(inode->i_ino); + exofs_set_de_type(de, inode); + if (likely(!err)) + err = exofs_commit_chunk(page, pos, len); + exofs_put_page(page); + dir->i_mtime = dir->i_ctime = CURRENT_TIME; + mark_inode_dirty(dir); + return err; +} + +int exofs_add_link(struct dentry *dentry, struct inode *inode) +{ + struct inode *dir = dentry->d_parent->d_inode; + const unsigned char *name = dentry->d_name.name; + int namelen = dentry->d_name.len; + unsigned chunk_size = exofs_chunk_size(dir); + unsigned reclen = EXOFS_DIR_REC_LEN(namelen); + unsigned short rec_len, name_len; + struct page *page = NULL; + struct exofs_sb_info *sbi = inode->i_sb->s_fs_info; + struct exofs_dir_entry *de; + unsigned long npages = dir_pages(dir); + unsigned long n; + char *kaddr; + loff_t pos; + int err; + + for (n = 0; n <= npages; n++) { + char *dir_end; + + page = exofs_get_page(dir, n); + err = PTR_ERR(page); + if (IS_ERR(page)) + goto out; + lock_page(page); + kaddr = page_address(page); + dir_end = kaddr + exofs_last_byte(dir, n); + de = (struct exofs_dir_entry *)kaddr; + kaddr += PAGE_CACHE_SIZE - reclen; + while ((char *)de <= kaddr) { + if ((char *)de == dir_end) { + name_len = 0; + rec_len = chunk_size; + de->rec_len = cpu_to_le16(chunk_size); + de->inode_no = 0; + goto got_it; + } + if (de->rec_len == 0) { + EXOFS_ERR("ERROR: exofs_add_link: " + "zero-length directory entry"); + err = -EIO; + goto out_unlock; + } + err = -EEXIST; + if (exofs_match(namelen, name, de)) + goto out_unlock; + name_len = EXOFS_DIR_REC_LEN(de->name_len); + rec_len = le16_to_cpu(de->rec_len); + if (!de->inode_no && rec_len >= reclen) + goto got_it; + if (rec_len >= name_len + reclen) + goto got_it; + de = (struct exofs_dir_entry *) ((char *) de + rec_len); + } + unlock_page(page); + exofs_put_page(page); + } + + EXOFS_ERR("exofs_add_link: BAD dentry=%p or inode=%p", dentry, inode); + return -EINVAL; + +got_it: + pos = page_offset(page) + + (char *)de - (char *)page_address(page); + err = exofs_write_begin(NULL, page->mapping, pos, rec_len, 0, + &page, NULL); + if (err) + goto out_unlock; + if (de->inode_no) { + struct exofs_dir_entry *de1 = + (struct exofs_dir_entry *)((char *)de + name_len); + de1->rec_len = cpu_to_le16(rec_len - name_len); + de->rec_len = cpu_to_le16(name_len); + de = de1; + } + de->name_len = namelen; + memcpy(de->name, name, namelen); + de->inode_no = cpu_to_le64(inode->i_ino); + exofs_set_de_type(de, inode); + err = exofs_commit_chunk(page, pos, rec_len); + dir->i_mtime = dir->i_ctime = CURRENT_TIME; + mark_inode_dirty(dir); + sbi->s_numfiles++; + +out_put: + exofs_put_page(page); +out: + return err; +out_unlock: + unlock_page(page); + goto out_put; +} + +int exofs_delete_entry(struct exofs_dir_entry *dir, struct page *page) +{ + struct address_space *mapping = page->mapping; + struct inode *inode = mapping->host; + struct exofs_sb_info *sbi = inode->i_sb->s_fs_info; + char *kaddr = page_address(page); + unsigned from = ((char *)dir - kaddr) & ~(exofs_chunk_size(inode)-1); + unsigned to = ((char *)dir - kaddr) + le16_to_cpu(dir->rec_len); + loff_t pos; + struct exofs_dir_entry *pde = NULL; + struct exofs_dir_entry *de = (struct exofs_dir_entry *) (kaddr + from); + int err; + + while (de < dir) { + if (de->rec_len == 0) { + EXOFS_ERR("ERROR: exofs_delete_entry:" + "zero-length directory entry"); + err = -EIO; + goto out; + } + pde = de; + de = exofs_next_entry(de); + } + if (pde) + from = (char *)pde - (char *)page_address(page); + pos = page_offset(page) + from; + lock_page(page); + err = exofs_write_begin(NULL, page->mapping, pos, to - from, 0, + &page, NULL); + if (err) + EXOFS_ERR("exofs_delete_entry: exofs_write_begin FAILD => %d\n", + err); + if (pde) + pde->rec_len = cpu_to_le16(to - from); + dir->inode_no = 0; + if (likely(!err)) + err = exofs_commit_chunk(page, pos, to - from); + inode->i_ctime = inode->i_mtime = CURRENT_TIME; + mark_inode_dirty(inode); + sbi->s_numfiles--; +out: + exofs_put_page(page); + return err; +} + +/* kept aligned on 4 bytes */ +#define THIS_DIR ".\0\0" +#define PARENT_DIR "..\0" + +int exofs_make_empty(struct inode *inode, struct inode *parent) +{ + struct address_space *mapping = inode->i_mapping; + struct page *page = grab_cache_page(mapping, 0); + unsigned chunk_size = exofs_chunk_size(inode); + struct exofs_dir_entry *de; + int err; + void *kaddr; + + if (!page) + return -ENOMEM; + + err = exofs_write_begin(NULL, page->mapping, 0, chunk_size, 0, + &page, NULL); + if (err) { + unlock_page(page); + goto fail; + } + + kaddr = kmap_atomic(page, KM_USER0); + de = (struct exofs_dir_entry *)kaddr; + de->name_len = 1; + de->rec_len = cpu_to_le16(EXOFS_DIR_REC_LEN(1)); + memcpy(de->name, THIS_DIR, sizeof(THIS_DIR)); + de->inode_no = cpu_to_le64(inode->i_ino); + exofs_set_de_type(de, inode); + + de = (struct exofs_dir_entry *)(kaddr + EXOFS_DIR_REC_LEN(1)); + de->name_len = 2; + de->rec_len = cpu_to_le16(chunk_size - EXOFS_DIR_REC_LEN(1)); + de->inode_no = cpu_to_le64(parent->i_ino); + memcpy(de->name, PARENT_DIR, sizeof(PARENT_DIR)); + exofs_set_de_type(de, inode); + kunmap_atomic(page, KM_USER0); + err = exofs_commit_chunk(page, 0, chunk_size); +fail: + page_cache_release(page); + return err; +} + +int exofs_empty_dir(struct inode *inode) +{ + struct page *page = NULL; + unsigned long i, npages = dir_pages(inode); + + for (i = 0; i < npages; i++) { + char *kaddr; + struct exofs_dir_entry *de; + page = exofs_get_page(inode, i); + + if (IS_ERR(page)) + continue; + + kaddr = page_address(page); + de = (struct exofs_dir_entry *)kaddr; + kaddr += exofs_last_byte(inode, i) - EXOFS_DIR_REC_LEN(1); + + while ((char *)de <= kaddr) { + if (de->rec_len == 0) { + EXOFS_ERR("ERROR: exofs_empty_dir: " + "zero-length directory entry" + "kaddr=%p, de=%p\n", kaddr, de); + goto not_empty; + } + if (de->inode_no != 0) { + /* check for . and .. */ + if (de->name[0] != '.') + goto not_empty; + if (de->name_len > 2) + goto not_empty; + if (de->name_len < 2) { + if (le64_to_cpu(de->inode_no) != + inode->i_ino) + goto not_empty; + } else if (de->name[1] != '.') + goto not_empty; + } + de = exofs_next_entry(de); + } + exofs_put_page(page); + } + return 1; + +not_empty: + exofs_put_page(page); + return 0; +} + +const struct file_operations exofs_dir_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .readdir = exofs_readdir, +}; diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h new file mode 100644 index 000000000000..0fd4c7859679 --- /dev/null +++ b/fs/exofs/exofs.h @@ -0,0 +1,180 @@ +/* + * Copyright (C) 2005, 2006 + * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com) + * Copyright (C) 2005, 2006 + * International Business Machines + * Copyright (C) 2008, 2009 + * Boaz Harrosh <bharrosh@panasas.com> + * + * Copyrights for code taken from ext2: + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * from + * linux/fs/minix/inode.c + * Copyright (C) 1991, 1992 Linus Torvalds + * + * This file is part of exofs. + * + * exofs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation. Since it is based on ext2, and the only + * valid version of GPL for the Linux kernel is version 2, the only valid + * version of GPL for exofs is version 2. + * + * exofs is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with exofs; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <linux/fs.h> +#include <linux/time.h> +#include "common.h" + +#ifndef __EXOFS_H__ +#define __EXOFS_H__ + +#define EXOFS_ERR(fmt, a...) printk(KERN_ERR "exofs: " fmt, ##a) + +#ifdef CONFIG_EXOFS_DEBUG +#define EXOFS_DBGMSG(fmt, a...) \ + printk(KERN_NOTICE "exofs @%s:%d: " fmt, __func__, __LINE__, ##a) +#else +#define EXOFS_DBGMSG(fmt, a...) \ + do { if (0) printk(fmt, ##a); } while (0) +#endif + +/* u64 has problems with printk this will cast it to unsigned long long */ +#define _LLU(x) (unsigned long long)(x) + +/* + * our extension to the in-memory superblock + */ +struct exofs_sb_info { + struct osd_dev *s_dev; /* returned by get_osd_dev */ + osd_id s_pid; /* partition ID of file system*/ + int s_timeout; /* timeout for OSD operations */ + uint64_t s_nextid; /* highest object ID used */ + uint32_t s_numfiles; /* number of files on fs */ + spinlock_t s_next_gen_lock; /* spinlock for gen # update */ + u32 s_next_generation; /* next gen # to use */ + atomic_t s_curr_pending; /* number of pending commands */ + uint8_t s_cred[OSD_CAP_LEN]; /* all-powerful credential */ +}; + +/* + * our extension to the in-memory inode + */ +struct exofs_i_info { + unsigned long i_flags; /* various atomic flags */ + uint32_t i_data[EXOFS_IDATA];/*short symlink names and device #s*/ + uint32_t i_dir_start_lookup; /* which page to start lookup */ + wait_queue_head_t i_wq; /* wait queue for inode */ + uint64_t i_commit_size; /* the object's written length */ + uint8_t i_cred[OSD_CAP_LEN];/* all-powerful credential */ + struct inode vfs_inode; /* normal in-memory inode */ +}; + +/* + * our inode flags + */ +#define OBJ_2BCREATED 0 /* object will be created soon*/ +#define OBJ_CREATED 1 /* object has been created on the osd*/ + +static inline int obj_2bcreated(struct exofs_i_info *oi) +{ + return test_bit(OBJ_2BCREATED, &oi->i_flags); +} + +static inline void set_obj_2bcreated(struct exofs_i_info *oi) +{ + set_bit(OBJ_2BCREATED, &oi->i_flags); +} + +static inline int obj_created(struct exofs_i_info *oi) +{ + return test_bit(OBJ_CREATED, &oi->i_flags); +} + +static inline void set_obj_created(struct exofs_i_info *oi) +{ + set_bit(OBJ_CREATED, &oi->i_flags); +} + +int __exofs_wait_obj_created(struct exofs_i_info *oi); +static inline int wait_obj_created(struct exofs_i_info *oi) +{ + if (likely(obj_created(oi))) + return 0; + + return __exofs_wait_obj_created(oi); +} + +/* + * get to our inode from the vfs inode + */ +static inline struct exofs_i_info *exofs_i(struct inode *inode) +{ + return container_of(inode, struct exofs_i_info, vfs_inode); +} + +/* + * Maximum count of links to a file + */ +#define EXOFS_LINK_MAX 32000 + +/************************* + * function declarations * + *************************/ +/* inode.c */ +void exofs_truncate(struct inode *inode); +int exofs_setattr(struct dentry *, struct iattr *); +int exofs_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata); +extern struct inode *exofs_iget(struct super_block *, unsigned long); +struct inode *exofs_new_inode(struct inode *, int); +extern int exofs_write_inode(struct inode *, int); +extern void exofs_delete_inode(struct inode *); + +/* dir.c: */ +int exofs_add_link(struct dentry *, struct inode *); +ino_t exofs_inode_by_name(struct inode *, struct dentry *); +int exofs_delete_entry(struct exofs_dir_entry *, struct page *); +int exofs_make_empty(struct inode *, struct inode *); +struct exofs_dir_entry *exofs_find_entry(struct inode *, struct dentry *, + struct page **); +int exofs_empty_dir(struct inode *); +struct exofs_dir_entry *exofs_dotdot(struct inode *, struct page **); +ino_t exofs_parent_ino(struct dentry *child); +int exofs_set_link(struct inode *, struct exofs_dir_entry *, struct page *, + struct inode *); + +/********************* + * operation vectors * + *********************/ +/* dir.c: */ +extern const struct file_operations exofs_dir_operations; + +/* file.c */ +extern const struct inode_operations exofs_file_inode_operations; +extern const struct file_operations exofs_file_operations; + +/* inode.c */ +extern const struct address_space_operations exofs_aops; + +/* namei.c */ +extern const struct inode_operations exofs_dir_inode_operations; +extern const struct inode_operations exofs_special_inode_operations; + +/* symlink.c */ +extern const struct inode_operations exofs_symlink_inode_operations; +extern const struct inode_operations exofs_fast_symlink_inode_operations; + +#endif diff --git a/fs/exofs/file.c b/fs/exofs/file.c new file mode 100644 index 000000000000..6ed7fe484752 --- /dev/null +++ b/fs/exofs/file.c @@ -0,0 +1,87 @@ +/* + * Copyright (C) 2005, 2006 + * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com) + * Copyright (C) 2005, 2006 + * International Business Machines + * Copyright (C) 2008, 2009 + * Boaz Harrosh <bharrosh@panasas.com> + * + * Copyrights for code taken from ext2: + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * from + * linux/fs/minix/inode.c + * Copyright (C) 1991, 1992 Linus Torvalds + * + * This file is part of exofs. + * + * exofs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation. Since it is based on ext2, and the only + * valid version of GPL for the Linux kernel is version 2, the only valid + * version of GPL for exofs is version 2. + * + * exofs is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with exofs; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <linux/buffer_head.h> + +#include "exofs.h" + +static int exofs_release_file(struct inode *inode, struct file *filp) +{ + return 0; +} + +static int exofs_file_fsync(struct file *filp, struct dentry *dentry, + int datasync) +{ + int ret; + struct address_space *mapping = filp->f_mapping; + + ret = filemap_write_and_wait(mapping); + if (ret) + return ret; + + /*Note: file_fsync below also calles sync_blockdev, which is a no-op + * for exofs, but other then that it does sync_inode and + * sync_superblock which is what we need here. + */ + return file_fsync(filp, dentry, datasync); +} + +static int exofs_flush(struct file *file, fl_owner_t id) +{ + exofs_file_fsync(file, file->f_path.dentry, 1); + /* TODO: Flush the OSD target */ + return 0; +} + +const struct file_operations exofs_file_operations = { + .llseek = generic_file_llseek, + .read = do_sync_read, + .write = do_sync_write, + .aio_read = generic_file_aio_read, + .aio_write = generic_file_aio_write, + .mmap = generic_file_mmap, + .open = generic_file_open, + .release = exofs_release_file, + .fsync = exofs_file_fsync, + .flush = exofs_flush, + .splice_read = generic_file_splice_read, + .splice_write = generic_file_splice_write, +}; + +const struct inode_operations exofs_file_inode_operations = { + .truncate = exofs_truncate, + .setattr = exofs_setattr, +}; diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c new file mode 100644 index 000000000000..ba8d9fab4693 --- /dev/null +++ b/fs/exofs/inode.c @@ -0,0 +1,1303 @@ +/* + * Copyright (C) 2005, 2006 + * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com) + * Copyright (C) 2005, 2006 + * International Business Machines + * Copyright (C) 2008, 2009 + * Boaz Harrosh <bharrosh@panasas.com> + * + * Copyrights for code taken from ext2: + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * from + * linux/fs/minix/inode.c + * Copyright (C) 1991, 1992 Linus Torvalds + * + * This file is part of exofs. + * + * exofs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation. Since it is based on ext2, and the only + * valid version of GPL for the Linux kernel is version 2, the only valid + * version of GPL for exofs is version 2. + * + * exofs is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with exofs; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <linux/writeback.h> +#include <linux/buffer_head.h> +#include <scsi/scsi_device.h> + +#include "exofs.h" + +#ifdef CONFIG_EXOFS_DEBUG +# define EXOFS_DEBUG_OBJ_ISIZE 1 +#endif + +struct page_collect { + struct exofs_sb_info *sbi; + struct request_queue *req_q; + struct inode *inode; + unsigned expected_pages; + + struct bio *bio; + unsigned nr_pages; + unsigned long length; + loff_t pg_first; /* keep 64bit also in 32-arches */ +}; + +static void _pcol_init(struct page_collect *pcol, unsigned expected_pages, + struct inode *inode) +{ + struct exofs_sb_info *sbi = inode->i_sb->s_fs_info; + struct request_queue *req_q = sbi->s_dev->scsi_device->request_queue; + + pcol->sbi = sbi; + pcol->req_q = req_q; + pcol->inode = inode; + pcol->expected_pages = expected_pages; + + pcol->bio = NULL; + pcol->nr_pages = 0; + pcol->length = 0; + pcol->pg_first = -1; + + EXOFS_DBGMSG("_pcol_init ino=0x%lx expected_pages=%u\n", inode->i_ino, + expected_pages); +} + +static void _pcol_reset(struct page_collect *pcol) +{ + pcol->expected_pages -= min(pcol->nr_pages, pcol->expected_pages); + + pcol->bio = NULL; + pcol->nr_pages = 0; + pcol->length = 0; + pcol->pg_first = -1; + EXOFS_DBGMSG("_pcol_reset ino=0x%lx expected_pages=%u\n", + pcol->inode->i_ino, pcol->expected_pages); + + /* this is probably the end of the loop but in writes + * it might not end here. don't be left with nothing + */ + if (!pcol->expected_pages) + pcol->expected_pages = 128; +} + +static int pcol_try_alloc(struct page_collect *pcol) +{ + int pages = min_t(unsigned, pcol->expected_pages, BIO_MAX_PAGES); + + for (; pages; pages >>= 1) { + pcol->bio = bio_alloc(GFP_KERNEL, pages); + if (likely(pcol->bio)) + return 0; + } + + EXOFS_ERR("Failed to kcalloc expected_pages=%u\n", + pcol->expected_pages); + return -ENOMEM; +} + +static void pcol_free(struct page_collect *pcol) +{ + bio_put(pcol->bio); + pcol->bio = NULL; +} + +static int pcol_add_page(struct page_collect *pcol, struct page *page, + unsigned len) +{ + int added_len = bio_add_pc_page(pcol->req_q, pcol->bio, page, len, 0); + if (unlikely(len != added_len)) + return -ENOMEM; + + ++pcol->nr_pages; + pcol->length += len; + return 0; +} + +static int update_read_page(struct page *page, int ret) +{ + if (ret == 0) { + /* Everything is OK */ + SetPageUptodate(page); + if (PageError(page)) + ClearPageError(page); + } else if (ret == -EFAULT) { + /* In this case we were trying to read something that wasn't on + * disk yet - return a page full of zeroes. This should be OK, + * because the object should be empty (if there was a write + * before this read, the read would be waiting with the page + * locked */ + clear_highpage(page); + + SetPageUptodate(page); + if (PageError(page)) + ClearPageError(page); + ret = 0; /* recovered error */ + EXOFS_DBGMSG("recovered read error\n"); + } else /* Error */ + SetPageError(page); + + return ret; +} + +static void update_write_page(struct page *page, int ret) +{ + if (ret) { + mapping_set_error(page->mapping, ret); + SetPageError(page); + } + end_page_writeback(page); +} + +/* Called at the end of reads, to optionally unlock pages and update their + * status. + */ +static int __readpages_done(struct osd_request *or, struct page_collect *pcol, + bool do_unlock) +{ + struct bio_vec *bvec; + int i; + u64 resid; + u64 good_bytes; + u64 length = 0; + int ret = exofs_check_ok_resid(or, &resid, NULL); + + osd_end_request(or); + + if (likely(!ret)) + good_bytes = pcol->length; + else if (!resid) + good_bytes = 0; + else + good_bytes = pcol->length - resid; + + EXOFS_DBGMSG("readpages_done(0x%lx) good_bytes=0x%llx" + " length=0x%lx nr_pages=%u\n", + pcol->inode->i_ino, _LLU(good_bytes), pcol->length, + pcol->nr_pages); + + __bio_for_each_segment(bvec, pcol->bio, i, 0) { + struct page *page = bvec->bv_page; + struct inode *inode = page->mapping->host; + int page_stat; + + if (inode != pcol->inode) + continue; /* osd might add more pages at end */ + + if (likely(length < good_bytes)) + page_stat = 0; + else + page_stat = ret; + + EXOFS_DBGMSG(" readpages_done(0x%lx, 0x%lx) %s\n", + inode->i_ino, page->index, + page_stat ? "bad_bytes" : "good_bytes"); + + ret = update_read_page(page, page_stat); + if (do_unlock) + unlock_page(page); + length += bvec->bv_len; + } + + pcol_free(pcol); + EXOFS_DBGMSG("readpages_done END\n"); + return ret; +} + +/* callback of async reads */ +static void readpages_done(struct osd_request *or, void *p) +{ + struct page_collect *pcol = p; + + __readpages_done(or, pcol, true); + atomic_dec(&pcol->sbi->s_curr_pending); + kfree(p); +} + +static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw) +{ + struct bio_vec *bvec; + int i; + + __bio_for_each_segment(bvec, pcol->bio, i, 0) { + struct page *page = bvec->bv_page; + + if (rw == READ) + update_read_page(page, ret); + else + update_write_page(page, ret); + + unlock_page(page); + } + pcol_free(pcol); +} + +static int read_exec(struct page_collect *pcol, bool is_sync) +{ + struct exofs_i_info *oi = exofs_i(pcol->inode); + struct osd_obj_id obj = {pcol->sbi->s_pid, + pcol->inode->i_ino + EXOFS_OBJ_OFF}; + struct osd_request *or = NULL; + struct page_collect *pcol_copy = NULL; + loff_t i_start = pcol->pg_first << PAGE_CACHE_SHIFT; + int ret; + + if (!pcol->bio) + return 0; + + /* see comment in _readpage() about sync reads */ + WARN_ON(is_sync && (pcol->nr_pages != 1)); + + or = osd_start_request(pcol->sbi->s_dev, GFP_KERNEL); + if (unlikely(!or)) { + ret = -ENOMEM; + goto err; + } + + osd_req_read(or, &obj, pcol->bio, i_start); + + if (is_sync) { + exofs_sync_op(or, pcol->sbi->s_timeout, oi->i_cred); + return __readpages_done(or, pcol, false); + } + + pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL); + if (!pcol_copy) { + ret = -ENOMEM; + goto err; + } + + *pcol_copy = *pcol; + ret = exofs_async_op(or, readpages_done, pcol_copy, oi->i_cred); + if (unlikely(ret)) + goto err; + + atomic_inc(&pcol->sbi->s_curr_pending); + + EXOFS_DBGMSG("read_exec obj=0x%llx start=0x%llx length=0x%lx\n", + obj.id, _LLU(i_start), pcol->length); + + /* pages ownership was passed to pcol_copy */ + _pcol_reset(pcol); + return 0; + +err: + if (!is_sync) + _unlock_pcol_pages(pcol, ret, READ); + kfree(pcol_copy); + if (or) + osd_end_request(or); + return ret; +} + +/* readpage_strip is called either directly from readpage() or by the VFS from + * within read_cache_pages(), to add one more page to be read. It will try to + * collect as many contiguous pages as posible. If a discontinuity is + * encountered, or it runs out of resources, it will submit the previous segment + * and will start a new collection. Eventually caller must submit the last + * segment if present. + */ +static int readpage_strip(void *data, struct page *page) +{ + struct page_collect *pcol = data; + struct inode *inode = pcol->inode; + struct exofs_i_info *oi = exofs_i(inode); + loff_t i_size = i_size_read(inode); + pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; + size_t len; + int ret; + + /* FIXME: Just for debugging, will be removed */ + if (PageUptodate(page)) + EXOFS_ERR("PageUptodate(0x%lx, 0x%lx)\n", pcol->inode->i_ino, + page->index); + + if (page->index < end_index) + len = PAGE_CACHE_SIZE; + else if (page->index == end_index) + len = i_size & ~PAGE_CACHE_MASK; + else + len = 0; + + if (!len || !obj_created(oi)) { + /* this will be out of bounds, or doesn't exist yet. + * Current page is cleared and the request is split + */ + clear_highpage(page); + + SetPageUptodate(page); + if (PageError(page)) + ClearPageError(page); + + unlock_page(page); + EXOFS_DBGMSG("readpage_strip(0x%lx, 0x%lx) empty page," + " splitting\n", inode->i_ino, page->index); + + return read_exec(pcol, false); + } + +try_again: + + if (unlikely(pcol->pg_first == -1)) { + pcol->pg_first = page->index; + } else if (unlikely((pcol->pg_first + pcol->nr_pages) != + page->index)) { + /* Discontinuity detected, split the request */ + ret = read_exec(pcol, false); + if (unlikely(ret)) + goto fail; + goto try_again; + } + + if (!pcol->bio) { + ret = pcol_try_alloc(pcol); + if (unlikely(ret)) + goto fail; + } + + if (len != PAGE_CACHE_SIZE) + zero_user(page, len, PAGE_CACHE_SIZE - len); + + EXOFS_DBGMSG(" readpage_strip(0x%lx, 0x%lx) len=0x%zx\n", + inode->i_ino, page->index, len); + + ret = pcol_add_page(pcol, page, len); + if (ret) { + EXOFS_DBGMSG("Failed pcol_add_page pages[i]=%p " + "this_len=0x%zx nr_pages=%u length=0x%lx\n", + page, len, pcol->nr_pages, pcol->length); + + /* split the request, and start again with current page */ + ret = read_exec(pcol, false); + if (unlikely(ret)) + goto fail; + + goto try_again; + } + + return 0; + +fail: + /* SetPageError(page); ??? */ + unlock_page(page); + return ret; +} + +static int exofs_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + struct page_collect pcol; + int ret; + + _pcol_init(&pcol, nr_pages, mapping->host); + + ret = read_cache_pages(mapping, pages, readpage_strip, &pcol); + if (ret) { + EXOFS_ERR("read_cache_pages => %d\n", ret); + return ret; + } + + return read_exec(&pcol, false); +} + +static int _readpage(struct page *page, bool is_sync) +{ + struct page_collect pcol; + int ret; + + _pcol_init(&pcol, 1, page->mapping->host); + + /* readpage_strip might call read_exec(,async) inside at several places + * but this is safe for is_async=0 since read_exec will not do anything + * when we have a single page. + */ + ret = readpage_strip(&pcol, page); + if (ret) { + EXOFS_ERR("_readpage => %d\n", ret); + return ret; + } + + return read_exec(&pcol, is_sync); +} + +/* + * We don't need the file + */ +static int exofs_readpage(struct file *file, struct page *page) +{ + return _readpage(page, false); +} + +/* Callback for osd_write. All writes are asynchronouse */ +static void writepages_done(struct osd_request *or, void *p) +{ + struct page_collect *pcol = p; + struct bio_vec *bvec; + int i; + u64 resid; + u64 good_bytes; + u64 length = 0; + + int ret = exofs_check_ok_resid(or, NULL, &resid); + + osd_end_request(or); + atomic_dec(&pcol->sbi->s_curr_pending); + + if (likely(!ret)) + good_bytes = pcol->length; + else if (!resid) + good_bytes = 0; + else + good_bytes = pcol->length - resid; + + EXOFS_DBGMSG("writepages_done(0x%lx) good_bytes=0x%llx" + " length=0x%lx nr_pages=%u\n", + pcol->inode->i_ino, _LLU(good_bytes), pcol->length, + pcol->nr_pages); + + __bio_for_each_segment(bvec, pcol->bio, i, 0) { + struct page *page = bvec->bv_page; + struct inode *inode = page->mapping->host; + int page_stat; + + if (inode != pcol->inode) + continue; /* osd might add more pages to a bio */ + + if (likely(length < good_bytes)) + page_stat = 0; + else + page_stat = ret; + + update_write_page(page, page_stat); + unlock_page(page); + EXOFS_DBGMSG(" writepages_done(0x%lx, 0x%lx) status=%d\n", + inode->i_ino, page->index, page_stat); + + length += bvec->bv_len; + } + + pcol_free(pcol); + kfree(pcol); + EXOFS_DBGMSG("writepages_done END\n"); +} + +static int write_exec(struct page_collect *pcol) +{ + struct exofs_i_info *oi = exofs_i(pcol->inode); + struct osd_obj_id obj = {pcol->sbi->s_pid, + pcol->inode->i_ino + EXOFS_OBJ_OFF}; + struct osd_request *or = NULL; + struct page_collect *pcol_copy = NULL; + loff_t i_start = pcol->pg_first << PAGE_CACHE_SHIFT; + int ret; + + if (!pcol->bio) + return 0; + + or = osd_start_request(pcol->sbi->s_dev, GFP_KERNEL); + if (unlikely(!or)) { + EXOFS_ERR("write_exec: Faild to osd_start_request()\n"); + ret = -ENOMEM; + goto err; + } + + pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL); + if (!pcol_copy) { + EXOFS_ERR("write_exec: Faild to kmalloc(pcol)\n"); + ret = -ENOMEM; + goto err; + } + + *pcol_copy = *pcol; + + osd_req_write(or, &obj, pcol_copy->bio, i_start); + ret = exofs_async_op(or, writepages_done, pcol_copy, oi->i_cred); + if (unlikely(ret)) { + EXOFS_ERR("write_exec: exofs_async_op() Faild\n"); + goto err; + } + + atomic_inc(&pcol->sbi->s_curr_pending); + EXOFS_DBGMSG("write_exec(0x%lx, 0x%llx) start=0x%llx length=0x%lx\n", + pcol->inode->i_ino, pcol->pg_first, _LLU(i_start), + pcol->length); + /* pages ownership was passed to pcol_copy */ + _pcol_reset(pcol); + return 0; + +err: + _unlock_pcol_pages(pcol, ret, WRITE); + kfree(pcol_copy); + if (or) + osd_end_request(or); + return ret; +} + +/* writepage_strip is called either directly from writepage() or by the VFS from + * within write_cache_pages(), to add one more page to be written to storage. + * It will try to collect as many contiguous pages as possible. If a + * discontinuity is encountered or it runs out of resources it will submit the + * previous segment and will start a new collection. + * Eventually caller must submit the last segment if present. + */ +static int writepage_strip(struct page *page, + struct writeback_control *wbc_unused, void *data) +{ + struct page_collect *pcol = data; + struct inode *inode = pcol->inode; + struct exofs_i_info *oi = exofs_i(inode); + loff_t i_size = i_size_read(inode); + pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; + size_t len; + int ret; + + BUG_ON(!PageLocked(page)); + + ret = wait_obj_created(oi); + if (unlikely(ret)) + goto fail; + + if (page->index < end_index) + /* in this case, the page is within the limits of the file */ + len = PAGE_CACHE_SIZE; + else { + len = i_size & ~PAGE_CACHE_MASK; + + if (page->index > end_index || !len) { + /* in this case, the page is outside the limits + * (truncate in progress) + */ + ret = write_exec(pcol); + if (unlikely(ret)) + goto fail; + if (PageError(page)) + ClearPageError(page); + unlock_page(page); + return 0; + } + } + +try_again: + + if (unlikely(pcol->pg_first == -1)) { + pcol->pg_first = page->index; + } else if (unlikely((pcol->pg_first + pcol->nr_pages) != + page->index)) { + /* Discontinuity detected, split the request */ + ret = write_exec(pcol); + if (unlikely(ret)) + goto fail; + goto try_again; + } + + if (!pcol->bio) { + ret = pcol_try_alloc(pcol); + if (unlikely(ret)) + goto fail; + } + + EXOFS_DBGMSG(" writepage_strip(0x%lx, 0x%lx) len=0x%zx\n", + inode->i_ino, page->index, len); + + ret = pcol_add_page(pcol, page, len); + if (unlikely(ret)) { + EXOFS_DBGMSG("Failed pcol_add_page " + "nr_pages=%u total_length=0x%lx\n", + pcol->nr_pages, pcol->length); + + /* split the request, next loop will start again */ + ret = write_exec(pcol); + if (unlikely(ret)) { + EXOFS_DBGMSG("write_exec faild => %d", ret); + goto fail; + } + + goto try_again; + } + + BUG_ON(PageWriteback(page)); + set_page_writeback(page); + + return 0; + +fail: + set_bit(AS_EIO, &page->mapping->flags); + unlock_page(page); + return ret; +} + +static int exofs_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct page_collect pcol; + long start, end, expected_pages; + int ret; + + start = wbc->range_start >> PAGE_CACHE_SHIFT; + end = (wbc->range_end == LLONG_MAX) ? + start + mapping->nrpages : + wbc->range_end >> PAGE_CACHE_SHIFT; + + if (start || end) + expected_pages = min(end - start + 1, 32L); + else + expected_pages = mapping->nrpages; + + EXOFS_DBGMSG("inode(0x%lx) wbc->start=0x%llx wbc->end=0x%llx" + " m->nrpages=%lu start=0x%lx end=0x%lx\n", + mapping->host->i_ino, wbc->range_start, wbc->range_end, + mapping->nrpages, start, end); + + _pcol_init(&pcol, expected_pages, mapping->host); + + ret = write_cache_pages(mapping, wbc, writepage_strip, &pcol); + if (ret) { + EXOFS_ERR("write_cache_pages => %d\n", ret); + return ret; + } + + return write_exec(&pcol); +} + +static int exofs_writepage(struct page *page, struct writeback_control *wbc) +{ + struct page_collect pcol; + int ret; + + _pcol_init(&pcol, 1, page->mapping->host); + + ret = writepage_strip(page, NULL, &pcol); + if (ret) { + EXOFS_ERR("exofs_writepage => %d\n", ret); + return ret; + } + + return write_exec(&pcol); +} + +int exofs_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + int ret = 0; + struct page *page; + + page = *pagep; + if (page == NULL) { + ret = simple_write_begin(file, mapping, pos, len, flags, pagep, + fsdata); + if (ret) { + EXOFS_DBGMSG("simple_write_begin faild\n"); + return ret; + } + + page = *pagep; + } + + /* read modify write */ + if (!PageUptodate(page) && (len != PAGE_CACHE_SIZE)) { + ret = _readpage(page, true); + if (ret) { + /*SetPageError was done by _readpage. Is it ok?*/ + unlock_page(page); + EXOFS_DBGMSG("__readpage_filler faild\n"); + } + } + + return ret; +} + +static int exofs_write_begin_export(struct file *file, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + *pagep = NULL; + + return exofs_write_begin(file, mapping, pos, len, flags, pagep, + fsdata); +} + +const struct address_space_operations exofs_aops = { + .readpage = exofs_readpage, + .readpages = exofs_readpages, + .writepage = exofs_writepage, + .writepages = exofs_writepages, + .write_begin = exofs_write_begin_export, + .write_end = simple_write_end, +}; + +/****************************************************************************** + * INODE OPERATIONS + *****************************************************************************/ + +/* + * Test whether an inode is a fast symlink. + */ +static inline int exofs_inode_is_fast_symlink(struct inode *inode) +{ + struct exofs_i_info *oi = exofs_i(inode); + + return S_ISLNK(inode->i_mode) && (oi->i_data[0] != 0); +} + +/* + * get_block_t - Fill in a buffer_head + * An OSD takes care of block allocation so we just fake an allocation by + * putting in the inode's sector_t in the buffer_head. + * TODO: What about the case of create==0 and @iblock does not exist in the + * object? + */ +static int exofs_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + map_bh(bh_result, inode->i_sb, iblock); + return 0; +} + +const struct osd_attr g_attr_logical_length = ATTR_DEF( + OSD_APAGE_OBJECT_INFORMATION, OSD_ATTR_OI_LOGICAL_LENGTH, 8); + +/* + * Truncate a file to the specified size - all we have to do is set the size + * attribute. We make sure the object exists first. + */ +void exofs_truncate(struct inode *inode) +{ + struct exofs_sb_info *sbi = inode->i_sb->s_fs_info; + struct exofs_i_info *oi = exofs_i(inode); + struct osd_obj_id obj = {sbi->s_pid, inode->i_ino + EXOFS_OBJ_OFF}; + struct osd_request *or; + struct osd_attr attr; + loff_t isize = i_size_read(inode); + __be64 newsize; + int ret; + + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) + || S_ISLNK(inode->i_mode))) + return; + if (exofs_inode_is_fast_symlink(inode)) + return; + if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) + return; + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + + nobh_truncate_page(inode->i_mapping, isize, exofs_get_block); + + or = osd_start_request(sbi->s_dev, GFP_KERNEL); + if (unlikely(!or)) { + EXOFS_ERR("ERROR: exofs_truncate: osd_start_request failed\n"); + goto fail; + } + + osd_req_set_attributes(or, &obj); + + newsize = cpu_to_be64((u64)isize); + attr = g_attr_logical_length; + attr.val_ptr = &newsize; + osd_req_add_set_attr_list(or, &attr, 1); + + /* if we are about to truncate an object, and it hasn't been + * created yet, wait + */ + if (unlikely(wait_obj_created(oi))) + goto fail; + + ret = exofs_sync_op(or, sbi->s_timeout, oi->i_cred); + osd_end_request(or); + if (ret) + goto fail; + +out: + mark_inode_dirty(inode); + return; +fail: + make_bad_inode(inode); + goto out; +} + +/* + * Set inode attributes - just call generic functions. + */ +int exofs_setattr(struct dentry *dentry, struct iattr *iattr) +{ + struct inode *inode = dentry->d_inode; + int error; + + error = inode_change_ok(inode, iattr); + if (error) + return error; + + error = inode_setattr(inode, iattr); + return error; +} + +/* + * Read an inode from the OSD, and return it as is. We also return the size + * attribute in the 'sanity' argument if we got compiled with debugging turned + * on. + */ +static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi, + struct exofs_fcb *inode, uint64_t *sanity) +{ + struct exofs_sb_info *sbi = sb->s_fs_info; + struct osd_request *or; + struct osd_attr attr; + struct osd_obj_id obj = {sbi->s_pid, + oi->vfs_inode.i_ino + EXOFS_OBJ_OFF}; + int ret; + + exofs_make_credential(oi->i_cred, &obj); + + or = osd_start_request(sbi->s_dev, GFP_KERNEL); + if (unlikely(!or)) { + EXOFS_ERR("exofs_get_inode: osd_start_request failed.\n"); + return -ENOMEM; + } + osd_req_get_attributes(or, &obj); + + /* we need the inode attribute */ + osd_req_add_get_attr_list(or, &g_attr_inode_data, 1); + +#ifdef EXOFS_DEBUG_OBJ_ISIZE + /* we get the size attributes to do a sanity check */ + osd_req_add_get_attr_list(or, &g_attr_logical_length, 1); +#endif + + ret = exofs_sync_op(or, sbi->s_timeout, oi->i_cred); + if (ret) + goto out; + + attr = g_attr_inode_data; + ret = extract_attr_from_req(or, &attr); + if (ret) { + EXOFS_ERR("exofs_get_inode: extract_attr_from_req failed\n"); + goto out; + } + + WARN_ON(attr.len != EXOFS_INO_ATTR_SIZE); + memcpy(inode, attr.val_ptr, EXOFS_INO_ATTR_SIZE); + +#ifdef EXOFS_DEBUG_OBJ_ISIZE + attr = g_attr_logical_length; + ret = extract_attr_from_req(or, &attr); + if (ret) { + EXOFS_ERR("ERROR: extract attr from or failed\n"); + goto out; + } + *sanity = get_unaligned_be64(attr.val_ptr); +#endif + +out: + osd_end_request(or); + return ret; +} + +/* + * Fill in an inode read from the OSD and set it up for use + */ +struct inode *exofs_iget(struct super_block *sb, unsigned long ino) +{ + struct exofs_i_info *oi; + struct exofs_fcb fcb; + struct inode *inode; + uint64_t uninitialized_var(sanity); + int ret; + + inode = iget_locked(sb, ino); + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + oi = exofs_i(inode); + + /* read the inode from the osd */ + ret = exofs_get_inode(sb, oi, &fcb, &sanity); + if (ret) + goto bad_inode; + + init_waitqueue_head(&oi->i_wq); + set_obj_created(oi); + + /* copy stuff from on-disk struct to in-memory struct */ + inode->i_mode = le16_to_cpu(fcb.i_mode); + inode->i_uid = le32_to_cpu(fcb.i_uid); + inode->i_gid = le32_to_cpu(fcb.i_gid); + inode->i_nlink = le16_to_cpu(fcb.i_links_count); + inode->i_ctime.tv_sec = (signed)le32_to_cpu(fcb.i_ctime); + inode->i_atime.tv_sec = (signed)le32_to_cpu(fcb.i_atime); + inode->i_mtime.tv_sec = (signed)le32_to_cpu(fcb.i_mtime); + inode->i_ctime.tv_nsec = + inode->i_atime.tv_nsec = inode->i_mtime.tv_nsec = 0; + oi->i_commit_size = le64_to_cpu(fcb.i_size); + i_size_write(inode, oi->i_commit_size); + inode->i_blkbits = EXOFS_BLKSHIFT; + inode->i_generation = le32_to_cpu(fcb.i_generation); + +#ifdef EXOFS_DEBUG_OBJ_ISIZE + if ((inode->i_size != sanity) && + (!exofs_inode_is_fast_symlink(inode))) { + EXOFS_ERR("WARNING: Size of object from inode and " + "attributes differ (%lld != %llu)\n", + inode->i_size, _LLU(sanity)); + } +#endif + + oi->i_dir_start_lookup = 0; + + if ((inode->i_nlink == 0) && (inode->i_mode == 0)) { + ret = -ESTALE; + goto bad_inode; + } + + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { + if (fcb.i_data[0]) + inode->i_rdev = + old_decode_dev(le32_to_cpu(fcb.i_data[0])); + else + inode->i_rdev = + new_decode_dev(le32_to_cpu(fcb.i_data[1])); + } else { + memcpy(oi->i_data, fcb.i_data, sizeof(fcb.i_data)); + } + + if (S_ISREG(inode->i_mode)) { + inode->i_op = &exofs_file_inode_operations; + inode->i_fop = &exofs_file_operations; + inode->i_mapping->a_ops = &exofs_aops; + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = &exofs_dir_inode_operations; + inode->i_fop = &exofs_dir_operations; + inode->i_mapping->a_ops = &exofs_aops; + } else if (S_ISLNK(inode->i_mode)) { + if (exofs_inode_is_fast_symlink(inode)) + inode->i_op = &exofs_fast_symlink_inode_operations; + else { + inode->i_op = &exofs_symlink_inode_operations; + inode->i_mapping->a_ops = &exofs_aops; + } + } else { + inode->i_op = &exofs_special_inode_operations; + if (fcb.i_data[0]) + init_special_inode(inode, inode->i_mode, + old_decode_dev(le32_to_cpu(fcb.i_data[0]))); + else + init_special_inode(inode, inode->i_mode, + new_decode_dev(le32_to_cpu(fcb.i_data[1]))); + } + + unlock_new_inode(inode); + return inode; + +bad_inode: + iget_failed(inode); + return ERR_PTR(ret); +} + +int __exofs_wait_obj_created(struct exofs_i_info *oi) +{ + if (!obj_created(oi)) { + BUG_ON(!obj_2bcreated(oi)); + wait_event(oi->i_wq, obj_created(oi)); + } + return unlikely(is_bad_inode(&oi->vfs_inode)) ? -EIO : 0; +} +/* + * Callback function from exofs_new_inode(). The important thing is that we + * set the obj_created flag so that other methods know that the object exists on + * the OSD. + */ +static void create_done(struct osd_request *or, void *p) +{ + struct inode *inode = p; + struct exofs_i_info *oi = exofs_i(inode); + struct exofs_sb_info *sbi = inode->i_sb->s_fs_info; + int ret; + + ret = exofs_check_ok(or); + osd_end_request(or); + atomic_dec(&sbi->s_curr_pending); + + if (unlikely(ret)) { + EXOFS_ERR("object=0x%llx creation faild in pid=0x%llx", + _LLU(sbi->s_pid), _LLU(inode->i_ino + EXOFS_OBJ_OFF)); + make_bad_inode(inode); + } else + set_obj_created(oi); + + atomic_dec(&inode->i_count); + wake_up(&oi->i_wq); +} + +/* + * Set up a new inode and create an object for it on the OSD + */ +struct inode *exofs_new_inode(struct inode *dir, int mode) +{ + struct super_block *sb; + struct inode *inode; + struct exofs_i_info *oi; + struct exofs_sb_info *sbi; + struct osd_request *or; + struct osd_obj_id obj; + int ret; + + sb = dir->i_sb; + inode = new_inode(sb); + if (!inode) + return ERR_PTR(-ENOMEM); + + oi = exofs_i(inode); + + init_waitqueue_head(&oi->i_wq); + set_obj_2bcreated(oi); + + sbi = sb->s_fs_info; + + sb->s_dirt = 1; + inode->i_uid = current->cred->fsuid; + if (dir->i_mode & S_ISGID) { + inode->i_gid = dir->i_gid; + if (S_ISDIR(mode)) + mode |= S_ISGID; + } else { + inode->i_gid = current->cred->fsgid; + } + inode->i_mode = mode; + + inode->i_ino = sbi->s_nextid++; + inode->i_blkbits = EXOFS_BLKSHIFT; + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + oi->i_commit_size = inode->i_size = 0; + spin_lock(&sbi->s_next_gen_lock); + inode->i_generation = sbi->s_next_generation++; + spin_unlock(&sbi->s_next_gen_lock); + insert_inode_hash(inode); + + mark_inode_dirty(inode); + + obj.partition = sbi->s_pid; + obj.id = inode->i_ino + EXOFS_OBJ_OFF; + exofs_make_credential(oi->i_cred, &obj); + + or = osd_start_request(sbi->s_dev, GFP_KERNEL); + if (unlikely(!or)) { + EXOFS_ERR("exofs_new_inode: osd_start_request failed\n"); + return ERR_PTR(-ENOMEM); + } + + osd_req_create_object(or, &obj); + + /* increment the refcount so that the inode will still be around when we + * reach the callback + */ + atomic_inc(&inode->i_count); + + ret = exofs_async_op(or, create_done, inode, oi->i_cred); + if (ret) { + atomic_dec(&inode->i_count); + osd_end_request(or); + return ERR_PTR(-EIO); + } + atomic_inc(&sbi->s_curr_pending); + + return inode; +} + +/* + * struct to pass two arguments to update_inode's callback + */ +struct updatei_args { + struct exofs_sb_info *sbi; + struct exofs_fcb fcb; +}; + +/* + * Callback function from exofs_update_inode(). + */ +static void updatei_done(struct osd_request *or, void *p) +{ + struct updatei_args *args = p; + + osd_end_request(or); + + atomic_dec(&args->sbi->s_curr_pending); + + kfree(args); +} + +/* + * Write the inode to the OSD. Just fill up the struct, and set the attribute + * synchronously or asynchronously depending on the do_sync flag. + */ +static int exofs_update_inode(struct inode *inode, int do_sync) +{ + struct exofs_i_info *oi = exofs_i(inode); + struct super_block *sb = inode->i_sb; + struct exofs_sb_info *sbi = sb->s_fs_info; + struct osd_obj_id obj = {sbi->s_pid, inode->i_ino + EXOFS_OBJ_OFF}; + struct osd_request *or; + struct osd_attr attr; + struct exofs_fcb *fcb; + struct updatei_args *args; + int ret; + + args = kzalloc(sizeof(*args), GFP_KERNEL); + if (!args) + return -ENOMEM; + + fcb = &args->fcb; + + fcb->i_mode = cpu_to_le16(inode->i_mode); + fcb->i_uid = cpu_to_le32(inode->i_uid); + fcb->i_gid = cpu_to_le32(inode->i_gid); + fcb->i_links_count = cpu_to_le16(inode->i_nlink); + fcb->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec); + fcb->i_atime = cpu_to_le32(inode->i_atime.tv_sec); + fcb->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec); + oi->i_commit_size = i_size_read(inode); + fcb->i_size = cpu_to_le64(oi->i_commit_size); + fcb->i_generation = cpu_to_le32(inode->i_generation); + + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { + if (old_valid_dev(inode->i_rdev)) { + fcb->i_data[0] = + cpu_to_le32(old_encode_dev(inode->i_rdev)); + fcb->i_data[1] = 0; + } else { + fcb->i_data[0] = 0; + fcb->i_data[1] = + cpu_to_le32(new_encode_dev(inode->i_rdev)); + fcb->i_data[2] = 0; + } + } else + memcpy(fcb->i_data, oi->i_data, sizeof(fcb->i_data)); + + or = osd_start_request(sbi->s_dev, GFP_KERNEL); + if (unlikely(!or)) { + EXOFS_ERR("exofs_update_inode: osd_start_request failed.\n"); + ret = -ENOMEM; + goto free_args; + } + + osd_req_set_attributes(or, &obj); + + attr = g_attr_inode_data; + attr.val_ptr = fcb; + osd_req_add_set_attr_list(or, &attr, 1); + + if (!obj_created(oi)) { + EXOFS_DBGMSG("!obj_created\n"); + BUG_ON(!obj_2bcreated(oi)); + wait_event(oi->i_wq, obj_created(oi)); + EXOFS_DBGMSG("wait_event done\n"); + } + + if (do_sync) { + ret = exofs_sync_op(or, sbi->s_timeout, oi->i_cred); + osd_end_request(or); + goto free_args; + } else { + args->sbi = sbi; + + ret = exofs_async_op(or, updatei_done, args, oi->i_cred); + if (ret) { + osd_end_request(or); + goto free_args; + } + atomic_inc(&sbi->s_curr_pending); + goto out; /* deallocation in updatei_done */ + } + +free_args: + kfree(args); +out: + EXOFS_DBGMSG("ret=>%d\n", ret); + return ret; +} + +int exofs_write_inode(struct inode *inode, int wait) +{ + return exofs_update_inode(inode, wait); +} + +/* + * Callback function from exofs_delete_inode() - don't have much cleaning up to + * do. + */ +static void delete_done(struct osd_request *or, void *p) +{ + struct exofs_sb_info *sbi; + osd_end_request(or); + sbi = p; + atomic_dec(&sbi->s_curr_pending); +} + +/* + * Called when the refcount of an inode reaches zero. We remove the object + * from the OSD here. We make sure the object was created before we try and + * delete it. + */ +void exofs_delete_inode(struct inode *inode) +{ + struct exofs_i_info *oi = exofs_i(inode); + struct super_block *sb = inode->i_sb; + struct exofs_sb_info *sbi = sb->s_fs_info; + struct osd_obj_id obj = {sbi->s_pid, inode->i_ino + EXOFS_OBJ_OFF}; + struct osd_request *or; + int ret; + + truncate_inode_pages(&inode->i_data, 0); + + if (is_bad_inode(inode)) + goto no_delete; + + mark_inode_dirty(inode); + exofs_update_inode(inode, inode_needs_sync(inode)); + + inode->i_size = 0; + if (inode->i_blocks) + exofs_truncate(inode); + + clear_inode(inode); + + or = osd_start_request(sbi->s_dev, GFP_KERNEL); + if (unlikely(!or)) { + EXOFS_ERR("exofs_delete_inode: osd_start_request failed\n"); + return; + } + + osd_req_remove_object(or, &obj); + + /* if we are deleting an obj that hasn't been created yet, wait */ + if (!obj_created(oi)) { + BUG_ON(!obj_2bcreated(oi)); + wait_event(oi->i_wq, obj_created(oi)); + } + + ret = exofs_async_op(or, delete_done, sbi, oi->i_cred); + if (ret) { + EXOFS_ERR( + "ERROR: @exofs_delete_inode exofs_async_op failed\n"); + osd_end_request(or); + return; + } + atomic_inc(&sbi->s_curr_pending); + + return; + +no_delete: + clear_inode(inode); +} diff --git a/fs/exofs/namei.c b/fs/exofs/namei.c new file mode 100644 index 000000000000..77fdd765e76d --- /dev/null +++ b/fs/exofs/namei.c @@ -0,0 +1,342 @@ +/* + * Copyright (C) 2005, 2006 + * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com) + * Copyright (C) 2005, 2006 + * International Business Machines + * Copyright (C) 2008, 2009 + * Boaz Harrosh <bharrosh@panasas.com> + * + * Copyrights for code taken from ext2: + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * from + * linux/fs/minix/inode.c + * Copyright (C) 1991, 1992 Linus Torvalds + * + * This file is part of exofs. + * + * exofs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation. Since it is based on ext2, and the only + * valid version of GPL for the Linux kernel is version 2, the only valid + * version of GPL for exofs is version 2. + * + * exofs is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with exofs; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "exofs.h" + +static inline int exofs_add_nondir(struct dentry *dentry, struct inode *inode) +{ + int err = exofs_add_link(dentry, inode); + if (!err) { + d_instantiate(dentry, inode); + return 0; + } + inode_dec_link_count(inode); + iput(inode); + return err; +} + +static struct dentry *exofs_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + struct inode *inode; + ino_t ino; + + if (dentry->d_name.len > EXOFS_NAME_LEN) + return ERR_PTR(-ENAMETOOLONG); + + ino = exofs_inode_by_name(dir, dentry); + inode = NULL; + if (ino) { + inode = exofs_iget(dir->i_sb, ino); + if (IS_ERR(inode)) + return ERR_CAST(inode); + } + return d_splice_alias(inode, dentry); +} + +static int exofs_create(struct inode *dir, struct dentry *dentry, int mode, + struct nameidata *nd) +{ + struct inode *inode = exofs_new_inode(dir, mode); + int err = PTR_ERR(inode); + if (!IS_ERR(inode)) { + inode->i_op = &exofs_file_inode_operations; + inode->i_fop = &exofs_file_operations; + inode->i_mapping->a_ops = &exofs_aops; + mark_inode_dirty(inode); + err = exofs_add_nondir(dentry, inode); + } + return err; +} + +static int exofs_mknod(struct inode *dir, struct dentry *dentry, int mode, + dev_t rdev) +{ + struct inode *inode; + int err; + + if (!new_valid_dev(rdev)) + return -EINVAL; + + inode = exofs_new_inode(dir, mode); + err = PTR_ERR(inode); + if (!IS_ERR(inode)) { + init_special_inode(inode, inode->i_mode, rdev); + mark_inode_dirty(inode); + err = exofs_add_nondir(dentry, inode); + } + return err; +} + +static int exofs_symlink(struct inode *dir, struct dentry *dentry, + const char *symname) +{ + struct super_block *sb = dir->i_sb; + int err = -ENAMETOOLONG; + unsigned l = strlen(symname)+1; + struct inode *inode; + struct exofs_i_info *oi; + + if (l > sb->s_blocksize) + goto out; + + inode = exofs_new_inode(dir, S_IFLNK | S_IRWXUGO); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out; + + oi = exofs_i(inode); + if (l > sizeof(oi->i_data)) { + /* slow symlink */ + inode->i_op = &exofs_symlink_inode_operations; + inode->i_mapping->a_ops = &exofs_aops; + memset(oi->i_data, 0, sizeof(oi->i_data)); + + err = page_symlink(inode, symname, l); + if (err) + goto out_fail; + } else { + /* fast symlink */ + inode->i_op = &exofs_fast_symlink_inode_operations; + memcpy(oi->i_data, symname, l); + inode->i_size = l-1; + } + mark_inode_dirty(inode); + + err = exofs_add_nondir(dentry, inode); +out: + return err; + +out_fail: + inode_dec_link_count(inode); + iput(inode); + goto out; +} + +static int exofs_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *dentry) +{ + struct inode *inode = old_dentry->d_inode; + + if (inode->i_nlink >= EXOFS_LINK_MAX) + return -EMLINK; + + inode->i_ctime = CURRENT_TIME; + inode_inc_link_count(inode); + atomic_inc(&inode->i_count); + + return exofs_add_nondir(dentry, inode); +} + +static int exofs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + struct inode *inode; + int err = -EMLINK; + + if (dir->i_nlink >= EXOFS_LINK_MAX) + goto out; + + inode_inc_link_count(dir); + + inode = exofs_new_inode(dir, S_IFDIR | mode); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_dir; + + inode->i_op = &exofs_dir_inode_operations; + inode->i_fop = &exofs_dir_operations; + inode->i_mapping->a_ops = &exofs_aops; + + inode_inc_link_count(inode); + + err = exofs_make_empty(inode, dir); + if (err) + goto out_fail; + + err = exofs_add_link(dentry, inode); + if (err) + goto out_fail; + + d_instantiate(dentry, inode); +out: + return err; + +out_fail: + inode_dec_link_count(inode); + inode_dec_link_count(inode); + iput(inode); +out_dir: + inode_dec_link_count(dir); + goto out; +} + +static int exofs_unlink(struct inode *dir, struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + struct exofs_dir_entry *de; + struct page *page; + int err = -ENOENT; + + de = exofs_find_entry(dir, dentry, &page); + if (!de) + goto out; + + err = exofs_delete_entry(de, page); + if (err) + goto out; + + inode->i_ctime = dir->i_ctime; + inode_dec_link_count(inode); + err = 0; +out: + return err; +} + +static int exofs_rmdir(struct inode *dir, struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + int err = -ENOTEMPTY; + + if (exofs_empty_dir(inode)) { + err = exofs_unlink(dir, dentry); + if (!err) { + inode->i_size = 0; + inode_dec_link_count(inode); + inode_dec_link_count(dir); + } + } + return err; +} + +static int exofs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct inode *old_inode = old_dentry->d_inode; + struct inode *new_inode = new_dentry->d_inode; + struct page *dir_page = NULL; + struct exofs_dir_entry *dir_de = NULL; + struct page *old_page; + struct exofs_dir_entry *old_de; + int err = -ENOENT; + + old_de = exofs_find_entry(old_dir, old_dentry, &old_page); + if (!old_de) + goto out; + + if (S_ISDIR(old_inode->i_mode)) { + err = -EIO; + dir_de = exofs_dotdot(old_inode, &dir_page); + if (!dir_de) + goto out_old; + } + + if (new_inode) { + struct page *new_page; + struct exofs_dir_entry *new_de; + + err = -ENOTEMPTY; + if (dir_de && !exofs_empty_dir(new_inode)) + goto out_dir; + + err = -ENOENT; + new_de = exofs_find_entry(new_dir, new_dentry, &new_page); + if (!new_de) + goto out_dir; + inode_inc_link_count(old_inode); + err = exofs_set_link(new_dir, new_de, new_page, old_inode); + new_inode->i_ctime = CURRENT_TIME; + if (dir_de) + drop_nlink(new_inode); + inode_dec_link_count(new_inode); + if (err) + goto out_dir; + } else { + if (dir_de) { + err = -EMLINK; + if (new_dir->i_nlink >= EXOFS_LINK_MAX) + goto out_dir; + } + inode_inc_link_count(old_inode); + err = exofs_add_link(new_dentry, old_inode); + if (err) { + inode_dec_link_count(old_inode); + goto out_dir; + } + if (dir_de) + inode_inc_link_count(new_dir); + } + + old_inode->i_ctime = CURRENT_TIME; + + exofs_delete_entry(old_de, old_page); + inode_dec_link_count(old_inode); + + if (dir_de) { + err = exofs_set_link(old_inode, dir_de, dir_page, new_dir); + inode_dec_link_count(old_dir); + if (err) + goto out_dir; + } + return 0; + + +out_dir: + if (dir_de) { + kunmap(dir_page); + page_cache_release(dir_page); + } +out_old: + kunmap(old_page); + page_cache_release(old_page); +out: + return err; +} + +const struct inode_operations exofs_dir_inode_operations = { + .create = exofs_create, + .lookup = exofs_lookup, + .link = exofs_link, + .unlink = exofs_unlink, + .symlink = exofs_symlink, + .mkdir = exofs_mkdir, + .rmdir = exofs_rmdir, + .mknod = exofs_mknod, + .rename = exofs_rename, + .setattr = exofs_setattr, +}; + +const struct inode_operations exofs_special_inode_operations = { + .setattr = exofs_setattr, +}; diff --git a/fs/exofs/osd.c b/fs/exofs/osd.c new file mode 100644 index 000000000000..b249ae97fb15 --- /dev/null +++ b/fs/exofs/osd.c @@ -0,0 +1,153 @@ +/* + * Copyright (C) 2005, 2006 + * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com) + * Copyright (C) 2005, 2006 + * International Business Machines + * Copyright (C) 2008, 2009 + * Boaz Harrosh <bharrosh@panasas.com> + * + * This file is part of exofs. + * + * exofs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation. Since it is based on ext2, and the only + * valid version of GPL for the Linux kernel is version 2, the only valid + * version of GPL for exofs is version 2. + * + * exofs is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with exofs; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <scsi/scsi_device.h> +#include <scsi/osd_sense.h> + +#include "exofs.h" + +int exofs_check_ok_resid(struct osd_request *or, u64 *in_resid, u64 *out_resid) +{ + struct osd_sense_info osi; + int ret = osd_req_decode_sense(or, &osi); + + if (ret) { /* translate to Linux codes */ + if (osi.additional_code == scsi_invalid_field_in_cdb) { + if (osi.cdb_field_offset == OSD_CFO_STARTING_BYTE) + ret = -EFAULT; + if (osi.cdb_field_offset == OSD_CFO_OBJECT_ID) + ret = -ENOENT; + else + ret = -EINVAL; + } else if (osi.additional_code == osd_quota_error) + ret = -ENOSPC; + else + ret = -EIO; + } + + /* FIXME: should be include in osd_sense_info */ + if (in_resid) + *in_resid = or->in.req ? or->in.req->data_len : 0; + + if (out_resid) + *out_resid = or->out.req ? or->out.req->data_len : 0; + + return ret; +} + +void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], const struct osd_obj_id *obj) +{ + osd_sec_init_nosec_doall_caps(cred_a, obj, false, true); +} + +/* + * Perform a synchronous OSD operation. + */ +int exofs_sync_op(struct osd_request *or, int timeout, uint8_t *credential) +{ + int ret; + + or->timeout = timeout; + ret = osd_finalize_request(or, 0, credential, NULL); + if (ret) { + EXOFS_DBGMSG("Faild to osd_finalize_request() => %d\n", ret); + return ret; + } + + ret = osd_execute_request(or); + + if (ret) + EXOFS_DBGMSG("osd_execute_request() => %d\n", ret); + /* osd_req_decode_sense(or, ret); */ + return ret; +} + +/* + * Perform an asynchronous OSD operation. + */ +int exofs_async_op(struct osd_request *or, osd_req_done_fn *async_done, + void *caller_context, u8 *cred) +{ + int ret; + + ret = osd_finalize_request(or, 0, cred, NULL); + if (ret) { + EXOFS_DBGMSG("Faild to osd_finalize_request() => %d\n", ret); + return ret; + } + + ret = osd_execute_request_async(or, async_done, caller_context); + + if (ret) + EXOFS_DBGMSG("osd_execute_request_async() => %d\n", ret); + return ret; +} + +int extract_attr_from_req(struct osd_request *or, struct osd_attr *attr) +{ + struct osd_attr cur_attr = {.attr_page = 0}; /* start with zeros */ + void *iter = NULL; + int nelem; + + do { + nelem = 1; + osd_req_decode_get_attr_list(or, &cur_attr, &nelem, &iter); + if ((cur_attr.attr_page == attr->attr_page) && + (cur_attr.attr_id == attr->attr_id)) { + attr->len = cur_attr.len; + attr->val_ptr = cur_attr.val_ptr; + return 0; + } + } while (iter); + + return -EIO; +} + +int osd_req_read_kern(struct osd_request *or, + const struct osd_obj_id *obj, u64 offset, void* buff, u64 len) +{ + struct request_queue *req_q = or->osd_dev->scsi_device->request_queue; + struct bio *bio = bio_map_kern(req_q, buff, len, GFP_KERNEL); + + if (!bio) + return -ENOMEM; + + osd_req_read(or, obj, bio, offset); + return 0; +} + +int osd_req_write_kern(struct osd_request *or, + const struct osd_obj_id *obj, u64 offset, void* buff, u64 len) +{ + struct request_queue *req_q = or->osd_dev->scsi_device->request_queue; + struct bio *bio = bio_map_kern(req_q, buff, len, GFP_KERNEL); + + if (!bio) + return -ENOMEM; + + osd_req_write(or, obj, bio, offset); + return 0; +} diff --git a/fs/exofs/super.c b/fs/exofs/super.c new file mode 100644 index 000000000000..9f1985e857e2 --- /dev/null +++ b/fs/exofs/super.c @@ -0,0 +1,584 @@ +/* + * Copyright (C) 2005, 2006 + * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com) + * Copyright (C) 2005, 2006 + * International Business Machines + * Copyright (C) 2008, 2009 + * Boaz Harrosh <bharrosh@panasas.com> + * + * Copyrights for code taken from ext2: + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * from + * linux/fs/minix/inode.c + * Copyright (C) 1991, 1992 Linus Torvalds + * + * This file is part of exofs. + * + * exofs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation. Since it is based on ext2, and the only + * valid version of GPL for the Linux kernel is version 2, the only valid + * version of GPL for exofs is version 2. + * + * exofs is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with exofs; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <linux/string.h> +#include <linux/parser.h> +#include <linux/vfs.h> +#include <linux/random.h> +#include <linux/exportfs.h> + +#include "exofs.h" + +/****************************************************************************** + * MOUNT OPTIONS + *****************************************************************************/ + +/* + * struct to hold what we get from mount options + */ +struct exofs_mountopt { + const char *dev_name; + uint64_t pid; + int timeout; +}; + +/* + * exofs-specific mount-time options. + */ +enum { Opt_pid, Opt_to, Opt_mkfs, Opt_format, Opt_err }; + +/* + * Our mount-time options. These should ideally be 64-bit unsigned, but the + * kernel's parsing functions do not currently support that. 32-bit should be + * sufficient for most applications now. + */ +static match_table_t tokens = { + {Opt_pid, "pid=%u"}, + {Opt_to, "to=%u"}, + {Opt_err, NULL} +}; + +/* + * The main option parsing method. Also makes sure that all of the mandatory + * mount options were set. + */ +static int parse_options(char *options, struct exofs_mountopt *opts) +{ + char *p; + substring_t args[MAX_OPT_ARGS]; + int option; + bool s_pid = false; + + EXOFS_DBGMSG("parse_options %s\n", options); + /* defaults */ + memset(opts, 0, sizeof(*opts)); + opts->timeout = BLK_DEFAULT_SG_TIMEOUT; + + while ((p = strsep(&options, ",")) != NULL) { + int token; + char str[32]; + + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_pid: + if (0 == match_strlcpy(str, &args[0], sizeof(str))) + return -EINVAL; + opts->pid = simple_strtoull(str, NULL, 0); + if (opts->pid < EXOFS_MIN_PID) { + EXOFS_ERR("Partition ID must be >= %u", + EXOFS_MIN_PID); + return -EINVAL; + } + s_pid = 1; + break; + case Opt_to: + if (match_int(&args[0], &option)) + return -EINVAL; + if (option <= 0) { + EXOFS_ERR("Timout must be > 0"); + return -EINVAL; + } + opts->timeout = option * HZ; + break; + } + } + + if (!s_pid) { + EXOFS_ERR("Need to specify the following options:\n"); + EXOFS_ERR(" -o pid=pid_no_to_use\n"); + return -EINVAL; + } + + return 0; +} + +/****************************************************************************** + * INODE CACHE + *****************************************************************************/ + +/* + * Our inode cache. Isn't it pretty? + */ +static struct kmem_cache *exofs_inode_cachep; + +/* + * Allocate an inode in the cache + */ +static struct inode *exofs_alloc_inode(struct super_block *sb) +{ + struct exofs_i_info *oi; + + oi = kmem_cache_alloc(exofs_inode_cachep, GFP_KERNEL); + if (!oi) + return NULL; + + oi->vfs_inode.i_version = 1; + return &oi->vfs_inode; +} + +/* + * Remove an inode from the cache + */ +static void exofs_destroy_inode(struct inode *inode) +{ + kmem_cache_free(exofs_inode_cachep, exofs_i(inode)); +} + +/* + * Initialize the inode + */ +static void exofs_init_once(void *foo) +{ + struct exofs_i_info *oi = foo; + + inode_init_once(&oi->vfs_inode); +} + +/* + * Create and initialize the inode cache + */ +static int init_inodecache(void) +{ + exofs_inode_cachep = kmem_cache_create("exofs_inode_cache", + sizeof(struct exofs_i_info), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, + exofs_init_once); + if (exofs_inode_cachep == NULL) + return -ENOMEM; + return 0; +} + +/* + * Destroy the inode cache + */ +static void destroy_inodecache(void) +{ + kmem_cache_destroy(exofs_inode_cachep); +} + +/****************************************************************************** + * SUPERBLOCK FUNCTIONS + *****************************************************************************/ +static const struct super_operations exofs_sops; +static const struct export_operations exofs_export_ops; + +/* + * Write the superblock to the OSD + */ +static void exofs_write_super(struct super_block *sb) +{ + struct exofs_sb_info *sbi; + struct exofs_fscb *fscb; + struct osd_request *or; + struct osd_obj_id obj; + int ret; + + fscb = kzalloc(sizeof(struct exofs_fscb), GFP_KERNEL); + if (!fscb) { + EXOFS_ERR("exofs_write_super: memory allocation failed.\n"); + return; + } + + lock_kernel(); + sbi = sb->s_fs_info; + fscb->s_nextid = cpu_to_le64(sbi->s_nextid); + fscb->s_numfiles = cpu_to_le32(sbi->s_numfiles); + fscb->s_magic = cpu_to_le16(sb->s_magic); + fscb->s_newfs = 0; + + or = osd_start_request(sbi->s_dev, GFP_KERNEL); + if (unlikely(!or)) { + EXOFS_ERR("exofs_write_super: osd_start_request failed.\n"); + goto out; + } + + obj.partition = sbi->s_pid; + obj.id = EXOFS_SUPER_ID; + ret = osd_req_write_kern(or, &obj, 0, fscb, sizeof(*fscb)); + if (unlikely(ret)) { + EXOFS_ERR("exofs_write_super: osd_req_write_kern failed.\n"); + goto out; + } + + ret = exofs_sync_op(or, sbi->s_timeout, sbi->s_cred); + if (unlikely(ret)) { + EXOFS_ERR("exofs_write_super: exofs_sync_op failed.\n"); + goto out; + } + sb->s_dirt = 0; + +out: + if (or) + osd_end_request(or); + unlock_kernel(); + kfree(fscb); +} + +/* + * This function is called when the vfs is freeing the superblock. We just + * need to free our own part. + */ +static void exofs_put_super(struct super_block *sb) +{ + int num_pend; + struct exofs_sb_info *sbi = sb->s_fs_info; + + /* make sure there are no pending commands */ + for (num_pend = atomic_read(&sbi->s_curr_pending); num_pend > 0; + num_pend = atomic_read(&sbi->s_curr_pending)) { + wait_queue_head_t wq; + init_waitqueue_head(&wq); + wait_event_timeout(wq, + (atomic_read(&sbi->s_curr_pending) == 0), + msecs_to_jiffies(100)); + } + + osduld_put_device(sbi->s_dev); + kfree(sb->s_fs_info); + sb->s_fs_info = NULL; +} + +/* + * Read the superblock from the OSD and fill in the fields + */ +static int exofs_fill_super(struct super_block *sb, void *data, int silent) +{ + struct inode *root; + struct exofs_mountopt *opts = data; + struct exofs_sb_info *sbi; /*extended info */ + struct exofs_fscb fscb; /*on-disk superblock info */ + struct osd_request *or = NULL; + struct osd_obj_id obj; + int ret; + + sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); + if (!sbi) + return -ENOMEM; + sb->s_fs_info = sbi; + + /* use mount options to fill superblock */ + sbi->s_dev = osduld_path_lookup(opts->dev_name); + if (IS_ERR(sbi->s_dev)) { + ret = PTR_ERR(sbi->s_dev); + sbi->s_dev = NULL; + goto free_sbi; + } + + sbi->s_pid = opts->pid; + sbi->s_timeout = opts->timeout; + + /* fill in some other data by hand */ + memset(sb->s_id, 0, sizeof(sb->s_id)); + strcpy(sb->s_id, "exofs"); + sb->s_blocksize = EXOFS_BLKSIZE; + sb->s_blocksize_bits = EXOFS_BLKSHIFT; + sb->s_maxbytes = MAX_LFS_FILESIZE; + atomic_set(&sbi->s_curr_pending, 0); + sb->s_bdev = NULL; + sb->s_dev = 0; + + /* read data from on-disk superblock object */ + obj.partition = sbi->s_pid; + obj.id = EXOFS_SUPER_ID; + exofs_make_credential(sbi->s_cred, &obj); + + or = osd_start_request(sbi->s_dev, GFP_KERNEL); + if (unlikely(!or)) { + if (!silent) + EXOFS_ERR( + "exofs_fill_super: osd_start_request failed.\n"); + ret = -ENOMEM; + goto free_sbi; + } + ret = osd_req_read_kern(or, &obj, 0, &fscb, sizeof(fscb)); + if (unlikely(ret)) { + if (!silent) + EXOFS_ERR( + "exofs_fill_super: osd_req_read_kern failed.\n"); + ret = -ENOMEM; + goto free_sbi; + } + + ret = exofs_sync_op(or, sbi->s_timeout, sbi->s_cred); + if (unlikely(ret)) { + if (!silent) + EXOFS_ERR("exofs_fill_super: exofs_sync_op failed.\n"); + ret = -EIO; + goto free_sbi; + } + + sb->s_magic = le16_to_cpu(fscb.s_magic); + sbi->s_nextid = le64_to_cpu(fscb.s_nextid); + sbi->s_numfiles = le32_to_cpu(fscb.s_numfiles); + + /* make sure what we read from the object store is correct */ + if (sb->s_magic != EXOFS_SUPER_MAGIC) { + if (!silent) + EXOFS_ERR("ERROR: Bad magic value\n"); + ret = -EINVAL; + goto free_sbi; + } + + /* start generation numbers from a random point */ + get_random_bytes(&sbi->s_next_generation, sizeof(u32)); + spin_lock_init(&sbi->s_next_gen_lock); + + /* set up operation vectors */ + sb->s_op = &exofs_sops; + sb->s_export_op = &exofs_export_ops; + root = exofs_iget(sb, EXOFS_ROOT_ID - EXOFS_OBJ_OFF); + if (IS_ERR(root)) { + EXOFS_ERR("ERROR: exofs_iget failed\n"); + ret = PTR_ERR(root); + goto free_sbi; + } + sb->s_root = d_alloc_root(root); + if (!sb->s_root) { + iput(root); + EXOFS_ERR("ERROR: get root inode failed\n"); + ret = -ENOMEM; + goto free_sbi; + } + + if (!S_ISDIR(root->i_mode)) { + dput(sb->s_root); + sb->s_root = NULL; + EXOFS_ERR("ERROR: corrupt root inode (mode = %hd)\n", + root->i_mode); + ret = -EINVAL; + goto free_sbi; + } + + ret = 0; +out: + if (or) + osd_end_request(or); + return ret; + +free_sbi: + osduld_put_device(sbi->s_dev); /* NULL safe */ + kfree(sbi); + goto out; +} + +/* + * Set up the superblock (calls exofs_fill_super eventually) + */ +static int exofs_get_sb(struct file_system_type *type, + int flags, const char *dev_name, + void *data, struct vfsmount *mnt) +{ + struct exofs_mountopt opts; + int ret; + + ret = parse_options(data, &opts); + if (ret) + return ret; + + opts.dev_name = dev_name; + return get_sb_nodev(type, flags, &opts, exofs_fill_super, mnt); +} + +/* + * Return information about the file system state in the buffer. This is used + * by the 'df' command, for example. + */ +static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + struct exofs_sb_info *sbi = sb->s_fs_info; + struct osd_obj_id obj = {sbi->s_pid, 0}; + struct osd_attr attrs[] = { + ATTR_DEF(OSD_APAGE_PARTITION_QUOTAS, + OSD_ATTR_PQ_CAPACITY_QUOTA, sizeof(__be64)), + ATTR_DEF(OSD_APAGE_PARTITION_INFORMATION, + OSD_ATTR_PI_USED_CAPACITY, sizeof(__be64)), + }; + uint64_t capacity = ULLONG_MAX; + uint64_t used = ULLONG_MAX; + struct osd_request *or; + uint8_t cred_a[OSD_CAP_LEN]; + int ret; + + /* get used/capacity attributes */ + exofs_make_credential(cred_a, &obj); + + or = osd_start_request(sbi->s_dev, GFP_KERNEL); + if (unlikely(!or)) { + EXOFS_DBGMSG("exofs_statfs: osd_start_request failed.\n"); + return -ENOMEM; + } + + osd_req_get_attributes(or, &obj); + osd_req_add_get_attr_list(or, attrs, ARRAY_SIZE(attrs)); + ret = exofs_sync_op(or, sbi->s_timeout, cred_a); + if (unlikely(ret)) + goto out; + + ret = extract_attr_from_req(or, &attrs[0]); + if (likely(!ret)) + capacity = get_unaligned_be64(attrs[0].val_ptr); + else + EXOFS_DBGMSG("exofs_statfs: get capacity failed.\n"); + + ret = extract_attr_from_req(or, &attrs[1]); + if (likely(!ret)) + used = get_unaligned_be64(attrs[1].val_ptr); + else + EXOFS_DBGMSG("exofs_statfs: get used-space failed.\n"); + + /* fill in the stats buffer */ + buf->f_type = EXOFS_SUPER_MAGIC; + buf->f_bsize = EXOFS_BLKSIZE; + buf->f_blocks = (capacity >> EXOFS_BLKSHIFT); + buf->f_bfree = ((capacity - used) >> EXOFS_BLKSHIFT); + buf->f_bavail = buf->f_bfree; + buf->f_files = sbi->s_numfiles; + buf->f_ffree = EXOFS_MAX_ID - sbi->s_numfiles; + buf->f_namelen = EXOFS_NAME_LEN; + +out: + osd_end_request(or); + return ret; +} + +static const struct super_operations exofs_sops = { + .alloc_inode = exofs_alloc_inode, + .destroy_inode = exofs_destroy_inode, + .write_inode = exofs_write_inode, + .delete_inode = exofs_delete_inode, + .put_super = exofs_put_super, + .write_super = exofs_write_super, + .statfs = exofs_statfs, +}; + +/****************************************************************************** + * EXPORT OPERATIONS + *****************************************************************************/ + +struct dentry *exofs_get_parent(struct dentry *child) +{ + unsigned long ino = exofs_parent_ino(child); + + if (!ino) + return NULL; + + return d_obtain_alias(exofs_iget(child->d_inode->i_sb, ino)); +} + +static struct inode *exofs_nfs_get_inode(struct super_block *sb, + u64 ino, u32 generation) +{ + struct inode *inode; + + inode = exofs_iget(sb, ino); + if (IS_ERR(inode)) + return ERR_CAST(inode); + if (generation && inode->i_generation != generation) { + /* we didn't find the right inode.. */ + iput(inode); + return ERR_PTR(-ESTALE); + } + return inode; +} + +static struct dentry *exofs_fh_to_dentry(struct super_block *sb, + struct fid *fid, int fh_len, int fh_type) +{ + return generic_fh_to_dentry(sb, fid, fh_len, fh_type, + exofs_nfs_get_inode); +} + +static struct dentry *exofs_fh_to_parent(struct super_block *sb, + struct fid *fid, int fh_len, int fh_type) +{ + return generic_fh_to_parent(sb, fid, fh_len, fh_type, + exofs_nfs_get_inode); +} + +static const struct export_operations exofs_export_ops = { + .fh_to_dentry = exofs_fh_to_dentry, + .fh_to_parent = exofs_fh_to_parent, + .get_parent = exofs_get_parent, +}; + +/****************************************************************************** + * INSMOD/RMMOD + *****************************************************************************/ + +/* + * struct that describes this file system + */ +static struct file_system_type exofs_type = { + .owner = THIS_MODULE, + .name = "exofs", + .get_sb = exofs_get_sb, + .kill_sb = generic_shutdown_super, +}; + +static int __init init_exofs(void) +{ + int err; + + err = init_inodecache(); + if (err) + goto out; + + err = register_filesystem(&exofs_type); + if (err) + goto out_d; + + return 0; +out_d: + destroy_inodecache(); +out: + return err; +} + +static void __exit exit_exofs(void) +{ + unregister_filesystem(&exofs_type); + destroy_inodecache(); +} + +MODULE_AUTHOR("Avishay Traeger <avishay@gmail.com>"); +MODULE_DESCRIPTION("exofs"); +MODULE_LICENSE("GPL"); + +module_init(init_exofs) +module_exit(exit_exofs) diff --git a/fs/exofs/symlink.c b/fs/exofs/symlink.c new file mode 100644 index 000000000000..36e2d7bc7f7b --- /dev/null +++ b/fs/exofs/symlink.c @@ -0,0 +1,57 @@ +/* + * Copyright (C) 2005, 2006 + * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com) + * Copyright (C) 2005, 2006 + * International Business Machines + * Copyright (C) 2008, 2009 + * Boaz Harrosh <bharrosh@panasas.com> + * + * Copyrights for code taken from ext2: + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * from + * linux/fs/minix/inode.c + * Copyright (C) 1991, 1992 Linus Torvalds + * + * This file is part of exofs. + * + * exofs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation. Since it is based on ext2, and the only + * valid version of GPL for the Linux kernel is version 2, the only valid + * version of GPL for exofs is version 2. + * + * exofs is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with exofs; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <linux/namei.h> + +#include "exofs.h" + +static void *exofs_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + struct exofs_i_info *oi = exofs_i(dentry->d_inode); + + nd_set_link(nd, (char *)oi->i_data); + return NULL; +} + +const struct inode_operations exofs_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = page_follow_link_light, + .put_link = page_put_link, +}; + +const struct inode_operations exofs_fast_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = exofs_follow_link, +}; diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c index ae8c4f850b27..d46e38cb85c5 100644 --- a/fs/ext2/acl.c +++ b/fs/ext2/acl.c @@ -318,7 +318,7 @@ ext2_init_acl(struct inode *inode, struct inode *dir) return PTR_ERR(acl); } if (!acl) - inode->i_mode &= ~current->fs->umask; + inode->i_mode &= ~current_umask(); } if (test_opt(inode->i_sb, POSIX_ACL) && acl) { struct posix_acl *clone; diff --git a/fs/ext3/Kconfig b/fs/ext3/Kconfig index 8e0cfe44b0fc..fb3c1a21b135 100644 --- a/fs/ext3/Kconfig +++ b/fs/ext3/Kconfig @@ -28,6 +28,25 @@ config EXT3_FS To compile this file system support as a module, choose M here: the module will be called ext3. +config EXT3_DEFAULTS_TO_ORDERED + bool "Default to 'data=ordered' in ext3 (legacy option)" + depends on EXT3_FS + help + If a filesystem does not explicitly specify a data ordering + mode, and the journal capability allowed it, ext3 used to + historically default to 'data=ordered'. + + That was a rather unfortunate choice, because it leads to all + kinds of latency problems, and the 'data=writeback' mode is more + appropriate these days. + + You should probably always answer 'n' here, and if you really + want to use 'data=ordered' mode, set it in the filesystem itself + with 'tune2fs -o journal_data_ordered'. + + But if you really want to enable the legacy default, you can do + so by answering 'y' to this question. + config EXT3_FS_XATTR bool "Ext3 extended attributes" depends on EXT3_FS diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c index b60bb241880c..d81ef2fdb08e 100644 --- a/fs/ext3/acl.c +++ b/fs/ext3/acl.c @@ -323,7 +323,7 @@ ext3_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) return PTR_ERR(acl); } if (!acl) - inode->i_mode &= ~current->fs->umask; + inode->i_mode &= ~current_umask(); } if (test_opt(inode->i_sb, POSIX_ACL) && acl) { struct posix_acl *clone; diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c index 5853f4440af4..3d724a95882f 100644 --- a/fs/ext3/dir.c +++ b/fs/ext3/dir.c @@ -42,7 +42,7 @@ const struct file_operations ext3_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, .readdir = ext3_readdir, /* we take BKL. needed?*/ - .ioctl = ext3_ioctl, /* BKL held */ + .unlocked_ioctl = ext3_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ext3_compat_ioctl, #endif diff --git a/fs/ext3/file.c b/fs/ext3/file.c index 3be1e0689c9a..5b49704b231b 100644 --- a/fs/ext3/file.c +++ b/fs/ext3/file.c @@ -33,6 +33,10 @@ */ static int ext3_release_file (struct inode * inode, struct file * filp) { + if (EXT3_I(inode)->i_state & EXT3_STATE_FLUSH_ON_CLOSE) { + filemap_flush(inode->i_mapping); + EXT3_I(inode)->i_state &= ~EXT3_STATE_FLUSH_ON_CLOSE; + } /* if we are the last writer on the inode, drop the block reservation */ if ((filp->f_mode & FMODE_WRITE) && (atomic_read(&inode->i_writecount) == 1)) @@ -112,7 +116,7 @@ const struct file_operations ext3_file_operations = { .write = do_sync_write, .aio_read = generic_file_aio_read, .aio_write = ext3_file_write, - .ioctl = ext3_ioctl, + .unlocked_ioctl = ext3_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ext3_compat_ioctl, #endif diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 4a09ff169870..fcfa24361856 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -1149,12 +1149,15 @@ static int ext3_write_begin(struct file *file, struct address_space *mapping, struct page **pagep, void **fsdata) { struct inode *inode = mapping->host; - int ret, needed_blocks = ext3_writepage_trans_blocks(inode); + int ret; handle_t *handle; int retries = 0; struct page *page; pgoff_t index; unsigned from, to; + /* Reserve one block more for addition to orphan list in case + * we allocate blocks but write fails for some reason */ + int needed_blocks = ext3_writepage_trans_blocks(inode) + 1; index = pos >> PAGE_CACHE_SHIFT; from = pos & (PAGE_CACHE_SIZE - 1); @@ -1184,15 +1187,20 @@ retry: } write_begin_failed: if (ret) { - ext3_journal_stop(handle); - unlock_page(page); - page_cache_release(page); /* * block_write_begin may have instantiated a few blocks * outside i_size. Trim these off again. Don't need * i_size_read because we hold i_mutex. + * + * Add inode to orphan list in case we crash before truncate + * finishes. */ if (pos + len > inode->i_size) + ext3_orphan_add(handle, inode); + ext3_journal_stop(handle); + unlock_page(page); + page_cache_release(page); + if (pos + len > inode->i_size) vmtruncate(inode, inode->i_size); } if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries)) @@ -1211,6 +1219,18 @@ int ext3_journal_dirty_data(handle_t *handle, struct buffer_head *bh) return err; } +/* For ordered writepage and write_end functions */ +static int journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh) +{ + /* + * Write could have mapped the buffer but it didn't copy the data in + * yet. So avoid filing such buffer into a transaction. + */ + if (buffer_mapped(bh) && buffer_uptodate(bh)) + return ext3_journal_dirty_data(handle, bh); + return 0; +} + /* For write_end() in data=journal mode */ static int write_end_fn(handle_t *handle, struct buffer_head *bh) { @@ -1221,26 +1241,20 @@ static int write_end_fn(handle_t *handle, struct buffer_head *bh) } /* - * Generic write_end handler for ordered and writeback ext3 journal modes. - * We can't use generic_write_end, because that unlocks the page and we need to - * unlock the page after ext3_journal_stop, but ext3_journal_stop must run - * after block_write_end. + * This is nasty and subtle: ext3_write_begin() could have allocated blocks + * for the whole page but later we failed to copy the data in. Update inode + * size according to what we managed to copy. The rest is going to be + * truncated in write_end function. */ -static int ext3_generic_write_end(struct file *file, - struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) +static void update_file_sizes(struct inode *inode, loff_t pos, unsigned copied) { - struct inode *inode = file->f_mapping->host; - - copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); - - if (pos+copied > inode->i_size) { - i_size_write(inode, pos+copied); + /* What matters to us is i_disksize. We don't write i_size anywhere */ + if (pos + copied > inode->i_size) + i_size_write(inode, pos + copied); + if (pos + copied > EXT3_I(inode)->i_disksize) { + EXT3_I(inode)->i_disksize = pos + copied; mark_inode_dirty(inode); } - - return copied; } /* @@ -1260,35 +1274,29 @@ static int ext3_ordered_write_end(struct file *file, unsigned from, to; int ret = 0, ret2; - from = pos & (PAGE_CACHE_SIZE - 1); - to = from + len; + copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); + from = pos & (PAGE_CACHE_SIZE - 1); + to = from + copied; ret = walk_page_buffers(handle, page_buffers(page), - from, to, NULL, ext3_journal_dirty_data); + from, to, NULL, journal_dirty_data_fn); - if (ret == 0) { - /* - * generic_write_end() will run mark_inode_dirty() if i_size - * changes. So let's piggyback the i_disksize mark_inode_dirty - * into that. - */ - loff_t new_i_size; - - new_i_size = pos + copied; - if (new_i_size > EXT3_I(inode)->i_disksize) - EXT3_I(inode)->i_disksize = new_i_size; - ret2 = ext3_generic_write_end(file, mapping, pos, len, copied, - page, fsdata); - copied = ret2; - if (ret2 < 0) - ret = ret2; - } + if (ret == 0) + update_file_sizes(inode, pos, copied); + /* + * There may be allocated blocks outside of i_size because + * we failed to copy some data. Prepare for truncate. + */ + if (pos + len > inode->i_size) + ext3_orphan_add(handle, inode); ret2 = ext3_journal_stop(handle); if (!ret) ret = ret2; unlock_page(page); page_cache_release(page); + if (pos + len > inode->i_size) + vmtruncate(inode, inode->i_size); return ret ? ret : copied; } @@ -1299,25 +1307,22 @@ static int ext3_writeback_write_end(struct file *file, { handle_t *handle = ext3_journal_current_handle(); struct inode *inode = file->f_mapping->host; - int ret = 0, ret2; - loff_t new_i_size; - - new_i_size = pos + copied; - if (new_i_size > EXT3_I(inode)->i_disksize) - EXT3_I(inode)->i_disksize = new_i_size; - - ret2 = ext3_generic_write_end(file, mapping, pos, len, copied, - page, fsdata); - copied = ret2; - if (ret2 < 0) - ret = ret2; + int ret; - ret2 = ext3_journal_stop(handle); - if (!ret) - ret = ret2; + copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); + update_file_sizes(inode, pos, copied); + /* + * There may be allocated blocks outside of i_size because + * we failed to copy some data. Prepare for truncate. + */ + if (pos + len > inode->i_size) + ext3_orphan_add(handle, inode); + ret = ext3_journal_stop(handle); unlock_page(page); page_cache_release(page); + if (pos + len > inode->i_size) + vmtruncate(inode, inode->i_size); return ret ? ret : copied; } @@ -1338,15 +1343,23 @@ static int ext3_journalled_write_end(struct file *file, if (copied < len) { if (!PageUptodate(page)) copied = 0; - page_zero_new_buffers(page, from+copied, to); + page_zero_new_buffers(page, from + copied, to); + to = from + copied; } ret = walk_page_buffers(handle, page_buffers(page), from, to, &partial, write_end_fn); if (!partial) SetPageUptodate(page); - if (pos+copied > inode->i_size) - i_size_write(inode, pos+copied); + + if (pos + copied > inode->i_size) + i_size_write(inode, pos + copied); + /* + * There may be allocated blocks outside of i_size because + * we failed to copy some data. Prepare for truncate. + */ + if (pos + len > inode->i_size) + ext3_orphan_add(handle, inode); EXT3_I(inode)->i_state |= EXT3_STATE_JDATA; if (inode->i_size > EXT3_I(inode)->i_disksize) { EXT3_I(inode)->i_disksize = inode->i_size; @@ -1361,6 +1374,8 @@ static int ext3_journalled_write_end(struct file *file, unlock_page(page); page_cache_release(page); + if (pos + len > inode->i_size) + vmtruncate(inode, inode->i_size); return ret ? ret : copied; } @@ -1428,17 +1443,11 @@ static int bput_one(handle_t *handle, struct buffer_head *bh) return 0; } -static int journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh) -{ - if (buffer_mapped(bh)) - return ext3_journal_dirty_data(handle, bh); - return 0; -} - static int buffer_unmapped(handle_t *handle, struct buffer_head *bh) { return !buffer_mapped(bh); } + /* * Note that we always start a transaction even if we're not journalling * data. This is to preserve ordering: any hole instantiation within @@ -1512,12 +1521,16 @@ static int ext3_ordered_writepage(struct page *page, if (!page_has_buffers(page)) { create_empty_buffers(page, inode->i_sb->s_blocksize, (1 << BH_Dirty)|(1 << BH_Uptodate)); - } else if (!walk_page_buffers(NULL, page_buffers(page), 0, PAGE_CACHE_SIZE, NULL, buffer_unmapped)) { - /* Provide NULL instead of get_block so that we catch bugs if buffers weren't really mapped */ - return block_write_full_page(page, NULL, wbc); + page_bufs = page_buffers(page); + } else { + page_bufs = page_buffers(page); + if (!walk_page_buffers(NULL, page_bufs, 0, PAGE_CACHE_SIZE, + NULL, buffer_unmapped)) { + /* Provide NULL get_block() to catch bugs if buffers + * weren't really mapped */ + return block_write_full_page(page, NULL, wbc); + } } - page_bufs = page_buffers(page); - handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode)); if (IS_ERR(handle)) { @@ -1572,6 +1585,15 @@ static int ext3_writeback_writepage(struct page *page, if (ext3_journal_current_handle()) goto out_fail; + if (page_has_buffers(page)) { + if (!walk_page_buffers(NULL, page_buffers(page), 0, + PAGE_CACHE_SIZE, NULL, buffer_unmapped)) { + /* Provide NULL get_block() to catch bugs if buffers + * weren't really mapped */ + return block_write_full_page(page, NULL, wbc); + } + } + handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode)); if (IS_ERR(handle)) { ret = PTR_ERR(handle); @@ -2354,6 +2376,9 @@ void ext3_truncate(struct inode *inode) if (!ext3_can_truncate(inode)) return; + if (inode->i_size == 0 && ext3_should_writeback_data(inode)) + ei->i_state |= EXT3_STATE_FLUSH_ON_CLOSE; + /* * We have to lock the EOF page here, because lock_page() nests * outside journal_start(). diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c index 5e86ce9a86e0..88974814783a 100644 --- a/fs/ext3/ioctl.c +++ b/fs/ext3/ioctl.c @@ -15,12 +15,11 @@ #include <linux/mount.h> #include <linux/time.h> #include <linux/compat.h> -#include <linux/smp_lock.h> #include <asm/uaccess.h> -int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, - unsigned long arg) +long ext3_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { + struct inode *inode = filp->f_dentry->d_inode; struct ext3_inode_info *ei = EXT3_I(inode); unsigned int flags; unsigned short rsv_window_size; @@ -39,29 +38,25 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, unsigned int oldflags; unsigned int jflag; + if (!is_owner_or_cap(inode)) + return -EACCES; + + if (get_user(flags, (int __user *) arg)) + return -EFAULT; + err = mnt_want_write(filp->f_path.mnt); if (err) return err; - if (!is_owner_or_cap(inode)) { - err = -EACCES; - goto flags_out; - } - - if (get_user(flags, (int __user *) arg)) { - err = -EFAULT; - goto flags_out; - } - flags = ext3_mask_flags(inode->i_mode, flags); mutex_lock(&inode->i_mutex); + /* Is it quota file? Do not allow user to mess with it */ - if (IS_NOQUOTA(inode)) { - mutex_unlock(&inode->i_mutex); - err = -EPERM; + err = -EPERM; + if (IS_NOQUOTA(inode)) goto flags_out; - } + oldflags = ei->i_flags; /* The JOURNAL_DATA flag is modifiable only by root */ @@ -74,11 +69,8 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, * This test looks nicer. Thanks to Pauline Middelink */ if ((flags ^ oldflags) & (EXT3_APPEND_FL | EXT3_IMMUTABLE_FL)) { - if (!capable(CAP_LINUX_IMMUTABLE)) { - mutex_unlock(&inode->i_mutex); - err = -EPERM; + if (!capable(CAP_LINUX_IMMUTABLE)) goto flags_out; - } } /* @@ -86,17 +78,12 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, * the relevant capability. */ if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL)) { - if (!capable(CAP_SYS_RESOURCE)) { - mutex_unlock(&inode->i_mutex); - err = -EPERM; + if (!capable(CAP_SYS_RESOURCE)) goto flags_out; - } } - handle = ext3_journal_start(inode, 1); if (IS_ERR(handle)) { - mutex_unlock(&inode->i_mutex); err = PTR_ERR(handle); goto flags_out; } @@ -116,15 +103,13 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, err = ext3_mark_iloc_dirty(handle, inode, &iloc); flags_err: ext3_journal_stop(handle); - if (err) { - mutex_unlock(&inode->i_mutex); - return err; - } + if (err) + goto flags_out; if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL)) err = ext3_change_inode_journal_flag(inode, jflag); - mutex_unlock(&inode->i_mutex); flags_out: + mutex_unlock(&inode->i_mutex); mnt_drop_write(filp->f_path.mnt); return err; } @@ -140,6 +125,7 @@ flags_out: if (!is_owner_or_cap(inode)) return -EPERM; + err = mnt_want_write(filp->f_path.mnt); if (err) return err; @@ -147,6 +133,7 @@ flags_out: err = -EFAULT; goto setversion_out; } + handle = ext3_journal_start(inode, 1); if (IS_ERR(handle)) { err = PTR_ERR(handle); @@ -299,9 +286,6 @@ group_add_out: #ifdef CONFIG_COMPAT long ext3_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { - struct inode *inode = file->f_path.dentry->d_inode; - int ret; - /* These are just misnamed, they actually get/put from/to user an int */ switch (cmd) { case EXT3_IOC32_GETFLAGS: @@ -341,9 +325,6 @@ long ext3_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) default: return -ENOIOCTLCMD; } - lock_kernel(); - ret = ext3_ioctl(inode, file, cmd, (unsigned long) compat_ptr(arg)); - unlock_kernel(); - return ret; + return ext3_ioctl(file, cmd, (unsigned long) compat_ptr(arg)); } #endif diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index e2fc63cbba8b..6ff7b9730234 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -161,12 +161,12 @@ static struct dx_frame *dx_probe(struct qstr *entry, struct dx_frame *frame, int *err); static void dx_release (struct dx_frame *frames); -static int dx_make_map (struct ext3_dir_entry_2 *de, int size, +static int dx_make_map(struct ext3_dir_entry_2 *de, unsigned blocksize, struct dx_hash_info *hinfo, struct dx_map_entry map[]); static void dx_sort_map(struct dx_map_entry *map, unsigned count); static struct ext3_dir_entry_2 *dx_move_dirents (char *from, char *to, struct dx_map_entry *offsets, int count); -static struct ext3_dir_entry_2* dx_pack_dirents (char *base, int size); +static struct ext3_dir_entry_2 *dx_pack_dirents(char *base, unsigned blocksize); static void dx_insert_block (struct dx_frame *frame, u32 hash, u32 block); static int ext3_htree_next_block(struct inode *dir, __u32 hash, struct dx_frame *frame, @@ -708,14 +708,14 @@ errout: * Create map of hash values, offsets, and sizes, stored at end of block. * Returns number of entries mapped. */ -static int dx_make_map (struct ext3_dir_entry_2 *de, int size, - struct dx_hash_info *hinfo, struct dx_map_entry *map_tail) +static int dx_make_map(struct ext3_dir_entry_2 *de, unsigned blocksize, + struct dx_hash_info *hinfo, struct dx_map_entry *map_tail) { int count = 0; char *base = (char *) de; struct dx_hash_info h = *hinfo; - while ((char *) de < base + size) + while ((char *) de < base + blocksize) { if (de->name_len && de->inode) { ext3fs_dirhash(de->name, de->name_len, &h); @@ -1047,8 +1047,16 @@ static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry, str return ERR_PTR(-EIO); } inode = ext3_iget(dir->i_sb, ino); - if (IS_ERR(inode)) - return ERR_CAST(inode); + if (unlikely(IS_ERR(inode))) { + if (PTR_ERR(inode) == -ESTALE) { + ext3_error(dir->i_sb, __func__, + "deleted inode referenced: %lu", + ino); + return ERR_PTR(-EIO); + } else { + return ERR_CAST(inode); + } + } } return d_splice_alias(inode, dentry); } @@ -1120,13 +1128,14 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count) * Compact each dir entry in the range to the minimal rec_len. * Returns pointer to last entry in range. */ -static struct ext3_dir_entry_2* dx_pack_dirents(char *base, int size) +static struct ext3_dir_entry_2 *dx_pack_dirents(char *base, unsigned blocksize) { - struct ext3_dir_entry_2 *next, *to, *prev, *de = (struct ext3_dir_entry_2 *) base; + struct ext3_dir_entry_2 *next, *to, *prev; + struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *)base; unsigned rec_len = 0; prev = to = de; - while ((char*)de < base + size) { + while ((char *)de < base + blocksize) { next = ext3_next_entry(de); if (de->inode && de->name_len) { rec_len = EXT3_DIR_REC_LEN(de->name_len); @@ -2265,7 +2274,7 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry, struct inode * old_inode, * new_inode; struct buffer_head * old_bh, * new_bh, * dir_bh; struct ext3_dir_entry_2 * old_de, * new_de; - int retval; + int retval, flush_file = 0; old_bh = new_bh = dir_bh = NULL; @@ -2401,6 +2410,8 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry, ext3_mark_inode_dirty(handle, new_inode); if (!new_inode->i_nlink) ext3_orphan_add(handle, new_inode); + if (ext3_should_writeback_data(new_inode)) + flush_file = 1; } retval = 0; @@ -2409,6 +2420,8 @@ end_rename: brelse (old_bh); brelse (new_bh); ext3_journal_stop(handle); + if (retval == 0 && flush_file) + filemap_flush(old_inode->i_mapping); return retval; } diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 9e5b8e387e1e..599dbfe504c3 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -44,6 +44,12 @@ #include "acl.h" #include "namei.h" +#ifdef CONFIG_EXT3_DEFAULTS_TO_ORDERED + #define EXT3_MOUNT_DEFAULT_DATA_MODE EXT3_MOUNT_ORDERED_DATA +#else + #define EXT3_MOUNT_DEFAULT_DATA_MODE EXT3_MOUNT_WRITEBACK_DATA +#endif + static int ext3_load_journal(struct super_block *, struct ext3_super_block *, unsigned long journal_devnum); static int ext3_create_journal(struct super_block *, struct ext3_super_block *, @@ -1919,7 +1925,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) cope, else JOURNAL_DATA */ if (journal_check_available_features (sbi->s_journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE)) - set_opt(sbi->s_mount_opt, ORDERED_DATA); + set_opt(sbi->s_mount_opt, DEFAULT_DATA_MODE); else set_opt(sbi->s_mount_opt, JOURNAL_DATA); break; diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig index 7505482a08fa..418b6f3b0ae8 100644 --- a/fs/ext4/Kconfig +++ b/fs/ext4/Kconfig @@ -18,7 +18,7 @@ config EXT4_FS filesystem; while there will be some performance gains from the delayed allocation and inode table readahead, the best performance gains will require enabling ext4 features in the - filesystem, or formating a new filesystem as an ext4 + filesystem, or formatting a new filesystem as an ext4 filesystem initially. To compile this file system support as a module, choose M here. The diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index 694ed6fadcc8..647e0d65a284 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -323,7 +323,7 @@ ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) return PTR_ERR(acl); } if (!acl) - inode->i_mode &= ~current->fs->umask; + inode->i_mode &= ~current_umask(); } if (test_opt(inode->i_sb, POSIX_ACL) && acl) { struct posix_acl *clone; diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index ac77d8b8251d..6132353dcf62 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -342,7 +342,7 @@ static int ext4_valid_extent_idx(struct inode *inode, ext4_fsblk_t block = idx_pblock(ext_idx); struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; if (unlikely(block < le32_to_cpu(es->s_first_data_block) || - (block > ext4_blocks_count(es)))) + (block >= ext4_blocks_count(es)))) return 0; else return 1; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index a2e7952bc5f9..c6bd6ced3bb7 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -372,16 +372,16 @@ static int ext4_block_to_path(struct inode *inode, } static int __ext4_check_blockref(const char *function, struct inode *inode, - unsigned int *p, unsigned int max) { + __le32 *p, unsigned int max) { unsigned int maxblocks = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es); - unsigned int *bref = p; + __le32 *bref = p; while (bref < p+max) { - if (unlikely(*bref >= maxblocks)) { + if (unlikely(le32_to_cpu(*bref) >= maxblocks)) { ext4_error(inode->i_sb, function, "block reference %u >= max (%u) " "in inode #%lu, offset=%d", - *bref, maxblocks, + le32_to_cpu(*bref), maxblocks, inode->i_ino, (int)(bref-p)); return -EIO; } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 9987bba99db3..2958f4e6f222 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2508,6 +2508,15 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) if (EXT4_BLOCKS_PER_GROUP(sb) == 0) goto cantfind_ext4; + /* check blocks count against device size */ + blocks_count = sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits; + if (blocks_count && ext4_blocks_count(es) > blocks_count) { + printk(KERN_WARNING "EXT4-fs: bad geometry: block count %llu " + "exceeds size of device (%llu blocks)\n", + ext4_blocks_count(es), blocks_count); + goto failed_mount; + } + /* * It makes no sense for the first data block to be beyond the end * of the filesystem. diff --git a/fs/fat/inode.c b/fs/fat/inode.c index de0004fe6e00..296785a0dec8 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -523,7 +523,9 @@ static int fat_remount(struct super_block *sb, int *flags, char *data) static int fat_statfs(struct dentry *dentry, struct kstatfs *buf) { - struct msdos_sb_info *sbi = MSDOS_SB(dentry->d_sb); + struct super_block *sb = dentry->d_sb; + struct msdos_sb_info *sbi = MSDOS_SB(sb); + u64 id = huge_encode_dev(sb->s_bdev->bd_dev); /* If the count of free cluster is still unknown, counts it here. */ if (sbi->free_clusters == -1 || !sbi->free_clus_valid) { @@ -537,6 +539,8 @@ static int fat_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_blocks = sbi->max_cluster - FAT_START_ENT; buf->f_bfree = sbi->free_clusters; buf->f_bavail = sbi->free_clusters; + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); buf->f_namelen = sbi->options.isvfat ? 260 : 12; return 0; @@ -930,7 +934,7 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug, opts->fs_uid = current_uid(); opts->fs_gid = current_gid(); - opts->fs_fmask = opts->fs_dmask = current->fs->umask; + opts->fs_fmask = current_umask(); opts->allow_utime = -1; opts->codepage = fat_default_codepage; opts->iocharset = fat_default_iocharset; diff --git a/fs/file_table.c b/fs/file_table.c index b74a8e1da913..54018fe48840 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -169,7 +169,6 @@ struct file *alloc_file(struct vfsmount *mnt, struct dentry *dentry, fmode_t mode, const struct file_operations *fop) { struct file *file; - struct path; file = get_empty_filp(); if (!file) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index e3fe9918faaf..91013ff7dd53 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -196,7 +196,7 @@ static void redirty_tail(struct inode *inode) struct inode *tail_inode; tail_inode = list_entry(sb->s_dirty.next, struct inode, i_list); - if (!time_after_eq(inode->dirtied_when, + if (time_before(inode->dirtied_when, tail_inode->dirtied_when)) inode->dirtied_when = jiffies; } @@ -220,6 +220,21 @@ static void inode_sync_complete(struct inode *inode) wake_up_bit(&inode->i_state, __I_SYNC); } +static bool inode_dirtied_after(struct inode *inode, unsigned long t) +{ + bool ret = time_after(inode->dirtied_when, t); +#ifndef CONFIG_64BIT + /* + * For inodes being constantly redirtied, dirtied_when can get stuck. + * It _appears_ to be in the future, but is actually in distant past. + * This test is necessary to prevent such wrapped-around relative times + * from permanently stopping the whole pdflush writeback. + */ + ret = ret && time_before_eq(inode->dirtied_when, jiffies); +#endif + return ret; +} + /* * Move expired dirty inodes from @delaying_queue to @dispatch_queue. */ @@ -231,7 +246,7 @@ static void move_expired_inodes(struct list_head *delaying_queue, struct inode *inode = list_entry(delaying_queue->prev, struct inode, i_list); if (older_than_this && - time_after(inode->dirtied_when, *older_than_this)) + inode_dirtied_after(inode, *older_than_this)) break; list_move(&inode->i_list, dispatch_queue); } @@ -420,7 +435,7 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) * If older_than_this is non-NULL, then only write out inodes which * had their first dirtying at a time earlier than *older_than_this. * - * If we're a pdlfush thread, then implement pdflush collision avoidance + * If we're a pdflush thread, then implement pdflush collision avoidance * against the entire list. * * If `bdi' is non-zero then we're being asked to writeback a specific queue. @@ -492,8 +507,11 @@ void generic_sync_sb_inodes(struct super_block *sb, continue; /* blockdev has wrong queue */ } - /* Was this inode dirtied after sync_sb_inodes was called? */ - if (time_after(inode->dirtied_when, start)) + /* + * Was this inode dirtied after sync_sb_inodes was called? + * This keeps sync from extra jobs and livelock. + */ + if (inode_dirtied_after(inode, start)) break; /* Is another pdflush already flushing this queue? */ @@ -538,7 +556,8 @@ void generic_sync_sb_inodes(struct super_block *sb, list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { struct address_space *mapping; - if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) + if (inode->i_state & + (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW)) continue; mapping = inode->i_mapping; if (mapping->nrpages == 0) diff --git a/fs/fs_struct.c b/fs/fs_struct.c new file mode 100644 index 000000000000..eee059052db5 --- /dev/null +++ b/fs/fs_struct.c @@ -0,0 +1,177 @@ +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/fs.h> +#include <linux/path.h> +#include <linux/slab.h> +#include <linux/fs_struct.h> + +/* + * Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values. + * It can block. + */ +void set_fs_root(struct fs_struct *fs, struct path *path) +{ + struct path old_root; + + write_lock(&fs->lock); + old_root = fs->root; + fs->root = *path; + path_get(path); + write_unlock(&fs->lock); + if (old_root.dentry) + path_put(&old_root); +} + +/* + * Replace the fs->{pwdmnt,pwd} with {mnt,dentry}. Put the old values. + * It can block. + */ +void set_fs_pwd(struct fs_struct *fs, struct path *path) +{ + struct path old_pwd; + + write_lock(&fs->lock); + old_pwd = fs->pwd; + fs->pwd = *path; + path_get(path); + write_unlock(&fs->lock); + + if (old_pwd.dentry) + path_put(&old_pwd); +} + +void chroot_fs_refs(struct path *old_root, struct path *new_root) +{ + struct task_struct *g, *p; + struct fs_struct *fs; + int count = 0; + + read_lock(&tasklist_lock); + do_each_thread(g, p) { + task_lock(p); + fs = p->fs; + if (fs) { + write_lock(&fs->lock); + if (fs->root.dentry == old_root->dentry + && fs->root.mnt == old_root->mnt) { + path_get(new_root); + fs->root = *new_root; + count++; + } + if (fs->pwd.dentry == old_root->dentry + && fs->pwd.mnt == old_root->mnt) { + path_get(new_root); + fs->pwd = *new_root; + count++; + } + write_unlock(&fs->lock); + } + task_unlock(p); + } while_each_thread(g, p); + read_unlock(&tasklist_lock); + while (count--) + path_put(old_root); +} + +void free_fs_struct(struct fs_struct *fs) +{ + path_put(&fs->root); + path_put(&fs->pwd); + kmem_cache_free(fs_cachep, fs); +} + +void exit_fs(struct task_struct *tsk) +{ + struct fs_struct *fs = tsk->fs; + + if (fs) { + int kill; + task_lock(tsk); + write_lock(&fs->lock); + tsk->fs = NULL; + kill = !--fs->users; + write_unlock(&fs->lock); + task_unlock(tsk); + if (kill) + free_fs_struct(fs); + } +} + +struct fs_struct *copy_fs_struct(struct fs_struct *old) +{ + struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL); + /* We don't need to lock fs - think why ;-) */ + if (fs) { + fs->users = 1; + fs->in_exec = 0; + rwlock_init(&fs->lock); + fs->umask = old->umask; + read_lock(&old->lock); + fs->root = old->root; + path_get(&old->root); + fs->pwd = old->pwd; + path_get(&old->pwd); + read_unlock(&old->lock); + } + return fs; +} + +int unshare_fs_struct(void) +{ + struct fs_struct *fs = current->fs; + struct fs_struct *new_fs = copy_fs_struct(fs); + int kill; + + if (!new_fs) + return -ENOMEM; + + task_lock(current); + write_lock(&fs->lock); + kill = !--fs->users; + current->fs = new_fs; + write_unlock(&fs->lock); + task_unlock(current); + + if (kill) + free_fs_struct(fs); + + return 0; +} +EXPORT_SYMBOL_GPL(unshare_fs_struct); + +int current_umask(void) +{ + return current->fs->umask; +} +EXPORT_SYMBOL(current_umask); + +/* to be mentioned only in INIT_TASK */ +struct fs_struct init_fs = { + .users = 1, + .lock = __RW_LOCK_UNLOCKED(init_fs.lock), + .umask = 0022, +}; + +void daemonize_fs_struct(void) +{ + struct fs_struct *fs = current->fs; + + if (fs) { + int kill; + + task_lock(current); + + write_lock(&init_fs.lock); + init_fs.users++; + write_unlock(&init_fs.lock); + + write_lock(&fs->lock); + current->fs = &init_fs; + kill = !--fs->users; + write_unlock(&fs->lock); + + task_unlock(current); + if (kill) + free_fs_struct(fs); + } +} diff --git a/fs/fscache/Kconfig b/fs/fscache/Kconfig new file mode 100644 index 000000000000..9bbb8ce7bea0 --- /dev/null +++ b/fs/fscache/Kconfig @@ -0,0 +1,56 @@ + +config FSCACHE + tristate "General filesystem local caching manager" + depends on EXPERIMENTAL + select SLOW_WORK + help + This option enables a generic filesystem caching manager that can be + used by various network and other filesystems to cache data locally. + Different sorts of caches can be plugged in, depending on the + resources available. + + See Documentation/filesystems/caching/fscache.txt for more information. + +config FSCACHE_STATS + bool "Gather statistical information on local caching" + depends on FSCACHE && PROC_FS + help + This option causes statistical information to be gathered on local + caching and exported through file: + + /proc/fs/fscache/stats + + The gathering of statistics adds a certain amount of overhead to + execution as there are a quite a few stats gathered, and on a + multi-CPU system these may be on cachelines that keep bouncing + between CPUs. On the other hand, the stats are very useful for + debugging purposes. Saying 'Y' here is recommended. + + See Documentation/filesystems/caching/fscache.txt for more information. + +config FSCACHE_HISTOGRAM + bool "Gather latency information on local caching" + depends on FSCACHE && PROC_FS + help + This option causes latency information to be gathered on local + caching and exported through file: + + /proc/fs/fscache/histogram + + The generation of this histogram adds a certain amount of overhead to + execution as there are a number of points at which data is gathered, + and on a multi-CPU system these may be on cachelines that keep + bouncing between CPUs. On the other hand, the histogram may be + useful for debugging purposes. Saying 'N' here is recommended. + + See Documentation/filesystems/caching/fscache.txt for more information. + +config FSCACHE_DEBUG + bool "Debug FS-Cache" + depends on FSCACHE + help + This permits debugging to be dynamically enabled in the local caching + management module. If this is set, the debugging output may be + enabled by setting bits in /sys/modules/fscache/parameter/debug. + + See Documentation/filesystems/caching/fscache.txt for more information. diff --git a/fs/fscache/Makefile b/fs/fscache/Makefile new file mode 100644 index 000000000000..91571b95aacc --- /dev/null +++ b/fs/fscache/Makefile @@ -0,0 +1,19 @@ +# +# Makefile for general filesystem caching code +# + +fscache-y := \ + cache.o \ + cookie.o \ + fsdef.o \ + main.o \ + netfs.o \ + object.o \ + operation.o \ + page.o + +fscache-$(CONFIG_PROC_FS) += proc.o +fscache-$(CONFIG_FSCACHE_STATS) += stats.o +fscache-$(CONFIG_FSCACHE_HISTOGRAM) += histogram.o + +obj-$(CONFIG_FSCACHE) := fscache.o diff --git a/fs/fscache/cache.c b/fs/fscache/cache.c new file mode 100644 index 000000000000..e21985bbb1fb --- /dev/null +++ b/fs/fscache/cache.c @@ -0,0 +1,415 @@ +/* FS-Cache cache handling + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define FSCACHE_DEBUG_LEVEL CACHE +#include <linux/module.h> +#include <linux/slab.h> +#include "internal.h" + +LIST_HEAD(fscache_cache_list); +DECLARE_RWSEM(fscache_addremove_sem); +DECLARE_WAIT_QUEUE_HEAD(fscache_cache_cleared_wq); +EXPORT_SYMBOL(fscache_cache_cleared_wq); + +static LIST_HEAD(fscache_cache_tag_list); + +/* + * look up a cache tag + */ +struct fscache_cache_tag *__fscache_lookup_cache_tag(const char *name) +{ + struct fscache_cache_tag *tag, *xtag; + + /* firstly check for the existence of the tag under read lock */ + down_read(&fscache_addremove_sem); + + list_for_each_entry(tag, &fscache_cache_tag_list, link) { + if (strcmp(tag->name, name) == 0) { + atomic_inc(&tag->usage); + up_read(&fscache_addremove_sem); + return tag; + } + } + + up_read(&fscache_addremove_sem); + + /* the tag does not exist - create a candidate */ + xtag = kzalloc(sizeof(*xtag) + strlen(name) + 1, GFP_KERNEL); + if (!xtag) + /* return a dummy tag if out of memory */ + return ERR_PTR(-ENOMEM); + + atomic_set(&xtag->usage, 1); + strcpy(xtag->name, name); + + /* write lock, search again and add if still not present */ + down_write(&fscache_addremove_sem); + + list_for_each_entry(tag, &fscache_cache_tag_list, link) { + if (strcmp(tag->name, name) == 0) { + atomic_inc(&tag->usage); + up_write(&fscache_addremove_sem); + kfree(xtag); + return tag; + } + } + + list_add_tail(&xtag->link, &fscache_cache_tag_list); + up_write(&fscache_addremove_sem); + return xtag; +} + +/* + * release a reference to a cache tag + */ +void __fscache_release_cache_tag(struct fscache_cache_tag *tag) +{ + if (tag != ERR_PTR(-ENOMEM)) { + down_write(&fscache_addremove_sem); + + if (atomic_dec_and_test(&tag->usage)) + list_del_init(&tag->link); + else + tag = NULL; + + up_write(&fscache_addremove_sem); + + kfree(tag); + } +} + +/* + * select a cache in which to store an object + * - the cache addremove semaphore must be at least read-locked by the caller + * - the object will never be an index + */ +struct fscache_cache *fscache_select_cache_for_object( + struct fscache_cookie *cookie) +{ + struct fscache_cache_tag *tag; + struct fscache_object *object; + struct fscache_cache *cache; + + _enter(""); + + if (list_empty(&fscache_cache_list)) { + _leave(" = NULL [no cache]"); + return NULL; + } + + /* we check the parent to determine the cache to use */ + spin_lock(&cookie->lock); + + /* the first in the parent's backing list should be the preferred + * cache */ + if (!hlist_empty(&cookie->backing_objects)) { + object = hlist_entry(cookie->backing_objects.first, + struct fscache_object, cookie_link); + + cache = object->cache; + if (object->state >= FSCACHE_OBJECT_DYING || + test_bit(FSCACHE_IOERROR, &cache->flags)) + cache = NULL; + + spin_unlock(&cookie->lock); + _leave(" = %p [parent]", cache); + return cache; + } + + /* the parent is unbacked */ + if (cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX) { + /* cookie not an index and is unbacked */ + spin_unlock(&cookie->lock); + _leave(" = NULL [cookie ub,ni]"); + return NULL; + } + + spin_unlock(&cookie->lock); + + if (!cookie->def->select_cache) + goto no_preference; + + /* ask the netfs for its preference */ + tag = cookie->def->select_cache(cookie->parent->netfs_data, + cookie->netfs_data); + if (!tag) + goto no_preference; + + if (tag == ERR_PTR(-ENOMEM)) { + _leave(" = NULL [nomem tag]"); + return NULL; + } + + if (!tag->cache) { + _leave(" = NULL [unbacked tag]"); + return NULL; + } + + if (test_bit(FSCACHE_IOERROR, &tag->cache->flags)) + return NULL; + + _leave(" = %p [specific]", tag->cache); + return tag->cache; + +no_preference: + /* netfs has no preference - just select first cache */ + cache = list_entry(fscache_cache_list.next, + struct fscache_cache, link); + _leave(" = %p [first]", cache); + return cache; +} + +/** + * fscache_init_cache - Initialise a cache record + * @cache: The cache record to be initialised + * @ops: The cache operations to be installed in that record + * @idfmt: Format string to define identifier + * @...: sprintf-style arguments + * + * Initialise a record of a cache and fill in the name. + * + * See Documentation/filesystems/caching/backend-api.txt for a complete + * description. + */ +void fscache_init_cache(struct fscache_cache *cache, + const struct fscache_cache_ops *ops, + const char *idfmt, + ...) +{ + va_list va; + + memset(cache, 0, sizeof(*cache)); + + cache->ops = ops; + + va_start(va, idfmt); + vsnprintf(cache->identifier, sizeof(cache->identifier), idfmt, va); + va_end(va); + + INIT_WORK(&cache->op_gc, fscache_operation_gc); + INIT_LIST_HEAD(&cache->link); + INIT_LIST_HEAD(&cache->object_list); + INIT_LIST_HEAD(&cache->op_gc_list); + spin_lock_init(&cache->object_list_lock); + spin_lock_init(&cache->op_gc_list_lock); +} +EXPORT_SYMBOL(fscache_init_cache); + +/** + * fscache_add_cache - Declare a cache as being open for business + * @cache: The record describing the cache + * @ifsdef: The record of the cache object describing the top-level index + * @tagname: The tag describing this cache + * + * Add a cache to the system, making it available for netfs's to use. + * + * See Documentation/filesystems/caching/backend-api.txt for a complete + * description. + */ +int fscache_add_cache(struct fscache_cache *cache, + struct fscache_object *ifsdef, + const char *tagname) +{ + struct fscache_cache_tag *tag; + + BUG_ON(!cache->ops); + BUG_ON(!ifsdef); + + cache->flags = 0; + ifsdef->event_mask = ULONG_MAX & ~(1 << FSCACHE_OBJECT_EV_CLEARED); + ifsdef->state = FSCACHE_OBJECT_ACTIVE; + + if (!tagname) + tagname = cache->identifier; + + BUG_ON(!tagname[0]); + + _enter("{%s.%s},,%s", cache->ops->name, cache->identifier, tagname); + + /* we use the cache tag to uniquely identify caches */ + tag = __fscache_lookup_cache_tag(tagname); + if (IS_ERR(tag)) + goto nomem; + + if (test_and_set_bit(FSCACHE_TAG_RESERVED, &tag->flags)) + goto tag_in_use; + + cache->kobj = kobject_create_and_add(tagname, fscache_root); + if (!cache->kobj) + goto error; + + ifsdef->cookie = &fscache_fsdef_index; + ifsdef->cache = cache; + cache->fsdef = ifsdef; + + down_write(&fscache_addremove_sem); + + tag->cache = cache; + cache->tag = tag; + + /* add the cache to the list */ + list_add(&cache->link, &fscache_cache_list); + + /* add the cache's netfs definition index object to the cache's + * list */ + spin_lock(&cache->object_list_lock); + list_add_tail(&ifsdef->cache_link, &cache->object_list); + spin_unlock(&cache->object_list_lock); + + /* add the cache's netfs definition index object to the top level index + * cookie as a known backing object */ + spin_lock(&fscache_fsdef_index.lock); + + hlist_add_head(&ifsdef->cookie_link, + &fscache_fsdef_index.backing_objects); + + atomic_inc(&fscache_fsdef_index.usage); + + /* done */ + spin_unlock(&fscache_fsdef_index.lock); + up_write(&fscache_addremove_sem); + + printk(KERN_NOTICE "FS-Cache: Cache \"%s\" added (type %s)\n", + cache->tag->name, cache->ops->name); + kobject_uevent(cache->kobj, KOBJ_ADD); + + _leave(" = 0 [%s]", cache->identifier); + return 0; + +tag_in_use: + printk(KERN_ERR "FS-Cache: Cache tag '%s' already in use\n", tagname); + __fscache_release_cache_tag(tag); + _leave(" = -EXIST"); + return -EEXIST; + +error: + __fscache_release_cache_tag(tag); + _leave(" = -EINVAL"); + return -EINVAL; + +nomem: + _leave(" = -ENOMEM"); + return -ENOMEM; +} +EXPORT_SYMBOL(fscache_add_cache); + +/** + * fscache_io_error - Note a cache I/O error + * @cache: The record describing the cache + * + * Note that an I/O error occurred in a cache and that it should no longer be + * used for anything. This also reports the error into the kernel log. + * + * See Documentation/filesystems/caching/backend-api.txt for a complete + * description. + */ +void fscache_io_error(struct fscache_cache *cache) +{ + set_bit(FSCACHE_IOERROR, &cache->flags); + + printk(KERN_ERR "FS-Cache: Cache %s stopped due to I/O error\n", + cache->ops->name); +} +EXPORT_SYMBOL(fscache_io_error); + +/* + * request withdrawal of all the objects in a cache + * - all the objects being withdrawn are moved onto the supplied list + */ +static void fscache_withdraw_all_objects(struct fscache_cache *cache, + struct list_head *dying_objects) +{ + struct fscache_object *object; + + spin_lock(&cache->object_list_lock); + + while (!list_empty(&cache->object_list)) { + object = list_entry(cache->object_list.next, + struct fscache_object, cache_link); + list_move_tail(&object->cache_link, dying_objects); + + _debug("withdraw %p", object->cookie); + + spin_lock(&object->lock); + spin_unlock(&cache->object_list_lock); + fscache_raise_event(object, FSCACHE_OBJECT_EV_WITHDRAW); + spin_unlock(&object->lock); + + cond_resched(); + spin_lock(&cache->object_list_lock); + } + + spin_unlock(&cache->object_list_lock); +} + +/** + * fscache_withdraw_cache - Withdraw a cache from the active service + * @cache: The record describing the cache + * + * Withdraw a cache from service, unbinding all its cache objects from the + * netfs cookies they're currently representing. + * + * See Documentation/filesystems/caching/backend-api.txt for a complete + * description. + */ +void fscache_withdraw_cache(struct fscache_cache *cache) +{ + LIST_HEAD(dying_objects); + + _enter(""); + + printk(KERN_NOTICE "FS-Cache: Withdrawing cache \"%s\"\n", + cache->tag->name); + + /* make the cache unavailable for cookie acquisition */ + if (test_and_set_bit(FSCACHE_CACHE_WITHDRAWN, &cache->flags)) + BUG(); + + down_write(&fscache_addremove_sem); + list_del_init(&cache->link); + cache->tag->cache = NULL; + up_write(&fscache_addremove_sem); + + /* make sure all pages pinned by operations on behalf of the netfs are + * written to disk */ + cache->ops->sync_cache(cache); + + /* dissociate all the netfs pages backed by this cache from the block + * mappings in the cache */ + cache->ops->dissociate_pages(cache); + + /* we now have to destroy all the active objects pertaining to this + * cache - which we do by passing them off to thread pool to be + * disposed of */ + _debug("destroy"); + + fscache_withdraw_all_objects(cache, &dying_objects); + + /* wait for all extant objects to finish their outstanding operations + * and go away */ + _debug("wait for finish"); + wait_event(fscache_cache_cleared_wq, + atomic_read(&cache->object_count) == 0); + _debug("wait for clearance"); + wait_event(fscache_cache_cleared_wq, + list_empty(&cache->object_list)); + _debug("cleared"); + ASSERT(list_empty(&dying_objects)); + + kobject_put(cache->kobj); + + clear_bit(FSCACHE_TAG_RESERVED, &cache->tag->flags); + fscache_release_cache_tag(cache->tag); + cache->tag = NULL; + + _leave(""); +} +EXPORT_SYMBOL(fscache_withdraw_cache); diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c new file mode 100644 index 000000000000..72fd18f6c71f --- /dev/null +++ b/fs/fscache/cookie.c @@ -0,0 +1,500 @@ +/* netfs cookie management + * + * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * See Documentation/filesystems/caching/netfs-api.txt for more information on + * the netfs API. + */ + +#define FSCACHE_DEBUG_LEVEL COOKIE +#include <linux/module.h> +#include <linux/slab.h> +#include "internal.h" + +struct kmem_cache *fscache_cookie_jar; + +static atomic_t fscache_object_debug_id = ATOMIC_INIT(0); + +static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie); +static int fscache_alloc_object(struct fscache_cache *cache, + struct fscache_cookie *cookie); +static int fscache_attach_object(struct fscache_cookie *cookie, + struct fscache_object *object); + +/* + * initialise an cookie jar slab element prior to any use + */ +void fscache_cookie_init_once(void *_cookie) +{ + struct fscache_cookie *cookie = _cookie; + + memset(cookie, 0, sizeof(*cookie)); + spin_lock_init(&cookie->lock); + INIT_HLIST_HEAD(&cookie->backing_objects); +} + +/* + * request a cookie to represent an object (index, datafile, xattr, etc) + * - parent specifies the parent object + * - the top level index cookie for each netfs is stored in the fscache_netfs + * struct upon registration + * - def points to the definition + * - the netfs_data will be passed to the functions pointed to in *def + * - all attached caches will be searched to see if they contain this object + * - index objects aren't stored on disk until there's a dependent file that + * needs storing + * - other objects are stored in a selected cache immediately, and all the + * indices forming the path to it are instantiated if necessary + * - we never let on to the netfs about errors + * - we may set a negative cookie pointer, but that's okay + */ +struct fscache_cookie *__fscache_acquire_cookie( + struct fscache_cookie *parent, + const struct fscache_cookie_def *def, + void *netfs_data) +{ + struct fscache_cookie *cookie; + + BUG_ON(!def); + + _enter("{%s},{%s},%p", + parent ? (char *) parent->def->name : "<no-parent>", + def->name, netfs_data); + + fscache_stat(&fscache_n_acquires); + + /* if there's no parent cookie, then we don't create one here either */ + if (!parent) { + fscache_stat(&fscache_n_acquires_null); + _leave(" [no parent]"); + return NULL; + } + + /* validate the definition */ + BUG_ON(!def->get_key); + BUG_ON(!def->name[0]); + + BUG_ON(def->type == FSCACHE_COOKIE_TYPE_INDEX && + parent->def->type != FSCACHE_COOKIE_TYPE_INDEX); + + /* allocate and initialise a cookie */ + cookie = kmem_cache_alloc(fscache_cookie_jar, GFP_KERNEL); + if (!cookie) { + fscache_stat(&fscache_n_acquires_oom); + _leave(" [ENOMEM]"); + return NULL; + } + + atomic_set(&cookie->usage, 1); + atomic_set(&cookie->n_children, 0); + + atomic_inc(&parent->usage); + atomic_inc(&parent->n_children); + + cookie->def = def; + cookie->parent = parent; + cookie->netfs_data = netfs_data; + cookie->flags = 0; + + INIT_RADIX_TREE(&cookie->stores, GFP_NOFS); + + switch (cookie->def->type) { + case FSCACHE_COOKIE_TYPE_INDEX: + fscache_stat(&fscache_n_cookie_index); + break; + case FSCACHE_COOKIE_TYPE_DATAFILE: + fscache_stat(&fscache_n_cookie_data); + break; + default: + fscache_stat(&fscache_n_cookie_special); + break; + } + + /* if the object is an index then we need do nothing more here - we + * create indices on disk when we need them as an index may exist in + * multiple caches */ + if (cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX) { + if (fscache_acquire_non_index_cookie(cookie) < 0) { + atomic_dec(&parent->n_children); + __fscache_cookie_put(cookie); + fscache_stat(&fscache_n_acquires_nobufs); + _leave(" = NULL"); + return NULL; + } + } + + fscache_stat(&fscache_n_acquires_ok); + _leave(" = %p", cookie); + return cookie; +} +EXPORT_SYMBOL(__fscache_acquire_cookie); + +/* + * acquire a non-index cookie + * - this must make sure the index chain is instantiated and instantiate the + * object representation too + */ +static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie) +{ + struct fscache_object *object; + struct fscache_cache *cache; + uint64_t i_size; + int ret; + + _enter(""); + + cookie->flags = 1 << FSCACHE_COOKIE_UNAVAILABLE; + + /* now we need to see whether the backing objects for this cookie yet + * exist, if not there'll be nothing to search */ + down_read(&fscache_addremove_sem); + + if (list_empty(&fscache_cache_list)) { + up_read(&fscache_addremove_sem); + _leave(" = 0 [no caches]"); + return 0; + } + + /* select a cache in which to store the object */ + cache = fscache_select_cache_for_object(cookie->parent); + if (!cache) { + up_read(&fscache_addremove_sem); + fscache_stat(&fscache_n_acquires_no_cache); + _leave(" = -ENOMEDIUM [no cache]"); + return -ENOMEDIUM; + } + + _debug("cache %s", cache->tag->name); + + cookie->flags = + (1 << FSCACHE_COOKIE_LOOKING_UP) | + (1 << FSCACHE_COOKIE_CREATING) | + (1 << FSCACHE_COOKIE_NO_DATA_YET); + + /* ask the cache to allocate objects for this cookie and its parent + * chain */ + ret = fscache_alloc_object(cache, cookie); + if (ret < 0) { + up_read(&fscache_addremove_sem); + _leave(" = %d", ret); + return ret; + } + + /* pass on how big the object we're caching is supposed to be */ + cookie->def->get_attr(cookie->netfs_data, &i_size); + + spin_lock(&cookie->lock); + if (hlist_empty(&cookie->backing_objects)) { + spin_unlock(&cookie->lock); + goto unavailable; + } + + object = hlist_entry(cookie->backing_objects.first, + struct fscache_object, cookie_link); + + fscache_set_store_limit(object, i_size); + + /* initiate the process of looking up all the objects in the chain + * (done by fscache_initialise_object()) */ + fscache_enqueue_object(object); + + spin_unlock(&cookie->lock); + + /* we may be required to wait for lookup to complete at this point */ + if (!fscache_defer_lookup) { + _debug("non-deferred lookup %p", &cookie->flags); + wait_on_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP, + fscache_wait_bit, TASK_UNINTERRUPTIBLE); + _debug("complete"); + if (test_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags)) + goto unavailable; + } + + up_read(&fscache_addremove_sem); + _leave(" = 0 [deferred]"); + return 0; + +unavailable: + up_read(&fscache_addremove_sem); + _leave(" = -ENOBUFS"); + return -ENOBUFS; +} + +/* + * recursively allocate cache object records for a cookie/cache combination + * - caller must be holding the addremove sem + */ +static int fscache_alloc_object(struct fscache_cache *cache, + struct fscache_cookie *cookie) +{ + struct fscache_object *object; + struct hlist_node *_n; + int ret; + + _enter("%p,%p{%s}", cache, cookie, cookie->def->name); + + spin_lock(&cookie->lock); + hlist_for_each_entry(object, _n, &cookie->backing_objects, + cookie_link) { + if (object->cache == cache) + goto object_already_extant; + } + spin_unlock(&cookie->lock); + + /* ask the cache to allocate an object (we may end up with duplicate + * objects at this stage, but we sort that out later) */ + object = cache->ops->alloc_object(cache, cookie); + if (IS_ERR(object)) { + fscache_stat(&fscache_n_object_no_alloc); + ret = PTR_ERR(object); + goto error; + } + + fscache_stat(&fscache_n_object_alloc); + + object->debug_id = atomic_inc_return(&fscache_object_debug_id); + + _debug("ALLOC OBJ%x: %s {%lx}", + object->debug_id, cookie->def->name, object->events); + + ret = fscache_alloc_object(cache, cookie->parent); + if (ret < 0) + goto error_put; + + /* only attach if we managed to allocate all we needed, otherwise + * discard the object we just allocated and instead use the one + * attached to the cookie */ + if (fscache_attach_object(cookie, object) < 0) + cache->ops->put_object(object); + + _leave(" = 0"); + return 0; + +object_already_extant: + ret = -ENOBUFS; + if (object->state >= FSCACHE_OBJECT_DYING) { + spin_unlock(&cookie->lock); + goto error; + } + spin_unlock(&cookie->lock); + _leave(" = 0 [found]"); + return 0; + +error_put: + cache->ops->put_object(object); +error: + _leave(" = %d", ret); + return ret; +} + +/* + * attach a cache object to a cookie + */ +static int fscache_attach_object(struct fscache_cookie *cookie, + struct fscache_object *object) +{ + struct fscache_object *p; + struct fscache_cache *cache = object->cache; + struct hlist_node *_n; + int ret; + + _enter("{%s},{OBJ%x}", cookie->def->name, object->debug_id); + + spin_lock(&cookie->lock); + + /* there may be multiple initial creations of this object, but we only + * want one */ + ret = -EEXIST; + hlist_for_each_entry(p, _n, &cookie->backing_objects, cookie_link) { + if (p->cache == object->cache) { + if (p->state >= FSCACHE_OBJECT_DYING) + ret = -ENOBUFS; + goto cant_attach_object; + } + } + + /* pin the parent object */ + spin_lock_nested(&cookie->parent->lock, 1); + hlist_for_each_entry(p, _n, &cookie->parent->backing_objects, + cookie_link) { + if (p->cache == object->cache) { + if (p->state >= FSCACHE_OBJECT_DYING) { + ret = -ENOBUFS; + spin_unlock(&cookie->parent->lock); + goto cant_attach_object; + } + object->parent = p; + spin_lock(&p->lock); + p->n_children++; + spin_unlock(&p->lock); + break; + } + } + spin_unlock(&cookie->parent->lock); + + /* attach to the cache's object list */ + if (list_empty(&object->cache_link)) { + spin_lock(&cache->object_list_lock); + list_add(&object->cache_link, &cache->object_list); + spin_unlock(&cache->object_list_lock); + } + + /* attach to the cookie */ + object->cookie = cookie; + atomic_inc(&cookie->usage); + hlist_add_head(&object->cookie_link, &cookie->backing_objects); + ret = 0; + +cant_attach_object: + spin_unlock(&cookie->lock); + _leave(" = %d", ret); + return ret; +} + +/* + * update the index entries backing a cookie + */ +void __fscache_update_cookie(struct fscache_cookie *cookie) +{ + struct fscache_object *object; + struct hlist_node *_p; + + fscache_stat(&fscache_n_updates); + + if (!cookie) { + fscache_stat(&fscache_n_updates_null); + _leave(" [no cookie]"); + return; + } + + _enter("{%s}", cookie->def->name); + + BUG_ON(!cookie->def->get_aux); + + spin_lock(&cookie->lock); + + /* update the index entry on disk in each cache backing this cookie */ + hlist_for_each_entry(object, _p, + &cookie->backing_objects, cookie_link) { + fscache_raise_event(object, FSCACHE_OBJECT_EV_UPDATE); + } + + spin_unlock(&cookie->lock); + _leave(""); +} +EXPORT_SYMBOL(__fscache_update_cookie); + +/* + * release a cookie back to the cache + * - the object will be marked as recyclable on disk if retire is true + * - all dependents of this cookie must have already been unregistered + * (indices/files/pages) + */ +void __fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire) +{ + struct fscache_cache *cache; + struct fscache_object *object; + unsigned long event; + + fscache_stat(&fscache_n_relinquishes); + + if (!cookie) { + fscache_stat(&fscache_n_relinquishes_null); + _leave(" [no cookie]"); + return; + } + + _enter("%p{%s,%p},%d", + cookie, cookie->def->name, cookie->netfs_data, retire); + + if (atomic_read(&cookie->n_children) != 0) { + printk(KERN_ERR "FS-Cache: Cookie '%s' still has children\n", + cookie->def->name); + BUG(); + } + + /* wait for the cookie to finish being instantiated (or to fail) */ + if (test_bit(FSCACHE_COOKIE_CREATING, &cookie->flags)) { + fscache_stat(&fscache_n_relinquishes_waitcrt); + wait_on_bit(&cookie->flags, FSCACHE_COOKIE_CREATING, + fscache_wait_bit, TASK_UNINTERRUPTIBLE); + } + + event = retire ? FSCACHE_OBJECT_EV_RETIRE : FSCACHE_OBJECT_EV_RELEASE; + + /* detach pointers back to the netfs */ + spin_lock(&cookie->lock); + + cookie->netfs_data = NULL; + cookie->def = NULL; + + /* break links with all the active objects */ + while (!hlist_empty(&cookie->backing_objects)) { + object = hlist_entry(cookie->backing_objects.first, + struct fscache_object, + cookie_link); + + _debug("RELEASE OBJ%x", object->debug_id); + + /* detach each cache object from the object cookie */ + spin_lock(&object->lock); + hlist_del_init(&object->cookie_link); + + cache = object->cache; + object->cookie = NULL; + fscache_raise_event(object, event); + spin_unlock(&object->lock); + + if (atomic_dec_and_test(&cookie->usage)) + /* the cookie refcount shouldn't be reduced to 0 yet */ + BUG(); + } + + spin_unlock(&cookie->lock); + + if (cookie->parent) { + ASSERTCMP(atomic_read(&cookie->parent->usage), >, 0); + ASSERTCMP(atomic_read(&cookie->parent->n_children), >, 0); + atomic_dec(&cookie->parent->n_children); + } + + /* finally dispose of the cookie */ + ASSERTCMP(atomic_read(&cookie->usage), >, 0); + fscache_cookie_put(cookie); + + _leave(""); +} +EXPORT_SYMBOL(__fscache_relinquish_cookie); + +/* + * destroy a cookie + */ +void __fscache_cookie_put(struct fscache_cookie *cookie) +{ + struct fscache_cookie *parent; + + _enter("%p", cookie); + + for (;;) { + _debug("FREE COOKIE %p", cookie); + parent = cookie->parent; + BUG_ON(!hlist_empty(&cookie->backing_objects)); + kmem_cache_free(fscache_cookie_jar, cookie); + + if (!parent) + break; + + cookie = parent; + BUG_ON(atomic_read(&cookie->usage) <= 0); + if (!atomic_dec_and_test(&cookie->usage)) + break; + } + + _leave(""); +} diff --git a/fs/fscache/fsdef.c b/fs/fscache/fsdef.c new file mode 100644 index 000000000000..f5b4baee7352 --- /dev/null +++ b/fs/fscache/fsdef.c @@ -0,0 +1,144 @@ +/* Filesystem index definition + * + * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define FSCACHE_DEBUG_LEVEL CACHE +#include <linux/module.h> +#include "internal.h" + +static uint16_t fscache_fsdef_netfs_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t bufmax); + +static uint16_t fscache_fsdef_netfs_get_aux(const void *cookie_netfs_data, + void *buffer, uint16_t bufmax); + +static +enum fscache_checkaux fscache_fsdef_netfs_check_aux(void *cookie_netfs_data, + const void *data, + uint16_t datalen); + +/* + * The root index is owned by FS-Cache itself. + * + * When a netfs requests caching facilities, FS-Cache will, if one doesn't + * already exist, create an entry in the root index with the key being the name + * of the netfs ("AFS" for example), and the auxiliary data holding the index + * structure version supplied by the netfs: + * + * FSDEF + * | + * +-----------+ + * | | + * NFS AFS + * [v=1] [v=1] + * + * If an entry with the appropriate name does already exist, the version is + * compared. If the version is different, the entire subtree from that entry + * will be discarded and a new entry created. + * + * The new entry will be an index, and a cookie referring to it will be passed + * to the netfs. This is then the root handle by which the netfs accesses the + * cache. It can create whatever objects it likes in that index, including + * further indices. + */ +static struct fscache_cookie_def fscache_fsdef_index_def = { + .name = ".FS-Cache", + .type = FSCACHE_COOKIE_TYPE_INDEX, +}; + +struct fscache_cookie fscache_fsdef_index = { + .usage = ATOMIC_INIT(1), + .lock = __SPIN_LOCK_UNLOCKED(fscache_fsdef_index.lock), + .backing_objects = HLIST_HEAD_INIT, + .def = &fscache_fsdef_index_def, +}; +EXPORT_SYMBOL(fscache_fsdef_index); + +/* + * Definition of an entry in the root index. Each entry is an index, keyed to + * a specific netfs and only applicable to a particular version of the index + * structure used by that netfs. + */ +struct fscache_cookie_def fscache_fsdef_netfs_def = { + .name = "FSDEF.netfs", + .type = FSCACHE_COOKIE_TYPE_INDEX, + .get_key = fscache_fsdef_netfs_get_key, + .get_aux = fscache_fsdef_netfs_get_aux, + .check_aux = fscache_fsdef_netfs_check_aux, +}; + +/* + * get the key data for an FSDEF index record - this is the name of the netfs + * for which this entry is created + */ +static uint16_t fscache_fsdef_netfs_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t bufmax) +{ + const struct fscache_netfs *netfs = cookie_netfs_data; + unsigned klen; + + _enter("{%s.%u},", netfs->name, netfs->version); + + klen = strlen(netfs->name); + if (klen > bufmax) + return 0; + + memcpy(buffer, netfs->name, klen); + return klen; +} + +/* + * get the auxiliary data for an FSDEF index record - this is the index + * structure version number of the netfs for which this version is created + */ +static uint16_t fscache_fsdef_netfs_get_aux(const void *cookie_netfs_data, + void *buffer, uint16_t bufmax) +{ + const struct fscache_netfs *netfs = cookie_netfs_data; + unsigned dlen; + + _enter("{%s.%u},", netfs->name, netfs->version); + + dlen = sizeof(uint32_t); + if (dlen > bufmax) + return 0; + + memcpy(buffer, &netfs->version, dlen); + return dlen; +} + +/* + * check that the index structure version number stored in the auxiliary data + * matches the one the netfs gave us + */ +static enum fscache_checkaux fscache_fsdef_netfs_check_aux( + void *cookie_netfs_data, + const void *data, + uint16_t datalen) +{ + struct fscache_netfs *netfs = cookie_netfs_data; + uint32_t version; + + _enter("{%s},,%hu", netfs->name, datalen); + + if (datalen != sizeof(version)) { + _leave(" = OBSOLETE [dl=%d v=%zu]", datalen, sizeof(version)); + return FSCACHE_CHECKAUX_OBSOLETE; + } + + memcpy(&version, data, sizeof(version)); + if (version != netfs->version) { + _leave(" = OBSOLETE [ver=%x net=%x]", version, netfs->version); + return FSCACHE_CHECKAUX_OBSOLETE; + } + + _leave(" = OKAY"); + return FSCACHE_CHECKAUX_OKAY; +} diff --git a/fs/fscache/histogram.c b/fs/fscache/histogram.c new file mode 100644 index 000000000000..bad496748a59 --- /dev/null +++ b/fs/fscache/histogram.c @@ -0,0 +1,109 @@ +/* FS-Cache latency histogram + * + * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#define FSCACHE_DEBUG_LEVEL THREAD +#include <linux/module.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include "internal.h" + +atomic_t fscache_obj_instantiate_histogram[HZ]; +atomic_t fscache_objs_histogram[HZ]; +atomic_t fscache_ops_histogram[HZ]; +atomic_t fscache_retrieval_delay_histogram[HZ]; +atomic_t fscache_retrieval_histogram[HZ]; + +/* + * display the time-taken histogram + */ +static int fscache_histogram_show(struct seq_file *m, void *v) +{ + unsigned long index; + unsigned n[5], t; + + switch ((unsigned long) v) { + case 1: + seq_puts(m, "JIFS SECS OBJ INST OP RUNS OBJ RUNS " + " RETRV DLY RETRIEVLS\n"); + return 0; + case 2: + seq_puts(m, "===== ===== ========= ========= =========" + " ========= =========\n"); + return 0; + default: + index = (unsigned long) v - 3; + n[0] = atomic_read(&fscache_obj_instantiate_histogram[index]); + n[1] = atomic_read(&fscache_ops_histogram[index]); + n[2] = atomic_read(&fscache_objs_histogram[index]); + n[3] = atomic_read(&fscache_retrieval_delay_histogram[index]); + n[4] = atomic_read(&fscache_retrieval_histogram[index]); + if (!(n[0] | n[1] | n[2] | n[3] | n[4])) + return 0; + + t = (index * 1000) / HZ; + + seq_printf(m, "%4lu 0.%03u %9u %9u %9u %9u %9u\n", + index, t, n[0], n[1], n[2], n[3], n[4]); + return 0; + } +} + +/* + * set up the iterator to start reading from the first line + */ +static void *fscache_histogram_start(struct seq_file *m, loff_t *_pos) +{ + if ((unsigned long long)*_pos >= HZ + 2) + return NULL; + if (*_pos == 0) + *_pos = 1; + return (void *)(unsigned long) *_pos; +} + +/* + * move to the next line + */ +static void *fscache_histogram_next(struct seq_file *m, void *v, loff_t *pos) +{ + (*pos)++; + return (unsigned long long)*pos > HZ + 2 ? + NULL : (void *)(unsigned long) *pos; +} + +/* + * clean up after reading + */ +static void fscache_histogram_stop(struct seq_file *m, void *v) +{ +} + +static const struct seq_operations fscache_histogram_ops = { + .start = fscache_histogram_start, + .stop = fscache_histogram_stop, + .next = fscache_histogram_next, + .show = fscache_histogram_show, +}; + +/* + * open "/proc/fs/fscache/histogram" to provide latency data + */ +static int fscache_histogram_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &fscache_histogram_ops); +} + +const struct file_operations fscache_histogram_fops = { + .owner = THIS_MODULE, + .open = fscache_histogram_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h new file mode 100644 index 000000000000..e0cbd16f6dc9 --- /dev/null +++ b/fs/fscache/internal.h @@ -0,0 +1,380 @@ +/* Internal definitions for FS-Cache + * + * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +/* + * Lock order, in the order in which multiple locks should be obtained: + * - fscache_addremove_sem + * - cookie->lock + * - cookie->parent->lock + * - cache->object_list_lock + * - object->lock + * - object->parent->lock + * - fscache_thread_lock + * + */ + +#include <linux/fscache-cache.h> +#include <linux/sched.h> + +#define FSCACHE_MIN_THREADS 4 +#define FSCACHE_MAX_THREADS 32 + +/* + * fsc-cache.c + */ +extern struct list_head fscache_cache_list; +extern struct rw_semaphore fscache_addremove_sem; + +extern struct fscache_cache *fscache_select_cache_for_object( + struct fscache_cookie *); + +/* + * fsc-cookie.c + */ +extern struct kmem_cache *fscache_cookie_jar; + +extern void fscache_cookie_init_once(void *); +extern void __fscache_cookie_put(struct fscache_cookie *); + +/* + * fsc-fsdef.c + */ +extern struct fscache_cookie fscache_fsdef_index; +extern struct fscache_cookie_def fscache_fsdef_netfs_def; + +/* + * fsc-histogram.c + */ +#ifdef CONFIG_FSCACHE_HISTOGRAM +extern atomic_t fscache_obj_instantiate_histogram[HZ]; +extern atomic_t fscache_objs_histogram[HZ]; +extern atomic_t fscache_ops_histogram[HZ]; +extern atomic_t fscache_retrieval_delay_histogram[HZ]; +extern atomic_t fscache_retrieval_histogram[HZ]; + +static inline void fscache_hist(atomic_t histogram[], unsigned long start_jif) +{ + unsigned long jif = jiffies - start_jif; + if (jif >= HZ) + jif = HZ - 1; + atomic_inc(&histogram[jif]); +} + +extern const struct file_operations fscache_histogram_fops; + +#else +#define fscache_hist(hist, start_jif) do {} while (0) +#endif + +/* + * fsc-main.c + */ +extern unsigned fscache_defer_lookup; +extern unsigned fscache_defer_create; +extern unsigned fscache_debug; +extern struct kobject *fscache_root; + +extern int fscache_wait_bit(void *); +extern int fscache_wait_bit_interruptible(void *); + +/* + * fsc-object.c + */ +extern void fscache_withdrawing_object(struct fscache_cache *, + struct fscache_object *); +extern void fscache_enqueue_object(struct fscache_object *); + +/* + * fsc-operation.c + */ +extern int fscache_submit_exclusive_op(struct fscache_object *, + struct fscache_operation *); +extern int fscache_submit_op(struct fscache_object *, + struct fscache_operation *); +extern void fscache_abort_object(struct fscache_object *); +extern void fscache_start_operations(struct fscache_object *); +extern void fscache_operation_gc(struct work_struct *); + +/* + * fsc-proc.c + */ +#ifdef CONFIG_PROC_FS +extern int __init fscache_proc_init(void); +extern void fscache_proc_cleanup(void); +#else +#define fscache_proc_init() (0) +#define fscache_proc_cleanup() do {} while (0) +#endif + +/* + * fsc-stats.c + */ +#ifdef CONFIG_FSCACHE_STATS +extern atomic_t fscache_n_ops_processed[FSCACHE_MAX_THREADS]; +extern atomic_t fscache_n_objs_processed[FSCACHE_MAX_THREADS]; + +extern atomic_t fscache_n_op_pend; +extern atomic_t fscache_n_op_run; +extern atomic_t fscache_n_op_enqueue; +extern atomic_t fscache_n_op_deferred_release; +extern atomic_t fscache_n_op_release; +extern atomic_t fscache_n_op_gc; + +extern atomic_t fscache_n_attr_changed; +extern atomic_t fscache_n_attr_changed_ok; +extern atomic_t fscache_n_attr_changed_nobufs; +extern atomic_t fscache_n_attr_changed_nomem; +extern atomic_t fscache_n_attr_changed_calls; + +extern atomic_t fscache_n_allocs; +extern atomic_t fscache_n_allocs_ok; +extern atomic_t fscache_n_allocs_wait; +extern atomic_t fscache_n_allocs_nobufs; +extern atomic_t fscache_n_alloc_ops; +extern atomic_t fscache_n_alloc_op_waits; + +extern atomic_t fscache_n_retrievals; +extern atomic_t fscache_n_retrievals_ok; +extern atomic_t fscache_n_retrievals_wait; +extern atomic_t fscache_n_retrievals_nodata; +extern atomic_t fscache_n_retrievals_nobufs; +extern atomic_t fscache_n_retrievals_intr; +extern atomic_t fscache_n_retrievals_nomem; +extern atomic_t fscache_n_retrieval_ops; +extern atomic_t fscache_n_retrieval_op_waits; + +extern atomic_t fscache_n_stores; +extern atomic_t fscache_n_stores_ok; +extern atomic_t fscache_n_stores_again; +extern atomic_t fscache_n_stores_nobufs; +extern atomic_t fscache_n_stores_oom; +extern atomic_t fscache_n_store_ops; +extern atomic_t fscache_n_store_calls; + +extern atomic_t fscache_n_marks; +extern atomic_t fscache_n_uncaches; + +extern atomic_t fscache_n_acquires; +extern atomic_t fscache_n_acquires_null; +extern atomic_t fscache_n_acquires_no_cache; +extern atomic_t fscache_n_acquires_ok; +extern atomic_t fscache_n_acquires_nobufs; +extern atomic_t fscache_n_acquires_oom; + +extern atomic_t fscache_n_updates; +extern atomic_t fscache_n_updates_null; +extern atomic_t fscache_n_updates_run; + +extern atomic_t fscache_n_relinquishes; +extern atomic_t fscache_n_relinquishes_null; +extern atomic_t fscache_n_relinquishes_waitcrt; + +extern atomic_t fscache_n_cookie_index; +extern atomic_t fscache_n_cookie_data; +extern atomic_t fscache_n_cookie_special; + +extern atomic_t fscache_n_object_alloc; +extern atomic_t fscache_n_object_no_alloc; +extern atomic_t fscache_n_object_lookups; +extern atomic_t fscache_n_object_lookups_negative; +extern atomic_t fscache_n_object_lookups_positive; +extern atomic_t fscache_n_object_created; +extern atomic_t fscache_n_object_avail; +extern atomic_t fscache_n_object_dead; + +extern atomic_t fscache_n_checkaux_none; +extern atomic_t fscache_n_checkaux_okay; +extern atomic_t fscache_n_checkaux_update; +extern atomic_t fscache_n_checkaux_obsolete; + +static inline void fscache_stat(atomic_t *stat) +{ + atomic_inc(stat); +} + +extern const struct file_operations fscache_stats_fops; +#else + +#define fscache_stat(stat) do {} while (0) +#endif + +/* + * raise an event on an object + * - if the event is not masked for that object, then the object is + * queued for attention by the thread pool. + */ +static inline void fscache_raise_event(struct fscache_object *object, + unsigned event) +{ + if (!test_and_set_bit(event, &object->events) && + test_bit(event, &object->event_mask)) + fscache_enqueue_object(object); +} + +/* + * drop a reference to a cookie + */ +static inline void fscache_cookie_put(struct fscache_cookie *cookie) +{ + BUG_ON(atomic_read(&cookie->usage) <= 0); + if (atomic_dec_and_test(&cookie->usage)) + __fscache_cookie_put(cookie); +} + +/* + * get an extra reference to a netfs retrieval context + */ +static inline +void *fscache_get_context(struct fscache_cookie *cookie, void *context) +{ + if (cookie->def->get_context) + cookie->def->get_context(cookie->netfs_data, context); + return context; +} + +/* + * release a reference to a netfs retrieval context + */ +static inline +void fscache_put_context(struct fscache_cookie *cookie, void *context) +{ + if (cookie->def->put_context) + cookie->def->put_context(cookie->netfs_data, context); +} + +/*****************************************************************************/ +/* + * debug tracing + */ +#define dbgprintk(FMT, ...) \ + printk(KERN_DEBUG "[%-6.6s] "FMT"\n", current->comm, ##__VA_ARGS__) + +/* make sure we maintain the format strings, even when debugging is disabled */ +static inline __attribute__((format(printf, 1, 2))) +void _dbprintk(const char *fmt, ...) +{ +} + +#define kenter(FMT, ...) dbgprintk("==> %s("FMT")", __func__, ##__VA_ARGS__) +#define kleave(FMT, ...) dbgprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__) +#define kdebug(FMT, ...) dbgprintk(FMT, ##__VA_ARGS__) + +#define kjournal(FMT, ...) _dbprintk(FMT, ##__VA_ARGS__) + +#ifdef __KDEBUG +#define _enter(FMT, ...) kenter(FMT, ##__VA_ARGS__) +#define _leave(FMT, ...) kleave(FMT, ##__VA_ARGS__) +#define _debug(FMT, ...) kdebug(FMT, ##__VA_ARGS__) + +#elif defined(CONFIG_FSCACHE_DEBUG) +#define _enter(FMT, ...) \ +do { \ + if (__do_kdebug(ENTER)) \ + kenter(FMT, ##__VA_ARGS__); \ +} while (0) + +#define _leave(FMT, ...) \ +do { \ + if (__do_kdebug(LEAVE)) \ + kleave(FMT, ##__VA_ARGS__); \ +} while (0) + +#define _debug(FMT, ...) \ +do { \ + if (__do_kdebug(DEBUG)) \ + kdebug(FMT, ##__VA_ARGS__); \ +} while (0) + +#else +#define _enter(FMT, ...) _dbprintk("==> %s("FMT")", __func__, ##__VA_ARGS__) +#define _leave(FMT, ...) _dbprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__) +#define _debug(FMT, ...) _dbprintk(FMT, ##__VA_ARGS__) +#endif + +/* + * determine whether a particular optional debugging point should be logged + * - we need to go through three steps to persuade cpp to correctly join the + * shorthand in FSCACHE_DEBUG_LEVEL with its prefix + */ +#define ____do_kdebug(LEVEL, POINT) \ + unlikely((fscache_debug & \ + (FSCACHE_POINT_##POINT << (FSCACHE_DEBUG_ ## LEVEL * 3)))) +#define ___do_kdebug(LEVEL, POINT) \ + ____do_kdebug(LEVEL, POINT) +#define __do_kdebug(POINT) \ + ___do_kdebug(FSCACHE_DEBUG_LEVEL, POINT) + +#define FSCACHE_DEBUG_CACHE 0 +#define FSCACHE_DEBUG_COOKIE 1 +#define FSCACHE_DEBUG_PAGE 2 +#define FSCACHE_DEBUG_OPERATION 3 + +#define FSCACHE_POINT_ENTER 1 +#define FSCACHE_POINT_LEAVE 2 +#define FSCACHE_POINT_DEBUG 4 + +#ifndef FSCACHE_DEBUG_LEVEL +#define FSCACHE_DEBUG_LEVEL CACHE +#endif + +/* + * assertions + */ +#if 1 /* defined(__KDEBUGALL) */ + +#define ASSERT(X) \ +do { \ + if (unlikely(!(X))) { \ + printk(KERN_ERR "\n"); \ + printk(KERN_ERR "FS-Cache: Assertion failed\n"); \ + BUG(); \ + } \ +} while (0) + +#define ASSERTCMP(X, OP, Y) \ +do { \ + if (unlikely(!((X) OP (Y)))) { \ + printk(KERN_ERR "\n"); \ + printk(KERN_ERR "FS-Cache: Assertion failed\n"); \ + printk(KERN_ERR "%lx " #OP " %lx is false\n", \ + (unsigned long)(X), (unsigned long)(Y)); \ + BUG(); \ + } \ +} while (0) + +#define ASSERTIF(C, X) \ +do { \ + if (unlikely((C) && !(X))) { \ + printk(KERN_ERR "\n"); \ + printk(KERN_ERR "FS-Cache: Assertion failed\n"); \ + BUG(); \ + } \ +} while (0) + +#define ASSERTIFCMP(C, X, OP, Y) \ +do { \ + if (unlikely((C) && !((X) OP (Y)))) { \ + printk(KERN_ERR "\n"); \ + printk(KERN_ERR "FS-Cache: Assertion failed\n"); \ + printk(KERN_ERR "%lx " #OP " %lx is false\n", \ + (unsigned long)(X), (unsigned long)(Y)); \ + BUG(); \ + } \ +} while (0) + +#else + +#define ASSERT(X) do {} while (0) +#define ASSERTCMP(X, OP, Y) do {} while (0) +#define ASSERTIF(C, X) do {} while (0) +#define ASSERTIFCMP(C, X, OP, Y) do {} while (0) + +#endif /* assert or not */ diff --git a/fs/fscache/main.c b/fs/fscache/main.c new file mode 100644 index 000000000000..4de41b597499 --- /dev/null +++ b/fs/fscache/main.c @@ -0,0 +1,124 @@ +/* General filesystem local caching manager + * + * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define FSCACHE_DEBUG_LEVEL CACHE +#include <linux/module.h> +#include <linux/init.h> +#include <linux/sched.h> +#include <linux/completion.h> +#include <linux/slab.h> +#include "internal.h" + +MODULE_DESCRIPTION("FS Cache Manager"); +MODULE_AUTHOR("Red Hat, Inc."); +MODULE_LICENSE("GPL"); + +unsigned fscache_defer_lookup = 1; +module_param_named(defer_lookup, fscache_defer_lookup, uint, + S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(fscache_defer_lookup, + "Defer cookie lookup to background thread"); + +unsigned fscache_defer_create = 1; +module_param_named(defer_create, fscache_defer_create, uint, + S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(fscache_defer_create, + "Defer cookie creation to background thread"); + +unsigned fscache_debug; +module_param_named(debug, fscache_debug, uint, + S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(fscache_debug, + "FS-Cache debugging mask"); + +struct kobject *fscache_root; + +/* + * initialise the fs caching module + */ +static int __init fscache_init(void) +{ + int ret; + + ret = slow_work_register_user(); + if (ret < 0) + goto error_slow_work; + + ret = fscache_proc_init(); + if (ret < 0) + goto error_proc; + + fscache_cookie_jar = kmem_cache_create("fscache_cookie_jar", + sizeof(struct fscache_cookie), + 0, + 0, + fscache_cookie_init_once); + if (!fscache_cookie_jar) { + printk(KERN_NOTICE + "FS-Cache: Failed to allocate a cookie jar\n"); + ret = -ENOMEM; + goto error_cookie_jar; + } + + fscache_root = kobject_create_and_add("fscache", kernel_kobj); + if (!fscache_root) + goto error_kobj; + + printk(KERN_NOTICE "FS-Cache: Loaded\n"); + return 0; + +error_kobj: + kmem_cache_destroy(fscache_cookie_jar); +error_cookie_jar: + fscache_proc_cleanup(); +error_proc: + slow_work_unregister_user(); +error_slow_work: + return ret; +} + +fs_initcall(fscache_init); + +/* + * clean up on module removal + */ +static void __exit fscache_exit(void) +{ + _enter(""); + + kobject_put(fscache_root); + kmem_cache_destroy(fscache_cookie_jar); + fscache_proc_cleanup(); + slow_work_unregister_user(); + printk(KERN_NOTICE "FS-Cache: Unloaded\n"); +} + +module_exit(fscache_exit); + +/* + * wait_on_bit() sleep function for uninterruptible waiting + */ +int fscache_wait_bit(void *flags) +{ + schedule(); + return 0; +} +EXPORT_SYMBOL(fscache_wait_bit); + +/* + * wait_on_bit() sleep function for interruptible waiting + */ +int fscache_wait_bit_interruptible(void *flags) +{ + schedule(); + return signal_pending(current); +} +EXPORT_SYMBOL(fscache_wait_bit_interruptible); diff --git a/fs/fscache/netfs.c b/fs/fscache/netfs.c new file mode 100644 index 000000000000..e028b8eb1c40 --- /dev/null +++ b/fs/fscache/netfs.c @@ -0,0 +1,103 @@ +/* FS-Cache netfs (client) registration + * + * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#define FSCACHE_DEBUG_LEVEL COOKIE +#include <linux/module.h> +#include <linux/slab.h> +#include "internal.h" + +static LIST_HEAD(fscache_netfs_list); + +/* + * register a network filesystem for caching + */ +int __fscache_register_netfs(struct fscache_netfs *netfs) +{ + struct fscache_netfs *ptr; + int ret; + + _enter("{%s}", netfs->name); + + INIT_LIST_HEAD(&netfs->link); + + /* allocate a cookie for the primary index */ + netfs->primary_index = + kmem_cache_zalloc(fscache_cookie_jar, GFP_KERNEL); + + if (!netfs->primary_index) { + _leave(" = -ENOMEM"); + return -ENOMEM; + } + + /* initialise the primary index cookie */ + atomic_set(&netfs->primary_index->usage, 1); + atomic_set(&netfs->primary_index->n_children, 0); + + netfs->primary_index->def = &fscache_fsdef_netfs_def; + netfs->primary_index->parent = &fscache_fsdef_index; + netfs->primary_index->netfs_data = netfs; + + atomic_inc(&netfs->primary_index->parent->usage); + atomic_inc(&netfs->primary_index->parent->n_children); + + spin_lock_init(&netfs->primary_index->lock); + INIT_HLIST_HEAD(&netfs->primary_index->backing_objects); + + /* check the netfs type is not already present */ + down_write(&fscache_addremove_sem); + + ret = -EEXIST; + list_for_each_entry(ptr, &fscache_netfs_list, link) { + if (strcmp(ptr->name, netfs->name) == 0) + goto already_registered; + } + + list_add(&netfs->link, &fscache_netfs_list); + ret = 0; + + printk(KERN_NOTICE "FS-Cache: Netfs '%s' registered for caching\n", + netfs->name); + +already_registered: + up_write(&fscache_addremove_sem); + + if (ret < 0) { + netfs->primary_index->parent = NULL; + __fscache_cookie_put(netfs->primary_index); + netfs->primary_index = NULL; + } + + _leave(" = %d", ret); + return ret; +} +EXPORT_SYMBOL(__fscache_register_netfs); + +/* + * unregister a network filesystem from the cache + * - all cookies must have been released first + */ +void __fscache_unregister_netfs(struct fscache_netfs *netfs) +{ + _enter("{%s.%u}", netfs->name, netfs->version); + + down_write(&fscache_addremove_sem); + + list_del(&netfs->link); + fscache_relinquish_cookie(netfs->primary_index, 0); + + up_write(&fscache_addremove_sem); + + printk(KERN_NOTICE "FS-Cache: Netfs '%s' unregistered from caching\n", + netfs->name); + + _leave(""); +} +EXPORT_SYMBOL(__fscache_unregister_netfs); diff --git a/fs/fscache/object.c b/fs/fscache/object.c new file mode 100644 index 000000000000..392a41b1b79d --- /dev/null +++ b/fs/fscache/object.c @@ -0,0 +1,810 @@ +/* FS-Cache object state machine handler + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * See Documentation/filesystems/caching/object.txt for a description of the + * object state machine and the in-kernel representations. + */ + +#define FSCACHE_DEBUG_LEVEL COOKIE +#include <linux/module.h> +#include "internal.h" + +const char *fscache_object_states[] = { + [FSCACHE_OBJECT_INIT] = "OBJECT_INIT", + [FSCACHE_OBJECT_LOOKING_UP] = "OBJECT_LOOKING_UP", + [FSCACHE_OBJECT_CREATING] = "OBJECT_CREATING", + [FSCACHE_OBJECT_AVAILABLE] = "OBJECT_AVAILABLE", + [FSCACHE_OBJECT_ACTIVE] = "OBJECT_ACTIVE", + [FSCACHE_OBJECT_UPDATING] = "OBJECT_UPDATING", + [FSCACHE_OBJECT_DYING] = "OBJECT_DYING", + [FSCACHE_OBJECT_LC_DYING] = "OBJECT_LC_DYING", + [FSCACHE_OBJECT_ABORT_INIT] = "OBJECT_ABORT_INIT", + [FSCACHE_OBJECT_RELEASING] = "OBJECT_RELEASING", + [FSCACHE_OBJECT_RECYCLING] = "OBJECT_RECYCLING", + [FSCACHE_OBJECT_WITHDRAWING] = "OBJECT_WITHDRAWING", + [FSCACHE_OBJECT_DEAD] = "OBJECT_DEAD", +}; +EXPORT_SYMBOL(fscache_object_states); + +static void fscache_object_slow_work_put_ref(struct slow_work *); +static int fscache_object_slow_work_get_ref(struct slow_work *); +static void fscache_object_slow_work_execute(struct slow_work *); +static void fscache_initialise_object(struct fscache_object *); +static void fscache_lookup_object(struct fscache_object *); +static void fscache_object_available(struct fscache_object *); +static void fscache_release_object(struct fscache_object *); +static void fscache_withdraw_object(struct fscache_object *); +static void fscache_enqueue_dependents(struct fscache_object *); +static void fscache_dequeue_object(struct fscache_object *); + +const struct slow_work_ops fscache_object_slow_work_ops = { + .get_ref = fscache_object_slow_work_get_ref, + .put_ref = fscache_object_slow_work_put_ref, + .execute = fscache_object_slow_work_execute, +}; +EXPORT_SYMBOL(fscache_object_slow_work_ops); + +/* + * we need to notify the parent when an op completes that we had outstanding + * upon it + */ +static inline void fscache_done_parent_op(struct fscache_object *object) +{ + struct fscache_object *parent = object->parent; + + _enter("OBJ%x {OBJ%x,%x}", + object->debug_id, parent->debug_id, parent->n_ops); + + spin_lock_nested(&parent->lock, 1); + parent->n_ops--; + parent->n_obj_ops--; + if (parent->n_ops == 0) + fscache_raise_event(parent, FSCACHE_OBJECT_EV_CLEARED); + spin_unlock(&parent->lock); +} + +/* + * process events that have been sent to an object's state machine + * - initiates parent lookup + * - does object lookup + * - does object creation + * - does object recycling and retirement + * - does object withdrawal + */ +static void fscache_object_state_machine(struct fscache_object *object) +{ + enum fscache_object_state new_state; + + ASSERT(object != NULL); + + _enter("{OBJ%x,%s,%lx}", + object->debug_id, fscache_object_states[object->state], + object->events); + + switch (object->state) { + /* wait for the parent object to become ready */ + case FSCACHE_OBJECT_INIT: + object->event_mask = + ULONG_MAX & ~(1 << FSCACHE_OBJECT_EV_CLEARED); + fscache_initialise_object(object); + goto done; + + /* look up the object metadata on disk */ + case FSCACHE_OBJECT_LOOKING_UP: + fscache_lookup_object(object); + goto lookup_transit; + + /* create the object metadata on disk */ + case FSCACHE_OBJECT_CREATING: + fscache_lookup_object(object); + goto lookup_transit; + + /* handle an object becoming available; start pending + * operations and queue dependent operations for processing */ + case FSCACHE_OBJECT_AVAILABLE: + fscache_object_available(object); + goto active_transit; + + /* normal running state */ + case FSCACHE_OBJECT_ACTIVE: + goto active_transit; + + /* update the object metadata on disk */ + case FSCACHE_OBJECT_UPDATING: + clear_bit(FSCACHE_OBJECT_EV_UPDATE, &object->events); + fscache_stat(&fscache_n_updates_run); + object->cache->ops->update_object(object); + goto active_transit; + + /* handle an object dying during lookup or creation */ + case FSCACHE_OBJECT_LC_DYING: + object->event_mask &= ~(1 << FSCACHE_OBJECT_EV_UPDATE); + object->cache->ops->lookup_complete(object); + + spin_lock(&object->lock); + object->state = FSCACHE_OBJECT_DYING; + if (test_and_clear_bit(FSCACHE_COOKIE_CREATING, + &object->cookie->flags)) + wake_up_bit(&object->cookie->flags, + FSCACHE_COOKIE_CREATING); + spin_unlock(&object->lock); + + fscache_done_parent_op(object); + + /* wait for completion of all active operations on this object + * and the death of all child objects of this object */ + case FSCACHE_OBJECT_DYING: + dying: + clear_bit(FSCACHE_OBJECT_EV_CLEARED, &object->events); + spin_lock(&object->lock); + _debug("dying OBJ%x {%d,%d}", + object->debug_id, object->n_ops, object->n_children); + if (object->n_ops == 0 && object->n_children == 0) { + object->event_mask &= + ~(1 << FSCACHE_OBJECT_EV_CLEARED); + object->event_mask |= + (1 << FSCACHE_OBJECT_EV_WITHDRAW) | + (1 << FSCACHE_OBJECT_EV_RETIRE) | + (1 << FSCACHE_OBJECT_EV_RELEASE) | + (1 << FSCACHE_OBJECT_EV_ERROR); + } else { + object->event_mask &= + ~((1 << FSCACHE_OBJECT_EV_WITHDRAW) | + (1 << FSCACHE_OBJECT_EV_RETIRE) | + (1 << FSCACHE_OBJECT_EV_RELEASE) | + (1 << FSCACHE_OBJECT_EV_ERROR)); + object->event_mask |= + 1 << FSCACHE_OBJECT_EV_CLEARED; + } + spin_unlock(&object->lock); + fscache_enqueue_dependents(object); + goto terminal_transit; + + /* handle an abort during initialisation */ + case FSCACHE_OBJECT_ABORT_INIT: + _debug("handle abort init %lx", object->events); + object->event_mask &= ~(1 << FSCACHE_OBJECT_EV_UPDATE); + + spin_lock(&object->lock); + fscache_dequeue_object(object); + + object->state = FSCACHE_OBJECT_DYING; + if (test_and_clear_bit(FSCACHE_COOKIE_CREATING, + &object->cookie->flags)) + wake_up_bit(&object->cookie->flags, + FSCACHE_COOKIE_CREATING); + spin_unlock(&object->lock); + goto dying; + + /* handle the netfs releasing an object and possibly marking it + * obsolete too */ + case FSCACHE_OBJECT_RELEASING: + case FSCACHE_OBJECT_RECYCLING: + object->event_mask &= + ~((1 << FSCACHE_OBJECT_EV_WITHDRAW) | + (1 << FSCACHE_OBJECT_EV_RETIRE) | + (1 << FSCACHE_OBJECT_EV_RELEASE) | + (1 << FSCACHE_OBJECT_EV_ERROR)); + fscache_release_object(object); + spin_lock(&object->lock); + object->state = FSCACHE_OBJECT_DEAD; + spin_unlock(&object->lock); + fscache_stat(&fscache_n_object_dead); + goto terminal_transit; + + /* handle the parent cache of this object being withdrawn from + * active service */ + case FSCACHE_OBJECT_WITHDRAWING: + object->event_mask &= + ~((1 << FSCACHE_OBJECT_EV_WITHDRAW) | + (1 << FSCACHE_OBJECT_EV_RETIRE) | + (1 << FSCACHE_OBJECT_EV_RELEASE) | + (1 << FSCACHE_OBJECT_EV_ERROR)); + fscache_withdraw_object(object); + spin_lock(&object->lock); + object->state = FSCACHE_OBJECT_DEAD; + spin_unlock(&object->lock); + fscache_stat(&fscache_n_object_dead); + goto terminal_transit; + + /* complain about the object being woken up once it is + * deceased */ + case FSCACHE_OBJECT_DEAD: + printk(KERN_ERR "FS-Cache:" + " Unexpected event in dead state %lx\n", + object->events & object->event_mask); + BUG(); + + default: + printk(KERN_ERR "FS-Cache: Unknown object state %u\n", + object->state); + BUG(); + } + + /* determine the transition from a lookup state */ +lookup_transit: + switch (fls(object->events & object->event_mask) - 1) { + case FSCACHE_OBJECT_EV_WITHDRAW: + case FSCACHE_OBJECT_EV_RETIRE: + case FSCACHE_OBJECT_EV_RELEASE: + case FSCACHE_OBJECT_EV_ERROR: + new_state = FSCACHE_OBJECT_LC_DYING; + goto change_state; + case FSCACHE_OBJECT_EV_REQUEUE: + goto done; + case -1: + goto done; /* sleep until event */ + default: + goto unsupported_event; + } + + /* determine the transition from an active state */ +active_transit: + switch (fls(object->events & object->event_mask) - 1) { + case FSCACHE_OBJECT_EV_WITHDRAW: + case FSCACHE_OBJECT_EV_RETIRE: + case FSCACHE_OBJECT_EV_RELEASE: + case FSCACHE_OBJECT_EV_ERROR: + new_state = FSCACHE_OBJECT_DYING; + goto change_state; + case FSCACHE_OBJECT_EV_UPDATE: + new_state = FSCACHE_OBJECT_UPDATING; + goto change_state; + case -1: + new_state = FSCACHE_OBJECT_ACTIVE; + goto change_state; /* sleep until event */ + default: + goto unsupported_event; + } + + /* determine the transition from a terminal state */ +terminal_transit: + switch (fls(object->events & object->event_mask) - 1) { + case FSCACHE_OBJECT_EV_WITHDRAW: + new_state = FSCACHE_OBJECT_WITHDRAWING; + goto change_state; + case FSCACHE_OBJECT_EV_RETIRE: + new_state = FSCACHE_OBJECT_RECYCLING; + goto change_state; + case FSCACHE_OBJECT_EV_RELEASE: + new_state = FSCACHE_OBJECT_RELEASING; + goto change_state; + case FSCACHE_OBJECT_EV_ERROR: + new_state = FSCACHE_OBJECT_WITHDRAWING; + goto change_state; + case FSCACHE_OBJECT_EV_CLEARED: + new_state = FSCACHE_OBJECT_DYING; + goto change_state; + case -1: + goto done; /* sleep until event */ + default: + goto unsupported_event; + } + +change_state: + spin_lock(&object->lock); + object->state = new_state; + spin_unlock(&object->lock); + +done: + _leave(" [->%s]", fscache_object_states[object->state]); + return; + +unsupported_event: + printk(KERN_ERR "FS-Cache:" + " Unsupported event %lx [mask %lx] in state %s\n", + object->events, object->event_mask, + fscache_object_states[object->state]); + BUG(); +} + +/* + * execute an object + */ +static void fscache_object_slow_work_execute(struct slow_work *work) +{ + struct fscache_object *object = + container_of(work, struct fscache_object, work); + unsigned long start; + + _enter("{OBJ%x}", object->debug_id); + + clear_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events); + + start = jiffies; + fscache_object_state_machine(object); + fscache_hist(fscache_objs_histogram, start); + if (object->events & object->event_mask) + fscache_enqueue_object(object); +} + +/* + * initialise an object + * - check the specified object's parent to see if we can make use of it + * immediately to do a creation + * - we may need to start the process of creating a parent and we need to wait + * for the parent's lookup and creation to complete if it's not there yet + * - an object's cookie is pinned until we clear FSCACHE_COOKIE_CREATING on the + * leaf-most cookies of the object and all its children + */ +static void fscache_initialise_object(struct fscache_object *object) +{ + struct fscache_object *parent; + + _enter(""); + ASSERT(object->cookie != NULL); + ASSERT(object->cookie->parent != NULL); + ASSERT(list_empty(&object->work.link)); + + if (object->events & ((1 << FSCACHE_OBJECT_EV_ERROR) | + (1 << FSCACHE_OBJECT_EV_RELEASE) | + (1 << FSCACHE_OBJECT_EV_RETIRE) | + (1 << FSCACHE_OBJECT_EV_WITHDRAW))) { + _debug("abort init %lx", object->events); + spin_lock(&object->lock); + object->state = FSCACHE_OBJECT_ABORT_INIT; + spin_unlock(&object->lock); + return; + } + + spin_lock(&object->cookie->lock); + spin_lock_nested(&object->cookie->parent->lock, 1); + + parent = object->parent; + if (!parent) { + _debug("no parent"); + set_bit(FSCACHE_OBJECT_EV_WITHDRAW, &object->events); + } else { + spin_lock(&object->lock); + spin_lock_nested(&parent->lock, 1); + _debug("parent %s", fscache_object_states[parent->state]); + + if (parent->state >= FSCACHE_OBJECT_DYING) { + _debug("bad parent"); + set_bit(FSCACHE_OBJECT_EV_WITHDRAW, &object->events); + } else if (parent->state < FSCACHE_OBJECT_AVAILABLE) { + _debug("wait"); + + /* we may get woken up in this state by child objects + * binding on to us, so we need to make sure we don't + * add ourself to the list multiple times */ + if (list_empty(&object->dep_link)) { + object->cache->ops->grab_object(object); + list_add(&object->dep_link, + &parent->dependents); + + /* fscache_acquire_non_index_cookie() uses this + * to wake the chain up */ + if (parent->state == FSCACHE_OBJECT_INIT) + fscache_enqueue_object(parent); + } + } else { + _debug("go"); + parent->n_ops++; + parent->n_obj_ops++; + object->lookup_jif = jiffies; + object->state = FSCACHE_OBJECT_LOOKING_UP; + set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events); + } + + spin_unlock(&parent->lock); + spin_unlock(&object->lock); + } + + spin_unlock(&object->cookie->parent->lock); + spin_unlock(&object->cookie->lock); + _leave(""); +} + +/* + * look an object up in the cache from which it was allocated + * - we hold an "access lock" on the parent object, so the parent object cannot + * be withdrawn by either party till we've finished + * - an object's cookie is pinned until we clear FSCACHE_COOKIE_CREATING on the + * leaf-most cookies of the object and all its children + */ +static void fscache_lookup_object(struct fscache_object *object) +{ + struct fscache_cookie *cookie = object->cookie; + struct fscache_object *parent; + + _enter(""); + + parent = object->parent; + ASSERT(parent != NULL); + ASSERTCMP(parent->n_ops, >, 0); + ASSERTCMP(parent->n_obj_ops, >, 0); + + /* make sure the parent is still available */ + ASSERTCMP(parent->state, >=, FSCACHE_OBJECT_AVAILABLE); + + if (parent->state >= FSCACHE_OBJECT_DYING || + test_bit(FSCACHE_IOERROR, &object->cache->flags)) { + _debug("unavailable"); + set_bit(FSCACHE_OBJECT_EV_WITHDRAW, &object->events); + _leave(""); + return; + } + + _debug("LOOKUP \"%s/%s\" in \"%s\"", + parent->cookie->def->name, cookie->def->name, + object->cache->tag->name); + + fscache_stat(&fscache_n_object_lookups); + object->cache->ops->lookup_object(object); + + if (test_bit(FSCACHE_OBJECT_EV_ERROR, &object->events)) + set_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags); + + _leave(""); +} + +/** + * fscache_object_lookup_negative - Note negative cookie lookup + * @object: Object pointing to cookie to mark + * + * Note negative lookup, permitting those waiting to read data from an already + * existing backing object to continue as there's no data for them to read. + */ +void fscache_object_lookup_negative(struct fscache_object *object) +{ + struct fscache_cookie *cookie = object->cookie; + + _enter("{OBJ%x,%s}", + object->debug_id, fscache_object_states[object->state]); + + spin_lock(&object->lock); + if (object->state == FSCACHE_OBJECT_LOOKING_UP) { + fscache_stat(&fscache_n_object_lookups_negative); + + /* transit here to allow write requests to begin stacking up + * and read requests to begin returning ENODATA */ + object->state = FSCACHE_OBJECT_CREATING; + spin_unlock(&object->lock); + + set_bit(FSCACHE_COOKIE_PENDING_FILL, &cookie->flags); + set_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags); + + _debug("wake up lookup %p", &cookie->flags); + smp_mb__before_clear_bit(); + clear_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags); + smp_mb__after_clear_bit(); + wake_up_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP); + set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events); + } else { + ASSERTCMP(object->state, ==, FSCACHE_OBJECT_CREATING); + spin_unlock(&object->lock); + } + + _leave(""); +} +EXPORT_SYMBOL(fscache_object_lookup_negative); + +/** + * fscache_obtained_object - Note successful object lookup or creation + * @object: Object pointing to cookie to mark + * + * Note successful lookup and/or creation, permitting those waiting to write + * data to a backing object to continue. + * + * Note that after calling this, an object's cookie may be relinquished by the + * netfs, and so must be accessed with object lock held. + */ +void fscache_obtained_object(struct fscache_object *object) +{ + struct fscache_cookie *cookie = object->cookie; + + _enter("{OBJ%x,%s}", + object->debug_id, fscache_object_states[object->state]); + + /* if we were still looking up, then we must have a positive lookup + * result, in which case there may be data available */ + spin_lock(&object->lock); + if (object->state == FSCACHE_OBJECT_LOOKING_UP) { + fscache_stat(&fscache_n_object_lookups_positive); + + clear_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags); + + object->state = FSCACHE_OBJECT_AVAILABLE; + spin_unlock(&object->lock); + + smp_mb__before_clear_bit(); + clear_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags); + smp_mb__after_clear_bit(); + wake_up_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP); + set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events); + } else { + ASSERTCMP(object->state, ==, FSCACHE_OBJECT_CREATING); + fscache_stat(&fscache_n_object_created); + + object->state = FSCACHE_OBJECT_AVAILABLE; + spin_unlock(&object->lock); + set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events); + smp_wmb(); + } + + if (test_and_clear_bit(FSCACHE_COOKIE_CREATING, &cookie->flags)) + wake_up_bit(&cookie->flags, FSCACHE_COOKIE_CREATING); + + _leave(""); +} +EXPORT_SYMBOL(fscache_obtained_object); + +/* + * handle an object that has just become available + */ +static void fscache_object_available(struct fscache_object *object) +{ + _enter("{OBJ%x}", object->debug_id); + + spin_lock(&object->lock); + + if (test_and_clear_bit(FSCACHE_COOKIE_CREATING, &object->cookie->flags)) + wake_up_bit(&object->cookie->flags, FSCACHE_COOKIE_CREATING); + + fscache_done_parent_op(object); + if (object->n_in_progress == 0) { + if (object->n_ops > 0) { + ASSERTCMP(object->n_ops, >=, object->n_obj_ops); + ASSERTIF(object->n_ops > object->n_obj_ops, + !list_empty(&object->pending_ops)); + fscache_start_operations(object); + } else { + ASSERT(list_empty(&object->pending_ops)); + } + } + spin_unlock(&object->lock); + + object->cache->ops->lookup_complete(object); + fscache_enqueue_dependents(object); + + fscache_hist(fscache_obj_instantiate_histogram, object->lookup_jif); + fscache_stat(&fscache_n_object_avail); + + _leave(""); +} + +/* + * drop an object's attachments + */ +static void fscache_drop_object(struct fscache_object *object) +{ + struct fscache_object *parent = object->parent; + struct fscache_cache *cache = object->cache; + + _enter("{OBJ%x,%d}", object->debug_id, object->n_children); + + spin_lock(&cache->object_list_lock); + list_del_init(&object->cache_link); + spin_unlock(&cache->object_list_lock); + + cache->ops->drop_object(object); + + if (parent) { + _debug("release parent OBJ%x {%d}", + parent->debug_id, parent->n_children); + + spin_lock(&parent->lock); + parent->n_children--; + if (parent->n_children == 0) + fscache_raise_event(parent, FSCACHE_OBJECT_EV_CLEARED); + spin_unlock(&parent->lock); + object->parent = NULL; + } + + /* this just shifts the object release to the slow work processor */ + object->cache->ops->put_object(object); + + _leave(""); +} + +/* + * release or recycle an object that the netfs has discarded + */ +static void fscache_release_object(struct fscache_object *object) +{ + _enter(""); + + fscache_drop_object(object); +} + +/* + * withdraw an object from active service + */ +static void fscache_withdraw_object(struct fscache_object *object) +{ + struct fscache_cookie *cookie; + bool detached; + + _enter(""); + + spin_lock(&object->lock); + cookie = object->cookie; + if (cookie) { + /* need to get the cookie lock before the object lock, starting + * from the object pointer */ + atomic_inc(&cookie->usage); + spin_unlock(&object->lock); + + detached = false; + spin_lock(&cookie->lock); + spin_lock(&object->lock); + + if (object->cookie == cookie) { + hlist_del_init(&object->cookie_link); + object->cookie = NULL; + detached = true; + } + spin_unlock(&cookie->lock); + fscache_cookie_put(cookie); + if (detached) + fscache_cookie_put(cookie); + } + + spin_unlock(&object->lock); + + fscache_drop_object(object); +} + +/* + * withdraw an object from active service at the behest of the cache + * - need break the links to a cached object cookie + * - called under two situations: + * (1) recycler decides to reclaim an in-use object + * (2) a cache is unmounted + * - have to take care as the cookie can be being relinquished by the netfs + * simultaneously + * - the object is pinned by the caller holding a refcount on it + */ +void fscache_withdrawing_object(struct fscache_cache *cache, + struct fscache_object *object) +{ + bool enqueue = false; + + _enter(",OBJ%x", object->debug_id); + + spin_lock(&object->lock); + if (object->state < FSCACHE_OBJECT_WITHDRAWING) { + object->state = FSCACHE_OBJECT_WITHDRAWING; + enqueue = true; + } + spin_unlock(&object->lock); + + if (enqueue) + fscache_enqueue_object(object); + + _leave(""); +} + +/* + * allow the slow work item processor to get a ref on an object + */ +static int fscache_object_slow_work_get_ref(struct slow_work *work) +{ + struct fscache_object *object = + container_of(work, struct fscache_object, work); + + return object->cache->ops->grab_object(object) ? 0 : -EAGAIN; +} + +/* + * allow the slow work item processor to discard a ref on a work item + */ +static void fscache_object_slow_work_put_ref(struct slow_work *work) +{ + struct fscache_object *object = + container_of(work, struct fscache_object, work); + + return object->cache->ops->put_object(object); +} + +/* + * enqueue an object for metadata-type processing + */ +void fscache_enqueue_object(struct fscache_object *object) +{ + _enter("{OBJ%x}", object->debug_id); + + slow_work_enqueue(&object->work); +} + +/* + * enqueue the dependents of an object for metadata-type processing + * - the caller must hold the object's lock + * - this may cause an already locked object to wind up being processed again + */ +static void fscache_enqueue_dependents(struct fscache_object *object) +{ + struct fscache_object *dep; + + _enter("{OBJ%x}", object->debug_id); + + if (list_empty(&object->dependents)) + return; + + spin_lock(&object->lock); + + while (!list_empty(&object->dependents)) { + dep = list_entry(object->dependents.next, + struct fscache_object, dep_link); + list_del_init(&dep->dep_link); + + + /* sort onto appropriate lists */ + fscache_enqueue_object(dep); + dep->cache->ops->put_object(dep); + + if (!list_empty(&object->dependents)) + cond_resched_lock(&object->lock); + } + + spin_unlock(&object->lock); +} + +/* + * remove an object from whatever queue it's waiting on + * - the caller must hold object->lock + */ +void fscache_dequeue_object(struct fscache_object *object) +{ + _enter("{OBJ%x}", object->debug_id); + + if (!list_empty(&object->dep_link)) { + spin_lock(&object->parent->lock); + list_del_init(&object->dep_link); + spin_unlock(&object->parent->lock); + } + + _leave(""); +} + +/** + * fscache_check_aux - Ask the netfs whether an object on disk is still valid + * @object: The object to ask about + * @data: The auxiliary data for the object + * @datalen: The size of the auxiliary data + * + * This function consults the netfs about the coherency state of an object + */ +enum fscache_checkaux fscache_check_aux(struct fscache_object *object, + const void *data, uint16_t datalen) +{ + enum fscache_checkaux result; + + if (!object->cookie->def->check_aux) { + fscache_stat(&fscache_n_checkaux_none); + return FSCACHE_CHECKAUX_OKAY; + } + + result = object->cookie->def->check_aux(object->cookie->netfs_data, + data, datalen); + switch (result) { + /* entry okay as is */ + case FSCACHE_CHECKAUX_OKAY: + fscache_stat(&fscache_n_checkaux_okay); + break; + + /* entry requires update */ + case FSCACHE_CHECKAUX_NEEDS_UPDATE: + fscache_stat(&fscache_n_checkaux_update); + break; + + /* entry requires deletion */ + case FSCACHE_CHECKAUX_OBSOLETE: + fscache_stat(&fscache_n_checkaux_obsolete); + break; + + default: + BUG(); + } + + return result; +} +EXPORT_SYMBOL(fscache_check_aux); diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c new file mode 100644 index 000000000000..e7f8d53b8b6b --- /dev/null +++ b/fs/fscache/operation.c @@ -0,0 +1,459 @@ +/* FS-Cache worker operation management routines + * + * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * See Documentation/filesystems/caching/operations.txt + */ + +#define FSCACHE_DEBUG_LEVEL OPERATION +#include <linux/module.h> +#include "internal.h" + +atomic_t fscache_op_debug_id; +EXPORT_SYMBOL(fscache_op_debug_id); + +/** + * fscache_enqueue_operation - Enqueue an operation for processing + * @op: The operation to enqueue + * + * Enqueue an operation for processing by the FS-Cache thread pool. + * + * This will get its own ref on the object. + */ +void fscache_enqueue_operation(struct fscache_operation *op) +{ + _enter("{OBJ%x OP%x,%u}", + op->object->debug_id, op->debug_id, atomic_read(&op->usage)); + + ASSERT(op->processor != NULL); + ASSERTCMP(op->object->state, >=, FSCACHE_OBJECT_AVAILABLE); + ASSERTCMP(atomic_read(&op->usage), >, 0); + + if (list_empty(&op->pend_link)) { + switch (op->flags & FSCACHE_OP_TYPE) { + case FSCACHE_OP_FAST: + _debug("queue fast"); + atomic_inc(&op->usage); + if (!schedule_work(&op->fast_work)) + fscache_put_operation(op); + break; + case FSCACHE_OP_SLOW: + _debug("queue slow"); + slow_work_enqueue(&op->slow_work); + break; + case FSCACHE_OP_MYTHREAD: + _debug("queue for caller's attention"); + break; + default: + printk(KERN_ERR "FS-Cache: Unexpected op type %lx", + op->flags); + BUG(); + break; + } + fscache_stat(&fscache_n_op_enqueue); + } +} +EXPORT_SYMBOL(fscache_enqueue_operation); + +/* + * start an op running + */ +static void fscache_run_op(struct fscache_object *object, + struct fscache_operation *op) +{ + object->n_in_progress++; + if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags)) + wake_up_bit(&op->flags, FSCACHE_OP_WAITING); + if (op->processor) + fscache_enqueue_operation(op); + fscache_stat(&fscache_n_op_run); +} + +/* + * submit an exclusive operation for an object + * - other ops are excluded from running simultaneously with this one + * - this gets any extra refs it needs on an op + */ +int fscache_submit_exclusive_op(struct fscache_object *object, + struct fscache_operation *op) +{ + int ret; + + _enter("{OBJ%x OP%x},", object->debug_id, op->debug_id); + + spin_lock(&object->lock); + ASSERTCMP(object->n_ops, >=, object->n_in_progress); + ASSERTCMP(object->n_ops, >=, object->n_exclusive); + + ret = -ENOBUFS; + if (fscache_object_is_active(object)) { + op->object = object; + object->n_ops++; + object->n_exclusive++; /* reads and writes must wait */ + + if (object->n_ops > 0) { + atomic_inc(&op->usage); + list_add_tail(&op->pend_link, &object->pending_ops); + fscache_stat(&fscache_n_op_pend); + } else if (!list_empty(&object->pending_ops)) { + atomic_inc(&op->usage); + list_add_tail(&op->pend_link, &object->pending_ops); + fscache_stat(&fscache_n_op_pend); + fscache_start_operations(object); + } else { + ASSERTCMP(object->n_in_progress, ==, 0); + fscache_run_op(object, op); + } + + /* need to issue a new write op after this */ + clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags); + ret = 0; + } else if (object->state == FSCACHE_OBJECT_CREATING) { + op->object = object; + object->n_ops++; + object->n_exclusive++; /* reads and writes must wait */ + atomic_inc(&op->usage); + list_add_tail(&op->pend_link, &object->pending_ops); + fscache_stat(&fscache_n_op_pend); + ret = 0; + } else { + /* not allowed to submit ops in any other state */ + BUG(); + } + + spin_unlock(&object->lock); + return ret; +} + +/* + * report an unexpected submission + */ +static void fscache_report_unexpected_submission(struct fscache_object *object, + struct fscache_operation *op, + unsigned long ostate) +{ + static bool once_only; + struct fscache_operation *p; + unsigned n; + + if (once_only) + return; + once_only = true; + + kdebug("unexpected submission OP%x [OBJ%x %s]", + op->debug_id, object->debug_id, + fscache_object_states[object->state]); + kdebug("objstate=%s [%s]", + fscache_object_states[object->state], + fscache_object_states[ostate]); + kdebug("objflags=%lx", object->flags); + kdebug("objevent=%lx [%lx]", object->events, object->event_mask); + kdebug("ops=%u inp=%u exc=%u", + object->n_ops, object->n_in_progress, object->n_exclusive); + + if (!list_empty(&object->pending_ops)) { + n = 0; + list_for_each_entry(p, &object->pending_ops, pend_link) { + ASSERTCMP(p->object, ==, object); + kdebug("%p %p", op->processor, op->release); + n++; + } + + kdebug("n=%u", n); + } + + dump_stack(); +} + +/* + * submit an operation for an object + * - objects may be submitted only in the following states: + * - during object creation (write ops may be submitted) + * - whilst the object is active + * - after an I/O error incurred in one of the two above states (op rejected) + * - this gets any extra refs it needs on an op + */ +int fscache_submit_op(struct fscache_object *object, + struct fscache_operation *op) +{ + unsigned long ostate; + int ret; + + _enter("{OBJ%x OP%x},{%u}", + object->debug_id, op->debug_id, atomic_read(&op->usage)); + + ASSERTCMP(atomic_read(&op->usage), >, 0); + + spin_lock(&object->lock); + ASSERTCMP(object->n_ops, >=, object->n_in_progress); + ASSERTCMP(object->n_ops, >=, object->n_exclusive); + + ostate = object->state; + smp_rmb(); + + if (fscache_object_is_active(object)) { + op->object = object; + object->n_ops++; + + if (object->n_exclusive > 0) { + atomic_inc(&op->usage); + list_add_tail(&op->pend_link, &object->pending_ops); + fscache_stat(&fscache_n_op_pend); + } else if (!list_empty(&object->pending_ops)) { + atomic_inc(&op->usage); + list_add_tail(&op->pend_link, &object->pending_ops); + fscache_stat(&fscache_n_op_pend); + fscache_start_operations(object); + } else { + ASSERTCMP(object->n_exclusive, ==, 0); + fscache_run_op(object, op); + } + ret = 0; + } else if (object->state == FSCACHE_OBJECT_CREATING) { + op->object = object; + object->n_ops++; + atomic_inc(&op->usage); + list_add_tail(&op->pend_link, &object->pending_ops); + fscache_stat(&fscache_n_op_pend); + ret = 0; + } else if (!test_bit(FSCACHE_IOERROR, &object->cache->flags)) { + fscache_report_unexpected_submission(object, op, ostate); + ASSERT(!fscache_object_is_active(object)); + ret = -ENOBUFS; + } else { + ret = -ENOBUFS; + } + + spin_unlock(&object->lock); + return ret; +} + +/* + * queue an object for withdrawal on error, aborting all following asynchronous + * operations + */ +void fscache_abort_object(struct fscache_object *object) +{ + _enter("{OBJ%x}", object->debug_id); + + fscache_raise_event(object, FSCACHE_OBJECT_EV_ERROR); +} + +/* + * jump start the operation processing on an object + * - caller must hold object->lock + */ +void fscache_start_operations(struct fscache_object *object) +{ + struct fscache_operation *op; + bool stop = false; + + while (!list_empty(&object->pending_ops) && !stop) { + op = list_entry(object->pending_ops.next, + struct fscache_operation, pend_link); + + if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) { + if (object->n_in_progress > 0) + break; + stop = true; + } + list_del_init(&op->pend_link); + object->n_in_progress++; + + if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags)) + wake_up_bit(&op->flags, FSCACHE_OP_WAITING); + if (op->processor) + fscache_enqueue_operation(op); + + /* the pending queue was holding a ref on the object */ + fscache_put_operation(op); + } + + ASSERTCMP(object->n_in_progress, <=, object->n_ops); + + _debug("woke %d ops on OBJ%x", + object->n_in_progress, object->debug_id); +} + +/* + * release an operation + * - queues pending ops if this is the last in-progress op + */ +void fscache_put_operation(struct fscache_operation *op) +{ + struct fscache_object *object; + struct fscache_cache *cache; + + _enter("{OBJ%x OP%x,%d}", + op->object->debug_id, op->debug_id, atomic_read(&op->usage)); + + ASSERTCMP(atomic_read(&op->usage), >, 0); + + if (!atomic_dec_and_test(&op->usage)) + return; + + _debug("PUT OP"); + if (test_and_set_bit(FSCACHE_OP_DEAD, &op->flags)) + BUG(); + + fscache_stat(&fscache_n_op_release); + + if (op->release) { + op->release(op); + op->release = NULL; + } + + object = op->object; + + /* now... we may get called with the object spinlock held, so we + * complete the cleanup here only if we can immediately acquire the + * lock, and defer it otherwise */ + if (!spin_trylock(&object->lock)) { + _debug("defer put"); + fscache_stat(&fscache_n_op_deferred_release); + + cache = object->cache; + spin_lock(&cache->op_gc_list_lock); + list_add_tail(&op->pend_link, &cache->op_gc_list); + spin_unlock(&cache->op_gc_list_lock); + schedule_work(&cache->op_gc); + _leave(" [defer]"); + return; + } + + if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) { + ASSERTCMP(object->n_exclusive, >, 0); + object->n_exclusive--; + } + + ASSERTCMP(object->n_in_progress, >, 0); + object->n_in_progress--; + if (object->n_in_progress == 0) + fscache_start_operations(object); + + ASSERTCMP(object->n_ops, >, 0); + object->n_ops--; + if (object->n_ops == 0) + fscache_raise_event(object, FSCACHE_OBJECT_EV_CLEARED); + + spin_unlock(&object->lock); + + kfree(op); + _leave(" [done]"); +} +EXPORT_SYMBOL(fscache_put_operation); + +/* + * garbage collect operations that have had their release deferred + */ +void fscache_operation_gc(struct work_struct *work) +{ + struct fscache_operation *op; + struct fscache_object *object; + struct fscache_cache *cache = + container_of(work, struct fscache_cache, op_gc); + int count = 0; + + _enter(""); + + do { + spin_lock(&cache->op_gc_list_lock); + if (list_empty(&cache->op_gc_list)) { + spin_unlock(&cache->op_gc_list_lock); + break; + } + + op = list_entry(cache->op_gc_list.next, + struct fscache_operation, pend_link); + list_del(&op->pend_link); + spin_unlock(&cache->op_gc_list_lock); + + object = op->object; + + _debug("GC DEFERRED REL OBJ%x OP%x", + object->debug_id, op->debug_id); + fscache_stat(&fscache_n_op_gc); + + ASSERTCMP(atomic_read(&op->usage), ==, 0); + + spin_lock(&object->lock); + if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) { + ASSERTCMP(object->n_exclusive, >, 0); + object->n_exclusive--; + } + + ASSERTCMP(object->n_in_progress, >, 0); + object->n_in_progress--; + if (object->n_in_progress == 0) + fscache_start_operations(object); + + ASSERTCMP(object->n_ops, >, 0); + object->n_ops--; + if (object->n_ops == 0) + fscache_raise_event(object, FSCACHE_OBJECT_EV_CLEARED); + + spin_unlock(&object->lock); + + } while (count++ < 20); + + if (!list_empty(&cache->op_gc_list)) + schedule_work(&cache->op_gc); + + _leave(""); +} + +/* + * allow the slow work item processor to get a ref on an operation + */ +static int fscache_op_get_ref(struct slow_work *work) +{ + struct fscache_operation *op = + container_of(work, struct fscache_operation, slow_work); + + atomic_inc(&op->usage); + return 0; +} + +/* + * allow the slow work item processor to discard a ref on an operation + */ +static void fscache_op_put_ref(struct slow_work *work) +{ + struct fscache_operation *op = + container_of(work, struct fscache_operation, slow_work); + + fscache_put_operation(op); +} + +/* + * execute an operation using the slow thread pool to provide processing context + * - the caller holds a ref to this object, so we don't need to hold one + */ +static void fscache_op_execute(struct slow_work *work) +{ + struct fscache_operation *op = + container_of(work, struct fscache_operation, slow_work); + unsigned long start; + + _enter("{OBJ%x OP%x,%d}", + op->object->debug_id, op->debug_id, atomic_read(&op->usage)); + + ASSERT(op->processor != NULL); + start = jiffies; + op->processor(op); + fscache_hist(fscache_ops_histogram, start); + + _leave(""); +} + +const struct slow_work_ops fscache_op_slow_work_ops = { + .get_ref = fscache_op_get_ref, + .put_ref = fscache_op_put_ref, + .execute = fscache_op_execute, +}; diff --git a/fs/fscache/page.c b/fs/fscache/page.c new file mode 100644 index 000000000000..2568e0eb644f --- /dev/null +++ b/fs/fscache/page.c @@ -0,0 +1,816 @@ +/* Cache page management and data I/O routines + * + * Copyright (C) 2004-2008 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define FSCACHE_DEBUG_LEVEL PAGE +#include <linux/module.h> +#include <linux/fscache-cache.h> +#include <linux/buffer_head.h> +#include <linux/pagevec.h> +#include "internal.h" + +/* + * check to see if a page is being written to the cache + */ +bool __fscache_check_page_write(struct fscache_cookie *cookie, struct page *page) +{ + void *val; + + rcu_read_lock(); + val = radix_tree_lookup(&cookie->stores, page->index); + rcu_read_unlock(); + + return val != NULL; +} +EXPORT_SYMBOL(__fscache_check_page_write); + +/* + * wait for a page to finish being written to the cache + */ +void __fscache_wait_on_page_write(struct fscache_cookie *cookie, struct page *page) +{ + wait_queue_head_t *wq = bit_waitqueue(&cookie->flags, 0); + + wait_event(*wq, !__fscache_check_page_write(cookie, page)); +} +EXPORT_SYMBOL(__fscache_wait_on_page_write); + +/* + * note that a page has finished being written to the cache + */ +static void fscache_end_page_write(struct fscache_cookie *cookie, struct page *page) +{ + struct page *xpage; + + spin_lock(&cookie->lock); + xpage = radix_tree_delete(&cookie->stores, page->index); + spin_unlock(&cookie->lock); + ASSERT(xpage != NULL); + + wake_up_bit(&cookie->flags, 0); +} + +/* + * actually apply the changed attributes to a cache object + */ +static void fscache_attr_changed_op(struct fscache_operation *op) +{ + struct fscache_object *object = op->object; + + _enter("{OBJ%x OP%x}", object->debug_id, op->debug_id); + + fscache_stat(&fscache_n_attr_changed_calls); + + if (fscache_object_is_active(object) && + object->cache->ops->attr_changed(object) < 0) + fscache_abort_object(object); + + _leave(""); +} + +/* + * notification that the attributes on an object have changed + */ +int __fscache_attr_changed(struct fscache_cookie *cookie) +{ + struct fscache_operation *op; + struct fscache_object *object; + + _enter("%p", cookie); + + ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX); + + fscache_stat(&fscache_n_attr_changed); + + op = kzalloc(sizeof(*op), GFP_KERNEL); + if (!op) { + fscache_stat(&fscache_n_attr_changed_nomem); + _leave(" = -ENOMEM"); + return -ENOMEM; + } + + fscache_operation_init(op, NULL); + fscache_operation_init_slow(op, fscache_attr_changed_op); + op->flags = FSCACHE_OP_SLOW | (1 << FSCACHE_OP_EXCLUSIVE); + + spin_lock(&cookie->lock); + + if (hlist_empty(&cookie->backing_objects)) + goto nobufs; + object = hlist_entry(cookie->backing_objects.first, + struct fscache_object, cookie_link); + + if (fscache_submit_exclusive_op(object, op) < 0) + goto nobufs; + spin_unlock(&cookie->lock); + fscache_stat(&fscache_n_attr_changed_ok); + fscache_put_operation(op); + _leave(" = 0"); + return 0; + +nobufs: + spin_unlock(&cookie->lock); + kfree(op); + fscache_stat(&fscache_n_attr_changed_nobufs); + _leave(" = %d", -ENOBUFS); + return -ENOBUFS; +} +EXPORT_SYMBOL(__fscache_attr_changed); + +/* + * handle secondary execution given to a retrieval op on behalf of the + * cache + */ +static void fscache_retrieval_work(struct work_struct *work) +{ + struct fscache_retrieval *op = + container_of(work, struct fscache_retrieval, op.fast_work); + unsigned long start; + + _enter("{OP%x}", op->op.debug_id); + + start = jiffies; + op->op.processor(&op->op); + fscache_hist(fscache_ops_histogram, start); + fscache_put_operation(&op->op); +} + +/* + * release a retrieval op reference + */ +static void fscache_release_retrieval_op(struct fscache_operation *_op) +{ + struct fscache_retrieval *op = + container_of(_op, struct fscache_retrieval, op); + + _enter("{OP%x}", op->op.debug_id); + + fscache_hist(fscache_retrieval_histogram, op->start_time); + if (op->context) + fscache_put_context(op->op.object->cookie, op->context); + + _leave(""); +} + +/* + * allocate a retrieval op + */ +static struct fscache_retrieval *fscache_alloc_retrieval( + struct address_space *mapping, + fscache_rw_complete_t end_io_func, + void *context) +{ + struct fscache_retrieval *op; + + /* allocate a retrieval operation and attempt to submit it */ + op = kzalloc(sizeof(*op), GFP_NOIO); + if (!op) { + fscache_stat(&fscache_n_retrievals_nomem); + return NULL; + } + + fscache_operation_init(&op->op, fscache_release_retrieval_op); + op->op.flags = FSCACHE_OP_MYTHREAD | (1 << FSCACHE_OP_WAITING); + op->mapping = mapping; + op->end_io_func = end_io_func; + op->context = context; + op->start_time = jiffies; + INIT_WORK(&op->op.fast_work, fscache_retrieval_work); + INIT_LIST_HEAD(&op->to_do); + return op; +} + +/* + * wait for a deferred lookup to complete + */ +static int fscache_wait_for_deferred_lookup(struct fscache_cookie *cookie) +{ + unsigned long jif; + + _enter(""); + + if (!test_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags)) { + _leave(" = 0 [imm]"); + return 0; + } + + fscache_stat(&fscache_n_retrievals_wait); + + jif = jiffies; + if (wait_on_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP, + fscache_wait_bit_interruptible, + TASK_INTERRUPTIBLE) != 0) { + fscache_stat(&fscache_n_retrievals_intr); + _leave(" = -ERESTARTSYS"); + return -ERESTARTSYS; + } + + ASSERT(!test_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags)); + + smp_rmb(); + fscache_hist(fscache_retrieval_delay_histogram, jif); + _leave(" = 0 [dly]"); + return 0; +} + +/* + * read a page from the cache or allocate a block in which to store it + * - we return: + * -ENOMEM - out of memory, nothing done + * -ERESTARTSYS - interrupted + * -ENOBUFS - no backing object available in which to cache the block + * -ENODATA - no data available in the backing object for this block + * 0 - dispatched a read - it'll call end_io_func() when finished + */ +int __fscache_read_or_alloc_page(struct fscache_cookie *cookie, + struct page *page, + fscache_rw_complete_t end_io_func, + void *context, + gfp_t gfp) +{ + struct fscache_retrieval *op; + struct fscache_object *object; + int ret; + + _enter("%p,%p,,,", cookie, page); + + fscache_stat(&fscache_n_retrievals); + + if (hlist_empty(&cookie->backing_objects)) + goto nobufs; + + ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX); + ASSERTCMP(page, !=, NULL); + + if (fscache_wait_for_deferred_lookup(cookie) < 0) + return -ERESTARTSYS; + + op = fscache_alloc_retrieval(page->mapping, end_io_func, context); + if (!op) { + _leave(" = -ENOMEM"); + return -ENOMEM; + } + + spin_lock(&cookie->lock); + + if (hlist_empty(&cookie->backing_objects)) + goto nobufs_unlock; + object = hlist_entry(cookie->backing_objects.first, + struct fscache_object, cookie_link); + + ASSERTCMP(object->state, >, FSCACHE_OBJECT_LOOKING_UP); + + if (fscache_submit_op(object, &op->op) < 0) + goto nobufs_unlock; + spin_unlock(&cookie->lock); + + fscache_stat(&fscache_n_retrieval_ops); + + /* pin the netfs read context in case we need to do the actual netfs + * read because we've encountered a cache read failure */ + fscache_get_context(object->cookie, op->context); + + /* we wait for the operation to become active, and then process it + * *here*, in this thread, and not in the thread pool */ + if (test_bit(FSCACHE_OP_WAITING, &op->op.flags)) { + _debug(">>> WT"); + fscache_stat(&fscache_n_retrieval_op_waits); + wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING, + fscache_wait_bit, TASK_UNINTERRUPTIBLE); + _debug("<<< GO"); + } + + /* ask the cache to honour the operation */ + if (test_bit(FSCACHE_COOKIE_NO_DATA_YET, &object->cookie->flags)) { + ret = object->cache->ops->allocate_page(op, page, gfp); + if (ret == 0) + ret = -ENODATA; + } else { + ret = object->cache->ops->read_or_alloc_page(op, page, gfp); + } + + if (ret == -ENOMEM) + fscache_stat(&fscache_n_retrievals_nomem); + else if (ret == -ERESTARTSYS) + fscache_stat(&fscache_n_retrievals_intr); + else if (ret == -ENODATA) + fscache_stat(&fscache_n_retrievals_nodata); + else if (ret < 0) + fscache_stat(&fscache_n_retrievals_nobufs); + else + fscache_stat(&fscache_n_retrievals_ok); + + fscache_put_retrieval(op); + _leave(" = %d", ret); + return ret; + +nobufs_unlock: + spin_unlock(&cookie->lock); + kfree(op); +nobufs: + fscache_stat(&fscache_n_retrievals_nobufs); + _leave(" = -ENOBUFS"); + return -ENOBUFS; +} +EXPORT_SYMBOL(__fscache_read_or_alloc_page); + +/* + * read a list of page from the cache or allocate a block in which to store + * them + * - we return: + * -ENOMEM - out of memory, some pages may be being read + * -ERESTARTSYS - interrupted, some pages may be being read + * -ENOBUFS - no backing object or space available in which to cache any + * pages not being read + * -ENODATA - no data available in the backing object for some or all of + * the pages + * 0 - dispatched a read on all pages + * + * end_io_func() will be called for each page read from the cache as it is + * finishes being read + * + * any pages for which a read is dispatched will be removed from pages and + * nr_pages + */ +int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie, + struct address_space *mapping, + struct list_head *pages, + unsigned *nr_pages, + fscache_rw_complete_t end_io_func, + void *context, + gfp_t gfp) +{ + fscache_pages_retrieval_func_t func; + struct fscache_retrieval *op; + struct fscache_object *object; + int ret; + + _enter("%p,,%d,,,", cookie, *nr_pages); + + fscache_stat(&fscache_n_retrievals); + + if (hlist_empty(&cookie->backing_objects)) + goto nobufs; + + ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX); + ASSERTCMP(*nr_pages, >, 0); + ASSERT(!list_empty(pages)); + + if (fscache_wait_for_deferred_lookup(cookie) < 0) + return -ERESTARTSYS; + + op = fscache_alloc_retrieval(mapping, end_io_func, context); + if (!op) + return -ENOMEM; + + spin_lock(&cookie->lock); + + if (hlist_empty(&cookie->backing_objects)) + goto nobufs_unlock; + object = hlist_entry(cookie->backing_objects.first, + struct fscache_object, cookie_link); + + if (fscache_submit_op(object, &op->op) < 0) + goto nobufs_unlock; + spin_unlock(&cookie->lock); + + fscache_stat(&fscache_n_retrieval_ops); + + /* pin the netfs read context in case we need to do the actual netfs + * read because we've encountered a cache read failure */ + fscache_get_context(object->cookie, op->context); + + /* we wait for the operation to become active, and then process it + * *here*, in this thread, and not in the thread pool */ + if (test_bit(FSCACHE_OP_WAITING, &op->op.flags)) { + _debug(">>> WT"); + fscache_stat(&fscache_n_retrieval_op_waits); + wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING, + fscache_wait_bit, TASK_UNINTERRUPTIBLE); + _debug("<<< GO"); + } + + /* ask the cache to honour the operation */ + if (test_bit(FSCACHE_COOKIE_NO_DATA_YET, &object->cookie->flags)) + func = object->cache->ops->allocate_pages; + else + func = object->cache->ops->read_or_alloc_pages; + ret = func(op, pages, nr_pages, gfp); + + if (ret == -ENOMEM) + fscache_stat(&fscache_n_retrievals_nomem); + else if (ret == -ERESTARTSYS) + fscache_stat(&fscache_n_retrievals_intr); + else if (ret == -ENODATA) + fscache_stat(&fscache_n_retrievals_nodata); + else if (ret < 0) + fscache_stat(&fscache_n_retrievals_nobufs); + else + fscache_stat(&fscache_n_retrievals_ok); + + fscache_put_retrieval(op); + _leave(" = %d", ret); + return ret; + +nobufs_unlock: + spin_unlock(&cookie->lock); + kfree(op); +nobufs: + fscache_stat(&fscache_n_retrievals_nobufs); + _leave(" = -ENOBUFS"); + return -ENOBUFS; +} +EXPORT_SYMBOL(__fscache_read_or_alloc_pages); + +/* + * allocate a block in the cache on which to store a page + * - we return: + * -ENOMEM - out of memory, nothing done + * -ERESTARTSYS - interrupted + * -ENOBUFS - no backing object available in which to cache the block + * 0 - block allocated + */ +int __fscache_alloc_page(struct fscache_cookie *cookie, + struct page *page, + gfp_t gfp) +{ + struct fscache_retrieval *op; + struct fscache_object *object; + int ret; + + _enter("%p,%p,,,", cookie, page); + + fscache_stat(&fscache_n_allocs); + + if (hlist_empty(&cookie->backing_objects)) + goto nobufs; + + ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX); + ASSERTCMP(page, !=, NULL); + + if (fscache_wait_for_deferred_lookup(cookie) < 0) + return -ERESTARTSYS; + + op = fscache_alloc_retrieval(page->mapping, NULL, NULL); + if (!op) + return -ENOMEM; + + spin_lock(&cookie->lock); + + if (hlist_empty(&cookie->backing_objects)) + goto nobufs_unlock; + object = hlist_entry(cookie->backing_objects.first, + struct fscache_object, cookie_link); + + if (fscache_submit_op(object, &op->op) < 0) + goto nobufs_unlock; + spin_unlock(&cookie->lock); + + fscache_stat(&fscache_n_alloc_ops); + + if (test_bit(FSCACHE_OP_WAITING, &op->op.flags)) { + _debug(">>> WT"); + fscache_stat(&fscache_n_alloc_op_waits); + wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING, + fscache_wait_bit, TASK_UNINTERRUPTIBLE); + _debug("<<< GO"); + } + + /* ask the cache to honour the operation */ + ret = object->cache->ops->allocate_page(op, page, gfp); + + if (ret < 0) + fscache_stat(&fscache_n_allocs_nobufs); + else + fscache_stat(&fscache_n_allocs_ok); + + fscache_put_retrieval(op); + _leave(" = %d", ret); + return ret; + +nobufs_unlock: + spin_unlock(&cookie->lock); + kfree(op); +nobufs: + fscache_stat(&fscache_n_allocs_nobufs); + _leave(" = -ENOBUFS"); + return -ENOBUFS; +} +EXPORT_SYMBOL(__fscache_alloc_page); + +/* + * release a write op reference + */ +static void fscache_release_write_op(struct fscache_operation *_op) +{ + _enter("{OP%x}", _op->debug_id); +} + +/* + * perform the background storage of a page into the cache + */ +static void fscache_write_op(struct fscache_operation *_op) +{ + struct fscache_storage *op = + container_of(_op, struct fscache_storage, op); + struct fscache_object *object = op->op.object; + struct fscache_cookie *cookie = object->cookie; + struct page *page; + unsigned n; + void *results[1]; + int ret; + + _enter("{OP%x,%d}", op->op.debug_id, atomic_read(&op->op.usage)); + + spin_lock(&cookie->lock); + spin_lock(&object->lock); + + if (!fscache_object_is_active(object)) { + spin_unlock(&object->lock); + spin_unlock(&cookie->lock); + _leave(""); + return; + } + + fscache_stat(&fscache_n_store_calls); + + /* find a page to store */ + page = NULL; + n = radix_tree_gang_lookup_tag(&cookie->stores, results, 0, 1, + FSCACHE_COOKIE_PENDING_TAG); + if (n != 1) + goto superseded; + page = results[0]; + _debug("gang %d [%lx]", n, page->index); + if (page->index > op->store_limit) + goto superseded; + + radix_tree_tag_clear(&cookie->stores, page->index, + FSCACHE_COOKIE_PENDING_TAG); + + spin_unlock(&object->lock); + spin_unlock(&cookie->lock); + + if (page) { + ret = object->cache->ops->write_page(op, page); + fscache_end_page_write(cookie, page); + page_cache_release(page); + if (ret < 0) + fscache_abort_object(object); + else + fscache_enqueue_operation(&op->op); + } + + _leave(""); + return; + +superseded: + /* this writer is going away and there aren't any more things to + * write */ + _debug("cease"); + clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags); + spin_unlock(&object->lock); + spin_unlock(&cookie->lock); + _leave(""); +} + +/* + * request a page be stored in the cache + * - returns: + * -ENOMEM - out of memory, nothing done + * -ENOBUFS - no backing object available in which to cache the page + * 0 - dispatched a write - it'll call end_io_func() when finished + * + * if the cookie still has a backing object at this point, that object can be + * in one of a few states with respect to storage processing: + * + * (1) negative lookup, object not yet created (FSCACHE_COOKIE_CREATING is + * set) + * + * (a) no writes yet (set FSCACHE_COOKIE_PENDING_FILL and queue deferred + * fill op) + * + * (b) writes deferred till post-creation (mark page for writing and + * return immediately) + * + * (2) negative lookup, object created, initial fill being made from netfs + * (FSCACHE_COOKIE_INITIAL_FILL is set) + * + * (a) fill point not yet reached this page (mark page for writing and + * return) + * + * (b) fill point passed this page (queue op to store this page) + * + * (3) object extant (queue op to store this page) + * + * any other state is invalid + */ +int __fscache_write_page(struct fscache_cookie *cookie, + struct page *page, + gfp_t gfp) +{ + struct fscache_storage *op; + struct fscache_object *object; + int ret; + + _enter("%p,%x,", cookie, (u32) page->flags); + + ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX); + ASSERT(PageFsCache(page)); + + fscache_stat(&fscache_n_stores); + + op = kzalloc(sizeof(*op), GFP_NOIO); + if (!op) + goto nomem; + + fscache_operation_init(&op->op, fscache_release_write_op); + fscache_operation_init_slow(&op->op, fscache_write_op); + op->op.flags = FSCACHE_OP_SLOW | (1 << FSCACHE_OP_WAITING); + + ret = radix_tree_preload(gfp & ~__GFP_HIGHMEM); + if (ret < 0) + goto nomem_free; + + ret = -ENOBUFS; + spin_lock(&cookie->lock); + + if (hlist_empty(&cookie->backing_objects)) + goto nobufs; + object = hlist_entry(cookie->backing_objects.first, + struct fscache_object, cookie_link); + if (test_bit(FSCACHE_IOERROR, &object->cache->flags)) + goto nobufs; + + /* add the page to the pending-storage radix tree on the backing + * object */ + spin_lock(&object->lock); + + _debug("store limit %llx", (unsigned long long) object->store_limit); + + ret = radix_tree_insert(&cookie->stores, page->index, page); + if (ret < 0) { + if (ret == -EEXIST) + goto already_queued; + _debug("insert failed %d", ret); + goto nobufs_unlock_obj; + } + + radix_tree_tag_set(&cookie->stores, page->index, + FSCACHE_COOKIE_PENDING_TAG); + page_cache_get(page); + + /* we only want one writer at a time, but we do need to queue new + * writers after exclusive ops */ + if (test_and_set_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags)) + goto already_pending; + + spin_unlock(&object->lock); + + op->op.debug_id = atomic_inc_return(&fscache_op_debug_id); + op->store_limit = object->store_limit; + + if (fscache_submit_op(object, &op->op) < 0) + goto submit_failed; + + spin_unlock(&cookie->lock); + radix_tree_preload_end(); + fscache_stat(&fscache_n_store_ops); + fscache_stat(&fscache_n_stores_ok); + + /* the slow work queue now carries its own ref on the object */ + fscache_put_operation(&op->op); + _leave(" = 0"); + return 0; + +already_queued: + fscache_stat(&fscache_n_stores_again); +already_pending: + spin_unlock(&object->lock); + spin_unlock(&cookie->lock); + radix_tree_preload_end(); + kfree(op); + fscache_stat(&fscache_n_stores_ok); + _leave(" = 0"); + return 0; + +submit_failed: + radix_tree_delete(&cookie->stores, page->index); + page_cache_release(page); + ret = -ENOBUFS; + goto nobufs; + +nobufs_unlock_obj: + spin_unlock(&object->lock); +nobufs: + spin_unlock(&cookie->lock); + radix_tree_preload_end(); + kfree(op); + fscache_stat(&fscache_n_stores_nobufs); + _leave(" = -ENOBUFS"); + return -ENOBUFS; + +nomem_free: + kfree(op); +nomem: + fscache_stat(&fscache_n_stores_oom); + _leave(" = -ENOMEM"); + return -ENOMEM; +} +EXPORT_SYMBOL(__fscache_write_page); + +/* + * remove a page from the cache + */ +void __fscache_uncache_page(struct fscache_cookie *cookie, struct page *page) +{ + struct fscache_object *object; + + _enter(",%p", page); + + ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX); + ASSERTCMP(page, !=, NULL); + + fscache_stat(&fscache_n_uncaches); + + /* cache withdrawal may beat us to it */ + if (!PageFsCache(page)) + goto done; + + /* get the object */ + spin_lock(&cookie->lock); + + if (hlist_empty(&cookie->backing_objects)) { + ClearPageFsCache(page); + goto done_unlock; + } + + object = hlist_entry(cookie->backing_objects.first, + struct fscache_object, cookie_link); + + /* there might now be stuff on disk we could read */ + clear_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags); + + /* only invoke the cache backend if we managed to mark the page + * uncached here; this deals with synchronisation vs withdrawal */ + if (TestClearPageFsCache(page) && + object->cache->ops->uncache_page) { + /* the cache backend releases the cookie lock */ + object->cache->ops->uncache_page(object, page); + goto done; + } + +done_unlock: + spin_unlock(&cookie->lock); +done: + _leave(""); +} +EXPORT_SYMBOL(__fscache_uncache_page); + +/** + * fscache_mark_pages_cached - Mark pages as being cached + * @op: The retrieval op pages are being marked for + * @pagevec: The pages to be marked + * + * Mark a bunch of netfs pages as being cached. After this is called, + * the netfs must call fscache_uncache_page() to remove the mark. + */ +void fscache_mark_pages_cached(struct fscache_retrieval *op, + struct pagevec *pagevec) +{ + struct fscache_cookie *cookie = op->op.object->cookie; + unsigned long loop; + +#ifdef CONFIG_FSCACHE_STATS + atomic_add(pagevec->nr, &fscache_n_marks); +#endif + + for (loop = 0; loop < pagevec->nr; loop++) { + struct page *page = pagevec->pages[loop]; + + _debug("- mark %p{%lx}", page, page->index); + if (TestSetPageFsCache(page)) { + static bool once_only; + if (!once_only) { + once_only = true; + printk(KERN_WARNING "FS-Cache:" + " Cookie type %s marked page %lx" + " multiple times\n", + cookie->def->name, page->index); + } + } + } + + if (cookie->def->mark_pages_cached) + cookie->def->mark_pages_cached(cookie->netfs_data, + op->mapping, pagevec); + pagevec_reinit(pagevec); +} +EXPORT_SYMBOL(fscache_mark_pages_cached); diff --git a/fs/fscache/proc.c b/fs/fscache/proc.c new file mode 100644 index 000000000000..beeab44bc31a --- /dev/null +++ b/fs/fscache/proc.c @@ -0,0 +1,68 @@ +/* FS-Cache statistics viewing interface + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define FSCACHE_DEBUG_LEVEL OPERATION +#include <linux/module.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include "internal.h" + +/* + * initialise the /proc/fs/fscache/ directory + */ +int __init fscache_proc_init(void) +{ + _enter(""); + + if (!proc_mkdir("fs/fscache", NULL)) + goto error_dir; + +#ifdef CONFIG_FSCACHE_STATS + if (!proc_create("fs/fscache/stats", S_IFREG | 0444, NULL, + &fscache_stats_fops)) + goto error_stats; +#endif + +#ifdef CONFIG_FSCACHE_HISTOGRAM + if (!proc_create("fs/fscache/histogram", S_IFREG | 0444, NULL, + &fscache_histogram_fops)) + goto error_histogram; +#endif + + _leave(" = 0"); + return 0; + +#ifdef CONFIG_FSCACHE_HISTOGRAM +error_histogram: +#endif +#ifdef CONFIG_FSCACHE_STATS + remove_proc_entry("fs/fscache/stats", NULL); +error_stats: +#endif + remove_proc_entry("fs/fscache", NULL); +error_dir: + _leave(" = -ENOMEM"); + return -ENOMEM; +} + +/* + * clean up the /proc/fs/fscache/ directory + */ +void fscache_proc_cleanup(void) +{ +#ifdef CONFIG_FSCACHE_HISTOGRAM + remove_proc_entry("fs/fscache/histogram", NULL); +#endif +#ifdef CONFIG_FSCACHE_STATS + remove_proc_entry("fs/fscache/stats", NULL); +#endif + remove_proc_entry("fs/fscache", NULL); +} diff --git a/fs/fscache/stats.c b/fs/fscache/stats.c new file mode 100644 index 000000000000..65deb99e756b --- /dev/null +++ b/fs/fscache/stats.c @@ -0,0 +1,212 @@ +/* FS-Cache statistics + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define FSCACHE_DEBUG_LEVEL THREAD +#include <linux/module.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include "internal.h" + +/* + * operation counters + */ +atomic_t fscache_n_op_pend; +atomic_t fscache_n_op_run; +atomic_t fscache_n_op_enqueue; +atomic_t fscache_n_op_requeue; +atomic_t fscache_n_op_deferred_release; +atomic_t fscache_n_op_release; +atomic_t fscache_n_op_gc; + +atomic_t fscache_n_attr_changed; +atomic_t fscache_n_attr_changed_ok; +atomic_t fscache_n_attr_changed_nobufs; +atomic_t fscache_n_attr_changed_nomem; +atomic_t fscache_n_attr_changed_calls; + +atomic_t fscache_n_allocs; +atomic_t fscache_n_allocs_ok; +atomic_t fscache_n_allocs_wait; +atomic_t fscache_n_allocs_nobufs; +atomic_t fscache_n_alloc_ops; +atomic_t fscache_n_alloc_op_waits; + +atomic_t fscache_n_retrievals; +atomic_t fscache_n_retrievals_ok; +atomic_t fscache_n_retrievals_wait; +atomic_t fscache_n_retrievals_nodata; +atomic_t fscache_n_retrievals_nobufs; +atomic_t fscache_n_retrievals_intr; +atomic_t fscache_n_retrievals_nomem; +atomic_t fscache_n_retrieval_ops; +atomic_t fscache_n_retrieval_op_waits; + +atomic_t fscache_n_stores; +atomic_t fscache_n_stores_ok; +atomic_t fscache_n_stores_again; +atomic_t fscache_n_stores_nobufs; +atomic_t fscache_n_stores_oom; +atomic_t fscache_n_store_ops; +atomic_t fscache_n_store_calls; + +atomic_t fscache_n_marks; +atomic_t fscache_n_uncaches; + +atomic_t fscache_n_acquires; +atomic_t fscache_n_acquires_null; +atomic_t fscache_n_acquires_no_cache; +atomic_t fscache_n_acquires_ok; +atomic_t fscache_n_acquires_nobufs; +atomic_t fscache_n_acquires_oom; + +atomic_t fscache_n_updates; +atomic_t fscache_n_updates_null; +atomic_t fscache_n_updates_run; + +atomic_t fscache_n_relinquishes; +atomic_t fscache_n_relinquishes_null; +atomic_t fscache_n_relinquishes_waitcrt; + +atomic_t fscache_n_cookie_index; +atomic_t fscache_n_cookie_data; +atomic_t fscache_n_cookie_special; + +atomic_t fscache_n_object_alloc; +atomic_t fscache_n_object_no_alloc; +atomic_t fscache_n_object_lookups; +atomic_t fscache_n_object_lookups_negative; +atomic_t fscache_n_object_lookups_positive; +atomic_t fscache_n_object_created; +atomic_t fscache_n_object_avail; +atomic_t fscache_n_object_dead; + +atomic_t fscache_n_checkaux_none; +atomic_t fscache_n_checkaux_okay; +atomic_t fscache_n_checkaux_update; +atomic_t fscache_n_checkaux_obsolete; + +/* + * display the general statistics + */ +static int fscache_stats_show(struct seq_file *m, void *v) +{ + seq_puts(m, "FS-Cache statistics\n"); + + seq_printf(m, "Cookies: idx=%u dat=%u spc=%u\n", + atomic_read(&fscache_n_cookie_index), + atomic_read(&fscache_n_cookie_data), + atomic_read(&fscache_n_cookie_special)); + + seq_printf(m, "Objects: alc=%u nal=%u avl=%u ded=%u\n", + atomic_read(&fscache_n_object_alloc), + atomic_read(&fscache_n_object_no_alloc), + atomic_read(&fscache_n_object_avail), + atomic_read(&fscache_n_object_dead)); + seq_printf(m, "ChkAux : non=%u ok=%u upd=%u obs=%u\n", + atomic_read(&fscache_n_checkaux_none), + atomic_read(&fscache_n_checkaux_okay), + atomic_read(&fscache_n_checkaux_update), + atomic_read(&fscache_n_checkaux_obsolete)); + + seq_printf(m, "Pages : mrk=%u unc=%u\n", + atomic_read(&fscache_n_marks), + atomic_read(&fscache_n_uncaches)); + + seq_printf(m, "Acquire: n=%u nul=%u noc=%u ok=%u nbf=%u" + " oom=%u\n", + atomic_read(&fscache_n_acquires), + atomic_read(&fscache_n_acquires_null), + atomic_read(&fscache_n_acquires_no_cache), + atomic_read(&fscache_n_acquires_ok), + atomic_read(&fscache_n_acquires_nobufs), + atomic_read(&fscache_n_acquires_oom)); + + seq_printf(m, "Lookups: n=%u neg=%u pos=%u crt=%u\n", + atomic_read(&fscache_n_object_lookups), + atomic_read(&fscache_n_object_lookups_negative), + atomic_read(&fscache_n_object_lookups_positive), + atomic_read(&fscache_n_object_created)); + + seq_printf(m, "Updates: n=%u nul=%u run=%u\n", + atomic_read(&fscache_n_updates), + atomic_read(&fscache_n_updates_null), + atomic_read(&fscache_n_updates_run)); + + seq_printf(m, "Relinqs: n=%u nul=%u wcr=%u\n", + atomic_read(&fscache_n_relinquishes), + atomic_read(&fscache_n_relinquishes_null), + atomic_read(&fscache_n_relinquishes_waitcrt)); + + seq_printf(m, "AttrChg: n=%u ok=%u nbf=%u oom=%u run=%u\n", + atomic_read(&fscache_n_attr_changed), + atomic_read(&fscache_n_attr_changed_ok), + atomic_read(&fscache_n_attr_changed_nobufs), + atomic_read(&fscache_n_attr_changed_nomem), + atomic_read(&fscache_n_attr_changed_calls)); + + seq_printf(m, "Allocs : n=%u ok=%u wt=%u nbf=%u\n", + atomic_read(&fscache_n_allocs), + atomic_read(&fscache_n_allocs_ok), + atomic_read(&fscache_n_allocs_wait), + atomic_read(&fscache_n_allocs_nobufs)); + seq_printf(m, "Allocs : ops=%u owt=%u\n", + atomic_read(&fscache_n_alloc_ops), + atomic_read(&fscache_n_alloc_op_waits)); + + seq_printf(m, "Retrvls: n=%u ok=%u wt=%u nod=%u nbf=%u" + " int=%u oom=%u\n", + atomic_read(&fscache_n_retrievals), + atomic_read(&fscache_n_retrievals_ok), + atomic_read(&fscache_n_retrievals_wait), + atomic_read(&fscache_n_retrievals_nodata), + atomic_read(&fscache_n_retrievals_nobufs), + atomic_read(&fscache_n_retrievals_intr), + atomic_read(&fscache_n_retrievals_nomem)); + seq_printf(m, "Retrvls: ops=%u owt=%u\n", + atomic_read(&fscache_n_retrieval_ops), + atomic_read(&fscache_n_retrieval_op_waits)); + + seq_printf(m, "Stores : n=%u ok=%u agn=%u nbf=%u oom=%u\n", + atomic_read(&fscache_n_stores), + atomic_read(&fscache_n_stores_ok), + atomic_read(&fscache_n_stores_again), + atomic_read(&fscache_n_stores_nobufs), + atomic_read(&fscache_n_stores_oom)); + seq_printf(m, "Stores : ops=%u run=%u\n", + atomic_read(&fscache_n_store_ops), + atomic_read(&fscache_n_store_calls)); + + seq_printf(m, "Ops : pend=%u run=%u enq=%u\n", + atomic_read(&fscache_n_op_pend), + atomic_read(&fscache_n_op_run), + atomic_read(&fscache_n_op_enqueue)); + seq_printf(m, "Ops : dfr=%u rel=%u gc=%u\n", + atomic_read(&fscache_n_op_deferred_release), + atomic_read(&fscache_n_op_release), + atomic_read(&fscache_n_op_gc)); + return 0; +} + +/* + * open "/proc/fs/fscache/stats" allowing provision of a statistical summary + */ +static int fscache_stats_open(struct inode *inode, struct file *file) +{ + return single_open(file, fscache_stats_show, NULL); +} + +const struct file_operations fscache_stats_fops = { + .owner = THIS_MODULE, + .open = fscache_stats_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 06da05261e04..8b8eebc5614b 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1032,6 +1032,7 @@ static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir) fuse_put_request(fc, req); return -ENOMEM; } + req->out.argpages = 1; req->num_pages = 1; req->pages[0] = page; fuse_read_fill(req, file, inode, file->f_pos, PAGE_SIZE, FUSE_READDIR); diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 4e340fedf768..2b25133524a3 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -386,7 +386,6 @@ void fuse_read_fill(struct fuse_req *req, struct file *file, req->in.numargs = 1; req->in.args[0].size = sizeof(struct fuse_read_in); req->in.args[0].value = inarg; - req->out.argpages = 1; req->out.argvar = 1; req->out.numargs = 1; req->out.args[0].size = count; @@ -453,6 +452,7 @@ static int fuse_readpage(struct file *file, struct page *page) attr_ver = fuse_get_attr_version(fc); req->out.page_zeroing = 1; + req->out.argpages = 1; req->num_pages = 1; req->pages[0] = page; num_read = fuse_send_read(req, file, inode, pos, count, NULL); @@ -510,6 +510,8 @@ static void fuse_send_readpages(struct fuse_req *req, struct file *file, struct fuse_conn *fc = get_fuse_conn(inode); loff_t pos = page_offset(req->pages[0]); size_t count = req->num_pages << PAGE_CACHE_SHIFT; + + req->out.argpages = 1; req->out.page_zeroing = 1; fuse_read_fill(req, file, inode, pos, count, FUSE_READ); req->misc.read.attr_ver = fuse_get_attr_version(fc); @@ -621,7 +623,6 @@ static void fuse_write_fill(struct fuse_req *req, struct file *file, inarg->flags = file ? file->f_flags : 0; req->in.h.opcode = FUSE_WRITE; req->in.h.nodeid = get_node_id(inode); - req->in.argpages = 1; req->in.numargs = 2; if (fc->minor < 9) req->in.args[0].size = FUSE_COMPAT_WRITE_IN_SIZE; @@ -695,6 +696,7 @@ static int fuse_buffered_write(struct file *file, struct inode *inode, if (IS_ERR(req)) return PTR_ERR(req); + req->in.argpages = 1; req->num_pages = 1; req->pages[0] = page; req->page_offset = offset; @@ -771,6 +773,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req, size_t count = 0; int err; + req->in.argpages = 1; req->page_offset = offset; do { @@ -935,21 +938,28 @@ static void fuse_release_user_pages(struct fuse_req *req, int write) } static int fuse_get_user_pages(struct fuse_req *req, const char __user *buf, - unsigned nbytes, int write) + unsigned *nbytesp, int write) { + unsigned nbytes = *nbytesp; unsigned long user_addr = (unsigned long) buf; unsigned offset = user_addr & ~PAGE_MASK; int npages; - /* This doesn't work with nfsd */ - if (!current->mm) - return -EPERM; + /* Special case for kernel I/O: can copy directly into the buffer */ + if (segment_eq(get_fs(), KERNEL_DS)) { + if (write) + req->in.args[1].value = (void *) user_addr; + else + req->out.args[0].value = (void *) user_addr; + + return 0; + } nbytes = min(nbytes, (unsigned) FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT); npages = (nbytes + offset + PAGE_SIZE - 1) >> PAGE_SHIFT; npages = clamp(npages, 1, FUSE_MAX_PAGES_PER_REQ); down_read(¤t->mm->mmap_sem); - npages = get_user_pages(current, current->mm, user_addr, npages, write, + npages = get_user_pages(current, current->mm, user_addr, npages, !write, 0, req->pages, NULL); up_read(¤t->mm->mmap_sem); if (npages < 0) @@ -957,6 +967,15 @@ static int fuse_get_user_pages(struct fuse_req *req, const char __user *buf, req->num_pages = npages; req->page_offset = offset; + + if (write) + req->in.argpages = 1; + else + req->out.argpages = 1; + + nbytes = (req->num_pages << PAGE_SHIFT) - req->page_offset; + *nbytesp = min(*nbytesp, nbytes); + return 0; } @@ -979,15 +998,13 @@ static ssize_t fuse_direct_io(struct file *file, const char __user *buf, while (count) { size_t nres; - size_t nbytes_limit = min(count, nmax); - size_t nbytes; - int err = fuse_get_user_pages(req, buf, nbytes_limit, !write); + size_t nbytes = min(count, nmax); + int err = fuse_get_user_pages(req, buf, &nbytes, write); if (err) { res = err; break; } - nbytes = (req->num_pages << PAGE_SHIFT) - req->page_offset; - nbytes = min(nbytes_limit, nbytes); + if (write) nres = fuse_send_write(req, file, inode, pos, nbytes, current->files); @@ -1163,6 +1180,7 @@ static int fuse_writepage_locked(struct page *page) fuse_write_fill(req, NULL, ff, inode, page_offset(page), 0, 1); copy_highpage(tmp_page, page); + req->in.argpages = 1; req->num_pages = 1; req->pages[0] = tmp_page; req->page_offset = 0; @@ -1274,6 +1292,15 @@ static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma) return 0; } +static int fuse_direct_mmap(struct file *file, struct vm_area_struct *vma) +{ + /* Can't provide the coherency needed for MAP_SHARED */ + if (vma->vm_flags & VM_MAYSHARE) + return -ENODEV; + + return generic_file_mmap(file, vma); +} + static int convert_fuse_file_lock(const struct fuse_file_lock *ffl, struct file_lock *fl) { @@ -1908,6 +1935,7 @@ static const struct file_operations fuse_direct_io_file_operations = { .llseek = fuse_file_llseek, .read = fuse_direct_read, .write = fuse_direct_write, + .mmap = fuse_direct_mmap, .open = fuse_open, .flush = fuse_flush, .release = fuse_release, @@ -1917,7 +1945,7 @@ static const struct file_operations fuse_direct_io_file_operations = { .unlocked_ioctl = fuse_file_ioctl, .compat_ioctl = fuse_file_compat_ioctl, .poll = fuse_file_poll, - /* no mmap and splice_read */ + /* no splice_read */ }; static const struct address_space_operations fuse_file_aops = { diff --git a/fs/generic_acl.c b/fs/generic_acl.c index 995d63b2e747..e0b53aa7bbec 100644 --- a/fs/generic_acl.c +++ b/fs/generic_acl.c @@ -134,7 +134,7 @@ generic_acl_init(struct inode *inode, struct inode *dir, mode_t mode = inode->i_mode; int error; - inode->i_mode = mode & ~current->fs->umask; + inode->i_mode = mode & ~current_umask(); if (!S_ISLNK(inode->i_mode)) acl = ops->getacl(dir, ACL_TYPE_DEFAULT); if (acl) { diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index 43764f4fa763..fa881bdc3d85 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c @@ -215,7 +215,7 @@ int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip) if (error) return error; if (!acl) { - mode &= ~current->fs->umask; + mode &= ~current_umask(); if (mode != ip->i_inode.i_mode) error = munge_mode(ip, mode); return error; diff --git a/fs/hfs/super.c b/fs/hfs/super.c index c8b5acf4b0b7..a36bb749926d 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -82,6 +82,7 @@ static void hfs_put_super(struct super_block *sb) static int hfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; + u64 id = huge_encode_dev(sb->s_bdev->bd_dev); buf->f_type = HFS_SUPER_MAGIC; buf->f_bsize = sb->s_blocksize; @@ -90,6 +91,8 @@ static int hfs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_bavail = buf->f_bfree; buf->f_files = HFS_SB(sb)->fs_ablocks; buf->f_ffree = HFS_SB(sb)->free_ablocks; + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); buf->f_namelen = HFS_NAMELEN; return 0; diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c index bab7f8d1bdfa..3fcbb0e1f6fc 100644 --- a/fs/hfsplus/options.c +++ b/fs/hfsplus/options.c @@ -48,7 +48,7 @@ void hfsplus_fill_defaults(struct hfsplus_sb_info *opts) opts->creator = HFSPLUS_DEF_CR_TYPE; opts->type = HFSPLUS_DEF_CR_TYPE; - opts->umask = current->fs->umask; + opts->umask = current_umask(); opts->uid = current_uid(); opts->gid = current_gid(); opts->part = -1; diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index eb74531a0a8e..f2a64020f42e 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -223,6 +223,7 @@ static void hfsplus_put_super(struct super_block *sb) static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; + u64 id = huge_encode_dev(sb->s_bdev->bd_dev); buf->f_type = HFSPLUS_SUPER_MAGIC; buf->f_bsize = sb->s_blocksize; @@ -231,6 +232,8 @@ static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_bavail = buf->f_bfree; buf->f_files = 0xFFFFFFFF; buf->f_ffree = 0xFFFFFFFF - HFSPLUS_SB(sb).next_cnid; + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); buf->f_namelen = HFSPLUS_MAX_STRLEN; return 0; diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index 0d049b8919c4..fecf402d7b8a 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -136,6 +136,7 @@ static int hpfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *s = dentry->d_sb; struct hpfs_sb_info *sbi = hpfs_sb(s); + u64 id = huge_encode_dev(s->s_bdev->bd_dev); lock_kernel(); /*if (sbi->sb_n_free == -1) {*/ @@ -149,6 +150,8 @@ static int hpfs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_bavail = sbi->sb_n_free; buf->f_files = sbi->sb_dirband_size / 4; buf->f_ffree = sbi->sb_n_free_dnodes; + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); buf->f_namelen = 254; unlock_kernel(); @@ -477,7 +480,7 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent) uid = current_uid(); gid = current_gid(); - umask = current->fs->umask; + umask = current_umask(); lowercase = 0; conv = CONV_BINARY; eas = 2; diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c index b278f7f52024..a5089a6dd67a 100644 --- a/fs/hppfs/hppfs.c +++ b/fs/hppfs/hppfs.c @@ -280,7 +280,12 @@ static ssize_t hppfs_read(struct file *file, char __user *buf, size_t count, "errno = %d\n", err); return err; } - count = hppfs_read_file(hppfs->host_fd, buf, count); + err = hppfs_read_file(hppfs->host_fd, buf, count); + if (err < 0) { + printk(KERN_ERR "hppfs_read: read failed: %d\n", err); + return err; + } + count = err; if (count > 0) *ppos += count; } diff --git a/fs/internal.h b/fs/internal.h index 53af885f1732..b4dac4fb6b61 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -11,6 +11,7 @@ struct super_block; struct linux_binprm; +struct path; /* * block_dev.c @@ -43,7 +44,7 @@ extern void __init chrdev_init(void); /* * exec.c */ -extern void check_unsafe_exec(struct linux_binprm *); +extern int check_unsafe_exec(struct linux_binprm *); /* * namespace.c @@ -60,3 +61,8 @@ extern void umount_tree(struct vfsmount *, int, struct list_head *); extern struct vfsmount *copy_tree(struct vfsmount *, struct dentry *, int); extern void __init mnt_init(void); + +/* + * fs_struct.c + */ +extern void chroot_fs_refs(struct path *, struct path *); diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 13d2eddd0692..b4cbe9603c7d 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -923,6 +923,7 @@ out_freesbi: static int isofs_statfs (struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; + u64 id = huge_encode_dev(sb->s_bdev->bd_dev); buf->f_type = ISOFS_SUPER_MAGIC; buf->f_bsize = sb->s_blocksize; @@ -932,6 +933,8 @@ static int isofs_statfs (struct dentry *dentry, struct kstatfs *buf) buf->f_bavail = 0; buf->f_files = ISOFS_SB(sb)->s_ninodes; buf->f_ffree = 0; + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); buf->f_namelen = NAME_MAX; return 0; } diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index 3fbffb1ea714..a8e8513a78a9 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c @@ -20,6 +20,7 @@ #include <linux/slab.h> #include <linux/mm.h> #include <linux/pagemap.h> +#include <linux/bio.h> /* * Default IO end handler for temporary BJ_IO buffer_heads. @@ -171,14 +172,15 @@ static int journal_write_commit_record(journal_t *journal, return (ret == -EIO); } -static void journal_do_submit_data(struct buffer_head **wbuf, int bufs) +static void journal_do_submit_data(struct buffer_head **wbuf, int bufs, + int write_op) { int i; for (i = 0; i < bufs; i++) { wbuf[i]->b_end_io = end_buffer_write_sync; /* We use-up our safety reference in submit_bh() */ - submit_bh(WRITE, wbuf[i]); + submit_bh(write_op, wbuf[i]); } } @@ -186,7 +188,8 @@ static void journal_do_submit_data(struct buffer_head **wbuf, int bufs) * Submit all the data buffers to disk */ static int journal_submit_data_buffers(journal_t *journal, - transaction_t *commit_transaction) + transaction_t *commit_transaction, + int write_op) { struct journal_head *jh; struct buffer_head *bh; @@ -225,7 +228,7 @@ write_out_data: BUFFER_TRACE(bh, "needs blocking lock"); spin_unlock(&journal->j_list_lock); /* Write out all data to prevent deadlocks */ - journal_do_submit_data(wbuf, bufs); + journal_do_submit_data(wbuf, bufs, write_op); bufs = 0; lock_buffer(bh); spin_lock(&journal->j_list_lock); @@ -256,7 +259,7 @@ write_out_data: jbd_unlock_bh_state(bh); if (bufs == journal->j_wbufsize) { spin_unlock(&journal->j_list_lock); - journal_do_submit_data(wbuf, bufs); + journal_do_submit_data(wbuf, bufs, write_op); bufs = 0; goto write_out_data; } @@ -286,7 +289,7 @@ write_out_data: } } spin_unlock(&journal->j_list_lock); - journal_do_submit_data(wbuf, bufs); + journal_do_submit_data(wbuf, bufs, write_op); return err; } @@ -315,6 +318,7 @@ void journal_commit_transaction(journal_t *journal) int first_tag = 0; int tag_flag; int i; + int write_op = WRITE; /* * First job: lock down the current transaction and wait for @@ -347,6 +351,13 @@ void journal_commit_transaction(journal_t *journal) spin_lock(&journal->j_state_lock); commit_transaction->t_state = T_LOCKED; + /* + * Use plugged writes here, since we want to submit several before + * we unplug the device. We don't do explicit unplugging in here, + * instead we rely on sync_buffer() doing the unplug for us. + */ + if (commit_transaction->t_synchronous_commit) + write_op = WRITE_SYNC_PLUG; spin_lock(&commit_transaction->t_handle_lock); while (commit_transaction->t_updates) { DEFINE_WAIT(wait); @@ -431,7 +442,8 @@ void journal_commit_transaction(journal_t *journal) * Now start flushing things to disk, in the order they appear * on the transaction lists. Data blocks go first. */ - err = journal_submit_data_buffers(journal, commit_transaction); + err = journal_submit_data_buffers(journal, commit_transaction, + write_op); /* * Wait for all previously submitted IO to complete. @@ -660,7 +672,7 @@ start_journal_io: clear_buffer_dirty(bh); set_buffer_uptodate(bh); bh->b_end_io = journal_end_buffer_io_sync; - submit_bh(WRITE, bh); + submit_bh(write_op, bh); } cond_resched(); diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index e79c07812afa..737f7246a4b5 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -637,6 +637,8 @@ struct journal_head *journal_get_descriptor_buffer(journal_t *journal) return NULL; bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); + if (!bh) + return NULL; lock_buffer(bh); memset(bh->b_data, 0, journal->j_blocksize); set_buffer_uptodate(bh); @@ -733,9 +735,7 @@ journal_t * journal_init_dev(struct block_device *bdev, if (!journal->j_wbuf) { printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n", __func__); - kfree(journal); - journal = NULL; - goto out; + goto out_err; } journal->j_dev = bdev; journal->j_fs_dev = fs_dev; @@ -743,11 +743,19 @@ journal_t * journal_init_dev(struct block_device *bdev, journal->j_maxlen = len; bh = __getblk(journal->j_dev, start, journal->j_blocksize); - J_ASSERT(bh != NULL); + if (!bh) { + printk(KERN_ERR + "%s: Cannot get buffer for journal superblock\n", + __func__); + goto out_err; + } journal->j_sb_buffer = bh; journal->j_superblock = (journal_superblock_t *)bh->b_data; -out: + return journal; +out_err: + kfree(journal); + return NULL; } /** @@ -787,8 +795,7 @@ journal_t * journal_init_inode (struct inode *inode) if (!journal->j_wbuf) { printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n", __func__); - kfree(journal); - return NULL; + goto out_err; } err = journal_bmap(journal, 0, &blocknr); @@ -796,16 +803,23 @@ journal_t * journal_init_inode (struct inode *inode) if (err) { printk(KERN_ERR "%s: Cannnot locate journal superblock\n", __func__); - kfree(journal); - return NULL; + goto out_err; } bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); - J_ASSERT(bh != NULL); + if (!bh) { + printk(KERN_ERR + "%s: Cannot get buffer for journal superblock\n", + __func__); + goto out_err; + } journal->j_sb_buffer = bh; journal->j_superblock = (journal_superblock_t *)bh->b_data; return journal; +out_err: + kfree(journal); + return NULL; } /* diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c index e6a117431277..ed886e6db399 100644 --- a/fs/jbd/transaction.c +++ b/fs/jbd/transaction.c @@ -1440,6 +1440,8 @@ int journal_stop(handle_t *handle) } } + if (handle->h_sync) + transaction->t_synchronous_commit = 1; current->journal_info = NULL; spin_lock(&journal->j_state_lock); spin_lock(&transaction->t_handle_lock); diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 4ea72377c7a2..073c8c3df7cd 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -138,7 +138,7 @@ static int journal_submit_commit_record(journal_t *journal, set_buffer_ordered(bh); barrier_done = 1; } - ret = submit_bh(WRITE_SYNC, bh); + ret = submit_bh(WRITE_SYNC_PLUG, bh); if (barrier_done) clear_buffer_ordered(bh); @@ -159,7 +159,7 @@ static int journal_submit_commit_record(journal_t *journal, lock_buffer(bh); set_buffer_uptodate(bh); clear_buffer_dirty(bh); - ret = submit_bh(WRITE_SYNC, bh); + ret = submit_bh(WRITE_SYNC_PLUG, bh); } *cbh = bh; return ret; @@ -190,7 +190,7 @@ retry: set_buffer_uptodate(bh); bh->b_end_io = journal_end_buffer_io_sync; - ret = submit_bh(WRITE_SYNC, bh); + ret = submit_bh(WRITE_SYNC_PLUG, bh); if (ret) { unlock_buffer(bh); return ret; @@ -402,8 +402,13 @@ void jbd2_journal_commit_transaction(journal_t *journal) spin_lock(&journal->j_state_lock); commit_transaction->t_state = T_LOCKED; + /* + * Use plugged writes here, since we want to submit several before + * we unplug the device. We don't do explicit unplugging in here, + * instead we rely on sync_buffer() doing the unplug for us. + */ if (commit_transaction->t_synchronous_commit) - write_op = WRITE_SYNC; + write_op = WRITE_SYNC_PLUG; stats.u.run.rs_wait = commit_transaction->t_max_wait; stats.u.run.rs_locked = jiffies; stats.u.run.rs_running = jbd2_time_diff(commit_transaction->t_start, diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c index d98713777a1b..043740dde20c 100644 --- a/fs/jffs2/acl.c +++ b/fs/jffs2/acl.c @@ -38,12 +38,12 @@ static int jffs2_acl_count(size_t size) size_t s; size -= sizeof(struct jffs2_acl_header); - s = size - 4 * sizeof(struct jffs2_acl_entry_short); - if (s < 0) { + if (size < 4 * sizeof(struct jffs2_acl_entry_short)) { if (size % sizeof(struct jffs2_acl_entry_short)) return -1; return size / sizeof(struct jffs2_acl_entry_short); } else { + s = size - 4 * sizeof(struct jffs2_acl_entry_short); if (s % sizeof(struct jffs2_acl_entry)) return -1; return s / sizeof(struct jffs2_acl_entry) + 4; @@ -336,7 +336,7 @@ int jffs2_init_acl_pre(struct inode *dir_i, struct inode *inode, int *i_mode) return PTR_ERR(acl); if (!acl) { - *i_mode &= ~current->fs->umask; + *i_mode &= ~current_umask(); } else { if (S_ISDIR(*i_mode)) jffs2_iset_acl(inode, &f->i_acl_default, acl); diff --git a/fs/jffs2/malloc.c b/fs/jffs2/malloc.c index f9211252b5f1..9eff2bdae8a7 100644 --- a/fs/jffs2/malloc.c +++ b/fs/jffs2/malloc.c @@ -284,10 +284,9 @@ void jffs2_free_inode_cache(struct jffs2_inode_cache *x) struct jffs2_xattr_datum *jffs2_alloc_xattr_datum(void) { struct jffs2_xattr_datum *xd; - xd = kmem_cache_alloc(xattr_datum_cache, GFP_KERNEL); + xd = kmem_cache_zalloc(xattr_datum_cache, GFP_KERNEL); dbg_memalloc("%p\n", xd); - memset(xd, 0, sizeof(struct jffs2_xattr_datum)); xd->class = RAWNODE_CLASS_XATTR_DATUM; xd->node = (void *)xd; INIT_LIST_HEAD(&xd->xindex); @@ -303,10 +302,9 @@ void jffs2_free_xattr_datum(struct jffs2_xattr_datum *xd) struct jffs2_xattr_ref *jffs2_alloc_xattr_ref(void) { struct jffs2_xattr_ref *ref; - ref = kmem_cache_alloc(xattr_ref_cache, GFP_KERNEL); + ref = kmem_cache_zalloc(xattr_ref_cache, GFP_KERNEL); dbg_memalloc("%p\n", ref); - memset(ref, 0, sizeof(struct jffs2_xattr_ref)); ref->class = RAWNODE_CLASS_XATTR_REF; ref->node = (void *)ref; return ref; diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c index a166c1669e82..06ca1b8d2054 100644 --- a/fs/jfs/acl.c +++ b/fs/jfs/acl.c @@ -182,7 +182,7 @@ int jfs_init_acl(tid_t tid, struct inode *inode, struct inode *dir) cleanup: posix_acl_release(acl); } else - inode->i_mode &= ~current->fs->umask; + inode->i_mode &= ~current_umask(); JFS_IP(inode)->mode2 = (JFS_IP(inode)->mode2 & 0xffff0000) | inode->i_mode; diff --git a/fs/libfs.c b/fs/libfs.c index 4910a36f516e..cd223190c4e9 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -575,6 +575,21 @@ ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos, * possibly a read which collects the result - which is stored in a * file-local buffer. */ + +void simple_transaction_set(struct file *file, size_t n) +{ + struct simple_transaction_argresp *ar = file->private_data; + + BUG_ON(n > SIMPLE_TRANSACTION_LIMIT); + + /* + * The barrier ensures that ar->size will really remain zero until + * ar->data is ready for reading. + */ + smp_mb(); + ar->size = n; +} + char *simple_transaction_get(struct file *file, const char __user *buf, size_t size) { struct simple_transaction_argresp *ar; @@ -820,6 +835,7 @@ EXPORT_SYMBOL(simple_sync_file); EXPORT_SYMBOL(simple_unlink); EXPORT_SYMBOL(simple_read_from_buffer); EXPORT_SYMBOL(memory_read_from_buffer); +EXPORT_SYMBOL(simple_transaction_set); EXPORT_SYMBOL(simple_transaction_get); EXPORT_SYMBOL(simple_transaction_read); EXPORT_SYMBOL(simple_transaction_release); diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index 763b78a6e9de..83ee34203bd7 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -426,8 +426,15 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, ret = nlm_granted; goto out; case -EAGAIN: + /* + * If this is a blocking request for an + * already pending lock request then we need + * to put it back on lockd's block list + */ + if (wait) + break; ret = nlm_lck_denied; - break; + goto out; case FILE_LOCK_DEFERRED: if (wait) break; @@ -443,10 +450,6 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, goto out; } - ret = nlm_lck_denied; - if (!wait) - goto out; - ret = nlm_lck_blocked; /* Append to list of blocked */ diff --git a/fs/minix/inode.c b/fs/minix/inode.c index 618865b3128b..daad3c2740db 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -321,15 +321,20 @@ out: static int minix_statfs(struct dentry *dentry, struct kstatfs *buf) { - struct minix_sb_info *sbi = minix_sb(dentry->d_sb); - buf->f_type = dentry->d_sb->s_magic; - buf->f_bsize = dentry->d_sb->s_blocksize; + struct super_block *sb = dentry->d_sb; + struct minix_sb_info *sbi = minix_sb(sb); + u64 id = huge_encode_dev(sb->s_bdev->bd_dev); + buf->f_type = sb->s_magic; + buf->f_bsize = sb->s_blocksize; buf->f_blocks = (sbi->s_nzones - sbi->s_firstdatazone) << sbi->s_log_zone_size; buf->f_bfree = minix_count_free_blocks(sbi); buf->f_bavail = buf->f_bfree; buf->f_files = sbi->s_ninodes; buf->f_ffree = minix_count_free_inodes(sbi); buf->f_namelen = sbi->s_namelen; + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); + return 0; } diff --git a/fs/mpage.c b/fs/mpage.c index 16c3ef37eae3..680ba60863ff 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -82,7 +82,7 @@ static void mpage_end_io_write(struct bio *bio, int err) bio_put(bio); } -struct bio *mpage_bio_submit(int rw, struct bio *bio) +static struct bio *mpage_bio_submit(int rw, struct bio *bio) { bio->bi_end_io = mpage_end_io_read; if (rw == WRITE) @@ -90,7 +90,6 @@ struct bio *mpage_bio_submit(int rw, struct bio *bio) submit_bio(rw, bio); return NULL; } -EXPORT_SYMBOL(mpage_bio_submit); static struct bio * mpage_alloc(struct block_device *bdev, @@ -439,7 +438,14 @@ EXPORT_SYMBOL(mpage_readpage); * just allocate full-size (16-page) BIOs. */ -int __mpage_writepage(struct page *page, struct writeback_control *wbc, +struct mpage_data { + struct bio *bio; + sector_t last_block_in_bio; + get_block_t *get_block; + unsigned use_writepage; +}; + +static int __mpage_writepage(struct page *page, struct writeback_control *wbc, void *data) { struct mpage_data *mpd = data; @@ -648,7 +654,6 @@ out: mpd->bio = bio; return ret; } -EXPORT_SYMBOL(__mpage_writepage); /** * mpage_writepages - walk the list of dirty pages of the given address space & writepage() all of them diff --git a/fs/namei.c b/fs/namei.c index d040ce11785d..b8433ebfae05 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -32,6 +32,7 @@ #include <linux/file.h> #include <linux/fcntl.h> #include <linux/device_cgroup.h> +#include <linux/fs_struct.h> #include <asm/uaccess.h> #define ACC_MODE(x) ("\000\004\002\006"[(x)&O_ACCMODE]) @@ -1578,7 +1579,7 @@ static int __open_namei_create(struct nameidata *nd, struct path *path, struct dentry *dir = nd->path.dentry; if (!IS_POSIXACL(dir->d_inode)) - mode &= ~current->fs->umask; + mode &= ~current_umask(); error = security_path_mknod(&nd->path, path->dentry, mode, 0); if (error) goto out_unlock; @@ -1989,7 +1990,7 @@ SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, int, mode, goto out_unlock; } if (!IS_POSIXACL(nd.path.dentry->d_inode)) - mode &= ~current->fs->umask; + mode &= ~current_umask(); error = may_mknod(mode); if (error) goto out_dput; @@ -2067,7 +2068,7 @@ SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, int, mode) goto out_unlock; if (!IS_POSIXACL(nd.path.dentry->d_inode)) - mode &= ~current->fs->umask; + mode &= ~current_umask(); error = mnt_want_write(nd.path.mnt); if (error) goto out_dput; @@ -2897,10 +2898,3 @@ EXPORT_SYMBOL(vfs_symlink); EXPORT_SYMBOL(vfs_unlink); EXPORT_SYMBOL(dentry_unhash); EXPORT_SYMBOL(generic_readlink); - -/* to be mentioned only in INIT_TASK */ -struct fs_struct init_fs = { - .count = ATOMIC_INIT(1), - .lock = __RW_LOCK_UNLOCKED(init_fs.lock), - .umask = 0022, -}; diff --git a/fs/namespace.c b/fs/namespace.c index 0a42e0e96027..c6f54e4c4290 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -27,6 +27,7 @@ #include <linux/ramfs.h> #include <linux/log2.h> #include <linux/idr.h> +#include <linux/fs_struct.h> #include <asm/uaccess.h> #include <asm/unistd.h> #include "pnode.h" @@ -2093,66 +2094,6 @@ out1: } /* - * Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values. - * It can block. Requires the big lock held. - */ -void set_fs_root(struct fs_struct *fs, struct path *path) -{ - struct path old_root; - - write_lock(&fs->lock); - old_root = fs->root; - fs->root = *path; - path_get(path); - write_unlock(&fs->lock); - if (old_root.dentry) - path_put(&old_root); -} - -/* - * Replace the fs->{pwdmnt,pwd} with {mnt,dentry}. Put the old values. - * It can block. Requires the big lock held. - */ -void set_fs_pwd(struct fs_struct *fs, struct path *path) -{ - struct path old_pwd; - - write_lock(&fs->lock); - old_pwd = fs->pwd; - fs->pwd = *path; - path_get(path); - write_unlock(&fs->lock); - - if (old_pwd.dentry) - path_put(&old_pwd); -} - -static void chroot_fs_refs(struct path *old_root, struct path *new_root) -{ - struct task_struct *g, *p; - struct fs_struct *fs; - - read_lock(&tasklist_lock); - do_each_thread(g, p) { - task_lock(p); - fs = p->fs; - if (fs) { - atomic_inc(&fs->count); - task_unlock(p); - if (fs->root.dentry == old_root->dentry - && fs->root.mnt == old_root->mnt) - set_fs_root(fs, new_root); - if (fs->pwd.dentry == old_root->dentry - && fs->pwd.mnt == old_root->mnt) - set_fs_pwd(fs, new_root); - put_fs_struct(fs); - } else - task_unlock(p); - } while_each_thread(g, p); - read_unlock(&tasklist_lock); -} - -/* * pivot_root Semantics: * Moves the root file system of the current process to the directory put_old, * makes new_root as the new root file system of the current process, and sets diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig index 36fe20d6eba2..e67f3ec07736 100644 --- a/fs/nfs/Kconfig +++ b/fs/nfs/Kconfig @@ -84,3 +84,11 @@ config ROOT_NFS <file:Documentation/filesystems/nfsroot.txt>. Most people say N here. + +config NFS_FSCACHE + bool "Provide NFS client caching support (EXPERIMENTAL)" + depends on EXPERIMENTAL + depends on NFS_FS=m && FSCACHE || NFS_FS=y && FSCACHE=y + help + Say Y here if you want NFS data to be cached locally on disc through + the general filesystem cache manager diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile index ac6170c594a3..845159814de2 100644 --- a/fs/nfs/Makefile +++ b/fs/nfs/Makefile @@ -15,3 +15,4 @@ nfs-$(CONFIG_NFS_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o \ callback.o callback_xdr.o callback_proc.o \ nfs4namespace.o nfs-$(CONFIG_SYSCTL) += sysctl.o +nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o diff --git a/fs/nfs/client.c b/fs/nfs/client.c index aba38017bdef..75c9cd2aa119 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -45,6 +45,7 @@ #include "delegation.h" #include "iostat.h" #include "internal.h" +#include "fscache.h" #define NFSDBG_FACILITY NFSDBG_CLIENT @@ -154,6 +155,8 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_ if (!IS_ERR(cred)) clp->cl_machine_cred = cred; + nfs_fscache_get_client_cookie(clp); + return clp; error_3: @@ -187,6 +190,8 @@ static void nfs_free_client(struct nfs_client *clp) nfs4_shutdown_client(clp); + nfs_fscache_release_client_cookie(clp); + /* -EIO all pending I/O */ if (!IS_ERR(clp->cl_rpcclient)) rpc_shutdown_client(clp->cl_rpcclient); @@ -760,6 +765,7 @@ static int nfs_init_server(struct nfs_server *server, /* Initialise the client representation from the mount data */ server->flags = data->flags; + server->options = data->options; if (data->rsize) server->rsize = nfs_block_size(data->rsize, NULL); @@ -1148,6 +1154,7 @@ static int nfs4_init_server(struct nfs_server *server, /* Initialise the client representation from the mount data */ server->flags = data->flags; server->caps |= NFS_CAP_ATOMIC_OPEN; + server->options = data->options; /* Get a client record */ error = nfs4_set_client(server, @@ -1559,7 +1566,7 @@ static int nfs_volume_list_show(struct seq_file *m, void *v) /* display header on line 1 */ if (v == &nfs_volume_list) { - seq_puts(m, "NV SERVER PORT DEV FSID\n"); + seq_puts(m, "NV SERVER PORT DEV FSID FSC\n"); return 0; } /* display one transport per line on subsequent lines */ @@ -1573,12 +1580,13 @@ static int nfs_volume_list_show(struct seq_file *m, void *v) (unsigned long long) server->fsid.major, (unsigned long long) server->fsid.minor); - seq_printf(m, "v%u %s %s %-7s %-17s\n", + seq_printf(m, "v%u %s %s %-7s %-17s %s\n", clp->rpc_ops->version, rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_ADDR), rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_PORT), dev, - fsid); + fsid, + nfs_server_fscache_state(server)); return 0; } diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 0abf3f331f56..5a97bcfe03e5 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -35,6 +35,7 @@ #include "delegation.h" #include "internal.h" #include "iostat.h" +#include "fscache.h" #define NFSDBG_FACILITY NFSDBG_FILE @@ -409,6 +410,13 @@ static int nfs_write_end(struct file *file, struct address_space *mapping, return copied; } +/* + * Partially or wholly invalidate a page + * - Release the private state associated with a page if undergoing complete + * page invalidation + * - Called if either PG_private or PG_fscache is set on the page + * - Caller holds page lock + */ static void nfs_invalidate_page(struct page *page, unsigned long offset) { dfprintk(PAGECACHE, "NFS: invalidate_page(%p, %lu)\n", page, offset); @@ -417,23 +425,43 @@ static void nfs_invalidate_page(struct page *page, unsigned long offset) return; /* Cancel any unstarted writes on this page */ nfs_wb_page_cancel(page->mapping->host, page); + + nfs_fscache_invalidate_page(page, page->mapping->host); } +/* + * Attempt to release the private state associated with a page + * - Called if either PG_private or PG_fscache is set on the page + * - Caller holds page lock + * - Return true (may release page) or false (may not) + */ static int nfs_release_page(struct page *page, gfp_t gfp) { dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page); /* If PagePrivate() is set, then the page is not freeable */ - return 0; + if (PagePrivate(page)) + return 0; + return nfs_fscache_release_page(page, gfp); } +/* + * Attempt to clear the private state associated with a page when an error + * occurs that requires the cached contents of an inode to be written back or + * destroyed + * - Called if either PG_private or fscache is set on the page + * - Caller holds page lock + * - Return 0 if successful, -error otherwise + */ static int nfs_launder_page(struct page *page) { struct inode *inode = page->mapping->host; + struct nfs_inode *nfsi = NFS_I(inode); dfprintk(PAGECACHE, "NFS: launder_page(%ld, %llu)\n", inode->i_ino, (long long)page_offset(page)); + nfs_fscache_wait_on_page_write(nfsi, page); return nfs_wb_page(inode, page); } @@ -451,6 +479,11 @@ const struct address_space_operations nfs_file_aops = { .launder_page = nfs_launder_page, }; +/* + * Notification that a PTE pointing to an NFS page is about to be made + * writable, implying that someone is about to modify the page through a + * shared-writable mapping + */ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { struct page *page = vmf->page; @@ -465,6 +498,9 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) filp->f_mapping->host->i_ino, (long long)page_offset(page)); + /* make sure the cache has finished storing the page */ + nfs_fscache_wait_on_page_write(NFS_I(dentry->d_inode), page); + lock_page(page); mapping = page->mapping; if (mapping != dentry->d_inode->i_mapping) @@ -480,8 +516,6 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) goto out_unlock; ret = nfs_updatepage(filp, page, 0, pagelen); - if (ret == 0) - ret = pagelen; out_unlock: unlock_page(page); if (ret) diff --git a/fs/nfs/fscache-index.c b/fs/nfs/fscache-index.c new file mode 100644 index 000000000000..5b1006480bc2 --- /dev/null +++ b/fs/nfs/fscache-index.c @@ -0,0 +1,337 @@ +/* NFS FS-Cache index structure definition + * + * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/nfs_fs.h> +#include <linux/nfs_fs_sb.h> +#include <linux/in6.h> + +#include "internal.h" +#include "fscache.h" + +#define NFSDBG_FACILITY NFSDBG_FSCACHE + +/* + * Define the NFS filesystem for FS-Cache. Upon registration FS-Cache sticks + * the cookie for the top-level index object for NFS into here. The top-level + * index can than have other cache objects inserted into it. + */ +struct fscache_netfs nfs_fscache_netfs = { + .name = "nfs", + .version = 0, +}; + +/* + * Register NFS for caching + */ +int nfs_fscache_register(void) +{ + return fscache_register_netfs(&nfs_fscache_netfs); +} + +/* + * Unregister NFS for caching + */ +void nfs_fscache_unregister(void) +{ + fscache_unregister_netfs(&nfs_fscache_netfs); +} + +/* + * Layout of the key for an NFS server cache object. + */ +struct nfs_server_key { + uint16_t nfsversion; /* NFS protocol version */ + uint16_t family; /* address family */ + uint16_t port; /* IP port */ + union { + struct in_addr ipv4_addr; /* IPv4 address */ + struct in6_addr ipv6_addr; /* IPv6 address */ + } addr[0]; +}; + +/* + * Generate a key to describe a server in the main NFS index + * - We return the length of the key, or 0 if we can't generate one + */ +static uint16_t nfs_server_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t bufmax) +{ + const struct nfs_client *clp = cookie_netfs_data; + const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) &clp->cl_addr; + const struct sockaddr_in *sin = (struct sockaddr_in *) &clp->cl_addr; + struct nfs_server_key *key = buffer; + uint16_t len = sizeof(struct nfs_server_key); + + key->nfsversion = clp->rpc_ops->version; + key->family = clp->cl_addr.ss_family; + + memset(key, 0, len); + + switch (clp->cl_addr.ss_family) { + case AF_INET: + key->port = sin->sin_port; + key->addr[0].ipv4_addr = sin->sin_addr; + len += sizeof(key->addr[0].ipv4_addr); + break; + + case AF_INET6: + key->port = sin6->sin6_port; + key->addr[0].ipv6_addr = sin6->sin6_addr; + len += sizeof(key->addr[0].ipv6_addr); + break; + + default: + printk(KERN_WARNING "NFS: Unknown network family '%d'\n", + clp->cl_addr.ss_family); + len = 0; + break; + } + + return len; +} + +/* + * Define the server object for FS-Cache. This is used to describe a server + * object to fscache_acquire_cookie(). It is keyed by the NFS protocol and + * server address parameters. + */ +const struct fscache_cookie_def nfs_fscache_server_index_def = { + .name = "NFS.server", + .type = FSCACHE_COOKIE_TYPE_INDEX, + .get_key = nfs_server_get_key, +}; + +/* + * Generate a key to describe a superblock key in the main NFS index + */ +static uint16_t nfs_super_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t bufmax) +{ + const struct nfs_fscache_key *key; + const struct nfs_server *nfss = cookie_netfs_data; + uint16_t len; + + key = nfss->fscache_key; + len = sizeof(key->key) + key->key.uniq_len; + if (len > bufmax) { + len = 0; + } else { + memcpy(buffer, &key->key, sizeof(key->key)); + memcpy(buffer + sizeof(key->key), + key->key.uniquifier, key->key.uniq_len); + } + + return len; +} + +/* + * Define the superblock object for FS-Cache. This is used to describe a + * superblock object to fscache_acquire_cookie(). It is keyed by all the NFS + * parameters that might cause a separate superblock. + */ +const struct fscache_cookie_def nfs_fscache_super_index_def = { + .name = "NFS.super", + .type = FSCACHE_COOKIE_TYPE_INDEX, + .get_key = nfs_super_get_key, +}; + +/* + * Definition of the auxiliary data attached to NFS inode storage objects + * within the cache. + * + * The contents of this struct are recorded in the on-disk local cache in the + * auxiliary data attached to the data storage object backing an inode. This + * permits coherency to be managed when a new inode binds to an already extant + * cache object. + */ +struct nfs_fscache_inode_auxdata { + struct timespec mtime; + struct timespec ctime; + loff_t size; + u64 change_attr; +}; + +/* + * Generate a key to describe an NFS inode in an NFS server's index + */ +static uint16_t nfs_fscache_inode_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t bufmax) +{ + const struct nfs_inode *nfsi = cookie_netfs_data; + uint16_t nsize; + + /* use the inode's NFS filehandle as the key */ + nsize = nfsi->fh.size; + memcpy(buffer, nfsi->fh.data, nsize); + return nsize; +} + +/* + * Get certain file attributes from the netfs data + * - This function can be absent for an index + * - Not permitted to return an error + * - The netfs data from the cookie being used as the source is presented + */ +static void nfs_fscache_inode_get_attr(const void *cookie_netfs_data, + uint64_t *size) +{ + const struct nfs_inode *nfsi = cookie_netfs_data; + + *size = nfsi->vfs_inode.i_size; +} + +/* + * Get the auxiliary data from netfs data + * - This function can be absent if the index carries no state data + * - Should store the auxiliary data in the buffer + * - Should return the amount of amount stored + * - Not permitted to return an error + * - The netfs data from the cookie being used as the source is presented + */ +static uint16_t nfs_fscache_inode_get_aux(const void *cookie_netfs_data, + void *buffer, uint16_t bufmax) +{ + struct nfs_fscache_inode_auxdata auxdata; + const struct nfs_inode *nfsi = cookie_netfs_data; + + memset(&auxdata, 0, sizeof(auxdata)); + auxdata.size = nfsi->vfs_inode.i_size; + auxdata.mtime = nfsi->vfs_inode.i_mtime; + auxdata.ctime = nfsi->vfs_inode.i_ctime; + + if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4) + auxdata.change_attr = nfsi->change_attr; + + if (bufmax > sizeof(auxdata)) + bufmax = sizeof(auxdata); + + memcpy(buffer, &auxdata, bufmax); + return bufmax; +} + +/* + * Consult the netfs about the state of an object + * - This function can be absent if the index carries no state data + * - The netfs data from the cookie being used as the target is + * presented, as is the auxiliary data + */ +static +enum fscache_checkaux nfs_fscache_inode_check_aux(void *cookie_netfs_data, + const void *data, + uint16_t datalen) +{ + struct nfs_fscache_inode_auxdata auxdata; + struct nfs_inode *nfsi = cookie_netfs_data; + + if (datalen != sizeof(auxdata)) + return FSCACHE_CHECKAUX_OBSOLETE; + + memset(&auxdata, 0, sizeof(auxdata)); + auxdata.size = nfsi->vfs_inode.i_size; + auxdata.mtime = nfsi->vfs_inode.i_mtime; + auxdata.ctime = nfsi->vfs_inode.i_ctime; + + if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4) + auxdata.change_attr = nfsi->change_attr; + + if (memcmp(data, &auxdata, datalen) != 0) + return FSCACHE_CHECKAUX_OBSOLETE; + + return FSCACHE_CHECKAUX_OKAY; +} + +/* + * Indication from FS-Cache that the cookie is no longer cached + * - This function is called when the backing store currently caching a cookie + * is removed + * - The netfs should use this to clean up any markers indicating cached pages + * - This is mandatory for any object that may have data + */ +static void nfs_fscache_inode_now_uncached(void *cookie_netfs_data) +{ + struct nfs_inode *nfsi = cookie_netfs_data; + struct pagevec pvec; + pgoff_t first; + int loop, nr_pages; + + pagevec_init(&pvec, 0); + first = 0; + + dprintk("NFS: nfs_inode_now_uncached: nfs_inode 0x%p\n", nfsi); + + for (;;) { + /* grab a bunch of pages to unmark */ + nr_pages = pagevec_lookup(&pvec, + nfsi->vfs_inode.i_mapping, + first, + PAGEVEC_SIZE - pagevec_count(&pvec)); + if (!nr_pages) + break; + + for (loop = 0; loop < nr_pages; loop++) + ClearPageFsCache(pvec.pages[loop]); + + first = pvec.pages[nr_pages - 1]->index + 1; + + pvec.nr = nr_pages; + pagevec_release(&pvec); + cond_resched(); + } +} + +/* + * Get an extra reference on a read context. + * - This function can be absent if the completion function doesn't require a + * context. + * - The read context is passed back to NFS in the event that a data read on the + * cache fails with EIO - in which case the server must be contacted to + * retrieve the data, which requires the read context for security. + */ +static void nfs_fh_get_context(void *cookie_netfs_data, void *context) +{ + get_nfs_open_context(context); +} + +/* + * Release an extra reference on a read context. + * - This function can be absent if the completion function doesn't require a + * context. + */ +static void nfs_fh_put_context(void *cookie_netfs_data, void *context) +{ + if (context) + put_nfs_open_context(context); +} + +/* + * Define the inode object for FS-Cache. This is used to describe an inode + * object to fscache_acquire_cookie(). It is keyed by the NFS file handle for + * an inode. + * + * Coherency is managed by comparing the copies of i_size, i_mtime and i_ctime + * held in the cache auxiliary data for the data storage object with those in + * the inode struct in memory. + */ +const struct fscache_cookie_def nfs_fscache_inode_object_def = { + .name = "NFS.fh", + .type = FSCACHE_COOKIE_TYPE_DATAFILE, + .get_key = nfs_fscache_inode_get_key, + .get_attr = nfs_fscache_inode_get_attr, + .get_aux = nfs_fscache_inode_get_aux, + .check_aux = nfs_fscache_inode_check_aux, + .now_uncached = nfs_fscache_inode_now_uncached, + .get_context = nfs_fh_get_context, + .put_context = nfs_fh_put_context, +}; diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c new file mode 100644 index 000000000000..379be678cb7e --- /dev/null +++ b/fs/nfs/fscache.c @@ -0,0 +1,523 @@ +/* NFS filesystem cache interface + * + * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/nfs_fs.h> +#include <linux/nfs_fs_sb.h> +#include <linux/in6.h> +#include <linux/seq_file.h> + +#include "internal.h" +#include "iostat.h" +#include "fscache.h" + +#define NFSDBG_FACILITY NFSDBG_FSCACHE + +static struct rb_root nfs_fscache_keys = RB_ROOT; +static DEFINE_SPINLOCK(nfs_fscache_keys_lock); + +/* + * Get the per-client index cookie for an NFS client if the appropriate mount + * flag was set + * - We always try and get an index cookie for the client, but get filehandle + * cookies on a per-superblock basis, depending on the mount flags + */ +void nfs_fscache_get_client_cookie(struct nfs_client *clp) +{ + /* create a cache index for looking up filehandles */ + clp->fscache = fscache_acquire_cookie(nfs_fscache_netfs.primary_index, + &nfs_fscache_server_index_def, + clp); + dfprintk(FSCACHE, "NFS: get client cookie (0x%p/0x%p)\n", + clp, clp->fscache); +} + +/* + * Dispose of a per-client cookie + */ +void nfs_fscache_release_client_cookie(struct nfs_client *clp) +{ + dfprintk(FSCACHE, "NFS: releasing client cookie (0x%p/0x%p)\n", + clp, clp->fscache); + + fscache_relinquish_cookie(clp->fscache, 0); + clp->fscache = NULL; +} + +/* + * Get the cache cookie for an NFS superblock. We have to handle + * uniquification here because the cache doesn't do it for us. + */ +void nfs_fscache_get_super_cookie(struct super_block *sb, + struct nfs_parsed_mount_data *data) +{ + struct nfs_fscache_key *key, *xkey; + struct nfs_server *nfss = NFS_SB(sb); + struct rb_node **p, *parent; + const char *uniq = data->fscache_uniq ?: ""; + int diff, ulen; + + ulen = strlen(uniq); + key = kzalloc(sizeof(*key) + ulen, GFP_KERNEL); + if (!key) + return; + + key->nfs_client = nfss->nfs_client; + key->key.super.s_flags = sb->s_flags & NFS_MS_MASK; + key->key.nfs_server.flags = nfss->flags; + key->key.nfs_server.rsize = nfss->rsize; + key->key.nfs_server.wsize = nfss->wsize; + key->key.nfs_server.acregmin = nfss->acregmin; + key->key.nfs_server.acregmax = nfss->acregmax; + key->key.nfs_server.acdirmin = nfss->acdirmin; + key->key.nfs_server.acdirmax = nfss->acdirmax; + key->key.nfs_server.fsid = nfss->fsid; + key->key.rpc_auth.au_flavor = nfss->client->cl_auth->au_flavor; + + key->key.uniq_len = ulen; + memcpy(key->key.uniquifier, uniq, ulen); + + spin_lock(&nfs_fscache_keys_lock); + p = &nfs_fscache_keys.rb_node; + parent = NULL; + while (*p) { + parent = *p; + xkey = rb_entry(parent, struct nfs_fscache_key, node); + + if (key->nfs_client < xkey->nfs_client) + goto go_left; + if (key->nfs_client > xkey->nfs_client) + goto go_right; + + diff = memcmp(&key->key, &xkey->key, sizeof(key->key)); + if (diff < 0) + goto go_left; + if (diff > 0) + goto go_right; + + if (key->key.uniq_len == 0) + goto non_unique; + diff = memcmp(key->key.uniquifier, + xkey->key.uniquifier, + key->key.uniq_len); + if (diff < 0) + goto go_left; + if (diff > 0) + goto go_right; + goto non_unique; + + go_left: + p = &(*p)->rb_left; + continue; + go_right: + p = &(*p)->rb_right; + } + + rb_link_node(&key->node, parent, p); + rb_insert_color(&key->node, &nfs_fscache_keys); + spin_unlock(&nfs_fscache_keys_lock); + nfss->fscache_key = key; + + /* create a cache index for looking up filehandles */ + nfss->fscache = fscache_acquire_cookie(nfss->nfs_client->fscache, + &nfs_fscache_super_index_def, + nfss); + dfprintk(FSCACHE, "NFS: get superblock cookie (0x%p/0x%p)\n", + nfss, nfss->fscache); + return; + +non_unique: + spin_unlock(&nfs_fscache_keys_lock); + kfree(key); + nfss->fscache_key = NULL; + nfss->fscache = NULL; + printk(KERN_WARNING "NFS:" + " Cache request denied due to non-unique superblock keys\n"); +} + +/* + * release a per-superblock cookie + */ +void nfs_fscache_release_super_cookie(struct super_block *sb) +{ + struct nfs_server *nfss = NFS_SB(sb); + + dfprintk(FSCACHE, "NFS: releasing superblock cookie (0x%p/0x%p)\n", + nfss, nfss->fscache); + + fscache_relinquish_cookie(nfss->fscache, 0); + nfss->fscache = NULL; + + if (nfss->fscache_key) { + spin_lock(&nfs_fscache_keys_lock); + rb_erase(&nfss->fscache_key->node, &nfs_fscache_keys); + spin_unlock(&nfs_fscache_keys_lock); + kfree(nfss->fscache_key); + nfss->fscache_key = NULL; + } +} + +/* + * Initialise the per-inode cache cookie pointer for an NFS inode. + */ +void nfs_fscache_init_inode_cookie(struct inode *inode) +{ + NFS_I(inode)->fscache = NULL; + if (S_ISREG(inode->i_mode)) + set_bit(NFS_INO_FSCACHE, &NFS_I(inode)->flags); +} + +/* + * Get the per-inode cache cookie for an NFS inode. + */ +static void nfs_fscache_enable_inode_cookie(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct nfs_inode *nfsi = NFS_I(inode); + + if (nfsi->fscache || !NFS_FSCACHE(inode)) + return; + + if ((NFS_SB(sb)->options & NFS_OPTION_FSCACHE)) { + nfsi->fscache = fscache_acquire_cookie( + NFS_SB(sb)->fscache, + &nfs_fscache_inode_object_def, + nfsi); + + dfprintk(FSCACHE, "NFS: get FH cookie (0x%p/0x%p/0x%p)\n", + sb, nfsi, nfsi->fscache); + } +} + +/* + * Release a per-inode cookie. + */ +void nfs_fscache_release_inode_cookie(struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + + dfprintk(FSCACHE, "NFS: clear cookie (0x%p/0x%p)\n", + nfsi, nfsi->fscache); + + fscache_relinquish_cookie(nfsi->fscache, 0); + nfsi->fscache = NULL; +} + +/* + * Retire a per-inode cookie, destroying the data attached to it. + */ +void nfs_fscache_zap_inode_cookie(struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + + dfprintk(FSCACHE, "NFS: zapping cookie (0x%p/0x%p)\n", + nfsi, nfsi->fscache); + + fscache_relinquish_cookie(nfsi->fscache, 1); + nfsi->fscache = NULL; +} + +/* + * Turn off the cache with regard to a per-inode cookie if opened for writing, + * invalidating all the pages in the page cache relating to the associated + * inode to clear the per-page caching. + */ +static void nfs_fscache_disable_inode_cookie(struct inode *inode) +{ + clear_bit(NFS_INO_FSCACHE, &NFS_I(inode)->flags); + + if (NFS_I(inode)->fscache) { + dfprintk(FSCACHE, + "NFS: nfsi 0x%p turning cache off\n", NFS_I(inode)); + + /* Need to invalidate any mapped pages that were read in before + * turning off the cache. + */ + if (inode->i_mapping && inode->i_mapping->nrpages) + invalidate_inode_pages2(inode->i_mapping); + + nfs_fscache_zap_inode_cookie(inode); + } +} + +/* + * wait_on_bit() sleep function for uninterruptible waiting + */ +static int nfs_fscache_wait_bit(void *flags) +{ + schedule(); + return 0; +} + +/* + * Lock against someone else trying to also acquire or relinquish a cookie + */ +static inline void nfs_fscache_inode_lock(struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + + while (test_and_set_bit(NFS_INO_FSCACHE_LOCK, &nfsi->flags)) + wait_on_bit(&nfsi->flags, NFS_INO_FSCACHE_LOCK, + nfs_fscache_wait_bit, TASK_UNINTERRUPTIBLE); +} + +/* + * Unlock cookie management lock + */ +static inline void nfs_fscache_inode_unlock(struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + + smp_mb__before_clear_bit(); + clear_bit(NFS_INO_FSCACHE_LOCK, &nfsi->flags); + smp_mb__after_clear_bit(); + wake_up_bit(&nfsi->flags, NFS_INO_FSCACHE_LOCK); +} + +/* + * Decide if we should enable or disable local caching for this inode. + * - For now, with NFS, only regular files that are open read-only will be able + * to use the cache. + * - May be invoked multiple times in parallel by parallel nfs_open() functions. + */ +void nfs_fscache_set_inode_cookie(struct inode *inode, struct file *filp) +{ + if (NFS_FSCACHE(inode)) { + nfs_fscache_inode_lock(inode); + if ((filp->f_flags & O_ACCMODE) != O_RDONLY) + nfs_fscache_disable_inode_cookie(inode); + else + nfs_fscache_enable_inode_cookie(inode); + nfs_fscache_inode_unlock(inode); + } +} + +/* + * Replace a per-inode cookie due to revalidation detecting a file having + * changed on the server. + */ +void nfs_fscache_reset_inode_cookie(struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + struct nfs_server *nfss = NFS_SERVER(inode); + struct fscache_cookie *old = nfsi->fscache; + + nfs_fscache_inode_lock(inode); + if (nfsi->fscache) { + /* retire the current fscache cache and get a new one */ + fscache_relinquish_cookie(nfsi->fscache, 1); + + nfsi->fscache = fscache_acquire_cookie( + nfss->nfs_client->fscache, + &nfs_fscache_inode_object_def, + nfsi); + + dfprintk(FSCACHE, + "NFS: revalidation new cookie (0x%p/0x%p/0x%p/0x%p)\n", + nfss, nfsi, old, nfsi->fscache); + } + nfs_fscache_inode_unlock(inode); +} + +/* + * Release the caching state associated with a page, if the page isn't busy + * interacting with the cache. + * - Returns true (can release page) or false (page busy). + */ +int nfs_fscache_release_page(struct page *page, gfp_t gfp) +{ + struct nfs_inode *nfsi = NFS_I(page->mapping->host); + struct fscache_cookie *cookie = nfsi->fscache; + + BUG_ON(!cookie); + + if (fscache_check_page_write(cookie, page)) { + if (!(gfp & __GFP_WAIT)) + return 0; + fscache_wait_on_page_write(cookie, page); + } + + if (PageFsCache(page)) { + dfprintk(FSCACHE, "NFS: fscache releasepage (0x%p/0x%p/0x%p)\n", + cookie, page, nfsi); + + fscache_uncache_page(cookie, page); + nfs_add_fscache_stats(page->mapping->host, + NFSIOS_FSCACHE_PAGES_UNCACHED, 1); + } + + return 1; +} + +/* + * Release the caching state associated with a page if undergoing complete page + * invalidation. + */ +void __nfs_fscache_invalidate_page(struct page *page, struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + struct fscache_cookie *cookie = nfsi->fscache; + + BUG_ON(!cookie); + + dfprintk(FSCACHE, "NFS: fscache invalidatepage (0x%p/0x%p/0x%p)\n", + cookie, page, nfsi); + + fscache_wait_on_page_write(cookie, page); + + BUG_ON(!PageLocked(page)); + fscache_uncache_page(cookie, page); + nfs_add_fscache_stats(page->mapping->host, + NFSIOS_FSCACHE_PAGES_UNCACHED, 1); +} + +/* + * Handle completion of a page being read from the cache. + * - Called in process (keventd) context. + */ +static void nfs_readpage_from_fscache_complete(struct page *page, + void *context, + int error) +{ + dfprintk(FSCACHE, + "NFS: readpage_from_fscache_complete (0x%p/0x%p/%d)\n", + page, context, error); + + /* if the read completes with an error, we just unlock the page and let + * the VM reissue the readpage */ + if (!error) { + SetPageUptodate(page); + unlock_page(page); + } else { + error = nfs_readpage_async(context, page->mapping->host, page); + if (error) + unlock_page(page); + } +} + +/* + * Retrieve a page from fscache + */ +int __nfs_readpage_from_fscache(struct nfs_open_context *ctx, + struct inode *inode, struct page *page) +{ + int ret; + + dfprintk(FSCACHE, + "NFS: readpage_from_fscache(fsc:%p/p:%p(i:%lx f:%lx)/0x%p)\n", + NFS_I(inode)->fscache, page, page->index, page->flags, inode); + + ret = fscache_read_or_alloc_page(NFS_I(inode)->fscache, + page, + nfs_readpage_from_fscache_complete, + ctx, + GFP_KERNEL); + + switch (ret) { + case 0: /* read BIO submitted (page in fscache) */ + dfprintk(FSCACHE, + "NFS: readpage_from_fscache: BIO submitted\n"); + nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_OK, 1); + return ret; + + case -ENOBUFS: /* inode not in cache */ + case -ENODATA: /* page not in cache */ + nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_FAIL, 1); + dfprintk(FSCACHE, + "NFS: readpage_from_fscache %d\n", ret); + return 1; + + default: + dfprintk(FSCACHE, "NFS: readpage_from_fscache %d\n", ret); + nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_FAIL, 1); + } + return ret; +} + +/* + * Retrieve a set of pages from fscache + */ +int __nfs_readpages_from_fscache(struct nfs_open_context *ctx, + struct inode *inode, + struct address_space *mapping, + struct list_head *pages, + unsigned *nr_pages) +{ + int ret, npages = *nr_pages; + + dfprintk(FSCACHE, "NFS: nfs_getpages_from_fscache (0x%p/%u/0x%p)\n", + NFS_I(inode)->fscache, npages, inode); + + ret = fscache_read_or_alloc_pages(NFS_I(inode)->fscache, + mapping, pages, nr_pages, + nfs_readpage_from_fscache_complete, + ctx, + mapping_gfp_mask(mapping)); + if (*nr_pages < npages) + nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_OK, + npages); + if (*nr_pages > 0) + nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_FAIL, + *nr_pages); + + switch (ret) { + case 0: /* read submitted to the cache for all pages */ + BUG_ON(!list_empty(pages)); + BUG_ON(*nr_pages != 0); + dfprintk(FSCACHE, + "NFS: nfs_getpages_from_fscache: submitted\n"); + + return ret; + + case -ENOBUFS: /* some pages aren't cached and can't be */ + case -ENODATA: /* some pages aren't cached */ + dfprintk(FSCACHE, + "NFS: nfs_getpages_from_fscache: no page: %d\n", ret); + return 1; + + default: + dfprintk(FSCACHE, + "NFS: nfs_getpages_from_fscache: ret %d\n", ret); + } + + return ret; +} + +/* + * Store a newly fetched page in fscache + * - PG_fscache must be set on the page + */ +void __nfs_readpage_to_fscache(struct inode *inode, struct page *page, int sync) +{ + int ret; + + dfprintk(FSCACHE, + "NFS: readpage_to_fscache(fsc:%p/p:%p(i:%lx f:%lx)/%d)\n", + NFS_I(inode)->fscache, page, page->index, page->flags, sync); + + ret = fscache_write_page(NFS_I(inode)->fscache, page, GFP_KERNEL); + dfprintk(FSCACHE, + "NFS: readpage_to_fscache: p:%p(i:%lu f:%lx) ret %d\n", + page, page->index, page->flags, ret); + + if (ret != 0) { + fscache_uncache_page(NFS_I(inode)->fscache, page); + nfs_add_fscache_stats(inode, + NFSIOS_FSCACHE_PAGES_WRITTEN_FAIL, 1); + nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_UNCACHED, 1); + } else { + nfs_add_fscache_stats(inode, + NFSIOS_FSCACHE_PAGES_WRITTEN_OK, 1); + } +} diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h new file mode 100644 index 000000000000..6e809bb0ff08 --- /dev/null +++ b/fs/nfs/fscache.h @@ -0,0 +1,220 @@ +/* NFS filesystem cache interface definitions + * + * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#ifndef _NFS_FSCACHE_H +#define _NFS_FSCACHE_H + +#include <linux/nfs_fs.h> +#include <linux/nfs_mount.h> +#include <linux/nfs4_mount.h> +#include <linux/fscache.h> + +#ifdef CONFIG_NFS_FSCACHE + +/* + * set of NFS FS-Cache objects that form a superblock key + */ +struct nfs_fscache_key { + struct rb_node node; + struct nfs_client *nfs_client; /* the server */ + + /* the elements of the unique key - as used by nfs_compare_super() and + * nfs_compare_mount_options() to distinguish superblocks */ + struct { + struct { + unsigned long s_flags; /* various flags + * (& NFS_MS_MASK) */ + } super; + + struct { + struct nfs_fsid fsid; + int flags; + unsigned int rsize; /* read size */ + unsigned int wsize; /* write size */ + unsigned int acregmin; /* attr cache timeouts */ + unsigned int acregmax; + unsigned int acdirmin; + unsigned int acdirmax; + } nfs_server; + + struct { + rpc_authflavor_t au_flavor; + } rpc_auth; + + /* uniquifier - can be used if nfs_server.flags includes + * NFS_MOUNT_UNSHARED */ + u8 uniq_len; + char uniquifier[0]; + } key; +}; + +/* + * fscache-index.c + */ +extern struct fscache_netfs nfs_fscache_netfs; +extern const struct fscache_cookie_def nfs_fscache_server_index_def; +extern const struct fscache_cookie_def nfs_fscache_super_index_def; +extern const struct fscache_cookie_def nfs_fscache_inode_object_def; + +extern int nfs_fscache_register(void); +extern void nfs_fscache_unregister(void); + +/* + * fscache.c + */ +extern void nfs_fscache_get_client_cookie(struct nfs_client *); +extern void nfs_fscache_release_client_cookie(struct nfs_client *); + +extern void nfs_fscache_get_super_cookie(struct super_block *, + struct nfs_parsed_mount_data *); +extern void nfs_fscache_release_super_cookie(struct super_block *); + +extern void nfs_fscache_init_inode_cookie(struct inode *); +extern void nfs_fscache_release_inode_cookie(struct inode *); +extern void nfs_fscache_zap_inode_cookie(struct inode *); +extern void nfs_fscache_set_inode_cookie(struct inode *, struct file *); +extern void nfs_fscache_reset_inode_cookie(struct inode *); + +extern void __nfs_fscache_invalidate_page(struct page *, struct inode *); +extern int nfs_fscache_release_page(struct page *, gfp_t); + +extern int __nfs_readpage_from_fscache(struct nfs_open_context *, + struct inode *, struct page *); +extern int __nfs_readpages_from_fscache(struct nfs_open_context *, + struct inode *, struct address_space *, + struct list_head *, unsigned *); +extern void __nfs_readpage_to_fscache(struct inode *, struct page *, int); + +/* + * wait for a page to complete writing to the cache + */ +static inline void nfs_fscache_wait_on_page_write(struct nfs_inode *nfsi, + struct page *page) +{ + if (PageFsCache(page)) + fscache_wait_on_page_write(nfsi->fscache, page); +} + +/* + * release the caching state associated with a page if undergoing complete page + * invalidation + */ +static inline void nfs_fscache_invalidate_page(struct page *page, + struct inode *inode) +{ + if (PageFsCache(page)) + __nfs_fscache_invalidate_page(page, inode); +} + +/* + * Retrieve a page from an inode data storage object. + */ +static inline int nfs_readpage_from_fscache(struct nfs_open_context *ctx, + struct inode *inode, + struct page *page) +{ + if (NFS_I(inode)->fscache) + return __nfs_readpage_from_fscache(ctx, inode, page); + return -ENOBUFS; +} + +/* + * Retrieve a set of pages from an inode data storage object. + */ +static inline int nfs_readpages_from_fscache(struct nfs_open_context *ctx, + struct inode *inode, + struct address_space *mapping, + struct list_head *pages, + unsigned *nr_pages) +{ + if (NFS_I(inode)->fscache) + return __nfs_readpages_from_fscache(ctx, inode, mapping, pages, + nr_pages); + return -ENOBUFS; +} + +/* + * Store a page newly fetched from the server in an inode data storage object + * in the cache. + */ +static inline void nfs_readpage_to_fscache(struct inode *inode, + struct page *page, + int sync) +{ + if (PageFsCache(page)) + __nfs_readpage_to_fscache(inode, page, sync); +} + +/* + * indicate the client caching state as readable text + */ +static inline const char *nfs_server_fscache_state(struct nfs_server *server) +{ + if (server->fscache && (server->options & NFS_OPTION_FSCACHE)) + return "yes"; + return "no "; +} + + +#else /* CONFIG_NFS_FSCACHE */ +static inline int nfs_fscache_register(void) { return 0; } +static inline void nfs_fscache_unregister(void) {} + +static inline void nfs_fscache_get_client_cookie(struct nfs_client *clp) {} +static inline void nfs_fscache_release_client_cookie(struct nfs_client *clp) {} + +static inline void nfs_fscache_get_super_cookie( + struct super_block *sb, + struct nfs_parsed_mount_data *data) +{ +} +static inline void nfs_fscache_release_super_cookie(struct super_block *sb) {} + +static inline void nfs_fscache_init_inode_cookie(struct inode *inode) {} +static inline void nfs_fscache_release_inode_cookie(struct inode *inode) {} +static inline void nfs_fscache_zap_inode_cookie(struct inode *inode) {} +static inline void nfs_fscache_set_inode_cookie(struct inode *inode, + struct file *filp) {} +static inline void nfs_fscache_reset_inode_cookie(struct inode *inode) {} + +static inline int nfs_fscache_release_page(struct page *page, gfp_t gfp) +{ + return 1; /* True: may release page */ +} +static inline void nfs_fscache_invalidate_page(struct page *page, + struct inode *inode) {} +static inline void nfs_fscache_wait_on_page_write(struct nfs_inode *nfsi, + struct page *page) {} + +static inline int nfs_readpage_from_fscache(struct nfs_open_context *ctx, + struct inode *inode, + struct page *page) +{ + return -ENOBUFS; +} +static inline int nfs_readpages_from_fscache(struct nfs_open_context *ctx, + struct inode *inode, + struct address_space *mapping, + struct list_head *pages, + unsigned *nr_pages) +{ + return -ENOBUFS; +} +static inline void nfs_readpage_to_fscache(struct inode *inode, + struct page *page, int sync) {} + +static inline const char *nfs_server_fscache_state(struct nfs_server *server) +{ + return "no "; +} + +#endif /* CONFIG_NFS_FSCACHE */ +#endif /* _NFS_FSCACHE_H */ diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index a834d1d850b7..64f87194d390 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -46,6 +46,7 @@ #include "delegation.h" #include "iostat.h" #include "internal.h" +#include "fscache.h" #define NFSDBG_FACILITY NFSDBG_VFS @@ -121,6 +122,7 @@ void nfs_clear_inode(struct inode *inode) BUG_ON(!list_empty(&NFS_I(inode)->open_files)); nfs_zap_acl_cache(inode); nfs_access_zap_cache(inode); + nfs_fscache_release_inode_cookie(inode); } /** @@ -355,6 +357,8 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) nfsi->attrtimeo_timestamp = now; nfsi->access_cache = RB_ROOT; + nfs_fscache_init_inode_cookie(inode); + unlock_new_inode(inode); } else nfs_refresh_inode(inode, fattr); @@ -686,6 +690,7 @@ int nfs_open(struct inode *inode, struct file *filp) ctx->mode = filp->f_mode; nfs_file_set_open_context(filp, ctx); put_nfs_open_context(ctx); + nfs_fscache_set_inode_cookie(inode, filp); return 0; } @@ -786,6 +791,7 @@ static int nfs_invalidate_mapping_nolock(struct inode *inode, struct address_spa memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf)); spin_unlock(&inode->i_lock); nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE); + nfs_fscache_reset_inode_cookie(inode); dfprintk(PAGECACHE, "NFS: (%s/%Ld) data cache invalidated\n", inode->i_sb->s_id, (long long)NFS_FILEID(inode)); return 0; @@ -1030,6 +1036,7 @@ int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr) spin_lock(&inode->i_lock); status = nfs_refresh_inode_locked(inode, fattr); spin_unlock(&inode->i_lock); + return status; } @@ -1436,6 +1443,10 @@ static int __init init_nfs_fs(void) { int err; + err = nfs_fscache_register(); + if (err < 0) + goto out7; + err = nfsiod_start(); if (err) goto out6; @@ -1488,6 +1499,8 @@ out4: out5: nfsiod_stop(); out6: + nfs_fscache_unregister(); +out7: return err; } @@ -1498,6 +1511,7 @@ static void __exit exit_nfs_fs(void) nfs_destroy_readpagecache(); nfs_destroy_inodecache(); nfs_destroy_nfspagecache(); + nfs_fscache_unregister(); #ifdef CONFIG_PROC_FS rpc_proc_unregister("nfs"); #endif diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 2041f68ff1cc..e4d6a8348adf 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -5,6 +5,8 @@ #include <linux/mount.h> #include <linux/security.h> +#define NFS_MS_MASK (MS_RDONLY|MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_SYNCHRONOUS) + struct nfs_string; /* Maximum number of readahead requests @@ -37,10 +39,12 @@ struct nfs_parsed_mount_data { int acregmin, acregmax, acdirmin, acdirmax; int namlen; + unsigned int options; unsigned int bsize; unsigned int auth_flavor_len; rpc_authflavor_t auth_flavors[1]; char *client_address; + char *fscache_uniq; struct { struct sockaddr_storage address; diff --git a/fs/nfs/iostat.h b/fs/nfs/iostat.h index a36952810032..a2ab2529b5ca 100644 --- a/fs/nfs/iostat.h +++ b/fs/nfs/iostat.h @@ -16,6 +16,9 @@ struct nfs_iostats { unsigned long long bytes[__NFSIOS_BYTESMAX]; +#ifdef CONFIG_NFS_FSCACHE + unsigned long long fscache[__NFSIOS_FSCACHEMAX]; +#endif unsigned long events[__NFSIOS_COUNTSMAX]; } ____cacheline_aligned; @@ -57,6 +60,21 @@ static inline void nfs_add_stats(const struct inode *inode, nfs_add_server_stats(NFS_SERVER(inode), stat, addend); } +#ifdef CONFIG_NFS_FSCACHE +static inline void nfs_add_fscache_stats(struct inode *inode, + enum nfs_stat_fscachecounters stat, + unsigned long addend) +{ + struct nfs_iostats *iostats; + int cpu; + + cpu = get_cpu(); + iostats = per_cpu_ptr(NFS_SERVER(inode)->io_stats, cpu); + iostats->fscache[stat] += addend; + put_cpu_no_resched(); +} +#endif + static inline struct nfs_iostats *nfs_alloc_iostats(void) { return alloc_percpu(struct nfs_iostats); diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index b82fe6847f14..d0cc5ce0edfe 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -328,7 +328,7 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, data->arg.create.verifier[1] = current->pid; } - sattr->ia_mode &= ~current->fs->umask; + sattr->ia_mode &= ~current_umask(); for (;;) { status = nfs3_do_create(dir, dentry, data); @@ -528,7 +528,7 @@ nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr) dprintk("NFS call mkdir %s\n", dentry->d_name.name); - sattr->ia_mode &= ~current->fs->umask; + sattr->ia_mode &= ~current_umask(); data = nfs3_alloc_createdata(); if (data == NULL) @@ -639,7 +639,7 @@ nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr, dprintk("NFS call mknod %s %u:%u\n", dentry->d_name.name, MAJOR(rdev), MINOR(rdev)); - sattr->ia_mode &= ~current->fs->umask; + sattr->ia_mode &= ~current_umask(); data = nfs3_alloc_createdata(); if (data == NULL) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 97bacccff579..a4d242680299 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1501,7 +1501,7 @@ nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd) attr.ia_mode = nd->intent.open.create_mode; attr.ia_valid = ATTR_MODE; if (!IS_POSIXACL(dir)) - attr.ia_mode &= ~current->fs->umask; + attr.ia_mode &= ~current_umask(); } else { attr.ia_valid = 0; BUG_ON(nd->intent.open.flags & O_CREAT); diff --git a/fs/nfs/read.c b/fs/nfs/read.c index f856004bb7fa..4ace3c50a8eb 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -24,6 +24,7 @@ #include "internal.h" #include "iostat.h" +#include "fscache.h" #define NFSDBG_FACILITY NFSDBG_PAGECACHE @@ -111,8 +112,8 @@ static void nfs_readpage_truncate_uninitialised_page(struct nfs_read_data *data) } } -static int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, - struct page *page) +int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, + struct page *page) { LIST_HEAD(one_request); struct nfs_page *new; @@ -139,6 +140,11 @@ static int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, static void nfs_readpage_release(struct nfs_page *req) { + struct inode *d_inode = req->wb_context->path.dentry->d_inode; + + if (PageUptodate(req->wb_page)) + nfs_readpage_to_fscache(d_inode, req->wb_page, 0); + unlock_page(req->wb_page); dprintk("NFS: read done (%s/%Ld %d@%Ld)\n", @@ -510,8 +516,15 @@ int nfs_readpage(struct file *file, struct page *page) } else ctx = get_nfs_open_context(nfs_file_open_context(file)); + if (!IS_SYNC(inode)) { + error = nfs_readpage_from_fscache(ctx, inode, page); + if (error == 0) + goto out; + } + error = nfs_readpage_async(ctx, inode, page); +out: put_nfs_open_context(ctx); return error; out_unlock: @@ -584,6 +597,15 @@ int nfs_readpages(struct file *filp, struct address_space *mapping, return -EBADF; } else desc.ctx = get_nfs_open_context(nfs_file_open_context(filp)); + + /* attempt to read as many of the pages as possible from the cache + * - this returns -ENOBUFS immediately if the cookie is negative + */ + ret = nfs_readpages_from_fscache(desc.ctx, inode, mapping, + pages, &nr_pages); + if (ret == 0) + goto read_complete; /* all pages were read */ + if (rsize < PAGE_CACHE_SIZE) nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0); else @@ -594,6 +616,7 @@ int nfs_readpages(struct file *filp, struct address_space *mapping, nfs_pageio_complete(&pgio); npages = (pgio.pg_bytes_written + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; nfs_add_stats(inode, NFSIOS_READPAGES, npages); +read_complete: put_nfs_open_context(desc.ctx); out: return ret; diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 0942fcbbad3c..6717200923fe 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -60,6 +60,7 @@ #include "delegation.h" #include "iostat.h" #include "internal.h" +#include "fscache.h" #define NFSDBG_FACILITY NFSDBG_VFS @@ -76,6 +77,7 @@ enum { Opt_rdirplus, Opt_nordirplus, Opt_sharecache, Opt_nosharecache, Opt_resvport, Opt_noresvport, + Opt_fscache, Opt_nofscache, /* Mount options that take integer arguments */ Opt_port, @@ -93,6 +95,7 @@ enum { Opt_sec, Opt_proto, Opt_mountproto, Opt_mounthost, Opt_addr, Opt_mountaddr, Opt_clientaddr, Opt_lookupcache, + Opt_fscache_uniq, /* Special mount options */ Opt_userspace, Opt_deprecated, Opt_sloppy, @@ -132,6 +135,9 @@ static const match_table_t nfs_mount_option_tokens = { { Opt_nosharecache, "nosharecache" }, { Opt_resvport, "resvport" }, { Opt_noresvport, "noresvport" }, + { Opt_fscache, "fsc" }, + { Opt_fscache_uniq, "fsc=%s" }, + { Opt_nofscache, "nofsc" }, { Opt_port, "port=%u" }, { Opt_rsize, "rsize=%u" }, @@ -563,6 +569,8 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss, if (clp->rpc_ops->version == 4) seq_printf(m, ",clientaddr=%s", clp->cl_ipaddr); #endif + if (nfss->options & NFS_OPTION_FSCACHE) + seq_printf(m, ",fsc"); } /* @@ -641,6 +649,10 @@ static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt) totals.events[i] += stats->events[i]; for (i = 0; i < __NFSIOS_BYTESMAX; i++) totals.bytes[i] += stats->bytes[i]; +#ifdef CONFIG_NFS_FSCACHE + for (i = 0; i < __NFSIOS_FSCACHEMAX; i++) + totals.fscache[i] += stats->fscache[i]; +#endif preempt_enable(); } @@ -651,6 +663,13 @@ static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt) seq_printf(m, "\n\tbytes:\t"); for (i = 0; i < __NFSIOS_BYTESMAX; i++) seq_printf(m, "%Lu ", totals.bytes[i]); +#ifdef CONFIG_NFS_FSCACHE + if (nfss->options & NFS_OPTION_FSCACHE) { + seq_printf(m, "\n\tfsc:\t"); + for (i = 0; i < __NFSIOS_FSCACHEMAX; i++) + seq_printf(m, "%Lu ", totals.bytes[i]); + } +#endif seq_printf(m, "\n"); rpc_print_iostats(m, nfss->client); @@ -1044,6 +1063,24 @@ static int nfs_parse_mount_options(char *raw, case Opt_noresvport: mnt->flags |= NFS_MOUNT_NORESVPORT; break; + case Opt_fscache: + mnt->options |= NFS_OPTION_FSCACHE; + kfree(mnt->fscache_uniq); + mnt->fscache_uniq = NULL; + break; + case Opt_nofscache: + mnt->options &= ~NFS_OPTION_FSCACHE; + kfree(mnt->fscache_uniq); + mnt->fscache_uniq = NULL; + break; + case Opt_fscache_uniq: + string = match_strdup(args); + if (!string) + goto out_nomem; + kfree(mnt->fscache_uniq); + mnt->fscache_uniq = string; + mnt->options |= NFS_OPTION_FSCACHE; + break; /* * options that take numeric values @@ -1191,7 +1228,6 @@ static int nfs_parse_mount_options(char *raw, goto out_nomem; token = match_token(string, nfs_xprt_protocol_tokens, args); - kfree(string); switch (token) { case Opt_xprt_udp: @@ -1221,6 +1257,7 @@ static int nfs_parse_mount_options(char *raw, goto out_nomem; token = match_token(string, nfs_xprt_protocol_tokens, args); + kfree(string); switch (token) { case Opt_xprt_udp: @@ -1870,8 +1907,6 @@ static void nfs_clone_super(struct super_block *sb, nfs_initialise_sb(sb); } -#define NFS_MS_MASK (MS_RDONLY|MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_SYNCHRONOUS) - static int nfs_compare_mount_options(const struct super_block *s, const struct nfs_server *b, int flags) { const struct nfs_server *a = s->s_fs_info; @@ -2036,6 +2071,7 @@ static int nfs_get_sb(struct file_system_type *fs_type, if (!s->s_root) { /* initial superblock/root creation */ nfs_fill_super(s, data); + nfs_fscache_get_super_cookie(s, data); } mntroot = nfs_get_root(s, mntfh); @@ -2056,6 +2092,7 @@ static int nfs_get_sb(struct file_system_type *fs_type, out: kfree(data->nfs_server.hostname); kfree(data->mount_server.hostname); + kfree(data->fscache_uniq); security_free_mnt_opts(&data->lsm_opts); out_free_fh: kfree(mntfh); @@ -2083,6 +2120,7 @@ static void nfs_kill_super(struct super_block *s) bdi_unregister(&server->backing_dev_info); kill_anon_super(s); + nfs_fscache_release_super_cookie(s); nfs_free_server(server); } @@ -2390,6 +2428,7 @@ static int nfs4_get_sb(struct file_system_type *fs_type, if (!s->s_root) { /* initial superblock/root creation */ nfs4_fill_super(s); + nfs_fscache_get_super_cookie(s, data); } mntroot = nfs4_get_root(s, mntfh); @@ -2411,6 +2450,7 @@ out: kfree(data->client_address); kfree(data->nfs_server.export_path); kfree(data->nfs_server.hostname); + kfree(data->fscache_uniq); security_free_mnt_opts(&data->lsm_opts); out_free_fh: kfree(mntfh); @@ -2437,6 +2477,7 @@ static void nfs4_kill_super(struct super_block *sb) kill_anon_super(sb); nfs4_renewd_prepare_shutdown(server); + nfs_fscache_release_super_cookie(sb); nfs_free_server(server); } diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig index 44d7d04dab95..503b9da159a3 100644 --- a/fs/nfsd/Kconfig +++ b/fs/nfsd/Kconfig @@ -1,6 +1,7 @@ config NFSD tristate "NFS server support" depends on INET + depends on FILE_LOCKING select LOCKD select SUNRPC select EXPORTFS diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index 9dbd2eb91281..7c9fe838f038 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -18,6 +18,7 @@ #include <linux/unistd.h> #include <linux/slab.h> #include <linux/major.h> +#include <linux/magic.h> #include <linux/sunrpc/svc.h> #include <linux/nfsd/nfsd.h> @@ -202,6 +203,7 @@ nfsd3_proc_write(struct svc_rqst *rqstp, struct nfsd3_writeargs *argp, struct nfsd3_writeres *resp) { __be32 nfserr; + unsigned long cnt = argp->len; dprintk("nfsd: WRITE(3) %s %d bytes at %ld%s\n", SVCFH_fmt(&argp->fh), @@ -214,9 +216,9 @@ nfsd3_proc_write(struct svc_rqst *rqstp, struct nfsd3_writeargs *argp, nfserr = nfsd_write(rqstp, &resp->fh, NULL, argp->offset, rqstp->rq_vec, argp->vlen, - argp->len, + &cnt, &resp->committed); - resp->count = argp->count; + resp->count = cnt; RETURN_STATUS(nfserr); } @@ -569,7 +571,7 @@ nfsd3_proc_fsinfo(struct svc_rqst * rqstp, struct nfsd_fhandle *argp, struct super_block *sb = argp->fh.fh_dentry->d_inode->i_sb; /* Note that we don't care for remote fs's here */ - if (sb->s_magic == 0x4d44 /* MSDOS_SUPER_MAGIC */) { + if (sb->s_magic == MSDOS_SUPER_MAGIC) { resp->f_properties = NFS3_FSF_BILLYBOY; } resp->f_maxfilesize = sb->s_maxbytes; @@ -610,7 +612,7 @@ nfsd3_proc_pathconf(struct svc_rqst * rqstp, struct nfsd_fhandle *argp, resp->p_link_max = EXT2_LINK_MAX; resp->p_name_max = EXT2_NAME_LEN; break; - case 0x4d44: /* MSDOS_SUPER_MAGIC */ + case MSDOS_SUPER_MAGIC: resp->p_case_insensitive = 1; resp->p_case_preserving = 0; break; diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index c464181b5994..290289bd44f7 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -218,7 +218,7 @@ static int encode_cb_recall(struct xdr_stream *xdr, struct nfs4_cb_recall *cb_rec) { __be32 *p; - int len = cb_rec->cbr_fhlen; + int len = cb_rec->cbr_fh.fh_size; RESERVE_SPACE(12+sizeof(cb_rec->cbr_stateid) + len); WRITE32(OP_CB_RECALL); @@ -226,7 +226,7 @@ encode_cb_recall(struct xdr_stream *xdr, struct nfs4_cb_recall *cb_rec) WRITEMEM(&cb_rec->cbr_stateid.si_opaque, sizeof(stateid_opaque_t)); WRITE32(cb_rec->cbr_trunc); WRITE32(len); - WRITEMEM(cb_rec->cbr_fhval, len); + WRITEMEM(&cb_rec->cbr_fh.fh_base, len); return 0; } @@ -361,9 +361,8 @@ static struct rpc_program cb_program = { /* Reference counting, callback cleanup, etc., all look racy as heck. * And why is cb_set an atomic? */ -static int do_probe_callback(void *data) +static struct rpc_clnt *setup_callback_client(struct nfs4_client *clp) { - struct nfs4_client *clp = data; struct sockaddr_in addr; struct nfs4_callback *cb = &clp->cl_callback; struct rpc_timeout timeparms = { @@ -384,17 +383,10 @@ static int do_probe_callback(void *data) .flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET), .client_name = clp->cl_principal, }; - struct rpc_message msg = { - .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL], - .rpc_argp = clp, - }; struct rpc_clnt *client; - int status; - if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5)) { - status = nfserr_cb_path_down; - goto out_err; - } + if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5)) + return ERR_PTR(-EINVAL); /* Initialize address */ memset(&addr, 0, sizeof(addr)); @@ -404,9 +396,29 @@ static int do_probe_callback(void *data) /* Create RPC client */ client = rpc_create(&args); + if (IS_ERR(client)) + dprintk("NFSD: couldn't create callback client: %ld\n", + PTR_ERR(client)); + return client; + +} + +static int do_probe_callback(void *data) +{ + struct nfs4_client *clp = data; + struct nfs4_callback *cb = &clp->cl_callback; + struct rpc_message msg = { + .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL], + .rpc_argp = clp, + }; + struct rpc_clnt *client; + int status; + + client = setup_callback_client(clp); if (IS_ERR(client)) { - dprintk("NFSD: couldn't create callback client\n"); status = PTR_ERR(client); + dprintk("NFSD: couldn't create callback client: %d\n", + status); goto out_err; } @@ -422,10 +434,10 @@ static int do_probe_callback(void *data) out_release_client: rpc_shutdown_client(client); out_err: - dprintk("NFSD: warning: no callback path to client %.*s\n", - (int)clp->cl_name.len, clp->cl_name.data); + dprintk("NFSD: warning: no callback path to client %.*s: error %d\n", + (int)clp->cl_name.len, clp->cl_name.data, status); put_nfs4_client(clp); - return status; + return 0; } /* @@ -451,7 +463,6 @@ nfsd4_probe_callback(struct nfs4_client *clp) /* * called with dp->dl_count inc'ed. - * nfs4_lock_state() may or may not have been called. */ void nfsd4_cb_recall(struct nfs4_delegation *dp) diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 9fa60a3ad48c..b2883e9c6381 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -93,6 +93,21 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o open->op_truncate = 0; if (open->op_create) { + /* FIXME: check session persistence and pnfs flags. + * The nfsv4.1 spec requires the following semantics: + * + * Persistent | pNFS | Server REQUIRED | Client Allowed + * Reply Cache | server | | + * -------------+--------+-----------------+-------------------- + * no | no | EXCLUSIVE4_1 | EXCLUSIVE4_1 + * | | | (SHOULD) + * | | and EXCLUSIVE4 | or EXCLUSIVE4 + * | | | (SHOULD NOT) + * no | yes | EXCLUSIVE4_1 | EXCLUSIVE4_1 + * yes | no | GUARDED4 | GUARDED4 + * yes | yes | GUARDED4 | GUARDED4 + */ + /* * Note: create modes (UNCHECKED,GUARDED...) are the same * in NFSv4 as in v3. @@ -103,11 +118,13 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o (u32 *)open->op_verf.data, &open->op_truncate, &created); - /* If we ever decide to use different attrs to store the - * verifier in nfsd_create_v3, then we'll need to change this + /* + * Following rfc 3530 14.2.16, use the returned bitmask + * to indicate which attributes we used to store the + * verifier: */ if (open->op_createmode == NFS4_CREATE_EXCLUSIVE && status == 0) - open->op_bmval[1] |= (FATTR4_WORD1_TIME_ACCESS | + open->op_bmval[1] = (FATTR4_WORD1_TIME_ACCESS | FATTR4_WORD1_TIME_MODIFY); } else { status = nfsd_lookup(rqstp, current_fh, @@ -118,13 +135,11 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o goto out; set_change_info(&open->op_cinfo, current_fh); - - /* set reply cache */ fh_dup2(current_fh, &resfh); - open->op_stateowner->so_replay.rp_openfh_len = resfh.fh_handle.fh_size; - memcpy(open->op_stateowner->so_replay.rp_openfh, - &resfh.fh_handle.fh_base, resfh.fh_handle.fh_size); + /* set reply cache */ + fh_copy_shallow(&open->op_stateowner->so_replay.rp_openfh, + &resfh.fh_handle); if (!created) status = do_open_permission(rqstp, current_fh, open, NFSD_MAY_NOP); @@ -150,10 +165,8 @@ do_open_fhandle(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_ memset(&open->op_cinfo, 0, sizeof(struct nfsd4_change_info)); /* set replay cache */ - open->op_stateowner->so_replay.rp_openfh_len = current_fh->fh_handle.fh_size; - memcpy(open->op_stateowner->so_replay.rp_openfh, - ¤t_fh->fh_handle.fh_base, - current_fh->fh_handle.fh_size); + fh_copy_shallow(&open->op_stateowner->so_replay.rp_openfh, + ¤t_fh->fh_handle); open->op_truncate = (open->op_iattr.ia_valid & ATTR_SIZE) && (open->op_iattr.ia_size == 0); @@ -164,12 +177,23 @@ do_open_fhandle(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_ return status; } +static void +copy_clientid(clientid_t *clid, struct nfsd4_session *session) +{ + struct nfsd4_sessionid *sid = + (struct nfsd4_sessionid *)session->se_sessionid.data; + + clid->cl_boot = sid->clientid.cl_boot; + clid->cl_id = sid->clientid.cl_id; +} static __be32 nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_open *open) { __be32 status; + struct nfsd4_compoundres *resp; + dprintk("NFSD: nfsd4_open filename %.*s op_stateowner %p\n", (int)open->op_fname.len, open->op_fname.data, open->op_stateowner); @@ -178,16 +202,19 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (open->op_create && open->op_claim_type != NFS4_OPEN_CLAIM_NULL) return nfserr_inval; + if (nfsd4_has_session(cstate)) + copy_clientid(&open->op_clientid, cstate->session); + nfs4_lock_state(); /* check seqid for replay. set nfs4_owner */ - status = nfsd4_process_open1(open); + resp = rqstp->rq_resp; + status = nfsd4_process_open1(&resp->cstate, open); if (status == nfserr_replay_me) { struct nfs4_replay *rp = &open->op_stateowner->so_replay; fh_put(&cstate->current_fh); - cstate->current_fh.fh_handle.fh_size = rp->rp_openfh_len; - memcpy(&cstate->current_fh.fh_handle.fh_base, rp->rp_openfh, - rp->rp_openfh_len); + fh_copy_shallow(&cstate->current_fh.fh_handle, + &rp->rp_openfh); status = fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_NOP); if (status) dprintk("nfsd4_open: replay failed" @@ -209,10 +236,6 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, switch (open->op_claim_type) { case NFS4_OPEN_CLAIM_DELEGATE_CUR: - status = nfserr_inval; - if (open->op_create) - goto out; - /* fall through */ case NFS4_OPEN_CLAIM_NULL: /* * (1) set CURRENT_FH to the file being opened, @@ -455,8 +478,9 @@ nfsd4_getattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (getattr->ga_bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1) return nfserr_inval; - getattr->ga_bmval[0] &= NFSD_SUPPORTED_ATTRS_WORD0; - getattr->ga_bmval[1] &= NFSD_SUPPORTED_ATTRS_WORD1; + getattr->ga_bmval[0] &= nfsd_suppattrs0(cstate->minorversion); + getattr->ga_bmval[1] &= nfsd_suppattrs1(cstate->minorversion); + getattr->ga_bmval[2] &= nfsd_suppattrs2(cstate->minorversion); getattr->ga_fhp = &cstate->current_fh; return nfs_ok; @@ -520,9 +544,8 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, nfs4_lock_state(); /* check stateid */ - if ((status = nfs4_preprocess_stateid_op(&cstate->current_fh, - &read->rd_stateid, - CHECK_FH | RD_STATE, &read->rd_filp))) { + if ((status = nfs4_preprocess_stateid_op(cstate, &read->rd_stateid, + RD_STATE, &read->rd_filp))) { dprintk("NFSD: nfsd4_read: couldn't process stateid!\n"); goto out; } @@ -548,8 +571,9 @@ nfsd4_readdir(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (readdir->rd_bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1) return nfserr_inval; - readdir->rd_bmval[0] &= NFSD_SUPPORTED_ATTRS_WORD0; - readdir->rd_bmval[1] &= NFSD_SUPPORTED_ATTRS_WORD1; + readdir->rd_bmval[0] &= nfsd_suppattrs0(cstate->minorversion); + readdir->rd_bmval[1] &= nfsd_suppattrs1(cstate->minorversion); + readdir->rd_bmval[2] &= nfsd_suppattrs2(cstate->minorversion); if ((cookie > ~(u32)0) || (cookie == 1) || (cookie == 2) || (cookie == 0 && memcmp(readdir->rd_verf.data, zeroverf.data, NFS4_VERIFIER_SIZE))) @@ -653,8 +677,8 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (setattr->sa_iattr.ia_valid & ATTR_SIZE) { nfs4_lock_state(); - status = nfs4_preprocess_stateid_op(&cstate->current_fh, - &setattr->sa_stateid, CHECK_FH | WR_STATE, NULL); + status = nfs4_preprocess_stateid_op(cstate, + &setattr->sa_stateid, WR_STATE, NULL); nfs4_unlock_state(); if (status) { dprintk("NFSD: nfsd4_setattr: couldn't process stateid!\n"); @@ -685,6 +709,7 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct file *filp = NULL; u32 *p; __be32 status = nfs_ok; + unsigned long cnt; /* no need to check permission - this will be done in nfsd_write() */ @@ -692,8 +717,7 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, return nfserr_inval; nfs4_lock_state(); - status = nfs4_preprocess_stateid_op(&cstate->current_fh, stateid, - CHECK_FH | WR_STATE, &filp); + status = nfs4_preprocess_stateid_op(cstate, stateid, WR_STATE, &filp); if (filp) get_file(filp); nfs4_unlock_state(); @@ -703,7 +727,7 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, return status; } - write->wr_bytes_written = write->wr_buflen; + cnt = write->wr_buflen; write->wr_how_written = write->wr_stable_how; p = (u32 *)write->wr_verifier.data; *p++ = nfssvc_boot.tv_sec; @@ -711,10 +735,12 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, status = nfsd_write(rqstp, &cstate->current_fh, filp, write->wr_offset, rqstp->rq_vec, write->wr_vlen, - write->wr_buflen, &write->wr_how_written); + &cnt, &write->wr_how_written); if (filp) fput(filp); + write->wr_bytes_written = cnt; + if (status == nfserr_symlink) status = nfserr_inval; return status; @@ -737,8 +763,9 @@ _nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (status) return status; - if ((verify->ve_bmval[0] & ~NFSD_SUPPORTED_ATTRS_WORD0) - || (verify->ve_bmval[1] & ~NFSD_SUPPORTED_ATTRS_WORD1)) + if ((verify->ve_bmval[0] & ~nfsd_suppattrs0(cstate->minorversion)) + || (verify->ve_bmval[1] & ~nfsd_suppattrs1(cstate->minorversion)) + || (verify->ve_bmval[2] & ~nfsd_suppattrs2(cstate->minorversion))) return nfserr_attrnotsupp; if ((verify->ve_bmval[0] & FATTR4_WORD0_RDATTR_ERROR) || (verify->ve_bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1)) @@ -766,7 +793,8 @@ _nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (status) goto out_kfree; - p = buf + 3; + /* skip bitmap */ + p = buf + 1 + ntohl(buf[0]); status = nfserr_not_same; if (ntohl(*p++) != verify->ve_attrlen) goto out_kfree; @@ -813,39 +841,17 @@ static inline void nfsd4_increment_op_stats(u32 opnum) nfsdstats.nfs4_opcount[opnum]++; } -static void cstate_free(struct nfsd4_compound_state *cstate) -{ - if (cstate == NULL) - return; - fh_put(&cstate->current_fh); - fh_put(&cstate->save_fh); - BUG_ON(cstate->replay_owner); - kfree(cstate); -} - -static struct nfsd4_compound_state *cstate_alloc(void) -{ - struct nfsd4_compound_state *cstate; - - cstate = kmalloc(sizeof(struct nfsd4_compound_state), GFP_KERNEL); - if (cstate == NULL) - return NULL; - fh_init(&cstate->current_fh, NFS4_FHSIZE); - fh_init(&cstate->save_fh, NFS4_FHSIZE); - cstate->replay_owner = NULL; - return cstate; -} - typedef __be32(*nfsd4op_func)(struct svc_rqst *, struct nfsd4_compound_state *, void *); +enum nfsd4_op_flags { + ALLOWED_WITHOUT_FH = 1 << 0, /* No current filehandle required */ + ALLOWED_ON_ABSENT_FS = 2 << 0, /* ops processed on absent fs */ + ALLOWED_AS_FIRST_OP = 3 << 0, /* ops reqired first in compound */ +}; struct nfsd4_operation { nfsd4op_func op_func; u32 op_flags; -/* Most ops require a valid current filehandle; a few don't: */ -#define ALLOWED_WITHOUT_FH 1 -/* GETATTR and ops not listed as returning NFS4ERR_MOVED: */ -#define ALLOWED_ON_ABSENT_FS 2 char *op_name; }; @@ -854,6 +860,51 @@ static struct nfsd4_operation nfsd4_ops[]; static const char *nfsd4_op_name(unsigned opnum); /* + * This is a replay of a compound for which no cache entry pages + * were used. Encode the sequence operation, and if cachethis is FALSE + * encode the uncache rep error on the next operation. + */ +static __be32 +nfsd4_enc_uncached_replay(struct nfsd4_compoundargs *args, + struct nfsd4_compoundres *resp) +{ + struct nfsd4_op *op; + + dprintk("--> %s resp->opcnt %d ce_cachethis %u \n", __func__, + resp->opcnt, resp->cstate.slot->sl_cache_entry.ce_cachethis); + + /* Encode the replayed sequence operation */ + BUG_ON(resp->opcnt != 1); + op = &args->ops[resp->opcnt - 1]; + nfsd4_encode_operation(resp, op); + + /*return nfserr_retry_uncached_rep in next operation. */ + if (resp->cstate.slot->sl_cache_entry.ce_cachethis == 0) { + op = &args->ops[resp->opcnt++]; + op->status = nfserr_retry_uncached_rep; + nfsd4_encode_operation(resp, op); + } + return op->status; +} + +/* + * Enforce NFSv4.1 COMPOUND ordering rules. + * + * TODO: + * - enforce NFS4ERR_NOT_ONLY_OP, + * - DESTROY_SESSION MUST be the final operation in the COMPOUND request. + */ +static bool nfs41_op_ordering_ok(struct nfsd4_compoundargs *args) +{ + if (args->minorversion && args->opcnt > 0) { + struct nfsd4_op *op = &args->ops[0]; + return (op->status == nfserr_op_illegal) || + (nfsd4_ops[op->opnum].op_flags & ALLOWED_AS_FIRST_OP); + } + return true; +} + +/* * COMPOUND call. */ static __be32 @@ -863,12 +914,13 @@ nfsd4_proc_compound(struct svc_rqst *rqstp, { struct nfsd4_op *op; struct nfsd4_operation *opdesc; - struct nfsd4_compound_state *cstate = NULL; + struct nfsd4_compound_state *cstate = &resp->cstate; int slack_bytes; __be32 status; resp->xbuf = &rqstp->rq_res; - resp->p = rqstp->rq_res.head[0].iov_base + rqstp->rq_res.head[0].iov_len; + resp->p = rqstp->rq_res.head[0].iov_base + + rqstp->rq_res.head[0].iov_len; resp->tagp = resp->p; /* reserve space for: taglen, tag, and opcnt */ resp->p += 2 + XDR_QUADLEN(args->taglen); @@ -877,18 +929,25 @@ nfsd4_proc_compound(struct svc_rqst *rqstp, resp->tag = args->tag; resp->opcnt = 0; resp->rqstp = rqstp; + resp->cstate.minorversion = args->minorversion; + resp->cstate.replay_owner = NULL; + fh_init(&resp->cstate.current_fh, NFS4_FHSIZE); + fh_init(&resp->cstate.save_fh, NFS4_FHSIZE); + /* Use the deferral mechanism only for NFSv4.0 compounds */ + rqstp->rq_usedeferral = (args->minorversion == 0); /* * According to RFC3010, this takes precedence over all other errors. */ status = nfserr_minor_vers_mismatch; - if (args->minorversion > NFSD_SUPPORTED_MINOR_VERSION) + if (args->minorversion > nfsd_supported_minorversion) goto out; - status = nfserr_resource; - cstate = cstate_alloc(); - if (cstate == NULL) - goto out; + if (!nfs41_op_ordering_ok(args)) { + op = &args->ops[0]; + op->status = nfserr_sequence_pos; + goto encode_op; + } status = nfs_ok; while (!status && resp->opcnt < args->opcnt) { @@ -897,7 +956,6 @@ nfsd4_proc_compound(struct svc_rqst *rqstp, dprintk("nfsv4 compound op #%d/%d: %d (%s)\n", resp->opcnt, args->opcnt, op->opnum, nfsd4_op_name(op->opnum)); - /* * The XDR decode routines may have pre-set op->status; * for example, if there is a miscellaneous XDR error @@ -938,6 +996,15 @@ nfsd4_proc_compound(struct svc_rqst *rqstp, BUG_ON(op->status == nfs_ok); encode_op: + /* Only from SEQUENCE or CREATE_SESSION */ + if (resp->cstate.status == nfserr_replay_cache) { + dprintk("%s NFS4.1 replay from cache\n", __func__); + if (nfsd4_not_cached(resp)) + status = nfsd4_enc_uncached_replay(args, resp); + else + status = op->status; + goto out; + } if (op->status == nfserr_replay_me) { op->replay = &cstate->replay_owner->so_replay; nfsd4_encode_replay(resp, op); @@ -961,15 +1028,24 @@ encode_op: nfsd4_increment_op_stats(op->opnum); } + if (!rqstp->rq_usedeferral && status == nfserr_dropit) { + dprintk("%s Dropit - send NFS4ERR_DELAY\n", __func__); + status = nfserr_jukebox; + } - cstate_free(cstate); + resp->cstate.status = status; + fh_put(&resp->cstate.current_fh); + fh_put(&resp->cstate.save_fh); + BUG_ON(resp->cstate.replay_owner); out: nfsd4_release_compoundargs(args); + /* Reset deferral mechanism for RPC deferrals */ + rqstp->rq_usedeferral = 1; dprintk("nfsv4 compound returned %d\n", ntohl(status)); return status; } -static struct nfsd4_operation nfsd4_ops[OP_RELEASE_LOCKOWNER+1] = { +static struct nfsd4_operation nfsd4_ops[] = { [OP_ACCESS] = { .op_func = (nfsd4op_func)nfsd4_access, .op_name = "OP_ACCESS", @@ -1045,7 +1121,7 @@ static struct nfsd4_operation nfsd4_ops[OP_RELEASE_LOCKOWNER+1] = { .op_name = "OP_PUTFH", }, [OP_PUTPUBFH] = { - /* unsupported, just for future reference: */ + .op_func = (nfsd4op_func)nfsd4_putrootfh, .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, .op_name = "OP_PUTPUBFH", }, @@ -1119,6 +1195,28 @@ static struct nfsd4_operation nfsd4_ops[OP_RELEASE_LOCKOWNER+1] = { .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, .op_name = "OP_RELEASE_LOCKOWNER", }, + + /* NFSv4.1 operations */ + [OP_EXCHANGE_ID] = { + .op_func = (nfsd4op_func)nfsd4_exchange_id, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, + .op_name = "OP_EXCHANGE_ID", + }, + [OP_CREATE_SESSION] = { + .op_func = (nfsd4op_func)nfsd4_create_session, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, + .op_name = "OP_CREATE_SESSION", + }, + [OP_DESTROY_SESSION] = { + .op_func = (nfsd4op_func)nfsd4_destroy_session, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, + .op_name = "OP_DESTROY_SESSION", + }, + [OP_SEQUENCE] = { + .op_func = (nfsd4op_func)nfsd4_sequence, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, + .op_name = "OP_SEQUENCE", + }, }; static const char *nfsd4_op_name(unsigned opnum) diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 74f7b67567fd..3444c0052a87 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -182,36 +182,26 @@ out_unlock: typedef int (recdir_func)(struct dentry *, struct dentry *); -struct dentry_list { - struct dentry *dentry; +struct name_list { + char name[HEXDIR_LEN]; struct list_head list; }; -struct dentry_list_arg { - struct list_head dentries; - struct dentry *parent; -}; - static int -nfsd4_build_dentrylist(void *arg, const char *name, int namlen, +nfsd4_build_namelist(void *arg, const char *name, int namlen, loff_t offset, u64 ino, unsigned int d_type) { - struct dentry_list_arg *dla = arg; - struct list_head *dentries = &dla->dentries; - struct dentry *parent = dla->parent; - struct dentry *dentry; - struct dentry_list *child; + struct list_head *names = arg; + struct name_list *entry; - if (name && isdotent(name, namlen)) + if (namlen != HEXDIR_LEN - 1) return 0; - dentry = lookup_one_len(name, parent, namlen); - if (IS_ERR(dentry)) - return PTR_ERR(dentry); - child = kmalloc(sizeof(*child), GFP_KERNEL); - if (child == NULL) + entry = kmalloc(sizeof(struct name_list), GFP_KERNEL); + if (entry == NULL) return -ENOMEM; - child->dentry = dentry; - list_add(&child->list, dentries); + memcpy(entry->name, name, HEXDIR_LEN - 1); + entry->name[HEXDIR_LEN - 1] = '\0'; + list_add(&entry->list, names); return 0; } @@ -220,11 +210,9 @@ nfsd4_list_rec_dir(struct dentry *dir, recdir_func *f) { const struct cred *original_cred; struct file *filp; - struct dentry_list_arg dla = { - .parent = dir, - }; - struct list_head *dentries = &dla.dentries; - struct dentry_list *child; + LIST_HEAD(names); + struct name_list *entry; + struct dentry *dentry; int status; if (!rec_dir_init) @@ -233,31 +221,34 @@ nfsd4_list_rec_dir(struct dentry *dir, recdir_func *f) status = nfs4_save_creds(&original_cred); if (status < 0) return status; - INIT_LIST_HEAD(dentries); filp = dentry_open(dget(dir), mntget(rec_dir.mnt), O_RDONLY, current_cred()); status = PTR_ERR(filp); if (IS_ERR(filp)) goto out; - INIT_LIST_HEAD(dentries); - status = vfs_readdir(filp, nfsd4_build_dentrylist, &dla); + status = vfs_readdir(filp, nfsd4_build_namelist, &names); fput(filp); - while (!list_empty(dentries)) { - child = list_entry(dentries->next, struct dentry_list, list); - status = f(dir, child->dentry); + while (!list_empty(&names)) { + entry = list_entry(names.next, struct name_list, list); + + dentry = lookup_one_len(entry->name, dir, HEXDIR_LEN-1); + if (IS_ERR(dentry)) { + status = PTR_ERR(dentry); + goto out; + } + status = f(dir, dentry); + dput(dentry); if (status) goto out; - list_del(&child->list); - dput(child->dentry); - kfree(child); + list_del(&entry->list); + kfree(entry); } out: - while (!list_empty(dentries)) { - child = list_entry(dentries->next, struct dentry_list, list); - list_del(&child->list); - dput(child->dentry); - kfree(child); + while (!list_empty(&names)) { + entry = list_entry(names.next, struct name_list, list); + list_del(&entry->list); + kfree(entry); } nfs4_reset_creds(original_cred); return status; @@ -353,7 +344,8 @@ purge_old(struct dentry *parent, struct dentry *child) { int status; - if (nfs4_has_reclaimed_state(child->d_name.name)) + /* note: we currently use this path only for minorversion 0 */ + if (nfs4_has_reclaimed_state(child->d_name.name, false)) return 0; status = nfsd4_clear_clid_dir(parent, child); diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index b6f60f48e94b..c65a27b76a9d 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -68,6 +68,7 @@ static u32 current_delegid = 1; static u32 nfs4_init; static stateid_t zerostateid; /* bits all 0 */ static stateid_t onestateid; /* bits all 1 */ +static u64 current_sessionid = 1; #define ZERO_STATEID(stateid) (!memcmp((stateid), &zerostateid, sizeof(stateid_t))) #define ONE_STATEID(stateid) (!memcmp((stateid), &onestateid, sizeof(stateid_t))) @@ -75,18 +76,21 @@ static stateid_t onestateid; /* bits all 1 */ /* forward declarations */ static struct nfs4_stateid * find_stateid(stateid_t *stid, int flags); static struct nfs4_delegation * find_delegation_stateid(struct inode *ino, stateid_t *stid); -static void release_stateid_lockowners(struct nfs4_stateid *open_stp); static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery"; static void nfs4_set_recdir(char *recdir); -/* Locking: - * - * client_mutex: - * protects clientid_hashtbl[], clientstr_hashtbl[], - * unconfstr_hashtbl[], uncofid_hashtbl[]. - */ +/* Locking: */ + +/* Currently used for almost all code touching nfsv4 state: */ static DEFINE_MUTEX(client_mutex); +/* + * Currently used for the del_recall_lru and file hash table. In an + * effort to decrease the scope of the client_mutex, this spinlock may + * eventually cover more: + */ +static DEFINE_SPINLOCK(recall_lock); + static struct kmem_cache *stateowner_slab = NULL; static struct kmem_cache *file_slab = NULL; static struct kmem_cache *stateid_slab = NULL; @@ -117,37 +121,23 @@ opaque_hashval(const void *ptr, int nbytes) return x; } -/* forward declarations */ -static void release_stateowner(struct nfs4_stateowner *sop); -static void release_stateid(struct nfs4_stateid *stp, int flags); - -/* - * Delegation state - */ - -/* recall_lock protects the del_recall_lru */ -static DEFINE_SPINLOCK(recall_lock); static struct list_head del_recall_lru; -static void -free_nfs4_file(struct kref *kref) -{ - struct nfs4_file *fp = container_of(kref, struct nfs4_file, fi_ref); - list_del(&fp->fi_hash); - iput(fp->fi_inode); - kmem_cache_free(file_slab, fp); -} - static inline void put_nfs4_file(struct nfs4_file *fi) { - kref_put(&fi->fi_ref, free_nfs4_file); + if (atomic_dec_and_lock(&fi->fi_ref, &recall_lock)) { + list_del(&fi->fi_hash); + spin_unlock(&recall_lock); + iput(fi->fi_inode); + kmem_cache_free(file_slab, fi); + } } static inline void get_nfs4_file(struct nfs4_file *fi) { - kref_get(&fi->fi_ref); + atomic_inc(&fi->fi_ref); } static int num_delegations; @@ -220,9 +210,7 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f dp->dl_stateid.si_stateownerid = current_delegid++; dp->dl_stateid.si_fileid = 0; dp->dl_stateid.si_generation = 0; - dp->dl_fhlen = current_fh->fh_handle.fh_size; - memcpy(dp->dl_fhval, ¤t_fh->fh_handle.fh_base, - current_fh->fh_handle.fh_size); + fh_copy_shallow(&dp->dl_fh, ¤t_fh->fh_handle); dp->dl_time = 0; atomic_set(&dp->dl_count, 1); list_add(&dp->dl_perfile, &fp->fi_delegations); @@ -311,6 +299,291 @@ static struct list_head unconf_id_hashtbl[CLIENT_HASH_SIZE]; static struct list_head client_lru; static struct list_head close_lru; +static void unhash_generic_stateid(struct nfs4_stateid *stp) +{ + list_del(&stp->st_hash); + list_del(&stp->st_perfile); + list_del(&stp->st_perstateowner); +} + +static void free_generic_stateid(struct nfs4_stateid *stp) +{ + put_nfs4_file(stp->st_file); + kmem_cache_free(stateid_slab, stp); +} + +static void release_lock_stateid(struct nfs4_stateid *stp) +{ + unhash_generic_stateid(stp); + locks_remove_posix(stp->st_vfs_file, (fl_owner_t)stp->st_stateowner); + free_generic_stateid(stp); +} + +static void unhash_lockowner(struct nfs4_stateowner *sop) +{ + struct nfs4_stateid *stp; + + list_del(&sop->so_idhash); + list_del(&sop->so_strhash); + list_del(&sop->so_perstateid); + while (!list_empty(&sop->so_stateids)) { + stp = list_first_entry(&sop->so_stateids, + struct nfs4_stateid, st_perstateowner); + release_lock_stateid(stp); + } +} + +static void release_lockowner(struct nfs4_stateowner *sop) +{ + unhash_lockowner(sop); + nfs4_put_stateowner(sop); +} + +static void +release_stateid_lockowners(struct nfs4_stateid *open_stp) +{ + struct nfs4_stateowner *lock_sop; + + while (!list_empty(&open_stp->st_lockowners)) { + lock_sop = list_entry(open_stp->st_lockowners.next, + struct nfs4_stateowner, so_perstateid); + /* list_del(&open_stp->st_lockowners); */ + BUG_ON(lock_sop->so_is_open_owner); + release_lockowner(lock_sop); + } +} + +static void release_open_stateid(struct nfs4_stateid *stp) +{ + unhash_generic_stateid(stp); + release_stateid_lockowners(stp); + nfsd_close(stp->st_vfs_file); + free_generic_stateid(stp); +} + +static void unhash_openowner(struct nfs4_stateowner *sop) +{ + struct nfs4_stateid *stp; + + list_del(&sop->so_idhash); + list_del(&sop->so_strhash); + list_del(&sop->so_perclient); + list_del(&sop->so_perstateid); /* XXX: necessary? */ + while (!list_empty(&sop->so_stateids)) { + stp = list_first_entry(&sop->so_stateids, + struct nfs4_stateid, st_perstateowner); + release_open_stateid(stp); + } +} + +static void release_openowner(struct nfs4_stateowner *sop) +{ + unhash_openowner(sop); + list_del(&sop->so_close_lru); + nfs4_put_stateowner(sop); +} + +static DEFINE_SPINLOCK(sessionid_lock); +#define SESSION_HASH_SIZE 512 +static struct list_head sessionid_hashtbl[SESSION_HASH_SIZE]; + +static inline int +hash_sessionid(struct nfs4_sessionid *sessionid) +{ + struct nfsd4_sessionid *sid = (struct nfsd4_sessionid *)sessionid; + + return sid->sequence % SESSION_HASH_SIZE; +} + +static inline void +dump_sessionid(const char *fn, struct nfs4_sessionid *sessionid) +{ + u32 *ptr = (u32 *)(&sessionid->data[0]); + dprintk("%s: %u:%u:%u:%u\n", fn, ptr[0], ptr[1], ptr[2], ptr[3]); +} + +static void +gen_sessionid(struct nfsd4_session *ses) +{ + struct nfs4_client *clp = ses->se_client; + struct nfsd4_sessionid *sid; + + sid = (struct nfsd4_sessionid *)ses->se_sessionid.data; + sid->clientid = clp->cl_clientid; + sid->sequence = current_sessionid++; + sid->reserved = 0; +} + +/* + * Give the client the number of slots it requests bound by + * NFSD_MAX_SLOTS_PER_SESSION and by sv_drc_max_pages. + * + * If we run out of pages (sv_drc_pages_used == sv_drc_max_pages) we + * should (up to a point) re-negotiate active sessions and reduce their + * slot usage to make rooom for new connections. For now we just fail the + * create session. + */ +static int set_forechannel_maxreqs(struct nfsd4_channel_attrs *fchan) +{ + int status = 0, np = fchan->maxreqs * NFSD_PAGES_PER_SLOT; + + spin_lock(&nfsd_serv->sv_lock); + if (np + nfsd_serv->sv_drc_pages_used > nfsd_serv->sv_drc_max_pages) + np = nfsd_serv->sv_drc_max_pages - nfsd_serv->sv_drc_pages_used; + nfsd_serv->sv_drc_pages_used += np; + spin_unlock(&nfsd_serv->sv_lock); + + if (np <= 0) { + status = nfserr_resource; + fchan->maxreqs = 0; + } else + fchan->maxreqs = np / NFSD_PAGES_PER_SLOT; + + return status; +} + +/* + * fchan holds the client values on input, and the server values on output + */ +static int init_forechannel_attrs(struct svc_rqst *rqstp, + struct nfsd4_session *session, + struct nfsd4_channel_attrs *fchan) +{ + int status = 0; + __u32 maxcount = svc_max_payload(rqstp); + + /* headerpadsz set to zero in encode routine */ + + /* Use the client's max request and max response size if possible */ + if (fchan->maxreq_sz > maxcount) + fchan->maxreq_sz = maxcount; + session->se_fmaxreq_sz = fchan->maxreq_sz; + + if (fchan->maxresp_sz > maxcount) + fchan->maxresp_sz = maxcount; + session->se_fmaxresp_sz = fchan->maxresp_sz; + + /* Set the max response cached size our default which is + * a multiple of PAGE_SIZE and small */ + session->se_fmaxresp_cached = NFSD_PAGES_PER_SLOT * PAGE_SIZE; + fchan->maxresp_cached = session->se_fmaxresp_cached; + + /* Use the client's maxops if possible */ + if (fchan->maxops > NFSD_MAX_OPS_PER_COMPOUND) + fchan->maxops = NFSD_MAX_OPS_PER_COMPOUND; + session->se_fmaxops = fchan->maxops; + + /* try to use the client requested number of slots */ + if (fchan->maxreqs > NFSD_MAX_SLOTS_PER_SESSION) + fchan->maxreqs = NFSD_MAX_SLOTS_PER_SESSION; + + /* FIXME: Error means no more DRC pages so the server should + * recover pages from existing sessions. For now fail session + * creation. + */ + status = set_forechannel_maxreqs(fchan); + + session->se_fnumslots = fchan->maxreqs; + return status; +} + +static int +alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp, + struct nfsd4_create_session *cses) +{ + struct nfsd4_session *new, tmp; + int idx, status = nfserr_resource, slotsize; + + memset(&tmp, 0, sizeof(tmp)); + + /* FIXME: For now, we just accept the client back channel attributes. */ + status = init_forechannel_attrs(rqstp, &tmp, &cses->fore_channel); + if (status) + goto out; + + /* allocate struct nfsd4_session and slot table in one piece */ + slotsize = tmp.se_fnumslots * sizeof(struct nfsd4_slot); + new = kzalloc(sizeof(*new) + slotsize, GFP_KERNEL); + if (!new) + goto out; + + memcpy(new, &tmp, sizeof(*new)); + + new->se_client = clp; + gen_sessionid(new); + idx = hash_sessionid(&new->se_sessionid); + memcpy(clp->cl_sessionid.data, new->se_sessionid.data, + NFS4_MAX_SESSIONID_LEN); + + new->se_flags = cses->flags; + kref_init(&new->se_ref); + spin_lock(&sessionid_lock); + list_add(&new->se_hash, &sessionid_hashtbl[idx]); + list_add(&new->se_perclnt, &clp->cl_sessions); + spin_unlock(&sessionid_lock); + + status = nfs_ok; +out: + return status; +} + +/* caller must hold sessionid_lock */ +static struct nfsd4_session * +find_in_sessionid_hashtbl(struct nfs4_sessionid *sessionid) +{ + struct nfsd4_session *elem; + int idx; + + dump_sessionid(__func__, sessionid); + idx = hash_sessionid(sessionid); + dprintk("%s: idx is %d\n", __func__, idx); + /* Search in the appropriate list */ + list_for_each_entry(elem, &sessionid_hashtbl[idx], se_hash) { + dump_sessionid("list traversal", &elem->se_sessionid); + if (!memcmp(elem->se_sessionid.data, sessionid->data, + NFS4_MAX_SESSIONID_LEN)) { + return elem; + } + } + + dprintk("%s: session not found\n", __func__); + return NULL; +} + +/* caller must hold sessionid_lock */ +static void +unhash_session(struct nfsd4_session *ses) +{ + list_del(&ses->se_hash); + list_del(&ses->se_perclnt); +} + +static void +release_session(struct nfsd4_session *ses) +{ + spin_lock(&sessionid_lock); + unhash_session(ses); + spin_unlock(&sessionid_lock); + nfsd4_put_session(ses); +} + +static void nfsd4_release_respages(struct page **respages, short resused); + +void +free_session(struct kref *kref) +{ + struct nfsd4_session *ses; + int i; + + ses = container_of(kref, struct nfsd4_session, se_ref); + for (i = 0; i < ses->se_fnumslots; i++) { + struct nfsd4_cache_entry *e = &ses->se_slots[i].sl_cache_entry; + nfsd4_release_respages(e->ce_respages, e->ce_resused); + } + kfree(ses->se_slots); + kfree(ses); +} + static inline void renew_client(struct nfs4_client *clp) { @@ -330,8 +603,8 @@ STALE_CLIENTID(clientid_t *clid) { if (clid->cl_boot == boot_time) return 0; - dprintk("NFSD stale clientid (%08x/%08x)\n", - clid->cl_boot, clid->cl_id); + dprintk("NFSD stale clientid (%08x/%08x) boot_time %08lx\n", + clid->cl_boot, clid->cl_id, boot_time); return 1; } @@ -376,6 +649,8 @@ static inline void free_client(struct nfs4_client *clp) { shutdown_callback_client(clp); + nfsd4_release_respages(clp->cl_slot.sl_cache_entry.ce_respages, + clp->cl_slot.sl_cache_entry.ce_resused); if (clp->cl_cred.cr_group_info) put_group_info(clp->cl_cred.cr_group_info); kfree(clp->cl_principal); @@ -420,7 +695,13 @@ expire_client(struct nfs4_client *clp) list_del(&clp->cl_lru); while (!list_empty(&clp->cl_openowners)) { sop = list_entry(clp->cl_openowners.next, struct nfs4_stateowner, so_perclient); - release_stateowner(sop); + release_openowner(sop); + } + while (!list_empty(&clp->cl_sessions)) { + struct nfsd4_session *ses; + ses = list_entry(clp->cl_sessions.next, struct nfsd4_session, + se_perclnt); + release_session(ses); } put_nfs4_client(clp); } @@ -439,6 +720,7 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir) INIT_LIST_HEAD(&clp->cl_strhash); INIT_LIST_HEAD(&clp->cl_openowners); INIT_LIST_HEAD(&clp->cl_delegations); + INIT_LIST_HEAD(&clp->cl_sessions); INIT_LIST_HEAD(&clp->cl_lru); return clp; } @@ -568,25 +850,45 @@ find_unconfirmed_client(clientid_t *clid) return NULL; } +/* + * Return 1 iff clp's clientid establishment method matches the use_exchange_id + * parameter. Matching is based on the fact the at least one of the + * EXCHGID4_FLAG_USE_{NON_PNFS,PNFS_MDS,PNFS_DS} flags must be set for v4.1 + * + * FIXME: we need to unify the clientid namespaces for nfsv4.x + * and correctly deal with client upgrade/downgrade in EXCHANGE_ID + * and SET_CLIENTID{,_CONFIRM} + */ +static inline int +match_clientid_establishment(struct nfs4_client *clp, bool use_exchange_id) +{ + bool has_exchange_flags = (clp->cl_exchange_flags != 0); + return use_exchange_id == has_exchange_flags; +} + static struct nfs4_client * -find_confirmed_client_by_str(const char *dname, unsigned int hashval) +find_confirmed_client_by_str(const char *dname, unsigned int hashval, + bool use_exchange_id) { struct nfs4_client *clp; list_for_each_entry(clp, &conf_str_hashtbl[hashval], cl_strhash) { - if (same_name(clp->cl_recdir, dname)) + if (same_name(clp->cl_recdir, dname) && + match_clientid_establishment(clp, use_exchange_id)) return clp; } return NULL; } static struct nfs4_client * -find_unconfirmed_client_by_str(const char *dname, unsigned int hashval) +find_unconfirmed_client_by_str(const char *dname, unsigned int hashval, + bool use_exchange_id) { struct nfs4_client *clp; list_for_each_entry(clp, &unconf_str_hashtbl[hashval], cl_strhash) { - if (same_name(clp->cl_recdir, dname)) + if (same_name(clp->cl_recdir, dname) && + match_clientid_establishment(clp, use_exchange_id)) return clp; } return NULL; @@ -685,6 +987,534 @@ out_err: return; } +void +nfsd4_set_statp(struct svc_rqst *rqstp, __be32 *statp) +{ + struct nfsd4_compoundres *resp = rqstp->rq_resp; + + resp->cstate.statp = statp; +} + +/* + * Dereference the result pages. + */ +static void +nfsd4_release_respages(struct page **respages, short resused) +{ + int i; + + dprintk("--> %s\n", __func__); + for (i = 0; i < resused; i++) { + if (!respages[i]) + continue; + put_page(respages[i]); + respages[i] = NULL; + } +} + +static void +nfsd4_copy_pages(struct page **topages, struct page **frompages, short count) +{ + int i; + + for (i = 0; i < count; i++) { + topages[i] = frompages[i]; + if (!topages[i]) + continue; + get_page(topages[i]); + } +} + +/* + * Cache the reply pages up to NFSD_PAGES_PER_SLOT + 1, clearing the previous + * pages. We add a page to NFSD_PAGES_PER_SLOT for the case where the total + * length of the XDR response is less than se_fmaxresp_cached + * (NFSD_PAGES_PER_SLOT * PAGE_SIZE) but the xdr_buf pages is used for a + * of the reply (e.g. readdir). + * + * Store the base and length of the rq_req.head[0] page + * of the NFSv4.1 data, just past the rpc header. + */ +void +nfsd4_store_cache_entry(struct nfsd4_compoundres *resp) +{ + struct nfsd4_cache_entry *entry = &resp->cstate.slot->sl_cache_entry; + struct svc_rqst *rqstp = resp->rqstp; + struct nfsd4_compoundargs *args = rqstp->rq_argp; + struct nfsd4_op *op = &args->ops[resp->opcnt]; + struct kvec *resv = &rqstp->rq_res.head[0]; + + dprintk("--> %s entry %p\n", __func__, entry); + + /* Don't cache a failed OP_SEQUENCE. */ + if (resp->opcnt == 1 && op->opnum == OP_SEQUENCE && resp->cstate.status) + return; + + nfsd4_release_respages(entry->ce_respages, entry->ce_resused); + entry->ce_opcnt = resp->opcnt; + entry->ce_status = resp->cstate.status; + + /* + * Don't need a page to cache just the sequence operation - the slot + * does this for us! + */ + + if (nfsd4_not_cached(resp)) { + entry->ce_resused = 0; + entry->ce_rpchdrlen = 0; + dprintk("%s Just cache SEQUENCE. ce_cachethis %d\n", __func__, + resp->cstate.slot->sl_cache_entry.ce_cachethis); + return; + } + entry->ce_resused = rqstp->rq_resused; + if (entry->ce_resused > NFSD_PAGES_PER_SLOT + 1) + entry->ce_resused = NFSD_PAGES_PER_SLOT + 1; + nfsd4_copy_pages(entry->ce_respages, rqstp->rq_respages, + entry->ce_resused); + entry->ce_datav.iov_base = resp->cstate.statp; + entry->ce_datav.iov_len = resv->iov_len - ((char *)resp->cstate.statp - + (char *)page_address(rqstp->rq_respages[0])); + /* Current request rpc header length*/ + entry->ce_rpchdrlen = (char *)resp->cstate.statp - + (char *)page_address(rqstp->rq_respages[0]); +} + +/* + * We keep the rpc header, but take the nfs reply from the replycache. + */ +static int +nfsd41_copy_replay_data(struct nfsd4_compoundres *resp, + struct nfsd4_cache_entry *entry) +{ + struct svc_rqst *rqstp = resp->rqstp; + struct kvec *resv = &resp->rqstp->rq_res.head[0]; + int len; + + /* Current request rpc header length*/ + len = (char *)resp->cstate.statp - + (char *)page_address(rqstp->rq_respages[0]); + if (entry->ce_datav.iov_len + len > PAGE_SIZE) { + dprintk("%s v41 cached reply too large (%Zd).\n", __func__, + entry->ce_datav.iov_len); + return 0; + } + /* copy the cached reply nfsd data past the current rpc header */ + memcpy((char *)resv->iov_base + len, entry->ce_datav.iov_base, + entry->ce_datav.iov_len); + resv->iov_len = len + entry->ce_datav.iov_len; + return 1; +} + +/* + * Keep the first page of the replay. Copy the NFSv4.1 data from the first + * cached page. Replace any futher replay pages from the cache. + */ +__be32 +nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp, + struct nfsd4_sequence *seq) +{ + struct nfsd4_cache_entry *entry = &resp->cstate.slot->sl_cache_entry; + __be32 status; + + dprintk("--> %s entry %p\n", __func__, entry); + + /* + * If this is just the sequence operation, we did not keep + * a page in the cache entry because we can just use the + * slot info stored in struct nfsd4_sequence that was checked + * against the slot in nfsd4_sequence(). + * + * This occurs when seq->cachethis is FALSE, or when the client + * session inactivity timer fires and a solo sequence operation + * is sent (lease renewal). + */ + if (seq && nfsd4_not_cached(resp)) { + seq->maxslots = resp->cstate.session->se_fnumslots; + return nfs_ok; + } + + if (!nfsd41_copy_replay_data(resp, entry)) { + /* + * Not enough room to use the replay rpc header, send the + * cached header. Release all the allocated result pages. + */ + svc_free_res_pages(resp->rqstp); + nfsd4_copy_pages(resp->rqstp->rq_respages, entry->ce_respages, + entry->ce_resused); + } else { + /* Release all but the first allocated result page */ + + resp->rqstp->rq_resused--; + svc_free_res_pages(resp->rqstp); + + nfsd4_copy_pages(&resp->rqstp->rq_respages[1], + &entry->ce_respages[1], + entry->ce_resused - 1); + } + + resp->rqstp->rq_resused = entry->ce_resused; + resp->opcnt = entry->ce_opcnt; + resp->cstate.iovlen = entry->ce_datav.iov_len + entry->ce_rpchdrlen; + status = entry->ce_status; + + return status; +} + +/* + * Set the exchange_id flags returned by the server. + */ +static void +nfsd4_set_ex_flags(struct nfs4_client *new, struct nfsd4_exchange_id *clid) +{ + /* pNFS is not supported */ + new->cl_exchange_flags |= EXCHGID4_FLAG_USE_NON_PNFS; + + /* Referrals are supported, Migration is not. */ + new->cl_exchange_flags |= EXCHGID4_FLAG_SUPP_MOVED_REFER; + + /* set the wire flags to return to client. */ + clid->flags = new->cl_exchange_flags; +} + +__be32 +nfsd4_exchange_id(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + struct nfsd4_exchange_id *exid) +{ + struct nfs4_client *unconf, *conf, *new; + int status; + unsigned int strhashval; + char dname[HEXDIR_LEN]; + nfs4_verifier verf = exid->verifier; + u32 ip_addr = svc_addr_in(rqstp)->sin_addr.s_addr; + + dprintk("%s rqstp=%p exid=%p clname.len=%u clname.data=%p " + " ip_addr=%u flags %x, spa_how %d\n", + __func__, rqstp, exid, exid->clname.len, exid->clname.data, + ip_addr, exid->flags, exid->spa_how); + + if (!check_name(exid->clname) || (exid->flags & ~EXCHGID4_FLAG_MASK_A)) + return nfserr_inval; + + /* Currently only support SP4_NONE */ + switch (exid->spa_how) { + case SP4_NONE: + break; + case SP4_SSV: + return nfserr_encr_alg_unsupp; + default: + BUG(); /* checked by xdr code */ + case SP4_MACH_CRED: + return nfserr_serverfault; /* no excuse :-/ */ + } + + status = nfs4_make_rec_clidname(dname, &exid->clname); + + if (status) + goto error; + + strhashval = clientstr_hashval(dname); + + nfs4_lock_state(); + status = nfs_ok; + + conf = find_confirmed_client_by_str(dname, strhashval, true); + if (conf) { + if (!same_verf(&verf, &conf->cl_verifier)) { + /* 18.35.4 case 8 */ + if (exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A) { + status = nfserr_not_same; + goto out; + } + /* Client reboot: destroy old state */ + expire_client(conf); + goto out_new; + } + if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)) { + /* 18.35.4 case 9 */ + if (exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A) { + status = nfserr_perm; + goto out; + } + expire_client(conf); + goto out_new; + } + if (ip_addr != conf->cl_addr && + !(exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A)) { + /* Client collision. 18.35.4 case 3 */ + status = nfserr_clid_inuse; + goto out; + } + /* + * Set bit when the owner id and verifier map to an already + * confirmed client id (18.35.3). + */ + exid->flags |= EXCHGID4_FLAG_CONFIRMED_R; + + /* + * Falling into 18.35.4 case 2, possible router replay. + * Leave confirmed record intact and return same result. + */ + copy_verf(conf, &verf); + new = conf; + goto out_copy; + } else { + /* 18.35.4 case 7 */ + if (exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A) { + status = nfserr_noent; + goto out; + } + } + + unconf = find_unconfirmed_client_by_str(dname, strhashval, true); + if (unconf) { + /* + * Possible retry or client restart. Per 18.35.4 case 4, + * a new unconfirmed record should be generated regardless + * of whether any properties have changed. + */ + expire_client(unconf); + } + +out_new: + /* Normal case */ + new = create_client(exid->clname, dname); + if (new == NULL) { + status = nfserr_resource; + goto out; + } + + copy_verf(new, &verf); + copy_cred(&new->cl_cred, &rqstp->rq_cred); + new->cl_addr = ip_addr; + gen_clid(new); + gen_confirm(new); + add_to_unconfirmed(new, strhashval); +out_copy: + exid->clientid.cl_boot = new->cl_clientid.cl_boot; + exid->clientid.cl_id = new->cl_clientid.cl_id; + + new->cl_slot.sl_seqid = 0; + exid->seqid = 1; + nfsd4_set_ex_flags(new, exid); + + dprintk("nfsd4_exchange_id seqid %d flags %x\n", + new->cl_slot.sl_seqid, new->cl_exchange_flags); + status = nfs_ok; + +out: + nfs4_unlock_state(); +error: + dprintk("nfsd4_exchange_id returns %d\n", ntohl(status)); + return status; +} + +static int +check_slot_seqid(u32 seqid, struct nfsd4_slot *slot) +{ + dprintk("%s enter. seqid %d slot->sl_seqid %d\n", __func__, seqid, + slot->sl_seqid); + + /* The slot is in use, and no response has been sent. */ + if (slot->sl_inuse) { + if (seqid == slot->sl_seqid) + return nfserr_jukebox; + else + return nfserr_seq_misordered; + } + /* Normal */ + if (likely(seqid == slot->sl_seqid + 1)) + return nfs_ok; + /* Replay */ + if (seqid == slot->sl_seqid) + return nfserr_replay_cache; + /* Wraparound */ + if (seqid == 1 && (slot->sl_seqid + 1) == 0) + return nfs_ok; + /* Misordered replay or misordered new request */ + return nfserr_seq_misordered; +} + +__be32 +nfsd4_create_session(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + struct nfsd4_create_session *cr_ses) +{ + u32 ip_addr = svc_addr_in(rqstp)->sin_addr.s_addr; + struct nfsd4_compoundres *resp = rqstp->rq_resp; + struct nfs4_client *conf, *unconf; + struct nfsd4_slot *slot = NULL; + int status = 0; + + nfs4_lock_state(); + unconf = find_unconfirmed_client(&cr_ses->clientid); + conf = find_confirmed_client(&cr_ses->clientid); + + if (conf) { + slot = &conf->cl_slot; + status = check_slot_seqid(cr_ses->seqid, slot); + if (status == nfserr_replay_cache) { + dprintk("Got a create_session replay! seqid= %d\n", + slot->sl_seqid); + cstate->slot = slot; + cstate->status = status; + /* Return the cached reply status */ + status = nfsd4_replay_cache_entry(resp, NULL); + goto out; + } else if (cr_ses->seqid != conf->cl_slot.sl_seqid + 1) { + status = nfserr_seq_misordered; + dprintk("Sequence misordered!\n"); + dprintk("Expected seqid= %d but got seqid= %d\n", + slot->sl_seqid, cr_ses->seqid); + goto out; + } + conf->cl_slot.sl_seqid++; + } else if (unconf) { + if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred) || + (ip_addr != unconf->cl_addr)) { + status = nfserr_clid_inuse; + goto out; + } + + slot = &unconf->cl_slot; + status = check_slot_seqid(cr_ses->seqid, slot); + if (status) { + /* an unconfirmed replay returns misordered */ + status = nfserr_seq_misordered; + goto out; + } + + slot->sl_seqid++; /* from 0 to 1 */ + move_to_confirmed(unconf); + + /* + * We do not support RDMA or persistent sessions + */ + cr_ses->flags &= ~SESSION4_PERSIST; + cr_ses->flags &= ~SESSION4_RDMA; + + conf = unconf; + } else { + status = nfserr_stale_clientid; + goto out; + } + + status = alloc_init_session(rqstp, conf, cr_ses); + if (status) + goto out; + + memcpy(cr_ses->sessionid.data, conf->cl_sessionid.data, + NFS4_MAX_SESSIONID_LEN); + cr_ses->seqid = slot->sl_seqid; + + slot->sl_inuse = true; + cstate->slot = slot; + /* Ensure a page is used for the cache */ + slot->sl_cache_entry.ce_cachethis = 1; +out: + nfs4_unlock_state(); + dprintk("%s returns %d\n", __func__, ntohl(status)); + return status; +} + +__be32 +nfsd4_destroy_session(struct svc_rqst *r, + struct nfsd4_compound_state *cstate, + struct nfsd4_destroy_session *sessionid) +{ + struct nfsd4_session *ses; + u32 status = nfserr_badsession; + + /* Notes: + * - The confirmed nfs4_client->cl_sessionid holds destroyed sessinid + * - Should we return nfserr_back_chan_busy if waiting for + * callbacks on to-be-destroyed session? + * - Do we need to clear any callback info from previous session? + */ + + dump_sessionid(__func__, &sessionid->sessionid); + spin_lock(&sessionid_lock); + ses = find_in_sessionid_hashtbl(&sessionid->sessionid); + if (!ses) { + spin_unlock(&sessionid_lock); + goto out; + } + + unhash_session(ses); + spin_unlock(&sessionid_lock); + + /* wait for callbacks */ + shutdown_callback_client(ses->se_client); + nfsd4_put_session(ses); + status = nfs_ok; +out: + dprintk("%s returns %d\n", __func__, ntohl(status)); + return status; +} + +__be32 +nfsd4_sequence(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + struct nfsd4_sequence *seq) +{ + struct nfsd4_compoundres *resp = rqstp->rq_resp; + struct nfsd4_session *session; + struct nfsd4_slot *slot; + int status; + + if (resp->opcnt != 1) + return nfserr_sequence_pos; + + spin_lock(&sessionid_lock); + status = nfserr_badsession; + session = find_in_sessionid_hashtbl(&seq->sessionid); + if (!session) + goto out; + + status = nfserr_badslot; + if (seq->slotid >= session->se_fnumslots) + goto out; + + slot = &session->se_slots[seq->slotid]; + dprintk("%s: slotid %d\n", __func__, seq->slotid); + + status = check_slot_seqid(seq->seqid, slot); + if (status == nfserr_replay_cache) { + cstate->slot = slot; + cstate->session = session; + /* Return the cached reply status and set cstate->status + * for nfsd4_svc_encode_compoundres processing */ + status = nfsd4_replay_cache_entry(resp, seq); + cstate->status = nfserr_replay_cache; + goto replay_cache; + } + if (status) + goto out; + + /* Success! bump slot seqid */ + slot->sl_inuse = true; + slot->sl_seqid = seq->seqid; + slot->sl_cache_entry.ce_cachethis = seq->cachethis; + /* Always set the cache entry cachethis for solo sequence */ + if (nfsd4_is_solo_sequence(resp)) + slot->sl_cache_entry.ce_cachethis = 1; + + cstate->slot = slot; + cstate->session = session; + +replay_cache: + /* Renew the clientid on success and on replay. + * Hold a session reference until done processing the compound: + * nfsd4_put_session called only if the cstate slot is set. + */ + renew_client(session->se_client); + nfsd4_get_session(session); +out: + spin_unlock(&sessionid_lock); + dprintk("%s: return %d\n", __func__, ntohl(status)); + return status; +} + __be32 nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_setclientid *setclid) @@ -716,14 +1546,13 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, strhashval = clientstr_hashval(dname); nfs4_lock_state(); - conf = find_confirmed_client_by_str(dname, strhashval); + conf = find_confirmed_client_by_str(dname, strhashval, false); if (conf) { /* RFC 3530 14.2.33 CASE 0: */ status = nfserr_clid_inuse; - if (!same_creds(&conf->cl_cred, &rqstp->rq_cred) - || conf->cl_addr != sin->sin_addr.s_addr) { - dprintk("NFSD: setclientid: string in use by clientat %pI4\n", - &conf->cl_addr); + if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)) { + dprintk("NFSD: setclientid: string in use by client" + " at %pI4\n", &conf->cl_addr); goto out; } } @@ -732,7 +1561,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, * has a description of SETCLIENTID request processing consisting * of 5 bullet points, labeled as CASE0 - CASE4 below. */ - unconf = find_unconfirmed_client_by_str(dname, strhashval); + unconf = find_unconfirmed_client_by_str(dname, strhashval, false); status = nfserr_resource; if (!conf) { /* @@ -887,7 +1716,7 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp, unsigned int hash = clientstr_hashval(unconf->cl_recdir); conf = find_confirmed_client_by_str(unconf->cl_recdir, - hash); + hash, false); if (conf) { nfsd4_remove_clid_dir(conf); expire_client(conf); @@ -923,11 +1752,13 @@ alloc_init_file(struct inode *ino) fp = kmem_cache_alloc(file_slab, GFP_KERNEL); if (fp) { - kref_init(&fp->fi_ref); + atomic_set(&fp->fi_ref, 1); INIT_LIST_HEAD(&fp->fi_hash); INIT_LIST_HEAD(&fp->fi_stateids); INIT_LIST_HEAD(&fp->fi_delegations); + spin_lock(&recall_lock); list_add(&fp->fi_hash, &file_hashtbl[hashval]); + spin_unlock(&recall_lock); fp->fi_inode = igrab(ino); fp->fi_id = current_fileid++; fp->fi_had_conflict = false; @@ -1037,48 +1868,6 @@ alloc_init_open_stateowner(unsigned int strhashval, struct nfs4_client *clp, str return sop; } -static void -release_stateid_lockowners(struct nfs4_stateid *open_stp) -{ - struct nfs4_stateowner *lock_sop; - - while (!list_empty(&open_stp->st_lockowners)) { - lock_sop = list_entry(open_stp->st_lockowners.next, - struct nfs4_stateowner, so_perstateid); - /* list_del(&open_stp->st_lockowners); */ - BUG_ON(lock_sop->so_is_open_owner); - release_stateowner(lock_sop); - } -} - -static void -unhash_stateowner(struct nfs4_stateowner *sop) -{ - struct nfs4_stateid *stp; - - list_del(&sop->so_idhash); - list_del(&sop->so_strhash); - if (sop->so_is_open_owner) - list_del(&sop->so_perclient); - list_del(&sop->so_perstateid); - while (!list_empty(&sop->so_stateids)) { - stp = list_entry(sop->so_stateids.next, - struct nfs4_stateid, st_perstateowner); - if (sop->so_is_open_owner) - release_stateid(stp, OPEN_STATE); - else - release_stateid(stp, LOCK_STATE); - } -} - -static void -release_stateowner(struct nfs4_stateowner *sop) -{ - unhash_stateowner(sop); - list_del(&sop->so_close_lru); - nfs4_put_stateowner(sop); -} - static inline void init_stateid(struct nfs4_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *open) { struct nfs4_stateowner *sop = open->op_stateowner; @@ -1100,30 +1889,13 @@ init_stateid(struct nfs4_stateid *stp, struct nfs4_file *fp, struct nfsd4_open * stp->st_stateid.si_generation = 0; stp->st_access_bmap = 0; stp->st_deny_bmap = 0; - __set_bit(open->op_share_access, &stp->st_access_bmap); + __set_bit(open->op_share_access & ~NFS4_SHARE_WANT_MASK, + &stp->st_access_bmap); __set_bit(open->op_share_deny, &stp->st_deny_bmap); stp->st_openstp = NULL; } static void -release_stateid(struct nfs4_stateid *stp, int flags) -{ - struct file *filp = stp->st_vfs_file; - - list_del(&stp->st_hash); - list_del(&stp->st_perfile); - list_del(&stp->st_perstateowner); - if (flags & OPEN_STATE) { - release_stateid_lockowners(stp); - stp->st_vfs_file = NULL; - nfsd_close(filp); - } else if (flags & LOCK_STATE) - locks_remove_posix(filp, (fl_owner_t) stp->st_stateowner); - put_nfs4_file(stp->st_file); - kmem_cache_free(stateid_slab, stp); -} - -static void move_to_close_lru(struct nfs4_stateowner *sop) { dprintk("NFSD: move_to_close_lru nfs4_stateowner %p\n", sop); @@ -1160,20 +1932,33 @@ find_file(struct inode *ino) unsigned int hashval = file_hashval(ino); struct nfs4_file *fp; + spin_lock(&recall_lock); list_for_each_entry(fp, &file_hashtbl[hashval], fi_hash) { if (fp->fi_inode == ino) { get_nfs4_file(fp); + spin_unlock(&recall_lock); return fp; } } + spin_unlock(&recall_lock); return NULL; } -static inline int access_valid(u32 x) +static inline int access_valid(u32 x, u32 minorversion) { - if (x < NFS4_SHARE_ACCESS_READ) + if ((x & NFS4_SHARE_ACCESS_MASK) < NFS4_SHARE_ACCESS_READ) return 0; - if (x > NFS4_SHARE_ACCESS_BOTH) + if ((x & NFS4_SHARE_ACCESS_MASK) > NFS4_SHARE_ACCESS_BOTH) + return 0; + x &= ~NFS4_SHARE_ACCESS_MASK; + if (minorversion && x) { + if ((x & NFS4_SHARE_WANT_MASK) > NFS4_SHARE_WANT_CANCEL) + return 0; + if ((x & NFS4_SHARE_WHEN_MASK) > NFS4_SHARE_PUSH_DELEG_WHEN_UNCONTENDED) + return 0; + x &= ~(NFS4_SHARE_WANT_MASK | NFS4_SHARE_WHEN_MASK); + } + if (x) return 0; return 1; } @@ -1409,7 +2194,8 @@ static struct lock_manager_operations nfsd_lease_mng_ops = { __be32 -nfsd4_process_open1(struct nfsd4_open *open) +nfsd4_process_open1(struct nfsd4_compound_state *cstate, + struct nfsd4_open *open) { clientid_t *clientid = &open->op_clientid; struct nfs4_client *clp = NULL; @@ -1432,10 +2218,13 @@ nfsd4_process_open1(struct nfsd4_open *open) return nfserr_expired; goto renew; } + /* When sessions are used, skip open sequenceid processing */ + if (nfsd4_has_session(cstate)) + goto renew; if (!sop->so_confirmed) { /* Replace unconfirmed owners without checking for replay. */ clp = sop->so_client; - release_stateowner(sop); + release_openowner(sop); open->op_stateowner = NULL; goto renew; } @@ -1709,6 +2498,7 @@ out: __be32 nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open) { + struct nfsd4_compoundres *resp = rqstp->rq_resp; struct nfs4_file *fp = NULL; struct inode *ino = current_fh->fh_dentry->d_inode; struct nfs4_stateid *stp = NULL; @@ -1716,7 +2506,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf __be32 status; status = nfserr_inval; - if (!access_valid(open->op_share_access) + if (!access_valid(open->op_share_access, resp->cstate.minorversion) || !deny_valid(open->op_share_deny)) goto out; /* @@ -1764,12 +2554,17 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf init_stateid(stp, fp, open); status = nfsd4_truncate(rqstp, current_fh, open); if (status) { - release_stateid(stp, OPEN_STATE); + release_open_stateid(stp); goto out; } + if (nfsd4_has_session(&resp->cstate)) + update_stateid(&stp->st_stateid); } memcpy(&open->op_stateid, &stp->st_stateid, sizeof(stateid_t)); + if (nfsd4_has_session(&resp->cstate)) + open->op_stateowner->so_confirmed = 1; + /* * Attempt to hand out a delegation. No error return, because the * OPEN succeeds even if we fail. @@ -1790,7 +2585,8 @@ out: * To finish the open response, we just need to set the rflags. */ open->op_rflags = NFS4_OPEN_RESULT_LOCKTYPE_POSIX; - if (!open->op_stateowner->so_confirmed) + if (!open->op_stateowner->so_confirmed && + !nfsd4_has_session(&resp->cstate)) open->op_rflags |= NFS4_OPEN_RESULT_CONFIRM; return status; @@ -1898,7 +2694,7 @@ nfs4_laundromat(void) } dprintk("NFSD: purging unused open stateowner (so_id %d)\n", sop->so_id); - release_stateowner(sop); + release_openowner(sop); } if (clientid_val < NFSD_LAUNDROMAT_MINTIMEOUT) clientid_val = NFSD_LAUNDROMAT_MINTIMEOUT; @@ -1983,10 +2779,7 @@ out: static inline __be32 check_special_stateids(svc_fh *current_fh, stateid_t *stateid, int flags) { - /* Trying to call delegreturn with a special stateid? Yuch: */ - if (!(flags & (RD_STATE | WR_STATE))) - return nfserr_bad_stateid; - else if (ONE_STATEID(stateid) && (flags & RD_STATE)) + if (ONE_STATEID(stateid) && (flags & RD_STATE)) return nfs_ok; else if (locks_in_grace()) { /* Answer in remaining cases depends on existance of @@ -2005,14 +2798,20 @@ check_special_stateids(svc_fh *current_fh, stateid_t *stateid, int flags) * that are not able to provide mandatory locking. */ static inline int -io_during_grace_disallowed(struct inode *inode, int flags) +grace_disallows_io(struct inode *inode) { - return locks_in_grace() && (flags & (RD_STATE | WR_STATE)) - && mandatory_lock(inode); + return locks_in_grace() && mandatory_lock(inode); } -static int check_stateid_generation(stateid_t *in, stateid_t *ref) +static int check_stateid_generation(stateid_t *in, stateid_t *ref, int flags) { + /* + * When sessions are used the stateid generation number is ignored + * when it is zero. + */ + if ((flags & HAS_SESSION) && in->si_generation == 0) + goto out; + /* If the client sends us a stateid from the future, it's buggy: */ if (in->si_generation > ref->si_generation) return nfserr_bad_stateid; @@ -2028,74 +2827,77 @@ static int check_stateid_generation(stateid_t *in, stateid_t *ref) */ if (in->si_generation < ref->si_generation) return nfserr_old_stateid; +out: return nfs_ok; } +static int is_delegation_stateid(stateid_t *stateid) +{ + return stateid->si_fileid == 0; +} + /* * Checks for stateid operations */ __be32 -nfs4_preprocess_stateid_op(struct svc_fh *current_fh, stateid_t *stateid, int flags, struct file **filpp) +nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate, + stateid_t *stateid, int flags, struct file **filpp) { struct nfs4_stateid *stp = NULL; struct nfs4_delegation *dp = NULL; - stateid_t *stidp; + struct svc_fh *current_fh = &cstate->current_fh; struct inode *ino = current_fh->fh_dentry->d_inode; __be32 status; - dprintk("NFSD: preprocess_stateid_op: stateid = (%08x/%08x/%08x/%08x)\n", - stateid->si_boot, stateid->si_stateownerid, - stateid->si_fileid, stateid->si_generation); if (filpp) *filpp = NULL; - if (io_during_grace_disallowed(ino, flags)) + if (grace_disallows_io(ino)) return nfserr_grace; + if (nfsd4_has_session(cstate)) + flags |= HAS_SESSION; + if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) return check_special_stateids(current_fh, stateid, flags); - /* STALE STATEID */ status = nfserr_stale_stateid; if (STALE_STATEID(stateid)) goto out; - /* BAD STATEID */ status = nfserr_bad_stateid; - if (!stateid->si_fileid) { /* delegation stateid */ - if(!(dp = find_delegation_stateid(ino, stateid))) { - dprintk("NFSD: delegation stateid not found\n"); + if (is_delegation_stateid(stateid)) { + dp = find_delegation_stateid(ino, stateid); + if (!dp) goto out; - } - stidp = &dp->dl_stateid; + status = check_stateid_generation(stateid, &dp->dl_stateid, + flags); + if (status) + goto out; + status = nfs4_check_delegmode(dp, flags); + if (status) + goto out; + renew_client(dp->dl_client); + if (filpp) + *filpp = dp->dl_vfs_file; } else { /* open or lock stateid */ - if (!(stp = find_stateid(stateid, flags))) { - dprintk("NFSD: open or lock stateid not found\n"); + stp = find_stateid(stateid, flags); + if (!stp) goto out; - } - if ((flags & CHECK_FH) && nfs4_check_fh(current_fh, stp)) + if (nfs4_check_fh(current_fh, stp)) goto out; if (!stp->st_stateowner->so_confirmed) goto out; - stidp = &stp->st_stateid; - } - status = check_stateid_generation(stateid, stidp); - if (status) - goto out; - if (stp) { - if ((status = nfs4_check_openmode(stp,flags))) + status = check_stateid_generation(stateid, &stp->st_stateid, + flags); + if (status) + goto out; + status = nfs4_check_openmode(stp, flags); + if (status) goto out; renew_client(stp->st_stateowner->so_client); if (filpp) *filpp = stp->st_vfs_file; - } else { - if ((status = nfs4_check_delegmode(dp, flags))) - goto out; - renew_client(dp->dl_client); - if (flags & DELEG_RET) - unhash_delegation(dp); - if (filpp) - *filpp = dp->dl_vfs_file; } status = nfs_ok; out: @@ -2113,10 +2915,14 @@ setlkflg (int type) * Checks for sequence id mutating operations. */ static __be32 -nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *stateid, int flags, struct nfs4_stateowner **sopp, struct nfs4_stateid **stpp, struct nfsd4_lock *lock) +nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, + stateid_t *stateid, int flags, + struct nfs4_stateowner **sopp, + struct nfs4_stateid **stpp, struct nfsd4_lock *lock) { struct nfs4_stateid *stp; struct nfs4_stateowner *sop; + struct svc_fh *current_fh = &cstate->current_fh; __be32 status; dprintk("NFSD: preprocess_seqid_op: seqid=%d " @@ -2134,6 +2940,10 @@ nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *statei if (STALE_STATEID(stateid)) return nfserr_stale_stateid; + + if (nfsd4_has_session(cstate)) + flags |= HAS_SESSION; + /* * We return BAD_STATEID if filehandle doesn't match stateid, * the confirmed flag is incorrecly set, or the generation @@ -2166,8 +2976,9 @@ nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *statei if (lock->lk_is_new) { if (!sop->so_is_open_owner) return nfserr_bad_stateid; - if (!same_clid(&clp->cl_clientid, lockclid)) - return nfserr_bad_stateid; + if (!(flags & HAS_SESSION) && + !same_clid(&clp->cl_clientid, lockclid)) + return nfserr_bad_stateid; /* stp is the open stateid */ status = nfs4_check_openmode(stp, lkflg); if (status) @@ -2190,7 +3001,7 @@ nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *statei * For the moment, we ignore the possibility of * generation number wraparound. */ - if (seqid != sop->so_seqid) + if (!(flags & HAS_SESSION) && seqid != sop->so_seqid) goto check_replay; if (sop->so_confirmed && flags & CONFIRM) { @@ -2203,7 +3014,7 @@ nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *statei " confirmed yet!\n"); return nfserr_bad_stateid; } - status = check_stateid_generation(stateid, &stp->st_stateid); + status = check_stateid_generation(stateid, &stp->st_stateid, flags); if (status) return status; renew_client(sop->so_client); @@ -2239,7 +3050,7 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, nfs4_lock_state(); - if ((status = nfs4_preprocess_seqid_op(&cstate->current_fh, + if ((status = nfs4_preprocess_seqid_op(cstate, oc->oc_seqid, &oc->oc_req_stateid, CONFIRM | OPEN_STATE, &oc->oc_stateowner, &stp, NULL))) @@ -2304,12 +3115,12 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp, (int)cstate->current_fh.fh_dentry->d_name.len, cstate->current_fh.fh_dentry->d_name.name); - if (!access_valid(od->od_share_access) + if (!access_valid(od->od_share_access, cstate->minorversion) || !deny_valid(od->od_share_deny)) return nfserr_inval; nfs4_lock_state(); - if ((status = nfs4_preprocess_seqid_op(&cstate->current_fh, + if ((status = nfs4_preprocess_seqid_op(cstate, od->od_seqid, &od->od_stateid, OPEN_STATE, @@ -2362,7 +3173,7 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, nfs4_lock_state(); /* check close_lru for replay */ - if ((status = nfs4_preprocess_seqid_op(&cstate->current_fh, + if ((status = nfs4_preprocess_seqid_op(cstate, close->cl_seqid, &close->cl_stateid, OPEN_STATE | CLOSE_STATE, @@ -2373,7 +3184,7 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, memcpy(&close->cl_stateid, &stp->st_stateid, sizeof(stateid_t)); /* release_stateid() calls nfsd_close() if needed */ - release_stateid(stp, OPEN_STATE); + release_open_stateid(stp); /* place unused nfs4_stateowners on so_close_lru list to be * released by the laundromat service after the lease period @@ -2394,16 +3205,40 @@ __be32 nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_delegreturn *dr) { + struct nfs4_delegation *dp; + stateid_t *stateid = &dr->dr_stateid; + struct inode *inode; __be32 status; + int flags = 0; if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0))) - goto out; + return status; + inode = cstate->current_fh.fh_dentry->d_inode; + if (nfsd4_has_session(cstate)) + flags |= HAS_SESSION; nfs4_lock_state(); - status = nfs4_preprocess_stateid_op(&cstate->current_fh, - &dr->dr_stateid, DELEG_RET, NULL); - nfs4_unlock_state(); + status = nfserr_bad_stateid; + if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) + goto out; + status = nfserr_stale_stateid; + if (STALE_STATEID(stateid)) + goto out; + status = nfserr_bad_stateid; + if (!is_delegation_stateid(stateid)) + goto out; + dp = find_delegation_stateid(inode, stateid); + if (!dp) + goto out; + status = check_stateid_generation(stateid, &dp->dl_stateid, flags); + if (status) + goto out; + renew_client(dp->dl_client); + + unhash_delegation(dp); out: + nfs4_unlock_state(); + return status; } @@ -2684,11 +3519,12 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfs4_file *fp; status = nfserr_stale_clientid; - if (STALE_CLIENTID(&lock->lk_new_clientid)) + if (!nfsd4_has_session(cstate) && + STALE_CLIENTID(&lock->lk_new_clientid)) goto out; /* validate and update open stateid and open seqid */ - status = nfs4_preprocess_seqid_op(&cstate->current_fh, + status = nfs4_preprocess_seqid_op(cstate, lock->lk_new_open_seqid, &lock->lk_new_open_stateid, OPEN_STATE, @@ -2715,7 +3551,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; } else { /* lock (lock owner + lock stateid) already exists */ - status = nfs4_preprocess_seqid_op(&cstate->current_fh, + status = nfs4_preprocess_seqid_op(cstate, lock->lk_old_lock_seqid, &lock->lk_old_lock_stateid, LOCK_STATE, @@ -2788,7 +3624,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, } out: if (status && lock->lk_is_new && lock_sop) - release_stateowner(lock_sop); + release_lockowner(lock_sop); if (lock->lk_replay_owner) { nfs4_get_stateowner(lock->lk_replay_owner); cstate->replay_owner = lock->lk_replay_owner; @@ -2838,7 +3674,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, nfs4_lock_state(); status = nfserr_stale_clientid; - if (STALE_CLIENTID(&lockt->lt_clientid)) + if (!nfsd4_has_session(cstate) && STALE_CLIENTID(&lockt->lt_clientid)) goto out; if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0))) { @@ -2911,7 +3747,7 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, nfs4_lock_state(); - if ((status = nfs4_preprocess_seqid_op(&cstate->current_fh, + if ((status = nfs4_preprocess_seqid_op(cstate, locku->lu_seqid, &locku->lu_stateid, LOCK_STATE, @@ -3037,7 +3873,7 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp, /* unhash_stateowner deletes so_perclient only * for openowners. */ list_del(&sop->so_perclient); - release_stateowner(sop); + release_lockowner(sop); } out: nfs4_unlock_state(); @@ -3051,12 +3887,12 @@ alloc_reclaim(void) } int -nfs4_has_reclaimed_state(const char *name) +nfs4_has_reclaimed_state(const char *name, bool use_exchange_id) { unsigned int strhashval = clientstr_hashval(name); struct nfs4_client *clp; - clp = find_confirmed_client_by_str(name, strhashval); + clp = find_confirmed_client_by_str(name, strhashval, use_exchange_id); return clp ? 1 : 0; } @@ -3153,6 +3989,8 @@ nfs4_state_init(void) INIT_LIST_HEAD(&unconf_str_hashtbl[i]); INIT_LIST_HEAD(&unconf_id_hashtbl[i]); } + for (i = 0; i < SESSION_HASH_SIZE; i++) + INIT_LIST_HEAD(&sessionid_hashtbl[i]); for (i = 0; i < FILE_HASH_SIZE; i++) { INIT_LIST_HEAD(&file_hashtbl[i]); } diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 9250067943d8..b820c311931c 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -45,6 +45,7 @@ #include <linux/fs.h> #include <linux/namei.h> #include <linux/vfs.h> +#include <linux/utsname.h> #include <linux/sunrpc/xdr.h> #include <linux/sunrpc/svc.h> #include <linux/sunrpc/clnt.h> @@ -188,6 +189,11 @@ static __be32 *read_buf(struct nfsd4_compoundargs *argp, u32 nbytes) return p; } +static int zero_clientid(clientid_t *clid) +{ + return (clid->cl_boot == 0) && (clid->cl_id == 0); +} + static int defer_free(struct nfsd4_compoundargs *argp, void (*release)(const void *), void *p) @@ -230,6 +236,7 @@ nfsd4_decode_bitmap(struct nfsd4_compoundargs *argp, u32 *bmval) bmval[0] = 0; bmval[1] = 0; + bmval[2] = 0; READ_BUF(4); READ32(bmlen); @@ -241,13 +248,27 @@ nfsd4_decode_bitmap(struct nfsd4_compoundargs *argp, u32 *bmval) READ32(bmval[0]); if (bmlen > 1) READ32(bmval[1]); + if (bmlen > 2) + READ32(bmval[2]); DECODE_TAIL; } +static u32 nfsd_attrmask[] = { + NFSD_WRITEABLE_ATTRS_WORD0, + NFSD_WRITEABLE_ATTRS_WORD1, + NFSD_WRITEABLE_ATTRS_WORD2 +}; + +static u32 nfsd41_ex_attrmask[] = { + NFSD_SUPPATTR_EXCLCREAT_WORD0, + NFSD_SUPPATTR_EXCLCREAT_WORD1, + NFSD_SUPPATTR_EXCLCREAT_WORD2 +}; + static __be32 -nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct iattr *iattr, - struct nfs4_acl **acl) +nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, u32 *writable, + struct iattr *iattr, struct nfs4_acl **acl) { int expected_len, len = 0; u32 dummy32; @@ -263,9 +284,12 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct iattr *ia * According to spec, unsupported attributes return ERR_ATTRNOTSUPP; * read-only attributes return ERR_INVAL. */ - if ((bmval[0] & ~NFSD_SUPPORTED_ATTRS_WORD0) || (bmval[1] & ~NFSD_SUPPORTED_ATTRS_WORD1)) + if ((bmval[0] & ~nfsd_suppattrs0(argp->minorversion)) || + (bmval[1] & ~nfsd_suppattrs1(argp->minorversion)) || + (bmval[2] & ~nfsd_suppattrs2(argp->minorversion))) return nfserr_attrnotsupp; - if ((bmval[0] & ~NFSD_WRITEABLE_ATTRS_WORD0) || (bmval[1] & ~NFSD_WRITEABLE_ATTRS_WORD1)) + if ((bmval[0] & ~writable[0]) || (bmval[1] & ~writable[1]) || + (bmval[2] & ~writable[2])) return nfserr_inval; READ_BUF(4); @@ -400,6 +424,7 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct iattr *ia goto xdr_error; } } + BUG_ON(bmval[2]); /* no such writeable attr supported yet */ if (len != expected_len) goto xdr_error; @@ -493,7 +518,9 @@ nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create if ((status = check_filename(create->cr_name, create->cr_namelen, nfserr_inval))) return status; - if ((status = nfsd4_decode_fattr(argp, create->cr_bmval, &create->cr_iattr, &create->cr_acl))) + status = nfsd4_decode_fattr(argp, create->cr_bmval, nfsd_attrmask, + &create->cr_iattr, &create->cr_acl); + if (status) goto out; DECODE_TAIL; @@ -583,6 +610,8 @@ nfsd4_decode_lockt(struct nfsd4_compoundargs *argp, struct nfsd4_lockt *lockt) READ_BUF(lockt->lt_owner.len); READMEM(lockt->lt_owner.data, lockt->lt_owner.len); + if (argp->minorversion && !zero_clientid(&lockt->lt_clientid)) + return nfserr_inval; DECODE_TAIL; } @@ -652,13 +681,26 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open) switch (open->op_createmode) { case NFS4_CREATE_UNCHECKED: case NFS4_CREATE_GUARDED: - if ((status = nfsd4_decode_fattr(argp, open->op_bmval, &open->op_iattr, &open->op_acl))) + status = nfsd4_decode_fattr(argp, open->op_bmval, + nfsd_attrmask, &open->op_iattr, &open->op_acl); + if (status) goto out; break; case NFS4_CREATE_EXCLUSIVE: READ_BUF(8); COPYMEM(open->op_verf.data, 8); break; + case NFS4_CREATE_EXCLUSIVE4_1: + if (argp->minorversion < 1) + goto xdr_error; + READ_BUF(8); + COPYMEM(open->op_verf.data, 8); + status = nfsd4_decode_fattr(argp, open->op_bmval, + nfsd41_ex_attrmask, &open->op_iattr, + &open->op_acl); + if (status) + goto out; + break; default: goto xdr_error; } @@ -851,7 +893,7 @@ nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *seta status = nfsd4_decode_stateid(argp, &setattr->sa_stateid); if (status) return status; - return nfsd4_decode_fattr(argp, setattr->sa_bmval, + return nfsd4_decode_fattr(argp, setattr->sa_bmval, nfsd_attrmask, &setattr->sa_iattr, &setattr->sa_acl); } @@ -993,6 +1035,241 @@ nfsd4_decode_release_lockowner(struct nfsd4_compoundargs *argp, struct nfsd4_rel READ_BUF(rlockowner->rl_owner.len); READMEM(rlockowner->rl_owner.data, rlockowner->rl_owner.len); + if (argp->minorversion && !zero_clientid(&rlockowner->rl_clientid)) + return nfserr_inval; + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp, + struct nfsd4_exchange_id *exid) +{ + int dummy; + DECODE_HEAD; + + READ_BUF(NFS4_VERIFIER_SIZE); + COPYMEM(exid->verifier.data, NFS4_VERIFIER_SIZE); + + READ_BUF(4); + READ32(exid->clname.len); + + READ_BUF(exid->clname.len); + SAVEMEM(exid->clname.data, exid->clname.len); + + READ_BUF(4); + READ32(exid->flags); + + /* Ignore state_protect4_a */ + READ_BUF(4); + READ32(exid->spa_how); + switch (exid->spa_how) { + case SP4_NONE: + break; + case SP4_MACH_CRED: + /* spo_must_enforce */ + READ_BUF(4); + READ32(dummy); + READ_BUF(dummy * 4); + p += dummy; + + /* spo_must_allow */ + READ_BUF(4); + READ32(dummy); + READ_BUF(dummy * 4); + p += dummy; + break; + case SP4_SSV: + /* ssp_ops */ + READ_BUF(4); + READ32(dummy); + READ_BUF(dummy * 4); + p += dummy; + + READ_BUF(4); + READ32(dummy); + READ_BUF(dummy * 4); + p += dummy; + + /* ssp_hash_algs<> */ + READ_BUF(4); + READ32(dummy); + READ_BUF(dummy); + p += XDR_QUADLEN(dummy); + + /* ssp_encr_algs<> */ + READ_BUF(4); + READ32(dummy); + READ_BUF(dummy); + p += XDR_QUADLEN(dummy); + + /* ssp_window and ssp_num_gss_handles */ + READ_BUF(8); + READ32(dummy); + READ32(dummy); + break; + default: + goto xdr_error; + } + + /* Ignore Implementation ID */ + READ_BUF(4); /* nfs_impl_id4 array length */ + READ32(dummy); + + if (dummy > 1) + goto xdr_error; + + if (dummy == 1) { + /* nii_domain */ + READ_BUF(4); + READ32(dummy); + READ_BUF(dummy); + p += XDR_QUADLEN(dummy); + + /* nii_name */ + READ_BUF(4); + READ32(dummy); + READ_BUF(dummy); + p += XDR_QUADLEN(dummy); + + /* nii_date */ + READ_BUF(12); + p += 3; + } + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_create_session(struct nfsd4_compoundargs *argp, + struct nfsd4_create_session *sess) +{ + DECODE_HEAD; + + u32 dummy; + char *machine_name; + int i; + int nr_secflavs; + + READ_BUF(16); + COPYMEM(&sess->clientid, 8); + READ32(sess->seqid); + READ32(sess->flags); + + /* Fore channel attrs */ + READ_BUF(28); + READ32(dummy); /* headerpadsz is always 0 */ + READ32(sess->fore_channel.maxreq_sz); + READ32(sess->fore_channel.maxresp_sz); + READ32(sess->fore_channel.maxresp_cached); + READ32(sess->fore_channel.maxops); + READ32(sess->fore_channel.maxreqs); + READ32(sess->fore_channel.nr_rdma_attrs); + if (sess->fore_channel.nr_rdma_attrs == 1) { + READ_BUF(4); + READ32(sess->fore_channel.rdma_attrs); + } else if (sess->fore_channel.nr_rdma_attrs > 1) { + dprintk("Too many fore channel attr bitmaps!\n"); + goto xdr_error; + } + + /* Back channel attrs */ + READ_BUF(28); + READ32(dummy); /* headerpadsz is always 0 */ + READ32(sess->back_channel.maxreq_sz); + READ32(sess->back_channel.maxresp_sz); + READ32(sess->back_channel.maxresp_cached); + READ32(sess->back_channel.maxops); + READ32(sess->back_channel.maxreqs); + READ32(sess->back_channel.nr_rdma_attrs); + if (sess->back_channel.nr_rdma_attrs == 1) { + READ_BUF(4); + READ32(sess->back_channel.rdma_attrs); + } else if (sess->back_channel.nr_rdma_attrs > 1) { + dprintk("Too many back channel attr bitmaps!\n"); + goto xdr_error; + } + + READ_BUF(8); + READ32(sess->callback_prog); + + /* callback_sec_params4 */ + READ32(nr_secflavs); + for (i = 0; i < nr_secflavs; ++i) { + READ_BUF(4); + READ32(dummy); + switch (dummy) { + case RPC_AUTH_NULL: + /* Nothing to read */ + break; + case RPC_AUTH_UNIX: + READ_BUF(8); + /* stamp */ + READ32(dummy); + + /* machine name */ + READ32(dummy); + READ_BUF(dummy); + SAVEMEM(machine_name, dummy); + + /* uid, gid */ + READ_BUF(8); + READ32(sess->uid); + READ32(sess->gid); + + /* more gids */ + READ_BUF(4); + READ32(dummy); + READ_BUF(dummy * 4); + for (i = 0; i < dummy; ++i) + READ32(dummy); + break; + case RPC_AUTH_GSS: + dprintk("RPC_AUTH_GSS callback secflavor " + "not supported!\n"); + READ_BUF(8); + /* gcbp_service */ + READ32(dummy); + /* gcbp_handle_from_server */ + READ32(dummy); + READ_BUF(dummy); + p += XDR_QUADLEN(dummy); + /* gcbp_handle_from_client */ + READ_BUF(4); + READ32(dummy); + READ_BUF(dummy); + p += XDR_QUADLEN(dummy); + break; + default: + dprintk("Illegal callback secflavor\n"); + return nfserr_inval; + } + } + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_destroy_session(struct nfsd4_compoundargs *argp, + struct nfsd4_destroy_session *destroy_session) +{ + DECODE_HEAD; + READ_BUF(NFS4_MAX_SESSIONID_LEN); + COPYMEM(destroy_session->sessionid.data, NFS4_MAX_SESSIONID_LEN); + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_sequence(struct nfsd4_compoundargs *argp, + struct nfsd4_sequence *seq) +{ + DECODE_HEAD; + + READ_BUF(NFS4_MAX_SESSIONID_LEN + 16); + COPYMEM(seq->sessionid.data, NFS4_MAX_SESSIONID_LEN); + READ32(seq->seqid); + READ32(seq->slotid); + READ32(seq->maxslots); + READ32(seq->cachethis); + DECODE_TAIL; } @@ -1005,7 +1282,7 @@ nfsd4_decode_noop(struct nfsd4_compoundargs *argp, void *p) static __be32 nfsd4_decode_notsupp(struct nfsd4_compoundargs *argp, void *p) { - return nfserr_opnotsupp; + return nfserr_notsupp; } typedef __be32(*nfsd4_dec)(struct nfsd4_compoundargs *argp, void *); @@ -1031,7 +1308,7 @@ static nfsd4_dec nfsd4_dec_ops[] = { [OP_OPEN_CONFIRM] = (nfsd4_dec)nfsd4_decode_open_confirm, [OP_OPEN_DOWNGRADE] = (nfsd4_dec)nfsd4_decode_open_downgrade, [OP_PUTFH] = (nfsd4_dec)nfsd4_decode_putfh, - [OP_PUTPUBFH] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_PUTPUBFH] = (nfsd4_dec)nfsd4_decode_noop, [OP_PUTROOTFH] = (nfsd4_dec)nfsd4_decode_noop, [OP_READ] = (nfsd4_dec)nfsd4_decode_read, [OP_READDIR] = (nfsd4_dec)nfsd4_decode_readdir, @@ -1050,6 +1327,67 @@ static nfsd4_dec nfsd4_dec_ops[] = { [OP_RELEASE_LOCKOWNER] = (nfsd4_dec)nfsd4_decode_release_lockowner, }; +static nfsd4_dec nfsd41_dec_ops[] = { + [OP_ACCESS] (nfsd4_dec)nfsd4_decode_access, + [OP_CLOSE] (nfsd4_dec)nfsd4_decode_close, + [OP_COMMIT] (nfsd4_dec)nfsd4_decode_commit, + [OP_CREATE] (nfsd4_dec)nfsd4_decode_create, + [OP_DELEGPURGE] (nfsd4_dec)nfsd4_decode_notsupp, + [OP_DELEGRETURN] (nfsd4_dec)nfsd4_decode_delegreturn, + [OP_GETATTR] (nfsd4_dec)nfsd4_decode_getattr, + [OP_GETFH] (nfsd4_dec)nfsd4_decode_noop, + [OP_LINK] (nfsd4_dec)nfsd4_decode_link, + [OP_LOCK] (nfsd4_dec)nfsd4_decode_lock, + [OP_LOCKT] (nfsd4_dec)nfsd4_decode_lockt, + [OP_LOCKU] (nfsd4_dec)nfsd4_decode_locku, + [OP_LOOKUP] (nfsd4_dec)nfsd4_decode_lookup, + [OP_LOOKUPP] (nfsd4_dec)nfsd4_decode_noop, + [OP_NVERIFY] (nfsd4_dec)nfsd4_decode_verify, + [OP_OPEN] (nfsd4_dec)nfsd4_decode_open, + [OP_OPENATTR] (nfsd4_dec)nfsd4_decode_notsupp, + [OP_OPEN_CONFIRM] (nfsd4_dec)nfsd4_decode_notsupp, + [OP_OPEN_DOWNGRADE] (nfsd4_dec)nfsd4_decode_open_downgrade, + [OP_PUTFH] (nfsd4_dec)nfsd4_decode_putfh, + [OP_PUTPUBFH] (nfsd4_dec)nfsd4_decode_notsupp, + [OP_PUTROOTFH] (nfsd4_dec)nfsd4_decode_noop, + [OP_READ] (nfsd4_dec)nfsd4_decode_read, + [OP_READDIR] (nfsd4_dec)nfsd4_decode_readdir, + [OP_READLINK] (nfsd4_dec)nfsd4_decode_noop, + [OP_REMOVE] (nfsd4_dec)nfsd4_decode_remove, + [OP_RENAME] (nfsd4_dec)nfsd4_decode_rename, + [OP_RENEW] (nfsd4_dec)nfsd4_decode_notsupp, + [OP_RESTOREFH] (nfsd4_dec)nfsd4_decode_noop, + [OP_SAVEFH] (nfsd4_dec)nfsd4_decode_noop, + [OP_SECINFO] (nfsd4_dec)nfsd4_decode_secinfo, + [OP_SETATTR] (nfsd4_dec)nfsd4_decode_setattr, + [OP_SETCLIENTID] (nfsd4_dec)nfsd4_decode_notsupp, + [OP_SETCLIENTID_CONFIRM](nfsd4_dec)nfsd4_decode_notsupp, + [OP_VERIFY] (nfsd4_dec)nfsd4_decode_verify, + [OP_WRITE] (nfsd4_dec)nfsd4_decode_write, + [OP_RELEASE_LOCKOWNER] (nfsd4_dec)nfsd4_decode_notsupp, + + /* new operations for NFSv4.1 */ + [OP_BACKCHANNEL_CTL] (nfsd4_dec)nfsd4_decode_notsupp, + [OP_BIND_CONN_TO_SESSION](nfsd4_dec)nfsd4_decode_notsupp, + [OP_EXCHANGE_ID] (nfsd4_dec)nfsd4_decode_exchange_id, + [OP_CREATE_SESSION] (nfsd4_dec)nfsd4_decode_create_session, + [OP_DESTROY_SESSION] (nfsd4_dec)nfsd4_decode_destroy_session, + [OP_FREE_STATEID] (nfsd4_dec)nfsd4_decode_notsupp, + [OP_GET_DIR_DELEGATION] (nfsd4_dec)nfsd4_decode_notsupp, + [OP_GETDEVICEINFO] (nfsd4_dec)nfsd4_decode_notsupp, + [OP_GETDEVICELIST] (nfsd4_dec)nfsd4_decode_notsupp, + [OP_LAYOUTCOMMIT] (nfsd4_dec)nfsd4_decode_notsupp, + [OP_LAYOUTGET] (nfsd4_dec)nfsd4_decode_notsupp, + [OP_LAYOUTRETURN] (nfsd4_dec)nfsd4_decode_notsupp, + [OP_SECINFO_NO_NAME] (nfsd4_dec)nfsd4_decode_notsupp, + [OP_SEQUENCE] (nfsd4_dec)nfsd4_decode_sequence, + [OP_SET_SSV] (nfsd4_dec)nfsd4_decode_notsupp, + [OP_TEST_STATEID] (nfsd4_dec)nfsd4_decode_notsupp, + [OP_WANT_DELEGATION] (nfsd4_dec)nfsd4_decode_notsupp, + [OP_DESTROY_CLIENTID] (nfsd4_dec)nfsd4_decode_notsupp, + [OP_RECLAIM_COMPLETE] (nfsd4_dec)nfsd4_decode_notsupp, +}; + struct nfsd4_minorversion_ops { nfsd4_dec *decoders; int nops; @@ -1057,6 +1395,7 @@ struct nfsd4_minorversion_ops { static struct nfsd4_minorversion_ops nfsd4_minorversion[] = { [0] = { nfsd4_dec_ops, ARRAY_SIZE(nfsd4_dec_ops) }, + [1] = { nfsd41_dec_ops, ARRAY_SIZE(nfsd41_dec_ops) }, }; static __be32 @@ -1412,6 +1751,7 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, { u32 bmval0 = bmval[0]; u32 bmval1 = bmval[1]; + u32 bmval2 = bmval[2]; struct kstat stat; struct svc_fh tempfh; struct kstatfs statfs; @@ -1425,12 +1765,16 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, int err; int aclsupport = 0; struct nfs4_acl *acl = NULL; + struct nfsd4_compoundres *resp = rqstp->rq_resp; + u32 minorversion = resp->cstate.minorversion; BUG_ON(bmval1 & NFSD_WRITEONLY_ATTRS_WORD1); - BUG_ON(bmval0 & ~NFSD_SUPPORTED_ATTRS_WORD0); - BUG_ON(bmval1 & ~NFSD_SUPPORTED_ATTRS_WORD1); + BUG_ON(bmval0 & ~nfsd_suppattrs0(minorversion)); + BUG_ON(bmval1 & ~nfsd_suppattrs1(minorversion)); + BUG_ON(bmval2 & ~nfsd_suppattrs2(minorversion)); if (exp->ex_fslocs.migrated) { + BUG_ON(bmval[2]); status = fattr_handle_absent_fs(&bmval0, &bmval1, &rdattr_err); if (status) goto out; @@ -1476,22 +1820,42 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, if ((buflen -= 16) < 0) goto out_resource; - WRITE32(2); - WRITE32(bmval0); - WRITE32(bmval1); + if (unlikely(bmval2)) { + WRITE32(3); + WRITE32(bmval0); + WRITE32(bmval1); + WRITE32(bmval2); + } else if (likely(bmval1)) { + WRITE32(2); + WRITE32(bmval0); + WRITE32(bmval1); + } else { + WRITE32(1); + WRITE32(bmval0); + } attrlenp = p++; /* to be backfilled later */ if (bmval0 & FATTR4_WORD0_SUPPORTED_ATTRS) { - u32 word0 = NFSD_SUPPORTED_ATTRS_WORD0; + u32 word0 = nfsd_suppattrs0(minorversion); + u32 word1 = nfsd_suppattrs1(minorversion); + u32 word2 = nfsd_suppattrs2(minorversion); + if ((buflen -= 12) < 0) goto out_resource; if (!aclsupport) word0 &= ~FATTR4_WORD0_ACL; if (!exp->ex_fslocs.locations) word0 &= ~FATTR4_WORD0_FS_LOCATIONS; - WRITE32(2); - WRITE32(word0); - WRITE32(NFSD_SUPPORTED_ATTRS_WORD1); + if (!word2) { + WRITE32(2); + WRITE32(word0); + WRITE32(word1); + } else { + WRITE32(3); + WRITE32(word0); + WRITE32(word1); + WRITE32(word2); + } } if (bmval0 & FATTR4_WORD0_TYPE) { if ((buflen -= 4) < 0) @@ -1801,6 +2165,13 @@ out_acl: } WRITE64(stat.ino); } + if (bmval2 & FATTR4_WORD2_SUPPATTR_EXCLCREAT) { + WRITE32(3); + WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD0); + WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD1); + WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD2); + } + *attrlenp = htonl((char *)p - (char *)attrlenp - 4); *countp = p - buffer; status = nfs_ok; @@ -2572,6 +2943,143 @@ nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_w } static __be32 +nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, int nfserr, + struct nfsd4_exchange_id *exid) +{ + ENCODE_HEAD; + char *major_id; + char *server_scope; + int major_id_sz; + int server_scope_sz; + uint64_t minor_id = 0; + + if (nfserr) + return nfserr; + + major_id = utsname()->nodename; + major_id_sz = strlen(major_id); + server_scope = utsname()->nodename; + server_scope_sz = strlen(server_scope); + + RESERVE_SPACE( + 8 /* eir_clientid */ + + 4 /* eir_sequenceid */ + + 4 /* eir_flags */ + + 4 /* spr_how (SP4_NONE) */ + + 8 /* so_minor_id */ + + 4 /* so_major_id.len */ + + (XDR_QUADLEN(major_id_sz) * 4) + + 4 /* eir_server_scope.len */ + + (XDR_QUADLEN(server_scope_sz) * 4) + + 4 /* eir_server_impl_id.count (0) */); + + WRITEMEM(&exid->clientid, 8); + WRITE32(exid->seqid); + WRITE32(exid->flags); + + /* state_protect4_r. Currently only support SP4_NONE */ + BUG_ON(exid->spa_how != SP4_NONE); + WRITE32(exid->spa_how); + + /* The server_owner struct */ + WRITE64(minor_id); /* Minor id */ + /* major id */ + WRITE32(major_id_sz); + WRITEMEM(major_id, major_id_sz); + + /* Server scope */ + WRITE32(server_scope_sz); + WRITEMEM(server_scope, server_scope_sz); + + /* Implementation id */ + WRITE32(0); /* zero length nfs_impl_id4 array */ + ADJUST_ARGS(); + return 0; +} + +static __be32 +nfsd4_encode_create_session(struct nfsd4_compoundres *resp, int nfserr, + struct nfsd4_create_session *sess) +{ + ENCODE_HEAD; + + if (nfserr) + return nfserr; + + RESERVE_SPACE(24); + WRITEMEM(sess->sessionid.data, NFS4_MAX_SESSIONID_LEN); + WRITE32(sess->seqid); + WRITE32(sess->flags); + ADJUST_ARGS(); + + RESERVE_SPACE(28); + WRITE32(0); /* headerpadsz */ + WRITE32(sess->fore_channel.maxreq_sz); + WRITE32(sess->fore_channel.maxresp_sz); + WRITE32(sess->fore_channel.maxresp_cached); + WRITE32(sess->fore_channel.maxops); + WRITE32(sess->fore_channel.maxreqs); + WRITE32(sess->fore_channel.nr_rdma_attrs); + ADJUST_ARGS(); + + if (sess->fore_channel.nr_rdma_attrs) { + RESERVE_SPACE(4); + WRITE32(sess->fore_channel.rdma_attrs); + ADJUST_ARGS(); + } + + RESERVE_SPACE(28); + WRITE32(0); /* headerpadsz */ + WRITE32(sess->back_channel.maxreq_sz); + WRITE32(sess->back_channel.maxresp_sz); + WRITE32(sess->back_channel.maxresp_cached); + WRITE32(sess->back_channel.maxops); + WRITE32(sess->back_channel.maxreqs); + WRITE32(sess->back_channel.nr_rdma_attrs); + ADJUST_ARGS(); + + if (sess->back_channel.nr_rdma_attrs) { + RESERVE_SPACE(4); + WRITE32(sess->back_channel.rdma_attrs); + ADJUST_ARGS(); + } + return 0; +} + +static __be32 +nfsd4_encode_destroy_session(struct nfsd4_compoundres *resp, int nfserr, + struct nfsd4_destroy_session *destroy_session) +{ + return nfserr; +} + +__be32 +nfsd4_encode_sequence(struct nfsd4_compoundres *resp, int nfserr, + struct nfsd4_sequence *seq) +{ + ENCODE_HEAD; + + if (nfserr) + return nfserr; + + RESERVE_SPACE(NFS4_MAX_SESSIONID_LEN + 20); + WRITEMEM(seq->sessionid.data, NFS4_MAX_SESSIONID_LEN); + WRITE32(seq->seqid); + WRITE32(seq->slotid); + WRITE32(seq->maxslots); + /* + * FIXME: for now: + * target_maxslots = maxslots + * status_flags = 0 + */ + WRITE32(seq->maxslots); + WRITE32(0); + + ADJUST_ARGS(); + return 0; +} + +static __be32 nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p) { return nfserr; @@ -2579,6 +3087,11 @@ nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p) typedef __be32(* nfsd4_enc)(struct nfsd4_compoundres *, __be32, void *); +/* + * Note: nfsd4_enc_ops vector is shared for v4.0 and v4.1 + * since we don't need to filter out obsolete ops as this is + * done in the decoding phase. + */ static nfsd4_enc nfsd4_enc_ops[] = { [OP_ACCESS] = (nfsd4_enc)nfsd4_encode_access, [OP_CLOSE] = (nfsd4_enc)nfsd4_encode_close, @@ -2617,8 +3130,77 @@ static nfsd4_enc nfsd4_enc_ops[] = { [OP_VERIFY] = (nfsd4_enc)nfsd4_encode_noop, [OP_WRITE] = (nfsd4_enc)nfsd4_encode_write, [OP_RELEASE_LOCKOWNER] = (nfsd4_enc)nfsd4_encode_noop, + + /* NFSv4.1 operations */ + [OP_BACKCHANNEL_CTL] = (nfsd4_enc)nfsd4_encode_noop, + [OP_BIND_CONN_TO_SESSION] = (nfsd4_enc)nfsd4_encode_noop, + [OP_EXCHANGE_ID] = (nfsd4_enc)nfsd4_encode_exchange_id, + [OP_CREATE_SESSION] = (nfsd4_enc)nfsd4_encode_create_session, + [OP_DESTROY_SESSION] = (nfsd4_enc)nfsd4_encode_destroy_session, + [OP_FREE_STATEID] = (nfsd4_enc)nfsd4_encode_noop, + [OP_GET_DIR_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop, + [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_noop, + [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_noop, + [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_noop, + [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_noop, + [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_noop, + [OP_SECINFO_NO_NAME] = (nfsd4_enc)nfsd4_encode_noop, + [OP_SEQUENCE] = (nfsd4_enc)nfsd4_encode_sequence, + [OP_SET_SSV] = (nfsd4_enc)nfsd4_encode_noop, + [OP_TEST_STATEID] = (nfsd4_enc)nfsd4_encode_noop, + [OP_WANT_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop, + [OP_DESTROY_CLIENTID] = (nfsd4_enc)nfsd4_encode_noop, + [OP_RECLAIM_COMPLETE] = (nfsd4_enc)nfsd4_encode_noop, }; +/* + * Calculate the total amount of memory that the compound response has taken + * after encoding the current operation. + * + * pad: add on 8 bytes for the next operation's op_code and status so that + * there is room to cache a failure on the next operation. + * + * Compare this length to the session se_fmaxresp_cached. + * + * Our se_fmaxresp_cached will always be a multiple of PAGE_SIZE, and so + * will be at least a page and will therefore hold the xdr_buf head. + */ +static int nfsd4_check_drc_limit(struct nfsd4_compoundres *resp) +{ + int status = 0; + struct xdr_buf *xb = &resp->rqstp->rq_res; + struct nfsd4_compoundargs *args = resp->rqstp->rq_argp; + struct nfsd4_session *session = NULL; + struct nfsd4_slot *slot = resp->cstate.slot; + u32 length, tlen = 0, pad = 8; + + if (!nfsd4_has_session(&resp->cstate)) + return status; + + session = resp->cstate.session; + if (session == NULL || slot->sl_cache_entry.ce_cachethis == 0) + return status; + + if (resp->opcnt >= args->opcnt) + pad = 0; /* this is the last operation */ + + if (xb->page_len == 0) { + length = (char *)resp->p - (char *)xb->head[0].iov_base + pad; + } else { + if (xb->tail[0].iov_base && xb->tail[0].iov_len > 0) + tlen = (char *)resp->p - (char *)xb->tail[0].iov_base; + + length = xb->head[0].iov_len + xb->page_len + tlen + pad; + } + dprintk("%s length %u, xb->page_len %u tlen %u pad %u\n", __func__, + length, xb->page_len, tlen, pad); + + if (length <= session->se_fmaxresp_cached) + return status; + else + return nfserr_rep_too_big_to_cache; +} + void nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op) { @@ -2635,6 +3217,9 @@ nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op) BUG_ON(op->opnum < 0 || op->opnum >= ARRAY_SIZE(nfsd4_enc_ops) || !nfsd4_enc_ops[op->opnum]); op->status = nfsd4_enc_ops[op->opnum](resp, op->status, &op->u); + /* nfsd4_check_drc_limit guarantees enough room for error status */ + if (!op->status && nfsd4_check_drc_limit(resp)) + op->status = nfserr_rep_too_big_to_cache; status: /* * Note: We write the status directly, instead of using WRITE32(), @@ -2735,6 +3320,18 @@ nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compo iov = &rqstp->rq_res.head[0]; iov->iov_len = ((char*)resp->p) - (char*)iov->iov_base; BUG_ON(iov->iov_len > PAGE_SIZE); + if (nfsd4_has_session(&resp->cstate)) { + if (resp->cstate.status == nfserr_replay_cache && + !nfsd4_not_cached(resp)) { + iov->iov_len = resp->cstate.iovlen; + } else { + nfsd4_store_cache_entry(resp); + dprintk("%s: SET SLOT STATE TO AVAILABLE\n", __func__); + resp->cstate.slot->sl_inuse = 0; + } + if (resp->cstate.session) + nfsd4_put_session(resp->cstate.session); + } return 1; } diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index a4ed8644d69c..af16849d243a 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -60,6 +60,7 @@ enum { NFSD_FO_UnlockFS, NFSD_Threads, NFSD_Pool_Threads, + NFSD_Pool_Stats, NFSD_Versions, NFSD_Ports, NFSD_MaxBlkSize, @@ -172,6 +173,16 @@ static const struct file_operations exports_operations = { .owner = THIS_MODULE, }; +extern int nfsd_pool_stats_open(struct inode *inode, struct file *file); + +static struct file_operations pool_stats_operations = { + .open = nfsd_pool_stats_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, + .owner = THIS_MODULE, +}; + /*----------------------------------------------------------------------------*/ /* * payload - write methods @@ -781,8 +792,9 @@ out_free: static ssize_t __write_versions(struct file *file, char *buf, size_t size) { char *mesg = buf; - char *vers, sign; + char *vers, *minorp, sign; int len, num; + unsigned minor; ssize_t tlen = 0; char *sep; @@ -803,9 +815,20 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size) do { sign = *vers; if (sign == '+' || sign == '-') - num = simple_strtol((vers+1), NULL, 0); + num = simple_strtol((vers+1), &minorp, 0); else - num = simple_strtol(vers, NULL, 0); + num = simple_strtol(vers, &minorp, 0); + if (*minorp == '.') { + if (num < 4) + return -EINVAL; + minor = simple_strtoul(minorp+1, NULL, 0); + if (minor == 0) + return -EINVAL; + if (nfsd_minorversion(minor, sign == '-' ? + NFSD_CLEAR : NFSD_SET) < 0) + return -EINVAL; + goto next; + } switch(num) { case 2: case 3: @@ -815,6 +838,7 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size) default: return -EINVAL; } + next: vers += len + 1; tlen += len; } while ((len = qword_get(&mesg, vers, size)) > 0); @@ -833,6 +857,13 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size) num); sep = " "; } + if (nfsd_vers(4, NFSD_AVAIL)) + for (minor = 1; minor <= NFSD_SUPPORTED_MINOR_VERSION; minor++) + len += sprintf(buf+len, " %c4.%u", + (nfsd_vers(4, NFSD_TEST) && + nfsd_minorversion(minor, NFSD_TEST)) ? + '+' : '-', + minor); len += sprintf(buf+len, "\n"); return len; } @@ -1248,6 +1279,7 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent) [NFSD_Fh] = {"filehandle", &transaction_ops, S_IWUSR|S_IRUSR}, [NFSD_Threads] = {"threads", &transaction_ops, S_IWUSR|S_IRUSR}, [NFSD_Pool_Threads] = {"pool_threads", &transaction_ops, S_IWUSR|S_IRUSR}, + [NFSD_Pool_Stats] = {"pool_stats", &pool_stats_operations, S_IRUGO}, [NFSD_Versions] = {"versions", &transaction_ops, S_IWUSR|S_IRUSR}, [NFSD_Ports] = {"portlist", &transaction_ops, S_IWUSR|S_IRUGO}, [NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO}, diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index 6f7f26351227..e298e260b5f1 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -180,6 +180,7 @@ nfsd_proc_write(struct svc_rqst *rqstp, struct nfsd_writeargs *argp, { __be32 nfserr; int stable = 1; + unsigned long cnt = argp->len; dprintk("nfsd: WRITE %s %d bytes at %d\n", SVCFH_fmt(&argp->fh), @@ -188,7 +189,7 @@ nfsd_proc_write(struct svc_rqst *rqstp, struct nfsd_writeargs *argp, nfserr = nfsd_write(rqstp, fh_copy(&resp->fh, &argp->fh), NULL, argp->offset, rqstp->rq_vec, argp->vlen, - argp->len, + &cnt, &stable); return nfsd_return_attrs(nfserr, resp); } diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index bc3567bab8c4..cbba4a935786 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -22,6 +22,7 @@ #include <linux/freezer.h> #include <linux/fs_struct.h> #include <linux/kthread.h> +#include <linux/swap.h> #include <linux/sunrpc/types.h> #include <linux/sunrpc/stats.h> @@ -40,9 +41,6 @@ extern struct svc_program nfsd_program; static int nfsd(void *vrqstp); struct timeval nfssvc_boot; -static atomic_t nfsd_busy; -static unsigned long nfsd_last_call; -static DEFINE_SPINLOCK(nfsd_call_lock); /* * nfsd_mutex protects nfsd_serv -- both the pointer itself and the members @@ -123,6 +121,8 @@ struct svc_program nfsd_program = { }; +u32 nfsd_supported_minorversion; + int nfsd_vers(int vers, enum vers_op change) { if (vers < NFSD_MINVERS || vers >= NFSD_NRVERS) @@ -149,6 +149,28 @@ int nfsd_vers(int vers, enum vers_op change) } return 0; } + +int nfsd_minorversion(u32 minorversion, enum vers_op change) +{ + if (minorversion > NFSD_SUPPORTED_MINOR_VERSION) + return -1; + switch(change) { + case NFSD_SET: + nfsd_supported_minorversion = minorversion; + break; + case NFSD_CLEAR: + if (minorversion == 0) + return -1; + nfsd_supported_minorversion = minorversion - 1; + break; + case NFSD_TEST: + return minorversion <= nfsd_supported_minorversion; + case NFSD_AVAIL: + return minorversion <= NFSD_SUPPORTED_MINOR_VERSION; + } + return 0; +} + /* * Maximum number of nfsd processes */ @@ -200,6 +222,28 @@ void nfsd_reset_versions(void) } } +/* + * Each session guarantees a negotiated per slot memory cache for replies + * which in turn consumes memory beyond the v2/v3/v4.0 server. A dedicated + * NFSv4.1 server might want to use more memory for a DRC than a machine + * with mutiple services. + * + * Impose a hard limit on the number of pages for the DRC which varies + * according to the machines free pages. This is of course only a default. + * + * For now this is a #defined shift which could be under admin control + * in the future. + */ +static void set_max_drc(void) +{ + /* The percent of nr_free_buffer_pages used by the V4.1 server DRC */ + #define NFSD_DRC_SIZE_SHIFT 7 + nfsd_serv->sv_drc_max_pages = nr_free_buffer_pages() + >> NFSD_DRC_SIZE_SHIFT; + nfsd_serv->sv_drc_pages_used = 0; + dprintk("%s svc_drc_max_pages %u\n", __func__, + nfsd_serv->sv_drc_max_pages); +} int nfsd_create_serv(void) { @@ -227,11 +271,12 @@ int nfsd_create_serv(void) nfsd_max_blksize /= 2; } - atomic_set(&nfsd_busy, 0); nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize, nfsd_last_thread, nfsd, THIS_MODULE); if (nfsd_serv == NULL) err = -ENOMEM; + else + set_max_drc(); do_gettimeofday(&nfssvc_boot); /* record boot time */ return err; @@ -375,26 +420,6 @@ nfsd_svc(unsigned short port, int nrservs) return error; } -static inline void -update_thread_usage(int busy_threads) -{ - unsigned long prev_call; - unsigned long diff; - int decile; - - spin_lock(&nfsd_call_lock); - prev_call = nfsd_last_call; - nfsd_last_call = jiffies; - decile = busy_threads*10/nfsdstats.th_cnt; - if (decile>0 && decile <= 10) { - diff = nfsd_last_call - prev_call; - if ( (nfsdstats.th_usage[decile-1] += diff) >= NFSD_USAGE_WRAP) - nfsdstats.th_usage[decile-1] -= NFSD_USAGE_WRAP; - if (decile == 10) - nfsdstats.th_fullcnt++; - } - spin_unlock(&nfsd_call_lock); -} /* * This is the NFS server kernel thread @@ -403,7 +428,6 @@ static int nfsd(void *vrqstp) { struct svc_rqst *rqstp = (struct svc_rqst *) vrqstp; - struct fs_struct *fsp; int err, preverr = 0; /* Lock module and set up kernel thread */ @@ -412,13 +436,11 @@ nfsd(void *vrqstp) /* At this point, the thread shares current->fs * with the init process. We need to create files with a * umask of 0 instead of init's umask. */ - fsp = copy_fs_struct(current->fs); - if (!fsp) { + if (unshare_fs_struct() < 0) { printk("Unable to start nfsd thread: out of memory\n"); goto out; } - exit_fs(current); - current->fs = fsp; + current->fs->umask = 0; /* @@ -463,8 +485,6 @@ nfsd(void *vrqstp) continue; } - update_thread_usage(atomic_read(&nfsd_busy)); - atomic_inc(&nfsd_busy); /* Lock the export hash tables for reading. */ exp_readlock(); @@ -473,8 +493,6 @@ nfsd(void *vrqstp) /* Unlock export hash tables */ exp_readunlock(); - update_thread_usage(atomic_read(&nfsd_busy)); - atomic_dec(&nfsd_busy); } /* Clear signals before calling svc_exit_thread() */ @@ -542,6 +560,10 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp) + rqstp->rq_res.head[0].iov_len; rqstp->rq_res.head[0].iov_len += sizeof(__be32); + /* NFSv4.1 DRC requires statp */ + if (rqstp->rq_vers == 4) + nfsd4_set_statp(rqstp, statp); + /* Now call the procedure handler, and encode NFS status. */ nfserr = proc->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp); nfserr = map_new_errors(rqstp->rq_vers, nfserr); @@ -573,3 +595,10 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp) nfsd_cache_update(rqstp, proc->pc_cachetype, statp + 1); return 1; } + +int nfsd_pool_stats_open(struct inode *inode, struct file *file) +{ + if (nfsd_serv == NULL) + return -ENODEV; + return svc_pool_stats_open(nfsd_serv, file); +} diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 78376b6c0236..ab93fcfef254 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -366,8 +366,9 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, } /* Revoke setuid/setgid on chown */ - if (((iap->ia_valid & ATTR_UID) && iap->ia_uid != inode->i_uid) || - ((iap->ia_valid & ATTR_GID) && iap->ia_gid != inode->i_gid)) { + if (!S_ISDIR(inode->i_mode) && + (((iap->ia_valid & ATTR_UID) && iap->ia_uid != inode->i_uid) || + ((iap->ia_valid & ATTR_GID) && iap->ia_gid != inode->i_gid))) { iap->ia_valid |= ATTR_KILL_PRIV; if (iap->ia_valid & ATTR_MODE) { /* we're setting mode too, just clear the s*id bits */ @@ -960,7 +961,7 @@ static void kill_suid(struct dentry *dentry) static __be32 nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, loff_t offset, struct kvec *vec, int vlen, - unsigned long cnt, int *stablep) + unsigned long *cnt, int *stablep) { struct svc_export *exp; struct dentry *dentry; @@ -974,7 +975,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, err = nfserr_perm; if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) && - (!lock_may_write(file->f_path.dentry->d_inode, offset, cnt))) + (!lock_may_write(file->f_path.dentry->d_inode, offset, *cnt))) goto out; #endif @@ -1009,7 +1010,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &offset); set_fs(oldfs); if (host_err >= 0) { - nfsdstats.io_write += cnt; + nfsdstats.io_write += host_err; fsnotify_modify(file->f_path.dentry); } @@ -1054,9 +1055,10 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, } dprintk("nfsd: write complete host_err=%d\n", host_err); - if (host_err >= 0) + if (host_err >= 0) { err = 0; - else + *cnt = host_err; + } else err = nfserrno(host_err); out: return err; @@ -1098,7 +1100,7 @@ out: */ __be32 nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, - loff_t offset, struct kvec *vec, int vlen, unsigned long cnt, + loff_t offset, struct kvec *vec, int vlen, unsigned long *cnt, int *stablep) { __be32 err = 0; @@ -1179,6 +1181,21 @@ nfsd_create_setattr(struct svc_rqst *rqstp, struct svc_fh *resfhp, return 0; } +/* HPUX client sometimes creates a file in mode 000, and sets size to 0. + * setting size to 0 may fail for some specific file systems by the permission + * checking which requires WRITE permission but the mode is 000. + * we ignore the resizing(to 0) on the just new created file, since the size is + * 0 after file created. + * + * call this only after vfs_create() is called. + * */ +static void +nfsd_check_ignore_resizing(struct iattr *iap) +{ + if ((iap->ia_valid & ATTR_SIZE) && (iap->ia_size == 0)) + iap->ia_valid &= ~ATTR_SIZE; +} + /* * Create a file (regular, directory, device, fifo); UNIX sockets * not yet implemented. @@ -1274,6 +1291,8 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, switch (type) { case S_IFREG: host_err = vfs_create(dirp, dchild, iap->ia_mode, NULL); + if (!host_err) + nfsd_check_ignore_resizing(iap); break; case S_IFDIR: host_err = vfs_mkdir(dirp, dchild, iap->ia_mode); @@ -1427,6 +1446,8 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp, /* setattr will sync the child (or not) */ } + nfsd_check_ignore_resizing(iap); + if (createmode == NFS3_CREATE_EXCLUSIVE) { /* Cram the verifier into atime/mtime */ iap->ia_valid = ATTR_MTIME|ATTR_ATIME diff --git a/fs/nilfs2/Makefile b/fs/nilfs2/Makefile new file mode 100644 index 000000000000..df3e62c1ddc5 --- /dev/null +++ b/fs/nilfs2/Makefile @@ -0,0 +1,5 @@ +obj-$(CONFIG_NILFS2_FS) += nilfs2.o +nilfs2-y := inode.o file.o dir.o super.o namei.o page.o mdt.o \ + btnode.o bmap.o btree.o direct.o dat.o recovery.o \ + the_nilfs.o segbuf.o segment.o cpfile.o sufile.o \ + ifile.o alloc.o gcinode.o ioctl.o gcdat.o diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c new file mode 100644 index 000000000000..d69e6ae59251 --- /dev/null +++ b/fs/nilfs2/alloc.c @@ -0,0 +1,504 @@ +/* + * alloc.c - NILFS dat/inode allocator + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Original code was written by Koji Sato <koji@osrg.net>. + * Two allocators were unified by Ryusuke Konishi <ryusuke@osrg.net>, + * Amagai Yoshiji <amagai@osrg.net>. + */ + +#include <linux/types.h> +#include <linux/buffer_head.h> +#include <linux/fs.h> +#include <linux/bitops.h> +#include "mdt.h" +#include "alloc.h" + + +static inline unsigned long +nilfs_palloc_groups_per_desc_block(const struct inode *inode) +{ + return (1UL << inode->i_blkbits) / + sizeof(struct nilfs_palloc_group_desc); +} + +static inline unsigned long +nilfs_palloc_groups_count(const struct inode *inode) +{ + return 1UL << (BITS_PER_LONG - (inode->i_blkbits + 3 /* log2(8) */)); +} + +int nilfs_palloc_init_blockgroup(struct inode *inode, unsigned entry_size) +{ + struct nilfs_mdt_info *mi = NILFS_MDT(inode); + + mi->mi_bgl = kmalloc(sizeof(*mi->mi_bgl), GFP_NOFS); + if (!mi->mi_bgl) + return -ENOMEM; + + bgl_lock_init(mi->mi_bgl); + + nilfs_mdt_set_entry_size(inode, entry_size, 0); + + mi->mi_blocks_per_group = + DIV_ROUND_UP(nilfs_palloc_entries_per_group(inode), + mi->mi_entries_per_block) + 1; + /* Number of blocks in a group including entry blocks and + a bitmap block */ + mi->mi_blocks_per_desc_block = + nilfs_palloc_groups_per_desc_block(inode) * + mi->mi_blocks_per_group + 1; + /* Number of blocks per descriptor including the + descriptor block */ + return 0; +} + +static unsigned long nilfs_palloc_group(const struct inode *inode, __u64 nr, + unsigned long *offset) +{ + __u64 group = nr; + + *offset = do_div(group, nilfs_palloc_entries_per_group(inode)); + return group; +} + +static unsigned long +nilfs_palloc_desc_blkoff(const struct inode *inode, unsigned long group) +{ + unsigned long desc_block = + group / nilfs_palloc_groups_per_desc_block(inode); + return desc_block * NILFS_MDT(inode)->mi_blocks_per_desc_block; +} + +static unsigned long +nilfs_palloc_bitmap_blkoff(const struct inode *inode, unsigned long group) +{ + unsigned long desc_offset = + group % nilfs_palloc_groups_per_desc_block(inode); + return nilfs_palloc_desc_blkoff(inode, group) + 1 + + desc_offset * NILFS_MDT(inode)->mi_blocks_per_group; +} + +static unsigned long +nilfs_palloc_group_desc_nfrees(struct inode *inode, unsigned long group, + const struct nilfs_palloc_group_desc *desc) +{ + unsigned long nfree; + + spin_lock(nilfs_mdt_bgl_lock(inode, group)); + nfree = le32_to_cpu(desc->pg_nfrees); + spin_unlock(nilfs_mdt_bgl_lock(inode, group)); + return nfree; +} + +static void +nilfs_palloc_group_desc_add_entries(struct inode *inode, + unsigned long group, + struct nilfs_palloc_group_desc *desc, + u32 n) +{ + spin_lock(nilfs_mdt_bgl_lock(inode, group)); + le32_add_cpu(&desc->pg_nfrees, n); + spin_unlock(nilfs_mdt_bgl_lock(inode, group)); +} + +static unsigned long +nilfs_palloc_entry_blkoff(const struct inode *inode, __u64 nr) +{ + unsigned long group, group_offset; + + group = nilfs_palloc_group(inode, nr, &group_offset); + + return nilfs_palloc_bitmap_blkoff(inode, group) + 1 + + group_offset / NILFS_MDT(inode)->mi_entries_per_block; +} + +static void nilfs_palloc_desc_block_init(struct inode *inode, + struct buffer_head *bh, void *kaddr) +{ + struct nilfs_palloc_group_desc *desc = kaddr + bh_offset(bh); + unsigned long n = nilfs_palloc_groups_per_desc_block(inode); + __le32 nfrees; + + nfrees = cpu_to_le32(nilfs_palloc_entries_per_group(inode)); + while (n-- > 0) { + desc->pg_nfrees = nfrees; + desc++; + } +} + +static int nilfs_palloc_get_desc_block(struct inode *inode, + unsigned long group, + int create, struct buffer_head **bhp) +{ + return nilfs_mdt_get_block(inode, + nilfs_palloc_desc_blkoff(inode, group), + create, nilfs_palloc_desc_block_init, bhp); +} + +static int nilfs_palloc_get_bitmap_block(struct inode *inode, + unsigned long group, + int create, struct buffer_head **bhp) +{ + return nilfs_mdt_get_block(inode, + nilfs_palloc_bitmap_blkoff(inode, group), + create, NULL, bhp); +} + +int nilfs_palloc_get_entry_block(struct inode *inode, __u64 nr, + int create, struct buffer_head **bhp) +{ + return nilfs_mdt_get_block(inode, nilfs_palloc_entry_blkoff(inode, nr), + create, NULL, bhp); +} + +static struct nilfs_palloc_group_desc * +nilfs_palloc_block_get_group_desc(const struct inode *inode, + unsigned long group, + const struct buffer_head *bh, void *kaddr) +{ + return (struct nilfs_palloc_group_desc *)(kaddr + bh_offset(bh)) + + group % nilfs_palloc_groups_per_desc_block(inode); +} + +static unsigned char * +nilfs_palloc_block_get_bitmap(const struct inode *inode, + const struct buffer_head *bh, void *kaddr) +{ + return (unsigned char *)(kaddr + bh_offset(bh)); +} + +void *nilfs_palloc_block_get_entry(const struct inode *inode, __u64 nr, + const struct buffer_head *bh, void *kaddr) +{ + unsigned long entry_offset, group_offset; + + nilfs_palloc_group(inode, nr, &group_offset); + entry_offset = group_offset % NILFS_MDT(inode)->mi_entries_per_block; + + return kaddr + bh_offset(bh) + + entry_offset * NILFS_MDT(inode)->mi_entry_size; +} + +static int nilfs_palloc_find_available_slot(struct inode *inode, + unsigned long group, + unsigned long target, + unsigned char *bitmap, + int bsize) /* size in bits */ +{ + int curr, pos, end, i; + + if (target > 0) { + end = (target + BITS_PER_LONG - 1) & ~(BITS_PER_LONG - 1); + if (end > bsize) + end = bsize; + pos = nilfs_find_next_zero_bit(bitmap, end, target); + if (pos < end && + !nilfs_set_bit_atomic( + nilfs_mdt_bgl_lock(inode, group), pos, bitmap)) + return pos; + } else + end = 0; + + for (i = 0, curr = end; + i < bsize; + i += BITS_PER_LONG, curr += BITS_PER_LONG) { + /* wrap around */ + if (curr >= bsize) + curr = 0; + while (*((unsigned long *)bitmap + curr / BITS_PER_LONG) + != ~0UL) { + end = curr + BITS_PER_LONG; + if (end > bsize) + end = bsize; + pos = nilfs_find_next_zero_bit(bitmap, end, curr); + if ((pos < end) && + !nilfs_set_bit_atomic( + nilfs_mdt_bgl_lock(inode, group), pos, + bitmap)) + return pos; + } + } + return -ENOSPC; +} + +static unsigned long +nilfs_palloc_rest_groups_in_desc_block(const struct inode *inode, + unsigned long curr, unsigned long max) +{ + return min_t(unsigned long, + nilfs_palloc_groups_per_desc_block(inode) - + curr % nilfs_palloc_groups_per_desc_block(inode), + max - curr + 1); +} + +int nilfs_palloc_prepare_alloc_entry(struct inode *inode, + struct nilfs_palloc_req *req) +{ + struct buffer_head *desc_bh, *bitmap_bh; + struct nilfs_palloc_group_desc *desc; + unsigned char *bitmap; + void *desc_kaddr, *bitmap_kaddr; + unsigned long group, maxgroup, ngroups; + unsigned long group_offset, maxgroup_offset; + unsigned long n, entries_per_group, groups_per_desc_block; + unsigned long i, j; + int pos, ret; + + ngroups = nilfs_palloc_groups_count(inode); + maxgroup = ngroups - 1; + group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset); + entries_per_group = nilfs_palloc_entries_per_group(inode); + groups_per_desc_block = nilfs_palloc_groups_per_desc_block(inode); + + for (i = 0; i < ngroups; i += n) { + if (group >= ngroups) { + /* wrap around */ + group = 0; + maxgroup = nilfs_palloc_group(inode, req->pr_entry_nr, + &maxgroup_offset) - 1; + } + ret = nilfs_palloc_get_desc_block(inode, group, 1, &desc_bh); + if (ret < 0) + return ret; + desc_kaddr = kmap(desc_bh->b_page); + desc = nilfs_palloc_block_get_group_desc( + inode, group, desc_bh, desc_kaddr); + n = nilfs_palloc_rest_groups_in_desc_block(inode, group, + maxgroup); + for (j = 0; j < n; j++, desc++, group++) { + if (nilfs_palloc_group_desc_nfrees(inode, group, desc) + > 0) { + ret = nilfs_palloc_get_bitmap_block( + inode, group, 1, &bitmap_bh); + if (ret < 0) + goto out_desc; + bitmap_kaddr = kmap(bitmap_bh->b_page); + bitmap = nilfs_palloc_block_get_bitmap( + inode, bitmap_bh, bitmap_kaddr); + pos = nilfs_palloc_find_available_slot( + inode, group, group_offset, bitmap, + entries_per_group); + if (pos >= 0) { + /* found a free entry */ + nilfs_palloc_group_desc_add_entries( + inode, group, desc, -1); + req->pr_entry_nr = + entries_per_group * group + pos; + kunmap(desc_bh->b_page); + kunmap(bitmap_bh->b_page); + + req->pr_desc_bh = desc_bh; + req->pr_bitmap_bh = bitmap_bh; + return 0; + } + kunmap(bitmap_bh->b_page); + brelse(bitmap_bh); + } + + group_offset = 0; + } + + kunmap(desc_bh->b_page); + brelse(desc_bh); + } + + /* no entries left */ + return -ENOSPC; + + out_desc: + kunmap(desc_bh->b_page); + brelse(desc_bh); + return ret; +} + +void nilfs_palloc_commit_alloc_entry(struct inode *inode, + struct nilfs_palloc_req *req) +{ + nilfs_mdt_mark_buffer_dirty(req->pr_bitmap_bh); + nilfs_mdt_mark_buffer_dirty(req->pr_desc_bh); + nilfs_mdt_mark_dirty(inode); + + brelse(req->pr_bitmap_bh); + brelse(req->pr_desc_bh); +} + +void nilfs_palloc_commit_free_entry(struct inode *inode, + struct nilfs_palloc_req *req) +{ + struct nilfs_palloc_group_desc *desc; + unsigned long group, group_offset; + unsigned char *bitmap; + void *desc_kaddr, *bitmap_kaddr; + + group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset); + desc_kaddr = kmap(req->pr_desc_bh->b_page); + desc = nilfs_palloc_block_get_group_desc(inode, group, + req->pr_desc_bh, desc_kaddr); + bitmap_kaddr = kmap(req->pr_bitmap_bh->b_page); + bitmap = nilfs_palloc_block_get_bitmap(inode, req->pr_bitmap_bh, + bitmap_kaddr); + + if (!nilfs_clear_bit_atomic(nilfs_mdt_bgl_lock(inode, group), + group_offset, bitmap)) + printk(KERN_WARNING "%s: entry number %llu already freed\n", + __func__, (unsigned long long)req->pr_entry_nr); + + nilfs_palloc_group_desc_add_entries(inode, group, desc, 1); + + kunmap(req->pr_bitmap_bh->b_page); + kunmap(req->pr_desc_bh->b_page); + + nilfs_mdt_mark_buffer_dirty(req->pr_desc_bh); + nilfs_mdt_mark_buffer_dirty(req->pr_bitmap_bh); + nilfs_mdt_mark_dirty(inode); + + brelse(req->pr_bitmap_bh); + brelse(req->pr_desc_bh); +} + +void nilfs_palloc_abort_alloc_entry(struct inode *inode, + struct nilfs_palloc_req *req) +{ + struct nilfs_palloc_group_desc *desc; + void *desc_kaddr, *bitmap_kaddr; + unsigned char *bitmap; + unsigned long group, group_offset; + + group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset); + desc_kaddr = kmap(req->pr_desc_bh->b_page); + desc = nilfs_palloc_block_get_group_desc(inode, group, + req->pr_desc_bh, desc_kaddr); + bitmap_kaddr = kmap(req->pr_bitmap_bh->b_page); + bitmap = nilfs_palloc_block_get_bitmap(inode, req->pr_bitmap_bh, + bitmap_kaddr); + if (!nilfs_clear_bit_atomic(nilfs_mdt_bgl_lock(inode, group), + group_offset, bitmap)) + printk(KERN_WARNING "%s: entry numer %llu already freed\n", + __func__, (unsigned long long)req->pr_entry_nr); + + nilfs_palloc_group_desc_add_entries(inode, group, desc, 1); + + kunmap(req->pr_bitmap_bh->b_page); + kunmap(req->pr_desc_bh->b_page); + + brelse(req->pr_bitmap_bh); + brelse(req->pr_desc_bh); + + req->pr_entry_nr = 0; + req->pr_bitmap_bh = NULL; + req->pr_desc_bh = NULL; +} + +int nilfs_palloc_prepare_free_entry(struct inode *inode, + struct nilfs_palloc_req *req) +{ + struct buffer_head *desc_bh, *bitmap_bh; + unsigned long group, group_offset; + int ret; + + group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset); + ret = nilfs_palloc_get_desc_block(inode, group, 1, &desc_bh); + if (ret < 0) + return ret; + ret = nilfs_palloc_get_bitmap_block(inode, group, 1, &bitmap_bh); + if (ret < 0) { + brelse(desc_bh); + return ret; + } + + req->pr_desc_bh = desc_bh; + req->pr_bitmap_bh = bitmap_bh; + return 0; +} + +void nilfs_palloc_abort_free_entry(struct inode *inode, + struct nilfs_palloc_req *req) +{ + brelse(req->pr_bitmap_bh); + brelse(req->pr_desc_bh); + + req->pr_entry_nr = 0; + req->pr_bitmap_bh = NULL; + req->pr_desc_bh = NULL; +} + +static int +nilfs_palloc_group_is_in(struct inode *inode, unsigned long group, __u64 nr) +{ + __u64 first, last; + + first = group * nilfs_palloc_entries_per_group(inode); + last = first + nilfs_palloc_entries_per_group(inode) - 1; + return (nr >= first) && (nr <= last); +} + +int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems) +{ + struct buffer_head *desc_bh, *bitmap_bh; + struct nilfs_palloc_group_desc *desc; + unsigned char *bitmap; + void *desc_kaddr, *bitmap_kaddr; + unsigned long group, group_offset; + int i, j, n, ret; + + for (i = 0; i < nitems; i += n) { + group = nilfs_palloc_group(inode, entry_nrs[i], &group_offset); + ret = nilfs_palloc_get_desc_block(inode, group, 0, &desc_bh); + if (ret < 0) + return ret; + ret = nilfs_palloc_get_bitmap_block(inode, group, 0, + &bitmap_bh); + if (ret < 0) { + brelse(desc_bh); + return ret; + } + desc_kaddr = kmap(desc_bh->b_page); + desc = nilfs_palloc_block_get_group_desc( + inode, group, desc_bh, desc_kaddr); + bitmap_kaddr = kmap(bitmap_bh->b_page); + bitmap = nilfs_palloc_block_get_bitmap( + inode, bitmap_bh, bitmap_kaddr); + for (j = i, n = 0; + (j < nitems) && nilfs_palloc_group_is_in(inode, group, + entry_nrs[j]); + j++, n++) { + nilfs_palloc_group(inode, entry_nrs[j], &group_offset); + if (!nilfs_clear_bit_atomic( + nilfs_mdt_bgl_lock(inode, group), + group_offset, bitmap)) { + printk(KERN_WARNING + "%s: entry number %llu already freed\n", + __func__, + (unsigned long long)entry_nrs[j]); + } + } + nilfs_palloc_group_desc_add_entries(inode, group, desc, n); + + kunmap(bitmap_bh->b_page); + kunmap(desc_bh->b_page); + + nilfs_mdt_mark_buffer_dirty(desc_bh); + nilfs_mdt_mark_buffer_dirty(bitmap_bh); + nilfs_mdt_mark_dirty(inode); + + brelse(bitmap_bh); + brelse(desc_bh); + } + return 0; +} diff --git a/fs/nilfs2/alloc.h b/fs/nilfs2/alloc.h new file mode 100644 index 000000000000..4ace5475c2c7 --- /dev/null +++ b/fs/nilfs2/alloc.h @@ -0,0 +1,72 @@ +/* + * alloc.h - persistent object (dat entry/disk inode) allocator/deallocator + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Original code was written by Koji Sato <koji@osrg.net>. + * Two allocators were unified by Ryusuke Konishi <ryusuke@osrg.net>, + * Amagai Yoshiji <amagai@osrg.net>. + */ + +#ifndef _NILFS_ALLOC_H +#define _NILFS_ALLOC_H + +#include <linux/types.h> +#include <linux/buffer_head.h> +#include <linux/fs.h> + +static inline unsigned long +nilfs_palloc_entries_per_group(const struct inode *inode) +{ + return 1UL << (inode->i_blkbits + 3 /* log2(8 = CHAR_BITS) */); +} + +int nilfs_palloc_init_blockgroup(struct inode *, unsigned); +int nilfs_palloc_get_entry_block(struct inode *, __u64, int, + struct buffer_head **); +void *nilfs_palloc_block_get_entry(const struct inode *, __u64, + const struct buffer_head *, void *); + +/** + * nilfs_palloc_req - persistent alloctor request and reply + * @pr_entry_nr: entry number (vblocknr or inode number) + * @pr_desc_bh: buffer head of the buffer containing block group descriptors + * @pr_bitmap_bh: buffer head of the buffer containing a block group bitmap + * @pr_entry_bh: buffer head of the buffer containing translation entries + */ +struct nilfs_palloc_req { + __u64 pr_entry_nr; + struct buffer_head *pr_desc_bh; + struct buffer_head *pr_bitmap_bh; + struct buffer_head *pr_entry_bh; +}; + +int nilfs_palloc_prepare_alloc_entry(struct inode *, + struct nilfs_palloc_req *); +void nilfs_palloc_commit_alloc_entry(struct inode *, + struct nilfs_palloc_req *); +void nilfs_palloc_abort_alloc_entry(struct inode *, struct nilfs_palloc_req *); +void nilfs_palloc_commit_free_entry(struct inode *, struct nilfs_palloc_req *); +int nilfs_palloc_prepare_free_entry(struct inode *, struct nilfs_palloc_req *); +void nilfs_palloc_abort_free_entry(struct inode *, struct nilfs_palloc_req *); +int nilfs_palloc_freev(struct inode *, __u64 *, size_t); + +#define nilfs_set_bit_atomic ext2_set_bit_atomic +#define nilfs_clear_bit_atomic ext2_clear_bit_atomic +#define nilfs_find_next_zero_bit ext2_find_next_zero_bit + +#endif /* _NILFS_ALLOC_H */ diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c new file mode 100644 index 000000000000..24638e059bf3 --- /dev/null +++ b/fs/nilfs2/bmap.c @@ -0,0 +1,783 @@ +/* + * bmap.c - NILFS block mapping. + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato <koji@osrg.net>. + */ + +#include <linux/fs.h> +#include <linux/string.h> +#include <linux/errno.h> +#include "nilfs.h" +#include "bmap.h" +#include "sb.h" +#include "btnode.h" +#include "mdt.h" +#include "dat.h" +#include "alloc.h" + +int nilfs_bmap_lookup_at_level(struct nilfs_bmap *bmap, __u64 key, int level, + __u64 *ptrp) +{ + __u64 ptr; + int ret; + + down_read(&bmap->b_sem); + ret = bmap->b_ops->bop_lookup(bmap, key, level, ptrp); + if (ret < 0) + goto out; + if (bmap->b_pops->bpop_translate != NULL) { + ret = bmap->b_pops->bpop_translate(bmap, *ptrp, &ptr); + if (ret < 0) + goto out; + *ptrp = ptr; + } + + out: + up_read(&bmap->b_sem); + return ret; +} + + +/** + * nilfs_bmap_lookup - find a record + * @bmap: bmap + * @key: key + * @recp: pointer to record + * + * Description: nilfs_bmap_lookup() finds a record whose key matches @key in + * @bmap. + * + * Return Value: On success, 0 is returned and the record associated with @key + * is stored in the place pointed by @recp. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOENT - A record associated with @key does not exist. + */ +int nilfs_bmap_lookup(struct nilfs_bmap *bmap, + unsigned long key, + unsigned long *recp) +{ + __u64 ptr; + int ret; + + /* XXX: use macro for level 1 */ + ret = nilfs_bmap_lookup_at_level(bmap, key, 1, &ptr); + if (recp != NULL) + *recp = ptr; + return ret; +} + +static int nilfs_bmap_do_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr) +{ + __u64 keys[NILFS_BMAP_SMALL_HIGH + 1]; + __u64 ptrs[NILFS_BMAP_SMALL_HIGH + 1]; + int ret, n; + + if (bmap->b_ops->bop_check_insert != NULL) { + ret = bmap->b_ops->bop_check_insert(bmap, key); + if (ret > 0) { + n = bmap->b_ops->bop_gather_data( + bmap, keys, ptrs, NILFS_BMAP_SMALL_HIGH + 1); + if (n < 0) + return n; + ret = nilfs_btree_convert_and_insert( + bmap, key, ptr, keys, ptrs, n, + NILFS_BMAP_LARGE_LOW, NILFS_BMAP_LARGE_HIGH); + if (ret == 0) + bmap->b_u.u_flags |= NILFS_BMAP_LARGE; + + return ret; + } else if (ret < 0) + return ret; + } + + return bmap->b_ops->bop_insert(bmap, key, ptr); +} + +/** + * nilfs_bmap_insert - insert a new key-record pair into a bmap + * @bmap: bmap + * @key: key + * @rec: record + * + * Description: nilfs_bmap_insert() inserts the new key-record pair specified + * by @key and @rec into @bmap. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-EEXIST - A record associated with @key already exist. + */ +int nilfs_bmap_insert(struct nilfs_bmap *bmap, + unsigned long key, + unsigned long rec) +{ + int ret; + + down_write(&bmap->b_sem); + ret = nilfs_bmap_do_insert(bmap, key, rec); + up_write(&bmap->b_sem); + return ret; +} + +static int nilfs_bmap_do_delete(struct nilfs_bmap *bmap, __u64 key) +{ + __u64 keys[NILFS_BMAP_LARGE_LOW + 1]; + __u64 ptrs[NILFS_BMAP_LARGE_LOW + 1]; + int ret, n; + + if (bmap->b_ops->bop_check_delete != NULL) { + ret = bmap->b_ops->bop_check_delete(bmap, key); + if (ret > 0) { + n = bmap->b_ops->bop_gather_data( + bmap, keys, ptrs, NILFS_BMAP_LARGE_LOW + 1); + if (n < 0) + return n; + ret = nilfs_direct_delete_and_convert( + bmap, key, keys, ptrs, n, + NILFS_BMAP_SMALL_LOW, NILFS_BMAP_SMALL_HIGH); + if (ret == 0) + bmap->b_u.u_flags &= ~NILFS_BMAP_LARGE; + + return ret; + } else if (ret < 0) + return ret; + } + + return bmap->b_ops->bop_delete(bmap, key); +} + +int nilfs_bmap_last_key(struct nilfs_bmap *bmap, unsigned long *key) +{ + __u64 lastkey; + int ret; + + down_read(&bmap->b_sem); + ret = bmap->b_ops->bop_last_key(bmap, &lastkey); + if (!ret) + *key = lastkey; + up_read(&bmap->b_sem); + return ret; +} + +/** + * nilfs_bmap_delete - delete a key-record pair from a bmap + * @bmap: bmap + * @key: key + * + * Description: nilfs_bmap_delete() deletes the key-record pair specified by + * @key from @bmap. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOENT - A record associated with @key does not exist. + */ +int nilfs_bmap_delete(struct nilfs_bmap *bmap, unsigned long key) +{ + int ret; + + down_write(&bmap->b_sem); + ret = nilfs_bmap_do_delete(bmap, key); + up_write(&bmap->b_sem); + return ret; +} + +static int nilfs_bmap_do_truncate(struct nilfs_bmap *bmap, unsigned long key) +{ + __u64 lastkey; + int ret; + + ret = bmap->b_ops->bop_last_key(bmap, &lastkey); + if (ret < 0) { + if (ret == -ENOENT) + ret = 0; + return ret; + } + + while (key <= lastkey) { + ret = nilfs_bmap_do_delete(bmap, lastkey); + if (ret < 0) + return ret; + ret = bmap->b_ops->bop_last_key(bmap, &lastkey); + if (ret < 0) { + if (ret == -ENOENT) + ret = 0; + return ret; + } + } + return 0; +} + +/** + * nilfs_bmap_truncate - truncate a bmap to a specified key + * @bmap: bmap + * @key: key + * + * Description: nilfs_bmap_truncate() removes key-record pairs whose keys are + * greater than or equal to @key from @bmap. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + */ +int nilfs_bmap_truncate(struct nilfs_bmap *bmap, unsigned long key) +{ + int ret; + + down_write(&bmap->b_sem); + ret = nilfs_bmap_do_truncate(bmap, key); + up_write(&bmap->b_sem); + return ret; +} + +/** + * nilfs_bmap_clear - free resources a bmap holds + * @bmap: bmap + * + * Description: nilfs_bmap_clear() frees resources associated with @bmap. + */ +void nilfs_bmap_clear(struct nilfs_bmap *bmap) +{ + down_write(&bmap->b_sem); + if (bmap->b_ops->bop_clear != NULL) + bmap->b_ops->bop_clear(bmap); + up_write(&bmap->b_sem); +} + +/** + * nilfs_bmap_propagate - propagate dirty state + * @bmap: bmap + * @bh: buffer head + * + * Description: nilfs_bmap_propagate() marks the buffers that directly or + * indirectly refer to the block specified by @bh dirty. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + */ +int nilfs_bmap_propagate(struct nilfs_bmap *bmap, struct buffer_head *bh) +{ + int ret; + + down_write(&bmap->b_sem); + ret = bmap->b_ops->bop_propagate(bmap, bh); + up_write(&bmap->b_sem); + return ret; +} + +/** + * nilfs_bmap_lookup_dirty_buffers - + * @bmap: bmap + * @listp: pointer to buffer head list + */ +void nilfs_bmap_lookup_dirty_buffers(struct nilfs_bmap *bmap, + struct list_head *listp) +{ + if (bmap->b_ops->bop_lookup_dirty_buffers != NULL) + bmap->b_ops->bop_lookup_dirty_buffers(bmap, listp); +} + +/** + * nilfs_bmap_assign - assign a new block number to a block + * @bmap: bmap + * @bhp: pointer to buffer head + * @blocknr: block number + * @binfo: block information + * + * Description: nilfs_bmap_assign() assigns the block number @blocknr to the + * buffer specified by @bh. + * + * Return Value: On success, 0 is returned and the buffer head of a newly + * create buffer and the block information associated with the buffer are + * stored in the place pointed by @bh and @binfo, respectively. On error, one + * of the following negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + */ +int nilfs_bmap_assign(struct nilfs_bmap *bmap, + struct buffer_head **bh, + unsigned long blocknr, + union nilfs_binfo *binfo) +{ + int ret; + + down_write(&bmap->b_sem); + ret = bmap->b_ops->bop_assign(bmap, bh, blocknr, binfo); + up_write(&bmap->b_sem); + return ret; +} + +/** + * nilfs_bmap_mark - mark block dirty + * @bmap: bmap + * @key: key + * @level: level + * + * Description: nilfs_bmap_mark() marks the block specified by @key and @level + * as dirty. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + */ +int nilfs_bmap_mark(struct nilfs_bmap *bmap, __u64 key, int level) +{ + int ret; + + if (bmap->b_ops->bop_mark == NULL) + return 0; + + down_write(&bmap->b_sem); + ret = bmap->b_ops->bop_mark(bmap, key, level); + up_write(&bmap->b_sem); + return ret; +} + +/** + * nilfs_bmap_test_and_clear_dirty - test and clear a bmap dirty state + * @bmap: bmap + * + * Description: nilfs_test_and_clear() is the atomic operation to test and + * clear the dirty state of @bmap. + * + * Return Value: 1 is returned if @bmap is dirty, or 0 if clear. + */ +int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *bmap) +{ + int ret; + + down_write(&bmap->b_sem); + ret = nilfs_bmap_dirty(bmap); + nilfs_bmap_clear_dirty(bmap); + up_write(&bmap->b_sem); + return ret; +} + + +/* + * Internal use only + */ + +void nilfs_bmap_add_blocks(const struct nilfs_bmap *bmap, int n) +{ + inode_add_bytes(bmap->b_inode, (1 << bmap->b_inode->i_blkbits) * n); + if (NILFS_MDT(bmap->b_inode)) + nilfs_mdt_mark_dirty(bmap->b_inode); + else + mark_inode_dirty(bmap->b_inode); +} + +void nilfs_bmap_sub_blocks(const struct nilfs_bmap *bmap, int n) +{ + inode_sub_bytes(bmap->b_inode, (1 << bmap->b_inode->i_blkbits) * n); + if (NILFS_MDT(bmap->b_inode)) + nilfs_mdt_mark_dirty(bmap->b_inode); + else + mark_inode_dirty(bmap->b_inode); +} + +int nilfs_bmap_get_block(const struct nilfs_bmap *bmap, __u64 ptr, + struct buffer_head **bhp) +{ + return nilfs_btnode_get(&NILFS_BMAP_I(bmap)->i_btnode_cache, + ptr, 0, bhp, 0); +} + +void nilfs_bmap_put_block(const struct nilfs_bmap *bmap, + struct buffer_head *bh) +{ + brelse(bh); +} + +int nilfs_bmap_get_new_block(const struct nilfs_bmap *bmap, __u64 ptr, + struct buffer_head **bhp) +{ + int ret; + + ret = nilfs_btnode_get(&NILFS_BMAP_I(bmap)->i_btnode_cache, + ptr, 0, bhp, 1); + if (ret < 0) + return ret; + set_buffer_nilfs_volatile(*bhp); + return 0; +} + +void nilfs_bmap_delete_block(const struct nilfs_bmap *bmap, + struct buffer_head *bh) +{ + nilfs_btnode_delete(bh); +} + +__u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *bmap, + const struct buffer_head *bh) +{ + struct buffer_head *pbh; + __u64 key; + + key = page_index(bh->b_page) << (PAGE_CACHE_SHIFT - + bmap->b_inode->i_blkbits); + for (pbh = page_buffers(bh->b_page); pbh != bh; + pbh = pbh->b_this_page, key++); + + return key; +} + +__u64 nilfs_bmap_find_target_seq(const struct nilfs_bmap *bmap, __u64 key) +{ + __s64 diff; + + diff = key - bmap->b_last_allocated_key; + if ((nilfs_bmap_keydiff_abs(diff) < NILFS_INODE_BMAP_SIZE) && + (bmap->b_last_allocated_ptr != NILFS_BMAP_INVALID_PTR) && + (bmap->b_last_allocated_ptr + diff > 0)) + return bmap->b_last_allocated_ptr + diff; + else + return NILFS_BMAP_INVALID_PTR; +} + +static struct inode *nilfs_bmap_get_dat(const struct nilfs_bmap *bmap) +{ + return nilfs_dat_inode(NILFS_I_NILFS(bmap->b_inode)); +} + +#define NILFS_BMAP_GROUP_DIV 8 +__u64 nilfs_bmap_find_target_in_group(const struct nilfs_bmap *bmap) +{ + struct inode *dat = nilfs_bmap_get_dat(bmap); + unsigned long entries_per_group = nilfs_palloc_entries_per_group(dat); + unsigned long group = bmap->b_inode->i_ino / entries_per_group; + + return group * entries_per_group + + (bmap->b_inode->i_ino % NILFS_BMAP_GROUP_DIV) * + (entries_per_group / NILFS_BMAP_GROUP_DIV); +} + +static int nilfs_bmap_prepare_alloc_v(struct nilfs_bmap *bmap, + union nilfs_bmap_ptr_req *req) +{ + return nilfs_dat_prepare_alloc(nilfs_bmap_get_dat(bmap), &req->bpr_req); +} + +static void nilfs_bmap_commit_alloc_v(struct nilfs_bmap *bmap, + union nilfs_bmap_ptr_req *req) +{ + nilfs_dat_commit_alloc(nilfs_bmap_get_dat(bmap), &req->bpr_req); +} + +static void nilfs_bmap_abort_alloc_v(struct nilfs_bmap *bmap, + union nilfs_bmap_ptr_req *req) +{ + nilfs_dat_abort_alloc(nilfs_bmap_get_dat(bmap), &req->bpr_req); +} + +static int nilfs_bmap_prepare_start_v(struct nilfs_bmap *bmap, + union nilfs_bmap_ptr_req *req) +{ + return nilfs_dat_prepare_start(nilfs_bmap_get_dat(bmap), &req->bpr_req); +} + +static void nilfs_bmap_commit_start_v(struct nilfs_bmap *bmap, + union nilfs_bmap_ptr_req *req, + sector_t blocknr) +{ + nilfs_dat_commit_start(nilfs_bmap_get_dat(bmap), &req->bpr_req, + blocknr); +} + +static void nilfs_bmap_abort_start_v(struct nilfs_bmap *bmap, + union nilfs_bmap_ptr_req *req) +{ + nilfs_dat_abort_start(nilfs_bmap_get_dat(bmap), &req->bpr_req); +} + +static int nilfs_bmap_prepare_end_v(struct nilfs_bmap *bmap, + union nilfs_bmap_ptr_req *req) +{ + return nilfs_dat_prepare_end(nilfs_bmap_get_dat(bmap), &req->bpr_req); +} + +static void nilfs_bmap_commit_end_v(struct nilfs_bmap *bmap, + union nilfs_bmap_ptr_req *req) +{ + nilfs_dat_commit_end(nilfs_bmap_get_dat(bmap), &req->bpr_req, 0); +} + +static void nilfs_bmap_commit_end_vmdt(struct nilfs_bmap *bmap, + union nilfs_bmap_ptr_req *req) +{ + nilfs_dat_commit_end(nilfs_bmap_get_dat(bmap), &req->bpr_req, 1); +} + +static void nilfs_bmap_abort_end_v(struct nilfs_bmap *bmap, + union nilfs_bmap_ptr_req *req) +{ + nilfs_dat_abort_end(nilfs_bmap_get_dat(bmap), &req->bpr_req); +} + +int nilfs_bmap_move_v(const struct nilfs_bmap *bmap, __u64 vblocknr, + sector_t blocknr) +{ + return nilfs_dat_move(nilfs_bmap_get_dat(bmap), vblocknr, blocknr); +} + +int nilfs_bmap_mark_dirty(const struct nilfs_bmap *bmap, __u64 vblocknr) +{ + return nilfs_dat_mark_dirty(nilfs_bmap_get_dat(bmap), vblocknr); +} + +int nilfs_bmap_prepare_update(struct nilfs_bmap *bmap, + union nilfs_bmap_ptr_req *oldreq, + union nilfs_bmap_ptr_req *newreq) +{ + int ret; + + ret = bmap->b_pops->bpop_prepare_end_ptr(bmap, oldreq); + if (ret < 0) + return ret; + ret = bmap->b_pops->bpop_prepare_alloc_ptr(bmap, newreq); + if (ret < 0) + bmap->b_pops->bpop_abort_end_ptr(bmap, oldreq); + + return ret; +} + +void nilfs_bmap_commit_update(struct nilfs_bmap *bmap, + union nilfs_bmap_ptr_req *oldreq, + union nilfs_bmap_ptr_req *newreq) +{ + bmap->b_pops->bpop_commit_end_ptr(bmap, oldreq); + bmap->b_pops->bpop_commit_alloc_ptr(bmap, newreq); +} + +void nilfs_bmap_abort_update(struct nilfs_bmap *bmap, + union nilfs_bmap_ptr_req *oldreq, + union nilfs_bmap_ptr_req *newreq) +{ + bmap->b_pops->bpop_abort_end_ptr(bmap, oldreq); + bmap->b_pops->bpop_abort_alloc_ptr(bmap, newreq); +} + +static int nilfs_bmap_translate_v(const struct nilfs_bmap *bmap, __u64 ptr, + __u64 *ptrp) +{ + sector_t blocknr; + int ret; + + ret = nilfs_dat_translate(nilfs_bmap_get_dat(bmap), ptr, &blocknr); + if (ret < 0) + return ret; + if (ptrp != NULL) + *ptrp = blocknr; + return 0; +} + +static int nilfs_bmap_prepare_alloc_p(struct nilfs_bmap *bmap, + union nilfs_bmap_ptr_req *req) +{ + /* ignore target ptr */ + req->bpr_ptr = bmap->b_last_allocated_ptr++; + return 0; +} + +static void nilfs_bmap_commit_alloc_p(struct nilfs_bmap *bmap, + union nilfs_bmap_ptr_req *req) +{ + /* do nothing */ +} + +static void nilfs_bmap_abort_alloc_p(struct nilfs_bmap *bmap, + union nilfs_bmap_ptr_req *req) +{ + bmap->b_last_allocated_ptr--; +} + +static const struct nilfs_bmap_ptr_operations nilfs_bmap_ptr_ops_v = { + .bpop_prepare_alloc_ptr = nilfs_bmap_prepare_alloc_v, + .bpop_commit_alloc_ptr = nilfs_bmap_commit_alloc_v, + .bpop_abort_alloc_ptr = nilfs_bmap_abort_alloc_v, + .bpop_prepare_start_ptr = nilfs_bmap_prepare_start_v, + .bpop_commit_start_ptr = nilfs_bmap_commit_start_v, + .bpop_abort_start_ptr = nilfs_bmap_abort_start_v, + .bpop_prepare_end_ptr = nilfs_bmap_prepare_end_v, + .bpop_commit_end_ptr = nilfs_bmap_commit_end_v, + .bpop_abort_end_ptr = nilfs_bmap_abort_end_v, + + .bpop_translate = nilfs_bmap_translate_v, +}; + +static const struct nilfs_bmap_ptr_operations nilfs_bmap_ptr_ops_vmdt = { + .bpop_prepare_alloc_ptr = nilfs_bmap_prepare_alloc_v, + .bpop_commit_alloc_ptr = nilfs_bmap_commit_alloc_v, + .bpop_abort_alloc_ptr = nilfs_bmap_abort_alloc_v, + .bpop_prepare_start_ptr = nilfs_bmap_prepare_start_v, + .bpop_commit_start_ptr = nilfs_bmap_commit_start_v, + .bpop_abort_start_ptr = nilfs_bmap_abort_start_v, + .bpop_prepare_end_ptr = nilfs_bmap_prepare_end_v, + .bpop_commit_end_ptr = nilfs_bmap_commit_end_vmdt, + .bpop_abort_end_ptr = nilfs_bmap_abort_end_v, + + .bpop_translate = nilfs_bmap_translate_v, +}; + +static const struct nilfs_bmap_ptr_operations nilfs_bmap_ptr_ops_p = { + .bpop_prepare_alloc_ptr = nilfs_bmap_prepare_alloc_p, + .bpop_commit_alloc_ptr = nilfs_bmap_commit_alloc_p, + .bpop_abort_alloc_ptr = nilfs_bmap_abort_alloc_p, + .bpop_prepare_start_ptr = NULL, + .bpop_commit_start_ptr = NULL, + .bpop_abort_start_ptr = NULL, + .bpop_prepare_end_ptr = NULL, + .bpop_commit_end_ptr = NULL, + .bpop_abort_end_ptr = NULL, + + .bpop_translate = NULL, +}; + +static const struct nilfs_bmap_ptr_operations nilfs_bmap_ptr_ops_gc = { + .bpop_prepare_alloc_ptr = NULL, + .bpop_commit_alloc_ptr = NULL, + .bpop_abort_alloc_ptr = NULL, + .bpop_prepare_start_ptr = NULL, + .bpop_commit_start_ptr = NULL, + .bpop_abort_start_ptr = NULL, + .bpop_prepare_end_ptr = NULL, + .bpop_commit_end_ptr = NULL, + .bpop_abort_end_ptr = NULL, + + .bpop_translate = NULL, +}; + +/** + * nilfs_bmap_read - read a bmap from an inode + * @bmap: bmap + * @raw_inode: on-disk inode + * + * Description: nilfs_bmap_read() initializes the bmap @bmap. + * + * Return Value: On success, 0 is returned. On error, the following negative + * error code is returned. + * + * %-ENOMEM - Insufficient amount of memory available. + */ +int nilfs_bmap_read(struct nilfs_bmap *bmap, struct nilfs_inode *raw_inode) +{ + if (raw_inode == NULL) + memset(bmap->b_u.u_data, 0, NILFS_BMAP_SIZE); + else + memcpy(bmap->b_u.u_data, raw_inode->i_bmap, NILFS_BMAP_SIZE); + + init_rwsem(&bmap->b_sem); + bmap->b_state = 0; + bmap->b_inode = &NILFS_BMAP_I(bmap)->vfs_inode; + switch (bmap->b_inode->i_ino) { + case NILFS_DAT_INO: + bmap->b_pops = &nilfs_bmap_ptr_ops_p; + bmap->b_last_allocated_key = 0; /* XXX: use macro */ + bmap->b_last_allocated_ptr = NILFS_BMAP_NEW_PTR_INIT; + break; + case NILFS_CPFILE_INO: + case NILFS_SUFILE_INO: + bmap->b_pops = &nilfs_bmap_ptr_ops_vmdt; + bmap->b_last_allocated_key = 0; /* XXX: use macro */ + bmap->b_last_allocated_ptr = NILFS_BMAP_INVALID_PTR; + break; + default: + bmap->b_pops = &nilfs_bmap_ptr_ops_v; + bmap->b_last_allocated_key = 0; /* XXX: use macro */ + bmap->b_last_allocated_ptr = NILFS_BMAP_INVALID_PTR; + break; + } + + return (bmap->b_u.u_flags & NILFS_BMAP_LARGE) ? + nilfs_btree_init(bmap, + NILFS_BMAP_LARGE_LOW, + NILFS_BMAP_LARGE_HIGH) : + nilfs_direct_init(bmap, + NILFS_BMAP_SMALL_LOW, + NILFS_BMAP_SMALL_HIGH); +} + +/** + * nilfs_bmap_write - write back a bmap to an inode + * @bmap: bmap + * @raw_inode: on-disk inode + * + * Description: nilfs_bmap_write() stores @bmap in @raw_inode. + */ +void nilfs_bmap_write(struct nilfs_bmap *bmap, struct nilfs_inode *raw_inode) +{ + down_write(&bmap->b_sem); + memcpy(raw_inode->i_bmap, bmap->b_u.u_data, + NILFS_INODE_BMAP_SIZE * sizeof(__le64)); + if (bmap->b_inode->i_ino == NILFS_DAT_INO) + bmap->b_last_allocated_ptr = NILFS_BMAP_NEW_PTR_INIT; + + up_write(&bmap->b_sem); +} + +void nilfs_bmap_init_gc(struct nilfs_bmap *bmap) +{ + memset(&bmap->b_u, 0, NILFS_BMAP_SIZE); + init_rwsem(&bmap->b_sem); + bmap->b_inode = &NILFS_BMAP_I(bmap)->vfs_inode; + bmap->b_pops = &nilfs_bmap_ptr_ops_gc; + bmap->b_last_allocated_key = 0; + bmap->b_last_allocated_ptr = NILFS_BMAP_INVALID_PTR; + bmap->b_state = 0; + nilfs_btree_init_gc(bmap); +} + +void nilfs_bmap_init_gcdat(struct nilfs_bmap *gcbmap, struct nilfs_bmap *bmap) +{ + memcpy(gcbmap, bmap, sizeof(union nilfs_bmap_union)); + init_rwsem(&gcbmap->b_sem); + gcbmap->b_inode = &NILFS_BMAP_I(gcbmap)->vfs_inode; +} + +void nilfs_bmap_commit_gcdat(struct nilfs_bmap *gcbmap, struct nilfs_bmap *bmap) +{ + memcpy(bmap, gcbmap, sizeof(union nilfs_bmap_union)); + init_rwsem(&bmap->b_sem); + bmap->b_inode = &NILFS_BMAP_I(bmap)->vfs_inode; +} diff --git a/fs/nilfs2/bmap.h b/fs/nilfs2/bmap.h new file mode 100644 index 000000000000..4f2708abb1ba --- /dev/null +++ b/fs/nilfs2/bmap.h @@ -0,0 +1,244 @@ +/* + * bmap.h - NILFS block mapping. + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato <koji@osrg.net>. + */ + +#ifndef _NILFS_BMAP_H +#define _NILFS_BMAP_H + +#include <linux/types.h> +#include <linux/fs.h> +#include <linux/buffer_head.h> +#include <linux/nilfs2_fs.h> +#include "alloc.h" + +#define NILFS_BMAP_INVALID_PTR 0 + +#define nilfs_bmap_dkey_to_key(dkey) le64_to_cpu(dkey) +#define nilfs_bmap_key_to_dkey(key) cpu_to_le64(key) +#define nilfs_bmap_dptr_to_ptr(dptr) le64_to_cpu(dptr) +#define nilfs_bmap_ptr_to_dptr(ptr) cpu_to_le64(ptr) + +#define nilfs_bmap_keydiff_abs(diff) ((diff) < 0 ? -(diff) : (diff)) + + +struct nilfs_bmap; + +/** + * union nilfs_bmap_ptr_req - request for bmap ptr + * @bpr_ptr: bmap pointer + * @bpr_req: request for persistent allocator + */ +union nilfs_bmap_ptr_req { + __u64 bpr_ptr; + struct nilfs_palloc_req bpr_req; +}; + +/** + * struct nilfs_bmap_stats - bmap statistics + * @bs_nblocks: number of blocks created or deleted + */ +struct nilfs_bmap_stats { + unsigned int bs_nblocks; +}; + +/** + * struct nilfs_bmap_operations - bmap operation table + */ +struct nilfs_bmap_operations { + int (*bop_lookup)(const struct nilfs_bmap *, __u64, int, __u64 *); + int (*bop_insert)(struct nilfs_bmap *, __u64, __u64); + int (*bop_delete)(struct nilfs_bmap *, __u64); + void (*bop_clear)(struct nilfs_bmap *); + + int (*bop_propagate)(const struct nilfs_bmap *, struct buffer_head *); + void (*bop_lookup_dirty_buffers)(struct nilfs_bmap *, + struct list_head *); + + int (*bop_assign)(struct nilfs_bmap *, + struct buffer_head **, + sector_t, + union nilfs_binfo *); + int (*bop_mark)(struct nilfs_bmap *, __u64, int); + + /* The following functions are internal use only. */ + int (*bop_last_key)(const struct nilfs_bmap *, __u64 *); + int (*bop_check_insert)(const struct nilfs_bmap *, __u64); + int (*bop_check_delete)(struct nilfs_bmap *, __u64); + int (*bop_gather_data)(struct nilfs_bmap *, __u64 *, __u64 *, int); +}; + + +/** + * struct nilfs_bmap_ptr_operations - bmap ptr operation table + */ +struct nilfs_bmap_ptr_operations { + int (*bpop_prepare_alloc_ptr)(struct nilfs_bmap *, + union nilfs_bmap_ptr_req *); + void (*bpop_commit_alloc_ptr)(struct nilfs_bmap *, + union nilfs_bmap_ptr_req *); + void (*bpop_abort_alloc_ptr)(struct nilfs_bmap *, + union nilfs_bmap_ptr_req *); + int (*bpop_prepare_start_ptr)(struct nilfs_bmap *, + union nilfs_bmap_ptr_req *); + void (*bpop_commit_start_ptr)(struct nilfs_bmap *, + union nilfs_bmap_ptr_req *, + sector_t); + void (*bpop_abort_start_ptr)(struct nilfs_bmap *, + union nilfs_bmap_ptr_req *); + int (*bpop_prepare_end_ptr)(struct nilfs_bmap *, + union nilfs_bmap_ptr_req *); + void (*bpop_commit_end_ptr)(struct nilfs_bmap *, + union nilfs_bmap_ptr_req *); + void (*bpop_abort_end_ptr)(struct nilfs_bmap *, + union nilfs_bmap_ptr_req *); + + int (*bpop_translate)(const struct nilfs_bmap *, __u64, __u64 *); +}; + + +#define NILFS_BMAP_SIZE (NILFS_INODE_BMAP_SIZE * sizeof(__le64)) +#define NILFS_BMAP_KEY_BIT (sizeof(unsigned long) * 8 /* CHAR_BIT */) +#define NILFS_BMAP_NEW_PTR_INIT \ + (1UL << (sizeof(unsigned long) * 8 /* CHAR_BIT */ - 1)) + +static inline int nilfs_bmap_is_new_ptr(unsigned long ptr) +{ + return !!(ptr & NILFS_BMAP_NEW_PTR_INIT); +} + + +/** + * struct nilfs_bmap - bmap structure + * @b_u: raw data + * @b_sem: semaphore + * @b_inode: owner of bmap + * @b_ops: bmap operation table + * @b_pops: bmap ptr operation table + * @b_low: low watermark of conversion + * @b_high: high watermark of conversion + * @b_last_allocated_key: last allocated key for data block + * @b_last_allocated_ptr: last allocated ptr for data block + * @b_state: state + */ +struct nilfs_bmap { + union { + __u8 u_flags; + __le64 u_data[NILFS_BMAP_SIZE / sizeof(__le64)]; + } b_u; + struct rw_semaphore b_sem; + struct inode *b_inode; + const struct nilfs_bmap_operations *b_ops; + const struct nilfs_bmap_ptr_operations *b_pops; + __u64 b_low; + __u64 b_high; + __u64 b_last_allocated_key; + __u64 b_last_allocated_ptr; + int b_state; +}; + +/* state */ +#define NILFS_BMAP_DIRTY 0x00000001 + + +int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *); +int nilfs_bmap_read(struct nilfs_bmap *, struct nilfs_inode *); +void nilfs_bmap_write(struct nilfs_bmap *, struct nilfs_inode *); +int nilfs_bmap_lookup(struct nilfs_bmap *, unsigned long, unsigned long *); +int nilfs_bmap_insert(struct nilfs_bmap *, unsigned long, unsigned long); +int nilfs_bmap_delete(struct nilfs_bmap *, unsigned long); +int nilfs_bmap_last_key(struct nilfs_bmap *, unsigned long *); +int nilfs_bmap_truncate(struct nilfs_bmap *, unsigned long); +void nilfs_bmap_clear(struct nilfs_bmap *); +int nilfs_bmap_propagate(struct nilfs_bmap *, struct buffer_head *); +void nilfs_bmap_lookup_dirty_buffers(struct nilfs_bmap *, struct list_head *); +int nilfs_bmap_assign(struct nilfs_bmap *, struct buffer_head **, + unsigned long, union nilfs_binfo *); +int nilfs_bmap_lookup_at_level(struct nilfs_bmap *, __u64, int, __u64 *); +int nilfs_bmap_mark(struct nilfs_bmap *, __u64, int); + +void nilfs_bmap_init_gc(struct nilfs_bmap *); +void nilfs_bmap_init_gcdat(struct nilfs_bmap *, struct nilfs_bmap *); +void nilfs_bmap_commit_gcdat(struct nilfs_bmap *, struct nilfs_bmap *); + + +/* + * Internal use only + */ + +int nilfs_bmap_move_v(const struct nilfs_bmap *, __u64, sector_t); +int nilfs_bmap_mark_dirty(const struct nilfs_bmap *, __u64); + + +__u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *, + const struct buffer_head *); + +__u64 nilfs_bmap_find_target_seq(const struct nilfs_bmap *, __u64); +__u64 nilfs_bmap_find_target_in_group(const struct nilfs_bmap *); + +int nilfs_bmap_prepare_update(struct nilfs_bmap *, + union nilfs_bmap_ptr_req *, + union nilfs_bmap_ptr_req *); +void nilfs_bmap_commit_update(struct nilfs_bmap *, + union nilfs_bmap_ptr_req *, + union nilfs_bmap_ptr_req *); +void nilfs_bmap_abort_update(struct nilfs_bmap *, + union nilfs_bmap_ptr_req *, + union nilfs_bmap_ptr_req *); + +void nilfs_bmap_add_blocks(const struct nilfs_bmap *, int); +void nilfs_bmap_sub_blocks(const struct nilfs_bmap *, int); + + +int nilfs_bmap_get_block(const struct nilfs_bmap *, __u64, + struct buffer_head **); +void nilfs_bmap_put_block(const struct nilfs_bmap *, struct buffer_head *); +int nilfs_bmap_get_new_block(const struct nilfs_bmap *, __u64, + struct buffer_head **); +void nilfs_bmap_delete_block(const struct nilfs_bmap *, struct buffer_head *); + + +/* Assume that bmap semaphore is locked. */ +static inline int nilfs_bmap_dirty(const struct nilfs_bmap *bmap) +{ + return !!(bmap->b_state & NILFS_BMAP_DIRTY); +} + +/* Assume that bmap semaphore is locked. */ +static inline void nilfs_bmap_set_dirty(struct nilfs_bmap *bmap) +{ + bmap->b_state |= NILFS_BMAP_DIRTY; +} + +/* Assume that bmap semaphore is locked. */ +static inline void nilfs_bmap_clear_dirty(struct nilfs_bmap *bmap) +{ + bmap->b_state &= ~NILFS_BMAP_DIRTY; +} + + +#define NILFS_BMAP_LARGE 0x1 + +#define NILFS_BMAP_SMALL_LOW NILFS_DIRECT_KEY_MIN +#define NILFS_BMAP_SMALL_HIGH NILFS_DIRECT_KEY_MAX +#define NILFS_BMAP_LARGE_LOW NILFS_BTREE_ROOT_NCHILDREN_MAX +#define NILFS_BMAP_LARGE_HIGH NILFS_BTREE_KEY_MAX + +#endif /* _NILFS_BMAP_H */ diff --git a/fs/nilfs2/bmap_union.h b/fs/nilfs2/bmap_union.h new file mode 100644 index 000000000000..d41509bff47b --- /dev/null +++ b/fs/nilfs2/bmap_union.h @@ -0,0 +1,42 @@ +/* + * bmap_union.h - NILFS block mapping. + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato <koji@osrg.net>. + */ + +#ifndef _NILFS_BMAP_UNION_H +#define _NILFS_BMAP_UNION_H + +#include "bmap.h" +#include "direct.h" +#include "btree.h" + +/** + * nilfs_bmap_union - + * @bi_bmap: bmap structure + * @bi_btree: direct map structure + * @bi_direct: B-tree structure + */ +union nilfs_bmap_union { + struct nilfs_bmap bi_bmap; + struct nilfs_direct bi_direct; + struct nilfs_btree bi_btree; +}; + +#endif /* _NILFS_BMAP_UNION_H */ diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c new file mode 100644 index 000000000000..4cc07b2c30e0 --- /dev/null +++ b/fs/nilfs2/btnode.c @@ -0,0 +1,316 @@ +/* + * btnode.c - NILFS B-tree node cache + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * This file was originally written by Seiji Kihara <kihara@osrg.net> + * and fully revised by Ryusuke Konishi <ryusuke@osrg.net> for + * stabilization and simplification. + * + */ + +#include <linux/types.h> +#include <linux/buffer_head.h> +#include <linux/mm.h> +#include <linux/backing-dev.h> +#include "nilfs.h" +#include "mdt.h" +#include "dat.h" +#include "page.h" +#include "btnode.h" + + +void nilfs_btnode_cache_init_once(struct address_space *btnc) +{ + INIT_RADIX_TREE(&btnc->page_tree, GFP_ATOMIC); + spin_lock_init(&btnc->tree_lock); + INIT_LIST_HEAD(&btnc->private_list); + spin_lock_init(&btnc->private_lock); + + spin_lock_init(&btnc->i_mmap_lock); + INIT_RAW_PRIO_TREE_ROOT(&btnc->i_mmap); + INIT_LIST_HEAD(&btnc->i_mmap_nonlinear); +} + +static struct address_space_operations def_btnode_aops; + +void nilfs_btnode_cache_init(struct address_space *btnc) +{ + btnc->host = NULL; /* can safely set to host inode ? */ + btnc->flags = 0; + mapping_set_gfp_mask(btnc, GFP_NOFS); + btnc->assoc_mapping = NULL; + btnc->backing_dev_info = &default_backing_dev_info; + btnc->a_ops = &def_btnode_aops; +} + +void nilfs_btnode_cache_clear(struct address_space *btnc) +{ + invalidate_mapping_pages(btnc, 0, -1); + truncate_inode_pages(btnc, 0); +} + +int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr, + sector_t pblocknr, struct buffer_head **pbh, + int newblk) +{ + struct buffer_head *bh; + struct inode *inode = NILFS_BTNC_I(btnc); + int err; + + bh = nilfs_grab_buffer(inode, btnc, blocknr, 1 << BH_NILFS_Node); + if (unlikely(!bh)) + return -ENOMEM; + + err = -EEXIST; /* internal code */ + if (newblk) { + if (unlikely(buffer_mapped(bh) || buffer_uptodate(bh) || + buffer_dirty(bh))) { + brelse(bh); + BUG(); + } + bh->b_bdev = NILFS_I_NILFS(inode)->ns_bdev; + bh->b_blocknr = blocknr; + set_buffer_mapped(bh); + set_buffer_uptodate(bh); + goto found; + } + + if (buffer_uptodate(bh) || buffer_dirty(bh)) + goto found; + + if (pblocknr == 0) { + pblocknr = blocknr; + if (inode->i_ino != NILFS_DAT_INO) { + struct inode *dat = + nilfs_dat_inode(NILFS_I_NILFS(inode)); + + /* blocknr is a virtual block number */ + err = nilfs_dat_translate(dat, blocknr, &pblocknr); + if (unlikely(err)) { + brelse(bh); + goto out_locked; + } + } + } + lock_buffer(bh); + if (buffer_uptodate(bh)) { + unlock_buffer(bh); + err = -EEXIST; /* internal code */ + goto found; + } + set_buffer_mapped(bh); + bh->b_bdev = NILFS_I_NILFS(inode)->ns_bdev; + bh->b_blocknr = pblocknr; /* set block address for read */ + bh->b_end_io = end_buffer_read_sync; + get_bh(bh); + submit_bh(READ, bh); + bh->b_blocknr = blocknr; /* set back to the given block address */ + err = 0; +found: + *pbh = bh; + +out_locked: + unlock_page(bh->b_page); + page_cache_release(bh->b_page); + return err; +} + +int nilfs_btnode_get(struct address_space *btnc, __u64 blocknr, + sector_t pblocknr, struct buffer_head **pbh, int newblk) +{ + struct buffer_head *bh; + int err; + + err = nilfs_btnode_submit_block(btnc, blocknr, pblocknr, pbh, newblk); + if (err == -EEXIST) /* internal code (cache hit) */ + return 0; + if (unlikely(err)) + return err; + + bh = *pbh; + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) { + brelse(bh); + return -EIO; + } + return 0; +} + +/** + * nilfs_btnode_delete - delete B-tree node buffer + * @bh: buffer to be deleted + * + * nilfs_btnode_delete() invalidates the specified buffer and delete the page + * including the buffer if the page gets unbusy. + */ +void nilfs_btnode_delete(struct buffer_head *bh) +{ + struct address_space *mapping; + struct page *page = bh->b_page; + pgoff_t index = page_index(page); + int still_dirty; + + page_cache_get(page); + lock_page(page); + wait_on_page_writeback(page); + + nilfs_forget_buffer(bh); + still_dirty = PageDirty(page); + mapping = page->mapping; + unlock_page(page); + page_cache_release(page); + + if (!still_dirty && mapping) + invalidate_inode_pages2_range(mapping, index, index); +} + +/** + * nilfs_btnode_prepare_change_key + * prepare to move contents of the block for old key to one of new key. + * the old buffer will not be removed, but might be reused for new buffer. + * it might return -ENOMEM because of memory allocation errors, + * and might return -EIO because of disk read errors. + */ +int nilfs_btnode_prepare_change_key(struct address_space *btnc, + struct nilfs_btnode_chkey_ctxt *ctxt) +{ + struct buffer_head *obh, *nbh; + struct inode *inode = NILFS_BTNC_I(btnc); + __u64 oldkey = ctxt->oldkey, newkey = ctxt->newkey; + int err; + + if (oldkey == newkey) + return 0; + + obh = ctxt->bh; + ctxt->newbh = NULL; + + if (inode->i_blkbits == PAGE_CACHE_SHIFT) { + lock_page(obh->b_page); + /* + * We cannot call radix_tree_preload for the kernels older + * than 2.6.23, because it is not exported for modules. + */ + err = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); + if (err) + goto failed_unlock; + /* BUG_ON(oldkey != obh->b_page->index); */ + if (unlikely(oldkey != obh->b_page->index)) + NILFS_PAGE_BUG(obh->b_page, + "invalid oldkey %lld (newkey=%lld)", + (unsigned long long)oldkey, + (unsigned long long)newkey); + +retry: + spin_lock_irq(&btnc->tree_lock); + err = radix_tree_insert(&btnc->page_tree, newkey, obh->b_page); + spin_unlock_irq(&btnc->tree_lock); + /* + * Note: page->index will not change to newkey until + * nilfs_btnode_commit_change_key() will be called. + * To protect the page in intermediate state, the page lock + * is held. + */ + radix_tree_preload_end(); + if (!err) + return 0; + else if (err != -EEXIST) + goto failed_unlock; + + err = invalidate_inode_pages2_range(btnc, newkey, newkey); + if (!err) + goto retry; + /* fallback to copy mode */ + unlock_page(obh->b_page); + } + + err = nilfs_btnode_get(btnc, newkey, 0, &nbh, 1); + if (likely(!err)) { + BUG_ON(nbh == obh); + ctxt->newbh = nbh; + } + return err; + + failed_unlock: + unlock_page(obh->b_page); + return err; +} + +/** + * nilfs_btnode_commit_change_key + * commit the change_key operation prepared by prepare_change_key(). + */ +void nilfs_btnode_commit_change_key(struct address_space *btnc, + struct nilfs_btnode_chkey_ctxt *ctxt) +{ + struct buffer_head *obh = ctxt->bh, *nbh = ctxt->newbh; + __u64 oldkey = ctxt->oldkey, newkey = ctxt->newkey; + struct page *opage; + + if (oldkey == newkey) + return; + + if (nbh == NULL) { /* blocksize == pagesize */ + opage = obh->b_page; + if (unlikely(oldkey != opage->index)) + NILFS_PAGE_BUG(opage, + "invalid oldkey %lld (newkey=%lld)", + (unsigned long long)oldkey, + (unsigned long long)newkey); + if (!test_set_buffer_dirty(obh) && TestSetPageDirty(opage)) + BUG(); + + spin_lock_irq(&btnc->tree_lock); + radix_tree_delete(&btnc->page_tree, oldkey); + radix_tree_tag_set(&btnc->page_tree, newkey, + PAGECACHE_TAG_DIRTY); + spin_unlock_irq(&btnc->tree_lock); + + opage->index = obh->b_blocknr = newkey; + unlock_page(opage); + } else { + nilfs_copy_buffer(nbh, obh); + nilfs_btnode_mark_dirty(nbh); + + nbh->b_blocknr = newkey; + ctxt->bh = nbh; + nilfs_btnode_delete(obh); /* will decrement bh->b_count */ + } +} + +/** + * nilfs_btnode_abort_change_key + * abort the change_key operation prepared by prepare_change_key(). + */ +void nilfs_btnode_abort_change_key(struct address_space *btnc, + struct nilfs_btnode_chkey_ctxt *ctxt) +{ + struct buffer_head *nbh = ctxt->newbh; + __u64 oldkey = ctxt->oldkey, newkey = ctxt->newkey; + + if (oldkey == newkey) + return; + + if (nbh == NULL) { /* blocksize == pagesize */ + spin_lock_irq(&btnc->tree_lock); + radix_tree_delete(&btnc->page_tree, newkey); + spin_unlock_irq(&btnc->tree_lock); + unlock_page(ctxt->bh->b_page); + } else + brelse(nbh); +} diff --git a/fs/nilfs2/btnode.h b/fs/nilfs2/btnode.h new file mode 100644 index 000000000000..35faa86444a7 --- /dev/null +++ b/fs/nilfs2/btnode.h @@ -0,0 +1,58 @@ +/* + * btnode.h - NILFS B-tree node cache + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Seiji Kihara <kihara@osrg.net> + * Revised by Ryusuke Konishi <ryusuke@osrg.net> + */ + +#ifndef _NILFS_BTNODE_H +#define _NILFS_BTNODE_H + +#include <linux/types.h> +#include <linux/buffer_head.h> +#include <linux/fs.h> +#include <linux/backing-dev.h> + + +struct nilfs_btnode_chkey_ctxt { + __u64 oldkey; + __u64 newkey; + struct buffer_head *bh; + struct buffer_head *newbh; +}; + +void nilfs_btnode_cache_init_once(struct address_space *); +void nilfs_btnode_cache_init(struct address_space *); +void nilfs_btnode_cache_clear(struct address_space *); +int nilfs_btnode_submit_block(struct address_space *, __u64, sector_t, + struct buffer_head **, int); +int nilfs_btnode_get(struct address_space *, __u64, sector_t, + struct buffer_head **, int); +void nilfs_btnode_delete(struct buffer_head *); +int nilfs_btnode_prepare_change_key(struct address_space *, + struct nilfs_btnode_chkey_ctxt *); +void nilfs_btnode_commit_change_key(struct address_space *, + struct nilfs_btnode_chkey_ctxt *); +void nilfs_btnode_abort_change_key(struct address_space *, + struct nilfs_btnode_chkey_ctxt *); + +#define nilfs_btnode_mark_dirty(bh) nilfs_mark_buffer_dirty(bh) + + +#endif /* _NILFS_BTNODE_H */ diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c new file mode 100644 index 000000000000..6b37a2767293 --- /dev/null +++ b/fs/nilfs2/btree.c @@ -0,0 +1,2269 @@ +/* + * btree.c - NILFS B-tree. + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato <koji@osrg.net>. + */ + +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/errno.h> +#include <linux/pagevec.h> +#include "nilfs.h" +#include "page.h" +#include "btnode.h" +#include "btree.h" +#include "alloc.h" + +/** + * struct nilfs_btree_path - A path on which B-tree operations are executed + * @bp_bh: buffer head of node block + * @bp_sib_bh: buffer head of sibling node block + * @bp_index: index of child node + * @bp_oldreq: ptr end request for old ptr + * @bp_newreq: ptr alloc request for new ptr + * @bp_op: rebalance operation + */ +struct nilfs_btree_path { + struct buffer_head *bp_bh; + struct buffer_head *bp_sib_bh; + int bp_index; + union nilfs_bmap_ptr_req bp_oldreq; + union nilfs_bmap_ptr_req bp_newreq; + struct nilfs_btnode_chkey_ctxt bp_ctxt; + void (*bp_op)(struct nilfs_btree *, struct nilfs_btree_path *, + int, __u64 *, __u64 *); +}; + +/* + * B-tree path operations + */ + +static struct kmem_cache *nilfs_btree_path_cache; + +int __init nilfs_btree_path_cache_init(void) +{ + nilfs_btree_path_cache = + kmem_cache_create("nilfs2_btree_path_cache", + sizeof(struct nilfs_btree_path) * + NILFS_BTREE_LEVEL_MAX, 0, 0, NULL); + return (nilfs_btree_path_cache != NULL) ? 0 : -ENOMEM; +} + +void nilfs_btree_path_cache_destroy(void) +{ + kmem_cache_destroy(nilfs_btree_path_cache); +} + +static inline struct nilfs_btree_path * +nilfs_btree_alloc_path(const struct nilfs_btree *btree) +{ + return (struct nilfs_btree_path *) + kmem_cache_alloc(nilfs_btree_path_cache, GFP_NOFS); +} + +static inline void nilfs_btree_free_path(const struct nilfs_btree *btree, + struct nilfs_btree_path *path) +{ + kmem_cache_free(nilfs_btree_path_cache, path); +} + +static void nilfs_btree_init_path(const struct nilfs_btree *btree, + struct nilfs_btree_path *path) +{ + int level; + + for (level = NILFS_BTREE_LEVEL_DATA; + level < NILFS_BTREE_LEVEL_MAX; + level++) { + path[level].bp_bh = NULL; + path[level].bp_sib_bh = NULL; + path[level].bp_index = 0; + path[level].bp_oldreq.bpr_ptr = NILFS_BMAP_INVALID_PTR; + path[level].bp_newreq.bpr_ptr = NILFS_BMAP_INVALID_PTR; + path[level].bp_op = NULL; + } +} + +static void nilfs_btree_clear_path(const struct nilfs_btree *btree, + struct nilfs_btree_path *path) +{ + int level; + + for (level = NILFS_BTREE_LEVEL_DATA; + level < NILFS_BTREE_LEVEL_MAX; + level++) { + if (path[level].bp_bh != NULL) { + nilfs_bmap_put_block(&btree->bt_bmap, + path[level].bp_bh); + path[level].bp_bh = NULL; + } + /* sib_bh is released or deleted by prepare or commit + * operations. */ + path[level].bp_sib_bh = NULL; + path[level].bp_index = 0; + path[level].bp_oldreq.bpr_ptr = NILFS_BMAP_INVALID_PTR; + path[level].bp_newreq.bpr_ptr = NILFS_BMAP_INVALID_PTR; + path[level].bp_op = NULL; + } +} + + +/* + * B-tree node operations + */ + +static inline int +nilfs_btree_node_get_flags(const struct nilfs_btree *btree, + const struct nilfs_btree_node *node) +{ + return node->bn_flags; +} + +static inline void +nilfs_btree_node_set_flags(struct nilfs_btree *btree, + struct nilfs_btree_node *node, + int flags) +{ + node->bn_flags = flags; +} + +static inline int nilfs_btree_node_root(const struct nilfs_btree *btree, + const struct nilfs_btree_node *node) +{ + return nilfs_btree_node_get_flags(btree, node) & NILFS_BTREE_NODE_ROOT; +} + +static inline int +nilfs_btree_node_get_level(const struct nilfs_btree *btree, + const struct nilfs_btree_node *node) +{ + return node->bn_level; +} + +static inline void +nilfs_btree_node_set_level(struct nilfs_btree *btree, + struct nilfs_btree_node *node, + int level) +{ + node->bn_level = level; +} + +static inline int +nilfs_btree_node_get_nchildren(const struct nilfs_btree *btree, + const struct nilfs_btree_node *node) +{ + return le16_to_cpu(node->bn_nchildren); +} + +static inline void +nilfs_btree_node_set_nchildren(struct nilfs_btree *btree, + struct nilfs_btree_node *node, + int nchildren) +{ + node->bn_nchildren = cpu_to_le16(nchildren); +} + +static inline int +nilfs_btree_node_size(const struct nilfs_btree *btree) +{ + return 1 << btree->bt_bmap.b_inode->i_blkbits; +} + +static inline int +nilfs_btree_node_nchildren_min(const struct nilfs_btree *btree, + const struct nilfs_btree_node *node) +{ + return nilfs_btree_node_root(btree, node) ? + NILFS_BTREE_ROOT_NCHILDREN_MIN : + NILFS_BTREE_NODE_NCHILDREN_MIN(nilfs_btree_node_size(btree)); +} + +static inline int +nilfs_btree_node_nchildren_max(const struct nilfs_btree *btree, + const struct nilfs_btree_node *node) +{ + return nilfs_btree_node_root(btree, node) ? + NILFS_BTREE_ROOT_NCHILDREN_MAX : + NILFS_BTREE_NODE_NCHILDREN_MAX(nilfs_btree_node_size(btree)); +} + +static inline __le64 * +nilfs_btree_node_dkeys(const struct nilfs_btree *btree, + const struct nilfs_btree_node *node) +{ + return (__le64 *)((char *)(node + 1) + + (nilfs_btree_node_root(btree, node) ? + 0 : NILFS_BTREE_NODE_EXTRA_PAD_SIZE)); +} + +static inline __le64 * +nilfs_btree_node_dptrs(const struct nilfs_btree *btree, + const struct nilfs_btree_node *node) +{ + return (__le64 *)(nilfs_btree_node_dkeys(btree, node) + + nilfs_btree_node_nchildren_max(btree, node)); +} + +static inline __u64 +nilfs_btree_node_get_key(const struct nilfs_btree *btree, + const struct nilfs_btree_node *node, int index) +{ + return nilfs_bmap_dkey_to_key(*(nilfs_btree_node_dkeys(btree, node) + + index)); +} + +static inline void +nilfs_btree_node_set_key(struct nilfs_btree *btree, + struct nilfs_btree_node *node, int index, __u64 key) +{ + *(nilfs_btree_node_dkeys(btree, node) + index) = + nilfs_bmap_key_to_dkey(key); +} + +static inline __u64 +nilfs_btree_node_get_ptr(const struct nilfs_btree *btree, + const struct nilfs_btree_node *node, + int index) +{ + return nilfs_bmap_dptr_to_ptr(*(nilfs_btree_node_dptrs(btree, node) + + index)); +} + +static inline void +nilfs_btree_node_set_ptr(struct nilfs_btree *btree, + struct nilfs_btree_node *node, + int index, + __u64 ptr) +{ + *(nilfs_btree_node_dptrs(btree, node) + index) = + nilfs_bmap_ptr_to_dptr(ptr); +} + +static void nilfs_btree_node_init(struct nilfs_btree *btree, + struct nilfs_btree_node *node, + int flags, int level, int nchildren, + const __u64 *keys, const __u64 *ptrs) +{ + __le64 *dkeys; + __le64 *dptrs; + int i; + + nilfs_btree_node_set_flags(btree, node, flags); + nilfs_btree_node_set_level(btree, node, level); + nilfs_btree_node_set_nchildren(btree, node, nchildren); + + dkeys = nilfs_btree_node_dkeys(btree, node); + dptrs = nilfs_btree_node_dptrs(btree, node); + for (i = 0; i < nchildren; i++) { + dkeys[i] = nilfs_bmap_key_to_dkey(keys[i]); + dptrs[i] = nilfs_bmap_ptr_to_dptr(ptrs[i]); + } +} + +/* Assume the buffer heads corresponding to left and right are locked. */ +static void nilfs_btree_node_move_left(struct nilfs_btree *btree, + struct nilfs_btree_node *left, + struct nilfs_btree_node *right, + int n) +{ + __le64 *ldkeys, *rdkeys; + __le64 *ldptrs, *rdptrs; + int lnchildren, rnchildren; + + ldkeys = nilfs_btree_node_dkeys(btree, left); + ldptrs = nilfs_btree_node_dptrs(btree, left); + lnchildren = nilfs_btree_node_get_nchildren(btree, left); + + rdkeys = nilfs_btree_node_dkeys(btree, right); + rdptrs = nilfs_btree_node_dptrs(btree, right); + rnchildren = nilfs_btree_node_get_nchildren(btree, right); + + memcpy(ldkeys + lnchildren, rdkeys, n * sizeof(*rdkeys)); + memcpy(ldptrs + lnchildren, rdptrs, n * sizeof(*rdptrs)); + memmove(rdkeys, rdkeys + n, (rnchildren - n) * sizeof(*rdkeys)); + memmove(rdptrs, rdptrs + n, (rnchildren - n) * sizeof(*rdptrs)); + + lnchildren += n; + rnchildren -= n; + nilfs_btree_node_set_nchildren(btree, left, lnchildren); + nilfs_btree_node_set_nchildren(btree, right, rnchildren); +} + +/* Assume that the buffer heads corresponding to left and right are locked. */ +static void nilfs_btree_node_move_right(struct nilfs_btree *btree, + struct nilfs_btree_node *left, + struct nilfs_btree_node *right, + int n) +{ + __le64 *ldkeys, *rdkeys; + __le64 *ldptrs, *rdptrs; + int lnchildren, rnchildren; + + ldkeys = nilfs_btree_node_dkeys(btree, left); + ldptrs = nilfs_btree_node_dptrs(btree, left); + lnchildren = nilfs_btree_node_get_nchildren(btree, left); + + rdkeys = nilfs_btree_node_dkeys(btree, right); + rdptrs = nilfs_btree_node_dptrs(btree, right); + rnchildren = nilfs_btree_node_get_nchildren(btree, right); + + memmove(rdkeys + n, rdkeys, rnchildren * sizeof(*rdkeys)); + memmove(rdptrs + n, rdptrs, rnchildren * sizeof(*rdptrs)); + memcpy(rdkeys, ldkeys + lnchildren - n, n * sizeof(*rdkeys)); + memcpy(rdptrs, ldptrs + lnchildren - n, n * sizeof(*rdptrs)); + + lnchildren -= n; + rnchildren += n; + nilfs_btree_node_set_nchildren(btree, left, lnchildren); + nilfs_btree_node_set_nchildren(btree, right, rnchildren); +} + +/* Assume that the buffer head corresponding to node is locked. */ +static void nilfs_btree_node_insert(struct nilfs_btree *btree, + struct nilfs_btree_node *node, + __u64 key, __u64 ptr, int index) +{ + __le64 *dkeys; + __le64 *dptrs; + int nchildren; + + dkeys = nilfs_btree_node_dkeys(btree, node); + dptrs = nilfs_btree_node_dptrs(btree, node); + nchildren = nilfs_btree_node_get_nchildren(btree, node); + if (index < nchildren) { + memmove(dkeys + index + 1, dkeys + index, + (nchildren - index) * sizeof(*dkeys)); + memmove(dptrs + index + 1, dptrs + index, + (nchildren - index) * sizeof(*dptrs)); + } + dkeys[index] = nilfs_bmap_key_to_dkey(key); + dptrs[index] = nilfs_bmap_ptr_to_dptr(ptr); + nchildren++; + nilfs_btree_node_set_nchildren(btree, node, nchildren); +} + +/* Assume that the buffer head corresponding to node is locked. */ +static void nilfs_btree_node_delete(struct nilfs_btree *btree, + struct nilfs_btree_node *node, + __u64 *keyp, __u64 *ptrp, int index) +{ + __u64 key; + __u64 ptr; + __le64 *dkeys; + __le64 *dptrs; + int nchildren; + + dkeys = nilfs_btree_node_dkeys(btree, node); + dptrs = nilfs_btree_node_dptrs(btree, node); + key = nilfs_bmap_dkey_to_key(dkeys[index]); + ptr = nilfs_bmap_dptr_to_ptr(dptrs[index]); + nchildren = nilfs_btree_node_get_nchildren(btree, node); + if (keyp != NULL) + *keyp = key; + if (ptrp != NULL) + *ptrp = ptr; + + if (index < nchildren - 1) { + memmove(dkeys + index, dkeys + index + 1, + (nchildren - index - 1) * sizeof(*dkeys)); + memmove(dptrs + index, dptrs + index + 1, + (nchildren - index - 1) * sizeof(*dptrs)); + } + nchildren--; + nilfs_btree_node_set_nchildren(btree, node, nchildren); +} + +static int nilfs_btree_node_lookup(const struct nilfs_btree *btree, + const struct nilfs_btree_node *node, + __u64 key, int *indexp) +{ + __u64 nkey; + int index, low, high, s; + + /* binary search */ + low = 0; + high = nilfs_btree_node_get_nchildren(btree, node) - 1; + index = 0; + s = 0; + while (low <= high) { + index = (low + high) / 2; + nkey = nilfs_btree_node_get_key(btree, node, index); + if (nkey == key) { + s = 0; + goto out; + } else if (nkey < key) { + low = index + 1; + s = -1; + } else { + high = index - 1; + s = 1; + } + } + + /* adjust index */ + if (nilfs_btree_node_get_level(btree, node) > + NILFS_BTREE_LEVEL_NODE_MIN) { + if ((s > 0) && (index > 0)) + index--; + } else if (s < 0) + index++; + + out: + *indexp = index; + + return s == 0; +} + +static inline struct nilfs_btree_node * +nilfs_btree_get_root(const struct nilfs_btree *btree) +{ + return (struct nilfs_btree_node *)btree->bt_bmap.b_u.u_data; +} + +static inline struct nilfs_btree_node * +nilfs_btree_get_nonroot_node(const struct nilfs_btree *btree, + const struct nilfs_btree_path *path, + int level) +{ + return (struct nilfs_btree_node *)path[level].bp_bh->b_data; +} + +static inline struct nilfs_btree_node * +nilfs_btree_get_sib_node(const struct nilfs_btree *btree, + const struct nilfs_btree_path *path, + int level) +{ + return (struct nilfs_btree_node *)path[level].bp_sib_bh->b_data; +} + +static inline int nilfs_btree_height(const struct nilfs_btree *btree) +{ + return nilfs_btree_node_get_level(btree, nilfs_btree_get_root(btree)) + + 1; +} + +static inline struct nilfs_btree_node * +nilfs_btree_get_node(const struct nilfs_btree *btree, + const struct nilfs_btree_path *path, + int level) +{ + return (level == nilfs_btree_height(btree) - 1) ? + nilfs_btree_get_root(btree) : + nilfs_btree_get_nonroot_node(btree, path, level); +} + +static int nilfs_btree_do_lookup(const struct nilfs_btree *btree, + struct nilfs_btree_path *path, + __u64 key, __u64 *ptrp, int minlevel) +{ + struct nilfs_btree_node *node; + __u64 ptr; + int level, index, found, ret; + + node = nilfs_btree_get_root(btree); + level = nilfs_btree_node_get_level(btree, node); + if ((level < minlevel) || + (nilfs_btree_node_get_nchildren(btree, node) <= 0)) + return -ENOENT; + + found = nilfs_btree_node_lookup(btree, node, key, &index); + ptr = nilfs_btree_node_get_ptr(btree, node, index); + path[level].bp_bh = NULL; + path[level].bp_index = index; + + for (level--; level >= minlevel; level--) { + ret = nilfs_bmap_get_block(&btree->bt_bmap, ptr, + &path[level].bp_bh); + if (ret < 0) + return ret; + node = nilfs_btree_get_nonroot_node(btree, path, level); + BUG_ON(level != nilfs_btree_node_get_level(btree, node)); + if (!found) + found = nilfs_btree_node_lookup(btree, node, key, + &index); + else + index = 0; + if (index < nilfs_btree_node_nchildren_max(btree, node)) + ptr = nilfs_btree_node_get_ptr(btree, node, index); + else { + WARN_ON(found || level != NILFS_BTREE_LEVEL_NODE_MIN); + /* insert */ + ptr = NILFS_BMAP_INVALID_PTR; + } + path[level].bp_index = index; + } + if (!found) + return -ENOENT; + + if (ptrp != NULL) + *ptrp = ptr; + + return 0; +} + +static int nilfs_btree_do_lookup_last(const struct nilfs_btree *btree, + struct nilfs_btree_path *path, + __u64 *keyp, __u64 *ptrp) +{ + struct nilfs_btree_node *node; + __u64 ptr; + int index, level, ret; + + node = nilfs_btree_get_root(btree); + index = nilfs_btree_node_get_nchildren(btree, node) - 1; + if (index < 0) + return -ENOENT; + level = nilfs_btree_node_get_level(btree, node); + ptr = nilfs_btree_node_get_ptr(btree, node, index); + path[level].bp_bh = NULL; + path[level].bp_index = index; + + for (level--; level > 0; level--) { + ret = nilfs_bmap_get_block(&btree->bt_bmap, ptr, + &path[level].bp_bh); + if (ret < 0) + return ret; + node = nilfs_btree_get_nonroot_node(btree, path, level); + BUG_ON(level != nilfs_btree_node_get_level(btree, node)); + index = nilfs_btree_node_get_nchildren(btree, node) - 1; + ptr = nilfs_btree_node_get_ptr(btree, node, index); + path[level].bp_index = index; + } + + if (keyp != NULL) + *keyp = nilfs_btree_node_get_key(btree, node, index); + if (ptrp != NULL) + *ptrp = ptr; + + return 0; +} + +static int nilfs_btree_lookup(const struct nilfs_bmap *bmap, + __u64 key, int level, __u64 *ptrp) +{ + struct nilfs_btree *btree; + struct nilfs_btree_path *path; + __u64 ptr; + int ret; + + btree = (struct nilfs_btree *)bmap; + path = nilfs_btree_alloc_path(btree); + if (path == NULL) + return -ENOMEM; + nilfs_btree_init_path(btree, path); + + ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level); + + if (ptrp != NULL) + *ptrp = ptr; + + nilfs_btree_clear_path(btree, path); + nilfs_btree_free_path(btree, path); + + return ret; +} + +static void nilfs_btree_promote_key(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int level, __u64 key) +{ + if (level < nilfs_btree_height(btree) - 1) { + do { + lock_buffer(path[level].bp_bh); + nilfs_btree_node_set_key( + btree, + nilfs_btree_get_nonroot_node( + btree, path, level), + path[level].bp_index, key); + if (!buffer_dirty(path[level].bp_bh)) + nilfs_btnode_mark_dirty(path[level].bp_bh); + unlock_buffer(path[level].bp_bh); + } while ((path[level].bp_index == 0) && + (++level < nilfs_btree_height(btree) - 1)); + } + + /* root */ + if (level == nilfs_btree_height(btree) - 1) { + nilfs_btree_node_set_key(btree, + nilfs_btree_get_root(btree), + path[level].bp_index, key); + } +} + +static void nilfs_btree_do_insert(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int level, __u64 *keyp, __u64 *ptrp) +{ + struct nilfs_btree_node *node; + + if (level < nilfs_btree_height(btree) - 1) { + lock_buffer(path[level].bp_bh); + node = nilfs_btree_get_nonroot_node(btree, path, level); + nilfs_btree_node_insert(btree, node, *keyp, *ptrp, + path[level].bp_index); + if (!buffer_dirty(path[level].bp_bh)) + nilfs_btnode_mark_dirty(path[level].bp_bh); + unlock_buffer(path[level].bp_bh); + + if (path[level].bp_index == 0) + nilfs_btree_promote_key(btree, path, level + 1, + nilfs_btree_node_get_key( + btree, node, 0)); + } else { + node = nilfs_btree_get_root(btree); + nilfs_btree_node_insert(btree, node, *keyp, *ptrp, + path[level].bp_index); + } +} + +static void nilfs_btree_carry_left(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int level, __u64 *keyp, __u64 *ptrp) +{ + struct nilfs_btree_node *node, *left; + int nchildren, lnchildren, n, move; + + lock_buffer(path[level].bp_bh); + lock_buffer(path[level].bp_sib_bh); + + node = nilfs_btree_get_nonroot_node(btree, path, level); + left = nilfs_btree_get_sib_node(btree, path, level); + nchildren = nilfs_btree_node_get_nchildren(btree, node); + lnchildren = nilfs_btree_node_get_nchildren(btree, left); + move = 0; + + n = (nchildren + lnchildren + 1) / 2 - lnchildren; + if (n > path[level].bp_index) { + /* move insert point */ + n--; + move = 1; + } + + nilfs_btree_node_move_left(btree, left, node, n); + + if (!buffer_dirty(path[level].bp_bh)) + nilfs_btnode_mark_dirty(path[level].bp_bh); + if (!buffer_dirty(path[level].bp_sib_bh)) + nilfs_btnode_mark_dirty(path[level].bp_sib_bh); + + unlock_buffer(path[level].bp_bh); + unlock_buffer(path[level].bp_sib_bh); + + nilfs_btree_promote_key(btree, path, level + 1, + nilfs_btree_node_get_key(btree, node, 0)); + + if (move) { + nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_bh); + path[level].bp_bh = path[level].bp_sib_bh; + path[level].bp_sib_bh = NULL; + path[level].bp_index += lnchildren; + path[level + 1].bp_index--; + } else { + nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh); + path[level].bp_sib_bh = NULL; + path[level].bp_index -= n; + } + + nilfs_btree_do_insert(btree, path, level, keyp, ptrp); +} + +static void nilfs_btree_carry_right(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int level, __u64 *keyp, __u64 *ptrp) +{ + struct nilfs_btree_node *node, *right; + int nchildren, rnchildren, n, move; + + lock_buffer(path[level].bp_bh); + lock_buffer(path[level].bp_sib_bh); + + node = nilfs_btree_get_nonroot_node(btree, path, level); + right = nilfs_btree_get_sib_node(btree, path, level); + nchildren = nilfs_btree_node_get_nchildren(btree, node); + rnchildren = nilfs_btree_node_get_nchildren(btree, right); + move = 0; + + n = (nchildren + rnchildren + 1) / 2 - rnchildren; + if (n > nchildren - path[level].bp_index) { + /* move insert point */ + n--; + move = 1; + } + + nilfs_btree_node_move_right(btree, node, right, n); + + if (!buffer_dirty(path[level].bp_bh)) + nilfs_btnode_mark_dirty(path[level].bp_bh); + if (!buffer_dirty(path[level].bp_sib_bh)) + nilfs_btnode_mark_dirty(path[level].bp_sib_bh); + + unlock_buffer(path[level].bp_bh); + unlock_buffer(path[level].bp_sib_bh); + + path[level + 1].bp_index++; + nilfs_btree_promote_key(btree, path, level + 1, + nilfs_btree_node_get_key(btree, right, 0)); + path[level + 1].bp_index--; + + if (move) { + nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_bh); + path[level].bp_bh = path[level].bp_sib_bh; + path[level].bp_sib_bh = NULL; + path[level].bp_index -= + nilfs_btree_node_get_nchildren(btree, node); + path[level + 1].bp_index++; + } else { + nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh); + path[level].bp_sib_bh = NULL; + } + + nilfs_btree_do_insert(btree, path, level, keyp, ptrp); +} + +static void nilfs_btree_split(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int level, __u64 *keyp, __u64 *ptrp) +{ + struct nilfs_btree_node *node, *right; + __u64 newkey; + __u64 newptr; + int nchildren, n, move; + + lock_buffer(path[level].bp_bh); + lock_buffer(path[level].bp_sib_bh); + + node = nilfs_btree_get_nonroot_node(btree, path, level); + right = nilfs_btree_get_sib_node(btree, path, level); + nchildren = nilfs_btree_node_get_nchildren(btree, node); + move = 0; + + n = (nchildren + 1) / 2; + if (n > nchildren - path[level].bp_index) { + n--; + move = 1; + } + + nilfs_btree_node_move_right(btree, node, right, n); + + if (!buffer_dirty(path[level].bp_bh)) + nilfs_btnode_mark_dirty(path[level].bp_bh); + if (!buffer_dirty(path[level].bp_sib_bh)) + nilfs_btnode_mark_dirty(path[level].bp_sib_bh); + + unlock_buffer(path[level].bp_bh); + unlock_buffer(path[level].bp_sib_bh); + + newkey = nilfs_btree_node_get_key(btree, right, 0); + newptr = path[level].bp_newreq.bpr_ptr; + + if (move) { + path[level].bp_index -= + nilfs_btree_node_get_nchildren(btree, node); + nilfs_btree_node_insert(btree, right, *keyp, *ptrp, + path[level].bp_index); + + *keyp = nilfs_btree_node_get_key(btree, right, 0); + *ptrp = path[level].bp_newreq.bpr_ptr; + + nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_bh); + path[level].bp_bh = path[level].bp_sib_bh; + path[level].bp_sib_bh = NULL; + } else { + nilfs_btree_do_insert(btree, path, level, keyp, ptrp); + + *keyp = nilfs_btree_node_get_key(btree, right, 0); + *ptrp = path[level].bp_newreq.bpr_ptr; + + nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh); + path[level].bp_sib_bh = NULL; + } + + path[level + 1].bp_index++; +} + +static void nilfs_btree_grow(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int level, __u64 *keyp, __u64 *ptrp) +{ + struct nilfs_btree_node *root, *child; + int n; + + lock_buffer(path[level].bp_sib_bh); + + root = nilfs_btree_get_root(btree); + child = nilfs_btree_get_sib_node(btree, path, level); + + n = nilfs_btree_node_get_nchildren(btree, root); + + nilfs_btree_node_move_right(btree, root, child, n); + nilfs_btree_node_set_level(btree, root, level + 1); + + if (!buffer_dirty(path[level].bp_sib_bh)) + nilfs_btnode_mark_dirty(path[level].bp_sib_bh); + + unlock_buffer(path[level].bp_sib_bh); + + path[level].bp_bh = path[level].bp_sib_bh; + path[level].bp_sib_bh = NULL; + + nilfs_btree_do_insert(btree, path, level, keyp, ptrp); + + *keyp = nilfs_btree_node_get_key(btree, child, 0); + *ptrp = path[level].bp_newreq.bpr_ptr; +} + +static __u64 nilfs_btree_find_near(const struct nilfs_btree *btree, + const struct nilfs_btree_path *path) +{ + struct nilfs_btree_node *node; + int level; + + if (path == NULL) + return NILFS_BMAP_INVALID_PTR; + + /* left sibling */ + level = NILFS_BTREE_LEVEL_NODE_MIN; + if (path[level].bp_index > 0) { + node = nilfs_btree_get_node(btree, path, level); + return nilfs_btree_node_get_ptr(btree, node, + path[level].bp_index - 1); + } + + /* parent */ + level = NILFS_BTREE_LEVEL_NODE_MIN + 1; + if (level <= nilfs_btree_height(btree) - 1) { + node = nilfs_btree_get_node(btree, path, level); + return nilfs_btree_node_get_ptr(btree, node, + path[level].bp_index); + } + + return NILFS_BMAP_INVALID_PTR; +} + +static __u64 nilfs_btree_find_target_v(const struct nilfs_btree *btree, + const struct nilfs_btree_path *path, + __u64 key) +{ + __u64 ptr; + + ptr = nilfs_bmap_find_target_seq(&btree->bt_bmap, key); + if (ptr != NILFS_BMAP_INVALID_PTR) + /* sequential access */ + return ptr; + else { + ptr = nilfs_btree_find_near(btree, path); + if (ptr != NILFS_BMAP_INVALID_PTR) + /* near */ + return ptr; + } + /* block group */ + return nilfs_bmap_find_target_in_group(&btree->bt_bmap); +} + +static void nilfs_btree_set_target_v(struct nilfs_btree *btree, __u64 key, + __u64 ptr) +{ + btree->bt_bmap.b_last_allocated_key = key; + btree->bt_bmap.b_last_allocated_ptr = ptr; +} + +static int nilfs_btree_prepare_insert(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int *levelp, __u64 key, __u64 ptr, + struct nilfs_bmap_stats *stats) +{ + struct buffer_head *bh; + struct nilfs_btree_node *node, *parent, *sib; + __u64 sibptr; + int pindex, level, ret; + + stats->bs_nblocks = 0; + level = NILFS_BTREE_LEVEL_DATA; + + /* allocate a new ptr for data block */ + if (btree->bt_ops->btop_find_target != NULL) + path[level].bp_newreq.bpr_ptr = + btree->bt_ops->btop_find_target(btree, path, key); + + ret = btree->bt_bmap.b_pops->bpop_prepare_alloc_ptr( + &btree->bt_bmap, &path[level].bp_newreq); + if (ret < 0) + goto err_out_data; + + for (level = NILFS_BTREE_LEVEL_NODE_MIN; + level < nilfs_btree_height(btree) - 1; + level++) { + node = nilfs_btree_get_nonroot_node(btree, path, level); + if (nilfs_btree_node_get_nchildren(btree, node) < + nilfs_btree_node_nchildren_max(btree, node)) { + path[level].bp_op = nilfs_btree_do_insert; + stats->bs_nblocks++; + goto out; + } + + parent = nilfs_btree_get_node(btree, path, level + 1); + pindex = path[level + 1].bp_index; + + /* left sibling */ + if (pindex > 0) { + sibptr = nilfs_btree_node_get_ptr(btree, parent, + pindex - 1); + ret = nilfs_bmap_get_block(&btree->bt_bmap, sibptr, + &bh); + if (ret < 0) + goto err_out_child_node; + sib = (struct nilfs_btree_node *)bh->b_data; + if (nilfs_btree_node_get_nchildren(btree, sib) < + nilfs_btree_node_nchildren_max(btree, sib)) { + path[level].bp_sib_bh = bh; + path[level].bp_op = nilfs_btree_carry_left; + stats->bs_nblocks++; + goto out; + } else + nilfs_bmap_put_block(&btree->bt_bmap, bh); + } + + /* right sibling */ + if (pindex < + nilfs_btree_node_get_nchildren(btree, parent) - 1) { + sibptr = nilfs_btree_node_get_ptr(btree, parent, + pindex + 1); + ret = nilfs_bmap_get_block(&btree->bt_bmap, sibptr, + &bh); + if (ret < 0) + goto err_out_child_node; + sib = (struct nilfs_btree_node *)bh->b_data; + if (nilfs_btree_node_get_nchildren(btree, sib) < + nilfs_btree_node_nchildren_max(btree, sib)) { + path[level].bp_sib_bh = bh; + path[level].bp_op = nilfs_btree_carry_right; + stats->bs_nblocks++; + goto out; + } else + nilfs_bmap_put_block(&btree->bt_bmap, bh); + } + + /* split */ + path[level].bp_newreq.bpr_ptr = + path[level - 1].bp_newreq.bpr_ptr + 1; + ret = btree->bt_bmap.b_pops->bpop_prepare_alloc_ptr( + &btree->bt_bmap, &path[level].bp_newreq); + if (ret < 0) + goto err_out_child_node; + ret = nilfs_bmap_get_new_block(&btree->bt_bmap, + path[level].bp_newreq.bpr_ptr, + &bh); + if (ret < 0) + goto err_out_curr_node; + + stats->bs_nblocks++; + + lock_buffer(bh); + nilfs_btree_node_init(btree, + (struct nilfs_btree_node *)bh->b_data, + 0, level, 0, NULL, NULL); + unlock_buffer(bh); + path[level].bp_sib_bh = bh; + path[level].bp_op = nilfs_btree_split; + } + + /* root */ + node = nilfs_btree_get_root(btree); + if (nilfs_btree_node_get_nchildren(btree, node) < + nilfs_btree_node_nchildren_max(btree, node)) { + path[level].bp_op = nilfs_btree_do_insert; + stats->bs_nblocks++; + goto out; + } + + /* grow */ + path[level].bp_newreq.bpr_ptr = path[level - 1].bp_newreq.bpr_ptr + 1; + ret = btree->bt_bmap.b_pops->bpop_prepare_alloc_ptr( + &btree->bt_bmap, &path[level].bp_newreq); + if (ret < 0) + goto err_out_child_node; + ret = nilfs_bmap_get_new_block(&btree->bt_bmap, + path[level].bp_newreq.bpr_ptr, &bh); + if (ret < 0) + goto err_out_curr_node; + + lock_buffer(bh); + nilfs_btree_node_init(btree, (struct nilfs_btree_node *)bh->b_data, + 0, level, 0, NULL, NULL); + unlock_buffer(bh); + path[level].bp_sib_bh = bh; + path[level].bp_op = nilfs_btree_grow; + + level++; + path[level].bp_op = nilfs_btree_do_insert; + + /* a newly-created node block and a data block are added */ + stats->bs_nblocks += 2; + + /* success */ + out: + *levelp = level; + return ret; + + /* error */ + err_out_curr_node: + btree->bt_bmap.b_pops->bpop_abort_alloc_ptr(&btree->bt_bmap, + &path[level].bp_newreq); + err_out_child_node: + for (level--; level > NILFS_BTREE_LEVEL_DATA; level--) { + nilfs_bmap_delete_block(&btree->bt_bmap, path[level].bp_sib_bh); + btree->bt_bmap.b_pops->bpop_abort_alloc_ptr( + &btree->bt_bmap, &path[level].bp_newreq); + + } + + btree->bt_bmap.b_pops->bpop_abort_alloc_ptr(&btree->bt_bmap, + &path[level].bp_newreq); + err_out_data: + *levelp = level; + stats->bs_nblocks = 0; + return ret; +} + +static void nilfs_btree_commit_insert(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int maxlevel, __u64 key, __u64 ptr) +{ + int level; + + set_buffer_nilfs_volatile((struct buffer_head *)((unsigned long)ptr)); + ptr = path[NILFS_BTREE_LEVEL_DATA].bp_newreq.bpr_ptr; + if (btree->bt_ops->btop_set_target != NULL) + btree->bt_ops->btop_set_target(btree, key, ptr); + + for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) { + if (btree->bt_bmap.b_pops->bpop_commit_alloc_ptr != NULL) { + btree->bt_bmap.b_pops->bpop_commit_alloc_ptr( + &btree->bt_bmap, &path[level - 1].bp_newreq); + } + path[level].bp_op(btree, path, level, &key, &ptr); + } + + if (!nilfs_bmap_dirty(&btree->bt_bmap)) + nilfs_bmap_set_dirty(&btree->bt_bmap); +} + +static int nilfs_btree_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr) +{ + struct nilfs_btree *btree; + struct nilfs_btree_path *path; + struct nilfs_bmap_stats stats; + int level, ret; + + btree = (struct nilfs_btree *)bmap; + path = nilfs_btree_alloc_path(btree); + if (path == NULL) + return -ENOMEM; + nilfs_btree_init_path(btree, path); + + ret = nilfs_btree_do_lookup(btree, path, key, NULL, + NILFS_BTREE_LEVEL_NODE_MIN); + if (ret != -ENOENT) { + if (ret == 0) + ret = -EEXIST; + goto out; + } + + ret = nilfs_btree_prepare_insert(btree, path, &level, key, ptr, &stats); + if (ret < 0) + goto out; + nilfs_btree_commit_insert(btree, path, level, key, ptr); + nilfs_bmap_add_blocks(bmap, stats.bs_nblocks); + + out: + nilfs_btree_clear_path(btree, path); + nilfs_btree_free_path(btree, path); + return ret; +} + +static void nilfs_btree_do_delete(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int level, __u64 *keyp, __u64 *ptrp) +{ + struct nilfs_btree_node *node; + + if (level < nilfs_btree_height(btree) - 1) { + lock_buffer(path[level].bp_bh); + node = nilfs_btree_get_nonroot_node(btree, path, level); + nilfs_btree_node_delete(btree, node, keyp, ptrp, + path[level].bp_index); + if (!buffer_dirty(path[level].bp_bh)) + nilfs_btnode_mark_dirty(path[level].bp_bh); + unlock_buffer(path[level].bp_bh); + if (path[level].bp_index == 0) + nilfs_btree_promote_key(btree, path, level + 1, + nilfs_btree_node_get_key(btree, node, 0)); + } else { + node = nilfs_btree_get_root(btree); + nilfs_btree_node_delete(btree, node, keyp, ptrp, + path[level].bp_index); + } +} + +static void nilfs_btree_borrow_left(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int level, __u64 *keyp, __u64 *ptrp) +{ + struct nilfs_btree_node *node, *left; + int nchildren, lnchildren, n; + + nilfs_btree_do_delete(btree, path, level, keyp, ptrp); + + lock_buffer(path[level].bp_bh); + lock_buffer(path[level].bp_sib_bh); + + node = nilfs_btree_get_nonroot_node(btree, path, level); + left = nilfs_btree_get_sib_node(btree, path, level); + nchildren = nilfs_btree_node_get_nchildren(btree, node); + lnchildren = nilfs_btree_node_get_nchildren(btree, left); + + n = (nchildren + lnchildren) / 2 - nchildren; + + nilfs_btree_node_move_right(btree, left, node, n); + + if (!buffer_dirty(path[level].bp_bh)) + nilfs_btnode_mark_dirty(path[level].bp_bh); + if (!buffer_dirty(path[level].bp_sib_bh)) + nilfs_btnode_mark_dirty(path[level].bp_sib_bh); + + unlock_buffer(path[level].bp_bh); + unlock_buffer(path[level].bp_sib_bh); + + nilfs_btree_promote_key(btree, path, level + 1, + nilfs_btree_node_get_key(btree, node, 0)); + + nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh); + path[level].bp_sib_bh = NULL; + path[level].bp_index += n; +} + +static void nilfs_btree_borrow_right(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int level, __u64 *keyp, __u64 *ptrp) +{ + struct nilfs_btree_node *node, *right; + int nchildren, rnchildren, n; + + nilfs_btree_do_delete(btree, path, level, keyp, ptrp); + + lock_buffer(path[level].bp_bh); + lock_buffer(path[level].bp_sib_bh); + + node = nilfs_btree_get_nonroot_node(btree, path, level); + right = nilfs_btree_get_sib_node(btree, path, level); + nchildren = nilfs_btree_node_get_nchildren(btree, node); + rnchildren = nilfs_btree_node_get_nchildren(btree, right); + + n = (nchildren + rnchildren) / 2 - nchildren; + + nilfs_btree_node_move_left(btree, node, right, n); + + if (!buffer_dirty(path[level].bp_bh)) + nilfs_btnode_mark_dirty(path[level].bp_bh); + if (!buffer_dirty(path[level].bp_sib_bh)) + nilfs_btnode_mark_dirty(path[level].bp_sib_bh); + + unlock_buffer(path[level].bp_bh); + unlock_buffer(path[level].bp_sib_bh); + + path[level + 1].bp_index++; + nilfs_btree_promote_key(btree, path, level + 1, + nilfs_btree_node_get_key(btree, right, 0)); + path[level + 1].bp_index--; + + nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh); + path[level].bp_sib_bh = NULL; +} + +static void nilfs_btree_concat_left(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int level, __u64 *keyp, __u64 *ptrp) +{ + struct nilfs_btree_node *node, *left; + int n; + + nilfs_btree_do_delete(btree, path, level, keyp, ptrp); + + lock_buffer(path[level].bp_bh); + lock_buffer(path[level].bp_sib_bh); + + node = nilfs_btree_get_nonroot_node(btree, path, level); + left = nilfs_btree_get_sib_node(btree, path, level); + + n = nilfs_btree_node_get_nchildren(btree, node); + + nilfs_btree_node_move_left(btree, left, node, n); + + if (!buffer_dirty(path[level].bp_sib_bh)) + nilfs_btnode_mark_dirty(path[level].bp_sib_bh); + + unlock_buffer(path[level].bp_bh); + unlock_buffer(path[level].bp_sib_bh); + + nilfs_bmap_delete_block(&btree->bt_bmap, path[level].bp_bh); + path[level].bp_bh = path[level].bp_sib_bh; + path[level].bp_sib_bh = NULL; + path[level].bp_index += nilfs_btree_node_get_nchildren(btree, left); +} + +static void nilfs_btree_concat_right(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int level, __u64 *keyp, __u64 *ptrp) +{ + struct nilfs_btree_node *node, *right; + int n; + + nilfs_btree_do_delete(btree, path, level, keyp, ptrp); + + lock_buffer(path[level].bp_bh); + lock_buffer(path[level].bp_sib_bh); + + node = nilfs_btree_get_nonroot_node(btree, path, level); + right = nilfs_btree_get_sib_node(btree, path, level); + + n = nilfs_btree_node_get_nchildren(btree, right); + + nilfs_btree_node_move_left(btree, node, right, n); + + if (!buffer_dirty(path[level].bp_bh)) + nilfs_btnode_mark_dirty(path[level].bp_bh); + + unlock_buffer(path[level].bp_bh); + unlock_buffer(path[level].bp_sib_bh); + + nilfs_bmap_delete_block(&btree->bt_bmap, path[level].bp_sib_bh); + path[level].bp_sib_bh = NULL; + path[level + 1].bp_index++; +} + +static void nilfs_btree_shrink(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int level, __u64 *keyp, __u64 *ptrp) +{ + struct nilfs_btree_node *root, *child; + int n; + + nilfs_btree_do_delete(btree, path, level, keyp, ptrp); + + lock_buffer(path[level].bp_bh); + root = nilfs_btree_get_root(btree); + child = nilfs_btree_get_nonroot_node(btree, path, level); + + nilfs_btree_node_delete(btree, root, NULL, NULL, 0); + nilfs_btree_node_set_level(btree, root, level); + n = nilfs_btree_node_get_nchildren(btree, child); + nilfs_btree_node_move_left(btree, root, child, n); + unlock_buffer(path[level].bp_bh); + + nilfs_bmap_delete_block(&btree->bt_bmap, path[level].bp_bh); + path[level].bp_bh = NULL; +} + + +static int nilfs_btree_prepare_delete(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int *levelp, + struct nilfs_bmap_stats *stats) +{ + struct buffer_head *bh; + struct nilfs_btree_node *node, *parent, *sib; + __u64 sibptr; + int pindex, level, ret; + + ret = 0; + stats->bs_nblocks = 0; + for (level = NILFS_BTREE_LEVEL_NODE_MIN; + level < nilfs_btree_height(btree) - 1; + level++) { + node = nilfs_btree_get_nonroot_node(btree, path, level); + path[level].bp_oldreq.bpr_ptr = + nilfs_btree_node_get_ptr(btree, node, + path[level].bp_index); + if (btree->bt_bmap.b_pops->bpop_prepare_end_ptr != NULL) { + ret = btree->bt_bmap.b_pops->bpop_prepare_end_ptr( + &btree->bt_bmap, &path[level].bp_oldreq); + if (ret < 0) + goto err_out_child_node; + } + + if (nilfs_btree_node_get_nchildren(btree, node) > + nilfs_btree_node_nchildren_min(btree, node)) { + path[level].bp_op = nilfs_btree_do_delete; + stats->bs_nblocks++; + goto out; + } + + parent = nilfs_btree_get_node(btree, path, level + 1); + pindex = path[level + 1].bp_index; + + if (pindex > 0) { + /* left sibling */ + sibptr = nilfs_btree_node_get_ptr(btree, parent, + pindex - 1); + ret = nilfs_bmap_get_block(&btree->bt_bmap, sibptr, + &bh); + if (ret < 0) + goto err_out_curr_node; + sib = (struct nilfs_btree_node *)bh->b_data; + if (nilfs_btree_node_get_nchildren(btree, sib) > + nilfs_btree_node_nchildren_min(btree, sib)) { + path[level].bp_sib_bh = bh; + path[level].bp_op = nilfs_btree_borrow_left; + stats->bs_nblocks++; + goto out; + } else { + path[level].bp_sib_bh = bh; + path[level].bp_op = nilfs_btree_concat_left; + stats->bs_nblocks++; + /* continue; */ + } + } else if (pindex < + nilfs_btree_node_get_nchildren(btree, parent) - 1) { + /* right sibling */ + sibptr = nilfs_btree_node_get_ptr(btree, parent, + pindex + 1); + ret = nilfs_bmap_get_block(&btree->bt_bmap, sibptr, + &bh); + if (ret < 0) + goto err_out_curr_node; + sib = (struct nilfs_btree_node *)bh->b_data; + if (nilfs_btree_node_get_nchildren(btree, sib) > + nilfs_btree_node_nchildren_min(btree, sib)) { + path[level].bp_sib_bh = bh; + path[level].bp_op = nilfs_btree_borrow_right; + stats->bs_nblocks++; + goto out; + } else { + path[level].bp_sib_bh = bh; + path[level].bp_op = nilfs_btree_concat_right; + stats->bs_nblocks++; + /* continue; */ + } + } else { + /* no siblings */ + /* the only child of the root node */ + WARN_ON(level != nilfs_btree_height(btree) - 2); + if (nilfs_btree_node_get_nchildren(btree, node) - 1 <= + NILFS_BTREE_ROOT_NCHILDREN_MAX) { + path[level].bp_op = nilfs_btree_shrink; + stats->bs_nblocks += 2; + } else { + path[level].bp_op = nilfs_btree_do_delete; + stats->bs_nblocks++; + } + + goto out; + + } + } + + node = nilfs_btree_get_root(btree); + path[level].bp_oldreq.bpr_ptr = + nilfs_btree_node_get_ptr(btree, node, path[level].bp_index); + if (btree->bt_bmap.b_pops->bpop_prepare_end_ptr != NULL) { + ret = btree->bt_bmap.b_pops->bpop_prepare_end_ptr( + &btree->bt_bmap, &path[level].bp_oldreq); + if (ret < 0) + goto err_out_child_node; + } + /* child of the root node is deleted */ + path[level].bp_op = nilfs_btree_do_delete; + stats->bs_nblocks++; + + /* success */ + out: + *levelp = level; + return ret; + + /* error */ + err_out_curr_node: + if (btree->bt_bmap.b_pops->bpop_abort_end_ptr != NULL) + btree->bt_bmap.b_pops->bpop_abort_end_ptr( + &btree->bt_bmap, &path[level].bp_oldreq); + err_out_child_node: + for (level--; level >= NILFS_BTREE_LEVEL_NODE_MIN; level--) { + nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh); + if (btree->bt_bmap.b_pops->bpop_abort_end_ptr != NULL) + btree->bt_bmap.b_pops->bpop_abort_end_ptr( + &btree->bt_bmap, &path[level].bp_oldreq); + } + *levelp = level; + stats->bs_nblocks = 0; + return ret; +} + +static void nilfs_btree_commit_delete(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int maxlevel) +{ + int level; + + for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) { + if (btree->bt_bmap.b_pops->bpop_commit_end_ptr != NULL) + btree->bt_bmap.b_pops->bpop_commit_end_ptr( + &btree->bt_bmap, &path[level].bp_oldreq); + path[level].bp_op(btree, path, level, NULL, NULL); + } + + if (!nilfs_bmap_dirty(&btree->bt_bmap)) + nilfs_bmap_set_dirty(&btree->bt_bmap); +} + +static int nilfs_btree_delete(struct nilfs_bmap *bmap, __u64 key) + +{ + struct nilfs_btree *btree; + struct nilfs_btree_path *path; + struct nilfs_bmap_stats stats; + int level, ret; + + btree = (struct nilfs_btree *)bmap; + path = nilfs_btree_alloc_path(btree); + if (path == NULL) + return -ENOMEM; + nilfs_btree_init_path(btree, path); + ret = nilfs_btree_do_lookup(btree, path, key, NULL, + NILFS_BTREE_LEVEL_NODE_MIN); + if (ret < 0) + goto out; + + ret = nilfs_btree_prepare_delete(btree, path, &level, &stats); + if (ret < 0) + goto out; + nilfs_btree_commit_delete(btree, path, level); + nilfs_bmap_sub_blocks(bmap, stats.bs_nblocks); + +out: + nilfs_btree_clear_path(btree, path); + nilfs_btree_free_path(btree, path); + return ret; +} + +static int nilfs_btree_last_key(const struct nilfs_bmap *bmap, __u64 *keyp) +{ + struct nilfs_btree *btree; + struct nilfs_btree_path *path; + int ret; + + btree = (struct nilfs_btree *)bmap; + path = nilfs_btree_alloc_path(btree); + if (path == NULL) + return -ENOMEM; + nilfs_btree_init_path(btree, path); + + ret = nilfs_btree_do_lookup_last(btree, path, keyp, NULL); + + nilfs_btree_clear_path(btree, path); + nilfs_btree_free_path(btree, path); + + return ret; +} + +static int nilfs_btree_check_delete(struct nilfs_bmap *bmap, __u64 key) +{ + struct buffer_head *bh; + struct nilfs_btree *btree; + struct nilfs_btree_node *root, *node; + __u64 maxkey, nextmaxkey; + __u64 ptr; + int nchildren, ret; + + btree = (struct nilfs_btree *)bmap; + root = nilfs_btree_get_root(btree); + switch (nilfs_btree_height(btree)) { + case 2: + bh = NULL; + node = root; + break; + case 3: + nchildren = nilfs_btree_node_get_nchildren(btree, root); + if (nchildren > 1) + return 0; + ptr = nilfs_btree_node_get_ptr(btree, root, nchildren - 1); + ret = nilfs_bmap_get_block(bmap, ptr, &bh); + if (ret < 0) + return ret; + node = (struct nilfs_btree_node *)bh->b_data; + break; + default: + return 0; + } + + nchildren = nilfs_btree_node_get_nchildren(btree, node); + maxkey = nilfs_btree_node_get_key(btree, node, nchildren - 1); + nextmaxkey = (nchildren > 1) ? + nilfs_btree_node_get_key(btree, node, nchildren - 2) : 0; + if (bh != NULL) + nilfs_bmap_put_block(bmap, bh); + + return (maxkey == key) && (nextmaxkey < bmap->b_low); +} + +static int nilfs_btree_gather_data(struct nilfs_bmap *bmap, + __u64 *keys, __u64 *ptrs, int nitems) +{ + struct buffer_head *bh; + struct nilfs_btree *btree; + struct nilfs_btree_node *node, *root; + __le64 *dkeys; + __le64 *dptrs; + __u64 ptr; + int nchildren, i, ret; + + btree = (struct nilfs_btree *)bmap; + root = nilfs_btree_get_root(btree); + switch (nilfs_btree_height(btree)) { + case 2: + bh = NULL; + node = root; + break; + case 3: + nchildren = nilfs_btree_node_get_nchildren(btree, root); + WARN_ON(nchildren > 1); + ptr = nilfs_btree_node_get_ptr(btree, root, nchildren - 1); + ret = nilfs_bmap_get_block(bmap, ptr, &bh); + if (ret < 0) + return ret; + node = (struct nilfs_btree_node *)bh->b_data; + break; + default: + node = NULL; + return -EINVAL; + } + + nchildren = nilfs_btree_node_get_nchildren(btree, node); + if (nchildren < nitems) + nitems = nchildren; + dkeys = nilfs_btree_node_dkeys(btree, node); + dptrs = nilfs_btree_node_dptrs(btree, node); + for (i = 0; i < nitems; i++) { + keys[i] = nilfs_bmap_dkey_to_key(dkeys[i]); + ptrs[i] = nilfs_bmap_dptr_to_ptr(dptrs[i]); + } + + if (bh != NULL) + nilfs_bmap_put_block(bmap, bh); + + return nitems; +} + +static int +nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key, + union nilfs_bmap_ptr_req *dreq, + union nilfs_bmap_ptr_req *nreq, + struct buffer_head **bhp, + struct nilfs_bmap_stats *stats) +{ + struct buffer_head *bh; + struct nilfs_btree *btree; + int ret; + + btree = (struct nilfs_btree *)bmap; + stats->bs_nblocks = 0; + + /* for data */ + /* cannot find near ptr */ + if (btree->bt_ops->btop_find_target != NULL) + dreq->bpr_ptr + = btree->bt_ops->btop_find_target(btree, NULL, key); + ret = bmap->b_pops->bpop_prepare_alloc_ptr(bmap, dreq); + if (ret < 0) + return ret; + + *bhp = NULL; + stats->bs_nblocks++; + if (nreq != NULL) { + nreq->bpr_ptr = dreq->bpr_ptr + 1; + ret = bmap->b_pops->bpop_prepare_alloc_ptr(bmap, nreq); + if (ret < 0) + goto err_out_dreq; + + ret = nilfs_bmap_get_new_block(bmap, nreq->bpr_ptr, &bh); + if (ret < 0) + goto err_out_nreq; + + *bhp = bh; + stats->bs_nblocks++; + } + + /* success */ + return 0; + + /* error */ + err_out_nreq: + bmap->b_pops->bpop_abort_alloc_ptr(bmap, nreq); + err_out_dreq: + bmap->b_pops->bpop_abort_alloc_ptr(bmap, dreq); + stats->bs_nblocks = 0; + return ret; + +} + +static void +nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap, + __u64 key, __u64 ptr, + const __u64 *keys, const __u64 *ptrs, + int n, __u64 low, __u64 high, + union nilfs_bmap_ptr_req *dreq, + union nilfs_bmap_ptr_req *nreq, + struct buffer_head *bh) +{ + struct nilfs_btree *btree; + struct nilfs_btree_node *node; + __u64 tmpptr; + + /* free resources */ + if (bmap->b_ops->bop_clear != NULL) + bmap->b_ops->bop_clear(bmap); + + /* ptr must be a pointer to a buffer head. */ + set_buffer_nilfs_volatile((struct buffer_head *)((unsigned long)ptr)); + + /* convert and insert */ + btree = (struct nilfs_btree *)bmap; + nilfs_btree_init(bmap, low, high); + if (nreq != NULL) { + if (bmap->b_pops->bpop_commit_alloc_ptr != NULL) { + bmap->b_pops->bpop_commit_alloc_ptr(bmap, dreq); + bmap->b_pops->bpop_commit_alloc_ptr(bmap, nreq); + } + + /* create child node at level 1 */ + lock_buffer(bh); + node = (struct nilfs_btree_node *)bh->b_data; + nilfs_btree_node_init(btree, node, 0, 1, n, keys, ptrs); + nilfs_btree_node_insert(btree, node, + key, dreq->bpr_ptr, n); + if (!buffer_dirty(bh)) + nilfs_btnode_mark_dirty(bh); + if (!nilfs_bmap_dirty(bmap)) + nilfs_bmap_set_dirty(bmap); + + unlock_buffer(bh); + nilfs_bmap_put_block(bmap, bh); + + /* create root node at level 2 */ + node = nilfs_btree_get_root(btree); + tmpptr = nreq->bpr_ptr; + nilfs_btree_node_init(btree, node, NILFS_BTREE_NODE_ROOT, + 2, 1, &keys[0], &tmpptr); + } else { + if (bmap->b_pops->bpop_commit_alloc_ptr != NULL) + bmap->b_pops->bpop_commit_alloc_ptr(bmap, dreq); + + /* create root node at level 1 */ + node = nilfs_btree_get_root(btree); + nilfs_btree_node_init(btree, node, NILFS_BTREE_NODE_ROOT, + 1, n, keys, ptrs); + nilfs_btree_node_insert(btree, node, + key, dreq->bpr_ptr, n); + if (!nilfs_bmap_dirty(bmap)) + nilfs_bmap_set_dirty(bmap); + } + + if (btree->bt_ops->btop_set_target != NULL) + btree->bt_ops->btop_set_target(btree, key, dreq->bpr_ptr); +} + +/** + * nilfs_btree_convert_and_insert - + * @bmap: + * @key: + * @ptr: + * @keys: + * @ptrs: + * @n: + * @low: + * @high: + */ +int nilfs_btree_convert_and_insert(struct nilfs_bmap *bmap, + __u64 key, __u64 ptr, + const __u64 *keys, const __u64 *ptrs, + int n, __u64 low, __u64 high) +{ + struct buffer_head *bh; + union nilfs_bmap_ptr_req dreq, nreq, *di, *ni; + struct nilfs_bmap_stats stats; + int ret; + + if (n + 1 <= NILFS_BTREE_ROOT_NCHILDREN_MAX) { + di = &dreq; + ni = NULL; + } else if ((n + 1) <= NILFS_BTREE_NODE_NCHILDREN_MAX( + 1 << bmap->b_inode->i_blkbits)) { + di = &dreq; + ni = &nreq; + } else { + di = NULL; + ni = NULL; + BUG(); + } + + ret = nilfs_btree_prepare_convert_and_insert(bmap, key, di, ni, &bh, + &stats); + if (ret < 0) + return ret; + nilfs_btree_commit_convert_and_insert(bmap, key, ptr, keys, ptrs, n, + low, high, di, ni, bh); + nilfs_bmap_add_blocks(bmap, stats.bs_nblocks); + return 0; +} + +static int nilfs_btree_propagate_p(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int level, + struct buffer_head *bh) +{ + while ((++level < nilfs_btree_height(btree) - 1) && + !buffer_dirty(path[level].bp_bh)) + nilfs_btnode_mark_dirty(path[level].bp_bh); + + return 0; +} + +static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int level) +{ + struct nilfs_btree_node *parent; + int ret; + + parent = nilfs_btree_get_node(btree, path, level + 1); + path[level].bp_oldreq.bpr_ptr = + nilfs_btree_node_get_ptr(btree, parent, + path[level + 1].bp_index); + path[level].bp_newreq.bpr_ptr = path[level].bp_oldreq.bpr_ptr + 1; + ret = nilfs_bmap_prepare_update(&btree->bt_bmap, + &path[level].bp_oldreq, + &path[level].bp_newreq); + if (ret < 0) + return ret; + + if (buffer_nilfs_node(path[level].bp_bh)) { + path[level].bp_ctxt.oldkey = path[level].bp_oldreq.bpr_ptr; + path[level].bp_ctxt.newkey = path[level].bp_newreq.bpr_ptr; + path[level].bp_ctxt.bh = path[level].bp_bh; + ret = nilfs_btnode_prepare_change_key( + &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache, + &path[level].bp_ctxt); + if (ret < 0) { + nilfs_bmap_abort_update(&btree->bt_bmap, + &path[level].bp_oldreq, + &path[level].bp_newreq); + return ret; + } + } + + return 0; +} + +static void nilfs_btree_commit_update_v(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int level) +{ + struct nilfs_btree_node *parent; + + nilfs_bmap_commit_update(&btree->bt_bmap, + &path[level].bp_oldreq, + &path[level].bp_newreq); + + if (buffer_nilfs_node(path[level].bp_bh)) { + nilfs_btnode_commit_change_key( + &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache, + &path[level].bp_ctxt); + path[level].bp_bh = path[level].bp_ctxt.bh; + } + set_buffer_nilfs_volatile(path[level].bp_bh); + + parent = nilfs_btree_get_node(btree, path, level + 1); + nilfs_btree_node_set_ptr(btree, parent, path[level + 1].bp_index, + path[level].bp_newreq.bpr_ptr); +} + +static void nilfs_btree_abort_update_v(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int level) +{ + nilfs_bmap_abort_update(&btree->bt_bmap, + &path[level].bp_oldreq, + &path[level].bp_newreq); + if (buffer_nilfs_node(path[level].bp_bh)) + nilfs_btnode_abort_change_key( + &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache, + &path[level].bp_ctxt); +} + +static int nilfs_btree_prepare_propagate_v(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int minlevel, + int *maxlevelp) +{ + int level, ret; + + level = minlevel; + if (!buffer_nilfs_volatile(path[level].bp_bh)) { + ret = nilfs_btree_prepare_update_v(btree, path, level); + if (ret < 0) + return ret; + } + while ((++level < nilfs_btree_height(btree) - 1) && + !buffer_dirty(path[level].bp_bh)) { + + WARN_ON(buffer_nilfs_volatile(path[level].bp_bh)); + ret = nilfs_btree_prepare_update_v(btree, path, level); + if (ret < 0) + goto out; + } + + /* success */ + *maxlevelp = level - 1; + return 0; + + /* error */ + out: + while (--level > minlevel) + nilfs_btree_abort_update_v(btree, path, level); + if (!buffer_nilfs_volatile(path[level].bp_bh)) + nilfs_btree_abort_update_v(btree, path, level); + return ret; +} + +static void nilfs_btree_commit_propagate_v(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int minlevel, + int maxlevel, + struct buffer_head *bh) +{ + int level; + + if (!buffer_nilfs_volatile(path[minlevel].bp_bh)) + nilfs_btree_commit_update_v(btree, path, minlevel); + + for (level = minlevel + 1; level <= maxlevel; level++) + nilfs_btree_commit_update_v(btree, path, level); +} + +static int nilfs_btree_propagate_v(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int level, + struct buffer_head *bh) +{ + int maxlevel, ret; + struct nilfs_btree_node *parent; + __u64 ptr; + + get_bh(bh); + path[level].bp_bh = bh; + ret = nilfs_btree_prepare_propagate_v(btree, path, level, &maxlevel); + if (ret < 0) + goto out; + + if (buffer_nilfs_volatile(path[level].bp_bh)) { + parent = nilfs_btree_get_node(btree, path, level + 1); + ptr = nilfs_btree_node_get_ptr(btree, parent, + path[level + 1].bp_index); + ret = nilfs_bmap_mark_dirty(&btree->bt_bmap, ptr); + if (ret < 0) + goto out; + } + + nilfs_btree_commit_propagate_v(btree, path, level, maxlevel, bh); + + out: + brelse(path[level].bp_bh); + path[level].bp_bh = NULL; + return ret; +} + +static int nilfs_btree_propagate(const struct nilfs_bmap *bmap, + struct buffer_head *bh) +{ + struct nilfs_btree *btree; + struct nilfs_btree_path *path; + struct nilfs_btree_node *node; + __u64 key; + int level, ret; + + WARN_ON(!buffer_dirty(bh)); + + btree = (struct nilfs_btree *)bmap; + path = nilfs_btree_alloc_path(btree); + if (path == NULL) + return -ENOMEM; + nilfs_btree_init_path(btree, path); + + if (buffer_nilfs_node(bh)) { + node = (struct nilfs_btree_node *)bh->b_data; + key = nilfs_btree_node_get_key(btree, node, 0); + level = nilfs_btree_node_get_level(btree, node); + } else { + key = nilfs_bmap_data_get_key(bmap, bh); + level = NILFS_BTREE_LEVEL_DATA; + } + + ret = nilfs_btree_do_lookup(btree, path, key, NULL, level + 1); + if (ret < 0) { + if (unlikely(ret == -ENOENT)) + printk(KERN_CRIT "%s: key = %llu, level == %d\n", + __func__, (unsigned long long)key, level); + goto out; + } + + ret = btree->bt_ops->btop_propagate(btree, path, level, bh); + + out: + nilfs_btree_clear_path(btree, path); + nilfs_btree_free_path(btree, path); + + return ret; +} + +static int nilfs_btree_propagate_gc(const struct nilfs_bmap *bmap, + struct buffer_head *bh) +{ + return nilfs_bmap_mark_dirty(bmap, bh->b_blocknr); +} + +static void nilfs_btree_add_dirty_buffer(struct nilfs_btree *btree, + struct list_head *lists, + struct buffer_head *bh) +{ + struct list_head *head; + struct buffer_head *cbh; + struct nilfs_btree_node *node, *cnode; + __u64 key, ckey; + int level; + + get_bh(bh); + node = (struct nilfs_btree_node *)bh->b_data; + key = nilfs_btree_node_get_key(btree, node, 0); + level = nilfs_btree_node_get_level(btree, node); + list_for_each(head, &lists[level]) { + cbh = list_entry(head, struct buffer_head, b_assoc_buffers); + cnode = (struct nilfs_btree_node *)cbh->b_data; + ckey = nilfs_btree_node_get_key(btree, cnode, 0); + if (key < ckey) + break; + } + list_add_tail(&bh->b_assoc_buffers, head); +} + +static void nilfs_btree_lookup_dirty_buffers(struct nilfs_bmap *bmap, + struct list_head *listp) +{ + struct nilfs_btree *btree = (struct nilfs_btree *)bmap; + struct address_space *btcache = &NILFS_BMAP_I(bmap)->i_btnode_cache; + struct list_head lists[NILFS_BTREE_LEVEL_MAX]; + struct pagevec pvec; + struct buffer_head *bh, *head; + pgoff_t index = 0; + int level, i; + + for (level = NILFS_BTREE_LEVEL_NODE_MIN; + level < NILFS_BTREE_LEVEL_MAX; + level++) + INIT_LIST_HEAD(&lists[level]); + + pagevec_init(&pvec, 0); + + while (pagevec_lookup_tag(&pvec, btcache, &index, PAGECACHE_TAG_DIRTY, + PAGEVEC_SIZE)) { + for (i = 0; i < pagevec_count(&pvec); i++) { + bh = head = page_buffers(pvec.pages[i]); + do { + if (buffer_dirty(bh)) + nilfs_btree_add_dirty_buffer(btree, + lists, bh); + } while ((bh = bh->b_this_page) != head); + } + pagevec_release(&pvec); + cond_resched(); + } + + for (level = NILFS_BTREE_LEVEL_NODE_MIN; + level < NILFS_BTREE_LEVEL_MAX; + level++) + list_splice(&lists[level], listp->prev); +} + +static int nilfs_btree_assign_p(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int level, + struct buffer_head **bh, + sector_t blocknr, + union nilfs_binfo *binfo) +{ + struct nilfs_btree_node *parent; + __u64 key; + __u64 ptr; + int ret; + + parent = nilfs_btree_get_node(btree, path, level + 1); + ptr = nilfs_btree_node_get_ptr(btree, parent, + path[level + 1].bp_index); + if (buffer_nilfs_node(*bh)) { + path[level].bp_ctxt.oldkey = ptr; + path[level].bp_ctxt.newkey = blocknr; + path[level].bp_ctxt.bh = *bh; + ret = nilfs_btnode_prepare_change_key( + &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache, + &path[level].bp_ctxt); + if (ret < 0) + return ret; + nilfs_btnode_commit_change_key( + &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache, + &path[level].bp_ctxt); + *bh = path[level].bp_ctxt.bh; + } + + nilfs_btree_node_set_ptr(btree, parent, + path[level + 1].bp_index, blocknr); + + key = nilfs_btree_node_get_key(btree, parent, + path[level + 1].bp_index); + /* on-disk format */ + binfo->bi_dat.bi_blkoff = nilfs_bmap_key_to_dkey(key); + binfo->bi_dat.bi_level = level; + + return 0; +} + +static int nilfs_btree_assign_v(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int level, + struct buffer_head **bh, + sector_t blocknr, + union nilfs_binfo *binfo) +{ + struct nilfs_btree_node *parent; + __u64 key; + __u64 ptr; + union nilfs_bmap_ptr_req req; + int ret; + + parent = nilfs_btree_get_node(btree, path, level + 1); + ptr = nilfs_btree_node_get_ptr(btree, parent, + path[level + 1].bp_index); + req.bpr_ptr = ptr; + ret = btree->bt_bmap.b_pops->bpop_prepare_start_ptr(&btree->bt_bmap, + &req); + if (ret < 0) + return ret; + btree->bt_bmap.b_pops->bpop_commit_start_ptr(&btree->bt_bmap, + &req, blocknr); + + key = nilfs_btree_node_get_key(btree, parent, + path[level + 1].bp_index); + /* on-disk format */ + binfo->bi_v.bi_vblocknr = nilfs_bmap_ptr_to_dptr(ptr); + binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key); + + return 0; +} + +static int nilfs_btree_assign(struct nilfs_bmap *bmap, + struct buffer_head **bh, + sector_t blocknr, + union nilfs_binfo *binfo) +{ + struct nilfs_btree *btree; + struct nilfs_btree_path *path; + struct nilfs_btree_node *node; + __u64 key; + int level, ret; + + btree = (struct nilfs_btree *)bmap; + path = nilfs_btree_alloc_path(btree); + if (path == NULL) + return -ENOMEM; + nilfs_btree_init_path(btree, path); + + if (buffer_nilfs_node(*bh)) { + node = (struct nilfs_btree_node *)(*bh)->b_data; + key = nilfs_btree_node_get_key(btree, node, 0); + level = nilfs_btree_node_get_level(btree, node); + } else { + key = nilfs_bmap_data_get_key(bmap, *bh); + level = NILFS_BTREE_LEVEL_DATA; + } + + ret = nilfs_btree_do_lookup(btree, path, key, NULL, level + 1); + if (ret < 0) { + WARN_ON(ret == -ENOENT); + goto out; + } + + ret = btree->bt_ops->btop_assign(btree, path, level, bh, + blocknr, binfo); + + out: + nilfs_btree_clear_path(btree, path); + nilfs_btree_free_path(btree, path); + + return ret; +} + +static int nilfs_btree_assign_gc(struct nilfs_bmap *bmap, + struct buffer_head **bh, + sector_t blocknr, + union nilfs_binfo *binfo) +{ + struct nilfs_btree *btree; + struct nilfs_btree_node *node; + __u64 key; + int ret; + + btree = (struct nilfs_btree *)bmap; + ret = nilfs_bmap_move_v(bmap, (*bh)->b_blocknr, blocknr); + if (ret < 0) + return ret; + + if (buffer_nilfs_node(*bh)) { + node = (struct nilfs_btree_node *)(*bh)->b_data; + key = nilfs_btree_node_get_key(btree, node, 0); + } else + key = nilfs_bmap_data_get_key(bmap, *bh); + + /* on-disk format */ + binfo->bi_v.bi_vblocknr = cpu_to_le64((*bh)->b_blocknr); + binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key); + + return 0; +} + +static int nilfs_btree_mark(struct nilfs_bmap *bmap, __u64 key, int level) +{ + struct buffer_head *bh; + struct nilfs_btree *btree; + struct nilfs_btree_path *path; + __u64 ptr; + int ret; + + btree = (struct nilfs_btree *)bmap; + path = nilfs_btree_alloc_path(btree); + if (path == NULL) + return -ENOMEM; + nilfs_btree_init_path(btree, path); + + ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level + 1); + if (ret < 0) { + WARN_ON(ret == -ENOENT); + goto out; + } + ret = nilfs_bmap_get_block(&btree->bt_bmap, ptr, &bh); + if (ret < 0) { + WARN_ON(ret == -ENOENT); + goto out; + } + + if (!buffer_dirty(bh)) + nilfs_btnode_mark_dirty(bh); + nilfs_bmap_put_block(&btree->bt_bmap, bh); + if (!nilfs_bmap_dirty(&btree->bt_bmap)) + nilfs_bmap_set_dirty(&btree->bt_bmap); + + out: + nilfs_btree_clear_path(btree, path); + nilfs_btree_free_path(btree, path); + return ret; +} + +static const struct nilfs_bmap_operations nilfs_btree_ops = { + .bop_lookup = nilfs_btree_lookup, + .bop_insert = nilfs_btree_insert, + .bop_delete = nilfs_btree_delete, + .bop_clear = NULL, + + .bop_propagate = nilfs_btree_propagate, + + .bop_lookup_dirty_buffers = nilfs_btree_lookup_dirty_buffers, + + .bop_assign = nilfs_btree_assign, + .bop_mark = nilfs_btree_mark, + + .bop_last_key = nilfs_btree_last_key, + .bop_check_insert = NULL, + .bop_check_delete = nilfs_btree_check_delete, + .bop_gather_data = nilfs_btree_gather_data, +}; + +static const struct nilfs_bmap_operations nilfs_btree_ops_gc = { + .bop_lookup = NULL, + .bop_insert = NULL, + .bop_delete = NULL, + .bop_clear = NULL, + + .bop_propagate = nilfs_btree_propagate_gc, + + .bop_lookup_dirty_buffers = nilfs_btree_lookup_dirty_buffers, + + .bop_assign = nilfs_btree_assign_gc, + .bop_mark = NULL, + + .bop_last_key = NULL, + .bop_check_insert = NULL, + .bop_check_delete = NULL, + .bop_gather_data = NULL, +}; + +static const struct nilfs_btree_operations nilfs_btree_ops_v = { + .btop_find_target = nilfs_btree_find_target_v, + .btop_set_target = nilfs_btree_set_target_v, + .btop_propagate = nilfs_btree_propagate_v, + .btop_assign = nilfs_btree_assign_v, +}; + +static const struct nilfs_btree_operations nilfs_btree_ops_p = { + .btop_find_target = NULL, + .btop_set_target = NULL, + .btop_propagate = nilfs_btree_propagate_p, + .btop_assign = nilfs_btree_assign_p, +}; + +int nilfs_btree_init(struct nilfs_bmap *bmap, __u64 low, __u64 high) +{ + struct nilfs_btree *btree; + + btree = (struct nilfs_btree *)bmap; + bmap->b_ops = &nilfs_btree_ops; + bmap->b_low = low; + bmap->b_high = high; + switch (bmap->b_inode->i_ino) { + case NILFS_DAT_INO: + btree->bt_ops = &nilfs_btree_ops_p; + break; + default: + btree->bt_ops = &nilfs_btree_ops_v; + break; + } + + return 0; +} + +void nilfs_btree_init_gc(struct nilfs_bmap *bmap) +{ + bmap->b_low = NILFS_BMAP_LARGE_LOW; + bmap->b_high = NILFS_BMAP_LARGE_HIGH; + bmap->b_ops = &nilfs_btree_ops_gc; +} diff --git a/fs/nilfs2/btree.h b/fs/nilfs2/btree.h new file mode 100644 index 000000000000..4766deb52fb1 --- /dev/null +++ b/fs/nilfs2/btree.h @@ -0,0 +1,117 @@ +/* + * btree.h - NILFS B-tree. + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato <koji@osrg.net>. + */ + +#ifndef _NILFS_BTREE_H +#define _NILFS_BTREE_H + +#include <linux/types.h> +#include <linux/buffer_head.h> +#include <linux/list.h> +#include <linux/nilfs2_fs.h> +#include "btnode.h" +#include "bmap.h" + +struct nilfs_btree; +struct nilfs_btree_path; + +/** + * struct nilfs_btree_operations - B-tree operation table + */ +struct nilfs_btree_operations { + __u64 (*btop_find_target)(const struct nilfs_btree *, + const struct nilfs_btree_path *, __u64); + void (*btop_set_target)(struct nilfs_btree *, __u64, __u64); + + struct the_nilfs *(*btop_get_nilfs)(struct nilfs_btree *); + + int (*btop_propagate)(struct nilfs_btree *, + struct nilfs_btree_path *, + int, + struct buffer_head *); + int (*btop_assign)(struct nilfs_btree *, + struct nilfs_btree_path *, + int, + struct buffer_head **, + sector_t, + union nilfs_binfo *); +}; + +/** + * struct nilfs_btree_node - B-tree node + * @bn_flags: flags + * @bn_level: level + * @bn_nchildren: number of children + * @bn_pad: padding + */ +struct nilfs_btree_node { + __u8 bn_flags; + __u8 bn_level; + __le16 bn_nchildren; + __le32 bn_pad; +}; + +/* flags */ +#define NILFS_BTREE_NODE_ROOT 0x01 + +/* level */ +#define NILFS_BTREE_LEVEL_DATA 0 +#define NILFS_BTREE_LEVEL_NODE_MIN (NILFS_BTREE_LEVEL_DATA + 1) +#define NILFS_BTREE_LEVEL_MAX 14 + +/** + * struct nilfs_btree - B-tree structure + * @bt_bmap: bmap base structure + * @bt_ops: B-tree operation table + */ +struct nilfs_btree { + struct nilfs_bmap bt_bmap; + + /* B-tree-specific members */ + const struct nilfs_btree_operations *bt_ops; +}; + + +#define NILFS_BTREE_ROOT_SIZE NILFS_BMAP_SIZE +#define NILFS_BTREE_ROOT_NCHILDREN_MAX \ + ((NILFS_BTREE_ROOT_SIZE - sizeof(struct nilfs_btree_node)) / \ + (sizeof(__le64 /* dkey */) + sizeof(__le64 /* dptr */))) +#define NILFS_BTREE_ROOT_NCHILDREN_MIN 0 +#define NILFS_BTREE_NODE_EXTRA_PAD_SIZE (sizeof(__le64)) +#define NILFS_BTREE_NODE_NCHILDREN_MAX(nodesize) \ + (((nodesize) - sizeof(struct nilfs_btree_node) - \ + NILFS_BTREE_NODE_EXTRA_PAD_SIZE) / \ + (sizeof(__le64 /* dkey */) + sizeof(__le64 /* dptr */))) +#define NILFS_BTREE_NODE_NCHILDREN_MIN(nodesize) \ + ((NILFS_BTREE_NODE_NCHILDREN_MAX(nodesize) - 1) / 2 + 1) +#define NILFS_BTREE_KEY_MIN ((__u64)0) +#define NILFS_BTREE_KEY_MAX (~(__u64)0) + + +int nilfs_btree_path_cache_init(void); +void nilfs_btree_path_cache_destroy(void); +int nilfs_btree_init(struct nilfs_bmap *, __u64, __u64); +int nilfs_btree_convert_and_insert(struct nilfs_bmap *, __u64, __u64, + const __u64 *, const __u64 *, + int, __u64, __u64); +void nilfs_btree_init_gc(struct nilfs_bmap *); + +#endif /* _NILFS_BTREE_H */ diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c new file mode 100644 index 000000000000..e90b60dfced9 --- /dev/null +++ b/fs/nilfs2/cpfile.c @@ -0,0 +1,925 @@ +/* + * cpfile.c - NILFS checkpoint file. + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato <koji@osrg.net>. + */ + +#include <linux/kernel.h> +#include <linux/fs.h> +#include <linux/string.h> +#include <linux/buffer_head.h> +#include <linux/errno.h> +#include <linux/nilfs2_fs.h> +#include "mdt.h" +#include "cpfile.h" + + +static inline unsigned long +nilfs_cpfile_checkpoints_per_block(const struct inode *cpfile) +{ + return NILFS_MDT(cpfile)->mi_entries_per_block; +} + +/* block number from the beginning of the file */ +static unsigned long +nilfs_cpfile_get_blkoff(const struct inode *cpfile, __u64 cno) +{ + __u64 tcno = cno + NILFS_MDT(cpfile)->mi_first_entry_offset - 1; + do_div(tcno, nilfs_cpfile_checkpoints_per_block(cpfile)); + return (unsigned long)tcno; +} + +/* offset in block */ +static unsigned long +nilfs_cpfile_get_offset(const struct inode *cpfile, __u64 cno) +{ + __u64 tcno = cno + NILFS_MDT(cpfile)->mi_first_entry_offset - 1; + return do_div(tcno, nilfs_cpfile_checkpoints_per_block(cpfile)); +} + +static unsigned long +nilfs_cpfile_checkpoints_in_block(const struct inode *cpfile, + __u64 curr, + __u64 max) +{ + return min_t(__u64, + nilfs_cpfile_checkpoints_per_block(cpfile) - + nilfs_cpfile_get_offset(cpfile, curr), + max - curr); +} + +static inline int nilfs_cpfile_is_in_first(const struct inode *cpfile, + __u64 cno) +{ + return nilfs_cpfile_get_blkoff(cpfile, cno) == 0; +} + +static unsigned int +nilfs_cpfile_block_add_valid_checkpoints(const struct inode *cpfile, + struct buffer_head *bh, + void *kaddr, + unsigned int n) +{ + struct nilfs_checkpoint *cp = kaddr + bh_offset(bh); + unsigned int count; + + count = le32_to_cpu(cp->cp_checkpoints_count) + n; + cp->cp_checkpoints_count = cpu_to_le32(count); + return count; +} + +static unsigned int +nilfs_cpfile_block_sub_valid_checkpoints(const struct inode *cpfile, + struct buffer_head *bh, + void *kaddr, + unsigned int n) +{ + struct nilfs_checkpoint *cp = kaddr + bh_offset(bh); + unsigned int count; + + WARN_ON(le32_to_cpu(cp->cp_checkpoints_count) < n); + count = le32_to_cpu(cp->cp_checkpoints_count) - n; + cp->cp_checkpoints_count = cpu_to_le32(count); + return count; +} + +static inline struct nilfs_cpfile_header * +nilfs_cpfile_block_get_header(const struct inode *cpfile, + struct buffer_head *bh, + void *kaddr) +{ + return kaddr + bh_offset(bh); +} + +static struct nilfs_checkpoint * +nilfs_cpfile_block_get_checkpoint(const struct inode *cpfile, __u64 cno, + struct buffer_head *bh, + void *kaddr) +{ + return kaddr + bh_offset(bh) + nilfs_cpfile_get_offset(cpfile, cno) * + NILFS_MDT(cpfile)->mi_entry_size; +} + +static void nilfs_cpfile_block_init(struct inode *cpfile, + struct buffer_head *bh, + void *kaddr) +{ + struct nilfs_checkpoint *cp = kaddr + bh_offset(bh); + size_t cpsz = NILFS_MDT(cpfile)->mi_entry_size; + int n = nilfs_cpfile_checkpoints_per_block(cpfile); + + while (n-- > 0) { + nilfs_checkpoint_set_invalid(cp); + cp = (void *)cp + cpsz; + } +} + +static inline int nilfs_cpfile_get_header_block(struct inode *cpfile, + struct buffer_head **bhp) +{ + return nilfs_mdt_get_block(cpfile, 0, 0, NULL, bhp); +} + +static inline int nilfs_cpfile_get_checkpoint_block(struct inode *cpfile, + __u64 cno, + int create, + struct buffer_head **bhp) +{ + return nilfs_mdt_get_block(cpfile, + nilfs_cpfile_get_blkoff(cpfile, cno), + create, nilfs_cpfile_block_init, bhp); +} + +static inline int nilfs_cpfile_delete_checkpoint_block(struct inode *cpfile, + __u64 cno) +{ + return nilfs_mdt_delete_block(cpfile, + nilfs_cpfile_get_blkoff(cpfile, cno)); +} + +/** + * nilfs_cpfile_get_checkpoint - get a checkpoint + * @cpfile: inode of checkpoint file + * @cno: checkpoint number + * @create: create flag + * @cpp: pointer to a checkpoint + * @bhp: pointer to a buffer head + * + * Description: nilfs_cpfile_get_checkpoint() acquires the checkpoint + * specified by @cno. A new checkpoint will be created if @cno is the current + * checkpoint number and @create is nonzero. + * + * Return Value: On success, 0 is returned, and the checkpoint and the + * buffer head of the buffer on which the checkpoint is located are stored in + * the place pointed by @cpp and @bhp, respectively. On error, one of the + * following negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOENT - No such checkpoint. + * + * %-EINVAL - invalid checkpoint. + */ +int nilfs_cpfile_get_checkpoint(struct inode *cpfile, + __u64 cno, + int create, + struct nilfs_checkpoint **cpp, + struct buffer_head **bhp) +{ + struct buffer_head *header_bh, *cp_bh; + struct nilfs_cpfile_header *header; + struct nilfs_checkpoint *cp; + void *kaddr; + int ret; + + if (unlikely(cno < 1 || cno > nilfs_mdt_cno(cpfile) || + (cno < nilfs_mdt_cno(cpfile) && create))) + return -EINVAL; + + down_write(&NILFS_MDT(cpfile)->mi_sem); + + ret = nilfs_cpfile_get_header_block(cpfile, &header_bh); + if (ret < 0) + goto out_sem; + ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, create, &cp_bh); + if (ret < 0) + goto out_header; + kaddr = kmap(cp_bh->b_page); + cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr); + if (nilfs_checkpoint_invalid(cp)) { + if (!create) { + kunmap(cp_bh->b_page); + brelse(cp_bh); + ret = -ENOENT; + goto out_header; + } + /* a newly-created checkpoint */ + nilfs_checkpoint_clear_invalid(cp); + if (!nilfs_cpfile_is_in_first(cpfile, cno)) + nilfs_cpfile_block_add_valid_checkpoints(cpfile, cp_bh, + kaddr, 1); + nilfs_mdt_mark_buffer_dirty(cp_bh); + + kaddr = kmap_atomic(header_bh->b_page, KM_USER0); + header = nilfs_cpfile_block_get_header(cpfile, header_bh, + kaddr); + le64_add_cpu(&header->ch_ncheckpoints, 1); + kunmap_atomic(kaddr, KM_USER0); + nilfs_mdt_mark_buffer_dirty(header_bh); + nilfs_mdt_mark_dirty(cpfile); + } + + if (cpp != NULL) + *cpp = cp; + *bhp = cp_bh; + + out_header: + brelse(header_bh); + + out_sem: + up_write(&NILFS_MDT(cpfile)->mi_sem); + return ret; +} + +/** + * nilfs_cpfile_put_checkpoint - put a checkpoint + * @cpfile: inode of checkpoint file + * @cno: checkpoint number + * @bh: buffer head + * + * Description: nilfs_cpfile_put_checkpoint() releases the checkpoint + * specified by @cno. @bh must be the buffer head which has been returned by + * a previous call to nilfs_cpfile_get_checkpoint() with @cno. + */ +void nilfs_cpfile_put_checkpoint(struct inode *cpfile, __u64 cno, + struct buffer_head *bh) +{ + kunmap(bh->b_page); + brelse(bh); +} + +/** + * nilfs_cpfile_delete_checkpoints - delete checkpoints + * @cpfile: inode of checkpoint file + * @start: start checkpoint number + * @end: end checkpoint numer + * + * Description: nilfs_cpfile_delete_checkpoints() deletes the checkpoints in + * the period from @start to @end, excluding @end itself. The checkpoints + * which have been already deleted are ignored. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-EINVAL - invalid checkpoints. + */ +int nilfs_cpfile_delete_checkpoints(struct inode *cpfile, + __u64 start, + __u64 end) +{ + struct buffer_head *header_bh, *cp_bh; + struct nilfs_cpfile_header *header; + struct nilfs_checkpoint *cp; + size_t cpsz = NILFS_MDT(cpfile)->mi_entry_size; + __u64 cno; + void *kaddr; + unsigned long tnicps; + int ret, ncps, nicps, count, i; + + if (unlikely(start == 0 || start > end)) { + printk(KERN_ERR "%s: invalid range of checkpoint numbers: " + "[%llu, %llu)\n", __func__, + (unsigned long long)start, (unsigned long long)end); + return -EINVAL; + } + + /* cannot delete the latest checkpoint */ + if (start == nilfs_mdt_cno(cpfile) - 1) + return -EPERM; + + down_write(&NILFS_MDT(cpfile)->mi_sem); + + ret = nilfs_cpfile_get_header_block(cpfile, &header_bh); + if (ret < 0) + goto out_sem; + tnicps = 0; + + for (cno = start; cno < end; cno += ncps) { + ncps = nilfs_cpfile_checkpoints_in_block(cpfile, cno, end); + ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &cp_bh); + if (ret < 0) { + if (ret != -ENOENT) + goto out_sem; + /* skip hole */ + ret = 0; + continue; + } + + kaddr = kmap_atomic(cp_bh->b_page, KM_USER0); + cp = nilfs_cpfile_block_get_checkpoint( + cpfile, cno, cp_bh, kaddr); + nicps = 0; + for (i = 0; i < ncps; i++, cp = (void *)cp + cpsz) { + WARN_ON(nilfs_checkpoint_snapshot(cp)); + if (!nilfs_checkpoint_invalid(cp)) { + nilfs_checkpoint_set_invalid(cp); + nicps++; + } + } + if (nicps > 0) { + tnicps += nicps; + nilfs_mdt_mark_buffer_dirty(cp_bh); + nilfs_mdt_mark_dirty(cpfile); + if (!nilfs_cpfile_is_in_first(cpfile, cno) && + (count = nilfs_cpfile_block_sub_valid_checkpoints( + cpfile, cp_bh, kaddr, nicps)) == 0) { + /* make hole */ + kunmap_atomic(kaddr, KM_USER0); + brelse(cp_bh); + ret = nilfs_cpfile_delete_checkpoint_block( + cpfile, cno); + if (ret == 0) + continue; + printk(KERN_ERR "%s: cannot delete block\n", + __func__); + goto out_sem; + } + } + + kunmap_atomic(kaddr, KM_USER0); + brelse(cp_bh); + } + + if (tnicps > 0) { + kaddr = kmap_atomic(header_bh->b_page, KM_USER0); + header = nilfs_cpfile_block_get_header(cpfile, header_bh, + kaddr); + le64_add_cpu(&header->ch_ncheckpoints, -(u64)tnicps); + nilfs_mdt_mark_buffer_dirty(header_bh); + nilfs_mdt_mark_dirty(cpfile); + kunmap_atomic(kaddr, KM_USER0); + } + brelse(header_bh); + + out_sem: + up_write(&NILFS_MDT(cpfile)->mi_sem); + return ret; +} + +static void nilfs_cpfile_checkpoint_to_cpinfo(struct inode *cpfile, + struct nilfs_checkpoint *cp, + struct nilfs_cpinfo *ci) +{ + ci->ci_flags = le32_to_cpu(cp->cp_flags); + ci->ci_cno = le64_to_cpu(cp->cp_cno); + ci->ci_create = le64_to_cpu(cp->cp_create); + ci->ci_nblk_inc = le64_to_cpu(cp->cp_nblk_inc); + ci->ci_inodes_count = le64_to_cpu(cp->cp_inodes_count); + ci->ci_blocks_count = le64_to_cpu(cp->cp_blocks_count); + ci->ci_next = le64_to_cpu(cp->cp_snapshot_list.ssl_next); +} + +static ssize_t nilfs_cpfile_do_get_cpinfo(struct inode *cpfile, __u64 *cnop, + struct nilfs_cpinfo *ci, size_t nci) +{ + struct nilfs_checkpoint *cp; + struct buffer_head *bh; + size_t cpsz = NILFS_MDT(cpfile)->mi_entry_size; + __u64 cur_cno = nilfs_mdt_cno(cpfile), cno = *cnop; + void *kaddr; + int n, ret; + int ncps, i; + + if (cno == 0) + return -ENOENT; /* checkpoint number 0 is invalid */ + down_read(&NILFS_MDT(cpfile)->mi_sem); + + for (n = 0; cno < cur_cno && n < nci; cno += ncps) { + ncps = nilfs_cpfile_checkpoints_in_block(cpfile, cno, cur_cno); + ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &bh); + if (ret < 0) { + if (ret != -ENOENT) + goto out; + continue; /* skip hole */ + } + + kaddr = kmap_atomic(bh->b_page, KM_USER0); + cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr); + for (i = 0; i < ncps && n < nci; i++, cp = (void *)cp + cpsz) { + if (!nilfs_checkpoint_invalid(cp)) + nilfs_cpfile_checkpoint_to_cpinfo( + cpfile, cp, &ci[n++]); + } + kunmap_atomic(kaddr, KM_USER0); + brelse(bh); + } + + ret = n; + if (n > 0) + *cnop = ci[n - 1].ci_cno + 1; + + out: + up_read(&NILFS_MDT(cpfile)->mi_sem); + return ret; +} + +static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop, + struct nilfs_cpinfo *ci, size_t nci) +{ + struct buffer_head *bh; + struct nilfs_cpfile_header *header; + struct nilfs_checkpoint *cp; + __u64 curr = *cnop, next; + unsigned long curr_blkoff, next_blkoff; + void *kaddr; + int n = 0, ret; + + down_read(&NILFS_MDT(cpfile)->mi_sem); + + if (curr == 0) { + ret = nilfs_cpfile_get_header_block(cpfile, &bh); + if (ret < 0) + goto out; + kaddr = kmap_atomic(bh->b_page, KM_USER0); + header = nilfs_cpfile_block_get_header(cpfile, bh, kaddr); + curr = le64_to_cpu(header->ch_snapshot_list.ssl_next); + kunmap_atomic(kaddr, KM_USER0); + brelse(bh); + if (curr == 0) { + ret = 0; + goto out; + } + } else if (unlikely(curr == ~(__u64)0)) { + ret = 0; + goto out; + } + + curr_blkoff = nilfs_cpfile_get_blkoff(cpfile, curr); + ret = nilfs_cpfile_get_checkpoint_block(cpfile, curr, 0, &bh); + if (unlikely(ret < 0)) { + if (ret == -ENOENT) + ret = 0; /* No snapshots (started from a hole block) */ + goto out; + } + kaddr = kmap_atomic(bh->b_page, KM_USER0); + while (n < nci) { + cp = nilfs_cpfile_block_get_checkpoint(cpfile, curr, bh, kaddr); + curr = ~(__u64)0; /* Terminator */ + if (unlikely(nilfs_checkpoint_invalid(cp) || + !nilfs_checkpoint_snapshot(cp))) + break; + nilfs_cpfile_checkpoint_to_cpinfo(cpfile, cp, &ci[n++]); + next = le64_to_cpu(cp->cp_snapshot_list.ssl_next); + if (next == 0) + break; /* reach end of the snapshot list */ + + next_blkoff = nilfs_cpfile_get_blkoff(cpfile, next); + if (curr_blkoff != next_blkoff) { + kunmap_atomic(kaddr, KM_USER0); + brelse(bh); + ret = nilfs_cpfile_get_checkpoint_block(cpfile, next, + 0, &bh); + if (unlikely(ret < 0)) { + WARN_ON(ret == -ENOENT); + goto out; + } + kaddr = kmap_atomic(bh->b_page, KM_USER0); + } + curr = next; + curr_blkoff = next_blkoff; + } + kunmap_atomic(kaddr, KM_USER0); + brelse(bh); + *cnop = curr; + ret = n; + + out: + up_read(&NILFS_MDT(cpfile)->mi_sem); + return ret; +} + +/** + * nilfs_cpfile_get_cpinfo - + * @cpfile: + * @cno: + * @ci: + * @nci: + */ + +ssize_t nilfs_cpfile_get_cpinfo(struct inode *cpfile, __u64 *cnop, int mode, + struct nilfs_cpinfo *ci, size_t nci) +{ + switch (mode) { + case NILFS_CHECKPOINT: + return nilfs_cpfile_do_get_cpinfo(cpfile, cnop, ci, nci); + case NILFS_SNAPSHOT: + return nilfs_cpfile_do_get_ssinfo(cpfile, cnop, ci, nci); + default: + return -EINVAL; + } +} + +/** + * nilfs_cpfile_delete_checkpoint - + * @cpfile: + * @cno: + */ +int nilfs_cpfile_delete_checkpoint(struct inode *cpfile, __u64 cno) +{ + struct nilfs_cpinfo ci; + __u64 tcno = cno; + ssize_t nci; + int ret; + + nci = nilfs_cpfile_do_get_cpinfo(cpfile, &tcno, &ci, 1); + if (nci < 0) + return nci; + else if (nci == 0 || ci.ci_cno != cno) + return -ENOENT; + + /* cannot delete the latest checkpoint nor snapshots */ + ret = nilfs_cpinfo_snapshot(&ci); + if (ret < 0) + return ret; + else if (ret > 0 || cno == nilfs_mdt_cno(cpfile) - 1) + return -EPERM; + + return nilfs_cpfile_delete_checkpoints(cpfile, cno, cno + 1); +} + +static struct nilfs_snapshot_list * +nilfs_cpfile_block_get_snapshot_list(const struct inode *cpfile, + __u64 cno, + struct buffer_head *bh, + void *kaddr) +{ + struct nilfs_cpfile_header *header; + struct nilfs_checkpoint *cp; + struct nilfs_snapshot_list *list; + + if (cno != 0) { + cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr); + list = &cp->cp_snapshot_list; + } else { + header = nilfs_cpfile_block_get_header(cpfile, bh, kaddr); + list = &header->ch_snapshot_list; + } + return list; +} + +static int nilfs_cpfile_set_snapshot(struct inode *cpfile, __u64 cno) +{ + struct buffer_head *header_bh, *curr_bh, *prev_bh, *cp_bh; + struct nilfs_cpfile_header *header; + struct nilfs_checkpoint *cp; + struct nilfs_snapshot_list *list; + __u64 curr, prev; + unsigned long curr_blkoff, prev_blkoff; + void *kaddr; + int ret; + + if (cno == 0) + return -ENOENT; /* checkpoint number 0 is invalid */ + down_write(&NILFS_MDT(cpfile)->mi_sem); + + ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &cp_bh); + if (ret < 0) + goto out_sem; + kaddr = kmap_atomic(cp_bh->b_page, KM_USER0); + cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr); + if (nilfs_checkpoint_invalid(cp)) { + ret = -ENOENT; + kunmap_atomic(kaddr, KM_USER0); + goto out_cp; + } + if (nilfs_checkpoint_snapshot(cp)) { + ret = 0; + kunmap_atomic(kaddr, KM_USER0); + goto out_cp; + } + kunmap_atomic(kaddr, KM_USER0); + + ret = nilfs_cpfile_get_header_block(cpfile, &header_bh); + if (ret < 0) + goto out_cp; + kaddr = kmap_atomic(header_bh->b_page, KM_USER0); + header = nilfs_cpfile_block_get_header(cpfile, header_bh, kaddr); + list = &header->ch_snapshot_list; + curr_bh = header_bh; + get_bh(curr_bh); + curr = 0; + curr_blkoff = 0; + prev = le64_to_cpu(list->ssl_prev); + while (prev > cno) { + prev_blkoff = nilfs_cpfile_get_blkoff(cpfile, prev); + curr = prev; + if (curr_blkoff != prev_blkoff) { + kunmap_atomic(kaddr, KM_USER0); + brelse(curr_bh); + ret = nilfs_cpfile_get_checkpoint_block(cpfile, curr, + 0, &curr_bh); + if (ret < 0) + goto out_header; + kaddr = kmap_atomic(curr_bh->b_page, KM_USER0); + } + curr_blkoff = prev_blkoff; + cp = nilfs_cpfile_block_get_checkpoint( + cpfile, curr, curr_bh, kaddr); + list = &cp->cp_snapshot_list; + prev = le64_to_cpu(list->ssl_prev); + } + kunmap_atomic(kaddr, KM_USER0); + + if (prev != 0) { + ret = nilfs_cpfile_get_checkpoint_block(cpfile, prev, 0, + &prev_bh); + if (ret < 0) + goto out_curr; + } else { + prev_bh = header_bh; + get_bh(prev_bh); + } + + kaddr = kmap_atomic(curr_bh->b_page, KM_USER0); + list = nilfs_cpfile_block_get_snapshot_list( + cpfile, curr, curr_bh, kaddr); + list->ssl_prev = cpu_to_le64(cno); + kunmap_atomic(kaddr, KM_USER0); + + kaddr = kmap_atomic(cp_bh->b_page, KM_USER0); + cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr); + cp->cp_snapshot_list.ssl_next = cpu_to_le64(curr); + cp->cp_snapshot_list.ssl_prev = cpu_to_le64(prev); + nilfs_checkpoint_set_snapshot(cp); + kunmap_atomic(kaddr, KM_USER0); + + kaddr = kmap_atomic(prev_bh->b_page, KM_USER0); + list = nilfs_cpfile_block_get_snapshot_list( + cpfile, prev, prev_bh, kaddr); + list->ssl_next = cpu_to_le64(cno); + kunmap_atomic(kaddr, KM_USER0); + + kaddr = kmap_atomic(header_bh->b_page, KM_USER0); + header = nilfs_cpfile_block_get_header(cpfile, header_bh, kaddr); + le64_add_cpu(&header->ch_nsnapshots, 1); + kunmap_atomic(kaddr, KM_USER0); + + nilfs_mdt_mark_buffer_dirty(prev_bh); + nilfs_mdt_mark_buffer_dirty(curr_bh); + nilfs_mdt_mark_buffer_dirty(cp_bh); + nilfs_mdt_mark_buffer_dirty(header_bh); + nilfs_mdt_mark_dirty(cpfile); + + brelse(prev_bh); + + out_curr: + brelse(curr_bh); + + out_header: + brelse(header_bh); + + out_cp: + brelse(cp_bh); + + out_sem: + up_write(&NILFS_MDT(cpfile)->mi_sem); + return ret; +} + +static int nilfs_cpfile_clear_snapshot(struct inode *cpfile, __u64 cno) +{ + struct buffer_head *header_bh, *next_bh, *prev_bh, *cp_bh; + struct nilfs_cpfile_header *header; + struct nilfs_checkpoint *cp; + struct nilfs_snapshot_list *list; + __u64 next, prev; + void *kaddr; + int ret; + + if (cno == 0) + return -ENOENT; /* checkpoint number 0 is invalid */ + down_write(&NILFS_MDT(cpfile)->mi_sem); + + ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &cp_bh); + if (ret < 0) + goto out_sem; + kaddr = kmap_atomic(cp_bh->b_page, KM_USER0); + cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr); + if (nilfs_checkpoint_invalid(cp)) { + ret = -ENOENT; + kunmap_atomic(kaddr, KM_USER0); + goto out_cp; + } + if (!nilfs_checkpoint_snapshot(cp)) { + ret = 0; + kunmap_atomic(kaddr, KM_USER0); + goto out_cp; + } + + list = &cp->cp_snapshot_list; + next = le64_to_cpu(list->ssl_next); + prev = le64_to_cpu(list->ssl_prev); + kunmap_atomic(kaddr, KM_USER0); + + ret = nilfs_cpfile_get_header_block(cpfile, &header_bh); + if (ret < 0) + goto out_cp; + if (next != 0) { + ret = nilfs_cpfile_get_checkpoint_block(cpfile, next, 0, + &next_bh); + if (ret < 0) + goto out_header; + } else { + next_bh = header_bh; + get_bh(next_bh); + } + if (prev != 0) { + ret = nilfs_cpfile_get_checkpoint_block(cpfile, prev, 0, + &prev_bh); + if (ret < 0) + goto out_next; + } else { + prev_bh = header_bh; + get_bh(prev_bh); + } + + kaddr = kmap_atomic(next_bh->b_page, KM_USER0); + list = nilfs_cpfile_block_get_snapshot_list( + cpfile, next, next_bh, kaddr); + list->ssl_prev = cpu_to_le64(prev); + kunmap_atomic(kaddr, KM_USER0); + + kaddr = kmap_atomic(prev_bh->b_page, KM_USER0); + list = nilfs_cpfile_block_get_snapshot_list( + cpfile, prev, prev_bh, kaddr); + list->ssl_next = cpu_to_le64(next); + kunmap_atomic(kaddr, KM_USER0); + + kaddr = kmap_atomic(cp_bh->b_page, KM_USER0); + cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr); + cp->cp_snapshot_list.ssl_next = cpu_to_le64(0); + cp->cp_snapshot_list.ssl_prev = cpu_to_le64(0); + nilfs_checkpoint_clear_snapshot(cp); + kunmap_atomic(kaddr, KM_USER0); + + kaddr = kmap_atomic(header_bh->b_page, KM_USER0); + header = nilfs_cpfile_block_get_header(cpfile, header_bh, kaddr); + le64_add_cpu(&header->ch_nsnapshots, -1); + kunmap_atomic(kaddr, KM_USER0); + + nilfs_mdt_mark_buffer_dirty(next_bh); + nilfs_mdt_mark_buffer_dirty(prev_bh); + nilfs_mdt_mark_buffer_dirty(cp_bh); + nilfs_mdt_mark_buffer_dirty(header_bh); + nilfs_mdt_mark_dirty(cpfile); + + brelse(prev_bh); + + out_next: + brelse(next_bh); + + out_header: + brelse(header_bh); + + out_cp: + brelse(cp_bh); + + out_sem: + up_write(&NILFS_MDT(cpfile)->mi_sem); + return ret; +} + +/** + * nilfs_cpfile_is_snapshot - + * @cpfile: inode of checkpoint file + * @cno: checkpoint number + * + * Description: + * + * Return Value: On success, 1 is returned if the checkpoint specified by + * @cno is a snapshot, or 0 if not. On error, one of the following negative + * error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOENT - No such checkpoint. + */ +int nilfs_cpfile_is_snapshot(struct inode *cpfile, __u64 cno) +{ + struct buffer_head *bh; + struct nilfs_checkpoint *cp; + void *kaddr; + int ret; + + if (cno == 0) + return -ENOENT; /* checkpoint number 0 is invalid */ + down_read(&NILFS_MDT(cpfile)->mi_sem); + + ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &bh); + if (ret < 0) + goto out; + kaddr = kmap_atomic(bh->b_page, KM_USER0); + cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr); + ret = nilfs_checkpoint_snapshot(cp); + kunmap_atomic(kaddr, KM_USER0); + brelse(bh); + + out: + up_read(&NILFS_MDT(cpfile)->mi_sem); + return ret; +} + +/** + * nilfs_cpfile_change_cpmode - change checkpoint mode + * @cpfile: inode of checkpoint file + * @cno: checkpoint number + * @status: mode of checkpoint + * + * Description: nilfs_change_cpmode() changes the mode of the checkpoint + * specified by @cno. The mode @mode is NILFS_CHECKPOINT or NILFS_SNAPSHOT. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOENT - No such checkpoint. + */ +int nilfs_cpfile_change_cpmode(struct inode *cpfile, __u64 cno, int mode) +{ + struct the_nilfs *nilfs; + int ret; + + nilfs = NILFS_MDT(cpfile)->mi_nilfs; + + switch (mode) { + case NILFS_CHECKPOINT: + /* + * Check for protecting existing snapshot mounts: + * bd_mount_sem is used to make this operation atomic and + * exclusive with a new mount job. Though it doesn't cover + * umount, it's enough for the purpose. + */ + down(&nilfs->ns_bdev->bd_mount_sem); + if (nilfs_checkpoint_is_mounted(nilfs, cno, 1)) { + /* Current implementation does not have to protect + plain read-only mounts since they are exclusive + with a read/write mount and are protected from the + cleaner. */ + ret = -EBUSY; + } else + ret = nilfs_cpfile_clear_snapshot(cpfile, cno); + up(&nilfs->ns_bdev->bd_mount_sem); + return ret; + case NILFS_SNAPSHOT: + return nilfs_cpfile_set_snapshot(cpfile, cno); + default: + return -EINVAL; + } +} + +/** + * nilfs_cpfile_get_stat - get checkpoint statistics + * @cpfile: inode of checkpoint file + * @stat: pointer to a structure of checkpoint statistics + * + * Description: nilfs_cpfile_get_stat() returns information about checkpoints. + * + * Return Value: On success, 0 is returned, and checkpoints information is + * stored in the place pointed by @stat. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + */ +int nilfs_cpfile_get_stat(struct inode *cpfile, struct nilfs_cpstat *cpstat) +{ + struct buffer_head *bh; + struct nilfs_cpfile_header *header; + void *kaddr; + int ret; + + down_read(&NILFS_MDT(cpfile)->mi_sem); + + ret = nilfs_cpfile_get_header_block(cpfile, &bh); + if (ret < 0) + goto out_sem; + kaddr = kmap_atomic(bh->b_page, KM_USER0); + header = nilfs_cpfile_block_get_header(cpfile, bh, kaddr); + cpstat->cs_cno = nilfs_mdt_cno(cpfile); + cpstat->cs_ncps = le64_to_cpu(header->ch_ncheckpoints); + cpstat->cs_nsss = le64_to_cpu(header->ch_nsnapshots); + kunmap_atomic(kaddr, KM_USER0); + brelse(bh); + + out_sem: + up_read(&NILFS_MDT(cpfile)->mi_sem); + return ret; +} diff --git a/fs/nilfs2/cpfile.h b/fs/nilfs2/cpfile.h new file mode 100644 index 000000000000..1a8a1008c342 --- /dev/null +++ b/fs/nilfs2/cpfile.h @@ -0,0 +1,45 @@ +/* + * cpfile.h - NILFS checkpoint file. + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato <koji@osrg.net>. + */ + +#ifndef _NILFS_CPFILE_H +#define _NILFS_CPFILE_H + +#include <linux/fs.h> +#include <linux/buffer_head.h> +#include <linux/nilfs2_fs.h> + +#define NILFS_CPFILE_GFP NILFS_MDT_GFP + + +int nilfs_cpfile_get_checkpoint(struct inode *, __u64, int, + struct nilfs_checkpoint **, + struct buffer_head **); +void nilfs_cpfile_put_checkpoint(struct inode *, __u64, struct buffer_head *); +int nilfs_cpfile_delete_checkpoints(struct inode *, __u64, __u64); +int nilfs_cpfile_delete_checkpoint(struct inode *, __u64); +int nilfs_cpfile_change_cpmode(struct inode *, __u64, int); +int nilfs_cpfile_is_snapshot(struct inode *, __u64); +int nilfs_cpfile_get_stat(struct inode *, struct nilfs_cpstat *); +ssize_t nilfs_cpfile_get_cpinfo(struct inode *, __u64 *, int, + struct nilfs_cpinfo *, size_t); + +#endif /* _NILFS_CPFILE_H */ diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c new file mode 100644 index 000000000000..bb8a5818e7f1 --- /dev/null +++ b/fs/nilfs2/dat.c @@ -0,0 +1,430 @@ +/* + * dat.c - NILFS disk address translation. + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato <koji@osrg.net>. + */ + +#include <linux/types.h> +#include <linux/buffer_head.h> +#include <linux/string.h> +#include <linux/errno.h> +#include "nilfs.h" +#include "mdt.h" +#include "alloc.h" +#include "dat.h" + + +#define NILFS_CNO_MIN ((__u64)1) +#define NILFS_CNO_MAX (~(__u64)0) + +static int nilfs_dat_prepare_entry(struct inode *dat, + struct nilfs_palloc_req *req, int create) +{ + return nilfs_palloc_get_entry_block(dat, req->pr_entry_nr, + create, &req->pr_entry_bh); +} + +static void nilfs_dat_commit_entry(struct inode *dat, + struct nilfs_palloc_req *req) +{ + nilfs_mdt_mark_buffer_dirty(req->pr_entry_bh); + nilfs_mdt_mark_dirty(dat); + brelse(req->pr_entry_bh); +} + +static void nilfs_dat_abort_entry(struct inode *dat, + struct nilfs_palloc_req *req) +{ + brelse(req->pr_entry_bh); +} + +int nilfs_dat_prepare_alloc(struct inode *dat, struct nilfs_palloc_req *req) +{ + int ret; + + ret = nilfs_palloc_prepare_alloc_entry(dat, req); + if (ret < 0) + return ret; + + ret = nilfs_dat_prepare_entry(dat, req, 1); + if (ret < 0) + nilfs_palloc_abort_alloc_entry(dat, req); + + return ret; +} + +void nilfs_dat_commit_alloc(struct inode *dat, struct nilfs_palloc_req *req) +{ + struct nilfs_dat_entry *entry; + void *kaddr; + + kaddr = kmap_atomic(req->pr_entry_bh->b_page, KM_USER0); + entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr, + req->pr_entry_bh, kaddr); + entry->de_start = cpu_to_le64(NILFS_CNO_MIN); + entry->de_end = cpu_to_le64(NILFS_CNO_MAX); + entry->de_blocknr = cpu_to_le64(0); + kunmap_atomic(kaddr, KM_USER0); + + nilfs_palloc_commit_alloc_entry(dat, req); + nilfs_dat_commit_entry(dat, req); +} + +void nilfs_dat_abort_alloc(struct inode *dat, struct nilfs_palloc_req *req) +{ + nilfs_dat_abort_entry(dat, req); + nilfs_palloc_abort_alloc_entry(dat, req); +} + +int nilfs_dat_prepare_free(struct inode *dat, struct nilfs_palloc_req *req) +{ + int ret; + + ret = nilfs_palloc_prepare_free_entry(dat, req); + if (ret < 0) + return ret; + ret = nilfs_dat_prepare_entry(dat, req, 0); + if (ret < 0) { + nilfs_palloc_abort_free_entry(dat, req); + return ret; + } + return 0; +} + +void nilfs_dat_commit_free(struct inode *dat, struct nilfs_palloc_req *req) +{ + struct nilfs_dat_entry *entry; + void *kaddr; + + kaddr = kmap_atomic(req->pr_entry_bh->b_page, KM_USER0); + entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr, + req->pr_entry_bh, kaddr); + entry->de_start = cpu_to_le64(NILFS_CNO_MIN); + entry->de_end = cpu_to_le64(NILFS_CNO_MIN); + entry->de_blocknr = cpu_to_le64(0); + kunmap_atomic(kaddr, KM_USER0); + + nilfs_dat_commit_entry(dat, req); + nilfs_palloc_commit_free_entry(dat, req); +} + +void nilfs_dat_abort_free(struct inode *dat, struct nilfs_palloc_req *req) +{ + nilfs_dat_abort_entry(dat, req); + nilfs_palloc_abort_free_entry(dat, req); +} + +int nilfs_dat_prepare_start(struct inode *dat, struct nilfs_palloc_req *req) +{ + int ret; + + ret = nilfs_dat_prepare_entry(dat, req, 0); + WARN_ON(ret == -ENOENT); + return ret; +} + +void nilfs_dat_commit_start(struct inode *dat, struct nilfs_palloc_req *req, + sector_t blocknr) +{ + struct nilfs_dat_entry *entry; + void *kaddr; + + kaddr = kmap_atomic(req->pr_entry_bh->b_page, KM_USER0); + entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr, + req->pr_entry_bh, kaddr); + entry->de_start = cpu_to_le64(nilfs_mdt_cno(dat)); + if (entry->de_blocknr != cpu_to_le64(0) || + entry->de_end != cpu_to_le64(NILFS_CNO_MAX)) { + printk(KERN_CRIT + "%s: vbn = %llu, start = %llu, end = %llu, pbn = %llu\n", + __func__, (unsigned long long)req->pr_entry_nr, + (unsigned long long)le64_to_cpu(entry->de_start), + (unsigned long long)le64_to_cpu(entry->de_end), + (unsigned long long)le64_to_cpu(entry->de_blocknr)); + } + entry->de_blocknr = cpu_to_le64(blocknr); + kunmap_atomic(kaddr, KM_USER0); + + nilfs_dat_commit_entry(dat, req); +} + +void nilfs_dat_abort_start(struct inode *dat, struct nilfs_palloc_req *req) +{ + nilfs_dat_abort_entry(dat, req); +} + +int nilfs_dat_prepare_end(struct inode *dat, struct nilfs_palloc_req *req) +{ + struct nilfs_dat_entry *entry; + __u64 start; + sector_t blocknr; + void *kaddr; + int ret; + + ret = nilfs_dat_prepare_entry(dat, req, 0); + if (ret < 0) { + WARN_ON(ret == -ENOENT); + return ret; + } + + kaddr = kmap_atomic(req->pr_entry_bh->b_page, KM_USER0); + entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr, + req->pr_entry_bh, kaddr); + start = le64_to_cpu(entry->de_start); + blocknr = le64_to_cpu(entry->de_blocknr); + kunmap_atomic(kaddr, KM_USER0); + + if (blocknr == 0) { + ret = nilfs_palloc_prepare_free_entry(dat, req); + if (ret < 0) { + nilfs_dat_abort_entry(dat, req); + return ret; + } + } + + return 0; +} + +void nilfs_dat_commit_end(struct inode *dat, struct nilfs_palloc_req *req, + int dead) +{ + struct nilfs_dat_entry *entry; + __u64 start, end; + sector_t blocknr; + void *kaddr; + + kaddr = kmap_atomic(req->pr_entry_bh->b_page, KM_USER0); + entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr, + req->pr_entry_bh, kaddr); + end = start = le64_to_cpu(entry->de_start); + if (!dead) { + end = nilfs_mdt_cno(dat); + WARN_ON(start > end); + } + entry->de_end = cpu_to_le64(end); + blocknr = le64_to_cpu(entry->de_blocknr); + kunmap_atomic(kaddr, KM_USER0); + + if (blocknr == 0) + nilfs_dat_commit_free(dat, req); + else + nilfs_dat_commit_entry(dat, req); +} + +void nilfs_dat_abort_end(struct inode *dat, struct nilfs_palloc_req *req) +{ + struct nilfs_dat_entry *entry; + __u64 start; + sector_t blocknr; + void *kaddr; + + kaddr = kmap_atomic(req->pr_entry_bh->b_page, KM_USER0); + entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr, + req->pr_entry_bh, kaddr); + start = le64_to_cpu(entry->de_start); + blocknr = le64_to_cpu(entry->de_blocknr); + kunmap_atomic(kaddr, KM_USER0); + + if (start == nilfs_mdt_cno(dat) && blocknr == 0) + nilfs_palloc_abort_free_entry(dat, req); + nilfs_dat_abort_entry(dat, req); +} + +/** + * nilfs_dat_mark_dirty - + * @dat: DAT file inode + * @vblocknr: virtual block number + * + * Description: + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + */ +int nilfs_dat_mark_dirty(struct inode *dat, __u64 vblocknr) +{ + struct nilfs_palloc_req req; + int ret; + + req.pr_entry_nr = vblocknr; + ret = nilfs_dat_prepare_entry(dat, &req, 0); + if (ret == 0) + nilfs_dat_commit_entry(dat, &req); + return ret; +} + +/** + * nilfs_dat_freev - free virtual block numbers + * @dat: DAT file inode + * @vblocknrs: array of virtual block numbers + * @nitems: number of virtual block numbers + * + * Description: nilfs_dat_freev() frees the virtual block numbers specified by + * @vblocknrs and @nitems. + * + * Return Value: On success, 0 is returned. On error, one of the following + * nagative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOENT - The virtual block number have not been allocated. + */ +int nilfs_dat_freev(struct inode *dat, __u64 *vblocknrs, size_t nitems) +{ + return nilfs_palloc_freev(dat, vblocknrs, nitems); +} + +/** + * nilfs_dat_move - change a block number + * @dat: DAT file inode + * @vblocknr: virtual block number + * @blocknr: block number + * + * Description: nilfs_dat_move() changes the block number associated with + * @vblocknr to @blocknr. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + */ +int nilfs_dat_move(struct inode *dat, __u64 vblocknr, sector_t blocknr) +{ + struct buffer_head *entry_bh; + struct nilfs_dat_entry *entry; + void *kaddr; + int ret; + + ret = nilfs_palloc_get_entry_block(dat, vblocknr, 0, &entry_bh); + if (ret < 0) + return ret; + kaddr = kmap_atomic(entry_bh->b_page, KM_USER0); + entry = nilfs_palloc_block_get_entry(dat, vblocknr, entry_bh, kaddr); + if (unlikely(entry->de_blocknr == cpu_to_le64(0))) { + printk(KERN_CRIT "%s: vbn = %llu, [%llu, %llu)\n", __func__, + (unsigned long long)vblocknr, + (unsigned long long)le64_to_cpu(entry->de_start), + (unsigned long long)le64_to_cpu(entry->de_end)); + kunmap_atomic(kaddr, KM_USER0); + brelse(entry_bh); + return -EINVAL; + } + WARN_ON(blocknr == 0); + entry->de_blocknr = cpu_to_le64(blocknr); + kunmap_atomic(kaddr, KM_USER0); + + nilfs_mdt_mark_buffer_dirty(entry_bh); + nilfs_mdt_mark_dirty(dat); + + brelse(entry_bh); + + return 0; +} + +/** + * nilfs_dat_translate - translate a virtual block number to a block number + * @dat: DAT file inode + * @vblocknr: virtual block number + * @blocknrp: pointer to a block number + * + * Description: nilfs_dat_translate() maps the virtual block number @vblocknr + * to the corresponding block number. + * + * Return Value: On success, 0 is returned and the block number associated + * with @vblocknr is stored in the place pointed by @blocknrp. On error, one + * of the following negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOENT - A block number associated with @vblocknr does not exist. + */ +int nilfs_dat_translate(struct inode *dat, __u64 vblocknr, sector_t *blocknrp) +{ + struct buffer_head *entry_bh; + struct nilfs_dat_entry *entry; + sector_t blocknr; + void *kaddr; + int ret; + + ret = nilfs_palloc_get_entry_block(dat, vblocknr, 0, &entry_bh); + if (ret < 0) + return ret; + + kaddr = kmap_atomic(entry_bh->b_page, KM_USER0); + entry = nilfs_palloc_block_get_entry(dat, vblocknr, entry_bh, kaddr); + blocknr = le64_to_cpu(entry->de_blocknr); + if (blocknr == 0) { + ret = -ENOENT; + goto out; + } + if (blocknrp != NULL) + *blocknrp = blocknr; + + out: + kunmap_atomic(kaddr, KM_USER0); + brelse(entry_bh); + return ret; +} + +ssize_t nilfs_dat_get_vinfo(struct inode *dat, struct nilfs_vinfo *vinfo, + size_t nvi) +{ + struct buffer_head *entry_bh; + struct nilfs_dat_entry *entry; + __u64 first, last; + void *kaddr; + unsigned long entries_per_block = NILFS_MDT(dat)->mi_entries_per_block; + int i, j, n, ret; + + for (i = 0; i < nvi; i += n) { + ret = nilfs_palloc_get_entry_block(dat, vinfo[i].vi_vblocknr, + 0, &entry_bh); + if (ret < 0) + return ret; + kaddr = kmap_atomic(entry_bh->b_page, KM_USER0); + /* last virtual block number in this block */ + first = vinfo[i].vi_vblocknr; + do_div(first, entries_per_block); + first *= entries_per_block; + last = first + entries_per_block - 1; + for (j = i, n = 0; + j < nvi && vinfo[j].vi_vblocknr >= first && + vinfo[j].vi_vblocknr <= last; + j++, n++) { + entry = nilfs_palloc_block_get_entry( + dat, vinfo[j].vi_vblocknr, entry_bh, kaddr); + vinfo[j].vi_start = le64_to_cpu(entry->de_start); + vinfo[j].vi_end = le64_to_cpu(entry->de_end); + vinfo[j].vi_blocknr = le64_to_cpu(entry->de_blocknr); + } + kunmap_atomic(kaddr, KM_USER0); + brelse(entry_bh); + } + + return nvi; +} diff --git a/fs/nilfs2/dat.h b/fs/nilfs2/dat.h new file mode 100644 index 000000000000..d9560654a4b7 --- /dev/null +++ b/fs/nilfs2/dat.h @@ -0,0 +1,52 @@ +/* + * dat.h - NILFS disk address translation. + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato <koji@osrg.net>. + */ + +#ifndef _NILFS_DAT_H +#define _NILFS_DAT_H + +#include <linux/types.h> +#include <linux/buffer_head.h> +#include <linux/fs.h> + +#define NILFS_DAT_GFP NILFS_MDT_GFP + +struct nilfs_palloc_req; + +int nilfs_dat_translate(struct inode *, __u64, sector_t *); + +int nilfs_dat_prepare_alloc(struct inode *, struct nilfs_palloc_req *); +void nilfs_dat_commit_alloc(struct inode *, struct nilfs_palloc_req *); +void nilfs_dat_abort_alloc(struct inode *, struct nilfs_palloc_req *); +int nilfs_dat_prepare_start(struct inode *, struct nilfs_palloc_req *); +void nilfs_dat_commit_start(struct inode *, struct nilfs_palloc_req *, + sector_t); +void nilfs_dat_abort_start(struct inode *, struct nilfs_palloc_req *); +int nilfs_dat_prepare_end(struct inode *, struct nilfs_palloc_req *); +void nilfs_dat_commit_end(struct inode *, struct nilfs_palloc_req *, int); +void nilfs_dat_abort_end(struct inode *, struct nilfs_palloc_req *); + +int nilfs_dat_mark_dirty(struct inode *, __u64); +int nilfs_dat_freev(struct inode *, __u64 *, size_t); +int nilfs_dat_move(struct inode *, __u64, sector_t); +ssize_t nilfs_dat_get_vinfo(struct inode *, struct nilfs_vinfo *, size_t); + +#endif /* _NILFS_DAT_H */ diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c new file mode 100644 index 000000000000..54100acc1102 --- /dev/null +++ b/fs/nilfs2/dir.c @@ -0,0 +1,711 @@ +/* + * dir.c - NILFS directory entry operations + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Modified for NILFS by Amagai Yoshiji <amagai@osrg.net> + */ +/* + * linux/fs/ext2/dir.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/dir.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * ext2 directory handling functions + * + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + * + * All code that works with directory layout had been switched to pagecache + * and moved here. AV + */ + +#include <linux/pagemap.h> +#include <linux/smp_lock.h> +#include "nilfs.h" +#include "page.h" + +/* + * nilfs uses block-sized chunks. Arguably, sector-sized ones would be + * more robust, but we have what we have + */ +static inline unsigned nilfs_chunk_size(struct inode *inode) +{ + return inode->i_sb->s_blocksize; +} + +static inline void nilfs_put_page(struct page *page) +{ + kunmap(page); + page_cache_release(page); +} + +static inline unsigned long dir_pages(struct inode *inode) +{ + return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT; +} + +/* + * Return the offset into page `page_nr' of the last valid + * byte in that page, plus one. + */ +static unsigned nilfs_last_byte(struct inode *inode, unsigned long page_nr) +{ + unsigned last_byte = inode->i_size; + + last_byte -= page_nr << PAGE_CACHE_SHIFT; + if (last_byte > PAGE_CACHE_SIZE) + last_byte = PAGE_CACHE_SIZE; + return last_byte; +} + +static int nilfs_prepare_chunk_uninterruptible(struct page *page, + struct address_space *mapping, + unsigned from, unsigned to) +{ + loff_t pos = page_offset(page) + from; + return block_write_begin(NULL, mapping, pos, to - from, + AOP_FLAG_UNINTERRUPTIBLE, &page, + NULL, nilfs_get_block); +} + +static int nilfs_prepare_chunk(struct page *page, + struct address_space *mapping, + unsigned from, unsigned to) +{ + loff_t pos = page_offset(page) + from; + return block_write_begin(NULL, mapping, pos, to - from, 0, &page, + NULL, nilfs_get_block); +} + +static int nilfs_commit_chunk(struct page *page, + struct address_space *mapping, + unsigned from, unsigned to) +{ + struct inode *dir = mapping->host; + struct nilfs_sb_info *sbi = NILFS_SB(dir->i_sb); + loff_t pos = page_offset(page) + from; + unsigned len = to - from; + unsigned nr_dirty, copied; + int err; + + nr_dirty = nilfs_page_count_clean_buffers(page, from, to); + copied = block_write_end(NULL, mapping, pos, len, len, page, NULL); + if (pos + copied > dir->i_size) { + i_size_write(dir, pos + copied); + mark_inode_dirty(dir); + } + if (IS_DIRSYNC(dir)) + nilfs_set_transaction_flag(NILFS_TI_SYNC); + err = nilfs_set_file_dirty(sbi, dir, nr_dirty); + unlock_page(page); + return err; +} + +static void nilfs_check_page(struct page *page) +{ + struct inode *dir = page->mapping->host; + struct super_block *sb = dir->i_sb; + unsigned chunk_size = nilfs_chunk_size(dir); + char *kaddr = page_address(page); + unsigned offs, rec_len; + unsigned limit = PAGE_CACHE_SIZE; + struct nilfs_dir_entry *p; + char *error; + + if ((dir->i_size >> PAGE_CACHE_SHIFT) == page->index) { + limit = dir->i_size & ~PAGE_CACHE_MASK; + if (limit & (chunk_size - 1)) + goto Ebadsize; + if (!limit) + goto out; + } + for (offs = 0; offs <= limit - NILFS_DIR_REC_LEN(1); offs += rec_len) { + p = (struct nilfs_dir_entry *)(kaddr + offs); + rec_len = le16_to_cpu(p->rec_len); + + if (rec_len < NILFS_DIR_REC_LEN(1)) + goto Eshort; + if (rec_len & 3) + goto Ealign; + if (rec_len < NILFS_DIR_REC_LEN(p->name_len)) + goto Enamelen; + if (((offs + rec_len - 1) ^ offs) & ~(chunk_size-1)) + goto Espan; + } + if (offs != limit) + goto Eend; +out: + SetPageChecked(page); + return; + + /* Too bad, we had an error */ + +Ebadsize: + nilfs_error(sb, "nilfs_check_page", + "size of directory #%lu is not a multiple of chunk size", + dir->i_ino + ); + goto fail; +Eshort: + error = "rec_len is smaller than minimal"; + goto bad_entry; +Ealign: + error = "unaligned directory entry"; + goto bad_entry; +Enamelen: + error = "rec_len is too small for name_len"; + goto bad_entry; +Espan: + error = "directory entry across blocks"; +bad_entry: + nilfs_error(sb, "nilfs_check_page", "bad entry in directory #%lu: %s - " + "offset=%lu, inode=%lu, rec_len=%d, name_len=%d", + dir->i_ino, error, (page->index<<PAGE_CACHE_SHIFT)+offs, + (unsigned long) le64_to_cpu(p->inode), + rec_len, p->name_len); + goto fail; +Eend: + p = (struct nilfs_dir_entry *)(kaddr + offs); + nilfs_error(sb, "nilfs_check_page", + "entry in directory #%lu spans the page boundary" + "offset=%lu, inode=%lu", + dir->i_ino, (page->index<<PAGE_CACHE_SHIFT)+offs, + (unsigned long) le64_to_cpu(p->inode)); +fail: + SetPageChecked(page); + SetPageError(page); +} + +static struct page *nilfs_get_page(struct inode *dir, unsigned long n) +{ + struct address_space *mapping = dir->i_mapping; + struct page *page = read_cache_page(mapping, n, + (filler_t *)mapping->a_ops->readpage, NULL); + if (!IS_ERR(page)) { + wait_on_page_locked(page); + kmap(page); + if (!PageUptodate(page)) + goto fail; + if (!PageChecked(page)) + nilfs_check_page(page); + if (PageError(page)) + goto fail; + } + return page; + +fail: + nilfs_put_page(page); + return ERR_PTR(-EIO); +} + +/* + * NOTE! unlike strncmp, nilfs_match returns 1 for success, 0 for failure. + * + * len <= NILFS_NAME_LEN and de != NULL are guaranteed by caller. + */ +static int +nilfs_match(int len, const char * const name, struct nilfs_dir_entry *de) +{ + if (len != de->name_len) + return 0; + if (!de->inode) + return 0; + return !memcmp(name, de->name, len); +} + +/* + * p is at least 6 bytes before the end of page + */ +static struct nilfs_dir_entry *nilfs_next_entry(struct nilfs_dir_entry *p) +{ + return (struct nilfs_dir_entry *)((char *)p + le16_to_cpu(p->rec_len)); +} + +static unsigned char +nilfs_filetype_table[NILFS_FT_MAX] = { + [NILFS_FT_UNKNOWN] = DT_UNKNOWN, + [NILFS_FT_REG_FILE] = DT_REG, + [NILFS_FT_DIR] = DT_DIR, + [NILFS_FT_CHRDEV] = DT_CHR, + [NILFS_FT_BLKDEV] = DT_BLK, + [NILFS_FT_FIFO] = DT_FIFO, + [NILFS_FT_SOCK] = DT_SOCK, + [NILFS_FT_SYMLINK] = DT_LNK, +}; + +#define S_SHIFT 12 +static unsigned char +nilfs_type_by_mode[S_IFMT >> S_SHIFT] = { + [S_IFREG >> S_SHIFT] = NILFS_FT_REG_FILE, + [S_IFDIR >> S_SHIFT] = NILFS_FT_DIR, + [S_IFCHR >> S_SHIFT] = NILFS_FT_CHRDEV, + [S_IFBLK >> S_SHIFT] = NILFS_FT_BLKDEV, + [S_IFIFO >> S_SHIFT] = NILFS_FT_FIFO, + [S_IFSOCK >> S_SHIFT] = NILFS_FT_SOCK, + [S_IFLNK >> S_SHIFT] = NILFS_FT_SYMLINK, +}; + +static void nilfs_set_de_type(struct nilfs_dir_entry *de, struct inode *inode) +{ + mode_t mode = inode->i_mode; + + de->file_type = nilfs_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; +} + +static int nilfs_readdir(struct file *filp, void *dirent, filldir_t filldir) +{ + loff_t pos = filp->f_pos; + struct inode *inode = filp->f_dentry->d_inode; + struct super_block *sb = inode->i_sb; + unsigned int offset = pos & ~PAGE_CACHE_MASK; + unsigned long n = pos >> PAGE_CACHE_SHIFT; + unsigned long npages = dir_pages(inode); +/* unsigned chunk_mask = ~(nilfs_chunk_size(inode)-1); */ + unsigned char *types = NULL; + int ret; + + if (pos > inode->i_size - NILFS_DIR_REC_LEN(1)) + goto success; + + types = nilfs_filetype_table; + + for ( ; n < npages; n++, offset = 0) { + char *kaddr, *limit; + struct nilfs_dir_entry *de; + struct page *page = nilfs_get_page(inode, n); + + if (IS_ERR(page)) { + nilfs_error(sb, __func__, "bad page in #%lu", + inode->i_ino); + filp->f_pos += PAGE_CACHE_SIZE - offset; + ret = -EIO; + goto done; + } + kaddr = page_address(page); + de = (struct nilfs_dir_entry *)(kaddr + offset); + limit = kaddr + nilfs_last_byte(inode, n) - + NILFS_DIR_REC_LEN(1); + for ( ; (char *)de <= limit; de = nilfs_next_entry(de)) { + if (de->rec_len == 0) { + nilfs_error(sb, __func__, + "zero-length directory entry"); + ret = -EIO; + nilfs_put_page(page); + goto done; + } + if (de->inode) { + int over; + unsigned char d_type = DT_UNKNOWN; + + if (types && de->file_type < NILFS_FT_MAX) + d_type = types[de->file_type]; + + offset = (char *)de - kaddr; + over = filldir(dirent, de->name, de->name_len, + (n<<PAGE_CACHE_SHIFT) | offset, + le64_to_cpu(de->inode), d_type); + if (over) { + nilfs_put_page(page); + goto success; + } + } + filp->f_pos += le16_to_cpu(de->rec_len); + } + nilfs_put_page(page); + } + +success: + ret = 0; +done: + return ret; +} + +/* + * nilfs_find_entry() + * + * finds an entry in the specified directory with the wanted name. It + * returns the page in which the entry was found, and the entry itself + * (as a parameter - res_dir). Page is returned mapped and unlocked. + * Entry is guaranteed to be valid. + */ +struct nilfs_dir_entry * +nilfs_find_entry(struct inode *dir, struct dentry *dentry, + struct page **res_page) +{ + const char *name = dentry->d_name.name; + int namelen = dentry->d_name.len; + unsigned reclen = NILFS_DIR_REC_LEN(namelen); + unsigned long start, n; + unsigned long npages = dir_pages(dir); + struct page *page = NULL; + struct nilfs_inode_info *ei = NILFS_I(dir); + struct nilfs_dir_entry *de; + + if (npages == 0) + goto out; + + /* OFFSET_CACHE */ + *res_page = NULL; + + start = ei->i_dir_start_lookup; + if (start >= npages) + start = 0; + n = start; + do { + char *kaddr; + page = nilfs_get_page(dir, n); + if (!IS_ERR(page)) { + kaddr = page_address(page); + de = (struct nilfs_dir_entry *)kaddr; + kaddr += nilfs_last_byte(dir, n) - reclen; + while ((char *) de <= kaddr) { + if (de->rec_len == 0) { + nilfs_error(dir->i_sb, __func__, + "zero-length directory entry"); + nilfs_put_page(page); + goto out; + } + if (nilfs_match(namelen, name, de)) + goto found; + de = nilfs_next_entry(de); + } + nilfs_put_page(page); + } + if (++n >= npages) + n = 0; + /* next page is past the blocks we've got */ + if (unlikely(n > (dir->i_blocks >> (PAGE_CACHE_SHIFT - 9)))) { + nilfs_error(dir->i_sb, __func__, + "dir %lu size %lld exceeds block cout %llu", + dir->i_ino, dir->i_size, + (unsigned long long)dir->i_blocks); + goto out; + } + } while (n != start); +out: + return NULL; + +found: + *res_page = page; + ei->i_dir_start_lookup = n; + return de; +} + +struct nilfs_dir_entry *nilfs_dotdot(struct inode *dir, struct page **p) +{ + struct page *page = nilfs_get_page(dir, 0); + struct nilfs_dir_entry *de = NULL; + + if (!IS_ERR(page)) { + de = nilfs_next_entry( + (struct nilfs_dir_entry *)page_address(page)); + *p = page; + } + return de; +} + +ino_t nilfs_inode_by_name(struct inode *dir, struct dentry *dentry) +{ + ino_t res = 0; + struct nilfs_dir_entry *de; + struct page *page; + + de = nilfs_find_entry(dir, dentry, &page); + if (de) { + res = le64_to_cpu(de->inode); + kunmap(page); + page_cache_release(page); + } + return res; +} + +/* Releases the page */ +void nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de, + struct page *page, struct inode *inode) +{ + unsigned from = (char *) de - (char *) page_address(page); + unsigned to = from + le16_to_cpu(de->rec_len); + struct address_space *mapping = page->mapping; + int err; + + lock_page(page); + err = nilfs_prepare_chunk_uninterruptible(page, mapping, from, to); + BUG_ON(err); + de->inode = cpu_to_le64(inode->i_ino); + nilfs_set_de_type(de, inode); + err = nilfs_commit_chunk(page, mapping, from, to); + nilfs_put_page(page); + dir->i_mtime = dir->i_ctime = CURRENT_TIME; +/* NILFS_I(dir)->i_flags &= ~NILFS_BTREE_FL; */ + mark_inode_dirty(dir); +} + +/* + * Parent is locked. + */ +int nilfs_add_link(struct dentry *dentry, struct inode *inode) +{ + struct inode *dir = dentry->d_parent->d_inode; + const char *name = dentry->d_name.name; + int namelen = dentry->d_name.len; + unsigned chunk_size = nilfs_chunk_size(dir); + unsigned reclen = NILFS_DIR_REC_LEN(namelen); + unsigned short rec_len, name_len; + struct page *page = NULL; + struct nilfs_dir_entry *de; + unsigned long npages = dir_pages(dir); + unsigned long n; + char *kaddr; + unsigned from, to; + int err; + + /* + * We take care of directory expansion in the same loop. + * This code plays outside i_size, so it locks the page + * to protect that region. + */ + for (n = 0; n <= npages; n++) { + char *dir_end; + + page = nilfs_get_page(dir, n); + err = PTR_ERR(page); + if (IS_ERR(page)) + goto out; + lock_page(page); + kaddr = page_address(page); + dir_end = kaddr + nilfs_last_byte(dir, n); + de = (struct nilfs_dir_entry *)kaddr; + kaddr += PAGE_CACHE_SIZE - reclen; + while ((char *)de <= kaddr) { + if ((char *)de == dir_end) { + /* We hit i_size */ + name_len = 0; + rec_len = chunk_size; + de->rec_len = cpu_to_le16(chunk_size); + de->inode = 0; + goto got_it; + } + if (de->rec_len == 0) { + nilfs_error(dir->i_sb, __func__, + "zero-length directory entry"); + err = -EIO; + goto out_unlock; + } + err = -EEXIST; + if (nilfs_match(namelen, name, de)) + goto out_unlock; + name_len = NILFS_DIR_REC_LEN(de->name_len); + rec_len = le16_to_cpu(de->rec_len); + if (!de->inode && rec_len >= reclen) + goto got_it; + if (rec_len >= name_len + reclen) + goto got_it; + de = (struct nilfs_dir_entry *)((char *)de + rec_len); + } + unlock_page(page); + nilfs_put_page(page); + } + BUG(); + return -EINVAL; + +got_it: + from = (char *)de - (char *)page_address(page); + to = from + rec_len; + err = nilfs_prepare_chunk(page, page->mapping, from, to); + if (err) + goto out_unlock; + if (de->inode) { + struct nilfs_dir_entry *de1; + + de1 = (struct nilfs_dir_entry *)((char *)de + name_len); + de1->rec_len = cpu_to_le16(rec_len - name_len); + de->rec_len = cpu_to_le16(name_len); + de = de1; + } + de->name_len = namelen; + memcpy(de->name, name, namelen); + de->inode = cpu_to_le64(inode->i_ino); + nilfs_set_de_type(de, inode); + err = nilfs_commit_chunk(page, page->mapping, from, to); + dir->i_mtime = dir->i_ctime = CURRENT_TIME; +/* NILFS_I(dir)->i_flags &= ~NILFS_BTREE_FL; */ + mark_inode_dirty(dir); + /* OFFSET_CACHE */ +out_put: + nilfs_put_page(page); +out: + return err; +out_unlock: + unlock_page(page); + goto out_put; +} + +/* + * nilfs_delete_entry deletes a directory entry by merging it with the + * previous entry. Page is up-to-date. Releases the page. + */ +int nilfs_delete_entry(struct nilfs_dir_entry *dir, struct page *page) +{ + struct address_space *mapping = page->mapping; + struct inode *inode = mapping->host; + char *kaddr = page_address(page); + unsigned from = ((char *)dir - kaddr) & ~(nilfs_chunk_size(inode) - 1); + unsigned to = ((char *)dir - kaddr) + le16_to_cpu(dir->rec_len); + struct nilfs_dir_entry *pde = NULL; + struct nilfs_dir_entry *de = (struct nilfs_dir_entry *)(kaddr + from); + int err; + + while ((char *)de < (char *)dir) { + if (de->rec_len == 0) { + nilfs_error(inode->i_sb, __func__, + "zero-length directory entry"); + err = -EIO; + goto out; + } + pde = de; + de = nilfs_next_entry(de); + } + if (pde) + from = (char *)pde - (char *)page_address(page); + lock_page(page); + err = nilfs_prepare_chunk(page, mapping, from, to); + BUG_ON(err); + if (pde) + pde->rec_len = cpu_to_le16(to - from); + dir->inode = 0; + err = nilfs_commit_chunk(page, mapping, from, to); + inode->i_ctime = inode->i_mtime = CURRENT_TIME; +/* NILFS_I(inode)->i_flags &= ~NILFS_BTREE_FL; */ + mark_inode_dirty(inode); +out: + nilfs_put_page(page); + return err; +} + +/* + * Set the first fragment of directory. + */ +int nilfs_make_empty(struct inode *inode, struct inode *parent) +{ + struct address_space *mapping = inode->i_mapping; + struct page *page = grab_cache_page(mapping, 0); + unsigned chunk_size = nilfs_chunk_size(inode); + struct nilfs_dir_entry *de; + int err; + void *kaddr; + + if (!page) + return -ENOMEM; + + err = nilfs_prepare_chunk(page, mapping, 0, chunk_size); + if (unlikely(err)) { + unlock_page(page); + goto fail; + } + kaddr = kmap_atomic(page, KM_USER0); + memset(kaddr, 0, chunk_size); + de = (struct nilfs_dir_entry *)kaddr; + de->name_len = 1; + de->rec_len = cpu_to_le16(NILFS_DIR_REC_LEN(1)); + memcpy(de->name, ".\0\0", 4); + de->inode = cpu_to_le64(inode->i_ino); + nilfs_set_de_type(de, inode); + + de = (struct nilfs_dir_entry *)(kaddr + NILFS_DIR_REC_LEN(1)); + de->name_len = 2; + de->rec_len = cpu_to_le16(chunk_size - NILFS_DIR_REC_LEN(1)); + de->inode = cpu_to_le64(parent->i_ino); + memcpy(de->name, "..\0", 4); + nilfs_set_de_type(de, inode); + kunmap_atomic(kaddr, KM_USER0); + err = nilfs_commit_chunk(page, mapping, 0, chunk_size); +fail: + page_cache_release(page); + return err; +} + +/* + * routine to check that the specified directory is empty (for rmdir) + */ +int nilfs_empty_dir(struct inode *inode) +{ + struct page *page = NULL; + unsigned long i, npages = dir_pages(inode); + + for (i = 0; i < npages; i++) { + char *kaddr; + struct nilfs_dir_entry *de; + + page = nilfs_get_page(inode, i); + if (IS_ERR(page)) + continue; + + kaddr = page_address(page); + de = (struct nilfs_dir_entry *)kaddr; + kaddr += nilfs_last_byte(inode, i) - NILFS_DIR_REC_LEN(1); + + while ((char *)de <= kaddr) { + if (de->rec_len == 0) { + nilfs_error(inode->i_sb, __func__, + "zero-length directory entry " + "(kaddr=%p, de=%p)\n", kaddr, de); + goto not_empty; + } + if (de->inode != 0) { + /* check for . and .. */ + if (de->name[0] != '.') + goto not_empty; + if (de->name_len > 2) + goto not_empty; + if (de->name_len < 2) { + if (de->inode != + cpu_to_le64(inode->i_ino)) + goto not_empty; + } else if (de->name[1] != '.') + goto not_empty; + } + de = nilfs_next_entry(de); + } + nilfs_put_page(page); + } + return 1; + +not_empty: + nilfs_put_page(page); + return 0; +} + +struct file_operations nilfs_dir_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .readdir = nilfs_readdir, + .unlocked_ioctl = nilfs_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = nilfs_ioctl, +#endif /* CONFIG_COMPAT */ + .fsync = nilfs_sync_file, + +}; diff --git a/fs/nilfs2/direct.c b/fs/nilfs2/direct.c new file mode 100644 index 000000000000..c6379e482781 --- /dev/null +++ b/fs/nilfs2/direct.c @@ -0,0 +1,436 @@ +/* + * direct.c - NILFS direct block pointer. + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato <koji@osrg.net>. + */ + +#include <linux/errno.h> +#include "nilfs.h" +#include "page.h" +#include "direct.h" +#include "alloc.h" + +static inline __le64 *nilfs_direct_dptrs(const struct nilfs_direct *direct) +{ + return (__le64 *) + ((struct nilfs_direct_node *)direct->d_bmap.b_u.u_data + 1); +} + +static inline __u64 +nilfs_direct_get_ptr(const struct nilfs_direct *direct, __u64 key) +{ + return nilfs_bmap_dptr_to_ptr(*(nilfs_direct_dptrs(direct) + key)); +} + +static inline void nilfs_direct_set_ptr(struct nilfs_direct *direct, + __u64 key, __u64 ptr) +{ + *(nilfs_direct_dptrs(direct) + key) = nilfs_bmap_ptr_to_dptr(ptr); +} + +static int nilfs_direct_lookup(const struct nilfs_bmap *bmap, + __u64 key, int level, __u64 *ptrp) +{ + struct nilfs_direct *direct; + __u64 ptr; + + direct = (struct nilfs_direct *)bmap; + if ((key > NILFS_DIRECT_KEY_MAX) || + (level != 1) || /* XXX: use macro for level 1 */ + ((ptr = nilfs_direct_get_ptr(direct, key)) == + NILFS_BMAP_INVALID_PTR)) + return -ENOENT; + + if (ptrp != NULL) + *ptrp = ptr; + return 0; +} + +static __u64 +nilfs_direct_find_target_v(const struct nilfs_direct *direct, __u64 key) +{ + __u64 ptr; + + ptr = nilfs_bmap_find_target_seq(&direct->d_bmap, key); + if (ptr != NILFS_BMAP_INVALID_PTR) + /* sequential access */ + return ptr; + else + /* block group */ + return nilfs_bmap_find_target_in_group(&direct->d_bmap); +} + +static void nilfs_direct_set_target_v(struct nilfs_direct *direct, + __u64 key, __u64 ptr) +{ + direct->d_bmap.b_last_allocated_key = key; + direct->d_bmap.b_last_allocated_ptr = ptr; +} + +static int nilfs_direct_prepare_insert(struct nilfs_direct *direct, + __u64 key, + union nilfs_bmap_ptr_req *req, + struct nilfs_bmap_stats *stats) +{ + int ret; + + if (direct->d_ops->dop_find_target != NULL) + req->bpr_ptr = direct->d_ops->dop_find_target(direct, key); + ret = direct->d_bmap.b_pops->bpop_prepare_alloc_ptr(&direct->d_bmap, + req); + if (ret < 0) + return ret; + + stats->bs_nblocks = 1; + return 0; +} + +static void nilfs_direct_commit_insert(struct nilfs_direct *direct, + union nilfs_bmap_ptr_req *req, + __u64 key, __u64 ptr) +{ + struct buffer_head *bh; + + /* ptr must be a pointer to a buffer head. */ + bh = (struct buffer_head *)((unsigned long)ptr); + set_buffer_nilfs_volatile(bh); + + if (direct->d_bmap.b_pops->bpop_commit_alloc_ptr != NULL) + direct->d_bmap.b_pops->bpop_commit_alloc_ptr( + &direct->d_bmap, req); + nilfs_direct_set_ptr(direct, key, req->bpr_ptr); + + if (!nilfs_bmap_dirty(&direct->d_bmap)) + nilfs_bmap_set_dirty(&direct->d_bmap); + + if (direct->d_ops->dop_set_target != NULL) + direct->d_ops->dop_set_target(direct, key, req->bpr_ptr); +} + +static int nilfs_direct_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr) +{ + struct nilfs_direct *direct; + union nilfs_bmap_ptr_req req; + struct nilfs_bmap_stats stats; + int ret; + + direct = (struct nilfs_direct *)bmap; + if (key > NILFS_DIRECT_KEY_MAX) + return -ENOENT; + if (nilfs_direct_get_ptr(direct, key) != NILFS_BMAP_INVALID_PTR) + return -EEXIST; + + ret = nilfs_direct_prepare_insert(direct, key, &req, &stats); + if (ret < 0) + return ret; + nilfs_direct_commit_insert(direct, &req, key, ptr); + nilfs_bmap_add_blocks(bmap, stats.bs_nblocks); + + return 0; +} + +static int nilfs_direct_prepare_delete(struct nilfs_direct *direct, + union nilfs_bmap_ptr_req *req, + __u64 key, + struct nilfs_bmap_stats *stats) +{ + int ret; + + if (direct->d_bmap.b_pops->bpop_prepare_end_ptr != NULL) { + req->bpr_ptr = nilfs_direct_get_ptr(direct, key); + ret = direct->d_bmap.b_pops->bpop_prepare_end_ptr( + &direct->d_bmap, req); + if (ret < 0) + return ret; + } + + stats->bs_nblocks = 1; + return 0; +} + +static void nilfs_direct_commit_delete(struct nilfs_direct *direct, + union nilfs_bmap_ptr_req *req, + __u64 key) +{ + if (direct->d_bmap.b_pops->bpop_commit_end_ptr != NULL) + direct->d_bmap.b_pops->bpop_commit_end_ptr( + &direct->d_bmap, req); + nilfs_direct_set_ptr(direct, key, NILFS_BMAP_INVALID_PTR); +} + +static int nilfs_direct_delete(struct nilfs_bmap *bmap, __u64 key) +{ + struct nilfs_direct *direct; + union nilfs_bmap_ptr_req req; + struct nilfs_bmap_stats stats; + int ret; + + direct = (struct nilfs_direct *)bmap; + if ((key > NILFS_DIRECT_KEY_MAX) || + nilfs_direct_get_ptr(direct, key) == NILFS_BMAP_INVALID_PTR) + return -ENOENT; + + ret = nilfs_direct_prepare_delete(direct, &req, key, &stats); + if (ret < 0) + return ret; + nilfs_direct_commit_delete(direct, &req, key); + nilfs_bmap_sub_blocks(bmap, stats.bs_nblocks); + + return 0; +} + +static int nilfs_direct_last_key(const struct nilfs_bmap *bmap, __u64 *keyp) +{ + struct nilfs_direct *direct; + __u64 key, lastkey; + + direct = (struct nilfs_direct *)bmap; + lastkey = NILFS_DIRECT_KEY_MAX + 1; + for (key = NILFS_DIRECT_KEY_MIN; key <= NILFS_DIRECT_KEY_MAX; key++) + if (nilfs_direct_get_ptr(direct, key) != + NILFS_BMAP_INVALID_PTR) + lastkey = key; + + if (lastkey == NILFS_DIRECT_KEY_MAX + 1) + return -ENOENT; + + *keyp = lastkey; + + return 0; +} + +static int nilfs_direct_check_insert(const struct nilfs_bmap *bmap, __u64 key) +{ + return key > NILFS_DIRECT_KEY_MAX; +} + +static int nilfs_direct_gather_data(struct nilfs_bmap *bmap, + __u64 *keys, __u64 *ptrs, int nitems) +{ + struct nilfs_direct *direct; + __u64 key; + __u64 ptr; + int n; + + direct = (struct nilfs_direct *)bmap; + if (nitems > NILFS_DIRECT_NBLOCKS) + nitems = NILFS_DIRECT_NBLOCKS; + n = 0; + for (key = 0; key < nitems; key++) { + ptr = nilfs_direct_get_ptr(direct, key); + if (ptr != NILFS_BMAP_INVALID_PTR) { + keys[n] = key; + ptrs[n] = ptr; + n++; + } + } + return n; +} + +int nilfs_direct_delete_and_convert(struct nilfs_bmap *bmap, + __u64 key, __u64 *keys, __u64 *ptrs, + int n, __u64 low, __u64 high) +{ + struct nilfs_direct *direct; + __le64 *dptrs; + int ret, i, j; + + /* no need to allocate any resource for conversion */ + + /* delete */ + ret = bmap->b_ops->bop_delete(bmap, key); + if (ret < 0) + return ret; + + /* free resources */ + if (bmap->b_ops->bop_clear != NULL) + bmap->b_ops->bop_clear(bmap); + + /* convert */ + direct = (struct nilfs_direct *)bmap; + dptrs = nilfs_direct_dptrs(direct); + for (i = 0, j = 0; i < NILFS_DIRECT_NBLOCKS; i++) { + if ((j < n) && (i == keys[j])) { + dptrs[i] = (i != key) ? + nilfs_bmap_ptr_to_dptr(ptrs[j]) : + NILFS_BMAP_INVALID_PTR; + j++; + } else + dptrs[i] = NILFS_BMAP_INVALID_PTR; + } + + nilfs_direct_init(bmap, low, high); + + return 0; +} + +static int nilfs_direct_propagate_v(struct nilfs_direct *direct, + struct buffer_head *bh) +{ + union nilfs_bmap_ptr_req oldreq, newreq; + __u64 key; + __u64 ptr; + int ret; + + key = nilfs_bmap_data_get_key(&direct->d_bmap, bh); + ptr = nilfs_direct_get_ptr(direct, key); + if (!buffer_nilfs_volatile(bh)) { + oldreq.bpr_ptr = ptr; + newreq.bpr_ptr = ptr; + ret = nilfs_bmap_prepare_update(&direct->d_bmap, &oldreq, + &newreq); + if (ret < 0) + return ret; + nilfs_bmap_commit_update(&direct->d_bmap, &oldreq, &newreq); + set_buffer_nilfs_volatile(bh); + nilfs_direct_set_ptr(direct, key, newreq.bpr_ptr); + } else + ret = nilfs_bmap_mark_dirty(&direct->d_bmap, ptr); + + return ret; +} + +static int nilfs_direct_propagate(const struct nilfs_bmap *bmap, + struct buffer_head *bh) +{ + struct nilfs_direct *direct; + + direct = (struct nilfs_direct *)bmap; + return (direct->d_ops->dop_propagate != NULL) ? + direct->d_ops->dop_propagate(direct, bh) : + 0; +} + +static int nilfs_direct_assign_v(struct nilfs_direct *direct, + __u64 key, __u64 ptr, + struct buffer_head **bh, + sector_t blocknr, + union nilfs_binfo *binfo) +{ + union nilfs_bmap_ptr_req req; + int ret; + + req.bpr_ptr = ptr; + ret = direct->d_bmap.b_pops->bpop_prepare_start_ptr( + &direct->d_bmap, &req); + if (ret < 0) + return ret; + direct->d_bmap.b_pops->bpop_commit_start_ptr(&direct->d_bmap, + &req, blocknr); + + binfo->bi_v.bi_vblocknr = nilfs_bmap_ptr_to_dptr(ptr); + binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key); + + return 0; +} + +static int nilfs_direct_assign_p(struct nilfs_direct *direct, + __u64 key, __u64 ptr, + struct buffer_head **bh, + sector_t blocknr, + union nilfs_binfo *binfo) +{ + nilfs_direct_set_ptr(direct, key, blocknr); + + binfo->bi_dat.bi_blkoff = nilfs_bmap_key_to_dkey(key); + binfo->bi_dat.bi_level = 0; + + return 0; +} + +static int nilfs_direct_assign(struct nilfs_bmap *bmap, + struct buffer_head **bh, + sector_t blocknr, + union nilfs_binfo *binfo) +{ + struct nilfs_direct *direct; + __u64 key; + __u64 ptr; + + direct = (struct nilfs_direct *)bmap; + key = nilfs_bmap_data_get_key(bmap, *bh); + if (unlikely(key > NILFS_DIRECT_KEY_MAX)) { + printk(KERN_CRIT "%s: invalid key: %llu\n", __func__, + (unsigned long long)key); + return -EINVAL; + } + ptr = nilfs_direct_get_ptr(direct, key); + if (unlikely(ptr == NILFS_BMAP_INVALID_PTR)) { + printk(KERN_CRIT "%s: invalid pointer: %llu\n", __func__, + (unsigned long long)ptr); + return -EINVAL; + } + + return direct->d_ops->dop_assign(direct, key, ptr, bh, + blocknr, binfo); +} + +static const struct nilfs_bmap_operations nilfs_direct_ops = { + .bop_lookup = nilfs_direct_lookup, + .bop_insert = nilfs_direct_insert, + .bop_delete = nilfs_direct_delete, + .bop_clear = NULL, + + .bop_propagate = nilfs_direct_propagate, + + .bop_lookup_dirty_buffers = NULL, + + .bop_assign = nilfs_direct_assign, + .bop_mark = NULL, + + .bop_last_key = nilfs_direct_last_key, + .bop_check_insert = nilfs_direct_check_insert, + .bop_check_delete = NULL, + .bop_gather_data = nilfs_direct_gather_data, +}; + + +static const struct nilfs_direct_operations nilfs_direct_ops_v = { + .dop_find_target = nilfs_direct_find_target_v, + .dop_set_target = nilfs_direct_set_target_v, + .dop_propagate = nilfs_direct_propagate_v, + .dop_assign = nilfs_direct_assign_v, +}; + +static const struct nilfs_direct_operations nilfs_direct_ops_p = { + .dop_find_target = NULL, + .dop_set_target = NULL, + .dop_propagate = NULL, + .dop_assign = nilfs_direct_assign_p, +}; + +int nilfs_direct_init(struct nilfs_bmap *bmap, __u64 low, __u64 high) +{ + struct nilfs_direct *direct; + + direct = (struct nilfs_direct *)bmap; + bmap->b_ops = &nilfs_direct_ops; + bmap->b_low = low; + bmap->b_high = high; + switch (bmap->b_inode->i_ino) { + case NILFS_DAT_INO: + direct->d_ops = &nilfs_direct_ops_p; + break; + default: + direct->d_ops = &nilfs_direct_ops_v; + break; + } + + return 0; +} diff --git a/fs/nilfs2/direct.h b/fs/nilfs2/direct.h new file mode 100644 index 000000000000..45d2c5cda812 --- /dev/null +++ b/fs/nilfs2/direct.h @@ -0,0 +1,78 @@ +/* + * direct.h - NILFS direct block pointer. + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato <koji@osrg.net>. + */ + +#ifndef _NILFS_DIRECT_H +#define _NILFS_DIRECT_H + +#include <linux/types.h> +#include <linux/buffer_head.h> +#include "bmap.h" + + +struct nilfs_direct; + +/** + * struct nilfs_direct_operations - direct mapping operation table + */ +struct nilfs_direct_operations { + __u64 (*dop_find_target)(const struct nilfs_direct *, __u64); + void (*dop_set_target)(struct nilfs_direct *, __u64, __u64); + int (*dop_propagate)(struct nilfs_direct *, struct buffer_head *); + int (*dop_assign)(struct nilfs_direct *, __u64, __u64, + struct buffer_head **, sector_t, + union nilfs_binfo *); +}; + +/** + * struct nilfs_direct_node - direct node + * @dn_flags: flags + * @dn_pad: padding + */ +struct nilfs_direct_node { + __u8 dn_flags; + __u8 pad[7]; +}; + +/** + * struct nilfs_direct - direct mapping + * @d_bmap: bmap structure + * @d_ops: direct mapping operation table + */ +struct nilfs_direct { + struct nilfs_bmap d_bmap; + + /* direct-mapping-specific members */ + const struct nilfs_direct_operations *d_ops; +}; + + +#define NILFS_DIRECT_NBLOCKS (NILFS_BMAP_SIZE / sizeof(__le64) - 1) +#define NILFS_DIRECT_KEY_MIN 0 +#define NILFS_DIRECT_KEY_MAX (NILFS_DIRECT_NBLOCKS - 1) + + +int nilfs_direct_init(struct nilfs_bmap *, __u64, __u64); +int nilfs_direct_delete_and_convert(struct nilfs_bmap *, __u64, __u64 *, + __u64 *, int, __u64, __u64); + + +#endif /* _NILFS_DIRECT_H */ diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c new file mode 100644 index 000000000000..6bd84a0d8238 --- /dev/null +++ b/fs/nilfs2/file.c @@ -0,0 +1,160 @@ +/* + * file.c - NILFS regular file handling primitives including fsync(). + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Amagai Yoshiji <amagai@osrg.net>, + * Ryusuke Konishi <ryusuke@osrg.net> + */ + +#include <linux/fs.h> +#include <linux/mm.h> +#include <linux/writeback.h> +#include "nilfs.h" +#include "segment.h" + +int nilfs_sync_file(struct file *file, struct dentry *dentry, int datasync) +{ + /* + * Called from fsync() system call + * This is the only entry point that can catch write and synch + * timing for both data blocks and intermediate blocks. + * + * This function should be implemented when the writeback function + * will be implemented. + */ + struct inode *inode = dentry->d_inode; + int err; + + if (!nilfs_inode_dirty(inode)) + return 0; + + if (datasync) + err = nilfs_construct_dsync_segment(inode->i_sb, inode, 0, + LLONG_MAX); + else + err = nilfs_construct_segment(inode->i_sb); + + return err; +} + +static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct page *page = vmf->page; + struct inode *inode = vma->vm_file->f_dentry->d_inode; + struct nilfs_transaction_info ti; + int ret; + + if (unlikely(nilfs_near_disk_full(NILFS_SB(inode->i_sb)->s_nilfs))) + return VM_FAULT_SIGBUS; /* -ENOSPC */ + + lock_page(page); + if (page->mapping != inode->i_mapping || + page_offset(page) >= i_size_read(inode) || !PageUptodate(page)) { + unlock_page(page); + return VM_FAULT_NOPAGE; /* make the VM retry the fault */ + } + + /* + * check to see if the page is mapped already (no holes) + */ + if (PageMappedToDisk(page)) { + unlock_page(page); + goto mapped; + } + if (page_has_buffers(page)) { + struct buffer_head *bh, *head; + int fully_mapped = 1; + + bh = head = page_buffers(page); + do { + if (!buffer_mapped(bh)) { + fully_mapped = 0; + break; + } + } while (bh = bh->b_this_page, bh != head); + + if (fully_mapped) { + SetPageMappedToDisk(page); + unlock_page(page); + goto mapped; + } + } + unlock_page(page); + + /* + * fill hole blocks + */ + ret = nilfs_transaction_begin(inode->i_sb, &ti, 1); + /* never returns -ENOMEM, but may return -ENOSPC */ + if (unlikely(ret)) + return VM_FAULT_SIGBUS; + + ret = block_page_mkwrite(vma, vmf, nilfs_get_block); + if (unlikely(ret)) { + nilfs_transaction_abort(inode->i_sb); + return ret; + } + nilfs_transaction_commit(inode->i_sb); + + mapped: + SetPageChecked(page); + wait_on_page_writeback(page); + return 0; +} + +struct vm_operations_struct nilfs_file_vm_ops = { + .fault = filemap_fault, + .page_mkwrite = nilfs_page_mkwrite, +}; + +static int nilfs_file_mmap(struct file *file, struct vm_area_struct *vma) +{ + file_accessed(file); + vma->vm_ops = &nilfs_file_vm_ops; + vma->vm_flags |= VM_CAN_NONLINEAR; + return 0; +} + +/* + * We have mostly NULL's here: the current defaults are ok for + * the nilfs filesystem. + */ +struct file_operations nilfs_file_operations = { + .llseek = generic_file_llseek, + .read = do_sync_read, + .write = do_sync_write, + .aio_read = generic_file_aio_read, + .aio_write = generic_file_aio_write, + .unlocked_ioctl = nilfs_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = nilfs_ioctl, +#endif /* CONFIG_COMPAT */ + .mmap = nilfs_file_mmap, + .open = generic_file_open, + /* .release = nilfs_release_file, */ + .fsync = nilfs_sync_file, + .splice_read = generic_file_splice_read, +}; + +struct inode_operations nilfs_file_inode_operations = { + .truncate = nilfs_truncate, + .setattr = nilfs_setattr, + .permission = nilfs_permission, +}; + +/* end of file */ diff --git a/fs/nilfs2/gcdat.c b/fs/nilfs2/gcdat.c new file mode 100644 index 000000000000..93383c5cee90 --- /dev/null +++ b/fs/nilfs2/gcdat.c @@ -0,0 +1,84 @@ +/* + * gcdat.c - NILFS shadow DAT inode for GC + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Seiji Kihara <kihara@osrg.net>, Amagai Yoshiji <amagai@osrg.net>, + * and Ryusuke Konishi <ryusuke@osrg.net>. + * + */ + +#include <linux/buffer_head.h> +#include "nilfs.h" +#include "page.h" +#include "mdt.h" + +int nilfs_init_gcdat_inode(struct the_nilfs *nilfs) +{ + struct inode *dat = nilfs->ns_dat, *gcdat = nilfs->ns_gc_dat; + struct nilfs_inode_info *dii = NILFS_I(dat), *gii = NILFS_I(gcdat); + int err; + + gcdat->i_state = 0; + gcdat->i_blocks = dat->i_blocks; + gii->i_flags = dii->i_flags; + gii->i_state = dii->i_state | (1 << NILFS_I_GCDAT); + gii->i_cno = 0; + nilfs_bmap_init_gcdat(gii->i_bmap, dii->i_bmap); + err = nilfs_copy_dirty_pages(gcdat->i_mapping, dat->i_mapping); + if (unlikely(err)) + return err; + + return nilfs_copy_dirty_pages(&gii->i_btnode_cache, + &dii->i_btnode_cache); +} + +void nilfs_commit_gcdat_inode(struct the_nilfs *nilfs) +{ + struct inode *dat = nilfs->ns_dat, *gcdat = nilfs->ns_gc_dat; + struct nilfs_inode_info *dii = NILFS_I(dat), *gii = NILFS_I(gcdat); + struct address_space *mapping = dat->i_mapping; + struct address_space *gmapping = gcdat->i_mapping; + + down_write(&NILFS_MDT(dat)->mi_sem); + dat->i_blocks = gcdat->i_blocks; + dii->i_flags = gii->i_flags; + dii->i_state = gii->i_state & ~(1 << NILFS_I_GCDAT); + + nilfs_bmap_commit_gcdat(gii->i_bmap, dii->i_bmap); + + nilfs_clear_dirty_pages(mapping); + nilfs_copy_back_pages(mapping, gmapping); + /* note: mdt dirty flags should be cleared by segctor. */ + + nilfs_clear_dirty_pages(&dii->i_btnode_cache); + nilfs_copy_back_pages(&dii->i_btnode_cache, &gii->i_btnode_cache); + + up_write(&NILFS_MDT(dat)->mi_sem); +} + +void nilfs_clear_gcdat_inode(struct the_nilfs *nilfs) +{ + struct inode *gcdat = nilfs->ns_gc_dat; + struct nilfs_inode_info *gii = NILFS_I(gcdat); + + gcdat->i_state = I_CLEAR; + gii->i_flags = 0; + + truncate_inode_pages(gcdat->i_mapping, 0); + truncate_inode_pages(&gii->i_btnode_cache, 0); +} diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c new file mode 100644 index 000000000000..19d2102b6a69 --- /dev/null +++ b/fs/nilfs2/gcinode.c @@ -0,0 +1,288 @@ +/* + * gcinode.c - dummy inodes to buffer blocks for garbage collection + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Seiji Kihara <kihara@osrg.net>, Amagai Yoshiji <amagai@osrg.net>, + * and Ryusuke Konishi <ryusuke@osrg.net>. + * Revised by Ryusuke Konishi <ryusuke@osrg.net>. + * + */ +/* + * This file adds the cache of on-disk blocks to be moved in garbage + * collection. The disk blocks are held with dummy inodes (called + * gcinodes), and this file provides lookup function of the dummy + * inodes and their buffer read function. + * + * Since NILFS2 keeps up multiple checkpoints/snapshots accross GC, it + * has to treat blocks that belong to a same file but have different + * checkpoint numbers. To avoid interference among generations, dummy + * inodes are managed separatly from actual inodes, and their lookup + * function (nilfs_gc_iget) is designed to be specified with a + * checkpoint number argument as well as an inode number. + * + * Buffers and pages held by the dummy inodes will be released each + * time after they are copied to a new log. Dirty blocks made on the + * current generation and the blocks to be moved by GC never overlap + * because the dirty blocks make a new generation; they rather must be + * written individually. + */ + +#include <linux/buffer_head.h> +#include <linux/mpage.h> +#include <linux/hash.h> +#include <linux/swap.h> +#include "nilfs.h" +#include "page.h" +#include "mdt.h" +#include "dat.h" +#include "ifile.h" + +static struct address_space_operations def_gcinode_aops = {}; +/* XXX need def_gcinode_iops/fops? */ + +/* + * nilfs_gccache_submit_read_data() - add data buffer and submit read request + * @inode - gc inode + * @blkoff - dummy offset treated as the key for the page cache + * @pbn - physical block number of the block + * @vbn - virtual block number of the block, 0 for non-virtual block + * @out_bh - indirect pointer to a buffer_head struct to receive the results + * + * Description: nilfs_gccache_submit_read_data() registers the data buffer + * specified by @pbn to the GC pagecache with the key @blkoff. + * This function sets @vbn (@pbn if @vbn is zero) in b_blocknr of the buffer. + * + * Return Value: On success, 0 is returned. On Error, one of the following + * negative error code is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOENT - The block specified with @pbn does not exist. + */ +int nilfs_gccache_submit_read_data(struct inode *inode, sector_t blkoff, + sector_t pbn, __u64 vbn, + struct buffer_head **out_bh) +{ + struct buffer_head *bh; + int err; + + bh = nilfs_grab_buffer(inode, inode->i_mapping, blkoff, 0); + if (unlikely(!bh)) + return -ENOMEM; + + if (buffer_uptodate(bh)) + goto out; + + if (pbn == 0) { + struct inode *dat_inode = NILFS_I_NILFS(inode)->ns_dat; + /* use original dat, not gc dat. */ + err = nilfs_dat_translate(dat_inode, vbn, &pbn); + if (unlikely(err)) { /* -EIO, -ENOMEM, -ENOENT */ + brelse(bh); + goto failed; + } + } + + lock_buffer(bh); + if (buffer_uptodate(bh)) { + unlock_buffer(bh); + goto out; + } + + if (!buffer_mapped(bh)) { + bh->b_bdev = NILFS_I_NILFS(inode)->ns_bdev; + set_buffer_mapped(bh); + } + bh->b_blocknr = pbn; + bh->b_end_io = end_buffer_read_sync; + get_bh(bh); + submit_bh(READ, bh); + if (vbn) + bh->b_blocknr = vbn; + out: + err = 0; + *out_bh = bh; + + failed: + unlock_page(bh->b_page); + page_cache_release(bh->b_page); + return err; +} + +/* + * nilfs_gccache_submit_read_node() - add node buffer and submit read request + * @inode - gc inode + * @pbn - physical block number for the block + * @vbn - virtual block number for the block + * @out_bh - indirect pointer to a buffer_head struct to receive the results + * + * Description: nilfs_gccache_submit_read_node() registers the node buffer + * specified by @vbn to the GC pagecache. @pbn can be supplied by the + * caller to avoid translation of the disk block address. + * + * Return Value: On success, 0 is returned. On Error, one of the following + * negative error code is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + */ +int nilfs_gccache_submit_read_node(struct inode *inode, sector_t pbn, + __u64 vbn, struct buffer_head **out_bh) +{ + int ret = nilfs_btnode_submit_block(&NILFS_I(inode)->i_btnode_cache, + vbn ? : pbn, pbn, out_bh, 0); + if (ret == -EEXIST) /* internal code (cache hit) */ + ret = 0; + return ret; +} + +int nilfs_gccache_wait_and_mark_dirty(struct buffer_head *bh) +{ + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) + return -EIO; + if (buffer_dirty(bh)) + return -EEXIST; + + if (buffer_nilfs_node(bh)) + nilfs_btnode_mark_dirty(bh); + else + nilfs_mdt_mark_buffer_dirty(bh); + return 0; +} + +/* + * nilfs_init_gccache() - allocate and initialize gc_inode hash table + * @nilfs - the_nilfs + * + * Return Value: On success, 0. + * On error, a negative error code is returned. + */ +int nilfs_init_gccache(struct the_nilfs *nilfs) +{ + int loop; + + BUG_ON(nilfs->ns_gc_inodes_h); + + INIT_LIST_HEAD(&nilfs->ns_gc_inodes); + + nilfs->ns_gc_inodes_h = + kmalloc(sizeof(struct hlist_head) * NILFS_GCINODE_HASH_SIZE, + GFP_NOFS); + if (nilfs->ns_gc_inodes_h == NULL) + return -ENOMEM; + + for (loop = 0; loop < NILFS_GCINODE_HASH_SIZE; loop++) + INIT_HLIST_HEAD(&nilfs->ns_gc_inodes_h[loop]); + return 0; +} + +/* + * nilfs_destroy_gccache() - free gc_inode hash table + * @nilfs - the nilfs + */ +void nilfs_destroy_gccache(struct the_nilfs *nilfs) +{ + if (nilfs->ns_gc_inodes_h) { + nilfs_remove_all_gcinode(nilfs); + kfree(nilfs->ns_gc_inodes_h); + nilfs->ns_gc_inodes_h = NULL; + } +} + +static struct inode *alloc_gcinode(struct the_nilfs *nilfs, ino_t ino, + __u64 cno) +{ + struct inode *inode = nilfs_mdt_new_common(nilfs, NULL, ino, GFP_NOFS); + struct nilfs_inode_info *ii; + + if (!inode) + return NULL; + + inode->i_op = NULL; + inode->i_fop = NULL; + inode->i_mapping->a_ops = &def_gcinode_aops; + + ii = NILFS_I(inode); + ii->i_cno = cno; + ii->i_flags = 0; + ii->i_state = 1 << NILFS_I_GCINODE; + ii->i_bh = NULL; + nilfs_bmap_init_gc(ii->i_bmap); + + return inode; +} + +static unsigned long ihash(ino_t ino, __u64 cno) +{ + return hash_long((unsigned long)((ino << 2) + cno), + NILFS_GCINODE_HASH_BITS); +} + +/* + * nilfs_gc_iget() - find or create gc inode with specified (ino,cno) + */ +struct inode *nilfs_gc_iget(struct the_nilfs *nilfs, ino_t ino, __u64 cno) +{ + struct hlist_head *head = nilfs->ns_gc_inodes_h + ihash(ino, cno); + struct hlist_node *node; + struct inode *inode; + + hlist_for_each_entry(inode, node, head, i_hash) { + if (inode->i_ino == ino && NILFS_I(inode)->i_cno == cno) + return inode; + } + + inode = alloc_gcinode(nilfs, ino, cno); + if (likely(inode)) { + hlist_add_head(&inode->i_hash, head); + list_add(&NILFS_I(inode)->i_dirty, &nilfs->ns_gc_inodes); + } + return inode; +} + +/* + * nilfs_clear_gcinode() - clear and free a gc inode + */ +void nilfs_clear_gcinode(struct inode *inode) +{ + nilfs_mdt_clear(inode); + nilfs_mdt_destroy(inode); +} + +/* + * nilfs_remove_all_gcinode() - remove all inodes from the_nilfs + */ +void nilfs_remove_all_gcinode(struct the_nilfs *nilfs) +{ + struct hlist_head *head = nilfs->ns_gc_inodes_h; + struct hlist_node *node, *n; + struct inode *inode; + int loop; + + for (loop = 0; loop < NILFS_GCINODE_HASH_SIZE; loop++, head++) { + hlist_for_each_entry_safe(inode, node, n, head, i_hash) { + hlist_del_init(&inode->i_hash); + list_del_init(&NILFS_I(inode)->i_dirty); + nilfs_clear_gcinode(inode); /* might sleep */ + } + } +} diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c new file mode 100644 index 000000000000..de86401f209f --- /dev/null +++ b/fs/nilfs2/ifile.c @@ -0,0 +1,150 @@ +/* + * ifile.c - NILFS inode file + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Amagai Yoshiji <amagai@osrg.net>. + * Revised by Ryusuke Konishi <ryusuke@osrg.net>. + * + */ + +#include <linux/types.h> +#include <linux/buffer_head.h> +#include "nilfs.h" +#include "mdt.h" +#include "alloc.h" +#include "ifile.h" + +/** + * nilfs_ifile_create_inode - create a new disk inode + * @ifile: ifile inode + * @out_ino: pointer to a variable to store inode number + * @out_bh: buffer_head contains newly allocated disk inode + * + * Return Value: On success, 0 is returned and the newly allocated inode + * number is stored in the place pointed by @ino, and buffer_head pointer + * that contains newly allocated disk inode structure is stored in the + * place pointed by @out_bh + * On error, one of the following negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOSPC - No inode left. + */ +int nilfs_ifile_create_inode(struct inode *ifile, ino_t *out_ino, + struct buffer_head **out_bh) +{ + struct nilfs_palloc_req req; + int ret; + + req.pr_entry_nr = 0; /* 0 says find free inode from beginning of + a group. dull code!! */ + req.pr_entry_bh = NULL; + + ret = nilfs_palloc_prepare_alloc_entry(ifile, &req); + if (!ret) { + ret = nilfs_palloc_get_entry_block(ifile, req.pr_entry_nr, 1, + &req.pr_entry_bh); + if (ret < 0) + nilfs_palloc_abort_alloc_entry(ifile, &req); + } + if (ret < 0) { + brelse(req.pr_entry_bh); + return ret; + } + nilfs_palloc_commit_alloc_entry(ifile, &req); + nilfs_mdt_mark_buffer_dirty(req.pr_entry_bh); + nilfs_mdt_mark_dirty(ifile); + *out_ino = (ino_t)req.pr_entry_nr; + *out_bh = req.pr_entry_bh; + return 0; +} + +/** + * nilfs_ifile_delete_inode - delete a disk inode + * @ifile: ifile inode + * @ino: inode number + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOENT - The inode number @ino have not been allocated. + */ +int nilfs_ifile_delete_inode(struct inode *ifile, ino_t ino) +{ + struct nilfs_palloc_req req = { + .pr_entry_nr = ino, .pr_entry_bh = NULL + }; + struct nilfs_inode *raw_inode; + void *kaddr; + int ret; + + ret = nilfs_palloc_prepare_free_entry(ifile, &req); + if (!ret) { + ret = nilfs_palloc_get_entry_block(ifile, req.pr_entry_nr, 0, + &req.pr_entry_bh); + if (ret < 0) + nilfs_palloc_abort_free_entry(ifile, &req); + } + if (ret < 0) { + brelse(req.pr_entry_bh); + return ret; + } + + kaddr = kmap_atomic(req.pr_entry_bh->b_page, KM_USER0); + raw_inode = nilfs_palloc_block_get_entry(ifile, req.pr_entry_nr, + req.pr_entry_bh, kaddr); + raw_inode->i_flags = 0; + kunmap_atomic(kaddr, KM_USER0); + + nilfs_mdt_mark_buffer_dirty(req.pr_entry_bh); + brelse(req.pr_entry_bh); + + nilfs_palloc_commit_free_entry(ifile, &req); + + return 0; +} + +int nilfs_ifile_get_inode_block(struct inode *ifile, ino_t ino, + struct buffer_head **out_bh) +{ + struct super_block *sb = ifile->i_sb; + int err; + + if (unlikely(!NILFS_VALID_INODE(sb, ino))) { + nilfs_error(sb, __func__, "bad inode number: %lu", + (unsigned long) ino); + return -EINVAL; + } + + err = nilfs_palloc_get_entry_block(ifile, ino, 0, out_bh); + if (unlikely(err)) { + if (err == -EINVAL) + nilfs_error(sb, __func__, "ifile is broken"); + else + nilfs_warning(sb, __func__, + "unable to read inode: %lu", + (unsigned long) ino); + } + return err; +} diff --git a/fs/nilfs2/ifile.h b/fs/nilfs2/ifile.h new file mode 100644 index 000000000000..5d30a35679b5 --- /dev/null +++ b/fs/nilfs2/ifile.h @@ -0,0 +1,53 @@ +/* + * ifile.h - NILFS inode file + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Amagai Yoshiji <amagai@osrg.net> + * Revised by Ryusuke Konishi <ryusuke@osrg.net> + * + */ + +#ifndef _NILFS_IFILE_H +#define _NILFS_IFILE_H + +#include <linux/fs.h> +#include <linux/buffer_head.h> +#include <linux/nilfs2_fs.h> +#include "mdt.h" +#include "alloc.h" + +#define NILFS_IFILE_GFP NILFS_MDT_GFP + +static inline struct nilfs_inode * +nilfs_ifile_map_inode(struct inode *ifile, ino_t ino, struct buffer_head *ibh) +{ + void *kaddr = kmap(ibh->b_page); + return nilfs_palloc_block_get_entry(ifile, ino, ibh, kaddr); +} + +static inline void nilfs_ifile_unmap_inode(struct inode *ifile, ino_t ino, + struct buffer_head *ibh) +{ + kunmap(ibh->b_page); +} + +int nilfs_ifile_create_inode(struct inode *, ino_t *, struct buffer_head **); +int nilfs_ifile_delete_inode(struct inode *, ino_t); +int nilfs_ifile_get_inode_block(struct inode *, ino_t, struct buffer_head **); + +#endif /* _NILFS_IFILE_H */ diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c new file mode 100644 index 000000000000..49ab4a49bb4f --- /dev/null +++ b/fs/nilfs2/inode.c @@ -0,0 +1,785 @@ +/* + * inode.c - NILFS inode operations. + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi <ryusuke@osrg.net> + * + */ + +#include <linux/buffer_head.h> +#include <linux/mpage.h> +#include <linux/writeback.h> +#include <linux/uio.h> +#include "nilfs.h" +#include "segment.h" +#include "page.h" +#include "mdt.h" +#include "cpfile.h" +#include "ifile.h" + + +/** + * nilfs_get_block() - get a file block on the filesystem (callback function) + * @inode - inode struct of the target file + * @blkoff - file block number + * @bh_result - buffer head to be mapped on + * @create - indicate whether allocating the block or not when it has not + * been allocated yet. + * + * This function does not issue actual read request of the specified data + * block. It is done by VFS. + * Bulk read for direct-io is not supported yet. (should be supported) + */ +int nilfs_get_block(struct inode *inode, sector_t blkoff, + struct buffer_head *bh_result, int create) +{ + struct nilfs_inode_info *ii = NILFS_I(inode); + unsigned long blknum = 0; + int err = 0, ret; + struct inode *dat = nilfs_dat_inode(NILFS_I_NILFS(inode)); + + /* This exclusion control is a workaround; should be revised */ + down_read(&NILFS_MDT(dat)->mi_sem); /* XXX */ + ret = nilfs_bmap_lookup(ii->i_bmap, (unsigned long)blkoff, &blknum); + up_read(&NILFS_MDT(dat)->mi_sem); /* XXX */ + if (ret == 0) { /* found */ + map_bh(bh_result, inode->i_sb, blknum); + goto out; + } + /* data block was not found */ + if (ret == -ENOENT && create) { + struct nilfs_transaction_info ti; + + bh_result->b_blocknr = 0; + err = nilfs_transaction_begin(inode->i_sb, &ti, 1); + if (unlikely(err)) + goto out; + err = nilfs_bmap_insert(ii->i_bmap, (unsigned long)blkoff, + (unsigned long)bh_result); + if (unlikely(err != 0)) { + if (err == -EEXIST) { + /* + * The get_block() function could be called + * from multiple callers for an inode. + * However, the page having this block must + * be locked in this case. + */ + printk(KERN_WARNING + "nilfs_get_block: a race condition " + "while inserting a data block. " + "(inode number=%lu, file block " + "offset=%llu)\n", + inode->i_ino, + (unsigned long long)blkoff); + err = 0; + } else if (err == -EINVAL) { + nilfs_error(inode->i_sb, __func__, + "broken bmap (inode=%lu)\n", + inode->i_ino); + err = -EIO; + } + nilfs_transaction_abort(inode->i_sb); + goto out; + } + nilfs_transaction_commit(inode->i_sb); /* never fails */ + /* Error handling should be detailed */ + set_buffer_new(bh_result); + map_bh(bh_result, inode->i_sb, 0); /* dbn must be changed + to proper value */ + } else if (ret == -ENOENT) { + /* not found is not error (e.g. hole); must return without + the mapped state flag. */ + ; + } else { + err = ret; + } + + out: + return err; +} + +/** + * nilfs_readpage() - implement readpage() method of nilfs_aops {} + * address_space_operations. + * @file - file struct of the file to be read + * @page - the page to be read + */ +static int nilfs_readpage(struct file *file, struct page *page) +{ + return mpage_readpage(page, nilfs_get_block); +} + +/** + * nilfs_readpages() - implement readpages() method of nilfs_aops {} + * address_space_operations. + * @file - file struct of the file to be read + * @mapping - address_space struct used for reading multiple pages + * @pages - the pages to be read + * @nr_pages - number of pages to be read + */ +static int nilfs_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + return mpage_readpages(mapping, pages, nr_pages, nilfs_get_block); +} + +static int nilfs_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct inode *inode = mapping->host; + int err = 0; + + if (wbc->sync_mode == WB_SYNC_ALL) + err = nilfs_construct_dsync_segment(inode->i_sb, inode, + wbc->range_start, + wbc->range_end); + return err; +} + +static int nilfs_writepage(struct page *page, struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host; + int err; + + redirty_page_for_writepage(wbc, page); + unlock_page(page); + + if (wbc->sync_mode == WB_SYNC_ALL) { + err = nilfs_construct_segment(inode->i_sb); + if (unlikely(err)) + return err; + } else if (wbc->for_reclaim) + nilfs_flush_segment(inode->i_sb, inode->i_ino); + + return 0; +} + +static int nilfs_set_page_dirty(struct page *page) +{ + int ret = __set_page_dirty_buffers(page); + + if (ret) { + struct inode *inode = page->mapping->host; + struct nilfs_sb_info *sbi = NILFS_SB(inode->i_sb); + unsigned nr_dirty = 1 << (PAGE_SHIFT - inode->i_blkbits); + + nilfs_set_file_dirty(sbi, inode, nr_dirty); + } + return ret; +} + +static int nilfs_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) + +{ + struct inode *inode = mapping->host; + int err = nilfs_transaction_begin(inode->i_sb, NULL, 1); + + if (unlikely(err)) + return err; + + *pagep = NULL; + err = block_write_begin(file, mapping, pos, len, flags, pagep, + fsdata, nilfs_get_block); + if (unlikely(err)) + nilfs_transaction_abort(inode->i_sb); + return err; +} + +static int nilfs_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = mapping->host; + unsigned start = pos & (PAGE_CACHE_SIZE - 1); + unsigned nr_dirty; + int err; + + nr_dirty = nilfs_page_count_clean_buffers(page, start, + start + copied); + copied = generic_write_end(file, mapping, pos, len, copied, page, + fsdata); + nilfs_set_file_dirty(NILFS_SB(inode->i_sb), inode, nr_dirty); + err = nilfs_transaction_commit(inode->i_sb); + return err ? : copied; +} + +static ssize_t +nilfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, + loff_t offset, unsigned long nr_segs) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + ssize_t size; + + if (rw == WRITE) + return 0; + + /* Needs synchronization with the cleaner */ + size = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, + offset, nr_segs, nilfs_get_block, NULL); + return size; +} + +struct address_space_operations nilfs_aops = { + .writepage = nilfs_writepage, + .readpage = nilfs_readpage, + /* .sync_page = nilfs_sync_page, */ + .writepages = nilfs_writepages, + .set_page_dirty = nilfs_set_page_dirty, + .readpages = nilfs_readpages, + .write_begin = nilfs_write_begin, + .write_end = nilfs_write_end, + /* .releasepage = nilfs_releasepage, */ + .invalidatepage = block_invalidatepage, + .direct_IO = nilfs_direct_IO, +}; + +struct inode *nilfs_new_inode(struct inode *dir, int mode) +{ + struct super_block *sb = dir->i_sb; + struct nilfs_sb_info *sbi = NILFS_SB(sb); + struct inode *inode; + struct nilfs_inode_info *ii; + int err = -ENOMEM; + ino_t ino; + + inode = new_inode(sb); + if (unlikely(!inode)) + goto failed; + + mapping_set_gfp_mask(inode->i_mapping, + mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS); + + ii = NILFS_I(inode); + ii->i_state = 1 << NILFS_I_NEW; + + err = nilfs_ifile_create_inode(sbi->s_ifile, &ino, &ii->i_bh); + if (unlikely(err)) + goto failed_ifile_create_inode; + /* reference count of i_bh inherits from nilfs_mdt_read_block() */ + + atomic_inc(&sbi->s_inodes_count); + + inode->i_uid = current_fsuid(); + if (dir->i_mode & S_ISGID) { + inode->i_gid = dir->i_gid; + if (S_ISDIR(mode)) + mode |= S_ISGID; + } else + inode->i_gid = current_fsgid(); + + inode->i_mode = mode; + inode->i_ino = ino; + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + + if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) { + err = nilfs_bmap_read(ii->i_bmap, NULL); + if (err < 0) + goto failed_bmap; + + set_bit(NILFS_I_BMAP, &ii->i_state); + /* No lock is needed; iget() ensures it. */ + } + + ii->i_flags = NILFS_I(dir)->i_flags; + if (S_ISLNK(mode)) + ii->i_flags &= ~(NILFS_IMMUTABLE_FL | NILFS_APPEND_FL); + if (!S_ISDIR(mode)) + ii->i_flags &= ~NILFS_DIRSYNC_FL; + + /* ii->i_file_acl = 0; */ + /* ii->i_dir_acl = 0; */ + ii->i_dir_start_lookup = 0; +#ifdef CONFIG_NILFS_FS_POSIX_ACL + ii->i_acl = NULL; + ii->i_default_acl = NULL; +#endif + ii->i_cno = 0; + nilfs_set_inode_flags(inode); + spin_lock(&sbi->s_next_gen_lock); + inode->i_generation = sbi->s_next_generation++; + spin_unlock(&sbi->s_next_gen_lock); + insert_inode_hash(inode); + + err = nilfs_init_acl(inode, dir); + if (unlikely(err)) + goto failed_acl; /* never occur. When supporting + nilfs_init_acl(), proper cancellation of + above jobs should be considered */ + + mark_inode_dirty(inode); + return inode; + + failed_acl: + failed_bmap: + inode->i_nlink = 0; + iput(inode); /* raw_inode will be deleted through + generic_delete_inode() */ + goto failed; + + failed_ifile_create_inode: + make_bad_inode(inode); + iput(inode); /* if i_nlink == 1, generic_forget_inode() will be + called */ + failed: + return ERR_PTR(err); +} + +void nilfs_free_inode(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct nilfs_sb_info *sbi = NILFS_SB(sb); + + clear_inode(inode); + /* XXX: check error code? Is there any thing I can do? */ + (void) nilfs_ifile_delete_inode(sbi->s_ifile, inode->i_ino); + atomic_dec(&sbi->s_inodes_count); +} + +void nilfs_set_inode_flags(struct inode *inode) +{ + unsigned int flags = NILFS_I(inode)->i_flags; + + inode->i_flags &= ~(S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | + S_DIRSYNC); + if (flags & NILFS_SYNC_FL) + inode->i_flags |= S_SYNC; + if (flags & NILFS_APPEND_FL) + inode->i_flags |= S_APPEND; + if (flags & NILFS_IMMUTABLE_FL) + inode->i_flags |= S_IMMUTABLE; +#ifndef NILFS_ATIME_DISABLE + if (flags & NILFS_NOATIME_FL) +#endif + inode->i_flags |= S_NOATIME; + if (flags & NILFS_DIRSYNC_FL) + inode->i_flags |= S_DIRSYNC; + mapping_set_gfp_mask(inode->i_mapping, + mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS); +} + +int nilfs_read_inode_common(struct inode *inode, + struct nilfs_inode *raw_inode) +{ + struct nilfs_inode_info *ii = NILFS_I(inode); + int err; + + inode->i_mode = le16_to_cpu(raw_inode->i_mode); + inode->i_uid = (uid_t)le32_to_cpu(raw_inode->i_uid); + inode->i_gid = (gid_t)le32_to_cpu(raw_inode->i_gid); + inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); + inode->i_size = le64_to_cpu(raw_inode->i_size); + inode->i_atime.tv_sec = le64_to_cpu(raw_inode->i_mtime); + inode->i_ctime.tv_sec = le64_to_cpu(raw_inode->i_ctime); + inode->i_mtime.tv_sec = le64_to_cpu(raw_inode->i_mtime); + inode->i_atime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec); + inode->i_ctime.tv_nsec = le32_to_cpu(raw_inode->i_ctime_nsec); + inode->i_mtime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec); + if (inode->i_nlink == 0 && inode->i_mode == 0) + return -EINVAL; /* this inode is deleted */ + + inode->i_blocks = le64_to_cpu(raw_inode->i_blocks); + ii->i_flags = le32_to_cpu(raw_inode->i_flags); +#if 0 + ii->i_file_acl = le32_to_cpu(raw_inode->i_file_acl); + ii->i_dir_acl = S_ISREG(inode->i_mode) ? + 0 : le32_to_cpu(raw_inode->i_dir_acl); +#endif + ii->i_cno = 0; + inode->i_generation = le32_to_cpu(raw_inode->i_generation); + + if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode)) { + err = nilfs_bmap_read(ii->i_bmap, raw_inode); + if (err < 0) + return err; + set_bit(NILFS_I_BMAP, &ii->i_state); + /* No lock is needed; iget() ensures it. */ + } + return 0; +} + +static int __nilfs_read_inode(struct super_block *sb, unsigned long ino, + struct inode *inode) +{ + struct nilfs_sb_info *sbi = NILFS_SB(sb); + struct inode *dat = nilfs_dat_inode(sbi->s_nilfs); + struct buffer_head *bh; + struct nilfs_inode *raw_inode; + int err; + + down_read(&NILFS_MDT(dat)->mi_sem); /* XXX */ + err = nilfs_ifile_get_inode_block(sbi->s_ifile, ino, &bh); + if (unlikely(err)) + goto bad_inode; + + raw_inode = nilfs_ifile_map_inode(sbi->s_ifile, ino, bh); + +#ifdef CONFIG_NILFS_FS_POSIX_ACL + ii->i_acl = NILFS_ACL_NOT_CACHED; + ii->i_default_acl = NILFS_ACL_NOT_CACHED; +#endif + if (nilfs_read_inode_common(inode, raw_inode)) + goto failed_unmap; + + if (S_ISREG(inode->i_mode)) { + inode->i_op = &nilfs_file_inode_operations; + inode->i_fop = &nilfs_file_operations; + inode->i_mapping->a_ops = &nilfs_aops; + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = &nilfs_dir_inode_operations; + inode->i_fop = &nilfs_dir_operations; + inode->i_mapping->a_ops = &nilfs_aops; + } else if (S_ISLNK(inode->i_mode)) { + inode->i_op = &nilfs_symlink_inode_operations; + inode->i_mapping->a_ops = &nilfs_aops; + } else { + inode->i_op = &nilfs_special_inode_operations; + init_special_inode( + inode, inode->i_mode, + new_decode_dev(le64_to_cpu(raw_inode->i_device_code))); + } + nilfs_ifile_unmap_inode(sbi->s_ifile, ino, bh); + brelse(bh); + up_read(&NILFS_MDT(dat)->mi_sem); /* XXX */ + nilfs_set_inode_flags(inode); + return 0; + + failed_unmap: + nilfs_ifile_unmap_inode(sbi->s_ifile, ino, bh); + brelse(bh); + + bad_inode: + up_read(&NILFS_MDT(dat)->mi_sem); /* XXX */ + return err; +} + +struct inode *nilfs_iget(struct super_block *sb, unsigned long ino) +{ + struct inode *inode; + int err; + + inode = iget_locked(sb, ino); + if (unlikely(!inode)) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + + err = __nilfs_read_inode(sb, ino, inode); + if (unlikely(err)) { + iget_failed(inode); + return ERR_PTR(err); + } + unlock_new_inode(inode); + return inode; +} + +void nilfs_write_inode_common(struct inode *inode, + struct nilfs_inode *raw_inode, int has_bmap) +{ + struct nilfs_inode_info *ii = NILFS_I(inode); + + raw_inode->i_mode = cpu_to_le16(inode->i_mode); + raw_inode->i_uid = cpu_to_le32(inode->i_uid); + raw_inode->i_gid = cpu_to_le32(inode->i_gid); + raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); + raw_inode->i_size = cpu_to_le64(inode->i_size); + raw_inode->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); + raw_inode->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec); + raw_inode->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + raw_inode->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); + raw_inode->i_blocks = cpu_to_le64(inode->i_blocks); + + raw_inode->i_flags = cpu_to_le32(ii->i_flags); + raw_inode->i_generation = cpu_to_le32(inode->i_generation); + + if (has_bmap) + nilfs_bmap_write(ii->i_bmap, raw_inode); + else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) + raw_inode->i_device_code = + cpu_to_le64(new_encode_dev(inode->i_rdev)); + /* When extending inode, nilfs->ns_inode_size should be checked + for substitutions of appended fields */ +} + +void nilfs_update_inode(struct inode *inode, struct buffer_head *ibh) +{ + ino_t ino = inode->i_ino; + struct nilfs_inode_info *ii = NILFS_I(inode); + struct super_block *sb = inode->i_sb; + struct nilfs_sb_info *sbi = NILFS_SB(sb); + struct nilfs_inode *raw_inode; + + raw_inode = nilfs_ifile_map_inode(sbi->s_ifile, ino, ibh); + + /* The buffer is guarded with lock_buffer() by the caller */ + if (test_and_clear_bit(NILFS_I_NEW, &ii->i_state)) + memset(raw_inode, 0, NILFS_MDT(sbi->s_ifile)->mi_entry_size); + set_bit(NILFS_I_INODE_DIRTY, &ii->i_state); + + nilfs_write_inode_common(inode, raw_inode, 0); + /* XXX: call with has_bmap = 0 is a workaround to avoid + deadlock of bmap. This delays update of i_bmap to just + before writing */ + nilfs_ifile_unmap_inode(sbi->s_ifile, ino, ibh); +} + +#define NILFS_MAX_TRUNCATE_BLOCKS 16384 /* 64MB for 4KB block */ + +static void nilfs_truncate_bmap(struct nilfs_inode_info *ii, + unsigned long from) +{ + unsigned long b; + int ret; + + if (!test_bit(NILFS_I_BMAP, &ii->i_state)) + return; + repeat: + ret = nilfs_bmap_last_key(ii->i_bmap, &b); + if (ret == -ENOENT) + return; + else if (ret < 0) + goto failed; + + if (b < from) + return; + + b -= min_t(unsigned long, NILFS_MAX_TRUNCATE_BLOCKS, b - from); + ret = nilfs_bmap_truncate(ii->i_bmap, b); + nilfs_relax_pressure_in_lock(ii->vfs_inode.i_sb); + if (!ret || (ret == -ENOMEM && + nilfs_bmap_truncate(ii->i_bmap, b) == 0)) + goto repeat; + + failed: + if (ret == -EINVAL) + nilfs_error(ii->vfs_inode.i_sb, __func__, + "bmap is broken (ino=%lu)", ii->vfs_inode.i_ino); + else + nilfs_warning(ii->vfs_inode.i_sb, __func__, + "failed to truncate bmap (ino=%lu, err=%d)", + ii->vfs_inode.i_ino, ret); +} + +void nilfs_truncate(struct inode *inode) +{ + unsigned long blkoff; + unsigned int blocksize; + struct nilfs_transaction_info ti; + struct super_block *sb = inode->i_sb; + struct nilfs_inode_info *ii = NILFS_I(inode); + + if (!test_bit(NILFS_I_BMAP, &ii->i_state)) + return; + if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) + return; + + blocksize = sb->s_blocksize; + blkoff = (inode->i_size + blocksize - 1) >> sb->s_blocksize_bits; + nilfs_transaction_begin(sb, &ti, 0); /* never fails */ + + block_truncate_page(inode->i_mapping, inode->i_size, nilfs_get_block); + + nilfs_truncate_bmap(ii, blkoff); + + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + if (IS_SYNC(inode)) + nilfs_set_transaction_flag(NILFS_TI_SYNC); + + nilfs_set_file_dirty(NILFS_SB(sb), inode, 0); + nilfs_transaction_commit(sb); + /* May construct a logical segment and may fail in sync mode. + But truncate has no return value. */ +} + +void nilfs_delete_inode(struct inode *inode) +{ + struct nilfs_transaction_info ti; + struct super_block *sb = inode->i_sb; + struct nilfs_inode_info *ii = NILFS_I(inode); + + if (unlikely(is_bad_inode(inode))) { + if (inode->i_data.nrpages) + truncate_inode_pages(&inode->i_data, 0); + clear_inode(inode); + return; + } + nilfs_transaction_begin(sb, &ti, 0); /* never fails */ + + if (inode->i_data.nrpages) + truncate_inode_pages(&inode->i_data, 0); + + nilfs_truncate_bmap(ii, 0); + nilfs_free_inode(inode); + /* nilfs_free_inode() marks inode buffer dirty */ + if (IS_SYNC(inode)) + nilfs_set_transaction_flag(NILFS_TI_SYNC); + nilfs_transaction_commit(sb); + /* May construct a logical segment and may fail in sync mode. + But delete_inode has no return value. */ +} + +int nilfs_setattr(struct dentry *dentry, struct iattr *iattr) +{ + struct nilfs_transaction_info ti; + struct inode *inode = dentry->d_inode; + struct super_block *sb = inode->i_sb; + int err; + + err = inode_change_ok(inode, iattr); + if (err) + return err; + + err = nilfs_transaction_begin(sb, &ti, 0); + if (unlikely(err)) + return err; + err = inode_setattr(inode, iattr); + if (!err && (iattr->ia_valid & ATTR_MODE)) + err = nilfs_acl_chmod(inode); + if (likely(!err)) + err = nilfs_transaction_commit(sb); + else + nilfs_transaction_abort(sb); + + return err; +} + +int nilfs_load_inode_block(struct nilfs_sb_info *sbi, struct inode *inode, + struct buffer_head **pbh) +{ + struct nilfs_inode_info *ii = NILFS_I(inode); + int err; + + spin_lock(&sbi->s_inode_lock); + /* Caller of this function MUST lock s_inode_lock */ + if (ii->i_bh == NULL) { + spin_unlock(&sbi->s_inode_lock); + err = nilfs_ifile_get_inode_block(sbi->s_ifile, inode->i_ino, + pbh); + if (unlikely(err)) + return err; + spin_lock(&sbi->s_inode_lock); + if (ii->i_bh == NULL) + ii->i_bh = *pbh; + else { + brelse(*pbh); + *pbh = ii->i_bh; + } + } else + *pbh = ii->i_bh; + + get_bh(*pbh); + spin_unlock(&sbi->s_inode_lock); + return 0; +} + +int nilfs_inode_dirty(struct inode *inode) +{ + struct nilfs_inode_info *ii = NILFS_I(inode); + struct nilfs_sb_info *sbi = NILFS_SB(inode->i_sb); + int ret = 0; + + if (!list_empty(&ii->i_dirty)) { + spin_lock(&sbi->s_inode_lock); + ret = test_bit(NILFS_I_DIRTY, &ii->i_state) || + test_bit(NILFS_I_BUSY, &ii->i_state); + spin_unlock(&sbi->s_inode_lock); + } + return ret; +} + +int nilfs_set_file_dirty(struct nilfs_sb_info *sbi, struct inode *inode, + unsigned nr_dirty) +{ + struct nilfs_inode_info *ii = NILFS_I(inode); + + atomic_add(nr_dirty, &sbi->s_nilfs->ns_ndirtyblks); + + if (test_and_set_bit(NILFS_I_DIRTY, &ii->i_state)) + return 0; + + spin_lock(&sbi->s_inode_lock); + if (!test_bit(NILFS_I_QUEUED, &ii->i_state) && + !test_bit(NILFS_I_BUSY, &ii->i_state)) { + /* Because this routine may race with nilfs_dispose_list(), + we have to check NILFS_I_QUEUED here, too. */ + if (list_empty(&ii->i_dirty) && igrab(inode) == NULL) { + /* This will happen when somebody is freeing + this inode. */ + nilfs_warning(sbi->s_super, __func__, + "cannot get inode (ino=%lu)\n", + inode->i_ino); + spin_unlock(&sbi->s_inode_lock); + return -EINVAL; /* NILFS_I_DIRTY may remain for + freeing inode */ + } + list_del(&ii->i_dirty); + list_add_tail(&ii->i_dirty, &sbi->s_dirty_files); + set_bit(NILFS_I_QUEUED, &ii->i_state); + } + spin_unlock(&sbi->s_inode_lock); + return 0; +} + +int nilfs_mark_inode_dirty(struct inode *inode) +{ + struct nilfs_sb_info *sbi = NILFS_SB(inode->i_sb); + struct buffer_head *ibh; + int err; + + err = nilfs_load_inode_block(sbi, inode, &ibh); + if (unlikely(err)) { + nilfs_warning(inode->i_sb, __func__, + "failed to reget inode block.\n"); + return err; + } + lock_buffer(ibh); + nilfs_update_inode(inode, ibh); + unlock_buffer(ibh); + nilfs_mdt_mark_buffer_dirty(ibh); + nilfs_mdt_mark_dirty(sbi->s_ifile); + brelse(ibh); + return 0; +} + +/** + * nilfs_dirty_inode - reflect changes on given inode to an inode block. + * @inode: inode of the file to be registered. + * + * nilfs_dirty_inode() loads a inode block containing the specified + * @inode and copies data from a nilfs_inode to a corresponding inode + * entry in the inode block. This operation is excluded from the segment + * construction. This function can be called both as a single operation + * and as a part of indivisible file operations. + */ +void nilfs_dirty_inode(struct inode *inode) +{ + struct nilfs_transaction_info ti; + + if (is_bad_inode(inode)) { + nilfs_warning(inode->i_sb, __func__, + "tried to mark bad_inode dirty. ignored.\n"); + dump_stack(); + return; + } + nilfs_transaction_begin(inode->i_sb, &ti, 0); + nilfs_mark_inode_dirty(inode); + nilfs_transaction_commit(inode->i_sb); /* never fails */ +} diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c new file mode 100644 index 000000000000..108d281ebca5 --- /dev/null +++ b/fs/nilfs2/ioctl.c @@ -0,0 +1,654 @@ +/* + * ioctl.c - NILFS ioctl operations. + * + * Copyright (C) 2007, 2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato <koji@osrg.net>. + */ + +#include <linux/fs.h> +#include <linux/wait.h> +#include <linux/smp_lock.h> /* lock_kernel(), unlock_kernel() */ +#include <linux/capability.h> /* capable() */ +#include <linux/uaccess.h> /* copy_from_user(), copy_to_user() */ +#include <linux/nilfs2_fs.h> +#include "nilfs.h" +#include "segment.h" +#include "bmap.h" +#include "cpfile.h" +#include "sufile.h" +#include "dat.h" + + +static int nilfs_ioctl_wrap_copy(struct the_nilfs *nilfs, + struct nilfs_argv *argv, int dir, + ssize_t (*dofunc)(struct the_nilfs *, + __u64 *, int, + void *, size_t, size_t)) +{ + void *buf; + void __user *base = (void __user *)(unsigned long)argv->v_base; + size_t maxmembs, total, n; + ssize_t nr; + int ret, i; + __u64 pos, ppos; + + if (argv->v_nmembs == 0) + return 0; + + if (argv->v_size > PAGE_SIZE) + return -EINVAL; + + buf = (void *)__get_free_pages(GFP_NOFS, 0); + if (unlikely(!buf)) + return -ENOMEM; + maxmembs = PAGE_SIZE / argv->v_size; + + ret = 0; + total = 0; + pos = argv->v_index; + for (i = 0; i < argv->v_nmembs; i += n) { + n = (argv->v_nmembs - i < maxmembs) ? + argv->v_nmembs - i : maxmembs; + if ((dir & _IOC_WRITE) && + copy_from_user(buf, base + argv->v_size * i, + argv->v_size * n)) { + ret = -EFAULT; + break; + } + ppos = pos; + nr = dofunc(nilfs, &pos, argv->v_flags, buf, argv->v_size, + n); + if (nr < 0) { + ret = nr; + break; + } + if ((dir & _IOC_READ) && + copy_to_user(base + argv->v_size * i, buf, + argv->v_size * nr)) { + ret = -EFAULT; + break; + } + total += nr; + if ((size_t)nr < n) + break; + if (pos == ppos) + pos += n; + } + argv->v_nmembs = total; + + free_pages((unsigned long)buf, 0); + return ret; +} + +static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp, + unsigned int cmd, void __user *argp) +{ + struct inode *cpfile = NILFS_SB(inode->i_sb)->s_nilfs->ns_cpfile; + struct nilfs_transaction_info ti; + struct nilfs_cpmode cpmode; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (copy_from_user(&cpmode, argp, sizeof(cpmode))) + return -EFAULT; + + nilfs_transaction_begin(inode->i_sb, &ti, 0); + ret = nilfs_cpfile_change_cpmode( + cpfile, cpmode.cm_cno, cpmode.cm_mode); + if (unlikely(ret < 0)) { + nilfs_transaction_abort(inode->i_sb); + return ret; + } + nilfs_transaction_commit(inode->i_sb); /* never fails */ + return ret; +} + +static int +nilfs_ioctl_delete_checkpoint(struct inode *inode, struct file *filp, + unsigned int cmd, void __user *argp) +{ + struct inode *cpfile = NILFS_SB(inode->i_sb)->s_nilfs->ns_cpfile; + struct nilfs_transaction_info ti; + __u64 cno; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (copy_from_user(&cno, argp, sizeof(cno))) + return -EFAULT; + + nilfs_transaction_begin(inode->i_sb, &ti, 0); + ret = nilfs_cpfile_delete_checkpoint(cpfile, cno); + if (unlikely(ret < 0)) { + nilfs_transaction_abort(inode->i_sb); + return ret; + } + nilfs_transaction_commit(inode->i_sb); /* never fails */ + return ret; +} + +static ssize_t +nilfs_ioctl_do_get_cpinfo(struct the_nilfs *nilfs, __u64 *posp, int flags, + void *buf, size_t size, size_t nmembs) +{ + return nilfs_cpfile_get_cpinfo(nilfs->ns_cpfile, posp, flags, buf, + nmembs); +} + +static int nilfs_ioctl_get_cpinfo(struct inode *inode, struct file *filp, + unsigned int cmd, void __user *argp) +{ + struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs; + struct nilfs_argv argv; + int ret; + + if (copy_from_user(&argv, argp, sizeof(argv))) + return -EFAULT; + + down_read(&nilfs->ns_segctor_sem); + ret = nilfs_ioctl_wrap_copy(nilfs, &argv, _IOC_DIR(cmd), + nilfs_ioctl_do_get_cpinfo); + up_read(&nilfs->ns_segctor_sem); + if (ret < 0) + return ret; + + if (copy_to_user(argp, &argv, sizeof(argv))) + ret = -EFAULT; + return ret; +} + +static int nilfs_ioctl_get_cpstat(struct inode *inode, struct file *filp, + unsigned int cmd, void __user *argp) +{ + struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs; + struct nilfs_cpstat cpstat; + int ret; + + down_read(&nilfs->ns_segctor_sem); + ret = nilfs_cpfile_get_stat(nilfs->ns_cpfile, &cpstat); + up_read(&nilfs->ns_segctor_sem); + if (ret < 0) + return ret; + + if (copy_to_user(argp, &cpstat, sizeof(cpstat))) + ret = -EFAULT; + return ret; +} + +static ssize_t +nilfs_ioctl_do_get_suinfo(struct the_nilfs *nilfs, __u64 *posp, int flags, + void *buf, size_t size, size_t nmembs) +{ + return nilfs_sufile_get_suinfo(nilfs->ns_sufile, *posp, buf, nmembs); +} + +static int nilfs_ioctl_get_suinfo(struct inode *inode, struct file *filp, + unsigned int cmd, void __user *argp) +{ + struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs; + struct nilfs_argv argv; + int ret; + + if (copy_from_user(&argv, argp, sizeof(argv))) + return -EFAULT; + + down_read(&nilfs->ns_segctor_sem); + ret = nilfs_ioctl_wrap_copy(nilfs, &argv, _IOC_DIR(cmd), + nilfs_ioctl_do_get_suinfo); + up_read(&nilfs->ns_segctor_sem); + if (ret < 0) + return ret; + + if (copy_to_user(argp, &argv, sizeof(argv))) + ret = -EFAULT; + return ret; +} + +static int nilfs_ioctl_get_sustat(struct inode *inode, struct file *filp, + unsigned int cmd, void __user *argp) +{ + struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs; + struct nilfs_sustat sustat; + int ret; + + down_read(&nilfs->ns_segctor_sem); + ret = nilfs_sufile_get_stat(nilfs->ns_sufile, &sustat); + up_read(&nilfs->ns_segctor_sem); + if (ret < 0) + return ret; + + if (copy_to_user(argp, &sustat, sizeof(sustat))) + ret = -EFAULT; + return ret; +} + +static ssize_t +nilfs_ioctl_do_get_vinfo(struct the_nilfs *nilfs, __u64 *posp, int flags, + void *buf, size_t size, size_t nmembs) +{ + return nilfs_dat_get_vinfo(nilfs_dat_inode(nilfs), buf, nmembs); +} + +static int nilfs_ioctl_get_vinfo(struct inode *inode, struct file *filp, + unsigned int cmd, void __user *argp) +{ + struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs; + struct nilfs_argv argv; + int ret; + + if (copy_from_user(&argv, argp, sizeof(argv))) + return -EFAULT; + + down_read(&nilfs->ns_segctor_sem); + ret = nilfs_ioctl_wrap_copy(nilfs, &argv, _IOC_DIR(cmd), + nilfs_ioctl_do_get_vinfo); + up_read(&nilfs->ns_segctor_sem); + if (ret < 0) + return ret; + + if (copy_to_user(argp, &argv, sizeof(argv))) + ret = -EFAULT; + return ret; +} + +static ssize_t +nilfs_ioctl_do_get_bdescs(struct the_nilfs *nilfs, __u64 *posp, int flags, + void *buf, size_t size, size_t nmembs) +{ + struct inode *dat = nilfs_dat_inode(nilfs); + struct nilfs_bmap *bmap = NILFS_I(dat)->i_bmap; + struct nilfs_bdesc *bdescs = buf; + int ret, i; + + for (i = 0; i < nmembs; i++) { + ret = nilfs_bmap_lookup_at_level(bmap, + bdescs[i].bd_offset, + bdescs[i].bd_level + 1, + &bdescs[i].bd_blocknr); + if (ret < 0) { + if (ret != -ENOENT) + return ret; + bdescs[i].bd_blocknr = 0; + } + } + return nmembs; +} + +static int nilfs_ioctl_get_bdescs(struct inode *inode, struct file *filp, + unsigned int cmd, void __user *argp) +{ + struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs; + struct nilfs_argv argv; + int ret; + + if (copy_from_user(&argv, argp, sizeof(argv))) + return -EFAULT; + + down_read(&nilfs->ns_segctor_sem); + ret = nilfs_ioctl_wrap_copy(nilfs, &argv, _IOC_DIR(cmd), + nilfs_ioctl_do_get_bdescs); + up_read(&nilfs->ns_segctor_sem); + if (ret < 0) + return ret; + + if (copy_to_user(argp, &argv, sizeof(argv))) + ret = -EFAULT; + return ret; +} + +static int nilfs_ioctl_move_inode_block(struct inode *inode, + struct nilfs_vdesc *vdesc, + struct list_head *buffers) +{ + struct buffer_head *bh; + int ret; + + if (vdesc->vd_flags == 0) + ret = nilfs_gccache_submit_read_data( + inode, vdesc->vd_offset, vdesc->vd_blocknr, + vdesc->vd_vblocknr, &bh); + else + ret = nilfs_gccache_submit_read_node( + inode, vdesc->vd_blocknr, vdesc->vd_vblocknr, &bh); + + if (unlikely(ret < 0)) { + if (ret == -ENOENT) + printk(KERN_CRIT + "%s: invalid virtual block address (%s): " + "ino=%llu, cno=%llu, offset=%llu, " + "blocknr=%llu, vblocknr=%llu\n", + __func__, vdesc->vd_flags ? "node" : "data", + (unsigned long long)vdesc->vd_ino, + (unsigned long long)vdesc->vd_cno, + (unsigned long long)vdesc->vd_offset, + (unsigned long long)vdesc->vd_blocknr, + (unsigned long long)vdesc->vd_vblocknr); + return ret; + } + bh->b_private = vdesc; + list_add_tail(&bh->b_assoc_buffers, buffers); + return 0; +} + +static ssize_t +nilfs_ioctl_do_move_blocks(struct the_nilfs *nilfs, __u64 *posp, int flags, + void *buf, size_t size, size_t nmembs) +{ + struct inode *inode; + struct nilfs_vdesc *vdesc; + struct buffer_head *bh, *n; + LIST_HEAD(buffers); + ino_t ino; + __u64 cno; + int i, ret; + + for (i = 0, vdesc = buf; i < nmembs; ) { + ino = vdesc->vd_ino; + cno = vdesc->vd_cno; + inode = nilfs_gc_iget(nilfs, ino, cno); + if (unlikely(inode == NULL)) { + ret = -ENOMEM; + goto failed; + } + do { + ret = nilfs_ioctl_move_inode_block(inode, vdesc, + &buffers); + if (unlikely(ret < 0)) + goto failed; + vdesc++; + } while (++i < nmembs && + vdesc->vd_ino == ino && vdesc->vd_cno == cno); + } + + list_for_each_entry_safe(bh, n, &buffers, b_assoc_buffers) { + ret = nilfs_gccache_wait_and_mark_dirty(bh); + if (unlikely(ret < 0)) { + if (ret == -EEXIST) { + vdesc = bh->b_private; + printk(KERN_CRIT + "%s: conflicting %s buffer: " + "ino=%llu, cno=%llu, offset=%llu, " + "blocknr=%llu, vblocknr=%llu\n", + __func__, + vdesc->vd_flags ? "node" : "data", + (unsigned long long)vdesc->vd_ino, + (unsigned long long)vdesc->vd_cno, + (unsigned long long)vdesc->vd_offset, + (unsigned long long)vdesc->vd_blocknr, + (unsigned long long)vdesc->vd_vblocknr); + } + goto failed; + } + list_del_init(&bh->b_assoc_buffers); + bh->b_private = NULL; + brelse(bh); + } + return nmembs; + + failed: + list_for_each_entry_safe(bh, n, &buffers, b_assoc_buffers) { + list_del_init(&bh->b_assoc_buffers); + bh->b_private = NULL; + brelse(bh); + } + return ret; +} + +static inline int nilfs_ioctl_move_blocks(struct the_nilfs *nilfs, + struct nilfs_argv *argv, + int dir) +{ + return nilfs_ioctl_wrap_copy(nilfs, argv, dir, + nilfs_ioctl_do_move_blocks); +} + +static ssize_t +nilfs_ioctl_do_delete_checkpoints(struct the_nilfs *nilfs, __u64 *posp, + int flags, void *buf, size_t size, + size_t nmembs) +{ + struct inode *cpfile = nilfs->ns_cpfile; + struct nilfs_period *periods = buf; + int ret, i; + + for (i = 0; i < nmembs; i++) { + ret = nilfs_cpfile_delete_checkpoints( + cpfile, periods[i].p_start, periods[i].p_end); + if (ret < 0) + return ret; + } + return nmembs; +} + +static inline int nilfs_ioctl_delete_checkpoints(struct the_nilfs *nilfs, + struct nilfs_argv *argv, + int dir) +{ + return nilfs_ioctl_wrap_copy(nilfs, argv, dir, + nilfs_ioctl_do_delete_checkpoints); +} + +static ssize_t +nilfs_ioctl_do_free_vblocknrs(struct the_nilfs *nilfs, __u64 *posp, int flags, + void *buf, size_t size, size_t nmembs) +{ + int ret = nilfs_dat_freev(nilfs_dat_inode(nilfs), buf, nmembs); + + return (ret < 0) ? ret : nmembs; +} + +static inline int nilfs_ioctl_free_vblocknrs(struct the_nilfs *nilfs, + struct nilfs_argv *argv, + int dir) +{ + return nilfs_ioctl_wrap_copy(nilfs, argv, dir, + nilfs_ioctl_do_free_vblocknrs); +} + +static ssize_t +nilfs_ioctl_do_mark_blocks_dirty(struct the_nilfs *nilfs, __u64 *posp, + int flags, void *buf, size_t size, + size_t nmembs) +{ + struct inode *dat = nilfs_dat_inode(nilfs); + struct nilfs_bmap *bmap = NILFS_I(dat)->i_bmap; + struct nilfs_bdesc *bdescs = buf; + int ret, i; + + for (i = 0; i < nmembs; i++) { + /* XXX: use macro or inline func to check liveness */ + ret = nilfs_bmap_lookup_at_level(bmap, + bdescs[i].bd_offset, + bdescs[i].bd_level + 1, + &bdescs[i].bd_blocknr); + if (ret < 0) { + if (ret != -ENOENT) + return ret; + bdescs[i].bd_blocknr = 0; + } + if (bdescs[i].bd_blocknr != bdescs[i].bd_oblocknr) + /* skip dead block */ + continue; + if (bdescs[i].bd_level == 0) { + ret = nilfs_mdt_mark_block_dirty(dat, + bdescs[i].bd_offset); + if (ret < 0) { + WARN_ON(ret == -ENOENT); + return ret; + } + } else { + ret = nilfs_bmap_mark(bmap, bdescs[i].bd_offset, + bdescs[i].bd_level); + if (ret < 0) { + WARN_ON(ret == -ENOENT); + return ret; + } + } + } + return nmembs; +} + +static inline int nilfs_ioctl_mark_blocks_dirty(struct the_nilfs *nilfs, + struct nilfs_argv *argv, + int dir) +{ + return nilfs_ioctl_wrap_copy(nilfs, argv, dir, + nilfs_ioctl_do_mark_blocks_dirty); +} + +static ssize_t +nilfs_ioctl_do_free_segments(struct the_nilfs *nilfs, __u64 *posp, int flags, + void *buf, size_t size, size_t nmembs) +{ + struct nilfs_sb_info *sbi = nilfs_get_writer(nilfs); + int ret; + + if (unlikely(!sbi)) + return -EROFS; + ret = nilfs_segctor_add_segments_to_be_freed( + NILFS_SC(sbi), buf, nmembs); + nilfs_put_writer(nilfs); + + return (ret < 0) ? ret : nmembs; +} + +static inline int nilfs_ioctl_free_segments(struct the_nilfs *nilfs, + struct nilfs_argv *argv, + int dir) +{ + return nilfs_ioctl_wrap_copy(nilfs, argv, dir, + nilfs_ioctl_do_free_segments); +} + +int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *nilfs, + void __user *argp) +{ + struct nilfs_argv argv[5]; + const char *msg; + int dir, ret; + + if (copy_from_user(argv, argp, sizeof(argv))) + return -EFAULT; + + dir = _IOC_WRITE; + ret = nilfs_ioctl_move_blocks(nilfs, &argv[0], dir); + if (ret < 0) { + msg = "cannot read source blocks"; + goto failed; + } + ret = nilfs_ioctl_delete_checkpoints(nilfs, &argv[1], dir); + if (ret < 0) { + /* + * can safely abort because checkpoints can be removed + * independently. + */ + msg = "cannot delete checkpoints"; + goto failed; + } + ret = nilfs_ioctl_free_vblocknrs(nilfs, &argv[2], dir); + if (ret < 0) { + /* + * can safely abort because DAT file is updated atomically + * using a copy-on-write technique. + */ + msg = "cannot delete virtual blocks from DAT file"; + goto failed; + } + ret = nilfs_ioctl_mark_blocks_dirty(nilfs, &argv[3], dir); + if (ret < 0) { + /* + * can safely abort because the operation is nondestructive. + */ + msg = "cannot mark copying blocks dirty"; + goto failed; + } + ret = nilfs_ioctl_free_segments(nilfs, &argv[4], dir); + if (ret < 0) { + /* + * can safely abort because this operation is atomic. + */ + msg = "cannot set segments to be freed"; + goto failed; + } + return 0; + + failed: + nilfs_remove_all_gcinode(nilfs); + printk(KERN_ERR "NILFS: GC failed during preparation: %s: err=%d\n", + msg, ret); + return ret; +} + +static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp, + unsigned int cmd, void __user *argp) +{ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + return nilfs_clean_segments(inode->i_sb, argp); +} + +static int nilfs_ioctl_sync(struct inode *inode, struct file *filp, + unsigned int cmd, void __user *argp) +{ + __u64 cno; + int ret; + + ret = nilfs_construct_segment(inode->i_sb); + if (ret < 0) + return ret; + + if (argp != NULL) { + cno = NILFS_SB(inode->i_sb)->s_nilfs->ns_cno - 1; + if (copy_to_user(argp, &cno, sizeof(cno))) + return -EFAULT; + } + return 0; +} + +long nilfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + struct inode *inode = filp->f_dentry->d_inode; + void __user *argp = (void * __user *)arg; + + switch (cmd) { + case NILFS_IOCTL_CHANGE_CPMODE: + return nilfs_ioctl_change_cpmode(inode, filp, cmd, argp); + case NILFS_IOCTL_DELETE_CHECKPOINT: + return nilfs_ioctl_delete_checkpoint(inode, filp, cmd, argp); + case NILFS_IOCTL_GET_CPINFO: + return nilfs_ioctl_get_cpinfo(inode, filp, cmd, argp); + case NILFS_IOCTL_GET_CPSTAT: + return nilfs_ioctl_get_cpstat(inode, filp, cmd, argp); + case NILFS_IOCTL_GET_SUINFO: + return nilfs_ioctl_get_suinfo(inode, filp, cmd, argp); + case NILFS_IOCTL_GET_SUSTAT: + return nilfs_ioctl_get_sustat(inode, filp, cmd, argp); + case NILFS_IOCTL_GET_VINFO: + /* XXX: rename to ??? */ + return nilfs_ioctl_get_vinfo(inode, filp, cmd, argp); + case NILFS_IOCTL_GET_BDESCS: + return nilfs_ioctl_get_bdescs(inode, filp, cmd, argp); + case NILFS_IOCTL_CLEAN_SEGMENTS: + return nilfs_ioctl_clean_segments(inode, filp, cmd, argp); + case NILFS_IOCTL_SYNC: + return nilfs_ioctl_sync(inode, filp, cmd, argp); + default: + return -ENOTTY; + } +} diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c new file mode 100644 index 000000000000..47dd815433fd --- /dev/null +++ b/fs/nilfs2/mdt.c @@ -0,0 +1,563 @@ +/* + * mdt.c - meta data file for NILFS + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi <ryusuke@osrg.net> + */ + +#include <linux/buffer_head.h> +#include <linux/mpage.h> +#include <linux/mm.h> +#include <linux/writeback.h> +#include <linux/backing-dev.h> +#include <linux/swap.h> +#include "nilfs.h" +#include "segment.h" +#include "page.h" +#include "mdt.h" + + +#define NILFS_MDT_MAX_RA_BLOCKS (16 - 1) + +#define INIT_UNUSED_INODE_FIELDS + +static int +nilfs_mdt_insert_new_block(struct inode *inode, unsigned long block, + struct buffer_head *bh, + void (*init_block)(struct inode *, + struct buffer_head *, void *)) +{ + struct nilfs_inode_info *ii = NILFS_I(inode); + void *kaddr; + int ret; + + /* Caller exclude read accesses using page lock */ + + /* set_buffer_new(bh); */ + bh->b_blocknr = 0; + + ret = nilfs_bmap_insert(ii->i_bmap, block, (unsigned long)bh); + if (unlikely(ret)) + return ret; + + set_buffer_mapped(bh); + + kaddr = kmap_atomic(bh->b_page, KM_USER0); + memset(kaddr + bh_offset(bh), 0, 1 << inode->i_blkbits); + if (init_block) + init_block(inode, bh, kaddr); + flush_dcache_page(bh->b_page); + kunmap_atomic(kaddr, KM_USER0); + + set_buffer_uptodate(bh); + nilfs_mark_buffer_dirty(bh); + nilfs_mdt_mark_dirty(inode); + return 0; +} + +static int nilfs_mdt_create_block(struct inode *inode, unsigned long block, + struct buffer_head **out_bh, + void (*init_block)(struct inode *, + struct buffer_head *, + void *)) +{ + struct the_nilfs *nilfs = NILFS_MDT(inode)->mi_nilfs; + struct nilfs_sb_info *writer = NULL; + struct super_block *sb = inode->i_sb; + struct nilfs_transaction_info ti; + struct buffer_head *bh; + int err; + + if (!sb) { + writer = nilfs_get_writer(nilfs); + if (!writer) { + err = -EROFS; + goto out; + } + sb = writer->s_super; + } + + nilfs_transaction_begin(sb, &ti, 0); + + err = -ENOMEM; + bh = nilfs_grab_buffer(inode, inode->i_mapping, block, 0); + if (unlikely(!bh)) + goto failed_unlock; + + err = -EEXIST; + if (buffer_uptodate(bh) || buffer_mapped(bh)) + goto failed_bh; +#if 0 + /* The uptodate flag is not protected by the page lock, but + the mapped flag is. Thus, we don't have to wait the buffer. */ + wait_on_buffer(bh); + if (buffer_uptodate(bh)) + goto failed_bh; +#endif + + bh->b_bdev = nilfs->ns_bdev; + err = nilfs_mdt_insert_new_block(inode, block, bh, init_block); + if (likely(!err)) { + get_bh(bh); + *out_bh = bh; + } + + failed_bh: + unlock_page(bh->b_page); + page_cache_release(bh->b_page); + brelse(bh); + + failed_unlock: + if (likely(!err)) + err = nilfs_transaction_commit(sb); + else + nilfs_transaction_abort(sb); + if (writer) + nilfs_put_writer(nilfs); + out: + return err; +} + +static int +nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff, + int mode, struct buffer_head **out_bh) +{ + struct buffer_head *bh; + unsigned long blknum = 0; + int ret = -ENOMEM; + + bh = nilfs_grab_buffer(inode, inode->i_mapping, blkoff, 0); + if (unlikely(!bh)) + goto failed; + + ret = -EEXIST; /* internal code */ + if (buffer_uptodate(bh)) + goto out; + + if (mode == READA) { + if (!trylock_buffer(bh)) { + ret = -EBUSY; + goto failed_bh; + } + } else /* mode == READ */ + lock_buffer(bh); + + if (buffer_uptodate(bh)) { + unlock_buffer(bh); + goto out; + } + if (!buffer_mapped(bh)) { /* unused buffer */ + ret = nilfs_bmap_lookup(NILFS_I(inode)->i_bmap, blkoff, + &blknum); + if (unlikely(ret)) { + unlock_buffer(bh); + goto failed_bh; + } + bh->b_bdev = NILFS_MDT(inode)->mi_nilfs->ns_bdev; + bh->b_blocknr = blknum; + set_buffer_mapped(bh); + } + + bh->b_end_io = end_buffer_read_sync; + get_bh(bh); + submit_bh(mode, bh); + ret = 0; + out: + get_bh(bh); + *out_bh = bh; + + failed_bh: + unlock_page(bh->b_page); + page_cache_release(bh->b_page); + brelse(bh); + failed: + return ret; +} + +static int nilfs_mdt_read_block(struct inode *inode, unsigned long block, + struct buffer_head **out_bh) +{ + struct buffer_head *first_bh, *bh; + unsigned long blkoff; + int i, nr_ra_blocks = NILFS_MDT_MAX_RA_BLOCKS; + int err; + + err = nilfs_mdt_submit_block(inode, block, READ, &first_bh); + if (err == -EEXIST) /* internal code */ + goto out; + + if (unlikely(err)) + goto failed; + + blkoff = block + 1; + for (i = 0; i < nr_ra_blocks; i++, blkoff++) { + err = nilfs_mdt_submit_block(inode, blkoff, READA, &bh); + if (likely(!err || err == -EEXIST)) + brelse(bh); + else if (err != -EBUSY) + break; /* abort readahead if bmap lookup failed */ + + if (!buffer_locked(first_bh)) + goto out_no_wait; + } + + wait_on_buffer(first_bh); + + out_no_wait: + err = -EIO; + if (!buffer_uptodate(first_bh)) + goto failed_bh; + out: + *out_bh = first_bh; + return 0; + + failed_bh: + brelse(first_bh); + failed: + return err; +} + +/** + * nilfs_mdt_get_block - read or create a buffer on meta data file. + * @inode: inode of the meta data file + * @blkoff: block offset + * @create: create flag + * @init_block: initializer used for newly allocated block + * @out_bh: output of a pointer to the buffer_head + * + * nilfs_mdt_get_block() looks up the specified buffer and tries to create + * a new buffer if @create is not zero. On success, the returned buffer is + * assured to be either existing or formatted using a buffer lock on success. + * @out_bh is substituted only when zero is returned. + * + * Return Value: On success, it returns 0. On error, the following negative + * error code is returned. + * + * %-ENOMEM - Insufficient memory available. + * + * %-EIO - I/O error + * + * %-ENOENT - the specified block does not exist (hole block) + * + * %-EINVAL - bmap is broken. (the caller should call nilfs_error()) + * + * %-EROFS - Read only filesystem (for create mode) + */ +int nilfs_mdt_get_block(struct inode *inode, unsigned long blkoff, int create, + void (*init_block)(struct inode *, + struct buffer_head *, void *), + struct buffer_head **out_bh) +{ + int ret; + + /* Should be rewritten with merging nilfs_mdt_read_block() */ + retry: + ret = nilfs_mdt_read_block(inode, blkoff, out_bh); + if (!create || ret != -ENOENT) + return ret; + + ret = nilfs_mdt_create_block(inode, blkoff, out_bh, init_block); + if (unlikely(ret == -EEXIST)) { + /* create = 0; */ /* limit read-create loop retries */ + goto retry; + } + return ret; +} + +/** + * nilfs_mdt_delete_block - make a hole on the meta data file. + * @inode: inode of the meta data file + * @block: block offset + * + * Return Value: On success, zero is returned. + * On error, one of the following negative error code is returned. + * + * %-ENOMEM - Insufficient memory available. + * + * %-EIO - I/O error + * + * %-EINVAL - bmap is broken. (the caller should call nilfs_error()) + */ +int nilfs_mdt_delete_block(struct inode *inode, unsigned long block) +{ + struct nilfs_inode_info *ii = NILFS_I(inode); + int err; + + err = nilfs_bmap_delete(ii->i_bmap, block); + if (likely(!err)) { + nilfs_mdt_mark_dirty(inode); + nilfs_mdt_forget_block(inode, block); + } + return err; +} + +/** + * nilfs_mdt_forget_block - discard dirty state and try to remove the page + * @inode: inode of the meta data file + * @block: block offset + * + * nilfs_mdt_forget_block() clears a dirty flag of the specified buffer, and + * tries to release the page including the buffer from a page cache. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error code is returned. + * + * %-EBUSY - page has an active buffer. + * + * %-ENOENT - page cache has no page addressed by the offset. + */ +int nilfs_mdt_forget_block(struct inode *inode, unsigned long block) +{ + pgoff_t index = (pgoff_t)block >> + (PAGE_CACHE_SHIFT - inode->i_blkbits); + struct page *page; + unsigned long first_block; + int ret = 0; + int still_dirty; + + page = find_lock_page(inode->i_mapping, index); + if (!page) + return -ENOENT; + + wait_on_page_writeback(page); + + first_block = (unsigned long)index << + (PAGE_CACHE_SHIFT - inode->i_blkbits); + if (page_has_buffers(page)) { + struct buffer_head *bh; + + bh = nilfs_page_get_nth_block(page, block - first_block); + nilfs_forget_buffer(bh); + } + still_dirty = PageDirty(page); + unlock_page(page); + page_cache_release(page); + + if (still_dirty || + invalidate_inode_pages2_range(inode->i_mapping, index, index) != 0) + ret = -EBUSY; + return ret; +} + +/** + * nilfs_mdt_mark_block_dirty - mark a block on the meta data file dirty. + * @inode: inode of the meta data file + * @block: block offset + * + * Return Value: On success, it returns 0. On error, the following negative + * error code is returned. + * + * %-ENOMEM - Insufficient memory available. + * + * %-EIO - I/O error + * + * %-ENOENT - the specified block does not exist (hole block) + * + * %-EINVAL - bmap is broken. (the caller should call nilfs_error()) + */ +int nilfs_mdt_mark_block_dirty(struct inode *inode, unsigned long block) +{ + struct buffer_head *bh; + int err; + + err = nilfs_mdt_read_block(inode, block, &bh); + if (unlikely(err)) + return err; + nilfs_mark_buffer_dirty(bh); + nilfs_mdt_mark_dirty(inode); + brelse(bh); + return 0; +} + +int nilfs_mdt_fetch_dirty(struct inode *inode) +{ + struct nilfs_inode_info *ii = NILFS_I(inode); + + if (nilfs_bmap_test_and_clear_dirty(ii->i_bmap)) { + set_bit(NILFS_I_DIRTY, &ii->i_state); + return 1; + } + return test_bit(NILFS_I_DIRTY, &ii->i_state); +} + +static int +nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc) +{ + struct inode *inode = container_of(page->mapping, + struct inode, i_data); + struct super_block *sb = inode->i_sb; + struct nilfs_sb_info *writer = NULL; + int err = 0; + + redirty_page_for_writepage(wbc, page); + unlock_page(page); + + if (page->mapping->assoc_mapping) + return 0; /* Do not request flush for shadow page cache */ + if (!sb) { + writer = nilfs_get_writer(NILFS_MDT(inode)->mi_nilfs); + if (!writer) + return -EROFS; + sb = writer->s_super; + } + + if (wbc->sync_mode == WB_SYNC_ALL) + err = nilfs_construct_segment(sb); + else if (wbc->for_reclaim) + nilfs_flush_segment(sb, inode->i_ino); + + if (writer) + nilfs_put_writer(NILFS_MDT(inode)->mi_nilfs); + return err; +} + + +static struct address_space_operations def_mdt_aops = { + .writepage = nilfs_mdt_write_page, +}; + +static struct inode_operations def_mdt_iops; +static struct file_operations def_mdt_fops; + +/* + * NILFS2 uses pseudo inodes for meta data files such as DAT, cpfile, sufile, + * ifile, or gcinodes. This allows the B-tree code and segment constructor + * to treat them like regular files, and this helps to simplify the + * implementation. + * On the other hand, some of the pseudo inodes have an irregular point: + * They don't have valid inode->i_sb pointer because their lifetimes are + * longer than those of the super block structs; they may continue for + * several consecutive mounts/umounts. This would need discussions. + */ +struct inode * +nilfs_mdt_new_common(struct the_nilfs *nilfs, struct super_block *sb, + ino_t ino, gfp_t gfp_mask) +{ + struct inode *inode = nilfs_alloc_inode(sb); + + if (!inode) + return NULL; + else { + struct address_space * const mapping = &inode->i_data; + struct nilfs_mdt_info *mi = kzalloc(sizeof(*mi), GFP_NOFS); + + if (!mi) { + nilfs_destroy_inode(inode); + return NULL; + } + mi->mi_nilfs = nilfs; + init_rwsem(&mi->mi_sem); + + inode->i_sb = sb; /* sb may be NULL for some meta data files */ + inode->i_blkbits = nilfs->ns_blocksize_bits; + inode->i_flags = 0; + atomic_set(&inode->i_count, 1); + inode->i_nlink = 1; + inode->i_ino = ino; + inode->i_mode = S_IFREG; + inode->i_private = mi; + +#ifdef INIT_UNUSED_INODE_FIELDS + atomic_set(&inode->i_writecount, 0); + inode->i_size = 0; + inode->i_blocks = 0; + inode->i_bytes = 0; + inode->i_generation = 0; +#ifdef CONFIG_QUOTA + memset(&inode->i_dquot, 0, sizeof(inode->i_dquot)); +#endif + inode->i_pipe = NULL; + inode->i_bdev = NULL; + inode->i_cdev = NULL; + inode->i_rdev = 0; +#ifdef CONFIG_SECURITY + inode->i_security = NULL; +#endif + inode->dirtied_when = 0; + + INIT_LIST_HEAD(&inode->i_list); + INIT_LIST_HEAD(&inode->i_sb_list); + inode->i_state = 0; +#endif + + spin_lock_init(&inode->i_lock); + mutex_init(&inode->i_mutex); + init_rwsem(&inode->i_alloc_sem); + + mapping->host = NULL; /* instead of inode */ + mapping->flags = 0; + mapping_set_gfp_mask(mapping, gfp_mask); + mapping->assoc_mapping = NULL; + mapping->backing_dev_info = nilfs->ns_bdi; + + inode->i_mapping = mapping; + } + + return inode; +} + +struct inode *nilfs_mdt_new(struct the_nilfs *nilfs, struct super_block *sb, + ino_t ino, gfp_t gfp_mask) +{ + struct inode *inode = nilfs_mdt_new_common(nilfs, sb, ino, gfp_mask); + + if (!inode) + return NULL; + + inode->i_op = &def_mdt_iops; + inode->i_fop = &def_mdt_fops; + inode->i_mapping->a_ops = &def_mdt_aops; + return inode; +} + +void nilfs_mdt_set_entry_size(struct inode *inode, unsigned entry_size, + unsigned header_size) +{ + struct nilfs_mdt_info *mi = NILFS_MDT(inode); + + mi->mi_entry_size = entry_size; + mi->mi_entries_per_block = (1 << inode->i_blkbits) / entry_size; + mi->mi_first_entry_offset = DIV_ROUND_UP(header_size, entry_size); +} + +void nilfs_mdt_set_shadow(struct inode *orig, struct inode *shadow) +{ + shadow->i_mapping->assoc_mapping = orig->i_mapping; + NILFS_I(shadow)->i_btnode_cache.assoc_mapping = + &NILFS_I(orig)->i_btnode_cache; +} + +void nilfs_mdt_clear(struct inode *inode) +{ + struct nilfs_inode_info *ii = NILFS_I(inode); + + invalidate_mapping_pages(inode->i_mapping, 0, -1); + truncate_inode_pages(inode->i_mapping, 0); + + nilfs_bmap_clear(ii->i_bmap); + nilfs_btnode_cache_clear(&ii->i_btnode_cache); +} + +void nilfs_mdt_destroy(struct inode *inode) +{ + struct nilfs_mdt_info *mdi = NILFS_MDT(inode); + + kfree(mdi->mi_bgl); /* kfree(NULL) is safe */ + kfree(mdi); + nilfs_destroy_inode(inode); +} diff --git a/fs/nilfs2/mdt.h b/fs/nilfs2/mdt.h new file mode 100644 index 000000000000..df683e0bca6a --- /dev/null +++ b/fs/nilfs2/mdt.h @@ -0,0 +1,125 @@ +/* + * mdt.h - NILFS meta data file prototype and definitions + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi <ryusuke@osrg.net> + */ + +#ifndef _NILFS_MDT_H +#define _NILFS_MDT_H + +#include <linux/buffer_head.h> +#include <linux/blockgroup_lock.h> +#include "nilfs.h" +#include "page.h" + +/** + * struct nilfs_mdt_info - on-memory private data of meta data files + * @mi_nilfs: back pointer to the_nilfs struct + * @mi_sem: reader/writer semaphore for meta data operations + * @mi_bgl: per-blockgroup locking + * @mi_entry_size: size of an entry + * @mi_first_entry_offset: offset to the first entry + * @mi_entries_per_block: number of entries in a block + * @mi_blocks_per_group: number of blocks in a group + * @mi_blocks_per_desc_block: number of blocks per descriptor block + */ +struct nilfs_mdt_info { + struct the_nilfs *mi_nilfs; + struct rw_semaphore mi_sem; + struct blockgroup_lock *mi_bgl; + unsigned mi_entry_size; + unsigned mi_first_entry_offset; + unsigned long mi_entries_per_block; + unsigned long mi_blocks_per_group; + unsigned long mi_blocks_per_desc_block; +}; + +static inline struct nilfs_mdt_info *NILFS_MDT(const struct inode *inode) +{ + return inode->i_private; +} + +static inline struct the_nilfs *NILFS_I_NILFS(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + + return sb ? NILFS_SB(sb)->s_nilfs : NILFS_MDT(inode)->mi_nilfs; +} + +/* Default GFP flags using highmem */ +#define NILFS_MDT_GFP (__GFP_WAIT | __GFP_IO | __GFP_HIGHMEM) + +int nilfs_mdt_get_block(struct inode *, unsigned long, int, + void (*init_block)(struct inode *, + struct buffer_head *, void *), + struct buffer_head **); +int nilfs_mdt_delete_block(struct inode *, unsigned long); +int nilfs_mdt_forget_block(struct inode *, unsigned long); +int nilfs_mdt_mark_block_dirty(struct inode *, unsigned long); +int nilfs_mdt_fetch_dirty(struct inode *); + +struct inode *nilfs_mdt_new(struct the_nilfs *, struct super_block *, ino_t, + gfp_t); +struct inode *nilfs_mdt_new_common(struct the_nilfs *, struct super_block *, + ino_t, gfp_t); +void nilfs_mdt_destroy(struct inode *); +void nilfs_mdt_clear(struct inode *); +void nilfs_mdt_set_entry_size(struct inode *, unsigned, unsigned); +void nilfs_mdt_set_shadow(struct inode *, struct inode *); + + +#define nilfs_mdt_mark_buffer_dirty(bh) nilfs_mark_buffer_dirty(bh) + +static inline void nilfs_mdt_mark_dirty(struct inode *inode) +{ + if (!test_bit(NILFS_I_DIRTY, &NILFS_I(inode)->i_state)) + set_bit(NILFS_I_DIRTY, &NILFS_I(inode)->i_state); +} + +static inline void nilfs_mdt_clear_dirty(struct inode *inode) +{ + clear_bit(NILFS_I_DIRTY, &NILFS_I(inode)->i_state); +} + +static inline __u64 nilfs_mdt_cno(struct inode *inode) +{ + return NILFS_MDT(inode)->mi_nilfs->ns_cno; +} + +#define nilfs_mdt_bgl_lock(inode, bg) \ + (&NILFS_MDT(inode)->mi_bgl->locks[(bg) & (NR_BG_LOCKS-1)].lock) + + +static inline int +nilfs_mdt_read_inode_direct(struct inode *inode, struct buffer_head *bh, + unsigned n) +{ + return nilfs_read_inode_common( + inode, (struct nilfs_inode *)(bh->b_data + n)); +} + +static inline void +nilfs_mdt_write_inode_direct(struct inode *inode, struct buffer_head *bh, + unsigned n) +{ + nilfs_write_inode_common( + inode, (struct nilfs_inode *)(bh->b_data + n), 1); +} + +#endif /* _NILFS_MDT_H */ diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c new file mode 100644 index 000000000000..df70dadb336f --- /dev/null +++ b/fs/nilfs2/namei.c @@ -0,0 +1,474 @@ +/* + * namei.c - NILFS pathname lookup operations. + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Modified for NILFS by Amagai Yoshiji <amagai@osrg.net>, + * Ryusuke Konishi <ryusuke@osrg.net> + */ +/* + * linux/fs/ext2/namei.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/namei.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + */ + +#include <linux/pagemap.h> +#include "nilfs.h" + + +static inline int nilfs_add_nondir(struct dentry *dentry, struct inode *inode) +{ + int err = nilfs_add_link(dentry, inode); + if (!err) { + d_instantiate(dentry, inode); + return 0; + } + inode_dec_link_count(inode); + iput(inode); + return err; +} + +/* + * Methods themselves. + */ + +static struct dentry * +nilfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) +{ + struct inode *inode; + ino_t ino; + + if (dentry->d_name.len > NILFS_NAME_LEN) + return ERR_PTR(-ENAMETOOLONG); + + ino = nilfs_inode_by_name(dir, dentry); + inode = NULL; + if (ino) { + inode = nilfs_iget(dir->i_sb, ino); + if (IS_ERR(inode)) + return ERR_CAST(inode); + } + return d_splice_alias(inode, dentry); +} + +struct dentry *nilfs_get_parent(struct dentry *child) +{ + unsigned long ino; + struct inode *inode; + struct dentry dotdot; + + dotdot.d_name.name = ".."; + dotdot.d_name.len = 2; + + ino = nilfs_inode_by_name(child->d_inode, &dotdot); + if (!ino) + return ERR_PTR(-ENOENT); + + inode = nilfs_iget(child->d_inode->i_sb, ino); + if (IS_ERR(inode)) + return ERR_CAST(inode); + return d_obtain_alias(inode); +} + +/* + * By the time this is called, we already have created + * the directory cache entry for the new file, but it + * is so far negative - it has no inode. + * + * If the create succeeds, we fill in the inode information + * with d_instantiate(). + */ +static int nilfs_create(struct inode *dir, struct dentry *dentry, int mode, + struct nameidata *nd) +{ + struct inode *inode; + struct nilfs_transaction_info ti; + int err; + + err = nilfs_transaction_begin(dir->i_sb, &ti, 1); + if (err) + return err; + inode = nilfs_new_inode(dir, mode); + err = PTR_ERR(inode); + if (!IS_ERR(inode)) { + inode->i_op = &nilfs_file_inode_operations; + inode->i_fop = &nilfs_file_operations; + inode->i_mapping->a_ops = &nilfs_aops; + mark_inode_dirty(inode); + err = nilfs_add_nondir(dentry, inode); + } + if (!err) + err = nilfs_transaction_commit(dir->i_sb); + else + nilfs_transaction_abort(dir->i_sb); + + return err; +} + +static int +nilfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev) +{ + struct inode *inode; + struct nilfs_transaction_info ti; + int err; + + if (!new_valid_dev(rdev)) + return -EINVAL; + + err = nilfs_transaction_begin(dir->i_sb, &ti, 1); + if (err) + return err; + inode = nilfs_new_inode(dir, mode); + err = PTR_ERR(inode); + if (!IS_ERR(inode)) { + init_special_inode(inode, inode->i_mode, rdev); + mark_inode_dirty(inode); + err = nilfs_add_nondir(dentry, inode); + } + if (!err) + err = nilfs_transaction_commit(dir->i_sb); + else + nilfs_transaction_abort(dir->i_sb); + + return err; +} + +static int nilfs_symlink(struct inode *dir, struct dentry *dentry, + const char *symname) +{ + struct nilfs_transaction_info ti; + struct super_block *sb = dir->i_sb; + unsigned l = strlen(symname)+1; + struct inode *inode; + int err; + + if (l > sb->s_blocksize) + return -ENAMETOOLONG; + + err = nilfs_transaction_begin(dir->i_sb, &ti, 1); + if (err) + return err; + + inode = nilfs_new_inode(dir, S_IFLNK | S_IRWXUGO); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out; + + /* slow symlink */ + inode->i_op = &nilfs_symlink_inode_operations; + inode->i_mapping->a_ops = &nilfs_aops; + err = page_symlink(inode, symname, l); + if (err) + goto out_fail; + + /* mark_inode_dirty(inode); */ + /* nilfs_new_inode() and page_symlink() do this */ + + err = nilfs_add_nondir(dentry, inode); +out: + if (!err) + err = nilfs_transaction_commit(dir->i_sb); + else + nilfs_transaction_abort(dir->i_sb); + + return err; + +out_fail: + inode_dec_link_count(inode); + iput(inode); + goto out; +} + +static int nilfs_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *dentry) +{ + struct inode *inode = old_dentry->d_inode; + struct nilfs_transaction_info ti; + int err; + + if (inode->i_nlink >= NILFS_LINK_MAX) + return -EMLINK; + + err = nilfs_transaction_begin(dir->i_sb, &ti, 1); + if (err) + return err; + + inode->i_ctime = CURRENT_TIME; + inode_inc_link_count(inode); + atomic_inc(&inode->i_count); + + err = nilfs_add_nondir(dentry, inode); + if (!err) + err = nilfs_transaction_commit(dir->i_sb); + else + nilfs_transaction_abort(dir->i_sb); + + return err; +} + +static int nilfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + struct inode *inode; + struct nilfs_transaction_info ti; + int err; + + if (dir->i_nlink >= NILFS_LINK_MAX) + return -EMLINK; + + err = nilfs_transaction_begin(dir->i_sb, &ti, 1); + if (err) + return err; + + inode_inc_link_count(dir); + + inode = nilfs_new_inode(dir, S_IFDIR | mode); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_dir; + + inode->i_op = &nilfs_dir_inode_operations; + inode->i_fop = &nilfs_dir_operations; + inode->i_mapping->a_ops = &nilfs_aops; + + inode_inc_link_count(inode); + + err = nilfs_make_empty(inode, dir); + if (err) + goto out_fail; + + err = nilfs_add_link(dentry, inode); + if (err) + goto out_fail; + + d_instantiate(dentry, inode); +out: + if (!err) + err = nilfs_transaction_commit(dir->i_sb); + else + nilfs_transaction_abort(dir->i_sb); + + return err; + +out_fail: + inode_dec_link_count(inode); + inode_dec_link_count(inode); + iput(inode); +out_dir: + inode_dec_link_count(dir); + goto out; +} + +static int nilfs_unlink(struct inode *dir, struct dentry *dentry) +{ + struct inode *inode; + struct nilfs_dir_entry *de; + struct page *page; + struct nilfs_transaction_info ti; + int err; + + err = nilfs_transaction_begin(dir->i_sb, &ti, 0); + if (err) + return err; + + err = -ENOENT; + de = nilfs_find_entry(dir, dentry, &page); + if (!de) + goto out; + + inode = dentry->d_inode; + err = -EIO; + if (le64_to_cpu(de->inode) != inode->i_ino) + goto out; + + if (!inode->i_nlink) { + nilfs_warning(inode->i_sb, __func__, + "deleting nonexistent file (%lu), %d\n", + inode->i_ino, inode->i_nlink); + inode->i_nlink = 1; + } + err = nilfs_delete_entry(de, page); + if (err) + goto out; + + inode->i_ctime = dir->i_ctime; + inode_dec_link_count(inode); + err = 0; +out: + if (!err) + err = nilfs_transaction_commit(dir->i_sb); + else + nilfs_transaction_abort(dir->i_sb); + + return err; +} + +static int nilfs_rmdir(struct inode *dir, struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + struct nilfs_transaction_info ti; + int err; + + err = nilfs_transaction_begin(dir->i_sb, &ti, 0); + if (err) + return err; + + err = -ENOTEMPTY; + if (nilfs_empty_dir(inode)) { + err = nilfs_unlink(dir, dentry); + if (!err) { + inode->i_size = 0; + inode_dec_link_count(inode); + inode_dec_link_count(dir); + } + } + if (!err) + err = nilfs_transaction_commit(dir->i_sb); + else + nilfs_transaction_abort(dir->i_sb); + + return err; +} + +static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct inode *old_inode = old_dentry->d_inode; + struct inode *new_inode = new_dentry->d_inode; + struct page *dir_page = NULL; + struct nilfs_dir_entry *dir_de = NULL; + struct page *old_page; + struct nilfs_dir_entry *old_de; + struct nilfs_transaction_info ti; + int err; + + err = nilfs_transaction_begin(old_dir->i_sb, &ti, 1); + if (unlikely(err)) + return err; + + err = -ENOENT; + old_de = nilfs_find_entry(old_dir, old_dentry, &old_page); + if (!old_de) + goto out; + + if (S_ISDIR(old_inode->i_mode)) { + err = -EIO; + dir_de = nilfs_dotdot(old_inode, &dir_page); + if (!dir_de) + goto out_old; + } + + if (new_inode) { + struct page *new_page; + struct nilfs_dir_entry *new_de; + + err = -ENOTEMPTY; + if (dir_de && !nilfs_empty_dir(new_inode)) + goto out_dir; + + err = -ENOENT; + new_de = nilfs_find_entry(new_dir, new_dentry, &new_page); + if (!new_de) + goto out_dir; + inode_inc_link_count(old_inode); + nilfs_set_link(new_dir, new_de, new_page, old_inode); + new_inode->i_ctime = CURRENT_TIME; + if (dir_de) + drop_nlink(new_inode); + inode_dec_link_count(new_inode); + } else { + if (dir_de) { + err = -EMLINK; + if (new_dir->i_nlink >= NILFS_LINK_MAX) + goto out_dir; + } + inode_inc_link_count(old_inode); + err = nilfs_add_link(new_dentry, old_inode); + if (err) { + inode_dec_link_count(old_inode); + goto out_dir; + } + if (dir_de) + inode_inc_link_count(new_dir); + } + + /* + * Like most other Unix systems, set the ctime for inodes on a + * rename. + * inode_dec_link_count() will mark the inode dirty. + */ + old_inode->i_ctime = CURRENT_TIME; + + nilfs_delete_entry(old_de, old_page); + inode_dec_link_count(old_inode); + + if (dir_de) { + nilfs_set_link(old_inode, dir_de, dir_page, new_dir); + inode_dec_link_count(old_dir); + } + + err = nilfs_transaction_commit(old_dir->i_sb); + return err; + +out_dir: + if (dir_de) { + kunmap(dir_page); + page_cache_release(dir_page); + } +out_old: + kunmap(old_page); + page_cache_release(old_page); +out: + nilfs_transaction_abort(old_dir->i_sb); + return err; +} + +struct inode_operations nilfs_dir_inode_operations = { + .create = nilfs_create, + .lookup = nilfs_lookup, + .link = nilfs_link, + .unlink = nilfs_unlink, + .symlink = nilfs_symlink, + .mkdir = nilfs_mkdir, + .rmdir = nilfs_rmdir, + .mknod = nilfs_mknod, + .rename = nilfs_rename, + .setattr = nilfs_setattr, + .permission = nilfs_permission, +}; + +struct inode_operations nilfs_special_inode_operations = { + .setattr = nilfs_setattr, + .permission = nilfs_permission, +}; + +struct inode_operations nilfs_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = page_follow_link_light, + .put_link = page_put_link, +}; diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h new file mode 100644 index 000000000000..7558c977db02 --- /dev/null +++ b/fs/nilfs2/nilfs.h @@ -0,0 +1,318 @@ +/* + * nilfs.h - NILFS local header file. + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato <koji@osrg.net> + * Ryusuke Konishi <ryusuke@osrg.net> + */ + +#ifndef _NILFS_H +#define _NILFS_H + +#include <linux/kernel.h> +#include <linux/buffer_head.h> +#include <linux/spinlock.h> +#include <linux/blkdev.h> +#include <linux/nilfs2_fs.h> +#include "the_nilfs.h" +#include "sb.h" +#include "bmap.h" +#include "bmap_union.h" + +/* + * NILFS filesystem version + */ +#define NILFS_VERSION "2.0.5" + +/* + * nilfs inode data in memory + */ +struct nilfs_inode_info { + __u32 i_flags; + unsigned long i_state; /* Dynamic state flags */ + struct nilfs_bmap *i_bmap; + union nilfs_bmap_union i_bmap_union; + __u64 i_xattr; /* sector_t ??? */ + __u32 i_dir_start_lookup; + __u64 i_cno; /* check point number for GC inode */ + struct address_space i_btnode_cache; + struct list_head i_dirty; /* List for connecting dirty files */ + +#ifdef CONFIG_NILFS_XATTR + /* + * Extended attributes can be read independently of the main file + * data. Taking i_sem even when reading would cause contention + * between readers of EAs and writers of regular file data, so + * instead we synchronize on xattr_sem when reading or changing + * EAs. + */ + struct rw_semaphore xattr_sem; +#endif +#ifdef CONFIG_NILFS_POSIX_ACL + struct posix_acl *i_acl; + struct posix_acl *i_default_acl; +#endif + struct buffer_head *i_bh; /* i_bh contains a new or dirty + disk inode */ + struct inode vfs_inode; +}; + +static inline struct nilfs_inode_info *NILFS_I(const struct inode *inode) +{ + return container_of(inode, struct nilfs_inode_info, vfs_inode); +} + +static inline struct nilfs_inode_info * +NILFS_BMAP_I(const struct nilfs_bmap *bmap) +{ + return container_of((union nilfs_bmap_union *)bmap, + struct nilfs_inode_info, + i_bmap_union); +} + +static inline struct inode *NILFS_BTNC_I(struct address_space *btnc) +{ + struct nilfs_inode_info *ii = + container_of(btnc, struct nilfs_inode_info, i_btnode_cache); + return &ii->vfs_inode; +} + +static inline struct inode *NILFS_AS_I(struct address_space *mapping) +{ + return (mapping->host) ? : + container_of(mapping, struct inode, i_data); +} + +/* + * Dynamic state flags of NILFS on-memory inode (i_state) + */ +enum { + NILFS_I_NEW = 0, /* Inode is newly created */ + NILFS_I_DIRTY, /* The file is dirty */ + NILFS_I_QUEUED, /* inode is in dirty_files list */ + NILFS_I_BUSY, /* inode is grabbed by a segment + constructor */ + NILFS_I_COLLECTED, /* All dirty blocks are collected */ + NILFS_I_UPDATED, /* The file has been written back */ + NILFS_I_INODE_DIRTY, /* write_inode is requested */ + NILFS_I_BMAP, /* has bmap and btnode_cache */ + NILFS_I_GCINODE, /* inode for GC, on memory only */ + NILFS_I_GCDAT, /* shadow DAT, on memory only */ +}; + +/* + * Macros to check inode numbers + */ +#define NILFS_MDT_INO_BITS \ + ((unsigned int)(1 << NILFS_DAT_INO | 1 << NILFS_CPFILE_INO | \ + 1 << NILFS_SUFILE_INO | 1 << NILFS_IFILE_INO | \ + 1 << NILFS_ATIME_INO | 1 << NILFS_SKETCH_INO)) + +#define NILFS_SYS_INO_BITS \ + ((unsigned int)(1 << NILFS_ROOT_INO) | NILFS_MDT_INO_BITS) + +#define NILFS_FIRST_INO(sb) (NILFS_SB(sb)->s_nilfs->ns_first_ino) + +#define NILFS_MDT_INODE(sb, ino) \ + ((ino) < NILFS_FIRST_INO(sb) && (NILFS_MDT_INO_BITS & (1 << (ino)))) +#define NILFS_VALID_INODE(sb, ino) \ + ((ino) >= NILFS_FIRST_INO(sb) || (NILFS_SYS_INO_BITS & (1 << (ino)))) + +/** + * struct nilfs_transaction_info: context information for synchronization + * @ti_magic: Magic number + * @ti_save: Backup of journal_info field of task_struct + * @ti_flags: Flags + * @ti_count: Nest level + * @ti_garbage: List of inode to be put when releasing semaphore + */ +struct nilfs_transaction_info { + u32 ti_magic; + void *ti_save; + /* This should never used. If this happens, + one of other filesystems has a bug. */ + unsigned short ti_flags; + unsigned short ti_count; + struct list_head ti_garbage; +}; + +/* ti_magic */ +#define NILFS_TI_MAGIC 0xd9e392fb + +/* ti_flags */ +#define NILFS_TI_DYNAMIC_ALLOC 0x0001 /* Allocated from slab */ +#define NILFS_TI_SYNC 0x0002 /* Force to construct segment at the + end of transaction. */ +#define NILFS_TI_GC 0x0004 /* GC context */ +#define NILFS_TI_COMMIT 0x0008 /* Change happened or not */ +#define NILFS_TI_WRITER 0x0010 /* Constructor context */ + + +int nilfs_transaction_begin(struct super_block *, + struct nilfs_transaction_info *, int); +int nilfs_transaction_commit(struct super_block *); +void nilfs_transaction_abort(struct super_block *); + +static inline void nilfs_set_transaction_flag(unsigned int flag) +{ + struct nilfs_transaction_info *ti = current->journal_info; + + ti->ti_flags |= flag; +} + +static inline int nilfs_test_transaction_flag(unsigned int flag) +{ + struct nilfs_transaction_info *ti = current->journal_info; + + if (ti == NULL || ti->ti_magic != NILFS_TI_MAGIC) + return 0; + return !!(ti->ti_flags & flag); +} + +static inline int nilfs_doing_gc(void) +{ + return nilfs_test_transaction_flag(NILFS_TI_GC); +} + +static inline int nilfs_doing_construction(void) +{ + return nilfs_test_transaction_flag(NILFS_TI_WRITER); +} + +static inline struct inode *nilfs_dat_inode(const struct the_nilfs *nilfs) +{ + return nilfs_doing_gc() ? nilfs->ns_gc_dat : nilfs->ns_dat; +} + +/* + * function prototype + */ +#ifdef CONFIG_NILFS_POSIX_ACL +#error "NILFS: not yet supported POSIX ACL" +extern int nilfs_permission(struct inode *, int, struct nameidata *); +extern int nilfs_acl_chmod(struct inode *); +extern int nilfs_init_acl(struct inode *, struct inode *); +#else +#define nilfs_permission NULL + +static inline int nilfs_acl_chmod(struct inode *inode) +{ + return 0; +} + +static inline int nilfs_init_acl(struct inode *inode, struct inode *dir) +{ + inode->i_mode &= ~current_umask(); + return 0; +} +#endif + +#define NILFS_ATIME_DISABLE + +/* dir.c */ +extern int nilfs_add_link(struct dentry *, struct inode *); +extern ino_t nilfs_inode_by_name(struct inode *, struct dentry *); +extern int nilfs_make_empty(struct inode *, struct inode *); +extern struct nilfs_dir_entry * +nilfs_find_entry(struct inode *, struct dentry *, struct page **); +extern int nilfs_delete_entry(struct nilfs_dir_entry *, struct page *); +extern int nilfs_empty_dir(struct inode *); +extern struct nilfs_dir_entry *nilfs_dotdot(struct inode *, struct page **); +extern void nilfs_set_link(struct inode *, struct nilfs_dir_entry *, + struct page *, struct inode *); + +/* file.c */ +extern int nilfs_sync_file(struct file *, struct dentry *, int); + +/* ioctl.c */ +long nilfs_ioctl(struct file *, unsigned int, unsigned long); +int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *, void __user *); + +/* inode.c */ +extern struct inode *nilfs_new_inode(struct inode *, int); +extern void nilfs_free_inode(struct inode *); +extern int nilfs_get_block(struct inode *, sector_t, struct buffer_head *, int); +extern void nilfs_set_inode_flags(struct inode *); +extern int nilfs_read_inode_common(struct inode *, struct nilfs_inode *); +extern void nilfs_write_inode_common(struct inode *, struct nilfs_inode *, int); +extern struct inode *nilfs_iget(struct super_block *, unsigned long); +extern void nilfs_update_inode(struct inode *, struct buffer_head *); +extern void nilfs_truncate(struct inode *); +extern void nilfs_delete_inode(struct inode *); +extern int nilfs_setattr(struct dentry *, struct iattr *); +extern int nilfs_load_inode_block(struct nilfs_sb_info *, struct inode *, + struct buffer_head **); +extern int nilfs_inode_dirty(struct inode *); +extern int nilfs_set_file_dirty(struct nilfs_sb_info *, struct inode *, + unsigned); +extern int nilfs_mark_inode_dirty(struct inode *); +extern void nilfs_dirty_inode(struct inode *); + +/* namei.c */ +extern struct dentry *nilfs_get_parent(struct dentry *); + +/* super.c */ +extern struct inode *nilfs_alloc_inode(struct super_block *); +extern void nilfs_destroy_inode(struct inode *); +extern void nilfs_error(struct super_block *, const char *, const char *, ...) + __attribute__ ((format (printf, 3, 4))); +extern void nilfs_warning(struct super_block *, const char *, const char *, ...) + __attribute__ ((format (printf, 3, 4))); +extern struct nilfs_super_block * +nilfs_read_super_block(struct super_block *, u64, int, struct buffer_head **); +extern int nilfs_store_magic_and_option(struct super_block *, + struct nilfs_super_block *, char *); +extern int nilfs_commit_super(struct nilfs_sb_info *, int); +extern int nilfs_attach_checkpoint(struct nilfs_sb_info *, __u64); +extern void nilfs_detach_checkpoint(struct nilfs_sb_info *); + +/* gcinode.c */ +int nilfs_gccache_submit_read_data(struct inode *, sector_t, sector_t, __u64, + struct buffer_head **); +int nilfs_gccache_submit_read_node(struct inode *, sector_t, __u64, + struct buffer_head **); +int nilfs_gccache_wait_and_mark_dirty(struct buffer_head *); +int nilfs_init_gccache(struct the_nilfs *); +void nilfs_destroy_gccache(struct the_nilfs *); +void nilfs_clear_gcinode(struct inode *); +struct inode *nilfs_gc_iget(struct the_nilfs *, ino_t, __u64); +void nilfs_remove_all_gcinode(struct the_nilfs *); + +/* gcdat.c */ +int nilfs_init_gcdat_inode(struct the_nilfs *); +void nilfs_commit_gcdat_inode(struct the_nilfs *); +void nilfs_clear_gcdat_inode(struct the_nilfs *); + +/* + * Inodes and files operations + */ +extern struct file_operations nilfs_dir_operations; +extern struct inode_operations nilfs_file_inode_operations; +extern struct file_operations nilfs_file_operations; +extern struct address_space_operations nilfs_aops; +extern struct inode_operations nilfs_dir_inode_operations; +extern struct inode_operations nilfs_special_inode_operations; +extern struct inode_operations nilfs_symlink_inode_operations; + +/* + * filesystem type + */ +extern struct file_system_type nilfs_fs_type; + + +#endif /* _NILFS_H */ diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c new file mode 100644 index 000000000000..1bfbba9c0e9a --- /dev/null +++ b/fs/nilfs2/page.c @@ -0,0 +1,540 @@ +/* + * page.c - buffer/page management specific to NILFS + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi <ryusuke@osrg.net>, + * Seiji Kihara <kihara@osrg.net>. + */ + +#include <linux/pagemap.h> +#include <linux/writeback.h> +#include <linux/swap.h> +#include <linux/bitops.h> +#include <linux/page-flags.h> +#include <linux/list.h> +#include <linux/highmem.h> +#include <linux/pagevec.h> +#include "nilfs.h" +#include "page.h" +#include "mdt.h" + + +#define NILFS_BUFFER_INHERENT_BITS \ + ((1UL << BH_Uptodate) | (1UL << BH_Mapped) | (1UL << BH_NILFS_Node) | \ + (1UL << BH_NILFS_Volatile) | (1UL << BH_NILFS_Allocated)) + +static struct buffer_head * +__nilfs_get_page_block(struct page *page, unsigned long block, pgoff_t index, + int blkbits, unsigned long b_state) + +{ + unsigned long first_block; + struct buffer_head *bh; + + if (!page_has_buffers(page)) + create_empty_buffers(page, 1 << blkbits, b_state); + + first_block = (unsigned long)index << (PAGE_CACHE_SHIFT - blkbits); + bh = nilfs_page_get_nth_block(page, block - first_block); + + touch_buffer(bh); + wait_on_buffer(bh); + return bh; +} + +/* + * Since the page cache of B-tree node pages or data page cache of pseudo + * inodes does not have a valid mapping->host pointer, calling + * mark_buffer_dirty() for their buffers causes a NULL pointer dereference; + * it calls __mark_inode_dirty(NULL) through __set_page_dirty(). + * To avoid this problem, the old style mark_buffer_dirty() is used instead. + */ +void nilfs_mark_buffer_dirty(struct buffer_head *bh) +{ + if (!buffer_dirty(bh) && !test_set_buffer_dirty(bh)) + __set_page_dirty_nobuffers(bh->b_page); +} + +struct buffer_head *nilfs_grab_buffer(struct inode *inode, + struct address_space *mapping, + unsigned long blkoff, + unsigned long b_state) +{ + int blkbits = inode->i_blkbits; + pgoff_t index = blkoff >> (PAGE_CACHE_SHIFT - blkbits); + struct page *page, *opage; + struct buffer_head *bh, *obh; + + page = grab_cache_page(mapping, index); + if (unlikely(!page)) + return NULL; + + bh = __nilfs_get_page_block(page, blkoff, index, blkbits, b_state); + if (unlikely(!bh)) { + unlock_page(page); + page_cache_release(page); + return NULL; + } + if (!buffer_uptodate(bh) && mapping->assoc_mapping != NULL) { + /* + * Shadow page cache uses assoc_mapping to point its original + * page cache. The following code tries the original cache + * if the given cache is a shadow and it didn't hit. + */ + opage = find_lock_page(mapping->assoc_mapping, index); + if (!opage) + return bh; + + obh = __nilfs_get_page_block(opage, blkoff, index, blkbits, + b_state); + if (buffer_uptodate(obh)) { + nilfs_copy_buffer(bh, obh); + if (buffer_dirty(obh)) { + nilfs_mark_buffer_dirty(bh); + if (!buffer_nilfs_node(bh) && NILFS_MDT(inode)) + nilfs_mdt_mark_dirty(inode); + } + } + brelse(obh); + unlock_page(opage); + page_cache_release(opage); + } + return bh; +} + +/** + * nilfs_forget_buffer - discard dirty state + * @inode: owner inode of the buffer + * @bh: buffer head of the buffer to be discarded + */ +void nilfs_forget_buffer(struct buffer_head *bh) +{ + struct page *page = bh->b_page; + + lock_buffer(bh); + clear_buffer_nilfs_volatile(bh); + if (test_clear_buffer_dirty(bh) && nilfs_page_buffers_clean(page)) + __nilfs_clear_page_dirty(page); + + clear_buffer_uptodate(bh); + clear_buffer_mapped(bh); + bh->b_blocknr = -1; + ClearPageUptodate(page); + ClearPageMappedToDisk(page); + unlock_buffer(bh); + brelse(bh); +} + +/** + * nilfs_copy_buffer -- copy buffer data and flags + * @dbh: destination buffer + * @sbh: source buffer + */ +void nilfs_copy_buffer(struct buffer_head *dbh, struct buffer_head *sbh) +{ + void *kaddr0, *kaddr1; + unsigned long bits; + struct page *spage = sbh->b_page, *dpage = dbh->b_page; + struct buffer_head *bh; + + kaddr0 = kmap_atomic(spage, KM_USER0); + kaddr1 = kmap_atomic(dpage, KM_USER1); + memcpy(kaddr1 + bh_offset(dbh), kaddr0 + bh_offset(sbh), sbh->b_size); + kunmap_atomic(kaddr1, KM_USER1); + kunmap_atomic(kaddr0, KM_USER0); + + dbh->b_state = sbh->b_state & NILFS_BUFFER_INHERENT_BITS; + dbh->b_blocknr = sbh->b_blocknr; + dbh->b_bdev = sbh->b_bdev; + + bh = dbh; + bits = sbh->b_state & ((1UL << BH_Uptodate) | (1UL << BH_Mapped)); + while ((bh = bh->b_this_page) != dbh) { + lock_buffer(bh); + bits &= bh->b_state; + unlock_buffer(bh); + } + if (bits & (1UL << BH_Uptodate)) + SetPageUptodate(dpage); + else + ClearPageUptodate(dpage); + if (bits & (1UL << BH_Mapped)) + SetPageMappedToDisk(dpage); + else + ClearPageMappedToDisk(dpage); +} + +/** + * nilfs_page_buffers_clean - check if a page has dirty buffers or not. + * @page: page to be checked + * + * nilfs_page_buffers_clean() returns zero if the page has dirty buffers. + * Otherwise, it returns non-zero value. + */ +int nilfs_page_buffers_clean(struct page *page) +{ + struct buffer_head *bh, *head; + + bh = head = page_buffers(page); + do { + if (buffer_dirty(bh)) + return 0; + bh = bh->b_this_page; + } while (bh != head); + return 1; +} + +void nilfs_page_bug(struct page *page) +{ + struct address_space *m; + unsigned long ino = 0; + + if (unlikely(!page)) { + printk(KERN_CRIT "NILFS_PAGE_BUG(NULL)\n"); + return; + } + + m = page->mapping; + if (m) { + struct inode *inode = NILFS_AS_I(m); + if (inode != NULL) + ino = inode->i_ino; + } + printk(KERN_CRIT "NILFS_PAGE_BUG(%p): cnt=%d index#=%llu flags=0x%lx " + "mapping=%p ino=%lu\n", + page, atomic_read(&page->_count), + (unsigned long long)page->index, page->flags, m, ino); + + if (page_has_buffers(page)) { + struct buffer_head *bh, *head; + int i = 0; + + bh = head = page_buffers(page); + do { + printk(KERN_CRIT + " BH[%d] %p: cnt=%d block#=%llu state=0x%lx\n", + i++, bh, atomic_read(&bh->b_count), + (unsigned long long)bh->b_blocknr, bh->b_state); + bh = bh->b_this_page; + } while (bh != head); + } +} + +/** + * nilfs_alloc_private_page - allocate a private page with buffer heads + * + * Return Value: On success, a pointer to the allocated page is returned. + * On error, NULL is returned. + */ +struct page *nilfs_alloc_private_page(struct block_device *bdev, int size, + unsigned long state) +{ + struct buffer_head *bh, *head, *tail; + struct page *page; + + page = alloc_page(GFP_NOFS); /* page_count of the returned page is 1 */ + if (unlikely(!page)) + return NULL; + + lock_page(page); + head = alloc_page_buffers(page, size, 0); + if (unlikely(!head)) { + unlock_page(page); + __free_page(page); + return NULL; + } + + bh = head; + do { + bh->b_state = (1UL << BH_NILFS_Allocated) | state; + tail = bh; + bh->b_bdev = bdev; + bh = bh->b_this_page; + } while (bh); + + tail->b_this_page = head; + attach_page_buffers(page, head); + + return page; +} + +void nilfs_free_private_page(struct page *page) +{ + BUG_ON(!PageLocked(page)); + BUG_ON(page->mapping); + + if (page_has_buffers(page) && !try_to_free_buffers(page)) + NILFS_PAGE_BUG(page, "failed to free page"); + + unlock_page(page); + __free_page(page); +} + +/** + * nilfs_copy_page -- copy the page with buffers + * @dst: destination page + * @src: source page + * @copy_dirty: flag whether to copy dirty states on the page's buffer heads. + * + * This fuction is for both data pages and btnode pages. The dirty flag + * should be treated by caller. The page must not be under i/o. + * Both src and dst page must be locked + */ +static void nilfs_copy_page(struct page *dst, struct page *src, int copy_dirty) +{ + struct buffer_head *dbh, *dbufs, *sbh, *sbufs; + unsigned long mask = NILFS_BUFFER_INHERENT_BITS; + + BUG_ON(PageWriteback(dst)); + + sbh = sbufs = page_buffers(src); + if (!page_has_buffers(dst)) + create_empty_buffers(dst, sbh->b_size, 0); + + if (copy_dirty) + mask |= (1UL << BH_Dirty); + + dbh = dbufs = page_buffers(dst); + do { + lock_buffer(sbh); + lock_buffer(dbh); + dbh->b_state = sbh->b_state & mask; + dbh->b_blocknr = sbh->b_blocknr; + dbh->b_bdev = sbh->b_bdev; + sbh = sbh->b_this_page; + dbh = dbh->b_this_page; + } while (dbh != dbufs); + + copy_highpage(dst, src); + + if (PageUptodate(src) && !PageUptodate(dst)) + SetPageUptodate(dst); + else if (!PageUptodate(src) && PageUptodate(dst)) + ClearPageUptodate(dst); + if (PageMappedToDisk(src) && !PageMappedToDisk(dst)) + SetPageMappedToDisk(dst); + else if (!PageMappedToDisk(src) && PageMappedToDisk(dst)) + ClearPageMappedToDisk(dst); + + do { + unlock_buffer(sbh); + unlock_buffer(dbh); + sbh = sbh->b_this_page; + dbh = dbh->b_this_page; + } while (dbh != dbufs); +} + +int nilfs_copy_dirty_pages(struct address_space *dmap, + struct address_space *smap) +{ + struct pagevec pvec; + unsigned int i; + pgoff_t index = 0; + int err = 0; + + pagevec_init(&pvec, 0); +repeat: + if (!pagevec_lookup_tag(&pvec, smap, &index, PAGECACHE_TAG_DIRTY, + PAGEVEC_SIZE)) + return 0; + + for (i = 0; i < pagevec_count(&pvec); i++) { + struct page *page = pvec.pages[i], *dpage; + + lock_page(page); + if (unlikely(!PageDirty(page))) + NILFS_PAGE_BUG(page, "inconsistent dirty state"); + + dpage = grab_cache_page(dmap, page->index); + if (unlikely(!dpage)) { + /* No empty page is added to the page cache */ + err = -ENOMEM; + unlock_page(page); + break; + } + if (unlikely(!page_has_buffers(page))) + NILFS_PAGE_BUG(page, + "found empty page in dat page cache"); + + nilfs_copy_page(dpage, page, 1); + __set_page_dirty_nobuffers(dpage); + + unlock_page(dpage); + page_cache_release(dpage); + unlock_page(page); + } + pagevec_release(&pvec); + cond_resched(); + + if (likely(!err)) + goto repeat; + return err; +} + +/** + * nilfs_copy_back_pages -- copy back pages to orignal cache from shadow cache + * @dmap: destination page cache + * @smap: source page cache + * + * No pages must no be added to the cache during this process. + * This must be ensured by the caller. + */ +void nilfs_copy_back_pages(struct address_space *dmap, + struct address_space *smap) +{ + struct pagevec pvec; + unsigned int i, n; + pgoff_t index = 0; + int err; + + pagevec_init(&pvec, 0); +repeat: + n = pagevec_lookup(&pvec, smap, index, PAGEVEC_SIZE); + if (!n) + return; + index = pvec.pages[n - 1]->index + 1; + + for (i = 0; i < pagevec_count(&pvec); i++) { + struct page *page = pvec.pages[i], *dpage; + pgoff_t offset = page->index; + + lock_page(page); + dpage = find_lock_page(dmap, offset); + if (dpage) { + /* override existing page on the destination cache */ + WARN_ON(PageDirty(dpage)); + nilfs_copy_page(dpage, page, 0); + unlock_page(dpage); + page_cache_release(dpage); + } else { + struct page *page2; + + /* move the page to the destination cache */ + spin_lock_irq(&smap->tree_lock); + page2 = radix_tree_delete(&smap->page_tree, offset); + WARN_ON(page2 != page); + + smap->nrpages--; + spin_unlock_irq(&smap->tree_lock); + + spin_lock_irq(&dmap->tree_lock); + err = radix_tree_insert(&dmap->page_tree, offset, page); + if (unlikely(err < 0)) { + WARN_ON(err == -EEXIST); + page->mapping = NULL; + page_cache_release(page); /* for cache */ + } else { + page->mapping = dmap; + dmap->nrpages++; + if (PageDirty(page)) + radix_tree_tag_set(&dmap->page_tree, + offset, + PAGECACHE_TAG_DIRTY); + } + spin_unlock_irq(&dmap->tree_lock); + } + unlock_page(page); + } + pagevec_release(&pvec); + cond_resched(); + + goto repeat; +} + +void nilfs_clear_dirty_pages(struct address_space *mapping) +{ + struct pagevec pvec; + unsigned int i; + pgoff_t index = 0; + + pagevec_init(&pvec, 0); + + while (pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_DIRTY, + PAGEVEC_SIZE)) { + for (i = 0; i < pagevec_count(&pvec); i++) { + struct page *page = pvec.pages[i]; + struct buffer_head *bh, *head; + + lock_page(page); + ClearPageUptodate(page); + ClearPageMappedToDisk(page); + bh = head = page_buffers(page); + do { + lock_buffer(bh); + clear_buffer_dirty(bh); + clear_buffer_nilfs_volatile(bh); + clear_buffer_uptodate(bh); + clear_buffer_mapped(bh); + unlock_buffer(bh); + bh = bh->b_this_page; + } while (bh != head); + + __nilfs_clear_page_dirty(page); + unlock_page(page); + } + pagevec_release(&pvec); + cond_resched(); + } +} + +unsigned nilfs_page_count_clean_buffers(struct page *page, + unsigned from, unsigned to) +{ + unsigned block_start, block_end; + struct buffer_head *bh, *head; + unsigned nc = 0; + + for (bh = head = page_buffers(page), block_start = 0; + bh != head || !block_start; + block_start = block_end, bh = bh->b_this_page) { + block_end = block_start + bh->b_size; + if (block_end > from && block_start < to && !buffer_dirty(bh)) + nc++; + } + return nc; +} + +/* + * NILFS2 needs clear_page_dirty() in the following two cases: + * + * 1) For B-tree node pages and data pages of the dat/gcdat, NILFS2 clears + * page dirty flags when it copies back pages from the shadow cache + * (gcdat->{i_mapping,i_btnode_cache}) to its original cache + * (dat->{i_mapping,i_btnode_cache}). + * + * 2) Some B-tree operations like insertion or deletion may dispose buffers + * in dirty state, and this needs to cancel the dirty state of their pages. + */ +int __nilfs_clear_page_dirty(struct page *page) +{ + struct address_space *mapping = page->mapping; + + if (mapping) { + spin_lock_irq(&mapping->tree_lock); + if (test_bit(PG_dirty, &page->flags)) { + radix_tree_tag_clear(&mapping->page_tree, + page_index(page), + PAGECACHE_TAG_DIRTY); + spin_unlock_irq(&mapping->tree_lock); + return clear_page_dirty_for_io(page); + } + spin_unlock_irq(&mapping->tree_lock); + return 0; + } + return TestClearPageDirty(page); +} diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h new file mode 100644 index 000000000000..8abca4d1c1f8 --- /dev/null +++ b/fs/nilfs2/page.h @@ -0,0 +1,76 @@ +/* + * page.h - buffer/page management specific to NILFS + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi <ryusuke@osrg.net>, + * Seiji Kihara <kihara@osrg.net>. + */ + +#ifndef _NILFS_PAGE_H +#define _NILFS_PAGE_H + +#include <linux/buffer_head.h> +#include "nilfs.h" + +/* + * Extended buffer state bits + */ +enum { + BH_NILFS_Allocated = BH_PrivateStart, + BH_NILFS_Node, + BH_NILFS_Volatile, +}; + +BUFFER_FNS(NILFS_Allocated, nilfs_allocated) /* nilfs private buffers */ +BUFFER_FNS(NILFS_Node, nilfs_node) /* nilfs node buffers */ +BUFFER_FNS(NILFS_Volatile, nilfs_volatile) + + +void nilfs_mark_buffer_dirty(struct buffer_head *bh); +int __nilfs_clear_page_dirty(struct page *); + +struct buffer_head *nilfs_grab_buffer(struct inode *, struct address_space *, + unsigned long, unsigned long); +void nilfs_forget_buffer(struct buffer_head *); +void nilfs_copy_buffer(struct buffer_head *, struct buffer_head *); +int nilfs_page_buffers_clean(struct page *); +void nilfs_page_bug(struct page *); +struct page *nilfs_alloc_private_page(struct block_device *, int, + unsigned long); +void nilfs_free_private_page(struct page *); + +int nilfs_copy_dirty_pages(struct address_space *, struct address_space *); +void nilfs_copy_back_pages(struct address_space *, struct address_space *); +void nilfs_clear_dirty_pages(struct address_space *); +unsigned nilfs_page_count_clean_buffers(struct page *, unsigned, unsigned); + +#define NILFS_PAGE_BUG(page, m, a...) \ + do { nilfs_page_bug(page); BUG(); } while (0) + +static inline struct buffer_head * +nilfs_page_get_nth_block(struct page *page, unsigned int count) +{ + struct buffer_head *bh = page_buffers(page); + + while (count-- > 0) + bh = bh->b_this_page; + get_bh(bh); + return bh; +} + +#endif /* _NILFS_PAGE_H */ diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c new file mode 100644 index 000000000000..6ade0963fc1d --- /dev/null +++ b/fs/nilfs2/recovery.c @@ -0,0 +1,929 @@ +/* + * recovery.c - NILFS recovery logic + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi <ryusuke@osrg.net> + */ + +#include <linux/buffer_head.h> +#include <linux/blkdev.h> +#include <linux/swap.h> +#include <linux/crc32.h> +#include "nilfs.h" +#include "segment.h" +#include "sufile.h" +#include "page.h" +#include "seglist.h" +#include "segbuf.h" + +/* + * Segment check result + */ +enum { + NILFS_SEG_VALID, + NILFS_SEG_NO_SUPER_ROOT, + NILFS_SEG_FAIL_IO, + NILFS_SEG_FAIL_MAGIC, + NILFS_SEG_FAIL_SEQ, + NILFS_SEG_FAIL_CHECKSUM_SEGSUM, + NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT, + NILFS_SEG_FAIL_CHECKSUM_FULL, + NILFS_SEG_FAIL_CONSISTENCY, +}; + +/* work structure for recovery */ +struct nilfs_recovery_block { + ino_t ino; /* Inode number of the file that this block + belongs to */ + sector_t blocknr; /* block number */ + __u64 vblocknr; /* virtual block number */ + unsigned long blkoff; /* File offset of the data block (per block) */ + struct list_head list; +}; + + +static int nilfs_warn_segment_error(int err) +{ + switch (err) { + case NILFS_SEG_FAIL_IO: + printk(KERN_WARNING + "NILFS warning: I/O error on loading last segment\n"); + return -EIO; + case NILFS_SEG_FAIL_MAGIC: + printk(KERN_WARNING + "NILFS warning: Segment magic number invalid\n"); + break; + case NILFS_SEG_FAIL_SEQ: + printk(KERN_WARNING + "NILFS warning: Sequence number mismatch\n"); + break; + case NILFS_SEG_FAIL_CHECKSUM_SEGSUM: + printk(KERN_WARNING + "NILFS warning: Checksum error in segment summary\n"); + break; + case NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT: + printk(KERN_WARNING + "NILFS warning: Checksum error in super root\n"); + break; + case NILFS_SEG_FAIL_CHECKSUM_FULL: + printk(KERN_WARNING + "NILFS warning: Checksum error in segment payload\n"); + break; + case NILFS_SEG_FAIL_CONSISTENCY: + printk(KERN_WARNING + "NILFS warning: Inconsistent segment\n"); + break; + case NILFS_SEG_NO_SUPER_ROOT: + printk(KERN_WARNING + "NILFS warning: No super root in the last segment\n"); + break; + } + return -EINVAL; +} + +static void store_segsum_info(struct nilfs_segsum_info *ssi, + struct nilfs_segment_summary *sum, + unsigned int blocksize) +{ + ssi->flags = le16_to_cpu(sum->ss_flags); + ssi->seg_seq = le64_to_cpu(sum->ss_seq); + ssi->ctime = le64_to_cpu(sum->ss_create); + ssi->next = le64_to_cpu(sum->ss_next); + ssi->nblocks = le32_to_cpu(sum->ss_nblocks); + ssi->nfinfo = le32_to_cpu(sum->ss_nfinfo); + ssi->sumbytes = le32_to_cpu(sum->ss_sumbytes); + + ssi->nsumblk = DIV_ROUND_UP(ssi->sumbytes, blocksize); + ssi->nfileblk = ssi->nblocks - ssi->nsumblk - !!NILFS_SEG_HAS_SR(ssi); +} + +/** + * calc_crc_cont - check CRC of blocks continuously + * @sbi: nilfs_sb_info + * @bhs: buffer head of start block + * @sum: place to store result + * @offset: offset bytes in the first block + * @check_bytes: number of bytes to be checked + * @start: DBN of start block + * @nblock: number of blocks to be checked + */ +static int calc_crc_cont(struct nilfs_sb_info *sbi, struct buffer_head *bhs, + u32 *sum, unsigned long offset, u64 check_bytes, + sector_t start, unsigned long nblock) +{ + unsigned long blocksize = sbi->s_super->s_blocksize; + unsigned long size; + u32 crc; + + BUG_ON(offset >= blocksize); + check_bytes -= offset; + size = min_t(u64, check_bytes, blocksize - offset); + crc = crc32_le(sbi->s_nilfs->ns_crc_seed, + (unsigned char *)bhs->b_data + offset, size); + if (--nblock > 0) { + do { + struct buffer_head *bh + = sb_bread(sbi->s_super, ++start); + if (!bh) + return -EIO; + check_bytes -= size; + size = min_t(u64, check_bytes, blocksize); + crc = crc32_le(crc, bh->b_data, size); + brelse(bh); + } while (--nblock > 0); + } + *sum = crc; + return 0; +} + +/** + * nilfs_read_super_root_block - read super root block + * @sb: super_block + * @sr_block: disk block number of the super root block + * @pbh: address of a buffer_head pointer to return super root buffer + * @check: CRC check flag + */ +int nilfs_read_super_root_block(struct super_block *sb, sector_t sr_block, + struct buffer_head **pbh, int check) +{ + struct buffer_head *bh_sr; + struct nilfs_super_root *sr; + u32 crc; + int ret; + + *pbh = NULL; + bh_sr = sb_bread(sb, sr_block); + if (unlikely(!bh_sr)) { + ret = NILFS_SEG_FAIL_IO; + goto failed; + } + + sr = (struct nilfs_super_root *)bh_sr->b_data; + if (check) { + unsigned bytes = le16_to_cpu(sr->sr_bytes); + + if (bytes == 0 || bytes > sb->s_blocksize) { + ret = NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT; + goto failed_bh; + } + if (calc_crc_cont(NILFS_SB(sb), bh_sr, &crc, + sizeof(sr->sr_sum), bytes, sr_block, 1)) { + ret = NILFS_SEG_FAIL_IO; + goto failed_bh; + } + if (crc != le32_to_cpu(sr->sr_sum)) { + ret = NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT; + goto failed_bh; + } + } + *pbh = bh_sr; + return 0; + + failed_bh: + brelse(bh_sr); + + failed: + return nilfs_warn_segment_error(ret); +} + +/** + * load_segment_summary - read segment summary of the specified partial segment + * @sbi: nilfs_sb_info + * @pseg_start: start disk block number of partial segment + * @seg_seq: sequence number requested + * @ssi: pointer to nilfs_segsum_info struct to store information + * @full_check: full check flag + * (0: only checks segment summary CRC, 1: data CRC) + */ +static int +load_segment_summary(struct nilfs_sb_info *sbi, sector_t pseg_start, + u64 seg_seq, struct nilfs_segsum_info *ssi, + int full_check) +{ + struct buffer_head *bh_sum; + struct nilfs_segment_summary *sum; + unsigned long offset, nblock; + u64 check_bytes; + u32 crc, crc_sum; + int ret = NILFS_SEG_FAIL_IO; + + bh_sum = sb_bread(sbi->s_super, pseg_start); + if (!bh_sum) + goto out; + + sum = (struct nilfs_segment_summary *)bh_sum->b_data; + + /* Check consistency of segment summary */ + if (le32_to_cpu(sum->ss_magic) != NILFS_SEGSUM_MAGIC) { + ret = NILFS_SEG_FAIL_MAGIC; + goto failed; + } + store_segsum_info(ssi, sum, sbi->s_super->s_blocksize); + if (seg_seq != ssi->seg_seq) { + ret = NILFS_SEG_FAIL_SEQ; + goto failed; + } + if (full_check) { + offset = sizeof(sum->ss_datasum); + check_bytes = + ((u64)ssi->nblocks << sbi->s_super->s_blocksize_bits); + nblock = ssi->nblocks; + crc_sum = le32_to_cpu(sum->ss_datasum); + ret = NILFS_SEG_FAIL_CHECKSUM_FULL; + } else { /* only checks segment summary */ + offset = sizeof(sum->ss_datasum) + sizeof(sum->ss_sumsum); + check_bytes = ssi->sumbytes; + nblock = ssi->nsumblk; + crc_sum = le32_to_cpu(sum->ss_sumsum); + ret = NILFS_SEG_FAIL_CHECKSUM_SEGSUM; + } + + if (unlikely(nblock == 0 || + nblock > sbi->s_nilfs->ns_blocks_per_segment)) { + /* This limits the number of blocks read in the CRC check */ + ret = NILFS_SEG_FAIL_CONSISTENCY; + goto failed; + } + if (calc_crc_cont(sbi, bh_sum, &crc, offset, check_bytes, + pseg_start, nblock)) { + ret = NILFS_SEG_FAIL_IO; + goto failed; + } + if (crc == crc_sum) + ret = 0; + failed: + brelse(bh_sum); + out: + return ret; +} + +static void *segsum_get(struct super_block *sb, struct buffer_head **pbh, + unsigned int *offset, unsigned int bytes) +{ + void *ptr; + sector_t blocknr; + + BUG_ON((*pbh)->b_size < *offset); + if (bytes > (*pbh)->b_size - *offset) { + blocknr = (*pbh)->b_blocknr; + brelse(*pbh); + *pbh = sb_bread(sb, blocknr + 1); + if (unlikely(!*pbh)) + return NULL; + *offset = 0; + } + ptr = (*pbh)->b_data + *offset; + *offset += bytes; + return ptr; +} + +static void segsum_skip(struct super_block *sb, struct buffer_head **pbh, + unsigned int *offset, unsigned int bytes, + unsigned long count) +{ + unsigned int rest_item_in_current_block + = ((*pbh)->b_size - *offset) / bytes; + + if (count <= rest_item_in_current_block) { + *offset += bytes * count; + } else { + sector_t blocknr = (*pbh)->b_blocknr; + unsigned int nitem_per_block = (*pbh)->b_size / bytes; + unsigned int bcnt; + + count -= rest_item_in_current_block; + bcnt = DIV_ROUND_UP(count, nitem_per_block); + *offset = bytes * (count - (bcnt - 1) * nitem_per_block); + + brelse(*pbh); + *pbh = sb_bread(sb, blocknr + bcnt); + } +} + +static int +collect_blocks_from_segsum(struct nilfs_sb_info *sbi, sector_t sum_blocknr, + struct nilfs_segsum_info *ssi, + struct list_head *head) +{ + struct buffer_head *bh; + unsigned int offset; + unsigned long nfinfo = ssi->nfinfo; + sector_t blocknr = sum_blocknr + ssi->nsumblk; + ino_t ino; + int err = -EIO; + + if (!nfinfo) + return 0; + + bh = sb_bread(sbi->s_super, sum_blocknr); + if (unlikely(!bh)) + goto out; + + offset = le16_to_cpu( + ((struct nilfs_segment_summary *)bh->b_data)->ss_bytes); + for (;;) { + unsigned long nblocks, ndatablk, nnodeblk; + struct nilfs_finfo *finfo; + + finfo = segsum_get(sbi->s_super, &bh, &offset, sizeof(*finfo)); + if (unlikely(!finfo)) + goto out; + + ino = le64_to_cpu(finfo->fi_ino); + nblocks = le32_to_cpu(finfo->fi_nblocks); + ndatablk = le32_to_cpu(finfo->fi_ndatablk); + nnodeblk = nblocks - ndatablk; + + while (ndatablk-- > 0) { + struct nilfs_recovery_block *rb; + struct nilfs_binfo_v *binfo; + + binfo = segsum_get(sbi->s_super, &bh, &offset, + sizeof(*binfo)); + if (unlikely(!binfo)) + goto out; + + rb = kmalloc(sizeof(*rb), GFP_NOFS); + if (unlikely(!rb)) { + err = -ENOMEM; + goto out; + } + rb->ino = ino; + rb->blocknr = blocknr++; + rb->vblocknr = le64_to_cpu(binfo->bi_vblocknr); + rb->blkoff = le64_to_cpu(binfo->bi_blkoff); + /* INIT_LIST_HEAD(&rb->list); */ + list_add_tail(&rb->list, head); + } + if (--nfinfo == 0) + break; + blocknr += nnodeblk; /* always 0 for the data sync segments */ + segsum_skip(sbi->s_super, &bh, &offset, sizeof(__le64), + nnodeblk); + if (unlikely(!bh)) + goto out; + } + err = 0; + out: + brelse(bh); /* brelse(NULL) is just ignored */ + return err; +} + +static void dispose_recovery_list(struct list_head *head) +{ + while (!list_empty(head)) { + struct nilfs_recovery_block *rb + = list_entry(head->next, + struct nilfs_recovery_block, list); + list_del(&rb->list); + kfree(rb); + } +} + +void nilfs_dispose_segment_list(struct list_head *head) +{ + while (!list_empty(head)) { + struct nilfs_segment_entry *ent + = list_entry(head->next, + struct nilfs_segment_entry, list); + list_del(&ent->list); + nilfs_free_segment_entry(ent); + } +} + +static int nilfs_prepare_segment_for_recovery(struct the_nilfs *nilfs, + struct nilfs_recovery_info *ri) +{ + struct list_head *head = &ri->ri_used_segments; + struct nilfs_segment_entry *ent, *n; + struct inode *sufile = nilfs->ns_sufile; + __u64 segnum[4]; + time_t mtime; + int err; + int i; + + segnum[0] = nilfs->ns_segnum; + segnum[1] = nilfs->ns_nextnum; + segnum[2] = ri->ri_segnum; + segnum[3] = ri->ri_nextnum; + + /* + * Releasing the next segment of the latest super root. + * The next segment is invalidated by this recovery. + */ + err = nilfs_sufile_free(sufile, segnum[1]); + if (unlikely(err)) + goto failed; + + err = -ENOMEM; + for (i = 1; i < 4; i++) { + ent = nilfs_alloc_segment_entry(segnum[i]); + if (unlikely(!ent)) + goto failed; + list_add_tail(&ent->list, head); + } + + /* + * Collecting segments written after the latest super root. + * These are marked dirty to avoid being reallocated in the next write. + */ + mtime = get_seconds(); + list_for_each_entry_safe(ent, n, head, list) { + if (ent->segnum == segnum[0]) { + list_del(&ent->list); + nilfs_free_segment_entry(ent); + continue; + } + err = nilfs_open_segment_entry(ent, sufile); + if (unlikely(err)) + goto failed; + if (!nilfs_segment_usage_dirty(ent->raw_su)) { + /* make the segment garbage */ + ent->raw_su->su_nblocks = cpu_to_le32(0); + ent->raw_su->su_lastmod = cpu_to_le32(mtime); + nilfs_segment_usage_set_dirty(ent->raw_su); + } + list_del(&ent->list); + nilfs_close_segment_entry(ent, sufile); + nilfs_free_segment_entry(ent); + } + + /* Allocate new segments for recovery */ + err = nilfs_sufile_alloc(sufile, &segnum[0]); + if (unlikely(err)) + goto failed; + + nilfs->ns_pseg_offset = 0; + nilfs->ns_seg_seq = ri->ri_seq + 2; + nilfs->ns_nextnum = nilfs->ns_segnum = segnum[0]; + return 0; + + failed: + /* No need to recover sufile because it will be destroyed on error */ + return err; +} + +static int nilfs_recovery_copy_block(struct nilfs_sb_info *sbi, + struct nilfs_recovery_block *rb, + struct page *page) +{ + struct buffer_head *bh_org; + void *kaddr; + + bh_org = sb_bread(sbi->s_super, rb->blocknr); + if (unlikely(!bh_org)) + return -EIO; + + kaddr = kmap_atomic(page, KM_USER0); + memcpy(kaddr + bh_offset(bh_org), bh_org->b_data, bh_org->b_size); + kunmap_atomic(kaddr, KM_USER0); + brelse(bh_org); + return 0; +} + +static int recover_dsync_blocks(struct nilfs_sb_info *sbi, + struct list_head *head, + unsigned long *nr_salvaged_blocks) +{ + struct inode *inode; + struct nilfs_recovery_block *rb, *n; + unsigned blocksize = sbi->s_super->s_blocksize; + struct page *page; + loff_t pos; + int err = 0, err2 = 0; + + list_for_each_entry_safe(rb, n, head, list) { + inode = nilfs_iget(sbi->s_super, rb->ino); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + inode = NULL; + goto failed_inode; + } + + pos = rb->blkoff << inode->i_blkbits; + page = NULL; + err = block_write_begin(NULL, inode->i_mapping, pos, blocksize, + 0, &page, NULL, nilfs_get_block); + if (unlikely(err)) + goto failed_inode; + + err = nilfs_recovery_copy_block(sbi, rb, page); + if (unlikely(err)) + goto failed_page; + + err = nilfs_set_file_dirty(sbi, inode, 1); + if (unlikely(err)) + goto failed_page; + + block_write_end(NULL, inode->i_mapping, pos, blocksize, + blocksize, page, NULL); + + unlock_page(page); + page_cache_release(page); + + (*nr_salvaged_blocks)++; + goto next; + + failed_page: + unlock_page(page); + page_cache_release(page); + + failed_inode: + printk(KERN_WARNING + "NILFS warning: error recovering data block " + "(err=%d, ino=%lu, block-offset=%llu)\n", + err, rb->ino, (unsigned long long)rb->blkoff); + if (!err2) + err2 = err; + next: + iput(inode); /* iput(NULL) is just ignored */ + list_del_init(&rb->list); + kfree(rb); + } + return err2; +} + +/** + * nilfs_do_roll_forward - salvage logical segments newer than the latest + * checkpoint + * @sbi: nilfs_sb_info + * @nilfs: the_nilfs + * @ri: pointer to a nilfs_recovery_info + */ +static int nilfs_do_roll_forward(struct the_nilfs *nilfs, + struct nilfs_sb_info *sbi, + struct nilfs_recovery_info *ri) +{ + struct nilfs_segsum_info ssi; + sector_t pseg_start; + sector_t seg_start, seg_end; /* Starting/ending DBN of full segment */ + unsigned long nsalvaged_blocks = 0; + u64 seg_seq; + __u64 segnum, nextnum = 0; + int empty_seg = 0; + int err = 0, ret; + LIST_HEAD(dsync_blocks); /* list of data blocks to be recovered */ + enum { + RF_INIT_ST, + RF_DSYNC_ST, /* scanning data-sync segments */ + }; + int state = RF_INIT_ST; + + nilfs_attach_writer(nilfs, sbi); + pseg_start = ri->ri_lsegs_start; + seg_seq = ri->ri_lsegs_start_seq; + segnum = nilfs_get_segnum_of_block(nilfs, pseg_start); + nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end); + + while (segnum != ri->ri_segnum || pseg_start <= ri->ri_pseg_start) { + + ret = load_segment_summary(sbi, pseg_start, seg_seq, &ssi, 1); + if (ret) { + if (ret == NILFS_SEG_FAIL_IO) { + err = -EIO; + goto failed; + } + goto strayed; + } + if (unlikely(NILFS_SEG_HAS_SR(&ssi))) + goto confused; + + /* Found a valid partial segment; do recovery actions */ + nextnum = nilfs_get_segnum_of_block(nilfs, ssi.next); + empty_seg = 0; + nilfs->ns_ctime = ssi.ctime; + if (!(ssi.flags & NILFS_SS_GC)) + nilfs->ns_nongc_ctime = ssi.ctime; + + switch (state) { + case RF_INIT_ST: + if (!NILFS_SEG_LOGBGN(&ssi) || !NILFS_SEG_DSYNC(&ssi)) + goto try_next_pseg; + state = RF_DSYNC_ST; + /* Fall through */ + case RF_DSYNC_ST: + if (!NILFS_SEG_DSYNC(&ssi)) + goto confused; + + err = collect_blocks_from_segsum( + sbi, pseg_start, &ssi, &dsync_blocks); + if (unlikely(err)) + goto failed; + if (NILFS_SEG_LOGEND(&ssi)) { + err = recover_dsync_blocks( + sbi, &dsync_blocks, &nsalvaged_blocks); + if (unlikely(err)) + goto failed; + state = RF_INIT_ST; + } + break; /* Fall through to try_next_pseg */ + } + + try_next_pseg: + if (pseg_start == ri->ri_lsegs_end) + break; + pseg_start += ssi.nblocks; + if (pseg_start < seg_end) + continue; + goto feed_segment; + + strayed: + if (pseg_start == ri->ri_lsegs_end) + break; + + feed_segment: + /* Looking to the next full segment */ + if (empty_seg++) + break; + seg_seq++; + segnum = nextnum; + nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end); + pseg_start = seg_start; + } + + if (nsalvaged_blocks) { + printk(KERN_INFO "NILFS (device %s): salvaged %lu blocks\n", + sbi->s_super->s_id, nsalvaged_blocks); + ri->ri_need_recovery = NILFS_RECOVERY_ROLLFORWARD_DONE; + } + out: + dispose_recovery_list(&dsync_blocks); + nilfs_detach_writer(sbi->s_nilfs, sbi); + return err; + + confused: + err = -EINVAL; + failed: + printk(KERN_ERR + "NILFS (device %s): Error roll-forwarding " + "(err=%d, pseg block=%llu). ", + sbi->s_super->s_id, err, (unsigned long long)pseg_start); + goto out; +} + +static void nilfs_finish_roll_forward(struct the_nilfs *nilfs, + struct nilfs_sb_info *sbi, + struct nilfs_recovery_info *ri) +{ + struct buffer_head *bh; + int err; + + if (nilfs_get_segnum_of_block(nilfs, ri->ri_lsegs_start) != + nilfs_get_segnum_of_block(nilfs, ri->ri_super_root)) + return; + + bh = sb_getblk(sbi->s_super, ri->ri_lsegs_start); + BUG_ON(!bh); + memset(bh->b_data, 0, bh->b_size); + set_buffer_dirty(bh); + err = sync_dirty_buffer(bh); + if (unlikely(err)) + printk(KERN_WARNING + "NILFS warning: buffer sync write failed during " + "post-cleaning of recovery.\n"); + brelse(bh); +} + +/** + * nilfs_recover_logical_segments - salvage logical segments written after + * the latest super root + * @nilfs: the_nilfs + * @sbi: nilfs_sb_info + * @ri: pointer to a nilfs_recovery_info struct to store search results. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error code is returned. + * + * %-EINVAL - Inconsistent filesystem state. + * + * %-EIO - I/O error + * + * %-ENOSPC - No space left on device (only in a panic state). + * + * %-ERESTARTSYS - Interrupted. + * + * %-ENOMEM - Insufficient memory available. + */ +int nilfs_recover_logical_segments(struct the_nilfs *nilfs, + struct nilfs_sb_info *sbi, + struct nilfs_recovery_info *ri) +{ + int err; + + if (ri->ri_lsegs_start == 0 || ri->ri_lsegs_end == 0) + return 0; + + err = nilfs_attach_checkpoint(sbi, ri->ri_cno); + if (unlikely(err)) { + printk(KERN_ERR + "NILFS: error loading the latest checkpoint.\n"); + return err; + } + + err = nilfs_do_roll_forward(nilfs, sbi, ri); + if (unlikely(err)) + goto failed; + + if (ri->ri_need_recovery == NILFS_RECOVERY_ROLLFORWARD_DONE) { + err = nilfs_prepare_segment_for_recovery(nilfs, ri); + if (unlikely(err)) { + printk(KERN_ERR "NILFS: Error preparing segments for " + "recovery.\n"); + goto failed; + } + + err = nilfs_attach_segment_constructor(sbi); + if (unlikely(err)) + goto failed; + + set_nilfs_discontinued(nilfs); + err = nilfs_construct_segment(sbi->s_super); + nilfs_detach_segment_constructor(sbi); + + if (unlikely(err)) { + printk(KERN_ERR "NILFS: Oops! recovery failed. " + "(err=%d)\n", err); + goto failed; + } + + nilfs_finish_roll_forward(nilfs, sbi, ri); + } + + nilfs_detach_checkpoint(sbi); + return 0; + + failed: + nilfs_detach_checkpoint(sbi); + nilfs_mdt_clear(nilfs->ns_cpfile); + nilfs_mdt_clear(nilfs->ns_sufile); + nilfs_mdt_clear(nilfs->ns_dat); + return err; +} + +/** + * nilfs_search_super_root - search the latest valid super root + * @nilfs: the_nilfs + * @sbi: nilfs_sb_info + * @ri: pointer to a nilfs_recovery_info struct to store search results. + * + * nilfs_search_super_root() looks for the latest super-root from a partial + * segment pointed by the superblock. It sets up struct the_nilfs through + * this search. It fills nilfs_recovery_info (ri) required for recovery. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error code is returned. + * + * %-EINVAL - No valid segment found + * + * %-EIO - I/O error + */ +int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, + struct nilfs_recovery_info *ri) +{ + struct nilfs_segsum_info ssi; + sector_t pseg_start, pseg_end, sr_pseg_start = 0; + sector_t seg_start, seg_end; /* range of full segment (block number) */ + u64 seg_seq; + __u64 segnum, nextnum = 0; + __u64 cno; + struct nilfs_segment_entry *ent; + LIST_HEAD(segments); + int empty_seg = 0, scan_newer = 0; + int ret; + + pseg_start = nilfs->ns_last_pseg; + seg_seq = nilfs->ns_last_seq; + cno = nilfs->ns_last_cno; + segnum = nilfs_get_segnum_of_block(nilfs, pseg_start); + + /* Calculate range of segment */ + nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end); + + for (;;) { + /* Load segment summary */ + ret = load_segment_summary(sbi, pseg_start, seg_seq, &ssi, 1); + if (ret) { + if (ret == NILFS_SEG_FAIL_IO) + goto failed; + goto strayed; + } + pseg_end = pseg_start + ssi.nblocks - 1; + if (unlikely(pseg_end > seg_end)) { + ret = NILFS_SEG_FAIL_CONSISTENCY; + goto strayed; + } + + /* A valid partial segment */ + ri->ri_pseg_start = pseg_start; + ri->ri_seq = seg_seq; + ri->ri_segnum = segnum; + nextnum = nilfs_get_segnum_of_block(nilfs, ssi.next); + ri->ri_nextnum = nextnum; + empty_seg = 0; + + if (!NILFS_SEG_HAS_SR(&ssi)) { + if (!scan_newer) { + /* This will never happen because a superblock + (last_segment) always points to a pseg + having a super root. */ + ret = NILFS_SEG_FAIL_CONSISTENCY; + goto failed; + } + if (!ri->ri_lsegs_start && NILFS_SEG_LOGBGN(&ssi)) { + ri->ri_lsegs_start = pseg_start; + ri->ri_lsegs_start_seq = seg_seq; + } + if (NILFS_SEG_LOGEND(&ssi)) + ri->ri_lsegs_end = pseg_start; + goto try_next_pseg; + } + + /* A valid super root was found. */ + ri->ri_cno = cno++; + ri->ri_super_root = pseg_end; + ri->ri_lsegs_start = ri->ri_lsegs_end = 0; + + nilfs_dispose_segment_list(&segments); + nilfs->ns_pseg_offset = (sr_pseg_start = pseg_start) + + ssi.nblocks - seg_start; + nilfs->ns_seg_seq = seg_seq; + nilfs->ns_segnum = segnum; + nilfs->ns_cno = cno; /* nilfs->ns_cno = ri->ri_cno + 1 */ + nilfs->ns_ctime = ssi.ctime; + nilfs->ns_nextnum = nextnum; + + if (scan_newer) + ri->ri_need_recovery = NILFS_RECOVERY_SR_UPDATED; + else { + if (nilfs->ns_mount_state & NILFS_VALID_FS) + goto super_root_found; + scan_newer = 1; + } + + /* reset region for roll-forward */ + pseg_start += ssi.nblocks; + if (pseg_start < seg_end) + continue; + goto feed_segment; + + try_next_pseg: + /* Standing on a course, or met an inconsistent state */ + pseg_start += ssi.nblocks; + if (pseg_start < seg_end) + continue; + goto feed_segment; + + strayed: + /* Off the trail */ + if (!scan_newer) + /* + * This can happen if a checkpoint was written without + * barriers, or as a result of an I/O failure. + */ + goto failed; + + feed_segment: + /* Looking to the next full segment */ + if (empty_seg++) + goto super_root_found; /* found a valid super root */ + + ent = nilfs_alloc_segment_entry(segnum); + if (unlikely(!ent)) { + ret = -ENOMEM; + goto failed; + } + list_add_tail(&ent->list, &segments); + + seg_seq++; + segnum = nextnum; + nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end); + pseg_start = seg_start; + } + + super_root_found: + /* Updating pointers relating to the latest checkpoint */ + list_splice(&segments, ri->ri_used_segments.prev); + nilfs->ns_last_pseg = sr_pseg_start; + nilfs->ns_last_seq = nilfs->ns_seg_seq; + nilfs->ns_last_cno = ri->ri_cno; + return 0; + + failed: + nilfs_dispose_segment_list(&segments); + return (ret < 0) ? ret : nilfs_warn_segment_error(ret); +} diff --git a/fs/nilfs2/sb.h b/fs/nilfs2/sb.h new file mode 100644 index 000000000000..adccd4fc654e --- /dev/null +++ b/fs/nilfs2/sb.h @@ -0,0 +1,102 @@ +/* + * sb.h - NILFS on-memory super block structure. + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi <ryusuke@osrg.net> + * + */ + +#ifndef _NILFS_SB +#define _NILFS_SB + +#include <linux/types.h> +#include <linux/fs.h> + +/* + * Mount options + */ +struct nilfs_mount_options { + unsigned long mount_opt; + __u64 snapshot_cno; +}; + +struct the_nilfs; +struct nilfs_sc_info; + +/* + * NILFS super-block data in memory + */ +struct nilfs_sb_info { + /* Snapshot status */ + __u64 s_snapshot_cno; /* Checkpoint number */ + atomic_t s_inodes_count; + atomic_t s_blocks_count; /* Reserved (might be deleted) */ + + /* Mount options */ + unsigned long s_mount_opt; + uid_t s_resuid; + gid_t s_resgid; + + unsigned long s_interval; /* construction interval */ + unsigned long s_watermark; /* threshold of data amount + for the segment construction */ + + /* Fundamental members */ + struct super_block *s_super; /* reverse pointer to super_block */ + struct the_nilfs *s_nilfs; + struct list_head s_list; /* list head for nilfs->ns_supers */ + + /* Segment constructor */ + struct list_head s_dirty_files; /* dirty files list */ + struct nilfs_sc_info *s_sc_info; /* segment constructor info */ + spinlock_t s_inode_lock; /* Lock for the nilfs inode. + It covers s_dirty_files list */ + + /* Metadata files */ + struct inode *s_ifile; /* index file inode */ + + /* Inode allocator */ + spinlock_t s_next_gen_lock; + u32 s_next_generation; +}; + +static inline struct nilfs_sb_info *NILFS_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} + +static inline struct nilfs_sc_info *NILFS_SC(struct nilfs_sb_info *sbi) +{ + return sbi->s_sc_info; +} + +/* + * Bit operations for the mount option + */ +#define nilfs_clear_opt(sbi, opt) \ + do { (sbi)->s_mount_opt &= ~NILFS_MOUNT_##opt; } while (0) +#define nilfs_set_opt(sbi, opt) \ + do { (sbi)->s_mount_opt |= NILFS_MOUNT_##opt; } while (0) +#define nilfs_test_opt(sbi, opt) ((sbi)->s_mount_opt & NILFS_MOUNT_##opt) +#define nilfs_write_opt(sbi, mask, opt) \ + do { (sbi)->s_mount_opt = \ + (((sbi)->s_mount_opt & ~NILFS_MOUNT_##mask) | \ + NILFS_MOUNT_##opt); \ + } while (0) + +#endif /* _NILFS_SB */ diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c new file mode 100644 index 000000000000..1e68821b4a9b --- /dev/null +++ b/fs/nilfs2/segbuf.c @@ -0,0 +1,439 @@ +/* + * segbuf.c - NILFS segment buffer + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi <ryusuke@osrg.net> + * + */ + +#include <linux/buffer_head.h> +#include <linux/writeback.h> +#include <linux/crc32.h> +#include "page.h" +#include "segbuf.h" +#include "seglist.h" + + +static struct kmem_cache *nilfs_segbuf_cachep; + +static void nilfs_segbuf_init_once(void *obj) +{ + memset(obj, 0, sizeof(struct nilfs_segment_buffer)); +} + +int __init nilfs_init_segbuf_cache(void) +{ + nilfs_segbuf_cachep = + kmem_cache_create("nilfs2_segbuf_cache", + sizeof(struct nilfs_segment_buffer), + 0, SLAB_RECLAIM_ACCOUNT, + nilfs_segbuf_init_once); + + return (nilfs_segbuf_cachep == NULL) ? -ENOMEM : 0; +} + +void nilfs_destroy_segbuf_cache(void) +{ + kmem_cache_destroy(nilfs_segbuf_cachep); +} + +struct nilfs_segment_buffer *nilfs_segbuf_new(struct super_block *sb) +{ + struct nilfs_segment_buffer *segbuf; + + segbuf = kmem_cache_alloc(nilfs_segbuf_cachep, GFP_NOFS); + if (unlikely(!segbuf)) + return NULL; + + segbuf->sb_super = sb; + INIT_LIST_HEAD(&segbuf->sb_list); + INIT_LIST_HEAD(&segbuf->sb_segsum_buffers); + INIT_LIST_HEAD(&segbuf->sb_payload_buffers); + return segbuf; +} + +void nilfs_segbuf_free(struct nilfs_segment_buffer *segbuf) +{ + kmem_cache_free(nilfs_segbuf_cachep, segbuf); +} + +void nilfs_segbuf_map(struct nilfs_segment_buffer *segbuf, __u64 segnum, + unsigned long offset, struct the_nilfs *nilfs) +{ + segbuf->sb_segnum = segnum; + nilfs_get_segment_range(nilfs, segnum, &segbuf->sb_fseg_start, + &segbuf->sb_fseg_end); + + segbuf->sb_pseg_start = segbuf->sb_fseg_start + offset; + segbuf->sb_rest_blocks = + segbuf->sb_fseg_end - segbuf->sb_pseg_start + 1; +} + +void nilfs_segbuf_set_next_segnum(struct nilfs_segment_buffer *segbuf, + __u64 nextnum, struct the_nilfs *nilfs) +{ + segbuf->sb_nextnum = nextnum; + segbuf->sb_sum.next = nilfs_get_segment_start_blocknr(nilfs, nextnum); +} + +int nilfs_segbuf_extend_segsum(struct nilfs_segment_buffer *segbuf) +{ + struct buffer_head *bh; + + bh = sb_getblk(segbuf->sb_super, + segbuf->sb_pseg_start + segbuf->sb_sum.nsumblk); + if (unlikely(!bh)) + return -ENOMEM; + + nilfs_segbuf_add_segsum_buffer(segbuf, bh); + return 0; +} + +int nilfs_segbuf_extend_payload(struct nilfs_segment_buffer *segbuf, + struct buffer_head **bhp) +{ + struct buffer_head *bh; + + bh = sb_getblk(segbuf->sb_super, + segbuf->sb_pseg_start + segbuf->sb_sum.nblocks); + if (unlikely(!bh)) + return -ENOMEM; + + nilfs_segbuf_add_payload_buffer(segbuf, bh); + *bhp = bh; + return 0; +} + +int nilfs_segbuf_reset(struct nilfs_segment_buffer *segbuf, unsigned flags, + time_t ctime) +{ + int err; + + segbuf->sb_sum.nblocks = segbuf->sb_sum.nsumblk = 0; + err = nilfs_segbuf_extend_segsum(segbuf); + if (unlikely(err)) + return err; + + segbuf->sb_sum.flags = flags; + segbuf->sb_sum.sumbytes = sizeof(struct nilfs_segment_summary); + segbuf->sb_sum.nfinfo = segbuf->sb_sum.nfileblk = 0; + segbuf->sb_sum.ctime = ctime; + + segbuf->sb_io_error = 0; + return 0; +} + +/* + * Setup segument summary + */ +void nilfs_segbuf_fill_in_segsum(struct nilfs_segment_buffer *segbuf) +{ + struct nilfs_segment_summary *raw_sum; + struct buffer_head *bh_sum; + + bh_sum = list_entry(segbuf->sb_segsum_buffers.next, + struct buffer_head, b_assoc_buffers); + raw_sum = (struct nilfs_segment_summary *)bh_sum->b_data; + + raw_sum->ss_magic = cpu_to_le32(NILFS_SEGSUM_MAGIC); + raw_sum->ss_bytes = cpu_to_le16(sizeof(*raw_sum)); + raw_sum->ss_flags = cpu_to_le16(segbuf->sb_sum.flags); + raw_sum->ss_seq = cpu_to_le64(segbuf->sb_sum.seg_seq); + raw_sum->ss_create = cpu_to_le64(segbuf->sb_sum.ctime); + raw_sum->ss_next = cpu_to_le64(segbuf->sb_sum.next); + raw_sum->ss_nblocks = cpu_to_le32(segbuf->sb_sum.nblocks); + raw_sum->ss_nfinfo = cpu_to_le32(segbuf->sb_sum.nfinfo); + raw_sum->ss_sumbytes = cpu_to_le32(segbuf->sb_sum.sumbytes); + raw_sum->ss_pad = 0; +} + +/* + * CRC calculation routines + */ +void nilfs_segbuf_fill_in_segsum_crc(struct nilfs_segment_buffer *segbuf, + u32 seed) +{ + struct buffer_head *bh; + struct nilfs_segment_summary *raw_sum; + unsigned long size, bytes = segbuf->sb_sum.sumbytes; + u32 crc; + + bh = list_entry(segbuf->sb_segsum_buffers.next, struct buffer_head, + b_assoc_buffers); + + raw_sum = (struct nilfs_segment_summary *)bh->b_data; + size = min_t(unsigned long, bytes, bh->b_size); + crc = crc32_le(seed, + (unsigned char *)raw_sum + + sizeof(raw_sum->ss_datasum) + sizeof(raw_sum->ss_sumsum), + size - (sizeof(raw_sum->ss_datasum) + + sizeof(raw_sum->ss_sumsum))); + + list_for_each_entry_continue(bh, &segbuf->sb_segsum_buffers, + b_assoc_buffers) { + bytes -= size; + size = min_t(unsigned long, bytes, bh->b_size); + crc = crc32_le(crc, bh->b_data, size); + } + raw_sum->ss_sumsum = cpu_to_le32(crc); +} + +void nilfs_segbuf_fill_in_data_crc(struct nilfs_segment_buffer *segbuf, + u32 seed) +{ + struct buffer_head *bh; + struct nilfs_segment_summary *raw_sum; + void *kaddr; + u32 crc; + + bh = list_entry(segbuf->sb_segsum_buffers.next, struct buffer_head, + b_assoc_buffers); + raw_sum = (struct nilfs_segment_summary *)bh->b_data; + crc = crc32_le(seed, + (unsigned char *)raw_sum + sizeof(raw_sum->ss_datasum), + bh->b_size - sizeof(raw_sum->ss_datasum)); + + list_for_each_entry_continue(bh, &segbuf->sb_segsum_buffers, + b_assoc_buffers) { + crc = crc32_le(crc, bh->b_data, bh->b_size); + } + list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) { + kaddr = kmap_atomic(bh->b_page, KM_USER0); + crc = crc32_le(crc, kaddr + bh_offset(bh), bh->b_size); + kunmap_atomic(kaddr, KM_USER0); + } + raw_sum->ss_datasum = cpu_to_le32(crc); +} + +void nilfs_release_buffers(struct list_head *list) +{ + struct buffer_head *bh, *n; + + list_for_each_entry_safe(bh, n, list, b_assoc_buffers) { + list_del_init(&bh->b_assoc_buffers); + if (buffer_nilfs_allocated(bh)) { + struct page *clone_page = bh->b_page; + + /* remove clone page */ + brelse(bh); + page_cache_release(clone_page); /* for each bh */ + if (page_count(clone_page) <= 2) { + lock_page(clone_page); + nilfs_free_private_page(clone_page); + } + continue; + } + brelse(bh); + } +} + +/* + * BIO operations + */ +static void nilfs_end_bio_write(struct bio *bio, int err) +{ + const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + struct nilfs_write_info *wi = bio->bi_private; + + if (err == -EOPNOTSUPP) { + set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); + bio_put(bio); + /* to be detected by submit_seg_bio() */ + } + + if (!uptodate) + atomic_inc(&wi->err); + + bio_put(bio); + complete(&wi->bio_event); +} + +static int nilfs_submit_seg_bio(struct nilfs_write_info *wi, int mode) +{ + struct bio *bio = wi->bio; + int err; + + if (wi->nbio > 0 && bdi_write_congested(wi->bdi)) { + wait_for_completion(&wi->bio_event); + wi->nbio--; + if (unlikely(atomic_read(&wi->err))) { + bio_put(bio); + err = -EIO; + goto failed; + } + } + + bio->bi_end_io = nilfs_end_bio_write; + bio->bi_private = wi; + bio_get(bio); + submit_bio(mode, bio); + if (bio_flagged(bio, BIO_EOPNOTSUPP)) { + bio_put(bio); + err = -EOPNOTSUPP; + goto failed; + } + wi->nbio++; + bio_put(bio); + + wi->bio = NULL; + wi->rest_blocks -= wi->end - wi->start; + wi->nr_vecs = min(wi->max_pages, wi->rest_blocks); + wi->start = wi->end; + return 0; + + failed: + wi->bio = NULL; + return err; +} + +/** + * nilfs_alloc_seg_bio - allocate a bio for writing segment. + * @sb: super block + * @start: beginning disk block number of this BIO. + * @nr_vecs: request size of page vector. + * + * alloc_seg_bio() allocates a new BIO structure and initialize it. + * + * Return Value: On success, pointer to the struct bio is returned. + * On error, NULL is returned. + */ +static struct bio *nilfs_alloc_seg_bio(struct super_block *sb, sector_t start, + int nr_vecs) +{ + struct bio *bio; + + bio = bio_alloc(GFP_NOWAIT, nr_vecs); + if (bio == NULL) { + while (!bio && (nr_vecs >>= 1)) + bio = bio_alloc(GFP_NOWAIT, nr_vecs); + } + if (likely(bio)) { + bio->bi_bdev = sb->s_bdev; + bio->bi_sector = (sector_t)start << (sb->s_blocksize_bits - 9); + } + return bio; +} + +void nilfs_segbuf_prepare_write(struct nilfs_segment_buffer *segbuf, + struct nilfs_write_info *wi) +{ + wi->bio = NULL; + wi->rest_blocks = segbuf->sb_sum.nblocks; + wi->max_pages = bio_get_nr_vecs(wi->sb->s_bdev); + wi->nr_vecs = min(wi->max_pages, wi->rest_blocks); + wi->start = wi->end = 0; + wi->nbio = 0; + wi->blocknr = segbuf->sb_pseg_start; + + atomic_set(&wi->err, 0); + init_completion(&wi->bio_event); +} + +static int nilfs_submit_bh(struct nilfs_write_info *wi, struct buffer_head *bh, + int mode) +{ + int len, err; + + BUG_ON(wi->nr_vecs <= 0); + repeat: + if (!wi->bio) { + wi->bio = nilfs_alloc_seg_bio(wi->sb, wi->blocknr + wi->end, + wi->nr_vecs); + if (unlikely(!wi->bio)) + return -ENOMEM; + } + + len = bio_add_page(wi->bio, bh->b_page, bh->b_size, bh_offset(bh)); + if (len == bh->b_size) { + wi->end++; + return 0; + } + /* bio is FULL */ + err = nilfs_submit_seg_bio(wi, mode); + /* never submit current bh */ + if (likely(!err)) + goto repeat; + return err; +} + +int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf, + struct nilfs_write_info *wi) +{ + struct buffer_head *bh; + int res, rw = WRITE; + + list_for_each_entry(bh, &segbuf->sb_segsum_buffers, b_assoc_buffers) { + res = nilfs_submit_bh(wi, bh, rw); + if (unlikely(res)) + goto failed_bio; + } + + list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) { + res = nilfs_submit_bh(wi, bh, rw); + if (unlikely(res)) + goto failed_bio; + } + + if (wi->bio) { + /* + * Last BIO is always sent through the following + * submission. + */ + rw |= (1 << BIO_RW_SYNCIO); + res = nilfs_submit_seg_bio(wi, rw); + if (unlikely(res)) + goto failed_bio; + } + + res = 0; + out: + return res; + + failed_bio: + atomic_inc(&wi->err); + goto out; +} + +/** + * nilfs_segbuf_wait - wait for completion of requested BIOs + * @wi: nilfs_write_info + * + * Return Value: On Success, 0 is returned. On Error, one of the following + * negative error code is returned. + * + * %-EIO - I/O error + */ +int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf, + struct nilfs_write_info *wi) +{ + int err = 0; + + if (!wi->nbio) + return 0; + + do { + wait_for_completion(&wi->bio_event); + } while (--wi->nbio > 0); + + if (unlikely(atomic_read(&wi->err) > 0)) { + printk(KERN_ERR "NILFS: IO error writing segment\n"); + err = -EIO; + segbuf->sb_io_error = 1; + } + return err; +} diff --git a/fs/nilfs2/segbuf.h b/fs/nilfs2/segbuf.h new file mode 100644 index 000000000000..0c3076f4e592 --- /dev/null +++ b/fs/nilfs2/segbuf.h @@ -0,0 +1,201 @@ +/* + * segbuf.h - NILFS Segment buffer prototypes and definitions + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi <ryusuke@osrg.net> + * + */ +#ifndef _NILFS_SEGBUF_H +#define _NILFS_SEGBUF_H + +#include <linux/fs.h> +#include <linux/buffer_head.h> +#include <linux/bio.h> +#include <linux/completion.h> +#include <linux/backing-dev.h> + +/** + * struct nilfs_segsum_info - On-memory segment summary + * @flags: Flags + * @nfinfo: Number of file information structures + * @nblocks: Number of blocks included in the partial segment + * @nsumblk: Number of summary blocks + * @sumbytes: Byte count of segment summary + * @nfileblk: Total number of file blocks + * @seg_seq: Segment sequence number + * @ctime: Creation time + * @next: Block number of the next full segment + */ +struct nilfs_segsum_info { + unsigned int flags; + unsigned long nfinfo; + unsigned long nblocks; + unsigned long nsumblk; + unsigned long sumbytes; + unsigned long nfileblk; + u64 seg_seq; + time_t ctime; + sector_t next; +}; + +/* macro for the flags */ +#define NILFS_SEG_HAS_SR(sum) ((sum)->flags & NILFS_SS_SR) +#define NILFS_SEG_LOGBGN(sum) ((sum)->flags & NILFS_SS_LOGBGN) +#define NILFS_SEG_LOGEND(sum) ((sum)->flags & NILFS_SS_LOGEND) +#define NILFS_SEG_DSYNC(sum) ((sum)->flags & NILFS_SS_SYNDT) +#define NILFS_SEG_SIMPLEX(sum) \ + (((sum)->flags & (NILFS_SS_LOGBGN | NILFS_SS_LOGEND)) == \ + (NILFS_SS_LOGBGN | NILFS_SS_LOGEND)) + +#define NILFS_SEG_EMPTY(sum) ((sum)->nblocks == (sum)->nsumblk) + +/** + * struct nilfs_segment_buffer - Segment buffer + * @sb_super: back pointer to a superblock struct + * @sb_list: List head to chain this structure + * @sb_sum: On-memory segment summary + * @sb_segnum: Index number of the full segment + * @sb_nextnum: Index number of the next full segment + * @sb_fseg_start: Start block number of the full segment + * @sb_fseg_end: End block number of the full segment + * @sb_pseg_start: Disk block number of partial segment + * @sb_rest_blocks: Number of residual blocks in the current segment + * @sb_segsum_buffers: List of buffers for segment summaries + * @sb_payload_buffers: List of buffers for segment payload + * @sb_io_error: I/O error status + */ +struct nilfs_segment_buffer { + struct super_block *sb_super; + struct list_head sb_list; + + /* Segment information */ + struct nilfs_segsum_info sb_sum; + __u64 sb_segnum; + __u64 sb_nextnum; + sector_t sb_fseg_start, sb_fseg_end; + sector_t sb_pseg_start; + unsigned sb_rest_blocks; + + /* Buffers */ + struct list_head sb_segsum_buffers; + struct list_head sb_payload_buffers; /* including super root */ + + /* io status */ + int sb_io_error; +}; + +#define NILFS_LIST_SEGBUF(head) \ + list_entry((head), struct nilfs_segment_buffer, sb_list) +#define NILFS_NEXT_SEGBUF(segbuf) NILFS_LIST_SEGBUF((segbuf)->sb_list.next) +#define NILFS_PREV_SEGBUF(segbuf) NILFS_LIST_SEGBUF((segbuf)->sb_list.prev) +#define NILFS_LAST_SEGBUF(head) NILFS_LIST_SEGBUF((head)->prev) +#define NILFS_FIRST_SEGBUF(head) NILFS_LIST_SEGBUF((head)->next) +#define NILFS_SEGBUF_IS_LAST(segbuf, head) ((segbuf)->sb_list.next == (head)) + +#define nilfs_for_each_segbuf_before(s, t, h) \ + for ((s) = NILFS_FIRST_SEGBUF(h); (s) != (t); \ + (s) = NILFS_NEXT_SEGBUF(s)) + +#define NILFS_SEGBUF_FIRST_BH(head) \ + (list_entry((head)->next, struct buffer_head, b_assoc_buffers)) +#define NILFS_SEGBUF_NEXT_BH(bh) \ + (list_entry((bh)->b_assoc_buffers.next, struct buffer_head, \ + b_assoc_buffers)) +#define NILFS_SEGBUF_BH_IS_LAST(bh, head) ((bh)->b_assoc_buffers.next == head) + + +int __init nilfs_init_segbuf_cache(void); +void nilfs_destroy_segbuf_cache(void); +struct nilfs_segment_buffer *nilfs_segbuf_new(struct super_block *); +void nilfs_segbuf_free(struct nilfs_segment_buffer *); +void nilfs_segbuf_map(struct nilfs_segment_buffer *, __u64, unsigned long, + struct the_nilfs *); +void nilfs_segbuf_set_next_segnum(struct nilfs_segment_buffer *, __u64, + struct the_nilfs *); +int nilfs_segbuf_reset(struct nilfs_segment_buffer *, unsigned, time_t); +int nilfs_segbuf_extend_segsum(struct nilfs_segment_buffer *); +int nilfs_segbuf_extend_payload(struct nilfs_segment_buffer *, + struct buffer_head **); +void nilfs_segbuf_fill_in_segsum(struct nilfs_segment_buffer *); +void nilfs_segbuf_fill_in_segsum_crc(struct nilfs_segment_buffer *, u32); +void nilfs_segbuf_fill_in_data_crc(struct nilfs_segment_buffer *, u32); + +static inline void +nilfs_segbuf_add_segsum_buffer(struct nilfs_segment_buffer *segbuf, + struct buffer_head *bh) +{ + list_add_tail(&bh->b_assoc_buffers, &segbuf->sb_segsum_buffers); + segbuf->sb_sum.nblocks++; + segbuf->sb_sum.nsumblk++; +} + +static inline void +nilfs_segbuf_add_payload_buffer(struct nilfs_segment_buffer *segbuf, + struct buffer_head *bh) +{ + list_add_tail(&bh->b_assoc_buffers, &segbuf->sb_payload_buffers); + segbuf->sb_sum.nblocks++; +} + +static inline void +nilfs_segbuf_add_file_buffer(struct nilfs_segment_buffer *segbuf, + struct buffer_head *bh) +{ + get_bh(bh); + nilfs_segbuf_add_payload_buffer(segbuf, bh); + segbuf->sb_sum.nfileblk++; +} + +void nilfs_release_buffers(struct list_head *); + +static inline void nilfs_segbuf_clear(struct nilfs_segment_buffer *segbuf) +{ + nilfs_release_buffers(&segbuf->sb_segsum_buffers); + nilfs_release_buffers(&segbuf->sb_payload_buffers); +} + +struct nilfs_write_info { + struct bio *bio; + int start, end; /* The region to be submitted */ + int rest_blocks; + int max_pages; + int nr_vecs; + sector_t blocknr; + + int nbio; + atomic_t err; + struct completion bio_event; + /* completion event of segment write */ + + /* + * The following fields must be set explicitly + */ + struct super_block *sb; + struct backing_dev_info *bdi; /* backing dev info */ + struct buffer_head *bh_sr; +}; + + +void nilfs_segbuf_prepare_write(struct nilfs_segment_buffer *, + struct nilfs_write_info *); +int nilfs_segbuf_write(struct nilfs_segment_buffer *, + struct nilfs_write_info *); +int nilfs_segbuf_wait(struct nilfs_segment_buffer *, + struct nilfs_write_info *); + +#endif /* _NILFS_SEGBUF_H */ diff --git a/fs/nilfs2/seglist.h b/fs/nilfs2/seglist.h new file mode 100644 index 000000000000..d39df9144e99 --- /dev/null +++ b/fs/nilfs2/seglist.h @@ -0,0 +1,85 @@ +/* + * seglist.h - expediential structure and routines to handle list of segments + * (would be removed in a future release) + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi <ryusuke@osrg.net> + * + */ +#ifndef _NILFS_SEGLIST_H +#define _NILFS_SEGLIST_H + +#include <linux/fs.h> +#include <linux/buffer_head.h> +#include <linux/nilfs2_fs.h> +#include "sufile.h" + +struct nilfs_segment_entry { + __u64 segnum; + +#define NILFS_SLH_FREED 0x0001 /* The segment was freed provisonally. + It must be cancelled if + construction aborted */ + + unsigned flags; + struct list_head list; + struct buffer_head *bh_su; + struct nilfs_segment_usage *raw_su; +}; + + +void nilfs_dispose_segment_list(struct list_head *); + +static inline struct nilfs_segment_entry * +nilfs_alloc_segment_entry(__u64 segnum) +{ + struct nilfs_segment_entry *ent = kmalloc(sizeof(*ent), GFP_NOFS); + + if (likely(ent)) { + ent->segnum = segnum; + ent->flags = 0; + ent->bh_su = NULL; + ent->raw_su = NULL; + INIT_LIST_HEAD(&ent->list); + } + return ent; +} + +static inline int nilfs_open_segment_entry(struct nilfs_segment_entry *ent, + struct inode *sufile) +{ + return nilfs_sufile_get_segment_usage(sufile, ent->segnum, + &ent->raw_su, &ent->bh_su); +} + +static inline void nilfs_close_segment_entry(struct nilfs_segment_entry *ent, + struct inode *sufile) +{ + if (!ent->bh_su) + return; + nilfs_sufile_put_segment_usage(sufile, ent->segnum, ent->bh_su); + ent->bh_su = NULL; + ent->raw_su = NULL; +} + +static inline void nilfs_free_segment_entry(struct nilfs_segment_entry *ent) +{ + kfree(ent); +} + +#endif /* _NILFS_SEGLIST_H */ diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c new file mode 100644 index 000000000000..fb70ec3be20e --- /dev/null +++ b/fs/nilfs2/segment.c @@ -0,0 +1,2977 @@ +/* + * segment.c - NILFS segment constructor. + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi <ryusuke@osrg.net> + * + */ + +#include <linux/pagemap.h> +#include <linux/buffer_head.h> +#include <linux/writeback.h> +#include <linux/bio.h> +#include <linux/completion.h> +#include <linux/blkdev.h> +#include <linux/backing-dev.h> +#include <linux/freezer.h> +#include <linux/kthread.h> +#include <linux/crc32.h> +#include <linux/pagevec.h> +#include "nilfs.h" +#include "btnode.h" +#include "page.h" +#include "segment.h" +#include "sufile.h" +#include "cpfile.h" +#include "ifile.h" +#include "seglist.h" +#include "segbuf.h" + + +/* + * Segment constructor + */ +#define SC_N_INODEVEC 16 /* Size of locally allocated inode vector */ + +#define SC_MAX_SEGDELTA 64 /* Upper limit of the number of segments + appended in collection retry loop */ + +/* Construction mode */ +enum { + SC_LSEG_SR = 1, /* Make a logical segment having a super root */ + SC_LSEG_DSYNC, /* Flush data blocks of a given file and make + a logical segment without a super root */ + SC_FLUSH_FILE, /* Flush data files, leads to segment writes without + creating a checkpoint */ + SC_FLUSH_DAT, /* Flush DAT file. This also creates segments without + a checkpoint */ +}; + +/* Stage numbers of dirty block collection */ +enum { + NILFS_ST_INIT = 0, + NILFS_ST_GC, /* Collecting dirty blocks for GC */ + NILFS_ST_FILE, + NILFS_ST_IFILE, + NILFS_ST_CPFILE, + NILFS_ST_SUFILE, + NILFS_ST_DAT, + NILFS_ST_SR, /* Super root */ + NILFS_ST_DSYNC, /* Data sync blocks */ + NILFS_ST_DONE, +}; + +/* State flags of collection */ +#define NILFS_CF_NODE 0x0001 /* Collecting node blocks */ +#define NILFS_CF_IFILE_STARTED 0x0002 /* IFILE stage has started */ +#define NILFS_CF_HISTORY_MASK (NILFS_CF_IFILE_STARTED) + +/* Operations depending on the construction mode and file type */ +struct nilfs_sc_operations { + int (*collect_data)(struct nilfs_sc_info *, struct buffer_head *, + struct inode *); + int (*collect_node)(struct nilfs_sc_info *, struct buffer_head *, + struct inode *); + int (*collect_bmap)(struct nilfs_sc_info *, struct buffer_head *, + struct inode *); + void (*write_data_binfo)(struct nilfs_sc_info *, + struct nilfs_segsum_pointer *, + union nilfs_binfo *); + void (*write_node_binfo)(struct nilfs_sc_info *, + struct nilfs_segsum_pointer *, + union nilfs_binfo *); +}; + +/* + * Other definitions + */ +static void nilfs_segctor_start_timer(struct nilfs_sc_info *); +static void nilfs_segctor_do_flush(struct nilfs_sc_info *, int); +static void nilfs_segctor_do_immediate_flush(struct nilfs_sc_info *); +static void nilfs_dispose_list(struct nilfs_sb_info *, struct list_head *, + int); + +#define nilfs_cnt32_gt(a, b) \ + (typecheck(__u32, a) && typecheck(__u32, b) && \ + ((__s32)(b) - (__s32)(a) < 0)) +#define nilfs_cnt32_ge(a, b) \ + (typecheck(__u32, a) && typecheck(__u32, b) && \ + ((__s32)(a) - (__s32)(b) >= 0)) +#define nilfs_cnt32_lt(a, b) nilfs_cnt32_gt(b, a) +#define nilfs_cnt32_le(a, b) nilfs_cnt32_ge(b, a) + +/* + * Transaction + */ +static struct kmem_cache *nilfs_transaction_cachep; + +/** + * nilfs_init_transaction_cache - create a cache for nilfs_transaction_info + * + * nilfs_init_transaction_cache() creates a slab cache for the struct + * nilfs_transaction_info. + * + * Return Value: On success, it returns 0. On error, one of the following + * negative error code is returned. + * + * %-ENOMEM - Insufficient memory available. + */ +int nilfs_init_transaction_cache(void) +{ + nilfs_transaction_cachep = + kmem_cache_create("nilfs2_transaction_cache", + sizeof(struct nilfs_transaction_info), + 0, SLAB_RECLAIM_ACCOUNT, NULL); + return (nilfs_transaction_cachep == NULL) ? -ENOMEM : 0; +} + +/** + * nilfs_detroy_transaction_cache - destroy the cache for transaction info + * + * nilfs_destroy_transaction_cache() frees the slab cache for the struct + * nilfs_transaction_info. + */ +void nilfs_destroy_transaction_cache(void) +{ + kmem_cache_destroy(nilfs_transaction_cachep); +} + +static int nilfs_prepare_segment_lock(struct nilfs_transaction_info *ti) +{ + struct nilfs_transaction_info *cur_ti = current->journal_info; + void *save = NULL; + + if (cur_ti) { + if (cur_ti->ti_magic == NILFS_TI_MAGIC) + return ++cur_ti->ti_count; + else { + /* + * If journal_info field is occupied by other FS, + * it is saved and will be restored on + * nilfs_transaction_commit(). + */ + printk(KERN_WARNING + "NILFS warning: journal info from a different " + "FS\n"); + save = current->journal_info; + } + } + if (!ti) { + ti = kmem_cache_alloc(nilfs_transaction_cachep, GFP_NOFS); + if (!ti) + return -ENOMEM; + ti->ti_flags = NILFS_TI_DYNAMIC_ALLOC; + } else { + ti->ti_flags = 0; + } + ti->ti_count = 0; + ti->ti_save = save; + ti->ti_magic = NILFS_TI_MAGIC; + current->journal_info = ti; + return 0; +} + +/** + * nilfs_transaction_begin - start indivisible file operations. + * @sb: super block + * @ti: nilfs_transaction_info + * @vacancy_check: flags for vacancy rate checks + * + * nilfs_transaction_begin() acquires a reader/writer semaphore, called + * the segment semaphore, to make a segment construction and write tasks + * exclusive. The function is used with nilfs_transaction_commit() in pairs. + * The region enclosed by these two functions can be nested. To avoid a + * deadlock, the semaphore is only acquired or released in the outermost call. + * + * This function allocates a nilfs_transaction_info struct to keep context + * information on it. It is initialized and hooked onto the current task in + * the outermost call. If a pre-allocated struct is given to @ti, it is used + * instead; othewise a new struct is assigned from a slab. + * + * When @vacancy_check flag is set, this function will check the amount of + * free space, and will wait for the GC to reclaim disk space if low capacity. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error code is returned. + * + * %-ENOMEM - Insufficient memory available. + * + * %-ENOSPC - No space left on device + */ +int nilfs_transaction_begin(struct super_block *sb, + struct nilfs_transaction_info *ti, + int vacancy_check) +{ + struct nilfs_sb_info *sbi; + struct the_nilfs *nilfs; + int ret = nilfs_prepare_segment_lock(ti); + + if (unlikely(ret < 0)) + return ret; + if (ret > 0) + return 0; + + sbi = NILFS_SB(sb); + nilfs = sbi->s_nilfs; + down_read(&nilfs->ns_segctor_sem); + if (vacancy_check && nilfs_near_disk_full(nilfs)) { + up_read(&nilfs->ns_segctor_sem); + ret = -ENOSPC; + goto failed; + } + return 0; + + failed: + ti = current->journal_info; + current->journal_info = ti->ti_save; + if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC) + kmem_cache_free(nilfs_transaction_cachep, ti); + return ret; +} + +/** + * nilfs_transaction_commit - commit indivisible file operations. + * @sb: super block + * + * nilfs_transaction_commit() releases the read semaphore which is + * acquired by nilfs_transaction_begin(). This is only performed + * in outermost call of this function. If a commit flag is set, + * nilfs_transaction_commit() sets a timer to start the segment + * constructor. If a sync flag is set, it starts construction + * directly. + */ +int nilfs_transaction_commit(struct super_block *sb) +{ + struct nilfs_transaction_info *ti = current->journal_info; + struct nilfs_sb_info *sbi; + struct nilfs_sc_info *sci; + int err = 0; + + BUG_ON(ti == NULL || ti->ti_magic != NILFS_TI_MAGIC); + ti->ti_flags |= NILFS_TI_COMMIT; + if (ti->ti_count > 0) { + ti->ti_count--; + return 0; + } + sbi = NILFS_SB(sb); + sci = NILFS_SC(sbi); + if (sci != NULL) { + if (ti->ti_flags & NILFS_TI_COMMIT) + nilfs_segctor_start_timer(sci); + if (atomic_read(&sbi->s_nilfs->ns_ndirtyblks) > + sci->sc_watermark) + nilfs_segctor_do_flush(sci, 0); + } + up_read(&sbi->s_nilfs->ns_segctor_sem); + current->journal_info = ti->ti_save; + + if (ti->ti_flags & NILFS_TI_SYNC) + err = nilfs_construct_segment(sb); + if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC) + kmem_cache_free(nilfs_transaction_cachep, ti); + return err; +} + +void nilfs_transaction_abort(struct super_block *sb) +{ + struct nilfs_transaction_info *ti = current->journal_info; + + BUG_ON(ti == NULL || ti->ti_magic != NILFS_TI_MAGIC); + if (ti->ti_count > 0) { + ti->ti_count--; + return; + } + up_read(&NILFS_SB(sb)->s_nilfs->ns_segctor_sem); + + current->journal_info = ti->ti_save; + if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC) + kmem_cache_free(nilfs_transaction_cachep, ti); +} + +void nilfs_relax_pressure_in_lock(struct super_block *sb) +{ + struct nilfs_sb_info *sbi = NILFS_SB(sb); + struct nilfs_sc_info *sci = NILFS_SC(sbi); + struct the_nilfs *nilfs = sbi->s_nilfs; + + if (!sci || !sci->sc_flush_request) + return; + + set_bit(NILFS_SC_PRIOR_FLUSH, &sci->sc_flags); + up_read(&nilfs->ns_segctor_sem); + + down_write(&nilfs->ns_segctor_sem); + if (sci->sc_flush_request && + test_bit(NILFS_SC_PRIOR_FLUSH, &sci->sc_flags)) { + struct nilfs_transaction_info *ti = current->journal_info; + + ti->ti_flags |= NILFS_TI_WRITER; + nilfs_segctor_do_immediate_flush(sci); + ti->ti_flags &= ~NILFS_TI_WRITER; + } + downgrade_write(&nilfs->ns_segctor_sem); +} + +static void nilfs_transaction_lock(struct nilfs_sb_info *sbi, + struct nilfs_transaction_info *ti, + int gcflag) +{ + struct nilfs_transaction_info *cur_ti = current->journal_info; + + WARN_ON(cur_ti); + ti->ti_flags = NILFS_TI_WRITER; + ti->ti_count = 0; + ti->ti_save = cur_ti; + ti->ti_magic = NILFS_TI_MAGIC; + INIT_LIST_HEAD(&ti->ti_garbage); + current->journal_info = ti; + + for (;;) { + down_write(&sbi->s_nilfs->ns_segctor_sem); + if (!test_bit(NILFS_SC_PRIOR_FLUSH, &NILFS_SC(sbi)->sc_flags)) + break; + + nilfs_segctor_do_immediate_flush(NILFS_SC(sbi)); + + up_write(&sbi->s_nilfs->ns_segctor_sem); + yield(); + } + if (gcflag) + ti->ti_flags |= NILFS_TI_GC; +} + +static void nilfs_transaction_unlock(struct nilfs_sb_info *sbi) +{ + struct nilfs_transaction_info *ti = current->journal_info; + + BUG_ON(ti == NULL || ti->ti_magic != NILFS_TI_MAGIC); + BUG_ON(ti->ti_count > 0); + + up_write(&sbi->s_nilfs->ns_segctor_sem); + current->journal_info = ti->ti_save; + if (!list_empty(&ti->ti_garbage)) + nilfs_dispose_list(sbi, &ti->ti_garbage, 0); +} + +static void *nilfs_segctor_map_segsum_entry(struct nilfs_sc_info *sci, + struct nilfs_segsum_pointer *ssp, + unsigned bytes) +{ + struct nilfs_segment_buffer *segbuf = sci->sc_curseg; + unsigned blocksize = sci->sc_super->s_blocksize; + void *p; + + if (unlikely(ssp->offset + bytes > blocksize)) { + ssp->offset = 0; + BUG_ON(NILFS_SEGBUF_BH_IS_LAST(ssp->bh, + &segbuf->sb_segsum_buffers)); + ssp->bh = NILFS_SEGBUF_NEXT_BH(ssp->bh); + } + p = ssp->bh->b_data + ssp->offset; + ssp->offset += bytes; + return p; +} + +/** + * nilfs_segctor_reset_segment_buffer - reset the current segment buffer + * @sci: nilfs_sc_info + */ +static int nilfs_segctor_reset_segment_buffer(struct nilfs_sc_info *sci) +{ + struct nilfs_segment_buffer *segbuf = sci->sc_curseg; + struct buffer_head *sumbh; + unsigned sumbytes; + unsigned flags = 0; + int err; + + if (nilfs_doing_gc()) + flags = NILFS_SS_GC; + err = nilfs_segbuf_reset(segbuf, flags, sci->sc_seg_ctime); + if (unlikely(err)) + return err; + + sumbh = NILFS_SEGBUF_FIRST_BH(&segbuf->sb_segsum_buffers); + sumbytes = segbuf->sb_sum.sumbytes; + sci->sc_finfo_ptr.bh = sumbh; sci->sc_finfo_ptr.offset = sumbytes; + sci->sc_binfo_ptr.bh = sumbh; sci->sc_binfo_ptr.offset = sumbytes; + sci->sc_blk_cnt = sci->sc_datablk_cnt = 0; + return 0; +} + +static int nilfs_segctor_feed_segment(struct nilfs_sc_info *sci) +{ + sci->sc_nblk_this_inc += sci->sc_curseg->sb_sum.nblocks; + if (NILFS_SEGBUF_IS_LAST(sci->sc_curseg, &sci->sc_segbufs)) + return -E2BIG; /* The current segment is filled up + (internal code) */ + sci->sc_curseg = NILFS_NEXT_SEGBUF(sci->sc_curseg); + return nilfs_segctor_reset_segment_buffer(sci); +} + +static int nilfs_segctor_add_super_root(struct nilfs_sc_info *sci) +{ + struct nilfs_segment_buffer *segbuf = sci->sc_curseg; + int err; + + if (segbuf->sb_sum.nblocks >= segbuf->sb_rest_blocks) { + err = nilfs_segctor_feed_segment(sci); + if (err) + return err; + segbuf = sci->sc_curseg; + } + err = nilfs_segbuf_extend_payload(segbuf, &sci->sc_super_root); + if (likely(!err)) + segbuf->sb_sum.flags |= NILFS_SS_SR; + return err; +} + +/* + * Functions for making segment summary and payloads + */ +static int nilfs_segctor_segsum_block_required( + struct nilfs_sc_info *sci, const struct nilfs_segsum_pointer *ssp, + unsigned binfo_size) +{ + unsigned blocksize = sci->sc_super->s_blocksize; + /* Size of finfo and binfo is enough small against blocksize */ + + return ssp->offset + binfo_size + + (!sci->sc_blk_cnt ? sizeof(struct nilfs_finfo) : 0) > + blocksize; +} + +static void nilfs_segctor_begin_finfo(struct nilfs_sc_info *sci, + struct inode *inode) +{ + sci->sc_curseg->sb_sum.nfinfo++; + sci->sc_binfo_ptr = sci->sc_finfo_ptr; + nilfs_segctor_map_segsum_entry( + sci, &sci->sc_binfo_ptr, sizeof(struct nilfs_finfo)); + + if (inode->i_sb && !test_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags)) + set_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags); + /* skip finfo */ +} + +static void nilfs_segctor_end_finfo(struct nilfs_sc_info *sci, + struct inode *inode) +{ + struct nilfs_finfo *finfo; + struct nilfs_inode_info *ii; + struct nilfs_segment_buffer *segbuf; + + if (sci->sc_blk_cnt == 0) + return; + + ii = NILFS_I(inode); + finfo = nilfs_segctor_map_segsum_entry(sci, &sci->sc_finfo_ptr, + sizeof(*finfo)); + finfo->fi_ino = cpu_to_le64(inode->i_ino); + finfo->fi_nblocks = cpu_to_le32(sci->sc_blk_cnt); + finfo->fi_ndatablk = cpu_to_le32(sci->sc_datablk_cnt); + finfo->fi_cno = cpu_to_le64(ii->i_cno); + + segbuf = sci->sc_curseg; + segbuf->sb_sum.sumbytes = sci->sc_binfo_ptr.offset + + sci->sc_super->s_blocksize * (segbuf->sb_sum.nsumblk - 1); + sci->sc_finfo_ptr = sci->sc_binfo_ptr; + sci->sc_blk_cnt = sci->sc_datablk_cnt = 0; +} + +static int nilfs_segctor_add_file_block(struct nilfs_sc_info *sci, + struct buffer_head *bh, + struct inode *inode, + unsigned binfo_size) +{ + struct nilfs_segment_buffer *segbuf; + int required, err = 0; + + retry: + segbuf = sci->sc_curseg; + required = nilfs_segctor_segsum_block_required( + sci, &sci->sc_binfo_ptr, binfo_size); + if (segbuf->sb_sum.nblocks + required + 1 > segbuf->sb_rest_blocks) { + nilfs_segctor_end_finfo(sci, inode); + err = nilfs_segctor_feed_segment(sci); + if (err) + return err; + goto retry; + } + if (unlikely(required)) { + err = nilfs_segbuf_extend_segsum(segbuf); + if (unlikely(err)) + goto failed; + } + if (sci->sc_blk_cnt == 0) + nilfs_segctor_begin_finfo(sci, inode); + + nilfs_segctor_map_segsum_entry(sci, &sci->sc_binfo_ptr, binfo_size); + /* Substitution to vblocknr is delayed until update_blocknr() */ + nilfs_segbuf_add_file_buffer(segbuf, bh); + sci->sc_blk_cnt++; + failed: + return err; +} + +static int nilfs_handle_bmap_error(int err, const char *fname, + struct inode *inode, struct super_block *sb) +{ + if (err == -EINVAL) { + nilfs_error(sb, fname, "broken bmap (inode=%lu)\n", + inode->i_ino); + err = -EIO; + } + return err; +} + +/* + * Callback functions that enumerate, mark, and collect dirty blocks + */ +static int nilfs_collect_file_data(struct nilfs_sc_info *sci, + struct buffer_head *bh, struct inode *inode) +{ + int err; + + err = nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh); + if (unlikely(err < 0)) + return nilfs_handle_bmap_error(err, __func__, inode, + sci->sc_super); + + err = nilfs_segctor_add_file_block(sci, bh, inode, + sizeof(struct nilfs_binfo_v)); + if (!err) + sci->sc_datablk_cnt++; + return err; +} + +static int nilfs_collect_file_node(struct nilfs_sc_info *sci, + struct buffer_head *bh, + struct inode *inode) +{ + int err; + + err = nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh); + if (unlikely(err < 0)) + return nilfs_handle_bmap_error(err, __func__, inode, + sci->sc_super); + return 0; +} + +static int nilfs_collect_file_bmap(struct nilfs_sc_info *sci, + struct buffer_head *bh, + struct inode *inode) +{ + WARN_ON(!buffer_dirty(bh)); + return nilfs_segctor_add_file_block(sci, bh, inode, sizeof(__le64)); +} + +static void nilfs_write_file_data_binfo(struct nilfs_sc_info *sci, + struct nilfs_segsum_pointer *ssp, + union nilfs_binfo *binfo) +{ + struct nilfs_binfo_v *binfo_v = nilfs_segctor_map_segsum_entry( + sci, ssp, sizeof(*binfo_v)); + *binfo_v = binfo->bi_v; +} + +static void nilfs_write_file_node_binfo(struct nilfs_sc_info *sci, + struct nilfs_segsum_pointer *ssp, + union nilfs_binfo *binfo) +{ + __le64 *vblocknr = nilfs_segctor_map_segsum_entry( + sci, ssp, sizeof(*vblocknr)); + *vblocknr = binfo->bi_v.bi_vblocknr; +} + +struct nilfs_sc_operations nilfs_sc_file_ops = { + .collect_data = nilfs_collect_file_data, + .collect_node = nilfs_collect_file_node, + .collect_bmap = nilfs_collect_file_bmap, + .write_data_binfo = nilfs_write_file_data_binfo, + .write_node_binfo = nilfs_write_file_node_binfo, +}; + +static int nilfs_collect_dat_data(struct nilfs_sc_info *sci, + struct buffer_head *bh, struct inode *inode) +{ + int err; + + err = nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh); + if (unlikely(err < 0)) + return nilfs_handle_bmap_error(err, __func__, inode, + sci->sc_super); + + err = nilfs_segctor_add_file_block(sci, bh, inode, sizeof(__le64)); + if (!err) + sci->sc_datablk_cnt++; + return err; +} + +static int nilfs_collect_dat_bmap(struct nilfs_sc_info *sci, + struct buffer_head *bh, struct inode *inode) +{ + WARN_ON(!buffer_dirty(bh)); + return nilfs_segctor_add_file_block(sci, bh, inode, + sizeof(struct nilfs_binfo_dat)); +} + +static void nilfs_write_dat_data_binfo(struct nilfs_sc_info *sci, + struct nilfs_segsum_pointer *ssp, + union nilfs_binfo *binfo) +{ + __le64 *blkoff = nilfs_segctor_map_segsum_entry(sci, ssp, + sizeof(*blkoff)); + *blkoff = binfo->bi_dat.bi_blkoff; +} + +static void nilfs_write_dat_node_binfo(struct nilfs_sc_info *sci, + struct nilfs_segsum_pointer *ssp, + union nilfs_binfo *binfo) +{ + struct nilfs_binfo_dat *binfo_dat = + nilfs_segctor_map_segsum_entry(sci, ssp, sizeof(*binfo_dat)); + *binfo_dat = binfo->bi_dat; +} + +struct nilfs_sc_operations nilfs_sc_dat_ops = { + .collect_data = nilfs_collect_dat_data, + .collect_node = nilfs_collect_file_node, + .collect_bmap = nilfs_collect_dat_bmap, + .write_data_binfo = nilfs_write_dat_data_binfo, + .write_node_binfo = nilfs_write_dat_node_binfo, +}; + +struct nilfs_sc_operations nilfs_sc_dsync_ops = { + .collect_data = nilfs_collect_file_data, + .collect_node = NULL, + .collect_bmap = NULL, + .write_data_binfo = nilfs_write_file_data_binfo, + .write_node_binfo = NULL, +}; + +static size_t nilfs_lookup_dirty_data_buffers(struct inode *inode, + struct list_head *listp, + size_t nlimit, + loff_t start, loff_t end) +{ + struct address_space *mapping = inode->i_mapping; + struct pagevec pvec; + pgoff_t index = 0, last = ULONG_MAX; + size_t ndirties = 0; + int i; + + if (unlikely(start != 0 || end != LLONG_MAX)) { + /* + * A valid range is given for sync-ing data pages. The + * range is rounded to per-page; extra dirty buffers + * may be included if blocksize < pagesize. + */ + index = start >> PAGE_SHIFT; + last = end >> PAGE_SHIFT; + } + pagevec_init(&pvec, 0); + repeat: + if (unlikely(index > last) || + !pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_DIRTY, + min_t(pgoff_t, last - index, + PAGEVEC_SIZE - 1) + 1)) + return ndirties; + + for (i = 0; i < pagevec_count(&pvec); i++) { + struct buffer_head *bh, *head; + struct page *page = pvec.pages[i]; + + if (unlikely(page->index > last)) + break; + + if (mapping->host) { + lock_page(page); + if (!page_has_buffers(page)) + create_empty_buffers(page, + 1 << inode->i_blkbits, 0); + unlock_page(page); + } + + bh = head = page_buffers(page); + do { + if (!buffer_dirty(bh)) + continue; + get_bh(bh); + list_add_tail(&bh->b_assoc_buffers, listp); + ndirties++; + if (unlikely(ndirties >= nlimit)) { + pagevec_release(&pvec); + cond_resched(); + return ndirties; + } + } while (bh = bh->b_this_page, bh != head); + } + pagevec_release(&pvec); + cond_resched(); + goto repeat; +} + +static void nilfs_lookup_dirty_node_buffers(struct inode *inode, + struct list_head *listp) +{ + struct nilfs_inode_info *ii = NILFS_I(inode); + struct address_space *mapping = &ii->i_btnode_cache; + struct pagevec pvec; + struct buffer_head *bh, *head; + unsigned int i; + pgoff_t index = 0; + + pagevec_init(&pvec, 0); + + while (pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_DIRTY, + PAGEVEC_SIZE)) { + for (i = 0; i < pagevec_count(&pvec); i++) { + bh = head = page_buffers(pvec.pages[i]); + do { + if (buffer_dirty(bh)) { + get_bh(bh); + list_add_tail(&bh->b_assoc_buffers, + listp); + } + bh = bh->b_this_page; + } while (bh != head); + } + pagevec_release(&pvec); + cond_resched(); + } +} + +static void nilfs_dispose_list(struct nilfs_sb_info *sbi, + struct list_head *head, int force) +{ + struct nilfs_inode_info *ii, *n; + struct nilfs_inode_info *ivec[SC_N_INODEVEC], **pii; + unsigned nv = 0; + + while (!list_empty(head)) { + spin_lock(&sbi->s_inode_lock); + list_for_each_entry_safe(ii, n, head, i_dirty) { + list_del_init(&ii->i_dirty); + if (force) { + if (unlikely(ii->i_bh)) { + brelse(ii->i_bh); + ii->i_bh = NULL; + } + } else if (test_bit(NILFS_I_DIRTY, &ii->i_state)) { + set_bit(NILFS_I_QUEUED, &ii->i_state); + list_add_tail(&ii->i_dirty, + &sbi->s_dirty_files); + continue; + } + ivec[nv++] = ii; + if (nv == SC_N_INODEVEC) + break; + } + spin_unlock(&sbi->s_inode_lock); + + for (pii = ivec; nv > 0; pii++, nv--) + iput(&(*pii)->vfs_inode); + } +} + +static int nilfs_test_metadata_dirty(struct nilfs_sb_info *sbi) +{ + struct the_nilfs *nilfs = sbi->s_nilfs; + int ret = 0; + + if (nilfs_mdt_fetch_dirty(sbi->s_ifile)) + ret++; + if (nilfs_mdt_fetch_dirty(nilfs->ns_cpfile)) + ret++; + if (nilfs_mdt_fetch_dirty(nilfs->ns_sufile)) + ret++; + if (ret || nilfs_doing_gc()) + if (nilfs_mdt_fetch_dirty(nilfs_dat_inode(nilfs))) + ret++; + return ret; +} + +static int nilfs_segctor_clean(struct nilfs_sc_info *sci) +{ + return list_empty(&sci->sc_dirty_files) && + !test_bit(NILFS_SC_DIRTY, &sci->sc_flags) && + list_empty(&sci->sc_cleaning_segments) && + (!nilfs_doing_gc() || list_empty(&sci->sc_gc_inodes)); +} + +static int nilfs_segctor_confirm(struct nilfs_sc_info *sci) +{ + struct nilfs_sb_info *sbi = sci->sc_sbi; + int ret = 0; + + if (nilfs_test_metadata_dirty(sbi)) + set_bit(NILFS_SC_DIRTY, &sci->sc_flags); + + spin_lock(&sbi->s_inode_lock); + if (list_empty(&sbi->s_dirty_files) && nilfs_segctor_clean(sci)) + ret++; + + spin_unlock(&sbi->s_inode_lock); + return ret; +} + +static void nilfs_segctor_clear_metadata_dirty(struct nilfs_sc_info *sci) +{ + struct nilfs_sb_info *sbi = sci->sc_sbi; + struct the_nilfs *nilfs = sbi->s_nilfs; + + nilfs_mdt_clear_dirty(sbi->s_ifile); + nilfs_mdt_clear_dirty(nilfs->ns_cpfile); + nilfs_mdt_clear_dirty(nilfs->ns_sufile); + nilfs_mdt_clear_dirty(nilfs_dat_inode(nilfs)); +} + +static int nilfs_segctor_create_checkpoint(struct nilfs_sc_info *sci) +{ + struct the_nilfs *nilfs = sci->sc_sbi->s_nilfs; + struct buffer_head *bh_cp; + struct nilfs_checkpoint *raw_cp; + int err; + + /* XXX: this interface will be changed */ + err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, 1, + &raw_cp, &bh_cp); + if (likely(!err)) { + /* The following code is duplicated with cpfile. But, it is + needed to collect the checkpoint even if it was not newly + created */ + nilfs_mdt_mark_buffer_dirty(bh_cp); + nilfs_mdt_mark_dirty(nilfs->ns_cpfile); + nilfs_cpfile_put_checkpoint( + nilfs->ns_cpfile, nilfs->ns_cno, bh_cp); + } else + WARN_ON(err == -EINVAL || err == -ENOENT); + + return err; +} + +static int nilfs_segctor_fill_in_checkpoint(struct nilfs_sc_info *sci) +{ + struct nilfs_sb_info *sbi = sci->sc_sbi; + struct the_nilfs *nilfs = sbi->s_nilfs; + struct buffer_head *bh_cp; + struct nilfs_checkpoint *raw_cp; + int err; + + err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, 0, + &raw_cp, &bh_cp); + if (unlikely(err)) { + WARN_ON(err == -EINVAL || err == -ENOENT); + goto failed_ibh; + } + raw_cp->cp_snapshot_list.ssl_next = 0; + raw_cp->cp_snapshot_list.ssl_prev = 0; + raw_cp->cp_inodes_count = + cpu_to_le64(atomic_read(&sbi->s_inodes_count)); + raw_cp->cp_blocks_count = + cpu_to_le64(atomic_read(&sbi->s_blocks_count)); + raw_cp->cp_nblk_inc = + cpu_to_le64(sci->sc_nblk_inc + sci->sc_nblk_this_inc); + raw_cp->cp_create = cpu_to_le64(sci->sc_seg_ctime); + raw_cp->cp_cno = cpu_to_le64(nilfs->ns_cno); + + if (test_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags)) + nilfs_checkpoint_clear_minor(raw_cp); + else + nilfs_checkpoint_set_minor(raw_cp); + + nilfs_write_inode_common(sbi->s_ifile, &raw_cp->cp_ifile_inode, 1); + nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, bh_cp); + return 0; + + failed_ibh: + return err; +} + +static void nilfs_fill_in_file_bmap(struct inode *ifile, + struct nilfs_inode_info *ii) + +{ + struct buffer_head *ibh; + struct nilfs_inode *raw_inode; + + if (test_bit(NILFS_I_BMAP, &ii->i_state)) { + ibh = ii->i_bh; + BUG_ON(!ibh); + raw_inode = nilfs_ifile_map_inode(ifile, ii->vfs_inode.i_ino, + ibh); + nilfs_bmap_write(ii->i_bmap, raw_inode); + nilfs_ifile_unmap_inode(ifile, ii->vfs_inode.i_ino, ibh); + } +} + +static void nilfs_segctor_fill_in_file_bmap(struct nilfs_sc_info *sci, + struct inode *ifile) +{ + struct nilfs_inode_info *ii; + + list_for_each_entry(ii, &sci->sc_dirty_files, i_dirty) { + nilfs_fill_in_file_bmap(ifile, ii); + set_bit(NILFS_I_COLLECTED, &ii->i_state); + } +} + +/* + * CRC calculation routines + */ +static void nilfs_fill_in_super_root_crc(struct buffer_head *bh_sr, u32 seed) +{ + struct nilfs_super_root *raw_sr = + (struct nilfs_super_root *)bh_sr->b_data; + u32 crc; + + crc = crc32_le(seed, + (unsigned char *)raw_sr + sizeof(raw_sr->sr_sum), + NILFS_SR_BYTES - sizeof(raw_sr->sr_sum)); + raw_sr->sr_sum = cpu_to_le32(crc); +} + +static void nilfs_segctor_fill_in_checksums(struct nilfs_sc_info *sci, + u32 seed) +{ + struct nilfs_segment_buffer *segbuf; + + if (sci->sc_super_root) + nilfs_fill_in_super_root_crc(sci->sc_super_root, seed); + + list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) { + nilfs_segbuf_fill_in_segsum_crc(segbuf, seed); + nilfs_segbuf_fill_in_data_crc(segbuf, seed); + } +} + +static void nilfs_segctor_fill_in_super_root(struct nilfs_sc_info *sci, + struct the_nilfs *nilfs) +{ + struct buffer_head *bh_sr = sci->sc_super_root; + struct nilfs_super_root *raw_sr = + (struct nilfs_super_root *)bh_sr->b_data; + unsigned isz = nilfs->ns_inode_size; + + raw_sr->sr_bytes = cpu_to_le16(NILFS_SR_BYTES); + raw_sr->sr_nongc_ctime + = cpu_to_le64(nilfs_doing_gc() ? + nilfs->ns_nongc_ctime : sci->sc_seg_ctime); + raw_sr->sr_flags = 0; + + nilfs_mdt_write_inode_direct( + nilfs_dat_inode(nilfs), bh_sr, NILFS_SR_DAT_OFFSET(isz)); + nilfs_mdt_write_inode_direct( + nilfs->ns_cpfile, bh_sr, NILFS_SR_CPFILE_OFFSET(isz)); + nilfs_mdt_write_inode_direct( + nilfs->ns_sufile, bh_sr, NILFS_SR_SUFILE_OFFSET(isz)); +} + +static void nilfs_redirty_inodes(struct list_head *head) +{ + struct nilfs_inode_info *ii; + + list_for_each_entry(ii, head, i_dirty) { + if (test_bit(NILFS_I_COLLECTED, &ii->i_state)) + clear_bit(NILFS_I_COLLECTED, &ii->i_state); + } +} + +static void nilfs_drop_collected_inodes(struct list_head *head) +{ + struct nilfs_inode_info *ii; + + list_for_each_entry(ii, head, i_dirty) { + if (!test_and_clear_bit(NILFS_I_COLLECTED, &ii->i_state)) + continue; + + clear_bit(NILFS_I_INODE_DIRTY, &ii->i_state); + set_bit(NILFS_I_UPDATED, &ii->i_state); + } +} + +static void nilfs_segctor_cancel_free_segments(struct nilfs_sc_info *sci, + struct inode *sufile) + +{ + struct list_head *head = &sci->sc_cleaning_segments; + struct nilfs_segment_entry *ent; + int err; + + list_for_each_entry(ent, head, list) { + if (!(ent->flags & NILFS_SLH_FREED)) + break; + err = nilfs_sufile_cancel_free(sufile, ent->segnum); + WARN_ON(err); /* do not happen */ + ent->flags &= ~NILFS_SLH_FREED; + } +} + +static int nilfs_segctor_prepare_free_segments(struct nilfs_sc_info *sci, + struct inode *sufile) +{ + struct list_head *head = &sci->sc_cleaning_segments; + struct nilfs_segment_entry *ent; + int err; + + list_for_each_entry(ent, head, list) { + err = nilfs_sufile_free(sufile, ent->segnum); + if (unlikely(err)) + return err; + ent->flags |= NILFS_SLH_FREED; + } + return 0; +} + +static void nilfs_segctor_commit_free_segments(struct nilfs_sc_info *sci) +{ + nilfs_dispose_segment_list(&sci->sc_cleaning_segments); +} + +static int nilfs_segctor_apply_buffers(struct nilfs_sc_info *sci, + struct inode *inode, + struct list_head *listp, + int (*collect)(struct nilfs_sc_info *, + struct buffer_head *, + struct inode *)) +{ + struct buffer_head *bh, *n; + int err = 0; + + if (collect) { + list_for_each_entry_safe(bh, n, listp, b_assoc_buffers) { + list_del_init(&bh->b_assoc_buffers); + err = collect(sci, bh, inode); + brelse(bh); + if (unlikely(err)) + goto dispose_buffers; + } + return 0; + } + + dispose_buffers: + while (!list_empty(listp)) { + bh = list_entry(listp->next, struct buffer_head, + b_assoc_buffers); + list_del_init(&bh->b_assoc_buffers); + brelse(bh); + } + return err; +} + +static size_t nilfs_segctor_buffer_rest(struct nilfs_sc_info *sci) +{ + /* Remaining number of blocks within segment buffer */ + return sci->sc_segbuf_nblocks - + (sci->sc_nblk_this_inc + sci->sc_curseg->sb_sum.nblocks); +} + +static int nilfs_segctor_scan_file(struct nilfs_sc_info *sci, + struct inode *inode, + struct nilfs_sc_operations *sc_ops) +{ + LIST_HEAD(data_buffers); + LIST_HEAD(node_buffers); + int err; + + if (!(sci->sc_stage.flags & NILFS_CF_NODE)) { + size_t n, rest = nilfs_segctor_buffer_rest(sci); + + n = nilfs_lookup_dirty_data_buffers( + inode, &data_buffers, rest + 1, 0, LLONG_MAX); + if (n > rest) { + err = nilfs_segctor_apply_buffers( + sci, inode, &data_buffers, + sc_ops->collect_data); + BUG_ON(!err); /* always receive -E2BIG or true error */ + goto break_or_fail; + } + } + nilfs_lookup_dirty_node_buffers(inode, &node_buffers); + + if (!(sci->sc_stage.flags & NILFS_CF_NODE)) { + err = nilfs_segctor_apply_buffers( + sci, inode, &data_buffers, sc_ops->collect_data); + if (unlikely(err)) { + /* dispose node list */ + nilfs_segctor_apply_buffers( + sci, inode, &node_buffers, NULL); + goto break_or_fail; + } + sci->sc_stage.flags |= NILFS_CF_NODE; + } + /* Collect node */ + err = nilfs_segctor_apply_buffers( + sci, inode, &node_buffers, sc_ops->collect_node); + if (unlikely(err)) + goto break_or_fail; + + nilfs_bmap_lookup_dirty_buffers(NILFS_I(inode)->i_bmap, &node_buffers); + err = nilfs_segctor_apply_buffers( + sci, inode, &node_buffers, sc_ops->collect_bmap); + if (unlikely(err)) + goto break_or_fail; + + nilfs_segctor_end_finfo(sci, inode); + sci->sc_stage.flags &= ~NILFS_CF_NODE; + + break_or_fail: + return err; +} + +static int nilfs_segctor_scan_file_dsync(struct nilfs_sc_info *sci, + struct inode *inode) +{ + LIST_HEAD(data_buffers); + size_t n, rest = nilfs_segctor_buffer_rest(sci); + int err; + + n = nilfs_lookup_dirty_data_buffers(inode, &data_buffers, rest + 1, + sci->sc_dsync_start, + sci->sc_dsync_end); + + err = nilfs_segctor_apply_buffers(sci, inode, &data_buffers, + nilfs_collect_file_data); + if (!err) { + nilfs_segctor_end_finfo(sci, inode); + BUG_ON(n > rest); + /* always receive -E2BIG or true error if n > rest */ + } + return err; +} + +static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode) +{ + struct nilfs_sb_info *sbi = sci->sc_sbi; + struct the_nilfs *nilfs = sbi->s_nilfs; + struct list_head *head; + struct nilfs_inode_info *ii; + int err = 0; + + switch (sci->sc_stage.scnt) { + case NILFS_ST_INIT: + /* Pre-processes */ + sci->sc_stage.flags = 0; + + if (!test_bit(NILFS_SC_UNCLOSED, &sci->sc_flags)) { + sci->sc_nblk_inc = 0; + sci->sc_curseg->sb_sum.flags = NILFS_SS_LOGBGN; + if (mode == SC_LSEG_DSYNC) { + sci->sc_stage.scnt = NILFS_ST_DSYNC; + goto dsync_mode; + } + } + + sci->sc_stage.dirty_file_ptr = NULL; + sci->sc_stage.gc_inode_ptr = NULL; + if (mode == SC_FLUSH_DAT) { + sci->sc_stage.scnt = NILFS_ST_DAT; + goto dat_stage; + } + sci->sc_stage.scnt++; /* Fall through */ + case NILFS_ST_GC: + if (nilfs_doing_gc()) { + head = &sci->sc_gc_inodes; + ii = list_prepare_entry(sci->sc_stage.gc_inode_ptr, + head, i_dirty); + list_for_each_entry_continue(ii, head, i_dirty) { + err = nilfs_segctor_scan_file( + sci, &ii->vfs_inode, + &nilfs_sc_file_ops); + if (unlikely(err)) { + sci->sc_stage.gc_inode_ptr = list_entry( + ii->i_dirty.prev, + struct nilfs_inode_info, + i_dirty); + goto break_or_fail; + } + set_bit(NILFS_I_COLLECTED, &ii->i_state); + } + sci->sc_stage.gc_inode_ptr = NULL; + } + sci->sc_stage.scnt++; /* Fall through */ + case NILFS_ST_FILE: + head = &sci->sc_dirty_files; + ii = list_prepare_entry(sci->sc_stage.dirty_file_ptr, head, + i_dirty); + list_for_each_entry_continue(ii, head, i_dirty) { + clear_bit(NILFS_I_DIRTY, &ii->i_state); + + err = nilfs_segctor_scan_file(sci, &ii->vfs_inode, + &nilfs_sc_file_ops); + if (unlikely(err)) { + sci->sc_stage.dirty_file_ptr = + list_entry(ii->i_dirty.prev, + struct nilfs_inode_info, + i_dirty); + goto break_or_fail; + } + /* sci->sc_stage.dirty_file_ptr = NILFS_I(inode); */ + /* XXX: required ? */ + } + sci->sc_stage.dirty_file_ptr = NULL; + if (mode == SC_FLUSH_FILE) { + sci->sc_stage.scnt = NILFS_ST_DONE; + return 0; + } + sci->sc_stage.scnt++; + sci->sc_stage.flags |= NILFS_CF_IFILE_STARTED; + /* Fall through */ + case NILFS_ST_IFILE: + err = nilfs_segctor_scan_file(sci, sbi->s_ifile, + &nilfs_sc_file_ops); + if (unlikely(err)) + break; + sci->sc_stage.scnt++; + /* Creating a checkpoint */ + err = nilfs_segctor_create_checkpoint(sci); + if (unlikely(err)) + break; + /* Fall through */ + case NILFS_ST_CPFILE: + err = nilfs_segctor_scan_file(sci, nilfs->ns_cpfile, + &nilfs_sc_file_ops); + if (unlikely(err)) + break; + sci->sc_stage.scnt++; /* Fall through */ + case NILFS_ST_SUFILE: + err = nilfs_segctor_prepare_free_segments(sci, + nilfs->ns_sufile); + if (unlikely(err)) + break; + err = nilfs_segctor_scan_file(sci, nilfs->ns_sufile, + &nilfs_sc_file_ops); + if (unlikely(err)) + break; + sci->sc_stage.scnt++; /* Fall through */ + case NILFS_ST_DAT: + dat_stage: + err = nilfs_segctor_scan_file(sci, nilfs_dat_inode(nilfs), + &nilfs_sc_dat_ops); + if (unlikely(err)) + break; + if (mode == SC_FLUSH_DAT) { + sci->sc_stage.scnt = NILFS_ST_DONE; + return 0; + } + sci->sc_stage.scnt++; /* Fall through */ + case NILFS_ST_SR: + if (mode == SC_LSEG_SR) { + /* Appending a super root */ + err = nilfs_segctor_add_super_root(sci); + if (unlikely(err)) + break; + } + /* End of a logical segment */ + sci->sc_curseg->sb_sum.flags |= NILFS_SS_LOGEND; + sci->sc_stage.scnt = NILFS_ST_DONE; + return 0; + case NILFS_ST_DSYNC: + dsync_mode: + sci->sc_curseg->sb_sum.flags |= NILFS_SS_SYNDT; + ii = sci->sc_dsync_inode; + if (!test_bit(NILFS_I_BUSY, &ii->i_state)) + break; + + err = nilfs_segctor_scan_file_dsync(sci, &ii->vfs_inode); + if (unlikely(err)) + break; + sci->sc_curseg->sb_sum.flags |= NILFS_SS_LOGEND; + sci->sc_stage.scnt = NILFS_ST_DONE; + return 0; + case NILFS_ST_DONE: + return 0; + default: + BUG(); + } + + break_or_fail: + return err; +} + +static int nilfs_touch_segusage(struct inode *sufile, __u64 segnum) +{ + struct buffer_head *bh_su; + struct nilfs_segment_usage *raw_su; + int err; + + err = nilfs_sufile_get_segment_usage(sufile, segnum, &raw_su, &bh_su); + if (unlikely(err)) + return err; + nilfs_mdt_mark_buffer_dirty(bh_su); + nilfs_mdt_mark_dirty(sufile); + nilfs_sufile_put_segment_usage(sufile, segnum, bh_su); + return 0; +} + +static int nilfs_segctor_begin_construction(struct nilfs_sc_info *sci, + struct the_nilfs *nilfs) +{ + struct nilfs_segment_buffer *segbuf, *n; + __u64 nextnum; + int err; + + if (list_empty(&sci->sc_segbufs)) { + segbuf = nilfs_segbuf_new(sci->sc_super); + if (unlikely(!segbuf)) + return -ENOMEM; + list_add(&segbuf->sb_list, &sci->sc_segbufs); + } else + segbuf = NILFS_FIRST_SEGBUF(&sci->sc_segbufs); + + nilfs_segbuf_map(segbuf, nilfs->ns_segnum, nilfs->ns_pseg_offset, + nilfs); + + if (segbuf->sb_rest_blocks < NILFS_PSEG_MIN_BLOCKS) { + nilfs_shift_to_next_segment(nilfs); + nilfs_segbuf_map(segbuf, nilfs->ns_segnum, 0, nilfs); + } + sci->sc_segbuf_nblocks = segbuf->sb_rest_blocks; + + err = nilfs_touch_segusage(nilfs->ns_sufile, segbuf->sb_segnum); + if (unlikely(err)) + return err; + + if (nilfs->ns_segnum == nilfs->ns_nextnum) { + /* Start from the head of a new full segment */ + err = nilfs_sufile_alloc(nilfs->ns_sufile, &nextnum); + if (unlikely(err)) + return err; + } else + nextnum = nilfs->ns_nextnum; + + segbuf->sb_sum.seg_seq = nilfs->ns_seg_seq; + nilfs_segbuf_set_next_segnum(segbuf, nextnum, nilfs); + + /* truncating segment buffers */ + list_for_each_entry_safe_continue(segbuf, n, &sci->sc_segbufs, + sb_list) { + list_del_init(&segbuf->sb_list); + nilfs_segbuf_free(segbuf); + } + return 0; +} + +static int nilfs_segctor_extend_segments(struct nilfs_sc_info *sci, + struct the_nilfs *nilfs, int nadd) +{ + struct nilfs_segment_buffer *segbuf, *prev, *n; + struct inode *sufile = nilfs->ns_sufile; + __u64 nextnextnum; + LIST_HEAD(list); + int err, ret, i; + + prev = NILFS_LAST_SEGBUF(&sci->sc_segbufs); + /* + * Since the segment specified with nextnum might be allocated during + * the previous construction, the buffer including its segusage may + * not be dirty. The following call ensures that the buffer is dirty + * and will pin the buffer on memory until the sufile is written. + */ + err = nilfs_touch_segusage(sufile, prev->sb_nextnum); + if (unlikely(err)) + return err; + + for (i = 0; i < nadd; i++) { + /* extend segment info */ + err = -ENOMEM; + segbuf = nilfs_segbuf_new(sci->sc_super); + if (unlikely(!segbuf)) + goto failed; + + /* map this buffer to region of segment on-disk */ + nilfs_segbuf_map(segbuf, prev->sb_nextnum, 0, nilfs); + sci->sc_segbuf_nblocks += segbuf->sb_rest_blocks; + + /* allocate the next next full segment */ + err = nilfs_sufile_alloc(sufile, &nextnextnum); + if (unlikely(err)) + goto failed_segbuf; + + segbuf->sb_sum.seg_seq = prev->sb_sum.seg_seq + 1; + nilfs_segbuf_set_next_segnum(segbuf, nextnextnum, nilfs); + + list_add_tail(&segbuf->sb_list, &list); + prev = segbuf; + } + list_splice(&list, sci->sc_segbufs.prev); + return 0; + + failed_segbuf: + nilfs_segbuf_free(segbuf); + failed: + list_for_each_entry_safe(segbuf, n, &list, sb_list) { + ret = nilfs_sufile_free(sufile, segbuf->sb_nextnum); + WARN_ON(ret); /* never fails */ + list_del_init(&segbuf->sb_list); + nilfs_segbuf_free(segbuf); + } + return err; +} + +static void nilfs_segctor_free_incomplete_segments(struct nilfs_sc_info *sci, + struct the_nilfs *nilfs) +{ + struct nilfs_segment_buffer *segbuf; + int ret, done = 0; + + segbuf = NILFS_FIRST_SEGBUF(&sci->sc_segbufs); + if (nilfs->ns_nextnum != segbuf->sb_nextnum) { + ret = nilfs_sufile_free(nilfs->ns_sufile, segbuf->sb_nextnum); + WARN_ON(ret); /* never fails */ + } + if (segbuf->sb_io_error) { + /* Case 1: The first segment failed */ + if (segbuf->sb_pseg_start != segbuf->sb_fseg_start) + /* Case 1a: Partial segment appended into an existing + segment */ + nilfs_terminate_segment(nilfs, segbuf->sb_fseg_start, + segbuf->sb_fseg_end); + else /* Case 1b: New full segment */ + set_nilfs_discontinued(nilfs); + done++; + } + + list_for_each_entry_continue(segbuf, &sci->sc_segbufs, sb_list) { + ret = nilfs_sufile_free(nilfs->ns_sufile, segbuf->sb_nextnum); + WARN_ON(ret); /* never fails */ + if (!done && segbuf->sb_io_error) { + if (segbuf->sb_segnum != nilfs->ns_nextnum) + /* Case 2: extended segment (!= next) failed */ + nilfs_sufile_set_error(nilfs->ns_sufile, + segbuf->sb_segnum); + done++; + } + } +} + +static void nilfs_segctor_clear_segment_buffers(struct nilfs_sc_info *sci) +{ + struct nilfs_segment_buffer *segbuf; + + list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) + nilfs_segbuf_clear(segbuf); + sci->sc_super_root = NULL; +} + +static void nilfs_segctor_destroy_segment_buffers(struct nilfs_sc_info *sci) +{ + struct nilfs_segment_buffer *segbuf; + + while (!list_empty(&sci->sc_segbufs)) { + segbuf = NILFS_FIRST_SEGBUF(&sci->sc_segbufs); + list_del_init(&segbuf->sb_list); + nilfs_segbuf_free(segbuf); + } + /* sci->sc_curseg = NULL; */ +} + +static void nilfs_segctor_end_construction(struct nilfs_sc_info *sci, + struct the_nilfs *nilfs, int err) +{ + if (unlikely(err)) { + nilfs_segctor_free_incomplete_segments(sci, nilfs); + nilfs_segctor_cancel_free_segments(sci, nilfs->ns_sufile); + } + nilfs_segctor_clear_segment_buffers(sci); +} + +static void nilfs_segctor_update_segusage(struct nilfs_sc_info *sci, + struct inode *sufile) +{ + struct nilfs_segment_buffer *segbuf; + struct buffer_head *bh_su; + struct nilfs_segment_usage *raw_su; + unsigned long live_blocks; + int ret; + + list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) { + ret = nilfs_sufile_get_segment_usage(sufile, segbuf->sb_segnum, + &raw_su, &bh_su); + WARN_ON(ret); /* always succeed because bh_su is dirty */ + live_blocks = segbuf->sb_sum.nblocks + + (segbuf->sb_pseg_start - segbuf->sb_fseg_start); + raw_su->su_lastmod = cpu_to_le64(sci->sc_seg_ctime); + raw_su->su_nblocks = cpu_to_le32(live_blocks); + nilfs_sufile_put_segment_usage(sufile, segbuf->sb_segnum, + bh_su); + } +} + +static void nilfs_segctor_cancel_segusage(struct nilfs_sc_info *sci, + struct inode *sufile) +{ + struct nilfs_segment_buffer *segbuf; + struct buffer_head *bh_su; + struct nilfs_segment_usage *raw_su; + int ret; + + segbuf = NILFS_FIRST_SEGBUF(&sci->sc_segbufs); + ret = nilfs_sufile_get_segment_usage(sufile, segbuf->sb_segnum, + &raw_su, &bh_su); + WARN_ON(ret); /* always succeed because bh_su is dirty */ + raw_su->su_nblocks = cpu_to_le32(segbuf->sb_pseg_start - + segbuf->sb_fseg_start); + nilfs_sufile_put_segment_usage(sufile, segbuf->sb_segnum, bh_su); + + list_for_each_entry_continue(segbuf, &sci->sc_segbufs, sb_list) { + ret = nilfs_sufile_get_segment_usage(sufile, segbuf->sb_segnum, + &raw_su, &bh_su); + WARN_ON(ret); /* always succeed */ + raw_su->su_nblocks = 0; + nilfs_sufile_put_segment_usage(sufile, segbuf->sb_segnum, + bh_su); + } +} + +static void nilfs_segctor_truncate_segments(struct nilfs_sc_info *sci, + struct nilfs_segment_buffer *last, + struct inode *sufile) +{ + struct nilfs_segment_buffer *segbuf = last, *n; + int ret; + + list_for_each_entry_safe_continue(segbuf, n, &sci->sc_segbufs, + sb_list) { + list_del_init(&segbuf->sb_list); + sci->sc_segbuf_nblocks -= segbuf->sb_rest_blocks; + ret = nilfs_sufile_free(sufile, segbuf->sb_nextnum); + WARN_ON(ret); + nilfs_segbuf_free(segbuf); + } +} + + +static int nilfs_segctor_collect(struct nilfs_sc_info *sci, + struct the_nilfs *nilfs, int mode) +{ + struct nilfs_cstage prev_stage = sci->sc_stage; + int err, nadd = 1; + + /* Collection retry loop */ + for (;;) { + sci->sc_super_root = NULL; + sci->sc_nblk_this_inc = 0; + sci->sc_curseg = NILFS_FIRST_SEGBUF(&sci->sc_segbufs); + + err = nilfs_segctor_reset_segment_buffer(sci); + if (unlikely(err)) + goto failed; + + err = nilfs_segctor_collect_blocks(sci, mode); + sci->sc_nblk_this_inc += sci->sc_curseg->sb_sum.nblocks; + if (!err) + break; + + if (unlikely(err != -E2BIG)) + goto failed; + + /* The current segment is filled up */ + if (mode != SC_LSEG_SR || sci->sc_stage.scnt < NILFS_ST_CPFILE) + break; + + nilfs_segctor_cancel_free_segments(sci, nilfs->ns_sufile); + nilfs_segctor_clear_segment_buffers(sci); + + err = nilfs_segctor_extend_segments(sci, nilfs, nadd); + if (unlikely(err)) + return err; + + nadd = min_t(int, nadd << 1, SC_MAX_SEGDELTA); + sci->sc_stage = prev_stage; + } + nilfs_segctor_truncate_segments(sci, sci->sc_curseg, nilfs->ns_sufile); + return 0; + + failed: + return err; +} + +static void nilfs_list_replace_buffer(struct buffer_head *old_bh, + struct buffer_head *new_bh) +{ + BUG_ON(!list_empty(&new_bh->b_assoc_buffers)); + + list_replace_init(&old_bh->b_assoc_buffers, &new_bh->b_assoc_buffers); + /* The caller must release old_bh */ +} + +static int +nilfs_segctor_update_payload_blocknr(struct nilfs_sc_info *sci, + struct nilfs_segment_buffer *segbuf, + int mode) +{ + struct inode *inode = NULL; + sector_t blocknr; + unsigned long nfinfo = segbuf->sb_sum.nfinfo; + unsigned long nblocks = 0, ndatablk = 0; + struct nilfs_sc_operations *sc_op = NULL; + struct nilfs_segsum_pointer ssp; + struct nilfs_finfo *finfo = NULL; + union nilfs_binfo binfo; + struct buffer_head *bh, *bh_org; + ino_t ino = 0; + int err = 0; + + if (!nfinfo) + goto out; + + blocknr = segbuf->sb_pseg_start + segbuf->sb_sum.nsumblk; + ssp.bh = NILFS_SEGBUF_FIRST_BH(&segbuf->sb_segsum_buffers); + ssp.offset = sizeof(struct nilfs_segment_summary); + + list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) { + if (bh == sci->sc_super_root) + break; + if (!finfo) { + finfo = nilfs_segctor_map_segsum_entry( + sci, &ssp, sizeof(*finfo)); + ino = le64_to_cpu(finfo->fi_ino); + nblocks = le32_to_cpu(finfo->fi_nblocks); + ndatablk = le32_to_cpu(finfo->fi_ndatablk); + + if (buffer_nilfs_node(bh)) + inode = NILFS_BTNC_I(bh->b_page->mapping); + else + inode = NILFS_AS_I(bh->b_page->mapping); + + if (mode == SC_LSEG_DSYNC) + sc_op = &nilfs_sc_dsync_ops; + else if (ino == NILFS_DAT_INO) + sc_op = &nilfs_sc_dat_ops; + else /* file blocks */ + sc_op = &nilfs_sc_file_ops; + } + bh_org = bh; + get_bh(bh_org); + err = nilfs_bmap_assign(NILFS_I(inode)->i_bmap, &bh, blocknr, + &binfo); + if (bh != bh_org) + nilfs_list_replace_buffer(bh_org, bh); + brelse(bh_org); + if (unlikely(err)) + goto failed_bmap; + + if (ndatablk > 0) + sc_op->write_data_binfo(sci, &ssp, &binfo); + else + sc_op->write_node_binfo(sci, &ssp, &binfo); + + blocknr++; + if (--nblocks == 0) { + finfo = NULL; + if (--nfinfo == 0) + break; + } else if (ndatablk > 0) + ndatablk--; + } + out: + return 0; + + failed_bmap: + err = nilfs_handle_bmap_error(err, __func__, inode, sci->sc_super); + return err; +} + +static int nilfs_segctor_assign(struct nilfs_sc_info *sci, int mode) +{ + struct nilfs_segment_buffer *segbuf; + int err; + + list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) { + err = nilfs_segctor_update_payload_blocknr(sci, segbuf, mode); + if (unlikely(err)) + return err; + nilfs_segbuf_fill_in_segsum(segbuf); + } + return 0; +} + +static int +nilfs_copy_replace_page_buffers(struct page *page, struct list_head *out) +{ + struct page *clone_page; + struct buffer_head *bh, *head, *bh2; + void *kaddr; + + bh = head = page_buffers(page); + + clone_page = nilfs_alloc_private_page(bh->b_bdev, bh->b_size, 0); + if (unlikely(!clone_page)) + return -ENOMEM; + + bh2 = page_buffers(clone_page); + kaddr = kmap_atomic(page, KM_USER0); + do { + if (list_empty(&bh->b_assoc_buffers)) + continue; + get_bh(bh2); + page_cache_get(clone_page); /* for each bh */ + memcpy(bh2->b_data, kaddr + bh_offset(bh), bh2->b_size); + bh2->b_blocknr = bh->b_blocknr; + list_replace(&bh->b_assoc_buffers, &bh2->b_assoc_buffers); + list_add_tail(&bh->b_assoc_buffers, out); + } while (bh = bh->b_this_page, bh2 = bh2->b_this_page, bh != head); + kunmap_atomic(kaddr, KM_USER0); + + if (!TestSetPageWriteback(clone_page)) + inc_zone_page_state(clone_page, NR_WRITEBACK); + unlock_page(clone_page); + + return 0; +} + +static int nilfs_test_page_to_be_frozen(struct page *page) +{ + struct address_space *mapping = page->mapping; + + if (!mapping || !mapping->host || S_ISDIR(mapping->host->i_mode)) + return 0; + + if (page_mapped(page)) { + ClearPageChecked(page); + return 1; + } + return PageChecked(page); +} + +static int nilfs_begin_page_io(struct page *page, struct list_head *out) +{ + if (!page || PageWriteback(page)) + /* For split b-tree node pages, this function may be called + twice. We ignore the 2nd or later calls by this check. */ + return 0; + + lock_page(page); + clear_page_dirty_for_io(page); + set_page_writeback(page); + unlock_page(page); + + if (nilfs_test_page_to_be_frozen(page)) { + int err = nilfs_copy_replace_page_buffers(page, out); + if (unlikely(err)) + return err; + } + return 0; +} + +static int nilfs_segctor_prepare_write(struct nilfs_sc_info *sci, + struct page **failed_page) +{ + struct nilfs_segment_buffer *segbuf; + struct page *bd_page = NULL, *fs_page = NULL; + struct list_head *list = &sci->sc_copied_buffers; + int err; + + *failed_page = NULL; + list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) { + struct buffer_head *bh; + + list_for_each_entry(bh, &segbuf->sb_segsum_buffers, + b_assoc_buffers) { + if (bh->b_page != bd_page) { + if (bd_page) { + lock_page(bd_page); + clear_page_dirty_for_io(bd_page); + set_page_writeback(bd_page); + unlock_page(bd_page); + } + bd_page = bh->b_page; + } + } + + list_for_each_entry(bh, &segbuf->sb_payload_buffers, + b_assoc_buffers) { + if (bh == sci->sc_super_root) { + if (bh->b_page != bd_page) { + lock_page(bd_page); + clear_page_dirty_for_io(bd_page); + set_page_writeback(bd_page); + unlock_page(bd_page); + bd_page = bh->b_page; + } + break; + } + if (bh->b_page != fs_page) { + err = nilfs_begin_page_io(fs_page, list); + if (unlikely(err)) { + *failed_page = fs_page; + goto out; + } + fs_page = bh->b_page; + } + } + } + if (bd_page) { + lock_page(bd_page); + clear_page_dirty_for_io(bd_page); + set_page_writeback(bd_page); + unlock_page(bd_page); + } + err = nilfs_begin_page_io(fs_page, list); + if (unlikely(err)) + *failed_page = fs_page; + out: + return err; +} + +static int nilfs_segctor_write(struct nilfs_sc_info *sci, + struct backing_dev_info *bdi) +{ + struct nilfs_segment_buffer *segbuf; + struct nilfs_write_info wi; + int err, res; + + wi.sb = sci->sc_super; + wi.bh_sr = sci->sc_super_root; + wi.bdi = bdi; + + list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) { + nilfs_segbuf_prepare_write(segbuf, &wi); + err = nilfs_segbuf_write(segbuf, &wi); + + res = nilfs_segbuf_wait(segbuf, &wi); + err = unlikely(err) ? : res; + if (unlikely(err)) + return err; + } + return 0; +} + +static int nilfs_page_has_uncleared_buffer(struct page *page) +{ + struct buffer_head *head, *bh; + + head = bh = page_buffers(page); + do { + if (buffer_dirty(bh) && !list_empty(&bh->b_assoc_buffers)) + return 1; + bh = bh->b_this_page; + } while (bh != head); + return 0; +} + +static void __nilfs_end_page_io(struct page *page, int err) +{ + if (!err) { + if (!nilfs_page_buffers_clean(page)) + __set_page_dirty_nobuffers(page); + ClearPageError(page); + } else { + __set_page_dirty_nobuffers(page); + SetPageError(page); + } + + if (buffer_nilfs_allocated(page_buffers(page))) { + if (TestClearPageWriteback(page)) + dec_zone_page_state(page, NR_WRITEBACK); + } else + end_page_writeback(page); +} + +static void nilfs_end_page_io(struct page *page, int err) +{ + if (!page) + return; + + if (buffer_nilfs_node(page_buffers(page)) && + nilfs_page_has_uncleared_buffer(page)) + /* For b-tree node pages, this function may be called twice + or more because they might be split in a segment. + This check assures that cleanup has been done for all + buffers in a split btnode page. */ + return; + + __nilfs_end_page_io(page, err); +} + +static void nilfs_clear_copied_buffers(struct list_head *list, int err) +{ + struct buffer_head *bh, *head; + struct page *page; + + while (!list_empty(list)) { + bh = list_entry(list->next, struct buffer_head, + b_assoc_buffers); + page = bh->b_page; + page_cache_get(page); + head = bh = page_buffers(page); + do { + if (!list_empty(&bh->b_assoc_buffers)) { + list_del_init(&bh->b_assoc_buffers); + if (!err) { + set_buffer_uptodate(bh); + clear_buffer_dirty(bh); + clear_buffer_nilfs_volatile(bh); + } + brelse(bh); /* for b_assoc_buffers */ + } + } while ((bh = bh->b_this_page) != head); + + __nilfs_end_page_io(page, err); + page_cache_release(page); + } +} + +static void nilfs_segctor_abort_write(struct nilfs_sc_info *sci, + struct page *failed_page, int err) +{ + struct nilfs_segment_buffer *segbuf; + struct page *bd_page = NULL, *fs_page = NULL; + + list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) { + struct buffer_head *bh; + + list_for_each_entry(bh, &segbuf->sb_segsum_buffers, + b_assoc_buffers) { + if (bh->b_page != bd_page) { + if (bd_page) + end_page_writeback(bd_page); + bd_page = bh->b_page; + } + } + + list_for_each_entry(bh, &segbuf->sb_payload_buffers, + b_assoc_buffers) { + if (bh == sci->sc_super_root) { + if (bh->b_page != bd_page) { + end_page_writeback(bd_page); + bd_page = bh->b_page; + } + break; + } + if (bh->b_page != fs_page) { + nilfs_end_page_io(fs_page, err); + if (unlikely(fs_page == failed_page)) + goto done; + fs_page = bh->b_page; + } + } + } + if (bd_page) + end_page_writeback(bd_page); + + nilfs_end_page_io(fs_page, err); + done: + nilfs_clear_copied_buffers(&sci->sc_copied_buffers, err); +} + +static void nilfs_set_next_segment(struct the_nilfs *nilfs, + struct nilfs_segment_buffer *segbuf) +{ + nilfs->ns_segnum = segbuf->sb_segnum; + nilfs->ns_nextnum = segbuf->sb_nextnum; + nilfs->ns_pseg_offset = segbuf->sb_pseg_start - segbuf->sb_fseg_start + + segbuf->sb_sum.nblocks; + nilfs->ns_seg_seq = segbuf->sb_sum.seg_seq; + nilfs->ns_ctime = segbuf->sb_sum.ctime; +} + +static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci) +{ + struct nilfs_segment_buffer *segbuf; + struct page *bd_page = NULL, *fs_page = NULL; + struct nilfs_sb_info *sbi = sci->sc_sbi; + struct the_nilfs *nilfs = sbi->s_nilfs; + int update_sr = (sci->sc_super_root != NULL); + + list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) { + struct buffer_head *bh; + + list_for_each_entry(bh, &segbuf->sb_segsum_buffers, + b_assoc_buffers) { + set_buffer_uptodate(bh); + clear_buffer_dirty(bh); + if (bh->b_page != bd_page) { + if (bd_page) + end_page_writeback(bd_page); + bd_page = bh->b_page; + } + } + /* + * We assume that the buffers which belong to the same page + * continue over the buffer list. + * Under this assumption, the last BHs of pages is + * identifiable by the discontinuity of bh->b_page + * (page != fs_page). + * + * For B-tree node blocks, however, this assumption is not + * guaranteed. The cleanup code of B-tree node pages needs + * special care. + */ + list_for_each_entry(bh, &segbuf->sb_payload_buffers, + b_assoc_buffers) { + set_buffer_uptodate(bh); + clear_buffer_dirty(bh); + clear_buffer_nilfs_volatile(bh); + if (bh == sci->sc_super_root) { + if (bh->b_page != bd_page) { + end_page_writeback(bd_page); + bd_page = bh->b_page; + } + break; + } + if (bh->b_page != fs_page) { + nilfs_end_page_io(fs_page, 0); + fs_page = bh->b_page; + } + } + + if (!NILFS_SEG_SIMPLEX(&segbuf->sb_sum)) { + if (NILFS_SEG_LOGBGN(&segbuf->sb_sum)) { + set_bit(NILFS_SC_UNCLOSED, &sci->sc_flags); + sci->sc_lseg_stime = jiffies; + } + if (NILFS_SEG_LOGEND(&segbuf->sb_sum)) + clear_bit(NILFS_SC_UNCLOSED, &sci->sc_flags); + } + } + /* + * Since pages may continue over multiple segment buffers, + * end of the last page must be checked outside of the loop. + */ + if (bd_page) + end_page_writeback(bd_page); + + nilfs_end_page_io(fs_page, 0); + + nilfs_clear_copied_buffers(&sci->sc_copied_buffers, 0); + + nilfs_drop_collected_inodes(&sci->sc_dirty_files); + + if (nilfs_doing_gc()) { + nilfs_drop_collected_inodes(&sci->sc_gc_inodes); + if (update_sr) + nilfs_commit_gcdat_inode(nilfs); + } else + nilfs->ns_nongc_ctime = sci->sc_seg_ctime; + + sci->sc_nblk_inc += sci->sc_nblk_this_inc; + + segbuf = NILFS_LAST_SEGBUF(&sci->sc_segbufs); + nilfs_set_next_segment(nilfs, segbuf); + + if (update_sr) { + nilfs_set_last_segment(nilfs, segbuf->sb_pseg_start, + segbuf->sb_sum.seg_seq, nilfs->ns_cno++); + sbi->s_super->s_dirt = 1; + + clear_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags); + clear_bit(NILFS_SC_DIRTY, &sci->sc_flags); + set_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags); + } else + clear_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags); +} + +static int nilfs_segctor_check_in_files(struct nilfs_sc_info *sci, + struct nilfs_sb_info *sbi) +{ + struct nilfs_inode_info *ii, *n; + __u64 cno = sbi->s_nilfs->ns_cno; + + spin_lock(&sbi->s_inode_lock); + retry: + list_for_each_entry_safe(ii, n, &sbi->s_dirty_files, i_dirty) { + if (!ii->i_bh) { + struct buffer_head *ibh; + int err; + + spin_unlock(&sbi->s_inode_lock); + err = nilfs_ifile_get_inode_block( + sbi->s_ifile, ii->vfs_inode.i_ino, &ibh); + if (unlikely(err)) { + nilfs_warning(sbi->s_super, __func__, + "failed to get inode block.\n"); + return err; + } + nilfs_mdt_mark_buffer_dirty(ibh); + nilfs_mdt_mark_dirty(sbi->s_ifile); + spin_lock(&sbi->s_inode_lock); + if (likely(!ii->i_bh)) + ii->i_bh = ibh; + else + brelse(ibh); + goto retry; + } + ii->i_cno = cno; + + clear_bit(NILFS_I_QUEUED, &ii->i_state); + set_bit(NILFS_I_BUSY, &ii->i_state); + list_del(&ii->i_dirty); + list_add_tail(&ii->i_dirty, &sci->sc_dirty_files); + } + spin_unlock(&sbi->s_inode_lock); + + NILFS_I(sbi->s_ifile)->i_cno = cno; + + return 0; +} + +static void nilfs_segctor_check_out_files(struct nilfs_sc_info *sci, + struct nilfs_sb_info *sbi) +{ + struct nilfs_transaction_info *ti = current->journal_info; + struct nilfs_inode_info *ii, *n; + __u64 cno = sbi->s_nilfs->ns_cno; + + spin_lock(&sbi->s_inode_lock); + list_for_each_entry_safe(ii, n, &sci->sc_dirty_files, i_dirty) { + if (!test_and_clear_bit(NILFS_I_UPDATED, &ii->i_state) || + test_bit(NILFS_I_DIRTY, &ii->i_state)) { + /* The current checkpoint number (=nilfs->ns_cno) is + changed between check-in and check-out only if the + super root is written out. So, we can update i_cno + for the inodes that remain in the dirty list. */ + ii->i_cno = cno; + continue; + } + clear_bit(NILFS_I_BUSY, &ii->i_state); + brelse(ii->i_bh); + ii->i_bh = NULL; + list_del(&ii->i_dirty); + list_add_tail(&ii->i_dirty, &ti->ti_garbage); + } + spin_unlock(&sbi->s_inode_lock); +} + +/* + * Main procedure of segment constructor + */ +static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode) +{ + struct nilfs_sb_info *sbi = sci->sc_sbi; + struct the_nilfs *nilfs = sbi->s_nilfs; + struct page *failed_page; + int err, has_sr = 0; + + sci->sc_stage.scnt = NILFS_ST_INIT; + + err = nilfs_segctor_check_in_files(sci, sbi); + if (unlikely(err)) + goto out; + + if (nilfs_test_metadata_dirty(sbi)) + set_bit(NILFS_SC_DIRTY, &sci->sc_flags); + + if (nilfs_segctor_clean(sci)) + goto out; + + do { + sci->sc_stage.flags &= ~NILFS_CF_HISTORY_MASK; + + err = nilfs_segctor_begin_construction(sci, nilfs); + if (unlikely(err)) + goto out; + + /* Update time stamp */ + sci->sc_seg_ctime = get_seconds(); + + err = nilfs_segctor_collect(sci, nilfs, mode); + if (unlikely(err)) + goto failed; + + has_sr = (sci->sc_super_root != NULL); + + /* Avoid empty segment */ + if (sci->sc_stage.scnt == NILFS_ST_DONE && + NILFS_SEG_EMPTY(&sci->sc_curseg->sb_sum)) { + nilfs_segctor_end_construction(sci, nilfs, 1); + goto out; + } + + err = nilfs_segctor_assign(sci, mode); + if (unlikely(err)) + goto failed; + + if (sci->sc_stage.flags & NILFS_CF_IFILE_STARTED) + nilfs_segctor_fill_in_file_bmap(sci, sbi->s_ifile); + + if (has_sr) { + err = nilfs_segctor_fill_in_checkpoint(sci); + if (unlikely(err)) + goto failed_to_make_up; + + nilfs_segctor_fill_in_super_root(sci, nilfs); + } + nilfs_segctor_update_segusage(sci, nilfs->ns_sufile); + + /* Write partial segments */ + err = nilfs_segctor_prepare_write(sci, &failed_page); + if (unlikely(err)) + goto failed_to_write; + + nilfs_segctor_fill_in_checksums(sci, nilfs->ns_crc_seed); + + err = nilfs_segctor_write(sci, nilfs->ns_bdi); + if (unlikely(err)) + goto failed_to_write; + + nilfs_segctor_complete_write(sci); + + /* Commit segments */ + if (has_sr) { + nilfs_segctor_commit_free_segments(sci); + nilfs_segctor_clear_metadata_dirty(sci); + } + + nilfs_segctor_end_construction(sci, nilfs, 0); + + } while (sci->sc_stage.scnt != NILFS_ST_DONE); + + out: + nilfs_segctor_destroy_segment_buffers(sci); + nilfs_segctor_check_out_files(sci, sbi); + return err; + + failed_to_write: + nilfs_segctor_abort_write(sci, failed_page, err); + nilfs_segctor_cancel_segusage(sci, nilfs->ns_sufile); + + failed_to_make_up: + if (sci->sc_stage.flags & NILFS_CF_IFILE_STARTED) + nilfs_redirty_inodes(&sci->sc_dirty_files); + + failed: + if (nilfs_doing_gc()) + nilfs_redirty_inodes(&sci->sc_gc_inodes); + nilfs_segctor_end_construction(sci, nilfs, err); + goto out; +} + +/** + * nilfs_secgtor_start_timer - set timer of background write + * @sci: nilfs_sc_info + * + * If the timer has already been set, it ignores the new request. + * This function MUST be called within a section locking the segment + * semaphore. + */ +static void nilfs_segctor_start_timer(struct nilfs_sc_info *sci) +{ + spin_lock(&sci->sc_state_lock); + if (sci->sc_timer && !(sci->sc_state & NILFS_SEGCTOR_COMMIT)) { + sci->sc_timer->expires = jiffies + sci->sc_interval; + add_timer(sci->sc_timer); + sci->sc_state |= NILFS_SEGCTOR_COMMIT; + } + spin_unlock(&sci->sc_state_lock); +} + +static void nilfs_segctor_do_flush(struct nilfs_sc_info *sci, int bn) +{ + spin_lock(&sci->sc_state_lock); + if (!(sci->sc_flush_request & (1 << bn))) { + unsigned long prev_req = sci->sc_flush_request; + + sci->sc_flush_request |= (1 << bn); + if (!prev_req) + wake_up(&sci->sc_wait_daemon); + } + spin_unlock(&sci->sc_state_lock); +} + +/** + * nilfs_flush_segment - trigger a segment construction for resource control + * @sb: super block + * @ino: inode number of the file to be flushed out. + */ +void nilfs_flush_segment(struct super_block *sb, ino_t ino) +{ + struct nilfs_sb_info *sbi = NILFS_SB(sb); + struct nilfs_sc_info *sci = NILFS_SC(sbi); + + if (!sci || nilfs_doing_construction()) + return; + nilfs_segctor_do_flush(sci, NILFS_MDT_INODE(sb, ino) ? ino : 0); + /* assign bit 0 to data files */ +} + +int nilfs_segctor_add_segments_to_be_freed(struct nilfs_sc_info *sci, + __u64 *segnum, size_t nsegs) +{ + struct nilfs_segment_entry *ent; + struct the_nilfs *nilfs = sci->sc_sbi->s_nilfs; + struct inode *sufile = nilfs->ns_sufile; + LIST_HEAD(list); + __u64 *pnum; + size_t i; + int err; + + for (pnum = segnum, i = 0; i < nsegs; pnum++, i++) { + ent = nilfs_alloc_segment_entry(*pnum); + if (unlikely(!ent)) { + err = -ENOMEM; + goto failed; + } + list_add_tail(&ent->list, &list); + + err = nilfs_open_segment_entry(ent, sufile); + if (unlikely(err)) + goto failed; + + if (unlikely(!nilfs_segment_usage_dirty(ent->raw_su))) + printk(KERN_WARNING "NILFS: unused segment is " + "requested to be cleaned (segnum=%llu)\n", + (unsigned long long)ent->segnum); + nilfs_close_segment_entry(ent, sufile); + } + list_splice(&list, sci->sc_cleaning_segments.prev); + return 0; + + failed: + nilfs_dispose_segment_list(&list); + return err; +} + +void nilfs_segctor_clear_segments_to_be_freed(struct nilfs_sc_info *sci) +{ + nilfs_dispose_segment_list(&sci->sc_cleaning_segments); +} + +struct nilfs_segctor_wait_request { + wait_queue_t wq; + __u32 seq; + int err; + atomic_t done; +}; + +static int nilfs_segctor_sync(struct nilfs_sc_info *sci) +{ + struct nilfs_segctor_wait_request wait_req; + int err = 0; + + spin_lock(&sci->sc_state_lock); + init_wait(&wait_req.wq); + wait_req.err = 0; + atomic_set(&wait_req.done, 0); + wait_req.seq = ++sci->sc_seq_request; + spin_unlock(&sci->sc_state_lock); + + init_waitqueue_entry(&wait_req.wq, current); + add_wait_queue(&sci->sc_wait_request, &wait_req.wq); + set_current_state(TASK_INTERRUPTIBLE); + wake_up(&sci->sc_wait_daemon); + + for (;;) { + if (atomic_read(&wait_req.done)) { + err = wait_req.err; + break; + } + if (!signal_pending(current)) { + schedule(); + continue; + } + err = -ERESTARTSYS; + break; + } + finish_wait(&sci->sc_wait_request, &wait_req.wq); + return err; +} + +static void nilfs_segctor_wakeup(struct nilfs_sc_info *sci, int err) +{ + struct nilfs_segctor_wait_request *wrq, *n; + unsigned long flags; + + spin_lock_irqsave(&sci->sc_wait_request.lock, flags); + list_for_each_entry_safe(wrq, n, &sci->sc_wait_request.task_list, + wq.task_list) { + if (!atomic_read(&wrq->done) && + nilfs_cnt32_ge(sci->sc_seq_done, wrq->seq)) { + wrq->err = err; + atomic_set(&wrq->done, 1); + } + if (atomic_read(&wrq->done)) { + wrq->wq.func(&wrq->wq, + TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, + 0, NULL); + } + } + spin_unlock_irqrestore(&sci->sc_wait_request.lock, flags); +} + +/** + * nilfs_construct_segment - construct a logical segment + * @sb: super block + * + * Return Value: On success, 0 is retured. On errors, one of the following + * negative error code is returned. + * + * %-EROFS - Read only filesystem. + * + * %-EIO - I/O error + * + * %-ENOSPC - No space left on device (only in a panic state). + * + * %-ERESTARTSYS - Interrupted. + * + * %-ENOMEM - Insufficient memory available. + */ +int nilfs_construct_segment(struct super_block *sb) +{ + struct nilfs_sb_info *sbi = NILFS_SB(sb); + struct nilfs_sc_info *sci = NILFS_SC(sbi); + struct nilfs_transaction_info *ti; + int err; + + if (!sci) + return -EROFS; + + /* A call inside transactions causes a deadlock. */ + BUG_ON((ti = current->journal_info) && ti->ti_magic == NILFS_TI_MAGIC); + + err = nilfs_segctor_sync(sci); + return err; +} + +/** + * nilfs_construct_dsync_segment - construct a data-only logical segment + * @sb: super block + * @inode: inode whose data blocks should be written out + * @start: start byte offset + * @end: end byte offset (inclusive) + * + * Return Value: On success, 0 is retured. On errors, one of the following + * negative error code is returned. + * + * %-EROFS - Read only filesystem. + * + * %-EIO - I/O error + * + * %-ENOSPC - No space left on device (only in a panic state). + * + * %-ERESTARTSYS - Interrupted. + * + * %-ENOMEM - Insufficient memory available. + */ +int nilfs_construct_dsync_segment(struct super_block *sb, struct inode *inode, + loff_t start, loff_t end) +{ + struct nilfs_sb_info *sbi = NILFS_SB(sb); + struct nilfs_sc_info *sci = NILFS_SC(sbi); + struct nilfs_inode_info *ii; + struct nilfs_transaction_info ti; + int err = 0; + + if (!sci) + return -EROFS; + + nilfs_transaction_lock(sbi, &ti, 0); + + ii = NILFS_I(inode); + if (test_bit(NILFS_I_INODE_DIRTY, &ii->i_state) || + nilfs_test_opt(sbi, STRICT_ORDER) || + test_bit(NILFS_SC_UNCLOSED, &sci->sc_flags) || + nilfs_discontinued(sbi->s_nilfs)) { + nilfs_transaction_unlock(sbi); + err = nilfs_segctor_sync(sci); + return err; + } + + spin_lock(&sbi->s_inode_lock); + if (!test_bit(NILFS_I_QUEUED, &ii->i_state) && + !test_bit(NILFS_I_BUSY, &ii->i_state)) { + spin_unlock(&sbi->s_inode_lock); + nilfs_transaction_unlock(sbi); + return 0; + } + spin_unlock(&sbi->s_inode_lock); + sci->sc_dsync_inode = ii; + sci->sc_dsync_start = start; + sci->sc_dsync_end = end; + + err = nilfs_segctor_do_construct(sci, SC_LSEG_DSYNC); + + nilfs_transaction_unlock(sbi); + return err; +} + +struct nilfs_segctor_req { + int mode; + __u32 seq_accepted; + int sc_err; /* construction failure */ + int sb_err; /* super block writeback failure */ +}; + +#define FLUSH_FILE_BIT (0x1) /* data file only */ +#define FLUSH_DAT_BIT (1 << NILFS_DAT_INO) /* DAT only */ + +static void nilfs_segctor_accept(struct nilfs_sc_info *sci, + struct nilfs_segctor_req *req) +{ + req->sc_err = req->sb_err = 0; + spin_lock(&sci->sc_state_lock); + req->seq_accepted = sci->sc_seq_request; + spin_unlock(&sci->sc_state_lock); + + if (sci->sc_timer) + del_timer_sync(sci->sc_timer); +} + +static void nilfs_segctor_notify(struct nilfs_sc_info *sci, + struct nilfs_segctor_req *req) +{ + /* Clear requests (even when the construction failed) */ + spin_lock(&sci->sc_state_lock); + + sci->sc_state &= ~NILFS_SEGCTOR_COMMIT; + + if (req->mode == SC_LSEG_SR) { + sci->sc_seq_done = req->seq_accepted; + nilfs_segctor_wakeup(sci, req->sc_err ? : req->sb_err); + sci->sc_flush_request = 0; + } else if (req->mode == SC_FLUSH_FILE) + sci->sc_flush_request &= ~FLUSH_FILE_BIT; + else if (req->mode == SC_FLUSH_DAT) + sci->sc_flush_request &= ~FLUSH_DAT_BIT; + + spin_unlock(&sci->sc_state_lock); +} + +static int nilfs_segctor_construct(struct nilfs_sc_info *sci, + struct nilfs_segctor_req *req) +{ + struct nilfs_sb_info *sbi = sci->sc_sbi; + struct the_nilfs *nilfs = sbi->s_nilfs; + int err = 0; + + if (nilfs_discontinued(nilfs)) + req->mode = SC_LSEG_SR; + if (!nilfs_segctor_confirm(sci)) { + err = nilfs_segctor_do_construct(sci, req->mode); + req->sc_err = err; + } + if (likely(!err)) { + if (req->mode != SC_FLUSH_DAT) + atomic_set(&nilfs->ns_ndirtyblks, 0); + if (test_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags) && + nilfs_discontinued(nilfs)) { + down_write(&nilfs->ns_sem); + req->sb_err = nilfs_commit_super(sbi, 0); + up_write(&nilfs->ns_sem); + } + } + return err; +} + +static void nilfs_construction_timeout(unsigned long data) +{ + struct task_struct *p = (struct task_struct *)data; + wake_up_process(p); +} + +static void +nilfs_remove_written_gcinodes(struct the_nilfs *nilfs, struct list_head *head) +{ + struct nilfs_inode_info *ii, *n; + + list_for_each_entry_safe(ii, n, head, i_dirty) { + if (!test_bit(NILFS_I_UPDATED, &ii->i_state)) + continue; + hlist_del_init(&ii->vfs_inode.i_hash); + list_del_init(&ii->i_dirty); + nilfs_clear_gcinode(&ii->vfs_inode); + } +} + +int nilfs_clean_segments(struct super_block *sb, void __user *argp) +{ + struct nilfs_sb_info *sbi = NILFS_SB(sb); + struct nilfs_sc_info *sci = NILFS_SC(sbi); + struct the_nilfs *nilfs = sbi->s_nilfs; + struct nilfs_transaction_info ti; + struct nilfs_segctor_req req = { .mode = SC_LSEG_SR }; + int err; + + if (unlikely(!sci)) + return -EROFS; + + nilfs_transaction_lock(sbi, &ti, 1); + + err = nilfs_init_gcdat_inode(nilfs); + if (unlikely(err)) + goto out_unlock; + err = nilfs_ioctl_prepare_clean_segments(nilfs, argp); + if (unlikely(err)) + goto out_unlock; + + list_splice_init(&nilfs->ns_gc_inodes, sci->sc_gc_inodes.prev); + + for (;;) { + nilfs_segctor_accept(sci, &req); + err = nilfs_segctor_construct(sci, &req); + nilfs_remove_written_gcinodes(nilfs, &sci->sc_gc_inodes); + nilfs_segctor_notify(sci, &req); + + if (likely(!err)) + break; + + nilfs_warning(sb, __func__, + "segment construction failed. (err=%d)", err); + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(sci->sc_interval); + } + + out_unlock: + nilfs_clear_gcdat_inode(nilfs); + nilfs_transaction_unlock(sbi); + return err; +} + +static void nilfs_segctor_thread_construct(struct nilfs_sc_info *sci, int mode) +{ + struct nilfs_sb_info *sbi = sci->sc_sbi; + struct nilfs_transaction_info ti; + struct nilfs_segctor_req req = { .mode = mode }; + + nilfs_transaction_lock(sbi, &ti, 0); + + nilfs_segctor_accept(sci, &req); + nilfs_segctor_construct(sci, &req); + nilfs_segctor_notify(sci, &req); + + /* + * Unclosed segment should be retried. We do this using sc_timer. + * Timeout of sc_timer will invoke complete construction which leads + * to close the current logical segment. + */ + if (test_bit(NILFS_SC_UNCLOSED, &sci->sc_flags)) + nilfs_segctor_start_timer(sci); + + nilfs_transaction_unlock(sbi); +} + +static void nilfs_segctor_do_immediate_flush(struct nilfs_sc_info *sci) +{ + int mode = 0; + int err; + + spin_lock(&sci->sc_state_lock); + mode = (sci->sc_flush_request & FLUSH_DAT_BIT) ? + SC_FLUSH_DAT : SC_FLUSH_FILE; + spin_unlock(&sci->sc_state_lock); + + if (mode) { + err = nilfs_segctor_do_construct(sci, mode); + + spin_lock(&sci->sc_state_lock); + sci->sc_flush_request &= (mode == SC_FLUSH_FILE) ? + ~FLUSH_FILE_BIT : ~FLUSH_DAT_BIT; + spin_unlock(&sci->sc_state_lock); + } + clear_bit(NILFS_SC_PRIOR_FLUSH, &sci->sc_flags); +} + +static int nilfs_segctor_flush_mode(struct nilfs_sc_info *sci) +{ + if (!test_bit(NILFS_SC_UNCLOSED, &sci->sc_flags) || + time_before(jiffies, sci->sc_lseg_stime + sci->sc_mjcp_freq)) { + if (!(sci->sc_flush_request & ~FLUSH_FILE_BIT)) + return SC_FLUSH_FILE; + else if (!(sci->sc_flush_request & ~FLUSH_DAT_BIT)) + return SC_FLUSH_DAT; + } + return SC_LSEG_SR; +} + +/** + * nilfs_segctor_thread - main loop of the segment constructor thread. + * @arg: pointer to a struct nilfs_sc_info. + * + * nilfs_segctor_thread() initializes a timer and serves as a daemon + * to execute segment constructions. + */ +static int nilfs_segctor_thread(void *arg) +{ + struct nilfs_sc_info *sci = (struct nilfs_sc_info *)arg; + struct timer_list timer; + int timeout = 0; + + init_timer(&timer); + timer.data = (unsigned long)current; + timer.function = nilfs_construction_timeout; + sci->sc_timer = &timer; + + /* start sync. */ + sci->sc_task = current; + wake_up(&sci->sc_wait_task); /* for nilfs_segctor_start_thread() */ + printk(KERN_INFO + "segctord starting. Construction interval = %lu seconds, " + "CP frequency < %lu seconds\n", + sci->sc_interval / HZ, sci->sc_mjcp_freq / HZ); + + spin_lock(&sci->sc_state_lock); + loop: + for (;;) { + int mode; + + if (sci->sc_state & NILFS_SEGCTOR_QUIT) + goto end_thread; + + if (timeout || sci->sc_seq_request != sci->sc_seq_done) + mode = SC_LSEG_SR; + else if (!sci->sc_flush_request) + break; + else + mode = nilfs_segctor_flush_mode(sci); + + spin_unlock(&sci->sc_state_lock); + nilfs_segctor_thread_construct(sci, mode); + spin_lock(&sci->sc_state_lock); + timeout = 0; + } + + + if (freezing(current)) { + spin_unlock(&sci->sc_state_lock); + refrigerator(); + spin_lock(&sci->sc_state_lock); + } else { + DEFINE_WAIT(wait); + int should_sleep = 1; + + prepare_to_wait(&sci->sc_wait_daemon, &wait, + TASK_INTERRUPTIBLE); + + if (sci->sc_seq_request != sci->sc_seq_done) + should_sleep = 0; + else if (sci->sc_flush_request) + should_sleep = 0; + else if (sci->sc_state & NILFS_SEGCTOR_COMMIT) + should_sleep = time_before(jiffies, + sci->sc_timer->expires); + + if (should_sleep) { + spin_unlock(&sci->sc_state_lock); + schedule(); + spin_lock(&sci->sc_state_lock); + } + finish_wait(&sci->sc_wait_daemon, &wait); + timeout = ((sci->sc_state & NILFS_SEGCTOR_COMMIT) && + time_after_eq(jiffies, sci->sc_timer->expires)); + } + goto loop; + + end_thread: + spin_unlock(&sci->sc_state_lock); + del_timer_sync(sci->sc_timer); + sci->sc_timer = NULL; + + /* end sync. */ + sci->sc_task = NULL; + wake_up(&sci->sc_wait_task); /* for nilfs_segctor_kill_thread() */ + return 0; +} + +static int nilfs_segctor_start_thread(struct nilfs_sc_info *sci) +{ + struct task_struct *t; + + t = kthread_run(nilfs_segctor_thread, sci, "segctord"); + if (IS_ERR(t)) { + int err = PTR_ERR(t); + + printk(KERN_ERR "NILFS: error %d creating segctord thread\n", + err); + return err; + } + wait_event(sci->sc_wait_task, sci->sc_task != NULL); + return 0; +} + +static void nilfs_segctor_kill_thread(struct nilfs_sc_info *sci) +{ + sci->sc_state |= NILFS_SEGCTOR_QUIT; + + while (sci->sc_task) { + wake_up(&sci->sc_wait_daemon); + spin_unlock(&sci->sc_state_lock); + wait_event(sci->sc_wait_task, sci->sc_task == NULL); + spin_lock(&sci->sc_state_lock); + } +} + +static int nilfs_segctor_init(struct nilfs_sc_info *sci) +{ + sci->sc_seq_done = sci->sc_seq_request; + + return nilfs_segctor_start_thread(sci); +} + +/* + * Setup & clean-up functions + */ +static struct nilfs_sc_info *nilfs_segctor_new(struct nilfs_sb_info *sbi) +{ + struct nilfs_sc_info *sci; + + sci = kzalloc(sizeof(*sci), GFP_KERNEL); + if (!sci) + return NULL; + + sci->sc_sbi = sbi; + sci->sc_super = sbi->s_super; + + init_waitqueue_head(&sci->sc_wait_request); + init_waitqueue_head(&sci->sc_wait_daemon); + init_waitqueue_head(&sci->sc_wait_task); + spin_lock_init(&sci->sc_state_lock); + INIT_LIST_HEAD(&sci->sc_dirty_files); + INIT_LIST_HEAD(&sci->sc_segbufs); + INIT_LIST_HEAD(&sci->sc_gc_inodes); + INIT_LIST_HEAD(&sci->sc_cleaning_segments); + INIT_LIST_HEAD(&sci->sc_copied_buffers); + + sci->sc_interval = HZ * NILFS_SC_DEFAULT_TIMEOUT; + sci->sc_mjcp_freq = HZ * NILFS_SC_DEFAULT_SR_FREQ; + sci->sc_watermark = NILFS_SC_DEFAULT_WATERMARK; + + if (sbi->s_interval) + sci->sc_interval = sbi->s_interval; + if (sbi->s_watermark) + sci->sc_watermark = sbi->s_watermark; + return sci; +} + +static void nilfs_segctor_write_out(struct nilfs_sc_info *sci) +{ + int ret, retrycount = NILFS_SC_CLEANUP_RETRY; + + /* The segctord thread was stopped and its timer was removed. + But some tasks remain. */ + do { + struct nilfs_sb_info *sbi = sci->sc_sbi; + struct nilfs_transaction_info ti; + struct nilfs_segctor_req req = { .mode = SC_LSEG_SR }; + + nilfs_transaction_lock(sbi, &ti, 0); + nilfs_segctor_accept(sci, &req); + ret = nilfs_segctor_construct(sci, &req); + nilfs_segctor_notify(sci, &req); + nilfs_transaction_unlock(sbi); + + } while (ret && retrycount-- > 0); +} + +/** + * nilfs_segctor_destroy - destroy the segment constructor. + * @sci: nilfs_sc_info + * + * nilfs_segctor_destroy() kills the segctord thread and frees + * the nilfs_sc_info struct. + * Caller must hold the segment semaphore. + */ +static void nilfs_segctor_destroy(struct nilfs_sc_info *sci) +{ + struct nilfs_sb_info *sbi = sci->sc_sbi; + int flag; + + up_write(&sbi->s_nilfs->ns_segctor_sem); + + spin_lock(&sci->sc_state_lock); + nilfs_segctor_kill_thread(sci); + flag = ((sci->sc_state & NILFS_SEGCTOR_COMMIT) || sci->sc_flush_request + || sci->sc_seq_request != sci->sc_seq_done); + spin_unlock(&sci->sc_state_lock); + + if (flag || nilfs_segctor_confirm(sci)) + nilfs_segctor_write_out(sci); + + WARN_ON(!list_empty(&sci->sc_copied_buffers)); + + if (!list_empty(&sci->sc_dirty_files)) { + nilfs_warning(sbi->s_super, __func__, + "dirty file(s) after the final construction\n"); + nilfs_dispose_list(sbi, &sci->sc_dirty_files, 1); + } + + if (!list_empty(&sci->sc_cleaning_segments)) + nilfs_dispose_segment_list(&sci->sc_cleaning_segments); + + WARN_ON(!list_empty(&sci->sc_segbufs)); + + down_write(&sbi->s_nilfs->ns_segctor_sem); + + kfree(sci); +} + +/** + * nilfs_attach_segment_constructor - attach a segment constructor + * @sbi: nilfs_sb_info + * + * nilfs_attach_segment_constructor() allocates a struct nilfs_sc_info, + * initilizes it, and starts the segment constructor. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error code is returned. + * + * %-ENOMEM - Insufficient memory available. + */ +int nilfs_attach_segment_constructor(struct nilfs_sb_info *sbi) +{ + struct the_nilfs *nilfs = sbi->s_nilfs; + int err; + + /* Each field of nilfs_segctor is cleared through the initialization + of super-block info */ + sbi->s_sc_info = nilfs_segctor_new(sbi); + if (!sbi->s_sc_info) + return -ENOMEM; + + nilfs_attach_writer(nilfs, sbi); + err = nilfs_segctor_init(NILFS_SC(sbi)); + if (err) { + nilfs_detach_writer(nilfs, sbi); + kfree(sbi->s_sc_info); + sbi->s_sc_info = NULL; + } + return err; +} + +/** + * nilfs_detach_segment_constructor - destroy the segment constructor + * @sbi: nilfs_sb_info + * + * nilfs_detach_segment_constructor() kills the segment constructor daemon, + * frees the struct nilfs_sc_info, and destroy the dirty file list. + */ +void nilfs_detach_segment_constructor(struct nilfs_sb_info *sbi) +{ + struct the_nilfs *nilfs = sbi->s_nilfs; + LIST_HEAD(garbage_list); + + down_write(&nilfs->ns_segctor_sem); + if (NILFS_SC(sbi)) { + nilfs_segctor_destroy(NILFS_SC(sbi)); + sbi->s_sc_info = NULL; + } + + /* Force to free the list of dirty files */ + spin_lock(&sbi->s_inode_lock); + if (!list_empty(&sbi->s_dirty_files)) { + list_splice_init(&sbi->s_dirty_files, &garbage_list); + nilfs_warning(sbi->s_super, __func__, + "Non empty dirty list after the last " + "segment construction\n"); + } + spin_unlock(&sbi->s_inode_lock); + up_write(&nilfs->ns_segctor_sem); + + nilfs_dispose_list(sbi, &garbage_list, 1); + nilfs_detach_writer(nilfs, sbi); +} diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h new file mode 100644 index 000000000000..a98fc1ed0bbb --- /dev/null +++ b/fs/nilfs2/segment.h @@ -0,0 +1,243 @@ +/* + * segment.h - NILFS Segment constructor prototypes and definitions + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi <ryusuke@osrg.net> + * + */ +#ifndef _NILFS_SEGMENT_H +#define _NILFS_SEGMENT_H + +#include <linux/types.h> +#include <linux/fs.h> +#include <linux/buffer_head.h> +#include <linux/nilfs2_fs.h> +#include "sb.h" + +/** + * struct nilfs_recovery_info - Recovery infomation + * @ri_need_recovery: Recovery status + * @ri_super_root: Block number of the last super root + * @ri_ri_cno: Number of the last checkpoint + * @ri_lsegs_start: Region for roll-forwarding (start block number) + * @ri_lsegs_end: Region for roll-forwarding (end block number) + * @ri_lseg_start_seq: Sequence value of the segment at ri_lsegs_start + * @ri_used_segments: List of segments to be mark active + * @ri_pseg_start: Block number of the last partial segment + * @ri_seq: Sequence number on the last partial segment + * @ri_segnum: Segment number on the last partial segment + * @ri_nextnum: Next segment number on the last partial segment + */ +struct nilfs_recovery_info { + int ri_need_recovery; + sector_t ri_super_root; + __u64 ri_cno; + + sector_t ri_lsegs_start; + sector_t ri_lsegs_end; + u64 ri_lsegs_start_seq; + struct list_head ri_used_segments; + sector_t ri_pseg_start; + u64 ri_seq; + __u64 ri_segnum; + __u64 ri_nextnum; +}; + +/* ri_need_recovery */ +#define NILFS_RECOVERY_SR_UPDATED 1 /* The super root was updated */ +#define NILFS_RECOVERY_ROLLFORWARD_DONE 2 /* Rollforward was carried out */ + +/** + * struct nilfs_cstage - Context of collection stage + * @scnt: Stage count + * @flags: State flags + * @dirty_file_ptr: Pointer on dirty_files list, or inode of a target file + * @gc_inode_ptr: Pointer on the list of gc-inodes + */ +struct nilfs_cstage { + int scnt; + unsigned flags; + struct nilfs_inode_info *dirty_file_ptr; + struct nilfs_inode_info *gc_inode_ptr; +}; + +struct nilfs_segment_buffer; + +struct nilfs_segsum_pointer { + struct buffer_head *bh; + unsigned offset; /* offset in bytes */ +}; + +/** + * struct nilfs_sc_info - Segment constructor information + * @sc_super: Back pointer to super_block struct + * @sc_sbi: Back pointer to nilfs_sb_info struct + * @sc_nblk_inc: Block count of current generation + * @sc_dirty_files: List of files to be written + * @sc_gc_inodes: List of GC inodes having blocks to be written + * @sc_cleaning_segments: List of segments to be freed through construction + * @sc_copied_buffers: List of copied buffers (buffer heads) to freeze data + * @sc_dsync_inode: inode whose data pages are written for a sync operation + * @sc_dsync_start: start byte offset of data pages + * @sc_dsync_end: end byte offset of data pages (inclusive) + * @sc_segbufs: List of segment buffers + * @sc_segbuf_nblocks: Number of available blocks in segment buffers. + * @sc_curseg: Current segment buffer + * @sc_super_root: Pointer to the super root buffer + * @sc_stage: Collection stage + * @sc_finfo_ptr: pointer to the current finfo struct in the segment summary + * @sc_binfo_ptr: pointer to the current binfo struct in the segment summary + * @sc_blk_cnt: Block count of a file + * @sc_datablk_cnt: Data block count of a file + * @sc_nblk_this_inc: Number of blocks included in the current logical segment + * @sc_seg_ctime: Creation time + * @sc_flags: Internal flags + * @sc_state_lock: spinlock for sc_state and so on + * @sc_state: Segctord state flags + * @sc_flush_request: inode bitmap of metadata files to be flushed + * @sc_wait_request: Client request queue + * @sc_wait_daemon: Daemon wait queue + * @sc_wait_task: Start/end wait queue to control segctord task + * @sc_seq_request: Request counter + * @sc_seq_done: Completion counter + * @sc_sync: Request of explicit sync operation + * @sc_interval: Timeout value of background construction + * @sc_mjcp_freq: Frequency of creating checkpoints + * @sc_lseg_stime: Start time of the latest logical segment + * @sc_watermark: Watermark for the number of dirty buffers + * @sc_timer: Timer for segctord + * @sc_task: current thread of segctord + */ +struct nilfs_sc_info { + struct super_block *sc_super; + struct nilfs_sb_info *sc_sbi; + + unsigned long sc_nblk_inc; + + struct list_head sc_dirty_files; + struct list_head sc_gc_inodes; + struct list_head sc_cleaning_segments; + struct list_head sc_copied_buffers; + + struct nilfs_inode_info *sc_dsync_inode; + loff_t sc_dsync_start; + loff_t sc_dsync_end; + + /* Segment buffers */ + struct list_head sc_segbufs; + unsigned long sc_segbuf_nblocks; + struct nilfs_segment_buffer *sc_curseg; + struct buffer_head *sc_super_root; + + struct nilfs_cstage sc_stage; + + struct nilfs_segsum_pointer sc_finfo_ptr; + struct nilfs_segsum_pointer sc_binfo_ptr; + unsigned long sc_blk_cnt; + unsigned long sc_datablk_cnt; + unsigned long sc_nblk_this_inc; + time_t sc_seg_ctime; + + unsigned long sc_flags; + + spinlock_t sc_state_lock; + unsigned long sc_state; + unsigned long sc_flush_request; + + wait_queue_head_t sc_wait_request; + wait_queue_head_t sc_wait_daemon; + wait_queue_head_t sc_wait_task; + + __u32 sc_seq_request; + __u32 sc_seq_done; + + int sc_sync; + unsigned long sc_interval; + unsigned long sc_mjcp_freq; + unsigned long sc_lseg_stime; /* in 1/HZ seconds */ + unsigned long sc_watermark; + + struct timer_list *sc_timer; + struct task_struct *sc_task; +}; + +/* sc_flags */ +enum { + NILFS_SC_DIRTY, /* One or more dirty meta-data blocks exist */ + NILFS_SC_UNCLOSED, /* Logical segment is not closed */ + NILFS_SC_SUPER_ROOT, /* The latest segment has a super root */ + NILFS_SC_PRIOR_FLUSH, /* Requesting immediate flush without making a + checkpoint */ + NILFS_SC_HAVE_DELTA, /* Next checkpoint will have update of files + other than DAT, cpfile, sufile, or files + moved by GC */ +}; + +/* sc_state */ +#define NILFS_SEGCTOR_QUIT 0x0001 /* segctord is being destroyed */ +#define NILFS_SEGCTOR_COMMIT 0x0004 /* committed transaction exists */ + +/* + * Constant parameters + */ +#define NILFS_SC_CLEANUP_RETRY 3 /* Retry count of construction when + destroying segctord */ + +/* + * Default values of timeout, in seconds. + */ +#define NILFS_SC_DEFAULT_TIMEOUT 5 /* Timeout value of dirty blocks. + It triggers construction of a + logical segment with a super root */ +#define NILFS_SC_DEFAULT_SR_FREQ 30 /* Maximum frequency of super root + creation */ + +/* + * The default threshold amount of data, in block counts. + */ +#define NILFS_SC_DEFAULT_WATERMARK 3600 + + +/* segment.c */ +extern int nilfs_init_transaction_cache(void); +extern void nilfs_destroy_transaction_cache(void); +extern void nilfs_relax_pressure_in_lock(struct super_block *); + +extern int nilfs_construct_segment(struct super_block *); +extern int nilfs_construct_dsync_segment(struct super_block *, struct inode *, + loff_t, loff_t); +extern void nilfs_flush_segment(struct super_block *, ino_t); +extern int nilfs_clean_segments(struct super_block *, void __user *); + +extern int nilfs_segctor_add_segments_to_be_freed(struct nilfs_sc_info *, + __u64 *, size_t); +extern void nilfs_segctor_clear_segments_to_be_freed(struct nilfs_sc_info *); + +extern int nilfs_attach_segment_constructor(struct nilfs_sb_info *); +extern void nilfs_detach_segment_constructor(struct nilfs_sb_info *); + +/* recovery.c */ +extern int nilfs_read_super_root_block(struct super_block *, sector_t, + struct buffer_head **, int); +extern int nilfs_search_super_root(struct the_nilfs *, struct nilfs_sb_info *, + struct nilfs_recovery_info *); +extern int nilfs_recover_logical_segments(struct the_nilfs *, + struct nilfs_sb_info *, + struct nilfs_recovery_info *); + +#endif /* _NILFS_SEGMENT_H */ diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c new file mode 100644 index 000000000000..c774cf397e2f --- /dev/null +++ b/fs/nilfs2/sufile.c @@ -0,0 +1,640 @@ +/* + * sufile.c - NILFS segment usage file. + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato <koji@osrg.net>. + */ + +#include <linux/kernel.h> +#include <linux/fs.h> +#include <linux/string.h> +#include <linux/buffer_head.h> +#include <linux/errno.h> +#include <linux/nilfs2_fs.h> +#include "mdt.h" +#include "sufile.h" + + +static inline unsigned long +nilfs_sufile_segment_usages_per_block(const struct inode *sufile) +{ + return NILFS_MDT(sufile)->mi_entries_per_block; +} + +static unsigned long +nilfs_sufile_get_blkoff(const struct inode *sufile, __u64 segnum) +{ + __u64 t = segnum + NILFS_MDT(sufile)->mi_first_entry_offset; + do_div(t, nilfs_sufile_segment_usages_per_block(sufile)); + return (unsigned long)t; +} + +static unsigned long +nilfs_sufile_get_offset(const struct inode *sufile, __u64 segnum) +{ + __u64 t = segnum + NILFS_MDT(sufile)->mi_first_entry_offset; + return do_div(t, nilfs_sufile_segment_usages_per_block(sufile)); +} + +static unsigned long +nilfs_sufile_segment_usages_in_block(const struct inode *sufile, __u64 curr, + __u64 max) +{ + return min_t(unsigned long, + nilfs_sufile_segment_usages_per_block(sufile) - + nilfs_sufile_get_offset(sufile, curr), + max - curr + 1); +} + +static inline struct nilfs_sufile_header * +nilfs_sufile_block_get_header(const struct inode *sufile, + struct buffer_head *bh, + void *kaddr) +{ + return kaddr + bh_offset(bh); +} + +static struct nilfs_segment_usage * +nilfs_sufile_block_get_segment_usage(const struct inode *sufile, __u64 segnum, + struct buffer_head *bh, void *kaddr) +{ + return kaddr + bh_offset(bh) + + nilfs_sufile_get_offset(sufile, segnum) * + NILFS_MDT(sufile)->mi_entry_size; +} + +static inline int nilfs_sufile_get_header_block(struct inode *sufile, + struct buffer_head **bhp) +{ + return nilfs_mdt_get_block(sufile, 0, 0, NULL, bhp); +} + +static inline int +nilfs_sufile_get_segment_usage_block(struct inode *sufile, __u64 segnum, + int create, struct buffer_head **bhp) +{ + return nilfs_mdt_get_block(sufile, + nilfs_sufile_get_blkoff(sufile, segnum), + create, NULL, bhp); +} + +/** + * nilfs_sufile_alloc - allocate a segment + * @sufile: inode of segment usage file + * @segnump: pointer to segment number + * + * Description: nilfs_sufile_alloc() allocates a clean segment. + * + * Return Value: On success, 0 is returned and the segment number of the + * allocated segment is stored in the place pointed by @segnump. On error, one + * of the following negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOSPC - No clean segment left. + */ +int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump) +{ + struct buffer_head *header_bh, *su_bh; + struct the_nilfs *nilfs; + struct nilfs_sufile_header *header; + struct nilfs_segment_usage *su; + size_t susz = NILFS_MDT(sufile)->mi_entry_size; + __u64 segnum, maxsegnum, last_alloc; + void *kaddr; + unsigned long nsegments, ncleansegs, nsus; + int ret, i, j; + + down_write(&NILFS_MDT(sufile)->mi_sem); + + nilfs = NILFS_MDT(sufile)->mi_nilfs; + + ret = nilfs_sufile_get_header_block(sufile, &header_bh); + if (ret < 0) + goto out_sem; + kaddr = kmap_atomic(header_bh->b_page, KM_USER0); + header = nilfs_sufile_block_get_header(sufile, header_bh, kaddr); + ncleansegs = le64_to_cpu(header->sh_ncleansegs); + last_alloc = le64_to_cpu(header->sh_last_alloc); + kunmap_atomic(kaddr, KM_USER0); + + nsegments = nilfs_sufile_get_nsegments(sufile); + segnum = last_alloc + 1; + maxsegnum = nsegments - 1; + for (i = 0; i < nsegments; i += nsus) { + if (segnum >= nsegments) { + /* wrap around */ + segnum = 0; + maxsegnum = last_alloc; + } + ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 1, + &su_bh); + if (ret < 0) + goto out_header; + kaddr = kmap_atomic(su_bh->b_page, KM_USER0); + su = nilfs_sufile_block_get_segment_usage( + sufile, segnum, su_bh, kaddr); + + nsus = nilfs_sufile_segment_usages_in_block( + sufile, segnum, maxsegnum); + for (j = 0; j < nsus; j++, su = (void *)su + susz, segnum++) { + if (!nilfs_segment_usage_clean(su)) + continue; + /* found a clean segment */ + nilfs_segment_usage_set_dirty(su); + kunmap_atomic(kaddr, KM_USER0); + + kaddr = kmap_atomic(header_bh->b_page, KM_USER0); + header = nilfs_sufile_block_get_header( + sufile, header_bh, kaddr); + le64_add_cpu(&header->sh_ncleansegs, -1); + le64_add_cpu(&header->sh_ndirtysegs, 1); + header->sh_last_alloc = cpu_to_le64(segnum); + kunmap_atomic(kaddr, KM_USER0); + + nilfs_mdt_mark_buffer_dirty(header_bh); + nilfs_mdt_mark_buffer_dirty(su_bh); + nilfs_mdt_mark_dirty(sufile); + brelse(su_bh); + *segnump = segnum; + goto out_header; + } + + kunmap_atomic(kaddr, KM_USER0); + brelse(su_bh); + } + + /* no segments left */ + ret = -ENOSPC; + + out_header: + brelse(header_bh); + + out_sem: + up_write(&NILFS_MDT(sufile)->mi_sem); + return ret; +} + +/** + * nilfs_sufile_cancel_free - + * @sufile: inode of segment usage file + * @segnum: segment number + * + * Description: + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + */ +int nilfs_sufile_cancel_free(struct inode *sufile, __u64 segnum) +{ + struct buffer_head *header_bh, *su_bh; + struct the_nilfs *nilfs; + struct nilfs_sufile_header *header; + struct nilfs_segment_usage *su; + void *kaddr; + int ret; + + down_write(&NILFS_MDT(sufile)->mi_sem); + + nilfs = NILFS_MDT(sufile)->mi_nilfs; + + ret = nilfs_sufile_get_header_block(sufile, &header_bh); + if (ret < 0) + goto out_sem; + + ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0, &su_bh); + if (ret < 0) + goto out_header; + + kaddr = kmap_atomic(su_bh->b_page, KM_USER0); + su = nilfs_sufile_block_get_segment_usage( + sufile, segnum, su_bh, kaddr); + if (unlikely(!nilfs_segment_usage_clean(su))) { + printk(KERN_WARNING "%s: segment %llu must be clean\n", + __func__, (unsigned long long)segnum); + kunmap_atomic(kaddr, KM_USER0); + goto out_su_bh; + } + nilfs_segment_usage_set_dirty(su); + kunmap_atomic(kaddr, KM_USER0); + + kaddr = kmap_atomic(header_bh->b_page, KM_USER0); + header = nilfs_sufile_block_get_header(sufile, header_bh, kaddr); + le64_add_cpu(&header->sh_ncleansegs, -1); + le64_add_cpu(&header->sh_ndirtysegs, 1); + kunmap_atomic(kaddr, KM_USER0); + + nilfs_mdt_mark_buffer_dirty(header_bh); + nilfs_mdt_mark_buffer_dirty(su_bh); + nilfs_mdt_mark_dirty(sufile); + + out_su_bh: + brelse(su_bh); + out_header: + brelse(header_bh); + out_sem: + up_write(&NILFS_MDT(sufile)->mi_sem); + return ret; +} + +/** + * nilfs_sufile_freev - free segments + * @sufile: inode of segment usage file + * @segnum: array of segment numbers + * @nsegs: number of segments + * + * Description: nilfs_sufile_freev() frees segments specified by @segnum and + * @nsegs, which must have been returned by a previous call to + * nilfs_sufile_alloc(). + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + */ +#define NILFS_SUFILE_FREEV_PREALLOC 16 +int nilfs_sufile_freev(struct inode *sufile, __u64 *segnum, size_t nsegs) +{ + struct buffer_head *header_bh, **su_bh, + *su_bh_prealloc[NILFS_SUFILE_FREEV_PREALLOC]; + struct the_nilfs *nilfs; + struct nilfs_sufile_header *header; + struct nilfs_segment_usage *su; + void *kaddr; + int ret, i; + + down_write(&NILFS_MDT(sufile)->mi_sem); + + nilfs = NILFS_MDT(sufile)->mi_nilfs; + + /* prepare resources */ + if (nsegs <= NILFS_SUFILE_FREEV_PREALLOC) + su_bh = su_bh_prealloc; + else { + su_bh = kmalloc(sizeof(*su_bh) * nsegs, GFP_NOFS); + if (su_bh == NULL) { + ret = -ENOMEM; + goto out_sem; + } + } + + ret = nilfs_sufile_get_header_block(sufile, &header_bh); + if (ret < 0) + goto out_su_bh; + for (i = 0; i < nsegs; i++) { + ret = nilfs_sufile_get_segment_usage_block(sufile, segnum[i], + 0, &su_bh[i]); + if (ret < 0) + goto out_bh; + } + + /* free segments */ + for (i = 0; i < nsegs; i++) { + kaddr = kmap_atomic(su_bh[i]->b_page, KM_USER0); + su = nilfs_sufile_block_get_segment_usage( + sufile, segnum[i], su_bh[i], kaddr); + WARN_ON(nilfs_segment_usage_error(su)); + nilfs_segment_usage_set_clean(su); + kunmap_atomic(kaddr, KM_USER0); + nilfs_mdt_mark_buffer_dirty(su_bh[i]); + } + kaddr = kmap_atomic(header_bh->b_page, KM_USER0); + header = nilfs_sufile_block_get_header(sufile, header_bh, kaddr); + le64_add_cpu(&header->sh_ncleansegs, nsegs); + le64_add_cpu(&header->sh_ndirtysegs, -(u64)nsegs); + kunmap_atomic(kaddr, KM_USER0); + nilfs_mdt_mark_buffer_dirty(header_bh); + nilfs_mdt_mark_dirty(sufile); + + out_bh: + for (i--; i >= 0; i--) + brelse(su_bh[i]); + brelse(header_bh); + + out_su_bh: + if (su_bh != su_bh_prealloc) + kfree(su_bh); + + out_sem: + up_write(&NILFS_MDT(sufile)->mi_sem); + return ret; +} + +/** + * nilfs_sufile_free - + * @sufile: + * @segnum: + */ +int nilfs_sufile_free(struct inode *sufile, __u64 segnum) +{ + return nilfs_sufile_freev(sufile, &segnum, 1); +} + +/** + * nilfs_sufile_get_segment_usage - get a segment usage + * @sufile: inode of segment usage file + * @segnum: segment number + * @sup: pointer to segment usage + * @bhp: pointer to buffer head + * + * Description: nilfs_sufile_get_segment_usage() acquires the segment usage + * specified by @segnum. + * + * Return Value: On success, 0 is returned, and the segment usage and the + * buffer head of the buffer on which the segment usage is located are stored + * in the place pointed by @sup and @bhp, respectively. On error, one of the + * following negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-EINVAL - Invalid segment usage number. + */ +int nilfs_sufile_get_segment_usage(struct inode *sufile, __u64 segnum, + struct nilfs_segment_usage **sup, + struct buffer_head **bhp) +{ + struct buffer_head *bh; + struct nilfs_segment_usage *su; + void *kaddr; + int ret; + + /* segnum is 0 origin */ + if (segnum >= nilfs_sufile_get_nsegments(sufile)) + return -EINVAL; + down_write(&NILFS_MDT(sufile)->mi_sem); + ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 1, &bh); + if (ret < 0) + goto out_sem; + kaddr = kmap(bh->b_page); + su = nilfs_sufile_block_get_segment_usage(sufile, segnum, bh, kaddr); + if (nilfs_segment_usage_error(su)) { + kunmap(bh->b_page); + brelse(bh); + ret = -EINVAL; + goto out_sem; + } + + if (sup != NULL) + *sup = su; + *bhp = bh; + + out_sem: + up_write(&NILFS_MDT(sufile)->mi_sem); + return ret; +} + +/** + * nilfs_sufile_put_segment_usage - put a segment usage + * @sufile: inode of segment usage file + * @segnum: segment number + * @bh: buffer head + * + * Description: nilfs_sufile_put_segment_usage() releases the segment usage + * specified by @segnum. @bh must be the buffer head which have been returned + * by a previous call to nilfs_sufile_get_segment_usage() with @segnum. + */ +void nilfs_sufile_put_segment_usage(struct inode *sufile, __u64 segnum, + struct buffer_head *bh) +{ + kunmap(bh->b_page); + brelse(bh); +} + +/** + * nilfs_sufile_get_stat - get segment usage statistics + * @sufile: inode of segment usage file + * @stat: pointer to a structure of segment usage statistics + * + * Description: nilfs_sufile_get_stat() returns information about segment + * usage. + * + * Return Value: On success, 0 is returned, and segment usage information is + * stored in the place pointed by @stat. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + */ +int nilfs_sufile_get_stat(struct inode *sufile, struct nilfs_sustat *sustat) +{ + struct buffer_head *header_bh; + struct nilfs_sufile_header *header; + struct the_nilfs *nilfs = NILFS_MDT(sufile)->mi_nilfs; + void *kaddr; + int ret; + + down_read(&NILFS_MDT(sufile)->mi_sem); + + ret = nilfs_sufile_get_header_block(sufile, &header_bh); + if (ret < 0) + goto out_sem; + + kaddr = kmap_atomic(header_bh->b_page, KM_USER0); + header = nilfs_sufile_block_get_header(sufile, header_bh, kaddr); + sustat->ss_nsegs = nilfs_sufile_get_nsegments(sufile); + sustat->ss_ncleansegs = le64_to_cpu(header->sh_ncleansegs); + sustat->ss_ndirtysegs = le64_to_cpu(header->sh_ndirtysegs); + sustat->ss_ctime = nilfs->ns_ctime; + sustat->ss_nongc_ctime = nilfs->ns_nongc_ctime; + spin_lock(&nilfs->ns_last_segment_lock); + sustat->ss_prot_seq = nilfs->ns_prot_seq; + spin_unlock(&nilfs->ns_last_segment_lock); + kunmap_atomic(kaddr, KM_USER0); + brelse(header_bh); + + out_sem: + up_read(&NILFS_MDT(sufile)->mi_sem); + return ret; +} + +/** + * nilfs_sufile_get_ncleansegs - get the number of clean segments + * @sufile: inode of segment usage file + * @nsegsp: pointer to the number of clean segments + * + * Description: nilfs_sufile_get_ncleansegs() acquires the number of clean + * segments. + * + * Return Value: On success, 0 is returned and the number of clean segments is + * stored in the place pointed by @nsegsp. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + */ +int nilfs_sufile_get_ncleansegs(struct inode *sufile, unsigned long *nsegsp) +{ + struct nilfs_sustat sustat; + int ret; + + ret = nilfs_sufile_get_stat(sufile, &sustat); + if (ret == 0) + *nsegsp = sustat.ss_ncleansegs; + return ret; +} + +/** + * nilfs_sufile_set_error - mark a segment as erroneous + * @sufile: inode of segment usage file + * @segnum: segment number + * + * Description: nilfs_sufile_set_error() marks the segment specified by + * @segnum as erroneous. The error segment will never be used again. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-EINVAL - Invalid segment usage number. + */ +int nilfs_sufile_set_error(struct inode *sufile, __u64 segnum) +{ + struct buffer_head *header_bh, *su_bh; + struct nilfs_segment_usage *su; + struct nilfs_sufile_header *header; + void *kaddr; + int ret; + + if (unlikely(segnum >= nilfs_sufile_get_nsegments(sufile))) { + printk(KERN_WARNING "%s: invalid segment number: %llu\n", + __func__, (unsigned long long)segnum); + return -EINVAL; + } + down_write(&NILFS_MDT(sufile)->mi_sem); + + ret = nilfs_sufile_get_header_block(sufile, &header_bh); + if (ret < 0) + goto out_sem; + ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0, &su_bh); + if (ret < 0) + goto out_header; + + kaddr = kmap_atomic(su_bh->b_page, KM_USER0); + su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr); + if (nilfs_segment_usage_error(su)) { + kunmap_atomic(kaddr, KM_USER0); + brelse(su_bh); + goto out_header; + } + + nilfs_segment_usage_set_error(su); + kunmap_atomic(kaddr, KM_USER0); + brelse(su_bh); + + kaddr = kmap_atomic(header_bh->b_page, KM_USER0); + header = nilfs_sufile_block_get_header(sufile, header_bh, kaddr); + le64_add_cpu(&header->sh_ndirtysegs, -1); + kunmap_atomic(kaddr, KM_USER0); + nilfs_mdt_mark_buffer_dirty(header_bh); + nilfs_mdt_mark_buffer_dirty(su_bh); + nilfs_mdt_mark_dirty(sufile); + brelse(su_bh); + + out_header: + brelse(header_bh); + + out_sem: + up_write(&NILFS_MDT(sufile)->mi_sem); + return ret; +} + +/** + * nilfs_sufile_get_suinfo - + * @sufile: inode of segment usage file + * @segnum: segment number to start looking + * @si: array of suinfo + * @nsi: size of suinfo array + * + * Description: + * + * Return Value: On success, 0 is returned and .... On error, one of the + * following negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + */ +ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, + struct nilfs_suinfo *si, size_t nsi) +{ + struct buffer_head *su_bh; + struct nilfs_segment_usage *su; + size_t susz = NILFS_MDT(sufile)->mi_entry_size; + struct the_nilfs *nilfs = NILFS_MDT(sufile)->mi_nilfs; + void *kaddr; + unsigned long nsegs, segusages_per_block; + ssize_t n; + int ret, i, j; + + down_read(&NILFS_MDT(sufile)->mi_sem); + + segusages_per_block = nilfs_sufile_segment_usages_per_block(sufile); + nsegs = min_t(unsigned long, + nilfs_sufile_get_nsegments(sufile) - segnum, + nsi); + for (i = 0; i < nsegs; i += n, segnum += n) { + n = min_t(unsigned long, + segusages_per_block - + nilfs_sufile_get_offset(sufile, segnum), + nsegs - i); + ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0, + &su_bh); + if (ret < 0) { + if (ret != -ENOENT) + goto out; + /* hole */ + memset(&si[i], 0, sizeof(struct nilfs_suinfo) * n); + continue; + } + + kaddr = kmap_atomic(su_bh->b_page, KM_USER0); + su = nilfs_sufile_block_get_segment_usage( + sufile, segnum, su_bh, kaddr); + for (j = 0; j < n; j++, su = (void *)su + susz) { + si[i + j].sui_lastmod = le64_to_cpu(su->su_lastmod); + si[i + j].sui_nblocks = le32_to_cpu(su->su_nblocks); + si[i + j].sui_flags = le32_to_cpu(su->su_flags) & + ~(1UL << NILFS_SEGMENT_USAGE_ACTIVE); + if (nilfs_segment_is_active(nilfs, segnum + i + j)) + si[i + j].sui_flags |= + (1UL << NILFS_SEGMENT_USAGE_ACTIVE); + } + kunmap_atomic(kaddr, KM_USER0); + brelse(su_bh); + } + ret = nsegs; + + out: + up_read(&NILFS_MDT(sufile)->mi_sem); + return ret; +} diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h new file mode 100644 index 000000000000..d595f33a768d --- /dev/null +++ b/fs/nilfs2/sufile.h @@ -0,0 +1,54 @@ +/* + * sufile.h - NILFS segment usage file. + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato <koji@osrg.net>. + */ + +#ifndef _NILFS_SUFILE_H +#define _NILFS_SUFILE_H + +#include <linux/fs.h> +#include <linux/buffer_head.h> +#include <linux/nilfs2_fs.h> +#include "mdt.h" + +#define NILFS_SUFILE_GFP NILFS_MDT_GFP + +static inline unsigned long nilfs_sufile_get_nsegments(struct inode *sufile) +{ + return NILFS_MDT(sufile)->mi_nilfs->ns_nsegments; +} + +int nilfs_sufile_alloc(struct inode *, __u64 *); +int nilfs_sufile_cancel_free(struct inode *, __u64); +int nilfs_sufile_freev(struct inode *, __u64 *, size_t); +int nilfs_sufile_free(struct inode *, __u64); +int nilfs_sufile_get_segment_usage(struct inode *, __u64, + struct nilfs_segment_usage **, + struct buffer_head **); +void nilfs_sufile_put_segment_usage(struct inode *, __u64, + struct buffer_head *); +int nilfs_sufile_get_stat(struct inode *, struct nilfs_sustat *); +int nilfs_sufile_get_ncleansegs(struct inode *, unsigned long *); +int nilfs_sufile_set_error(struct inode *, __u64); +ssize_t nilfs_sufile_get_suinfo(struct inode *, __u64, struct nilfs_suinfo *, + size_t); + + +#endif /* _NILFS_SUFILE_H */ diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c new file mode 100644 index 000000000000..e117e1ea9bff --- /dev/null +++ b/fs/nilfs2/super.c @@ -0,0 +1,1323 @@ +/* + * super.c - NILFS module and super block management. + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi <ryusuke@osrg.net> + */ +/* + * linux/fs/ext2/super.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/inode.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + */ + +#include <linux/module.h> +#include <linux/string.h> +#include <linux/slab.h> +#include <linux/init.h> +#include <linux/blkdev.h> +#include <linux/parser.h> +#include <linux/random.h> +#include <linux/crc32.h> +#include <linux/smp_lock.h> +#include <linux/vfs.h> +#include <linux/writeback.h> +#include <linux/kobject.h> +#include <linux/exportfs.h> +#include "nilfs.h" +#include "mdt.h" +#include "alloc.h" +#include "page.h" +#include "cpfile.h" +#include "ifile.h" +#include "dat.h" +#include "segment.h" +#include "segbuf.h" + +MODULE_AUTHOR("NTT Corp."); +MODULE_DESCRIPTION("A New Implementation of the Log-structured Filesystem " + "(NILFS)"); +MODULE_VERSION(NILFS_VERSION); +MODULE_LICENSE("GPL"); + +static int nilfs_remount(struct super_block *sb, int *flags, char *data); +static int test_exclusive_mount(struct file_system_type *fs_type, + struct block_device *bdev, int flags); + +/** + * nilfs_error() - report failure condition on a filesystem + * + * nilfs_error() sets an ERROR_FS flag on the superblock as well as + * reporting an error message. It should be called when NILFS detects + * incoherences or defects of meta data on disk. As for sustainable + * errors such as a single-shot I/O error, nilfs_warning() or the printk() + * function should be used instead. + * + * The segment constructor must not call this function because it can + * kill itself. + */ +void nilfs_error(struct super_block *sb, const char *function, + const char *fmt, ...) +{ + struct nilfs_sb_info *sbi = NILFS_SB(sb); + va_list args; + + va_start(args, fmt); + printk(KERN_CRIT "NILFS error (device %s): %s: ", sb->s_id, function); + vprintk(fmt, args); + printk("\n"); + va_end(args); + + if (!(sb->s_flags & MS_RDONLY)) { + struct the_nilfs *nilfs = sbi->s_nilfs; + + if (!nilfs_test_opt(sbi, ERRORS_CONT)) + nilfs_detach_segment_constructor(sbi); + + down_write(&nilfs->ns_sem); + if (!(nilfs->ns_mount_state & NILFS_ERROR_FS)) { + nilfs->ns_mount_state |= NILFS_ERROR_FS; + nilfs->ns_sbp[0]->s_state |= + cpu_to_le16(NILFS_ERROR_FS); + nilfs_commit_super(sbi, 1); + } + up_write(&nilfs->ns_sem); + + if (nilfs_test_opt(sbi, ERRORS_RO)) { + printk(KERN_CRIT "Remounting filesystem read-only\n"); + sb->s_flags |= MS_RDONLY; + } + } + + if (nilfs_test_opt(sbi, ERRORS_PANIC)) + panic("NILFS (device %s): panic forced after error\n", + sb->s_id); +} + +void nilfs_warning(struct super_block *sb, const char *function, + const char *fmt, ...) +{ + va_list args; + + va_start(args, fmt); + printk(KERN_WARNING "NILFS warning (device %s): %s: ", + sb->s_id, function); + vprintk(fmt, args); + printk("\n"); + va_end(args); +} + +static struct kmem_cache *nilfs_inode_cachep; + +struct inode *nilfs_alloc_inode(struct super_block *sb) +{ + struct nilfs_inode_info *ii; + + ii = kmem_cache_alloc(nilfs_inode_cachep, GFP_NOFS); + if (!ii) + return NULL; + ii->i_bh = NULL; + ii->i_state = 0; + ii->vfs_inode.i_version = 1; + nilfs_btnode_cache_init(&ii->i_btnode_cache); + return &ii->vfs_inode; +} + +void nilfs_destroy_inode(struct inode *inode) +{ + kmem_cache_free(nilfs_inode_cachep, NILFS_I(inode)); +} + +static void init_once(void *obj) +{ + struct nilfs_inode_info *ii = obj; + + INIT_LIST_HEAD(&ii->i_dirty); +#ifdef CONFIG_NILFS_XATTR + init_rwsem(&ii->xattr_sem); +#endif + nilfs_btnode_cache_init_once(&ii->i_btnode_cache); + ii->i_bmap = (struct nilfs_bmap *)&ii->i_bmap_union; + inode_init_once(&ii->vfs_inode); +} + +static int nilfs_init_inode_cache(void) +{ + nilfs_inode_cachep = kmem_cache_create("nilfs2_inode_cache", + sizeof(struct nilfs_inode_info), + 0, SLAB_RECLAIM_ACCOUNT, + init_once); + + return (nilfs_inode_cachep == NULL) ? -ENOMEM : 0; +} + +static inline void nilfs_destroy_inode_cache(void) +{ + kmem_cache_destroy(nilfs_inode_cachep); +} + +static void nilfs_clear_inode(struct inode *inode) +{ + struct nilfs_inode_info *ii = NILFS_I(inode); + +#ifdef CONFIG_NILFS_POSIX_ACL + if (ii->i_acl && ii->i_acl != NILFS_ACL_NOT_CACHED) { + posix_acl_release(ii->i_acl); + ii->i_acl = NILFS_ACL_NOT_CACHED; + } + if (ii->i_default_acl && ii->i_default_acl != NILFS_ACL_NOT_CACHED) { + posix_acl_release(ii->i_default_acl); + ii->i_default_acl = NILFS_ACL_NOT_CACHED; + } +#endif + /* + * Free resources allocated in nilfs_read_inode(), here. + */ + BUG_ON(!list_empty(&ii->i_dirty)); + brelse(ii->i_bh); + ii->i_bh = NULL; + + if (test_bit(NILFS_I_BMAP, &ii->i_state)) + nilfs_bmap_clear(ii->i_bmap); + + nilfs_btnode_cache_clear(&ii->i_btnode_cache); +} + +static int nilfs_sync_super(struct nilfs_sb_info *sbi, int dupsb) +{ + struct the_nilfs *nilfs = sbi->s_nilfs; + int err; + int barrier_done = 0; + + if (nilfs_test_opt(sbi, BARRIER)) { + set_buffer_ordered(nilfs->ns_sbh[0]); + barrier_done = 1; + } + retry: + set_buffer_dirty(nilfs->ns_sbh[0]); + err = sync_dirty_buffer(nilfs->ns_sbh[0]); + if (err == -EOPNOTSUPP && barrier_done) { + nilfs_warning(sbi->s_super, __func__, + "barrier-based sync failed. " + "disabling barriers\n"); + nilfs_clear_opt(sbi, BARRIER); + barrier_done = 0; + clear_buffer_ordered(nilfs->ns_sbh[0]); + goto retry; + } + if (unlikely(err)) { + printk(KERN_ERR + "NILFS: unable to write superblock (err=%d)\n", err); + if (err == -EIO && nilfs->ns_sbh[1]) { + nilfs_fall_back_super_block(nilfs); + goto retry; + } + } else { + struct nilfs_super_block *sbp = nilfs->ns_sbp[0]; + + /* + * The latest segment becomes trailable from the position + * written in superblock. + */ + clear_nilfs_discontinued(nilfs); + + /* update GC protection for recent segments */ + if (nilfs->ns_sbh[1]) { + sbp = NULL; + if (dupsb) { + set_buffer_dirty(nilfs->ns_sbh[1]); + if (!sync_dirty_buffer(nilfs->ns_sbh[1])) + sbp = nilfs->ns_sbp[1]; + } + } + if (sbp) { + spin_lock(&nilfs->ns_last_segment_lock); + nilfs->ns_prot_seq = le64_to_cpu(sbp->s_last_seq); + spin_unlock(&nilfs->ns_last_segment_lock); + } + } + + return err; +} + +int nilfs_commit_super(struct nilfs_sb_info *sbi, int dupsb) +{ + struct the_nilfs *nilfs = sbi->s_nilfs; + struct nilfs_super_block **sbp = nilfs->ns_sbp; + sector_t nfreeblocks; + time_t t; + int err; + + /* nilfs->sem must be locked by the caller. */ + if (sbp[0]->s_magic != NILFS_SUPER_MAGIC) { + if (sbp[1] && sbp[1]->s_magic == NILFS_SUPER_MAGIC) + nilfs_swap_super_block(nilfs); + else { + printk(KERN_CRIT "NILFS: superblock broke on dev %s\n", + sbi->s_super->s_id); + return -EIO; + } + } + err = nilfs_count_free_blocks(nilfs, &nfreeblocks); + if (unlikely(err)) { + printk(KERN_ERR "NILFS: failed to count free blocks\n"); + return err; + } + spin_lock(&nilfs->ns_last_segment_lock); + sbp[0]->s_last_seq = cpu_to_le64(nilfs->ns_last_seq); + sbp[0]->s_last_pseg = cpu_to_le64(nilfs->ns_last_pseg); + sbp[0]->s_last_cno = cpu_to_le64(nilfs->ns_last_cno); + spin_unlock(&nilfs->ns_last_segment_lock); + + t = get_seconds(); + nilfs->ns_sbwtime[0] = t; + sbp[0]->s_free_blocks_count = cpu_to_le64(nfreeblocks); + sbp[0]->s_wtime = cpu_to_le64(t); + sbp[0]->s_sum = 0; + sbp[0]->s_sum = cpu_to_le32(crc32_le(nilfs->ns_crc_seed, + (unsigned char *)sbp[0], + nilfs->ns_sbsize)); + if (dupsb && sbp[1]) { + memcpy(sbp[1], sbp[0], nilfs->ns_sbsize); + nilfs->ns_sbwtime[1] = t; + } + sbi->s_super->s_dirt = 0; + return nilfs_sync_super(sbi, dupsb); +} + +static void nilfs_put_super(struct super_block *sb) +{ + struct nilfs_sb_info *sbi = NILFS_SB(sb); + struct the_nilfs *nilfs = sbi->s_nilfs; + + nilfs_detach_segment_constructor(sbi); + + if (!(sb->s_flags & MS_RDONLY)) { + down_write(&nilfs->ns_sem); + nilfs->ns_sbp[0]->s_state = cpu_to_le16(nilfs->ns_mount_state); + nilfs_commit_super(sbi, 1); + up_write(&nilfs->ns_sem); + } + + nilfs_detach_checkpoint(sbi); + put_nilfs(sbi->s_nilfs); + sbi->s_super = NULL; + sb->s_fs_info = NULL; + kfree(sbi); +} + +/** + * nilfs_write_super - write super block(s) of NILFS + * @sb: super_block + * + * nilfs_write_super() gets a fs-dependent lock, writes super block(s), and + * clears s_dirt. This function is called in the section protected by + * lock_super(). + * + * The s_dirt flag is managed by each filesystem and we protect it by ns_sem + * of the struct the_nilfs. Lock order must be as follows: + * + * 1. lock_super() + * 2. down_write(&nilfs->ns_sem) + * + * Inside NILFS, locking ns_sem is enough to protect s_dirt and the buffer + * of the super block (nilfs->ns_sbp[]). + * + * In most cases, VFS functions call lock_super() before calling these + * methods. So we must be careful not to bring on deadlocks when using + * lock_super(); see generic_shutdown_super(), write_super(), and so on. + * + * Note that order of lock_kernel() and lock_super() depends on contexts + * of VFS. We should also note that lock_kernel() can be used in its + * protective section and only the outermost one has an effect. + */ +static void nilfs_write_super(struct super_block *sb) +{ + struct nilfs_sb_info *sbi = NILFS_SB(sb); + struct the_nilfs *nilfs = sbi->s_nilfs; + + down_write(&nilfs->ns_sem); + if (!(sb->s_flags & MS_RDONLY)) { + struct nilfs_super_block **sbp = nilfs->ns_sbp; + u64 t = get_seconds(); + int dupsb; + + if (!nilfs_discontinued(nilfs) && t >= nilfs->ns_sbwtime[0] && + t < nilfs->ns_sbwtime[0] + NILFS_SB_FREQ) { + up_write(&nilfs->ns_sem); + return; + } + dupsb = sbp[1] && t > nilfs->ns_sbwtime[1] + NILFS_ALTSB_FREQ; + nilfs_commit_super(sbi, dupsb); + } + sb->s_dirt = 0; + up_write(&nilfs->ns_sem); +} + +static int nilfs_sync_fs(struct super_block *sb, int wait) +{ + int err = 0; + + /* This function is called when super block should be written back */ + if (wait) + err = nilfs_construct_segment(sb); + return err; +} + +int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno) +{ + struct the_nilfs *nilfs = sbi->s_nilfs; + struct nilfs_checkpoint *raw_cp; + struct buffer_head *bh_cp; + int err; + + down_write(&nilfs->ns_sem); + list_add(&sbi->s_list, &nilfs->ns_supers); + up_write(&nilfs->ns_sem); + + sbi->s_ifile = nilfs_mdt_new( + nilfs, sbi->s_super, NILFS_IFILE_INO, NILFS_IFILE_GFP); + if (!sbi->s_ifile) + return -ENOMEM; + + err = nilfs_palloc_init_blockgroup(sbi->s_ifile, nilfs->ns_inode_size); + if (unlikely(err)) + goto failed; + + err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, cno, 0, &raw_cp, + &bh_cp); + if (unlikely(err)) { + if (err == -ENOENT || err == -EINVAL) { + printk(KERN_ERR + "NILFS: Invalid checkpoint " + "(checkpoint number=%llu)\n", + (unsigned long long)cno); + err = -EINVAL; + } + goto failed; + } + err = nilfs_read_inode_common(sbi->s_ifile, &raw_cp->cp_ifile_inode); + if (unlikely(err)) + goto failed_bh; + atomic_set(&sbi->s_inodes_count, le64_to_cpu(raw_cp->cp_inodes_count)); + atomic_set(&sbi->s_blocks_count, le64_to_cpu(raw_cp->cp_blocks_count)); + + nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, cno, bh_cp); + return 0; + + failed_bh: + nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, cno, bh_cp); + failed: + nilfs_mdt_destroy(sbi->s_ifile); + sbi->s_ifile = NULL; + + down_write(&nilfs->ns_sem); + list_del_init(&sbi->s_list); + up_write(&nilfs->ns_sem); + + return err; +} + +void nilfs_detach_checkpoint(struct nilfs_sb_info *sbi) +{ + struct the_nilfs *nilfs = sbi->s_nilfs; + + nilfs_mdt_clear(sbi->s_ifile); + nilfs_mdt_destroy(sbi->s_ifile); + sbi->s_ifile = NULL; + down_write(&nilfs->ns_sem); + list_del_init(&sbi->s_list); + up_write(&nilfs->ns_sem); +} + +static int nilfs_mark_recovery_complete(struct nilfs_sb_info *sbi) +{ + struct the_nilfs *nilfs = sbi->s_nilfs; + int err = 0; + + down_write(&nilfs->ns_sem); + if (!(nilfs->ns_mount_state & NILFS_VALID_FS)) { + nilfs->ns_mount_state |= NILFS_VALID_FS; + err = nilfs_commit_super(sbi, 1); + if (likely(!err)) + printk(KERN_INFO "NILFS: recovery complete.\n"); + } + up_write(&nilfs->ns_sem); + return err; +} + +static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + struct nilfs_sb_info *sbi = NILFS_SB(sb); + unsigned long long blocks; + unsigned long overhead; + unsigned long nrsvblocks; + sector_t nfreeblocks; + struct the_nilfs *nilfs = sbi->s_nilfs; + int err; + + /* + * Compute all of the segment blocks + * + * The blocks before first segment and after last segment + * are excluded. + */ + blocks = nilfs->ns_blocks_per_segment * nilfs->ns_nsegments + - nilfs->ns_first_data_block; + nrsvblocks = nilfs->ns_nrsvsegs * nilfs->ns_blocks_per_segment; + + /* + * Compute the overhead + * + * When distributing meta data blocks outside semgent structure, + * We must count them as the overhead. + */ + overhead = 0; + + err = nilfs_count_free_blocks(nilfs, &nfreeblocks); + if (unlikely(err)) + return err; + + buf->f_type = NILFS_SUPER_MAGIC; + buf->f_bsize = sb->s_blocksize; + buf->f_blocks = blocks - overhead; + buf->f_bfree = nfreeblocks; + buf->f_bavail = (buf->f_bfree >= nrsvblocks) ? + (buf->f_bfree - nrsvblocks) : 0; + buf->f_files = atomic_read(&sbi->s_inodes_count); + buf->f_ffree = 0; /* nilfs_count_free_inodes(sb); */ + buf->f_namelen = NILFS_NAME_LEN; + return 0; +} + +static struct super_operations nilfs_sops = { + .alloc_inode = nilfs_alloc_inode, + .destroy_inode = nilfs_destroy_inode, + .dirty_inode = nilfs_dirty_inode, + /* .write_inode = nilfs_write_inode, */ + /* .put_inode = nilfs_put_inode, */ + /* .drop_inode = nilfs_drop_inode, */ + .delete_inode = nilfs_delete_inode, + .put_super = nilfs_put_super, + .write_super = nilfs_write_super, + .sync_fs = nilfs_sync_fs, + /* .write_super_lockfs */ + /* .unlockfs */ + .statfs = nilfs_statfs, + .remount_fs = nilfs_remount, + .clear_inode = nilfs_clear_inode, + /* .umount_begin */ + /* .show_options */ +}; + +static struct inode * +nilfs_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation) +{ + struct inode *inode; + + if (ino < NILFS_FIRST_INO(sb) && ino != NILFS_ROOT_INO && + ino != NILFS_SKETCH_INO) + return ERR_PTR(-ESTALE); + + inode = nilfs_iget(sb, ino); + if (IS_ERR(inode)) + return ERR_CAST(inode); + if (generation && inode->i_generation != generation) { + iput(inode); + return ERR_PTR(-ESTALE); + } + + return inode; +} + +static struct dentry * +nilfs_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, + int fh_type) +{ + return generic_fh_to_dentry(sb, fid, fh_len, fh_type, + nilfs_nfs_get_inode); +} + +static struct dentry * +nilfs_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len, + int fh_type) +{ + return generic_fh_to_parent(sb, fid, fh_len, fh_type, + nilfs_nfs_get_inode); +} + +static struct export_operations nilfs_export_ops = { + .fh_to_dentry = nilfs_fh_to_dentry, + .fh_to_parent = nilfs_fh_to_parent, + .get_parent = nilfs_get_parent, +}; + +enum { + Opt_err_cont, Opt_err_panic, Opt_err_ro, + Opt_barrier, Opt_snapshot, Opt_order, + Opt_err, +}; + +static match_table_t tokens = { + {Opt_err_cont, "errors=continue"}, + {Opt_err_panic, "errors=panic"}, + {Opt_err_ro, "errors=remount-ro"}, + {Opt_barrier, "barrier=%s"}, + {Opt_snapshot, "cp=%u"}, + {Opt_order, "order=%s"}, + {Opt_err, NULL} +}; + +static int match_bool(substring_t *s, int *result) +{ + int len = s->to - s->from; + + if (strncmp(s->from, "on", len) == 0) + *result = 1; + else if (strncmp(s->from, "off", len) == 0) + *result = 0; + else + return 1; + return 0; +} + +static int parse_options(char *options, struct super_block *sb) +{ + struct nilfs_sb_info *sbi = NILFS_SB(sb); + char *p; + substring_t args[MAX_OPT_ARGS]; + int option; + + if (!options) + return 1; + + while ((p = strsep(&options, ",")) != NULL) { + int token; + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_barrier: + if (match_bool(&args[0], &option)) + return 0; + if (option) + nilfs_set_opt(sbi, BARRIER); + else + nilfs_clear_opt(sbi, BARRIER); + break; + case Opt_order: + if (strcmp(args[0].from, "relaxed") == 0) + /* Ordered data semantics */ + nilfs_clear_opt(sbi, STRICT_ORDER); + else if (strcmp(args[0].from, "strict") == 0) + /* Strict in-order semantics */ + nilfs_set_opt(sbi, STRICT_ORDER); + else + return 0; + break; + case Opt_err_panic: + nilfs_write_opt(sbi, ERROR_MODE, ERRORS_PANIC); + break; + case Opt_err_ro: + nilfs_write_opt(sbi, ERROR_MODE, ERRORS_RO); + break; + case Opt_err_cont: + nilfs_write_opt(sbi, ERROR_MODE, ERRORS_CONT); + break; + case Opt_snapshot: + if (match_int(&args[0], &option) || option <= 0) + return 0; + if (!(sb->s_flags & MS_RDONLY)) + return 0; + sbi->s_snapshot_cno = option; + nilfs_set_opt(sbi, SNAPSHOT); + break; + default: + printk(KERN_ERR + "NILFS: Unrecognized mount option \"%s\"\n", p); + return 0; + } + } + return 1; +} + +static inline void +nilfs_set_default_options(struct nilfs_sb_info *sbi, + struct nilfs_super_block *sbp) +{ + sbi->s_mount_opt = + NILFS_MOUNT_ERRORS_CONT | NILFS_MOUNT_BARRIER; +} + +static int nilfs_setup_super(struct nilfs_sb_info *sbi) +{ + struct the_nilfs *nilfs = sbi->s_nilfs; + struct nilfs_super_block *sbp = nilfs->ns_sbp[0]; + int max_mnt_count = le16_to_cpu(sbp->s_max_mnt_count); + int mnt_count = le16_to_cpu(sbp->s_mnt_count); + + /* nilfs->sem must be locked by the caller. */ + if (!(nilfs->ns_mount_state & NILFS_VALID_FS)) { + printk(KERN_WARNING "NILFS warning: mounting unchecked fs\n"); + } else if (nilfs->ns_mount_state & NILFS_ERROR_FS) { + printk(KERN_WARNING + "NILFS warning: mounting fs with errors\n"); +#if 0 + } else if (max_mnt_count >= 0 && mnt_count >= max_mnt_count) { + printk(KERN_WARNING + "NILFS warning: maximal mount count reached\n"); +#endif + } + if (!max_mnt_count) + sbp->s_max_mnt_count = cpu_to_le16(NILFS_DFL_MAX_MNT_COUNT); + + sbp->s_mnt_count = cpu_to_le16(mnt_count + 1); + sbp->s_state = cpu_to_le16(le16_to_cpu(sbp->s_state) & ~NILFS_VALID_FS); + sbp->s_mtime = cpu_to_le64(get_seconds()); + return nilfs_commit_super(sbi, 1); +} + +struct nilfs_super_block *nilfs_read_super_block(struct super_block *sb, + u64 pos, int blocksize, + struct buffer_head **pbh) +{ + unsigned long long sb_index = pos; + unsigned long offset; + + offset = do_div(sb_index, blocksize); + *pbh = sb_bread(sb, sb_index); + if (!*pbh) + return NULL; + return (struct nilfs_super_block *)((char *)(*pbh)->b_data + offset); +} + +int nilfs_store_magic_and_option(struct super_block *sb, + struct nilfs_super_block *sbp, + char *data) +{ + struct nilfs_sb_info *sbi = NILFS_SB(sb); + + sb->s_magic = le16_to_cpu(sbp->s_magic); + + /* FS independent flags */ +#ifdef NILFS_ATIME_DISABLE + sb->s_flags |= MS_NOATIME; +#endif + + nilfs_set_default_options(sbi, sbp); + + sbi->s_resuid = le16_to_cpu(sbp->s_def_resuid); + sbi->s_resgid = le16_to_cpu(sbp->s_def_resgid); + sbi->s_interval = le32_to_cpu(sbp->s_c_interval); + sbi->s_watermark = le32_to_cpu(sbp->s_c_block_max); + + return !parse_options(data, sb) ? -EINVAL : 0 ; +} + +/** + * nilfs_fill_super() - initialize a super block instance + * @sb: super_block + * @data: mount options + * @silent: silent mode flag + * @nilfs: the_nilfs struct + * + * This function is called exclusively by bd_mount_mutex. + * So, the recovery process is protected from other simultaneous mounts. + */ +static int +nilfs_fill_super(struct super_block *sb, void *data, int silent, + struct the_nilfs *nilfs) +{ + struct nilfs_sb_info *sbi; + struct inode *root; + __u64 cno; + int err; + + sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); + if (!sbi) + return -ENOMEM; + + sb->s_fs_info = sbi; + + get_nilfs(nilfs); + sbi->s_nilfs = nilfs; + sbi->s_super = sb; + + err = init_nilfs(nilfs, sbi, (char *)data); + if (err) + goto failed_sbi; + + spin_lock_init(&sbi->s_inode_lock); + INIT_LIST_HEAD(&sbi->s_dirty_files); + INIT_LIST_HEAD(&sbi->s_list); + + /* + * Following initialization is overlapped because + * nilfs_sb_info structure has been cleared at the beginning. + * But we reserve them to keep our interest and make ready + * for the future change. + */ + get_random_bytes(&sbi->s_next_generation, + sizeof(sbi->s_next_generation)); + spin_lock_init(&sbi->s_next_gen_lock); + + sb->s_op = &nilfs_sops; + sb->s_export_op = &nilfs_export_ops; + sb->s_root = NULL; + sb->s_time_gran = 1; + + if (!nilfs_loaded(nilfs)) { + err = load_nilfs(nilfs, sbi); + if (err) + goto failed_sbi; + } + cno = nilfs_last_cno(nilfs); + + if (sb->s_flags & MS_RDONLY) { + if (nilfs_test_opt(sbi, SNAPSHOT)) { + err = nilfs_cpfile_is_snapshot(nilfs->ns_cpfile, + sbi->s_snapshot_cno); + if (err < 0) + goto failed_sbi; + if (!err) { + printk(KERN_ERR + "NILFS: The specified checkpoint is " + "not a snapshot " + "(checkpoint number=%llu).\n", + (unsigned long long)sbi->s_snapshot_cno); + err = -EINVAL; + goto failed_sbi; + } + cno = sbi->s_snapshot_cno; + } else + /* Read-only mount */ + sbi->s_snapshot_cno = cno; + } + + err = nilfs_attach_checkpoint(sbi, cno); + if (err) { + printk(KERN_ERR "NILFS: error loading a checkpoint" + " (checkpoint number=%llu).\n", (unsigned long long)cno); + goto failed_sbi; + } + + if (!(sb->s_flags & MS_RDONLY)) { + err = nilfs_attach_segment_constructor(sbi); + if (err) + goto failed_checkpoint; + } + + root = nilfs_iget(sb, NILFS_ROOT_INO); + if (IS_ERR(root)) { + printk(KERN_ERR "NILFS: get root inode failed\n"); + err = PTR_ERR(root); + goto failed_segctor; + } + if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { + iput(root); + printk(KERN_ERR "NILFS: corrupt root inode.\n"); + err = -EINVAL; + goto failed_segctor; + } + sb->s_root = d_alloc_root(root); + if (!sb->s_root) { + iput(root); + printk(KERN_ERR "NILFS: get root dentry failed\n"); + err = -ENOMEM; + goto failed_segctor; + } + + if (!(sb->s_flags & MS_RDONLY)) { + down_write(&nilfs->ns_sem); + nilfs_setup_super(sbi); + up_write(&nilfs->ns_sem); + } + + err = nilfs_mark_recovery_complete(sbi); + if (unlikely(err)) { + printk(KERN_ERR "NILFS: recovery failed.\n"); + goto failed_root; + } + + return 0; + + failed_root: + dput(sb->s_root); + sb->s_root = NULL; + + failed_segctor: + nilfs_detach_segment_constructor(sbi); + + failed_checkpoint: + nilfs_detach_checkpoint(sbi); + + failed_sbi: + put_nilfs(nilfs); + sb->s_fs_info = NULL; + kfree(sbi); + return err; +} + +static int nilfs_remount(struct super_block *sb, int *flags, char *data) +{ + struct nilfs_sb_info *sbi = NILFS_SB(sb); + struct nilfs_super_block *sbp; + struct the_nilfs *nilfs = sbi->s_nilfs; + unsigned long old_sb_flags; + struct nilfs_mount_options old_opts; + int err; + + old_sb_flags = sb->s_flags; + old_opts.mount_opt = sbi->s_mount_opt; + old_opts.snapshot_cno = sbi->s_snapshot_cno; + + if (!parse_options(data, sb)) { + err = -EINVAL; + goto restore_opts; + } + sb->s_flags = (sb->s_flags & ~MS_POSIXACL); + + if ((*flags & MS_RDONLY) && + sbi->s_snapshot_cno != old_opts.snapshot_cno) { + printk(KERN_WARNING "NILFS (device %s): couldn't " + "remount to a different snapshot. \n", + sb->s_id); + err = -EINVAL; + goto restore_opts; + } + + if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) + goto out; + if (*flags & MS_RDONLY) { + /* Shutting down the segment constructor */ + nilfs_detach_segment_constructor(sbi); + sb->s_flags |= MS_RDONLY; + + sbi->s_snapshot_cno = nilfs_last_cno(nilfs); + /* nilfs_set_opt(sbi, SNAPSHOT); */ + + /* + * Remounting a valid RW partition RDONLY, so set + * the RDONLY flag and then mark the partition as valid again. + */ + down_write(&nilfs->ns_sem); + sbp = nilfs->ns_sbp[0]; + if (!(sbp->s_state & le16_to_cpu(NILFS_VALID_FS)) && + (nilfs->ns_mount_state & NILFS_VALID_FS)) + sbp->s_state = cpu_to_le16(nilfs->ns_mount_state); + sbp->s_mtime = cpu_to_le64(get_seconds()); + nilfs_commit_super(sbi, 1); + up_write(&nilfs->ns_sem); + } else { + /* + * Mounting a RDONLY partition read-write, so reread and + * store the current valid flag. (It may have been changed + * by fsck since we originally mounted the partition.) + */ + down(&sb->s_bdev->bd_mount_sem); + /* Check existing RW-mount */ + if (test_exclusive_mount(sb->s_type, sb->s_bdev, 0)) { + printk(KERN_WARNING "NILFS (device %s): couldn't " + "remount because a RW-mount exists.\n", + sb->s_id); + err = -EBUSY; + goto rw_remount_failed; + } + if (sbi->s_snapshot_cno != nilfs_last_cno(nilfs)) { + printk(KERN_WARNING "NILFS (device %s): couldn't " + "remount because the current RO-mount is not " + "the latest one.\n", + sb->s_id); + err = -EINVAL; + goto rw_remount_failed; + } + sb->s_flags &= ~MS_RDONLY; + nilfs_clear_opt(sbi, SNAPSHOT); + sbi->s_snapshot_cno = 0; + + err = nilfs_attach_segment_constructor(sbi); + if (err) + goto rw_remount_failed; + + down_write(&nilfs->ns_sem); + nilfs_setup_super(sbi); + up_write(&nilfs->ns_sem); + + up(&sb->s_bdev->bd_mount_sem); + } + out: + return 0; + + rw_remount_failed: + up(&sb->s_bdev->bd_mount_sem); + restore_opts: + sb->s_flags = old_sb_flags; + sbi->s_mount_opt = old_opts.mount_opt; + sbi->s_snapshot_cno = old_opts.snapshot_cno; + return err; +} + +struct nilfs_super_data { + struct block_device *bdev; + __u64 cno; + int flags; +}; + +/** + * nilfs_identify - pre-read mount options needed to identify mount instance + * @data: mount options + * @sd: nilfs_super_data + */ +static int nilfs_identify(char *data, struct nilfs_super_data *sd) +{ + char *p, *options = data; + substring_t args[MAX_OPT_ARGS]; + int option, token; + int ret = 0; + + do { + p = strsep(&options, ","); + if (p != NULL && *p) { + token = match_token(p, tokens, args); + if (token == Opt_snapshot) { + if (!(sd->flags & MS_RDONLY)) + ret++; + else { + ret = match_int(&args[0], &option); + if (!ret) { + if (option > 0) + sd->cno = option; + else + ret++; + } + } + } + if (ret) + printk(KERN_ERR + "NILFS: invalid mount option: %s\n", p); + } + if (!options) + break; + BUG_ON(options == data); + *(options - 1) = ','; + } while (!ret); + return ret; +} + +static int nilfs_set_bdev_super(struct super_block *s, void *data) +{ + struct nilfs_super_data *sd = data; + + s->s_bdev = sd->bdev; + s->s_dev = s->s_bdev->bd_dev; + return 0; +} + +static int nilfs_test_bdev_super(struct super_block *s, void *data) +{ + struct nilfs_super_data *sd = data; + + return s->s_bdev == sd->bdev; +} + +static int nilfs_test_bdev_super2(struct super_block *s, void *data) +{ + struct nilfs_super_data *sd = data; + int ret; + + if (s->s_bdev != sd->bdev) + return 0; + + if (!((s->s_flags | sd->flags) & MS_RDONLY)) + return 1; /* Reuse an old R/W-mode super_block */ + + if (s->s_flags & sd->flags & MS_RDONLY) { + if (down_read_trylock(&s->s_umount)) { + ret = s->s_root && + (sd->cno == NILFS_SB(s)->s_snapshot_cno); + up_read(&s->s_umount); + /* + * This path is locked with sb_lock by sget(). + * So, drop_super() causes deadlock. + */ + return ret; + } + } + return 0; +} + +static int +nilfs_get_sb(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data, struct vfsmount *mnt) +{ + struct nilfs_super_data sd; + struct super_block *s, *s2; + struct the_nilfs *nilfs = NULL; + int err, need_to_close = 1; + + sd.bdev = open_bdev_exclusive(dev_name, flags, fs_type); + if (IS_ERR(sd.bdev)) + return PTR_ERR(sd.bdev); + + /* + * To get mount instance using sget() vfs-routine, NILFS needs + * much more information than normal filesystems to identify mount + * instance. For snapshot mounts, not only a mount type (ro-mount + * or rw-mount) but also a checkpoint number is required. + * The results are passed in sget() using nilfs_super_data. + */ + sd.cno = 0; + sd.flags = flags; + if (nilfs_identify((char *)data, &sd)) { + err = -EINVAL; + goto failed; + } + + /* + * once the super is inserted into the list by sget, s_umount + * will protect the lockfs code from trying to start a snapshot + * while we are mounting + */ + down(&sd.bdev->bd_mount_sem); + if (!sd.cno && + (err = test_exclusive_mount(fs_type, sd.bdev, flags ^ MS_RDONLY))) { + err = (err < 0) ? : -EBUSY; + goto failed_unlock; + } + + /* + * Phase-1: search any existent instance and get the_nilfs + */ + s = sget(fs_type, nilfs_test_bdev_super, nilfs_set_bdev_super, &sd); + if (IS_ERR(s)) + goto error_s; + + if (!s->s_root) { + err = -ENOMEM; + nilfs = alloc_nilfs(sd.bdev); + if (!nilfs) + goto cancel_new; + } else { + struct nilfs_sb_info *sbi = NILFS_SB(s); + + /* + * s_umount protects super_block from unmount process; + * It covers pointers of nilfs_sb_info and the_nilfs. + */ + nilfs = sbi->s_nilfs; + get_nilfs(nilfs); + up_write(&s->s_umount); + + /* + * Phase-2: search specified snapshot or R/W mode super_block + */ + if (!sd.cno) + /* trying to get the latest checkpoint. */ + sd.cno = nilfs_last_cno(nilfs); + + s2 = sget(fs_type, nilfs_test_bdev_super2, + nilfs_set_bdev_super, &sd); + deactivate_super(s); + /* + * Although deactivate_super() invokes close_bdev_exclusive() at + * kill_block_super(). Here, s is an existent mount; we need + * one more close_bdev_exclusive() call. + */ + s = s2; + if (IS_ERR(s)) + goto error_s; + } + + if (!s->s_root) { + char b[BDEVNAME_SIZE]; + + s->s_flags = flags; + strlcpy(s->s_id, bdevname(sd.bdev, b), sizeof(s->s_id)); + sb_set_blocksize(s, block_size(sd.bdev)); + + err = nilfs_fill_super(s, data, flags & MS_VERBOSE, nilfs); + if (err) + goto cancel_new; + + s->s_flags |= MS_ACTIVE; + need_to_close = 0; + } else if (!(s->s_flags & MS_RDONLY)) { + err = -EBUSY; + } + + up(&sd.bdev->bd_mount_sem); + put_nilfs(nilfs); + if (need_to_close) + close_bdev_exclusive(sd.bdev, flags); + simple_set_mnt(mnt, s); + return 0; + + error_s: + up(&sd.bdev->bd_mount_sem); + if (nilfs) + put_nilfs(nilfs); + close_bdev_exclusive(sd.bdev, flags); + return PTR_ERR(s); + + failed_unlock: + up(&sd.bdev->bd_mount_sem); + failed: + close_bdev_exclusive(sd.bdev, flags); + + return err; + + cancel_new: + /* Abandoning the newly allocated superblock */ + up(&sd.bdev->bd_mount_sem); + if (nilfs) + put_nilfs(nilfs); + up_write(&s->s_umount); + deactivate_super(s); + /* + * deactivate_super() invokes close_bdev_exclusive(). + * We must finish all post-cleaning before this call; + * put_nilfs() and unlocking bd_mount_sem need the block device. + */ + return err; +} + +static int nilfs_test_bdev_super3(struct super_block *s, void *data) +{ + struct nilfs_super_data *sd = data; + int ret; + + if (s->s_bdev != sd->bdev) + return 0; + if (down_read_trylock(&s->s_umount)) { + ret = (s->s_flags & MS_RDONLY) && s->s_root && + nilfs_test_opt(NILFS_SB(s), SNAPSHOT); + up_read(&s->s_umount); + if (ret) + return 0; /* ignore snapshot mounts */ + } + return !((sd->flags ^ s->s_flags) & MS_RDONLY); +} + +static int __false_bdev_super(struct super_block *s, void *data) +{ +#if 0 /* XXX: workaround for lock debug. This is not good idea */ + up_write(&s->s_umount); +#endif + return -EFAULT; +} + +/** + * test_exclusive_mount - check whether an exclusive RW/RO mount exists or not. + * fs_type: filesystem type + * bdev: block device + * flag: 0 (check rw-mount) or MS_RDONLY (check ro-mount) + * res: pointer to an integer to store result + * + * This function must be called within a section protected by bd_mount_mutex. + */ +static int test_exclusive_mount(struct file_system_type *fs_type, + struct block_device *bdev, int flags) +{ + struct super_block *s; + struct nilfs_super_data sd = { .flags = flags, .bdev = bdev }; + + s = sget(fs_type, nilfs_test_bdev_super3, __false_bdev_super, &sd); + if (IS_ERR(s)) { + if (PTR_ERR(s) != -EFAULT) + return PTR_ERR(s); + return 0; /* Not found */ + } + up_write(&s->s_umount); + deactivate_super(s); + return 1; /* Found */ +} + +struct file_system_type nilfs_fs_type = { + .owner = THIS_MODULE, + .name = "nilfs2", + .get_sb = nilfs_get_sb, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; + +static int __init init_nilfs_fs(void) +{ + int err; + + err = nilfs_init_inode_cache(); + if (err) + goto failed; + + err = nilfs_init_transaction_cache(); + if (err) + goto failed_inode_cache; + + err = nilfs_init_segbuf_cache(); + if (err) + goto failed_transaction_cache; + + err = nilfs_btree_path_cache_init(); + if (err) + goto failed_segbuf_cache; + + err = register_filesystem(&nilfs_fs_type); + if (err) + goto failed_btree_path_cache; + + return 0; + + failed_btree_path_cache: + nilfs_btree_path_cache_destroy(); + + failed_segbuf_cache: + nilfs_destroy_segbuf_cache(); + + failed_transaction_cache: + nilfs_destroy_transaction_cache(); + + failed_inode_cache: + nilfs_destroy_inode_cache(); + + failed: + return err; +} + +static void __exit exit_nilfs_fs(void) +{ + nilfs_destroy_segbuf_cache(); + nilfs_destroy_transaction_cache(); + nilfs_destroy_inode_cache(); + nilfs_btree_path_cache_destroy(); + unregister_filesystem(&nilfs_fs_type); +} + +module_init(init_nilfs_fs) +module_exit(exit_nilfs_fs) diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c new file mode 100644 index 000000000000..33400cf0bbe2 --- /dev/null +++ b/fs/nilfs2/the_nilfs.c @@ -0,0 +1,637 @@ +/* + * the_nilfs.c - the_nilfs shared structure. + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi <ryusuke@osrg.net> + * + */ + +#include <linux/buffer_head.h> +#include <linux/slab.h> +#include <linux/blkdev.h> +#include <linux/backing-dev.h> +#include <linux/crc32.h> +#include "nilfs.h" +#include "segment.h" +#include "alloc.h" +#include "cpfile.h" +#include "sufile.h" +#include "dat.h" +#include "seglist.h" +#include "segbuf.h" + +void nilfs_set_last_segment(struct the_nilfs *nilfs, + sector_t start_blocknr, u64 seq, __u64 cno) +{ + spin_lock(&nilfs->ns_last_segment_lock); + nilfs->ns_last_pseg = start_blocknr; + nilfs->ns_last_seq = seq; + nilfs->ns_last_cno = cno; + spin_unlock(&nilfs->ns_last_segment_lock); +} + +/** + * alloc_nilfs - allocate the_nilfs structure + * @bdev: block device to which the_nilfs is related + * + * alloc_nilfs() allocates memory for the_nilfs and + * initializes its reference count and locks. + * + * Return Value: On success, pointer to the_nilfs is returned. + * On error, NULL is returned. + */ +struct the_nilfs *alloc_nilfs(struct block_device *bdev) +{ + struct the_nilfs *nilfs; + + nilfs = kzalloc(sizeof(*nilfs), GFP_KERNEL); + if (!nilfs) + return NULL; + + nilfs->ns_bdev = bdev; + atomic_set(&nilfs->ns_count, 1); + atomic_set(&nilfs->ns_writer_refcount, -1); + atomic_set(&nilfs->ns_ndirtyblks, 0); + init_rwsem(&nilfs->ns_sem); + mutex_init(&nilfs->ns_writer_mutex); + INIT_LIST_HEAD(&nilfs->ns_supers); + spin_lock_init(&nilfs->ns_last_segment_lock); + nilfs->ns_gc_inodes_h = NULL; + init_rwsem(&nilfs->ns_segctor_sem); + + return nilfs; +} + +/** + * put_nilfs - release a reference to the_nilfs + * @nilfs: the_nilfs structure to be released + * + * put_nilfs() decrements a reference counter of the_nilfs. + * If the reference count reaches zero, the_nilfs is freed. + */ +void put_nilfs(struct the_nilfs *nilfs) +{ + if (!atomic_dec_and_test(&nilfs->ns_count)) + return; + /* + * Increment of ns_count never occur below because the caller + * of get_nilfs() holds at least one reference to the_nilfs. + * Thus its exclusion control is not required here. + */ + might_sleep(); + if (nilfs_loaded(nilfs)) { + nilfs_mdt_clear(nilfs->ns_sufile); + nilfs_mdt_destroy(nilfs->ns_sufile); + nilfs_mdt_clear(nilfs->ns_cpfile); + nilfs_mdt_destroy(nilfs->ns_cpfile); + nilfs_mdt_clear(nilfs->ns_dat); + nilfs_mdt_destroy(nilfs->ns_dat); + /* XXX: how and when to clear nilfs->ns_gc_dat? */ + nilfs_mdt_destroy(nilfs->ns_gc_dat); + } + if (nilfs_init(nilfs)) { + nilfs_destroy_gccache(nilfs); + brelse(nilfs->ns_sbh[0]); + brelse(nilfs->ns_sbh[1]); + } + kfree(nilfs); +} + +static int nilfs_load_super_root(struct the_nilfs *nilfs, + struct nilfs_sb_info *sbi, sector_t sr_block) +{ + struct buffer_head *bh_sr; + struct nilfs_super_root *raw_sr; + struct nilfs_super_block **sbp = nilfs->ns_sbp; + unsigned dat_entry_size, segment_usage_size, checkpoint_size; + unsigned inode_size; + int err; + + err = nilfs_read_super_root_block(sbi->s_super, sr_block, &bh_sr, 1); + if (unlikely(err)) + return err; + + down_read(&nilfs->ns_sem); + dat_entry_size = le16_to_cpu(sbp[0]->s_dat_entry_size); + checkpoint_size = le16_to_cpu(sbp[0]->s_checkpoint_size); + segment_usage_size = le16_to_cpu(sbp[0]->s_segment_usage_size); + up_read(&nilfs->ns_sem); + + inode_size = nilfs->ns_inode_size; + + err = -ENOMEM; + nilfs->ns_dat = nilfs_mdt_new( + nilfs, NULL, NILFS_DAT_INO, NILFS_DAT_GFP); + if (unlikely(!nilfs->ns_dat)) + goto failed; + + nilfs->ns_gc_dat = nilfs_mdt_new( + nilfs, NULL, NILFS_DAT_INO, NILFS_DAT_GFP); + if (unlikely(!nilfs->ns_gc_dat)) + goto failed_dat; + + nilfs->ns_cpfile = nilfs_mdt_new( + nilfs, NULL, NILFS_CPFILE_INO, NILFS_CPFILE_GFP); + if (unlikely(!nilfs->ns_cpfile)) + goto failed_gc_dat; + + nilfs->ns_sufile = nilfs_mdt_new( + nilfs, NULL, NILFS_SUFILE_INO, NILFS_SUFILE_GFP); + if (unlikely(!nilfs->ns_sufile)) + goto failed_cpfile; + + err = nilfs_palloc_init_blockgroup(nilfs->ns_dat, dat_entry_size); + if (unlikely(err)) + goto failed_sufile; + + err = nilfs_palloc_init_blockgroup(nilfs->ns_gc_dat, dat_entry_size); + if (unlikely(err)) + goto failed_sufile; + + nilfs_mdt_set_shadow(nilfs->ns_dat, nilfs->ns_gc_dat); + nilfs_mdt_set_entry_size(nilfs->ns_cpfile, checkpoint_size, + sizeof(struct nilfs_cpfile_header)); + nilfs_mdt_set_entry_size(nilfs->ns_sufile, segment_usage_size, + sizeof(struct nilfs_sufile_header)); + + err = nilfs_mdt_read_inode_direct( + nilfs->ns_dat, bh_sr, NILFS_SR_DAT_OFFSET(inode_size)); + if (unlikely(err)) + goto failed_sufile; + + err = nilfs_mdt_read_inode_direct( + nilfs->ns_cpfile, bh_sr, NILFS_SR_CPFILE_OFFSET(inode_size)); + if (unlikely(err)) + goto failed_sufile; + + err = nilfs_mdt_read_inode_direct( + nilfs->ns_sufile, bh_sr, NILFS_SR_SUFILE_OFFSET(inode_size)); + if (unlikely(err)) + goto failed_sufile; + + raw_sr = (struct nilfs_super_root *)bh_sr->b_data; + nilfs->ns_nongc_ctime = le64_to_cpu(raw_sr->sr_nongc_ctime); + + failed: + brelse(bh_sr); + return err; + + failed_sufile: + nilfs_mdt_destroy(nilfs->ns_sufile); + + failed_cpfile: + nilfs_mdt_destroy(nilfs->ns_cpfile); + + failed_gc_dat: + nilfs_mdt_destroy(nilfs->ns_gc_dat); + + failed_dat: + nilfs_mdt_destroy(nilfs->ns_dat); + goto failed; +} + +static void nilfs_init_recovery_info(struct nilfs_recovery_info *ri) +{ + memset(ri, 0, sizeof(*ri)); + INIT_LIST_HEAD(&ri->ri_used_segments); +} + +static void nilfs_clear_recovery_info(struct nilfs_recovery_info *ri) +{ + nilfs_dispose_segment_list(&ri->ri_used_segments); +} + +/** + * load_nilfs - load and recover the nilfs + * @nilfs: the_nilfs structure to be released + * @sbi: nilfs_sb_info used to recover past segment + * + * load_nilfs() searches and load the latest super root, + * attaches the last segment, and does recovery if needed. + * The caller must call this exclusively for simultaneous mounts. + */ +int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi) +{ + struct nilfs_recovery_info ri; + unsigned int s_flags = sbi->s_super->s_flags; + int really_read_only = bdev_read_only(nilfs->ns_bdev); + unsigned valid_fs; + int err = 0; + + nilfs_init_recovery_info(&ri); + + down_write(&nilfs->ns_sem); + valid_fs = (nilfs->ns_mount_state & NILFS_VALID_FS); + up_write(&nilfs->ns_sem); + + if (!valid_fs && (s_flags & MS_RDONLY)) { + printk(KERN_INFO "NILFS: INFO: recovery " + "required for readonly filesystem.\n"); + if (really_read_only) { + printk(KERN_ERR "NILFS: write access " + "unavailable, cannot proceed.\n"); + err = -EROFS; + goto failed; + } + printk(KERN_INFO "NILFS: write access will " + "be enabled during recovery.\n"); + sbi->s_super->s_flags &= ~MS_RDONLY; + } + + err = nilfs_search_super_root(nilfs, sbi, &ri); + if (unlikely(err)) { + printk(KERN_ERR "NILFS: error searching super root.\n"); + goto failed; + } + + err = nilfs_load_super_root(nilfs, sbi, ri.ri_super_root); + if (unlikely(err)) { + printk(KERN_ERR "NILFS: error loading super root.\n"); + goto failed; + } + + if (!valid_fs) { + err = nilfs_recover_logical_segments(nilfs, sbi, &ri); + if (unlikely(err)) { + nilfs_mdt_destroy(nilfs->ns_cpfile); + nilfs_mdt_destroy(nilfs->ns_sufile); + nilfs_mdt_destroy(nilfs->ns_dat); + goto failed; + } + if (ri.ri_need_recovery == NILFS_RECOVERY_SR_UPDATED) + sbi->s_super->s_dirt = 1; + } + + set_nilfs_loaded(nilfs); + + failed: + nilfs_clear_recovery_info(&ri); + sbi->s_super->s_flags = s_flags; + return err; +} + +static unsigned long long nilfs_max_size(unsigned int blkbits) +{ + unsigned int max_bits; + unsigned long long res = MAX_LFS_FILESIZE; /* page cache limit */ + + max_bits = blkbits + NILFS_BMAP_KEY_BIT; /* bmap size limit */ + if (max_bits < 64) + res = min_t(unsigned long long, res, (1ULL << max_bits) - 1); + return res; +} + +static int nilfs_store_disk_layout(struct the_nilfs *nilfs, + struct nilfs_super_block *sbp) +{ + if (le32_to_cpu(sbp->s_rev_level) != NILFS_CURRENT_REV) { + printk(KERN_ERR "NILFS: revision mismatch " + "(superblock rev.=%d.%d, current rev.=%d.%d). " + "Please check the version of mkfs.nilfs.\n", + le32_to_cpu(sbp->s_rev_level), + le16_to_cpu(sbp->s_minor_rev_level), + NILFS_CURRENT_REV, NILFS_MINOR_REV); + return -EINVAL; + } + nilfs->ns_sbsize = le16_to_cpu(sbp->s_bytes); + if (nilfs->ns_sbsize > BLOCK_SIZE) + return -EINVAL; + + nilfs->ns_inode_size = le16_to_cpu(sbp->s_inode_size); + nilfs->ns_first_ino = le32_to_cpu(sbp->s_first_ino); + + nilfs->ns_blocks_per_segment = le32_to_cpu(sbp->s_blocks_per_segment); + if (nilfs->ns_blocks_per_segment < NILFS_SEG_MIN_BLOCKS) { + printk(KERN_ERR "NILFS: too short segment. \n"); + return -EINVAL; + } + + nilfs->ns_first_data_block = le64_to_cpu(sbp->s_first_data_block); + nilfs->ns_nsegments = le64_to_cpu(sbp->s_nsegments); + nilfs->ns_r_segments_percentage = + le32_to_cpu(sbp->s_r_segments_percentage); + nilfs->ns_nrsvsegs = + max_t(unsigned long, NILFS_MIN_NRSVSEGS, + DIV_ROUND_UP(nilfs->ns_nsegments * + nilfs->ns_r_segments_percentage, 100)); + nilfs->ns_crc_seed = le32_to_cpu(sbp->s_crc_seed); + return 0; +} + +static int nilfs_valid_sb(struct nilfs_super_block *sbp) +{ + static unsigned char sum[4]; + const int sumoff = offsetof(struct nilfs_super_block, s_sum); + size_t bytes; + u32 crc; + + if (!sbp || le16_to_cpu(sbp->s_magic) != NILFS_SUPER_MAGIC) + return 0; + bytes = le16_to_cpu(sbp->s_bytes); + if (bytes > BLOCK_SIZE) + return 0; + crc = crc32_le(le32_to_cpu(sbp->s_crc_seed), (unsigned char *)sbp, + sumoff); + crc = crc32_le(crc, sum, 4); + crc = crc32_le(crc, (unsigned char *)sbp + sumoff + 4, + bytes - sumoff - 4); + return crc == le32_to_cpu(sbp->s_sum); +} + +static int nilfs_sb2_bad_offset(struct nilfs_super_block *sbp, u64 offset) +{ + return offset < ((le64_to_cpu(sbp->s_nsegments) * + le32_to_cpu(sbp->s_blocks_per_segment)) << + (le32_to_cpu(sbp->s_log_block_size) + 10)); +} + +static void nilfs_release_super_block(struct the_nilfs *nilfs) +{ + int i; + + for (i = 0; i < 2; i++) { + if (nilfs->ns_sbp[i]) { + brelse(nilfs->ns_sbh[i]); + nilfs->ns_sbh[i] = NULL; + nilfs->ns_sbp[i] = NULL; + } + } +} + +void nilfs_fall_back_super_block(struct the_nilfs *nilfs) +{ + brelse(nilfs->ns_sbh[0]); + nilfs->ns_sbh[0] = nilfs->ns_sbh[1]; + nilfs->ns_sbp[0] = nilfs->ns_sbp[1]; + nilfs->ns_sbh[1] = NULL; + nilfs->ns_sbp[1] = NULL; +} + +void nilfs_swap_super_block(struct the_nilfs *nilfs) +{ + struct buffer_head *tsbh = nilfs->ns_sbh[0]; + struct nilfs_super_block *tsbp = nilfs->ns_sbp[0]; + + nilfs->ns_sbh[0] = nilfs->ns_sbh[1]; + nilfs->ns_sbp[0] = nilfs->ns_sbp[1]; + nilfs->ns_sbh[1] = tsbh; + nilfs->ns_sbp[1] = tsbp; +} + +static int nilfs_load_super_block(struct the_nilfs *nilfs, + struct super_block *sb, int blocksize, + struct nilfs_super_block **sbpp) +{ + struct nilfs_super_block **sbp = nilfs->ns_sbp; + struct buffer_head **sbh = nilfs->ns_sbh; + u64 sb2off = NILFS_SB2_OFFSET_BYTES(nilfs->ns_bdev->bd_inode->i_size); + int valid[2], swp = 0; + + sbp[0] = nilfs_read_super_block(sb, NILFS_SB_OFFSET_BYTES, blocksize, + &sbh[0]); + sbp[1] = nilfs_read_super_block(sb, sb2off, blocksize, &sbh[1]); + + if (!sbp[0]) { + if (!sbp[1]) { + printk(KERN_ERR "NILFS: unable to read superblock\n"); + return -EIO; + } + printk(KERN_WARNING + "NILFS warning: unable to read primary superblock\n"); + } else if (!sbp[1]) + printk(KERN_WARNING + "NILFS warning: unable to read secondary superblock\n"); + + valid[0] = nilfs_valid_sb(sbp[0]); + valid[1] = nilfs_valid_sb(sbp[1]); + swp = valid[1] && + (!valid[0] || + le64_to_cpu(sbp[1]->s_wtime) > le64_to_cpu(sbp[0]->s_wtime)); + + if (valid[swp] && nilfs_sb2_bad_offset(sbp[swp], sb2off)) { + brelse(sbh[1]); + sbh[1] = NULL; + sbp[1] = NULL; + swp = 0; + } + if (!valid[swp]) { + nilfs_release_super_block(nilfs); + printk(KERN_ERR "NILFS: Can't find nilfs on dev %s.\n", + sb->s_id); + return -EINVAL; + } + + if (swp) { + printk(KERN_WARNING "NILFS warning: broken superblock. " + "using spare superblock.\n"); + nilfs_swap_super_block(nilfs); + } + + nilfs->ns_sbwtime[0] = le64_to_cpu(sbp[0]->s_wtime); + nilfs->ns_sbwtime[1] = valid[!swp] ? le64_to_cpu(sbp[1]->s_wtime) : 0; + nilfs->ns_prot_seq = le64_to_cpu(sbp[valid[1] & !swp]->s_last_seq); + *sbpp = sbp[0]; + return 0; +} + +/** + * init_nilfs - initialize a NILFS instance. + * @nilfs: the_nilfs structure + * @sbi: nilfs_sb_info + * @sb: super block + * @data: mount options + * + * init_nilfs() performs common initialization per block device (e.g. + * reading the super block, getting disk layout information, initializing + * shared fields in the_nilfs). It takes on some portion of the jobs + * typically done by a fill_super() routine. This division arises from + * the nature that multiple NILFS instances may be simultaneously + * mounted on a device. + * For multiple mounts on the same device, only the first mount + * invokes these tasks. + * + * Return Value: On success, 0 is returned. On error, a negative error + * code is returned. + */ +int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data) +{ + struct super_block *sb = sbi->s_super; + struct nilfs_super_block *sbp; + struct backing_dev_info *bdi; + int blocksize; + int err; + + down_write(&nilfs->ns_sem); + if (nilfs_init(nilfs)) { + /* Load values from existing the_nilfs */ + sbp = nilfs->ns_sbp[0]; + err = nilfs_store_magic_and_option(sb, sbp, data); + if (err) + goto out; + + blocksize = BLOCK_SIZE << le32_to_cpu(sbp->s_log_block_size); + if (sb->s_blocksize != blocksize && + !sb_set_blocksize(sb, blocksize)) { + printk(KERN_ERR "NILFS: blocksize %d unfit to device\n", + blocksize); + err = -EINVAL; + } + sb->s_maxbytes = nilfs_max_size(sb->s_blocksize_bits); + goto out; + } + + blocksize = sb_min_blocksize(sb, BLOCK_SIZE); + if (!blocksize) { + printk(KERN_ERR "NILFS: unable to set blocksize\n"); + err = -EINVAL; + goto out; + } + err = nilfs_load_super_block(nilfs, sb, blocksize, &sbp); + if (err) + goto out; + + err = nilfs_store_magic_and_option(sb, sbp, data); + if (err) + goto failed_sbh; + + blocksize = BLOCK_SIZE << le32_to_cpu(sbp->s_log_block_size); + if (sb->s_blocksize != blocksize) { + int hw_blocksize = bdev_hardsect_size(sb->s_bdev); + + if (blocksize < hw_blocksize) { + printk(KERN_ERR + "NILFS: blocksize %d too small for device " + "(sector-size = %d).\n", + blocksize, hw_blocksize); + err = -EINVAL; + goto failed_sbh; + } + nilfs_release_super_block(nilfs); + sb_set_blocksize(sb, blocksize); + + err = nilfs_load_super_block(nilfs, sb, blocksize, &sbp); + if (err) + goto out; + /* not failed_sbh; sbh is released automatically + when reloading fails. */ + } + nilfs->ns_blocksize_bits = sb->s_blocksize_bits; + + err = nilfs_store_disk_layout(nilfs, sbp); + if (err) + goto failed_sbh; + + sb->s_maxbytes = nilfs_max_size(sb->s_blocksize_bits); + + nilfs->ns_mount_state = le16_to_cpu(sbp->s_state); + + bdi = nilfs->ns_bdev->bd_inode_backing_dev_info; + if (!bdi) + bdi = nilfs->ns_bdev->bd_inode->i_mapping->backing_dev_info; + nilfs->ns_bdi = bdi ? : &default_backing_dev_info; + + /* Finding last segment */ + nilfs->ns_last_pseg = le64_to_cpu(sbp->s_last_pseg); + nilfs->ns_last_cno = le64_to_cpu(sbp->s_last_cno); + nilfs->ns_last_seq = le64_to_cpu(sbp->s_last_seq); + + nilfs->ns_seg_seq = nilfs->ns_last_seq; + nilfs->ns_segnum = + nilfs_get_segnum_of_block(nilfs, nilfs->ns_last_pseg); + nilfs->ns_cno = nilfs->ns_last_cno + 1; + if (nilfs->ns_segnum >= nilfs->ns_nsegments) { + printk(KERN_ERR "NILFS invalid last segment number.\n"); + err = -EINVAL; + goto failed_sbh; + } + /* Dummy values */ + nilfs->ns_free_segments_count = + nilfs->ns_nsegments - (nilfs->ns_segnum + 1); + + /* Initialize gcinode cache */ + err = nilfs_init_gccache(nilfs); + if (err) + goto failed_sbh; + + set_nilfs_init(nilfs); + err = 0; + out: + up_write(&nilfs->ns_sem); + return err; + + failed_sbh: + nilfs_release_super_block(nilfs); + goto out; +} + +int nilfs_count_free_blocks(struct the_nilfs *nilfs, sector_t *nblocks) +{ + struct inode *dat = nilfs_dat_inode(nilfs); + unsigned long ncleansegs; + int err; + + down_read(&NILFS_MDT(dat)->mi_sem); /* XXX */ + err = nilfs_sufile_get_ncleansegs(nilfs->ns_sufile, &ncleansegs); + up_read(&NILFS_MDT(dat)->mi_sem); /* XXX */ + if (likely(!err)) + *nblocks = (sector_t)ncleansegs * nilfs->ns_blocks_per_segment; + return err; +} + +int nilfs_near_disk_full(struct the_nilfs *nilfs) +{ + struct inode *sufile = nilfs->ns_sufile; + unsigned long ncleansegs, nincsegs; + int ret; + + ret = nilfs_sufile_get_ncleansegs(sufile, &ncleansegs); + if (likely(!ret)) { + nincsegs = atomic_read(&nilfs->ns_ndirtyblks) / + nilfs->ns_blocks_per_segment + 1; + if (ncleansegs <= nilfs->ns_nrsvsegs + nincsegs) + ret++; + } + return ret; +} + +int nilfs_checkpoint_is_mounted(struct the_nilfs *nilfs, __u64 cno, + int snapshot_mount) +{ + struct nilfs_sb_info *sbi; + int ret = 0; + + down_read(&nilfs->ns_sem); + if (cno == 0 || cno > nilfs->ns_cno) + goto out_unlock; + + list_for_each_entry(sbi, &nilfs->ns_supers, s_list) { + if (sbi->s_snapshot_cno == cno && + (!snapshot_mount || nilfs_test_opt(sbi, SNAPSHOT))) { + /* exclude read-only mounts */ + ret++; + break; + } + } + /* for protecting recent checkpoints */ + if (cno >= nilfs_last_cno(nilfs)) + ret++; + + out_unlock: + up_read(&nilfs->ns_sem); + return ret; +} diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h new file mode 100644 index 000000000000..30fe58778d05 --- /dev/null +++ b/fs/nilfs2/the_nilfs.h @@ -0,0 +1,298 @@ +/* + * the_nilfs.h - the_nilfs shared structure. + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi <ryusuke@osrg.net> + * + */ + +#ifndef _THE_NILFS_H +#define _THE_NILFS_H + +#include <linux/types.h> +#include <linux/buffer_head.h> +#include <linux/fs.h> +#include <linux/blkdev.h> +#include <linux/backing-dev.h> +#include "sb.h" + +/* the_nilfs struct */ +enum { + THE_NILFS_INIT = 0, /* Information from super_block is set */ + THE_NILFS_LOADED, /* Roll-back/roll-forward has done and + the latest checkpoint was loaded */ + THE_NILFS_DISCONTINUED, /* 'next' pointer chain has broken */ +}; + +/** + * struct the_nilfs - struct to supervise multiple nilfs mount points + * @ns_flags: flags + * @ns_count: reference count + * @ns_bdev: block device + * @ns_bdi: backing dev info + * @ns_writer: back pointer to writable nilfs_sb_info + * @ns_sem: semaphore for shared states + * @ns_writer_mutex: mutex protecting ns_writer attach/detach + * @ns_writer_refcount: number of referrers on ns_writer + * @ns_sbh: buffer heads of on-disk super blocks + * @ns_sbp: pointers to super block data + * @ns_sbwtime: previous write time of super blocks + * @ns_sbsize: size of valid data in super block + * @ns_supers: list of nilfs super block structs + * @ns_seg_seq: segment sequence counter + * @ns_segnum: index number of the latest full segment. + * @ns_nextnum: index number of the full segment index to be used next + * @ns_pseg_offset: offset of next partial segment in the current full segment + * @ns_cno: next checkpoint number + * @ns_ctime: write time of the last segment + * @ns_nongc_ctime: write time of the last segment not for cleaner operation + * @ns_ndirtyblks: Number of dirty data blocks + * @ns_last_segment_lock: lock protecting fields for the latest segment + * @ns_last_pseg: start block number of the latest segment + * @ns_last_seq: sequence value of the latest segment + * @ns_last_cno: checkpoint number of the latest segment + * @ns_prot_seq: least sequence number of segments which must not be reclaimed + * @ns_free_segments_count: counter of free segments + * @ns_segctor_sem: segment constructor semaphore + * @ns_dat: DAT file inode + * @ns_cpfile: checkpoint file inode + * @ns_sufile: segusage file inode + * @ns_gc_dat: shadow inode of the DAT file inode for GC + * @ns_gc_inodes: dummy inodes to keep live blocks + * @ns_gc_inodes_h: hash list to keep dummy inode holding live blocks + * @ns_blocksize_bits: bit length of block size + * @ns_nsegments: number of segments in filesystem + * @ns_blocks_per_segment: number of blocks per segment + * @ns_r_segments_percentage: reserved segments percentage + * @ns_nrsvsegs: number of reserved segments + * @ns_first_data_block: block number of first data block + * @ns_inode_size: size of on-disk inode + * @ns_first_ino: first not-special inode number + * @ns_crc_seed: seed value of CRC32 calculation + */ +struct the_nilfs { + unsigned long ns_flags; + atomic_t ns_count; + + struct block_device *ns_bdev; + struct backing_dev_info *ns_bdi; + struct nilfs_sb_info *ns_writer; + struct rw_semaphore ns_sem; + struct mutex ns_writer_mutex; + atomic_t ns_writer_refcount; + + /* + * used for + * - loading the latest checkpoint exclusively. + * - allocating a new full segment. + * - protecting s_dirt in the super_block struct + * (see nilfs_write_super) and the following fields. + */ + struct buffer_head *ns_sbh[2]; + struct nilfs_super_block *ns_sbp[2]; + time_t ns_sbwtime[2]; + unsigned ns_sbsize; + unsigned ns_mount_state; + struct list_head ns_supers; + + /* + * Following fields are dedicated to a writable FS-instance. + * Except for the period seeking checkpoint, code outside the segment + * constructor must lock a segment semaphore while accessing these + * fields. + * The writable FS-instance is sole during a lifetime of the_nilfs. + */ + u64 ns_seg_seq; + __u64 ns_segnum; + __u64 ns_nextnum; + unsigned long ns_pseg_offset; + __u64 ns_cno; + time_t ns_ctime; + time_t ns_nongc_ctime; + atomic_t ns_ndirtyblks; + + /* + * The following fields hold information on the latest partial segment + * written to disk with a super root. These fields are protected by + * ns_last_segment_lock. + */ + spinlock_t ns_last_segment_lock; + sector_t ns_last_pseg; + u64 ns_last_seq; + __u64 ns_last_cno; + u64 ns_prot_seq; + unsigned long ns_free_segments_count; + + struct rw_semaphore ns_segctor_sem; + + /* + * Following fields are lock free except for the period before + * the_nilfs is initialized. + */ + struct inode *ns_dat; + struct inode *ns_cpfile; + struct inode *ns_sufile; + struct inode *ns_gc_dat; + + /* GC inode list and hash table head */ + struct list_head ns_gc_inodes; + struct hlist_head *ns_gc_inodes_h; + + /* Disk layout information (static) */ + unsigned int ns_blocksize_bits; + unsigned long ns_nsegments; + unsigned long ns_blocks_per_segment; + unsigned long ns_r_segments_percentage; + unsigned long ns_nrsvsegs; + unsigned long ns_first_data_block; + int ns_inode_size; + int ns_first_ino; + u32 ns_crc_seed; +}; + +#define NILFS_GCINODE_HASH_BITS 8 +#define NILFS_GCINODE_HASH_SIZE (1<<NILFS_GCINODE_HASH_BITS) + +#define THE_NILFS_FNS(bit, name) \ +static inline void set_nilfs_##name(struct the_nilfs *nilfs) \ +{ \ + set_bit(THE_NILFS_##bit, &(nilfs)->ns_flags); \ +} \ +static inline void clear_nilfs_##name(struct the_nilfs *nilfs) \ +{ \ + clear_bit(THE_NILFS_##bit, &(nilfs)->ns_flags); \ +} \ +static inline int nilfs_##name(struct the_nilfs *nilfs) \ +{ \ + return test_bit(THE_NILFS_##bit, &(nilfs)->ns_flags); \ +} + +THE_NILFS_FNS(INIT, init) +THE_NILFS_FNS(LOADED, loaded) +THE_NILFS_FNS(DISCONTINUED, discontinued) + +/* Minimum interval of periodical update of superblocks (in seconds) */ +#define NILFS_SB_FREQ 10 +#define NILFS_ALTSB_FREQ 60 /* spare superblock */ + +void nilfs_set_last_segment(struct the_nilfs *, sector_t, u64, __u64); +struct the_nilfs *alloc_nilfs(struct block_device *); +void put_nilfs(struct the_nilfs *); +int init_nilfs(struct the_nilfs *, struct nilfs_sb_info *, char *); +int load_nilfs(struct the_nilfs *, struct nilfs_sb_info *); +int nilfs_count_free_blocks(struct the_nilfs *, sector_t *); +int nilfs_checkpoint_is_mounted(struct the_nilfs *, __u64, int); +int nilfs_near_disk_full(struct the_nilfs *); +void nilfs_fall_back_super_block(struct the_nilfs *); +void nilfs_swap_super_block(struct the_nilfs *); + + +static inline void get_nilfs(struct the_nilfs *nilfs) +{ + /* Caller must have at least one reference of the_nilfs. */ + atomic_inc(&nilfs->ns_count); +} + +static inline struct nilfs_sb_info *nilfs_get_writer(struct the_nilfs *nilfs) +{ + if (atomic_inc_and_test(&nilfs->ns_writer_refcount)) + mutex_lock(&nilfs->ns_writer_mutex); + return nilfs->ns_writer; +} + +static inline void nilfs_put_writer(struct the_nilfs *nilfs) +{ + if (atomic_add_negative(-1, &nilfs->ns_writer_refcount)) + mutex_unlock(&nilfs->ns_writer_mutex); +} + +static inline void +nilfs_attach_writer(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi) +{ + mutex_lock(&nilfs->ns_writer_mutex); + nilfs->ns_writer = sbi; + mutex_unlock(&nilfs->ns_writer_mutex); +} + +static inline void +nilfs_detach_writer(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi) +{ + mutex_lock(&nilfs->ns_writer_mutex); + if (sbi == nilfs->ns_writer) + nilfs->ns_writer = NULL; + mutex_unlock(&nilfs->ns_writer_mutex); +} + +static inline void +nilfs_get_segment_range(struct the_nilfs *nilfs, __u64 segnum, + sector_t *seg_start, sector_t *seg_end) +{ + *seg_start = (sector_t)nilfs->ns_blocks_per_segment * segnum; + *seg_end = *seg_start + nilfs->ns_blocks_per_segment - 1; + if (segnum == 0) + *seg_start = nilfs->ns_first_data_block; +} + +static inline sector_t +nilfs_get_segment_start_blocknr(struct the_nilfs *nilfs, __u64 segnum) +{ + return (segnum == 0) ? nilfs->ns_first_data_block : + (sector_t)nilfs->ns_blocks_per_segment * segnum; +} + +static inline __u64 +nilfs_get_segnum_of_block(struct the_nilfs *nilfs, sector_t blocknr) +{ + sector_t segnum = blocknr; + + sector_div(segnum, nilfs->ns_blocks_per_segment); + return segnum; +} + +static inline void +nilfs_terminate_segment(struct the_nilfs *nilfs, sector_t seg_start, + sector_t seg_end) +{ + /* terminate the current full segment (used in case of I/O-error) */ + nilfs->ns_pseg_offset = seg_end - seg_start + 1; +} + +static inline void nilfs_shift_to_next_segment(struct the_nilfs *nilfs) +{ + /* move forward with a full segment */ + nilfs->ns_segnum = nilfs->ns_nextnum; + nilfs->ns_pseg_offset = 0; + nilfs->ns_seg_seq++; +} + +static inline __u64 nilfs_last_cno(struct the_nilfs *nilfs) +{ + __u64 cno; + + spin_lock(&nilfs->ns_last_segment_lock); + cno = nilfs->ns_last_cno; + spin_unlock(&nilfs->ns_last_segment_lock); + return cno; +} + +static inline int nilfs_segment_is_active(struct the_nilfs *nilfs, __u64 n) +{ + return n == nilfs->ns_segnum || n == nilfs->ns_nextnum; +} + +#endif /* _THE_NILFS_H */ diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index 12dfb44c22e5..fbeaec762103 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -296,7 +296,7 @@ int ocfs2_init_acl(handle_t *handle, return PTR_ERR(acl); } if (!acl) - inode->i_mode &= ~current->fs->umask; + inode->i_mode &= ~current_umask(); } if ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) && acl) { struct posix_acl *clone; diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 19e3a96aa02c..678a067d9251 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -294,6 +294,55 @@ static struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = { .eo_fill_max_leaf_clusters = ocfs2_xattr_tree_fill_max_leaf_clusters, }; +static void ocfs2_dx_root_set_last_eb_blk(struct ocfs2_extent_tree *et, + u64 blkno) +{ + struct ocfs2_dx_root_block *dx_root = et->et_object; + + dx_root->dr_last_eb_blk = cpu_to_le64(blkno); +} + +static u64 ocfs2_dx_root_get_last_eb_blk(struct ocfs2_extent_tree *et) +{ + struct ocfs2_dx_root_block *dx_root = et->et_object; + + return le64_to_cpu(dx_root->dr_last_eb_blk); +} + +static void ocfs2_dx_root_update_clusters(struct inode *inode, + struct ocfs2_extent_tree *et, + u32 clusters) +{ + struct ocfs2_dx_root_block *dx_root = et->et_object; + + le32_add_cpu(&dx_root->dr_clusters, clusters); +} + +static int ocfs2_dx_root_sanity_check(struct inode *inode, + struct ocfs2_extent_tree *et) +{ + struct ocfs2_dx_root_block *dx_root = et->et_object; + + BUG_ON(!OCFS2_IS_VALID_DX_ROOT(dx_root)); + + return 0; +} + +static void ocfs2_dx_root_fill_root_el(struct ocfs2_extent_tree *et) +{ + struct ocfs2_dx_root_block *dx_root = et->et_object; + + et->et_root_el = &dx_root->dr_list; +} + +static struct ocfs2_extent_tree_operations ocfs2_dx_root_et_ops = { + .eo_set_last_eb_blk = ocfs2_dx_root_set_last_eb_blk, + .eo_get_last_eb_blk = ocfs2_dx_root_get_last_eb_blk, + .eo_update_clusters = ocfs2_dx_root_update_clusters, + .eo_sanity_check = ocfs2_dx_root_sanity_check, + .eo_fill_root_el = ocfs2_dx_root_fill_root_el, +}; + static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et, struct inode *inode, struct buffer_head *bh, @@ -339,6 +388,14 @@ void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et, &ocfs2_xattr_value_et_ops); } +void ocfs2_init_dx_root_extent_tree(struct ocfs2_extent_tree *et, + struct inode *inode, + struct buffer_head *bh) +{ + __ocfs2_init_extent_tree(et, inode, bh, ocfs2_journal_access_dr, + NULL, &ocfs2_dx_root_et_ops); +} + static inline void ocfs2_et_set_last_eb_blk(struct ocfs2_extent_tree *et, u64 new_last_eb_blk) { diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h index cceff5c37f47..353254ba29e1 100644 --- a/fs/ocfs2/alloc.h +++ b/fs/ocfs2/alloc.h @@ -75,6 +75,9 @@ struct ocfs2_xattr_value_buf; void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et, struct inode *inode, struct ocfs2_xattr_value_buf *vb); +void ocfs2_init_dx_root_extent_tree(struct ocfs2_extent_tree *et, + struct inode *inode, + struct buffer_head *bh); /* * Read an extent block into *bh. If *bh is NULL, a bh will be diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 8e1709a679b7..b2c52b3a1484 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -1956,15 +1956,16 @@ static int ocfs2_write_end(struct file *file, struct address_space *mapping, } const struct address_space_operations ocfs2_aops = { - .readpage = ocfs2_readpage, - .readpages = ocfs2_readpages, - .writepage = ocfs2_writepage, - .write_begin = ocfs2_write_begin, - .write_end = ocfs2_write_end, - .bmap = ocfs2_bmap, - .sync_page = block_sync_page, - .direct_IO = ocfs2_direct_IO, - .invalidatepage = ocfs2_invalidatepage, - .releasepage = ocfs2_releasepage, - .migratepage = buffer_migrate_page, + .readpage = ocfs2_readpage, + .readpages = ocfs2_readpages, + .writepage = ocfs2_writepage, + .write_begin = ocfs2_write_begin, + .write_end = ocfs2_write_end, + .bmap = ocfs2_bmap, + .sync_page = block_sync_page, + .direct_IO = ocfs2_direct_IO, + .invalidatepage = ocfs2_invalidatepage, + .releasepage = ocfs2_releasepage, + .migratepage = buffer_migrate_page, + .is_partially_uptodate = block_is_partially_uptodate, }; diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 04697ba7f73e..4f85eceab376 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -33,6 +33,7 @@ #include <linux/random.h> #include <linux/crc32.h> #include <linux/time.h> +#include <linux/debugfs.h> #include "heartbeat.h" #include "tcp.h" @@ -60,6 +61,11 @@ static unsigned long o2hb_live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; static LIST_HEAD(o2hb_node_events); static DECLARE_WAIT_QUEUE_HEAD(o2hb_steady_queue); +#define O2HB_DEBUG_DIR "o2hb" +#define O2HB_DEBUG_LIVENODES "livenodes" +static struct dentry *o2hb_debug_dir; +static struct dentry *o2hb_debug_livenodes; + static LIST_HEAD(o2hb_all_regions); static struct o2hb_callback { @@ -905,7 +911,77 @@ static int o2hb_thread(void *data) return 0; } -void o2hb_init(void) +#ifdef CONFIG_DEBUG_FS +static int o2hb_debug_open(struct inode *inode, struct file *file) +{ + unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)]; + char *buf = NULL; + int i = -1; + int out = 0; + + buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!buf) + goto bail; + + o2hb_fill_node_map(map, sizeof(map)); + + while ((i = find_next_bit(map, O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) + out += snprintf(buf + out, PAGE_SIZE - out, "%d ", i); + out += snprintf(buf + out, PAGE_SIZE - out, "\n"); + + i_size_write(inode, out); + + file->private_data = buf; + + return 0; +bail: + return -ENOMEM; +} + +static int o2hb_debug_release(struct inode *inode, struct file *file) +{ + kfree(file->private_data); + return 0; +} + +static ssize_t o2hb_debug_read(struct file *file, char __user *buf, + size_t nbytes, loff_t *ppos) +{ + return simple_read_from_buffer(buf, nbytes, ppos, file->private_data, + i_size_read(file->f_mapping->host)); +} +#else +static int o2hb_debug_open(struct inode *inode, struct file *file) +{ + return 0; +} +static int o2hb_debug_release(struct inode *inode, struct file *file) +{ + return 0; +} +static ssize_t o2hb_debug_read(struct file *file, char __user *buf, + size_t nbytes, loff_t *ppos) +{ + return 0; +} +#endif /* CONFIG_DEBUG_FS */ + +static struct file_operations o2hb_debug_fops = { + .open = o2hb_debug_open, + .release = o2hb_debug_release, + .read = o2hb_debug_read, + .llseek = generic_file_llseek, +}; + +void o2hb_exit(void) +{ + if (o2hb_debug_livenodes) + debugfs_remove(o2hb_debug_livenodes); + if (o2hb_debug_dir) + debugfs_remove(o2hb_debug_dir); +} + +int o2hb_init(void) { int i; @@ -918,6 +994,24 @@ void o2hb_init(void) INIT_LIST_HEAD(&o2hb_node_events); memset(o2hb_live_node_bitmap, 0, sizeof(o2hb_live_node_bitmap)); + + o2hb_debug_dir = debugfs_create_dir(O2HB_DEBUG_DIR, NULL); + if (!o2hb_debug_dir) { + mlog_errno(-ENOMEM); + return -ENOMEM; + } + + o2hb_debug_livenodes = debugfs_create_file(O2HB_DEBUG_LIVENODES, + S_IFREG|S_IRUSR, + o2hb_debug_dir, NULL, + &o2hb_debug_fops); + if (!o2hb_debug_livenodes) { + mlog_errno(-ENOMEM); + debugfs_remove(o2hb_debug_dir); + return -ENOMEM; + } + + return 0; } /* if we're already in a callback then we're already serialized by the sem */ diff --git a/fs/ocfs2/cluster/heartbeat.h b/fs/ocfs2/cluster/heartbeat.h index e511339886b3..2f1649253b49 100644 --- a/fs/ocfs2/cluster/heartbeat.h +++ b/fs/ocfs2/cluster/heartbeat.h @@ -75,7 +75,8 @@ void o2hb_unregister_callback(const char *region_uuid, struct o2hb_callback_func *hc); void o2hb_fill_node_map(unsigned long *map, unsigned bytes); -void o2hb_init(void); +void o2hb_exit(void); +int o2hb_init(void); int o2hb_check_node_heartbeating(u8 node_num); int o2hb_check_node_heartbeating_from_callback(u8 node_num); int o2hb_check_local_node_heartbeating(void); diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c index 70e8fa9e2539..7ee6188bc79a 100644 --- a/fs/ocfs2/cluster/nodemanager.c +++ b/fs/ocfs2/cluster/nodemanager.c @@ -881,6 +881,7 @@ static void __exit exit_o2nm(void) o2cb_sys_shutdown(); o2net_exit(); + o2hb_exit(); } static int __init init_o2nm(void) @@ -889,11 +890,13 @@ static int __init init_o2nm(void) cluster_print_version(); - o2hb_init(); + ret = o2hb_init(); + if (ret) + goto out; ret = o2net_init(); if (ret) - goto out; + goto out_o2hb; ret = o2net_register_hb_callbacks(); if (ret) @@ -916,6 +919,8 @@ out_callbacks: o2net_unregister_hb_callbacks(); out_o2net: o2net_exit(); +out_o2hb: + o2hb_exit(); out: return ret; } diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index f2c4098cf337..e71160cda110 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -41,6 +41,7 @@ #include <linux/slab.h> #include <linux/highmem.h> #include <linux/quotaops.h> +#include <linux/sort.h> #define MLOG_MASK_PREFIX ML_NAMEI #include <cluster/masklog.h> @@ -58,6 +59,7 @@ #include "namei.h" #include "suballoc.h" #include "super.h" +#include "sysfile.h" #include "uptodate.h" #include "buffer_head_io.h" @@ -71,11 +73,6 @@ static unsigned char ocfs2_filetype_table[] = { DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK }; -static int ocfs2_extend_dir(struct ocfs2_super *osb, - struct inode *dir, - struct buffer_head *parent_fe_bh, - unsigned int blocks_wanted, - struct buffer_head **new_de_bh); static int ocfs2_do_extend_dir(struct super_block *sb, handle_t *handle, struct inode *dir, @@ -83,22 +80,36 @@ static int ocfs2_do_extend_dir(struct super_block *sb, struct ocfs2_alloc_context *data_ac, struct ocfs2_alloc_context *meta_ac, struct buffer_head **new_bh); +static int ocfs2_dir_indexed(struct inode *inode); /* * These are distinct checks because future versions of the file system will * want to have a trailing dirent structure independent of indexing. */ -static int ocfs2_dir_has_trailer(struct inode *dir) +static int ocfs2_supports_dir_trailer(struct inode *dir) { + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); + if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) return 0; - return ocfs2_meta_ecc(OCFS2_SB(dir->i_sb)); + return ocfs2_meta_ecc(osb) || ocfs2_dir_indexed(dir); } -static int ocfs2_supports_dir_trailer(struct ocfs2_super *osb) +/* + * "new' here refers to the point at which we're creating a new + * directory via "mkdir()", but also when we're expanding an inline + * directory. In either case, we don't yet have the indexing bit set + * on the directory, so the standard checks will fail in when metaecc + * is turned off. Only directory-initialization type functions should + * use this then. Everything else wants ocfs2_supports_dir_trailer() + */ +static int ocfs2_new_dir_wants_trailer(struct inode *dir) { - return ocfs2_meta_ecc(osb); + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); + + return ocfs2_meta_ecc(osb) || + ocfs2_supports_indexed_dirs(osb); } static inline unsigned int ocfs2_dir_trailer_blk_off(struct super_block *sb) @@ -130,7 +141,7 @@ static int ocfs2_skip_dir_trailer(struct inode *dir, { unsigned long toff = blklen - sizeof(struct ocfs2_dir_block_trailer); - if (!ocfs2_dir_has_trailer(dir)) + if (!ocfs2_supports_dir_trailer(dir)) return 0; if (offset != toff) @@ -140,7 +151,7 @@ static int ocfs2_skip_dir_trailer(struct inode *dir, } static void ocfs2_init_dir_trailer(struct inode *inode, - struct buffer_head *bh) + struct buffer_head *bh, u16 rec_len) { struct ocfs2_dir_block_trailer *trailer; @@ -150,6 +161,153 @@ static void ocfs2_init_dir_trailer(struct inode *inode, cpu_to_le16(sizeof(struct ocfs2_dir_block_trailer)); trailer->db_parent_dinode = cpu_to_le64(OCFS2_I(inode)->ip_blkno); trailer->db_blkno = cpu_to_le64(bh->b_blocknr); + trailer->db_free_rec_len = cpu_to_le16(rec_len); +} +/* + * Link an unindexed block with a dir trailer structure into the index free + * list. This function will modify dirdata_bh, but assumes you've already + * passed it to the journal. + */ +static int ocfs2_dx_dir_link_trailer(struct inode *dir, handle_t *handle, + struct buffer_head *dx_root_bh, + struct buffer_head *dirdata_bh) +{ + int ret; + struct ocfs2_dx_root_block *dx_root; + struct ocfs2_dir_block_trailer *trailer; + + ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + trailer = ocfs2_trailer_from_bh(dirdata_bh, dir->i_sb); + dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data; + + trailer->db_free_next = dx_root->dr_free_blk; + dx_root->dr_free_blk = cpu_to_le64(dirdata_bh->b_blocknr); + + ocfs2_journal_dirty(handle, dx_root_bh); + +out: + return ret; +} + +static int ocfs2_free_list_at_root(struct ocfs2_dir_lookup_result *res) +{ + return res->dl_prev_leaf_bh == NULL; +} + +void ocfs2_free_dir_lookup_result(struct ocfs2_dir_lookup_result *res) +{ + brelse(res->dl_dx_root_bh); + brelse(res->dl_leaf_bh); + brelse(res->dl_dx_leaf_bh); + brelse(res->dl_prev_leaf_bh); +} + +static int ocfs2_dir_indexed(struct inode *inode) +{ + if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INDEXED_DIR_FL) + return 1; + return 0; +} + +static inline int ocfs2_dx_root_inline(struct ocfs2_dx_root_block *dx_root) +{ + return dx_root->dr_flags & OCFS2_DX_FLAG_INLINE; +} + +/* + * Hashing code adapted from ext3 + */ +#define DELTA 0x9E3779B9 + +static void TEA_transform(__u32 buf[4], __u32 const in[]) +{ + __u32 sum = 0; + __u32 b0 = buf[0], b1 = buf[1]; + __u32 a = in[0], b = in[1], c = in[2], d = in[3]; + int n = 16; + + do { + sum += DELTA; + b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b); + b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d); + } while (--n); + + buf[0] += b0; + buf[1] += b1; +} + +static void str2hashbuf(const char *msg, int len, __u32 *buf, int num) +{ + __u32 pad, val; + int i; + + pad = (__u32)len | ((__u32)len << 8); + pad |= pad << 16; + + val = pad; + if (len > num*4) + len = num * 4; + for (i = 0; i < len; i++) { + if ((i % 4) == 0) + val = pad; + val = msg[i] + (val << 8); + if ((i % 4) == 3) { + *buf++ = val; + val = pad; + num--; + } + } + if (--num >= 0) + *buf++ = val; + while (--num >= 0) + *buf++ = pad; +} + +static void ocfs2_dx_dir_name_hash(struct inode *dir, const char *name, int len, + struct ocfs2_dx_hinfo *hinfo) +{ + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); + const char *p; + __u32 in[8], buf[4]; + + /* + * XXX: Is this really necessary, if the index is never looked + * at by readdir? Is a hash value of '0' a bad idea? + */ + if ((len == 1 && !strncmp(".", name, 1)) || + (len == 2 && !strncmp("..", name, 2))) { + buf[0] = buf[1] = 0; + goto out; + } + +#ifdef OCFS2_DEBUG_DX_DIRS + /* + * This makes it very easy to debug indexing problems. We + * should never allow this to be selected without hand editing + * this file though. + */ + buf[0] = buf[1] = len; + goto out; +#endif + + memcpy(buf, osb->osb_dx_seed, sizeof(buf)); + + p = name; + while (len > 0) { + str2hashbuf(p, len, in, 4); + TEA_transform(buf, in); + len -= 16; + p += 16; + } + +out: + hinfo->major_hash = buf[0]; + hinfo->minor_hash = buf[1]; } /* @@ -312,6 +470,52 @@ static int ocfs2_validate_dir_block(struct super_block *sb, } /* + * Validate a directory trailer. + * + * We check the trailer here rather than in ocfs2_validate_dir_block() + * because that function doesn't have the inode to test. + */ +static int ocfs2_check_dir_trailer(struct inode *dir, struct buffer_head *bh) +{ + int rc = 0; + struct ocfs2_dir_block_trailer *trailer; + + trailer = ocfs2_trailer_from_bh(bh, dir->i_sb); + if (!OCFS2_IS_VALID_DIR_TRAILER(trailer)) { + rc = -EINVAL; + ocfs2_error(dir->i_sb, + "Invalid dirblock #%llu: " + "signature = %.*s\n", + (unsigned long long)bh->b_blocknr, 7, + trailer->db_signature); + goto out; + } + if (le64_to_cpu(trailer->db_blkno) != bh->b_blocknr) { + rc = -EINVAL; + ocfs2_error(dir->i_sb, + "Directory block #%llu has an invalid " + "db_blkno of %llu", + (unsigned long long)bh->b_blocknr, + (unsigned long long)le64_to_cpu(trailer->db_blkno)); + goto out; + } + if (le64_to_cpu(trailer->db_parent_dinode) != + OCFS2_I(dir)->ip_blkno) { + rc = -EINVAL; + ocfs2_error(dir->i_sb, + "Directory block #%llu on dinode " + "#%llu has an invalid parent_dinode " + "of %llu", + (unsigned long long)bh->b_blocknr, + (unsigned long long)OCFS2_I(dir)->ip_blkno, + (unsigned long long)le64_to_cpu(trailer->db_blkno)); + goto out; + } +out: + return rc; +} + +/* * This function forces all errors to -EIO for consistency with its * predecessor, ocfs2_bread(). We haven't audited what returning the * real error codes would do to callers. We log the real codes with @@ -322,7 +526,6 @@ static int ocfs2_read_dir_block(struct inode *inode, u64 v_block, { int rc = 0; struct buffer_head *tmp = *bh; - struct ocfs2_dir_block_trailer *trailer; rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, flags, ocfs2_validate_dir_block); @@ -331,42 +534,13 @@ static int ocfs2_read_dir_block(struct inode *inode, u64 v_block, goto out; } - /* - * We check the trailer here rather than in - * ocfs2_validate_dir_block() because that function doesn't have - * the inode to test. - */ if (!(flags & OCFS2_BH_READAHEAD) && - ocfs2_dir_has_trailer(inode)) { - trailer = ocfs2_trailer_from_bh(tmp, inode->i_sb); - if (!OCFS2_IS_VALID_DIR_TRAILER(trailer)) { - rc = -EINVAL; - ocfs2_error(inode->i_sb, - "Invalid dirblock #%llu: " - "signature = %.*s\n", - (unsigned long long)tmp->b_blocknr, 7, - trailer->db_signature); - goto out; - } - if (le64_to_cpu(trailer->db_blkno) != tmp->b_blocknr) { - rc = -EINVAL; - ocfs2_error(inode->i_sb, - "Directory block #%llu has an invalid " - "db_blkno of %llu", - (unsigned long long)tmp->b_blocknr, - (unsigned long long)le64_to_cpu(trailer->db_blkno)); - goto out; - } - if (le64_to_cpu(trailer->db_parent_dinode) != - OCFS2_I(inode)->ip_blkno) { - rc = -EINVAL; - ocfs2_error(inode->i_sb, - "Directory block #%llu on dinode " - "#%llu has an invalid parent_dinode " - "of %llu", - (unsigned long long)tmp->b_blocknr, - (unsigned long long)OCFS2_I(inode)->ip_blkno, - (unsigned long long)le64_to_cpu(trailer->db_blkno)); + ocfs2_supports_dir_trailer(inode)) { + rc = ocfs2_check_dir_trailer(inode, tmp); + if (rc) { + if (!*bh) + brelse(tmp); + mlog_errno(rc); goto out; } } @@ -379,6 +553,141 @@ out: return rc ? -EIO : 0; } +/* + * Read the block at 'phys' which belongs to this directory + * inode. This function does no virtual->physical block translation - + * what's passed in is assumed to be a valid directory block. + */ +static int ocfs2_read_dir_block_direct(struct inode *dir, u64 phys, + struct buffer_head **bh) +{ + int ret; + struct buffer_head *tmp = *bh; + + ret = ocfs2_read_block(dir, phys, &tmp, ocfs2_validate_dir_block); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (ocfs2_supports_dir_trailer(dir)) { + ret = ocfs2_check_dir_trailer(dir, tmp); + if (ret) { + if (!*bh) + brelse(tmp); + mlog_errno(ret); + goto out; + } + } + + if (!ret && !*bh) + *bh = tmp; +out: + return ret; +} + +static int ocfs2_validate_dx_root(struct super_block *sb, + struct buffer_head *bh) +{ + int ret; + struct ocfs2_dx_root_block *dx_root; + + BUG_ON(!buffer_uptodate(bh)); + + dx_root = (struct ocfs2_dx_root_block *) bh->b_data; + + ret = ocfs2_validate_meta_ecc(sb, bh->b_data, &dx_root->dr_check); + if (ret) { + mlog(ML_ERROR, + "Checksum failed for dir index root block %llu\n", + (unsigned long long)bh->b_blocknr); + return ret; + } + + if (!OCFS2_IS_VALID_DX_ROOT(dx_root)) { + ocfs2_error(sb, + "Dir Index Root # %llu has bad signature %.*s", + (unsigned long long)le64_to_cpu(dx_root->dr_blkno), + 7, dx_root->dr_signature); + return -EINVAL; + } + + return 0; +} + +static int ocfs2_read_dx_root(struct inode *dir, struct ocfs2_dinode *di, + struct buffer_head **dx_root_bh) +{ + int ret; + u64 blkno = le64_to_cpu(di->i_dx_root); + struct buffer_head *tmp = *dx_root_bh; + + ret = ocfs2_read_block(dir, blkno, &tmp, ocfs2_validate_dx_root); + + /* If ocfs2_read_block() got us a new bh, pass it up. */ + if (!ret && !*dx_root_bh) + *dx_root_bh = tmp; + + return ret; +} + +static int ocfs2_validate_dx_leaf(struct super_block *sb, + struct buffer_head *bh) +{ + int ret; + struct ocfs2_dx_leaf *dx_leaf = (struct ocfs2_dx_leaf *)bh->b_data; + + BUG_ON(!buffer_uptodate(bh)); + + ret = ocfs2_validate_meta_ecc(sb, bh->b_data, &dx_leaf->dl_check); + if (ret) { + mlog(ML_ERROR, + "Checksum failed for dir index leaf block %llu\n", + (unsigned long long)bh->b_blocknr); + return ret; + } + + if (!OCFS2_IS_VALID_DX_LEAF(dx_leaf)) { + ocfs2_error(sb, "Dir Index Leaf has bad signature %.*s", + 7, dx_leaf->dl_signature); + return -EROFS; + } + + return 0; +} + +static int ocfs2_read_dx_leaf(struct inode *dir, u64 blkno, + struct buffer_head **dx_leaf_bh) +{ + int ret; + struct buffer_head *tmp = *dx_leaf_bh; + + ret = ocfs2_read_block(dir, blkno, &tmp, ocfs2_validate_dx_leaf); + + /* If ocfs2_read_block() got us a new bh, pass it up. */ + if (!ret && !*dx_leaf_bh) + *dx_leaf_bh = tmp; + + return ret; +} + +/* + * Read a series of dx_leaf blocks. This expects all buffer_head + * pointers to be NULL on function entry. + */ +static int ocfs2_read_dx_leaves(struct inode *dir, u64 start, int num, + struct buffer_head **dx_leaf_bhs) +{ + int ret; + + ret = ocfs2_read_blocks(dir, start, num, dx_leaf_bhs, 0, + ocfs2_validate_dx_leaf); + if (ret) + mlog_errno(ret); + + return ret; +} + static struct buffer_head *ocfs2_find_entry_el(const char *name, int namelen, struct inode *dir, struct ocfs2_dir_entry **res_dir) @@ -480,39 +789,340 @@ cleanup_and_exit: return ret; } +static int ocfs2_dx_dir_lookup_rec(struct inode *inode, + struct ocfs2_extent_list *el, + u32 major_hash, + u32 *ret_cpos, + u64 *ret_phys_blkno, + unsigned int *ret_clen) +{ + int ret = 0, i, found; + struct buffer_head *eb_bh = NULL; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_rec *rec = NULL; + + if (el->l_tree_depth) { + ret = ocfs2_find_leaf(inode, el, major_hash, &eb_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + eb = (struct ocfs2_extent_block *) eb_bh->b_data; + el = &eb->h_list; + + if (el->l_tree_depth) { + ocfs2_error(inode->i_sb, + "Inode %lu has non zero tree depth in " + "btree tree block %llu\n", inode->i_ino, + (unsigned long long)eb_bh->b_blocknr); + ret = -EROFS; + goto out; + } + } + + found = 0; + for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) { + rec = &el->l_recs[i]; + + if (le32_to_cpu(rec->e_cpos) <= major_hash) { + found = 1; + break; + } + } + + if (!found) { + ocfs2_error(inode->i_sb, "Inode %lu has bad extent " + "record (%u, %u, 0) in btree", inode->i_ino, + le32_to_cpu(rec->e_cpos), + ocfs2_rec_clusters(el, rec)); + ret = -EROFS; + goto out; + } + + if (ret_phys_blkno) + *ret_phys_blkno = le64_to_cpu(rec->e_blkno); + if (ret_cpos) + *ret_cpos = le32_to_cpu(rec->e_cpos); + if (ret_clen) + *ret_clen = le16_to_cpu(rec->e_leaf_clusters); + +out: + brelse(eb_bh); + return ret; +} + +/* + * Returns the block index, from the start of the cluster which this + * hash belongs too. + */ +static inline unsigned int __ocfs2_dx_dir_hash_idx(struct ocfs2_super *osb, + u32 minor_hash) +{ + return minor_hash & osb->osb_dx_mask; +} + +static inline unsigned int ocfs2_dx_dir_hash_idx(struct ocfs2_super *osb, + struct ocfs2_dx_hinfo *hinfo) +{ + return __ocfs2_dx_dir_hash_idx(osb, hinfo->minor_hash); +} + +static int ocfs2_dx_dir_lookup(struct inode *inode, + struct ocfs2_extent_list *el, + struct ocfs2_dx_hinfo *hinfo, + u32 *ret_cpos, + u64 *ret_phys_blkno) +{ + int ret = 0; + unsigned int cend, uninitialized_var(clen); + u32 uninitialized_var(cpos); + u64 uninitialized_var(blkno); + u32 name_hash = hinfo->major_hash; + + ret = ocfs2_dx_dir_lookup_rec(inode, el, name_hash, &cpos, &blkno, + &clen); + if (ret) { + mlog_errno(ret); + goto out; + } + + cend = cpos + clen; + if (name_hash >= cend) { + /* We want the last cluster */ + blkno += ocfs2_clusters_to_blocks(inode->i_sb, clen - 1); + cpos += clen - 1; + } else { + blkno += ocfs2_clusters_to_blocks(inode->i_sb, + name_hash - cpos); + cpos = name_hash; + } + + /* + * We now have the cluster which should hold our entry. To + * find the exact block from the start of the cluster to + * search, we take the lower bits of the hash. + */ + blkno += ocfs2_dx_dir_hash_idx(OCFS2_SB(inode->i_sb), hinfo); + + if (ret_phys_blkno) + *ret_phys_blkno = blkno; + if (ret_cpos) + *ret_cpos = cpos; + +out: + + return ret; +} + +static int ocfs2_dx_dir_search(const char *name, int namelen, + struct inode *dir, + struct ocfs2_dx_root_block *dx_root, + struct ocfs2_dir_lookup_result *res) +{ + int ret, i, found; + u64 uninitialized_var(phys); + struct buffer_head *dx_leaf_bh = NULL; + struct ocfs2_dx_leaf *dx_leaf; + struct ocfs2_dx_entry *dx_entry = NULL; + struct buffer_head *dir_ent_bh = NULL; + struct ocfs2_dir_entry *dir_ent = NULL; + struct ocfs2_dx_hinfo *hinfo = &res->dl_hinfo; + struct ocfs2_extent_list *dr_el; + struct ocfs2_dx_entry_list *entry_list; + + ocfs2_dx_dir_name_hash(dir, name, namelen, &res->dl_hinfo); + + if (ocfs2_dx_root_inline(dx_root)) { + entry_list = &dx_root->dr_entries; + goto search; + } + + dr_el = &dx_root->dr_list; + + ret = ocfs2_dx_dir_lookup(dir, dr_el, hinfo, NULL, &phys); + if (ret) { + mlog_errno(ret); + goto out; + } + + mlog(0, "Dir %llu: name: \"%.*s\", lookup of hash: %u.0x%x " + "returns: %llu\n", + (unsigned long long)OCFS2_I(dir)->ip_blkno, + namelen, name, hinfo->major_hash, hinfo->minor_hash, + (unsigned long long)phys); + + ret = ocfs2_read_dx_leaf(dir, phys, &dx_leaf_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + dx_leaf = (struct ocfs2_dx_leaf *) dx_leaf_bh->b_data; + + mlog(0, "leaf info: num_used: %d, count: %d\n", + le16_to_cpu(dx_leaf->dl_list.de_num_used), + le16_to_cpu(dx_leaf->dl_list.de_count)); + + entry_list = &dx_leaf->dl_list; + +search: + /* + * Empty leaf is legal, so no need to check for that. + */ + found = 0; + for (i = 0; i < le16_to_cpu(entry_list->de_num_used); i++) { + dx_entry = &entry_list->de_entries[i]; + + if (hinfo->major_hash != le32_to_cpu(dx_entry->dx_major_hash) + || hinfo->minor_hash != le32_to_cpu(dx_entry->dx_minor_hash)) + continue; + + /* + * Search unindexed leaf block now. We're not + * guaranteed to find anything. + */ + ret = ocfs2_read_dir_block_direct(dir, + le64_to_cpu(dx_entry->dx_dirent_blk), + &dir_ent_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * XXX: We should check the unindexed block here, + * before using it. + */ + + found = ocfs2_search_dirblock(dir_ent_bh, dir, name, namelen, + 0, dir_ent_bh->b_data, + dir->i_sb->s_blocksize, &dir_ent); + if (found == 1) + break; + + if (found == -1) { + /* This means we found a bad directory entry. */ + ret = -EIO; + mlog_errno(ret); + goto out; + } + + brelse(dir_ent_bh); + dir_ent_bh = NULL; + } + + if (found <= 0) { + ret = -ENOENT; + goto out; + } + + res->dl_leaf_bh = dir_ent_bh; + res->dl_entry = dir_ent; + res->dl_dx_leaf_bh = dx_leaf_bh; + res->dl_dx_entry = dx_entry; + + ret = 0; +out: + if (ret) { + brelse(dx_leaf_bh); + brelse(dir_ent_bh); + } + return ret; +} + +static int ocfs2_find_entry_dx(const char *name, int namelen, + struct inode *dir, + struct ocfs2_dir_lookup_result *lookup) +{ + int ret; + struct buffer_head *di_bh = NULL; + struct ocfs2_dinode *di; + struct buffer_head *dx_root_bh = NULL; + struct ocfs2_dx_root_block *dx_root; + + ret = ocfs2_read_inode_block(dir, &di_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + di = (struct ocfs2_dinode *)di_bh->b_data; + + ret = ocfs2_read_dx_root(dir, di, &dx_root_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + dx_root = (struct ocfs2_dx_root_block *) dx_root_bh->b_data; + + ret = ocfs2_dx_dir_search(name, namelen, dir, dx_root, lookup); + if (ret) { + if (ret != -ENOENT) + mlog_errno(ret); + goto out; + } + + lookup->dl_dx_root_bh = dx_root_bh; + dx_root_bh = NULL; +out: + brelse(di_bh); + brelse(dx_root_bh); + return ret; +} + /* * Try to find an entry of the provided name within 'dir'. * - * If nothing was found, NULL is returned. Otherwise, a buffer_head - * and pointer to the dir entry are passed back. + * If nothing was found, -ENOENT is returned. Otherwise, zero is + * returned and the struct 'res' will contain information useful to + * other directory manipulation functions. * * Caller can NOT assume anything about the contents of the - * buffer_head - it is passed back only so that it can be passed into - * any one of the manipulation functions (add entry, delete entry, - * etc). As an example, bh in the extent directory case is a data - * block, in the inline-data case it actually points to an inode. + * buffer_heads - they are passed back only so that it can be passed + * into any one of the manipulation functions (add entry, delete + * entry, etc). As an example, bh in the extent directory case is a + * data block, in the inline-data case it actually points to an inode, + * in the indexed directory case, multiple buffers are involved. */ -struct buffer_head *ocfs2_find_entry(const char *name, int namelen, - struct inode *dir, - struct ocfs2_dir_entry **res_dir) +int ocfs2_find_entry(const char *name, int namelen, + struct inode *dir, struct ocfs2_dir_lookup_result *lookup) { - *res_dir = NULL; + struct buffer_head *bh; + struct ocfs2_dir_entry *res_dir = NULL; + if (ocfs2_dir_indexed(dir)) + return ocfs2_find_entry_dx(name, namelen, dir, lookup); + + /* + * The unindexed dir code only uses part of the lookup + * structure, so there's no reason to push it down further + * than this. + */ if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) - return ocfs2_find_entry_id(name, namelen, dir, res_dir); + bh = ocfs2_find_entry_id(name, namelen, dir, &res_dir); + else + bh = ocfs2_find_entry_el(name, namelen, dir, &res_dir); + + if (bh == NULL) + return -ENOENT; - return ocfs2_find_entry_el(name, namelen, dir, res_dir); + lookup->dl_leaf_bh = bh; + lookup->dl_entry = res_dir; + return 0; } /* * Update inode number and type of a previously found directory entry. */ int ocfs2_update_entry(struct inode *dir, handle_t *handle, - struct buffer_head *de_bh, struct ocfs2_dir_entry *de, + struct ocfs2_dir_lookup_result *res, struct inode *new_entry_inode) { int ret; ocfs2_journal_access_func access = ocfs2_journal_access_db; + struct ocfs2_dir_entry *de = res->dl_entry; + struct buffer_head *de_bh = res->dl_leaf_bh; /* * The same code works fine for both inline-data and extent @@ -538,6 +1148,10 @@ out: return ret; } +/* + * __ocfs2_delete_entry deletes a directory entry by merging it with the + * previous entry + */ static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir, struct ocfs2_dir_entry *de_del, struct buffer_head *bh, char *first_de, @@ -587,6 +1201,181 @@ bail: return status; } +static unsigned int ocfs2_figure_dirent_hole(struct ocfs2_dir_entry *de) +{ + unsigned int hole; + + if (le64_to_cpu(de->inode) == 0) + hole = le16_to_cpu(de->rec_len); + else + hole = le16_to_cpu(de->rec_len) - + OCFS2_DIR_REC_LEN(de->name_len); + + return hole; +} + +static int ocfs2_find_max_rec_len(struct super_block *sb, + struct buffer_head *dirblock_bh) +{ + int size, this_hole, largest_hole = 0; + char *trailer, *de_buf, *limit, *start = dirblock_bh->b_data; + struct ocfs2_dir_entry *de; + + trailer = (char *)ocfs2_trailer_from_bh(dirblock_bh, sb); + size = ocfs2_dir_trailer_blk_off(sb); + limit = start + size; + de_buf = start; + de = (struct ocfs2_dir_entry *)de_buf; + do { + if (de_buf != trailer) { + this_hole = ocfs2_figure_dirent_hole(de); + if (this_hole > largest_hole) + largest_hole = this_hole; + } + + de_buf += le16_to_cpu(de->rec_len); + de = (struct ocfs2_dir_entry *)de_buf; + } while (de_buf < limit); + + if (largest_hole >= OCFS2_DIR_MIN_REC_LEN) + return largest_hole; + return 0; +} + +static void ocfs2_dx_list_remove_entry(struct ocfs2_dx_entry_list *entry_list, + int index) +{ + int num_used = le16_to_cpu(entry_list->de_num_used); + + if (num_used == 1 || index == (num_used - 1)) + goto clear; + + memmove(&entry_list->de_entries[index], + &entry_list->de_entries[index + 1], + (num_used - index - 1)*sizeof(struct ocfs2_dx_entry)); +clear: + num_used--; + memset(&entry_list->de_entries[num_used], 0, + sizeof(struct ocfs2_dx_entry)); + entry_list->de_num_used = cpu_to_le16(num_used); +} + +static int ocfs2_delete_entry_dx(handle_t *handle, struct inode *dir, + struct ocfs2_dir_lookup_result *lookup) +{ + int ret, index, max_rec_len, add_to_free_list = 0; + struct buffer_head *dx_root_bh = lookup->dl_dx_root_bh; + struct buffer_head *leaf_bh = lookup->dl_leaf_bh; + struct ocfs2_dx_leaf *dx_leaf; + struct ocfs2_dx_entry *dx_entry = lookup->dl_dx_entry; + struct ocfs2_dir_block_trailer *trailer; + struct ocfs2_dx_root_block *dx_root; + struct ocfs2_dx_entry_list *entry_list; + + /* + * This function gets a bit messy because we might have to + * modify the root block, regardless of whether the indexed + * entries are stored inline. + */ + + /* + * *Only* set 'entry_list' here, based on where we're looking + * for the indexed entries. Later, we might still want to + * journal both blocks, based on free list state. + */ + dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data; + if (ocfs2_dx_root_inline(dx_root)) { + entry_list = &dx_root->dr_entries; + } else { + dx_leaf = (struct ocfs2_dx_leaf *) lookup->dl_dx_leaf_bh->b_data; + entry_list = &dx_leaf->dl_list; + } + + /* Neither of these are a disk corruption - that should have + * been caught by lookup, before we got here. */ + BUG_ON(le16_to_cpu(entry_list->de_count) <= 0); + BUG_ON(le16_to_cpu(entry_list->de_num_used) <= 0); + + index = (char *)dx_entry - (char *)entry_list->de_entries; + index /= sizeof(*dx_entry); + + if (index >= le16_to_cpu(entry_list->de_num_used)) { + mlog(ML_ERROR, "Dir %llu: Bad dx_entry ptr idx %d, (%p, %p)\n", + (unsigned long long)OCFS2_I(dir)->ip_blkno, index, + entry_list, dx_entry); + return -EIO; + } + + /* + * We know that removal of this dirent will leave enough room + * for a new one, so add this block to the free list if it + * isn't already there. + */ + trailer = ocfs2_trailer_from_bh(leaf_bh, dir->i_sb); + if (trailer->db_free_rec_len == 0) + add_to_free_list = 1; + + /* + * Add the block holding our index into the journal before + * removing the unindexed entry. If we get an error return + * from __ocfs2_delete_entry(), then it hasn't removed the + * entry yet. Likewise, successful return means we *must* + * remove the indexed entry. + * + * We're also careful to journal the root tree block here as + * the entry count needs to be updated. Also, we might be + * adding to the start of the free list. + */ + ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (!ocfs2_dx_root_inline(dx_root)) { + ret = ocfs2_journal_access_dl(handle, dir, + lookup->dl_dx_leaf_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + mlog(0, "Dir %llu: delete entry at index: %d\n", + (unsigned long long)OCFS2_I(dir)->ip_blkno, index); + + ret = __ocfs2_delete_entry(handle, dir, lookup->dl_entry, + leaf_bh, leaf_bh->b_data, leaf_bh->b_size); + if (ret) { + mlog_errno(ret); + goto out; + } + + max_rec_len = ocfs2_find_max_rec_len(dir->i_sb, leaf_bh); + trailer->db_free_rec_len = cpu_to_le16(max_rec_len); + if (add_to_free_list) { + trailer->db_free_next = dx_root->dr_free_blk; + dx_root->dr_free_blk = cpu_to_le64(leaf_bh->b_blocknr); + ocfs2_journal_dirty(handle, dx_root_bh); + } + + /* leaf_bh was journal_accessed for us in __ocfs2_delete_entry */ + ocfs2_journal_dirty(handle, leaf_bh); + + le32_add_cpu(&dx_root->dr_num_entries, -1); + ocfs2_journal_dirty(handle, dx_root_bh); + + ocfs2_dx_list_remove_entry(entry_list, index); + + if (!ocfs2_dx_root_inline(dx_root)) + ocfs2_journal_dirty(handle, lookup->dl_dx_leaf_bh); + +out: + return ret; +} + static inline int ocfs2_delete_entry_id(handle_t *handle, struct inode *dir, struct ocfs2_dir_entry *de_del, @@ -624,18 +1413,22 @@ static inline int ocfs2_delete_entry_el(handle_t *handle, } /* - * ocfs2_delete_entry deletes a directory entry by merging it with the - * previous entry + * Delete a directory entry. Hide the details of directory + * implementation from the caller. */ int ocfs2_delete_entry(handle_t *handle, struct inode *dir, - struct ocfs2_dir_entry *de_del, - struct buffer_head *bh) + struct ocfs2_dir_lookup_result *res) { + if (ocfs2_dir_indexed(dir)) + return ocfs2_delete_entry_dx(handle, dir, res); + if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) - return ocfs2_delete_entry_id(handle, dir, de_del, bh); + return ocfs2_delete_entry_id(handle, dir, res->dl_entry, + res->dl_leaf_bh); - return ocfs2_delete_entry_el(handle, dir, de_del, bh); + return ocfs2_delete_entry_el(handle, dir, res->dl_entry, + res->dl_leaf_bh); } /* @@ -663,18 +1456,166 @@ static inline int ocfs2_dirent_would_fit(struct ocfs2_dir_entry *de, return 0; } +static void ocfs2_dx_dir_leaf_insert_tail(struct ocfs2_dx_leaf *dx_leaf, + struct ocfs2_dx_entry *dx_new_entry) +{ + int i; + + i = le16_to_cpu(dx_leaf->dl_list.de_num_used); + dx_leaf->dl_list.de_entries[i] = *dx_new_entry; + + le16_add_cpu(&dx_leaf->dl_list.de_num_used, 1); +} + +static void ocfs2_dx_entry_list_insert(struct ocfs2_dx_entry_list *entry_list, + struct ocfs2_dx_hinfo *hinfo, + u64 dirent_blk) +{ + int i; + struct ocfs2_dx_entry *dx_entry; + + i = le16_to_cpu(entry_list->de_num_used); + dx_entry = &entry_list->de_entries[i]; + + memset(dx_entry, 0, sizeof(*dx_entry)); + dx_entry->dx_major_hash = cpu_to_le32(hinfo->major_hash); + dx_entry->dx_minor_hash = cpu_to_le32(hinfo->minor_hash); + dx_entry->dx_dirent_blk = cpu_to_le64(dirent_blk); + + le16_add_cpu(&entry_list->de_num_used, 1); +} + +static int __ocfs2_dx_dir_leaf_insert(struct inode *dir, handle_t *handle, + struct ocfs2_dx_hinfo *hinfo, + u64 dirent_blk, + struct buffer_head *dx_leaf_bh) +{ + int ret; + struct ocfs2_dx_leaf *dx_leaf; + + ret = ocfs2_journal_access_dl(handle, dir, dx_leaf_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + dx_leaf = (struct ocfs2_dx_leaf *)dx_leaf_bh->b_data; + ocfs2_dx_entry_list_insert(&dx_leaf->dl_list, hinfo, dirent_blk); + ocfs2_journal_dirty(handle, dx_leaf_bh); + +out: + return ret; +} + +static void ocfs2_dx_inline_root_insert(struct inode *dir, handle_t *handle, + struct ocfs2_dx_hinfo *hinfo, + u64 dirent_blk, + struct ocfs2_dx_root_block *dx_root) +{ + ocfs2_dx_entry_list_insert(&dx_root->dr_entries, hinfo, dirent_blk); +} + +static int ocfs2_dx_dir_insert(struct inode *dir, handle_t *handle, + struct ocfs2_dir_lookup_result *lookup) +{ + int ret = 0; + struct ocfs2_dx_root_block *dx_root; + struct buffer_head *dx_root_bh = lookup->dl_dx_root_bh; + + ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + dx_root = (struct ocfs2_dx_root_block *)lookup->dl_dx_root_bh->b_data; + if (ocfs2_dx_root_inline(dx_root)) { + ocfs2_dx_inline_root_insert(dir, handle, + &lookup->dl_hinfo, + lookup->dl_leaf_bh->b_blocknr, + dx_root); + } else { + ret = __ocfs2_dx_dir_leaf_insert(dir, handle, &lookup->dl_hinfo, + lookup->dl_leaf_bh->b_blocknr, + lookup->dl_dx_leaf_bh); + if (ret) + goto out; + } + + le32_add_cpu(&dx_root->dr_num_entries, 1); + ocfs2_journal_dirty(handle, dx_root_bh); + +out: + return ret; +} + +static void ocfs2_remove_block_from_free_list(struct inode *dir, + handle_t *handle, + struct ocfs2_dir_lookup_result *lookup) +{ + struct ocfs2_dir_block_trailer *trailer, *prev; + struct ocfs2_dx_root_block *dx_root; + struct buffer_head *bh; + + trailer = ocfs2_trailer_from_bh(lookup->dl_leaf_bh, dir->i_sb); + + if (ocfs2_free_list_at_root(lookup)) { + bh = lookup->dl_dx_root_bh; + dx_root = (struct ocfs2_dx_root_block *)bh->b_data; + dx_root->dr_free_blk = trailer->db_free_next; + } else { + bh = lookup->dl_prev_leaf_bh; + prev = ocfs2_trailer_from_bh(bh, dir->i_sb); + prev->db_free_next = trailer->db_free_next; + } + + trailer->db_free_rec_len = cpu_to_le16(0); + trailer->db_free_next = cpu_to_le64(0); + + ocfs2_journal_dirty(handle, bh); + ocfs2_journal_dirty(handle, lookup->dl_leaf_bh); +} + +/* + * This expects that a journal write has been reserved on + * lookup->dl_prev_leaf_bh or lookup->dl_dx_root_bh + */ +static void ocfs2_recalc_free_list(struct inode *dir, handle_t *handle, + struct ocfs2_dir_lookup_result *lookup) +{ + int max_rec_len; + struct ocfs2_dir_block_trailer *trailer; + + /* Walk dl_leaf_bh to figure out what the new free rec_len is. */ + max_rec_len = ocfs2_find_max_rec_len(dir->i_sb, lookup->dl_leaf_bh); + if (max_rec_len) { + /* + * There's still room in this block, so no need to remove it + * from the free list. In this case, we just want to update + * the rec len accounting. + */ + trailer = ocfs2_trailer_from_bh(lookup->dl_leaf_bh, dir->i_sb); + trailer->db_free_rec_len = cpu_to_le16(max_rec_len); + ocfs2_journal_dirty(handle, lookup->dl_leaf_bh); + } else { + ocfs2_remove_block_from_free_list(dir, handle, lookup); + } +} + /* we don't always have a dentry for what we want to add, so people * like orphan dir can call this instead. * - * If you pass me insert_bh, I'll skip the search of the other dir - * blocks and put the record in there. + * The lookup context must have been filled from + * ocfs2_prepare_dir_for_insert. */ int __ocfs2_add_entry(handle_t *handle, struct inode *dir, const char *name, int namelen, struct inode *inode, u64 blkno, struct buffer_head *parent_fe_bh, - struct buffer_head *insert_bh) + struct ocfs2_dir_lookup_result *lookup) { unsigned long offset; unsigned short rec_len; @@ -683,6 +1624,7 @@ int __ocfs2_add_entry(handle_t *handle, struct super_block *sb = dir->i_sb; int retval, status; unsigned int size = sb->s_blocksize; + struct buffer_head *insert_bh = lookup->dl_leaf_bh; char *data_start = insert_bh->b_data; mlog_entry_void(); @@ -690,7 +1632,31 @@ int __ocfs2_add_entry(handle_t *handle, if (!namelen) return -EINVAL; - if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + if (ocfs2_dir_indexed(dir)) { + struct buffer_head *bh; + + /* + * An indexed dir may require that we update the free space + * list. Reserve a write to the previous node in the list so + * that we don't fail later. + * + * XXX: This can be either a dx_root_block, or an unindexed + * directory tree leaf block. + */ + if (ocfs2_free_list_at_root(lookup)) { + bh = lookup->dl_dx_root_bh; + retval = ocfs2_journal_access_dr(handle, dir, bh, + OCFS2_JOURNAL_ACCESS_WRITE); + } else { + bh = lookup->dl_prev_leaf_bh; + retval = ocfs2_journal_access_db(handle, dir, bh, + OCFS2_JOURNAL_ACCESS_WRITE); + } + if (retval) { + mlog_errno(retval); + return retval; + } + } else if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { data_start = di->id2.i_data.id_data; size = i_size_read(dir); @@ -737,10 +1703,22 @@ int __ocfs2_add_entry(handle_t *handle, status = ocfs2_journal_access_di(handle, dir, insert_bh, OCFS2_JOURNAL_ACCESS_WRITE); - else + else { status = ocfs2_journal_access_db(handle, dir, insert_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + OCFS2_JOURNAL_ACCESS_WRITE); + + if (ocfs2_dir_indexed(dir)) { + status = ocfs2_dx_dir_insert(dir, + handle, + lookup); + if (status) { + mlog_errno(status); + goto bail; + } + } + } + /* By now the buffer is marked for journaling */ offset += le16_to_cpu(de->rec_len); if (le64_to_cpu(de->inode)) { @@ -761,6 +1739,9 @@ int __ocfs2_add_entry(handle_t *handle, de->name_len = namelen; memcpy(de->name, name, namelen); + if (ocfs2_dir_indexed(dir)) + ocfs2_recalc_free_list(dir, handle, lookup); + dir->i_version++; status = ocfs2_journal_dirty(handle, insert_bh); retval = 0; @@ -870,6 +1851,10 @@ out: return 0; } +/* + * NOTE: This function can be called against unindexed directories, + * and indexed ones. + */ static int ocfs2_dir_foreach_blk_el(struct inode *inode, u64 *f_version, loff_t *f_pos, void *priv, @@ -1071,31 +2056,22 @@ int ocfs2_find_files_on_disk(const char *name, int namelen, u64 *blkno, struct inode *inode, - struct buffer_head **dirent_bh, - struct ocfs2_dir_entry **dirent) + struct ocfs2_dir_lookup_result *lookup) { int status = -ENOENT; - mlog_entry("(name=%.*s, blkno=%p, inode=%p, dirent_bh=%p, dirent=%p)\n", - namelen, name, blkno, inode, dirent_bh, dirent); + mlog(0, "name=%.*s, blkno=%p, inode=%llu\n", namelen, name, blkno, + (unsigned long long)OCFS2_I(inode)->ip_blkno); - *dirent_bh = ocfs2_find_entry(name, namelen, inode, dirent); - if (!*dirent_bh || !*dirent) { - status = -ENOENT; + status = ocfs2_find_entry(name, namelen, inode, lookup); + if (status) goto leave; - } - *blkno = le64_to_cpu((*dirent)->inode); + *blkno = le64_to_cpu(lookup->dl_entry->inode); status = 0; leave: - if (status < 0) { - *dirent = NULL; - brelse(*dirent_bh); - *dirent_bh = NULL; - } - mlog_exit(status); return status; } @@ -1107,11 +2083,10 @@ int ocfs2_lookup_ino_from_name(struct inode *dir, const char *name, int namelen, u64 *blkno) { int ret; - struct buffer_head *bh = NULL; - struct ocfs2_dir_entry *dirent = NULL; + struct ocfs2_dir_lookup_result lookup = { NULL, }; - ret = ocfs2_find_files_on_disk(name, namelen, blkno, dir, &bh, &dirent); - brelse(bh); + ret = ocfs2_find_files_on_disk(name, namelen, blkno, dir, &lookup); + ocfs2_free_dir_lookup_result(&lookup); return ret; } @@ -1128,20 +2103,18 @@ int ocfs2_check_dir_for_entry(struct inode *dir, int namelen) { int ret; - struct buffer_head *dirent_bh = NULL; - struct ocfs2_dir_entry *dirent = NULL; + struct ocfs2_dir_lookup_result lookup = { NULL, }; mlog_entry("dir %llu, name '%.*s'\n", (unsigned long long)OCFS2_I(dir)->ip_blkno, namelen, name); ret = -EEXIST; - dirent_bh = ocfs2_find_entry(name, namelen, dir, &dirent); - if (dirent_bh) + if (ocfs2_find_entry(name, namelen, dir, &lookup) == 0) goto bail; ret = 0; bail: - brelse(dirent_bh); + ocfs2_free_dir_lookup_result(&lookup); mlog_exit(ret); return ret; @@ -1151,6 +2124,7 @@ struct ocfs2_empty_dir_priv { unsigned seen_dot; unsigned seen_dot_dot; unsigned seen_other; + unsigned dx_dir; }; static int ocfs2_empty_dir_filldir(void *priv, const char *name, int name_len, loff_t pos, u64 ino, unsigned type) @@ -1160,6 +2134,13 @@ static int ocfs2_empty_dir_filldir(void *priv, const char *name, int name_len, /* * Check the positions of "." and ".." records to be sure * they're in the correct place. + * + * Indexed directories don't need to proceed past the first + * two entries, so we end the scan after seeing '..'. Despite + * that, we allow the scan to proceed In the event that we + * have a corrupted indexed directory (no dot or dot dot + * entries). This allows us to double check for existing + * entries which might not have been found in the index. */ if (name_len == 1 && !strncmp(".", name, 1) && pos == 0) { p->seen_dot = 1; @@ -1169,16 +2150,57 @@ static int ocfs2_empty_dir_filldir(void *priv, const char *name, int name_len, if (name_len == 2 && !strncmp("..", name, 2) && pos == OCFS2_DIR_REC_LEN(1)) { p->seen_dot_dot = 1; + + if (p->dx_dir && p->seen_dot) + return 1; + return 0; } p->seen_other = 1; return 1; } + +static int ocfs2_empty_dir_dx(struct inode *inode, + struct ocfs2_empty_dir_priv *priv) +{ + int ret; + struct buffer_head *di_bh = NULL; + struct buffer_head *dx_root_bh = NULL; + struct ocfs2_dinode *di; + struct ocfs2_dx_root_block *dx_root; + + priv->dx_dir = 1; + + ret = ocfs2_read_inode_block(inode, &di_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + di = (struct ocfs2_dinode *)di_bh->b_data; + + ret = ocfs2_read_dx_root(inode, di, &dx_root_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data; + + if (le32_to_cpu(dx_root->dr_num_entries) != 2) + priv->seen_other = 1; + +out: + brelse(di_bh); + brelse(dx_root_bh); + return ret; +} + /* * routine to check that the specified directory is empty (for rmdir) * * Returns 1 if dir is empty, zero otherwise. + * + * XXX: This is a performance problem for unindexed directories. */ int ocfs2_empty_dir(struct inode *inode) { @@ -1188,6 +2210,16 @@ int ocfs2_empty_dir(struct inode *inode) memset(&priv, 0, sizeof(priv)); + if (ocfs2_dir_indexed(inode)) { + ret = ocfs2_empty_dir_dx(inode, &priv); + if (ret) + mlog_errno(ret); + /* + * We still run ocfs2_dir_foreach to get the checks + * for "." and "..". + */ + } + ret = ocfs2_dir_foreach(inode, &start, &priv, ocfs2_empty_dir_filldir); if (ret) mlog_errno(ret); @@ -1280,7 +2312,8 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb, struct inode *parent, struct inode *inode, struct buffer_head *fe_bh, - struct ocfs2_alloc_context *data_ac) + struct ocfs2_alloc_context *data_ac, + struct buffer_head **ret_new_bh) { int status; unsigned int size = osb->sb->s_blocksize; @@ -1289,7 +2322,7 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb, mlog_entry_void(); - if (ocfs2_supports_dir_trailer(osb)) + if (ocfs2_new_dir_wants_trailer(inode)) size = ocfs2_dir_trailer_blk_off(parent->i_sb); status = ocfs2_do_extend_dir(osb->sb, handle, inode, fe_bh, @@ -1310,8 +2343,19 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb, memset(new_bh->b_data, 0, osb->sb->s_blocksize); de = ocfs2_fill_initial_dirents(inode, parent, new_bh->b_data, size); - if (ocfs2_supports_dir_trailer(osb)) - ocfs2_init_dir_trailer(inode, new_bh); + if (ocfs2_new_dir_wants_trailer(inode)) { + int size = le16_to_cpu(de->rec_len); + + /* + * Figure out the size of the hole left over after + * insertion of '.' and '..'. The trailer wants this + * information. + */ + size -= OCFS2_DIR_REC_LEN(2); + size -= sizeof(struct ocfs2_dir_block_trailer); + + ocfs2_init_dir_trailer(inode, new_bh, size); + } status = ocfs2_journal_dirty(handle, new_bh); if (status < 0) { @@ -1329,6 +2373,10 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb, } status = 0; + if (ret_new_bh) { + *ret_new_bh = new_bh; + new_bh = NULL; + } bail: brelse(new_bh); @@ -1336,20 +2384,427 @@ bail: return status; } +static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb, + handle_t *handle, struct inode *dir, + struct buffer_head *di_bh, + struct buffer_head *dirdata_bh, + struct ocfs2_alloc_context *meta_ac, + int dx_inline, u32 num_entries, + struct buffer_head **ret_dx_root_bh) +{ + int ret; + struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data; + u16 dr_suballoc_bit; + u64 dr_blkno; + unsigned int num_bits; + struct buffer_head *dx_root_bh = NULL; + struct ocfs2_dx_root_block *dx_root; + struct ocfs2_dir_block_trailer *trailer = + ocfs2_trailer_from_bh(dirdata_bh, dir->i_sb); + + ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1, &dr_suballoc_bit, + &num_bits, &dr_blkno); + if (ret) { + mlog_errno(ret); + goto out; + } + + mlog(0, "Dir %llu, attach new index block: %llu\n", + (unsigned long long)OCFS2_I(dir)->ip_blkno, + (unsigned long long)dr_blkno); + + dx_root_bh = sb_getblk(osb->sb, dr_blkno); + if (dx_root_bh == NULL) { + ret = -EIO; + goto out; + } + ocfs2_set_new_buffer_uptodate(dir, dx_root_bh); + + ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data; + memset(dx_root, 0, osb->sb->s_blocksize); + strcpy(dx_root->dr_signature, OCFS2_DX_ROOT_SIGNATURE); + dx_root->dr_suballoc_slot = cpu_to_le16(osb->slot_num); + dx_root->dr_suballoc_bit = cpu_to_le16(dr_suballoc_bit); + dx_root->dr_fs_generation = cpu_to_le32(osb->fs_generation); + dx_root->dr_blkno = cpu_to_le64(dr_blkno); + dx_root->dr_dir_blkno = cpu_to_le64(OCFS2_I(dir)->ip_blkno); + dx_root->dr_num_entries = cpu_to_le32(num_entries); + if (le16_to_cpu(trailer->db_free_rec_len)) + dx_root->dr_free_blk = cpu_to_le64(dirdata_bh->b_blocknr); + else + dx_root->dr_free_blk = cpu_to_le64(0); + + if (dx_inline) { + dx_root->dr_flags |= OCFS2_DX_FLAG_INLINE; + dx_root->dr_entries.de_count = + cpu_to_le16(ocfs2_dx_entries_per_root(osb->sb)); + } else { + dx_root->dr_list.l_count = + cpu_to_le16(ocfs2_extent_recs_per_dx_root(osb->sb)); + } + + ret = ocfs2_journal_dirty(handle, dx_root_bh); + if (ret) + mlog_errno(ret); + + ret = ocfs2_journal_access_di(handle, dir, di_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (ret) { + mlog_errno(ret); + goto out; + } + + di->i_dx_root = cpu_to_le64(dr_blkno); + + OCFS2_I(dir)->ip_dyn_features |= OCFS2_INDEXED_DIR_FL; + di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features); + + ret = ocfs2_journal_dirty(handle, di_bh); + if (ret) + mlog_errno(ret); + + *ret_dx_root_bh = dx_root_bh; + dx_root_bh = NULL; + +out: + brelse(dx_root_bh); + return ret; +} + +static int ocfs2_dx_dir_format_cluster(struct ocfs2_super *osb, + handle_t *handle, struct inode *dir, + struct buffer_head **dx_leaves, + int num_dx_leaves, u64 start_blk) +{ + int ret, i; + struct ocfs2_dx_leaf *dx_leaf; + struct buffer_head *bh; + + for (i = 0; i < num_dx_leaves; i++) { + bh = sb_getblk(osb->sb, start_blk + i); + if (bh == NULL) { + ret = -EIO; + goto out; + } + dx_leaves[i] = bh; + + ocfs2_set_new_buffer_uptodate(dir, bh); + + ret = ocfs2_journal_access_dl(handle, dir, bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + dx_leaf = (struct ocfs2_dx_leaf *) bh->b_data; + + memset(dx_leaf, 0, osb->sb->s_blocksize); + strcpy(dx_leaf->dl_signature, OCFS2_DX_LEAF_SIGNATURE); + dx_leaf->dl_fs_generation = cpu_to_le32(osb->fs_generation); + dx_leaf->dl_blkno = cpu_to_le64(bh->b_blocknr); + dx_leaf->dl_list.de_count = + cpu_to_le16(ocfs2_dx_entries_per_leaf(osb->sb)); + + mlog(0, + "Dir %llu, format dx_leaf: %llu, entry count: %u\n", + (unsigned long long)OCFS2_I(dir)->ip_blkno, + (unsigned long long)bh->b_blocknr, + le16_to_cpu(dx_leaf->dl_list.de_count)); + + ocfs2_journal_dirty(handle, bh); + } + + ret = 0; +out: + return ret; +} + +/* + * Allocates and formats a new cluster for use in an indexed dir + * leaf. This version will not do the extent insert, so that it can be + * used by operations which need careful ordering. + */ +static int __ocfs2_dx_dir_new_cluster(struct inode *dir, + u32 cpos, handle_t *handle, + struct ocfs2_alloc_context *data_ac, + struct buffer_head **dx_leaves, + int num_dx_leaves, u64 *ret_phys_blkno) +{ + int ret; + u32 phys, num; + u64 phys_blkno; + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); + + /* + * XXX: For create, this should claim cluster for the index + * *before* the unindexed insert so that we have a better + * chance of contiguousness as the directory grows in number + * of entries. + */ + ret = __ocfs2_claim_clusters(osb, handle, data_ac, 1, 1, &phys, &num); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * Format the new cluster first. That way, we're inserting + * valid data. + */ + phys_blkno = ocfs2_clusters_to_blocks(osb->sb, phys); + ret = ocfs2_dx_dir_format_cluster(osb, handle, dir, dx_leaves, + num_dx_leaves, phys_blkno); + if (ret) { + mlog_errno(ret); + goto out; + } + + *ret_phys_blkno = phys_blkno; +out: + return ret; +} + +static int ocfs2_dx_dir_new_cluster(struct inode *dir, + struct ocfs2_extent_tree *et, + u32 cpos, handle_t *handle, + struct ocfs2_alloc_context *data_ac, + struct ocfs2_alloc_context *meta_ac, + struct buffer_head **dx_leaves, + int num_dx_leaves) +{ + int ret; + u64 phys_blkno; + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); + + ret = __ocfs2_dx_dir_new_cluster(dir, cpos, handle, data_ac, dx_leaves, + num_dx_leaves, &phys_blkno); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_insert_extent(osb, handle, dir, et, cpos, phys_blkno, 1, 0, + meta_ac); + if (ret) + mlog_errno(ret); +out: + return ret; +} + +static struct buffer_head **ocfs2_dx_dir_kmalloc_leaves(struct super_block *sb, + int *ret_num_leaves) +{ + int num_dx_leaves = ocfs2_clusters_to_blocks(sb, 1); + struct buffer_head **dx_leaves; + + dx_leaves = kcalloc(num_dx_leaves, sizeof(struct buffer_head *), + GFP_NOFS); + if (dx_leaves && ret_num_leaves) + *ret_num_leaves = num_dx_leaves; + + return dx_leaves; +} + +static int ocfs2_fill_new_dir_dx(struct ocfs2_super *osb, + handle_t *handle, + struct inode *parent, + struct inode *inode, + struct buffer_head *di_bh, + struct ocfs2_alloc_context *data_ac, + struct ocfs2_alloc_context *meta_ac) +{ + int ret; + struct buffer_head *leaf_bh = NULL; + struct buffer_head *dx_root_bh = NULL; + struct ocfs2_dx_hinfo hinfo; + struct ocfs2_dx_root_block *dx_root; + struct ocfs2_dx_entry_list *entry_list; + + /* + * Our strategy is to create the directory as though it were + * unindexed, then add the index block. This works with very + * little complication since the state of a new directory is a + * very well known quantity. + * + * Essentially, we have two dirents ("." and ".."), in the 1st + * block which need indexing. These are easily inserted into + * the index block. + */ + + ret = ocfs2_fill_new_dir_el(osb, handle, parent, inode, di_bh, + data_ac, &leaf_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_dx_dir_attach_index(osb, handle, inode, di_bh, leaf_bh, + meta_ac, 1, 2, &dx_root_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data; + entry_list = &dx_root->dr_entries; + + /* Buffer has been journaled for us by ocfs2_dx_dir_attach_index */ + ocfs2_dx_dir_name_hash(inode, ".", 1, &hinfo); + ocfs2_dx_entry_list_insert(entry_list, &hinfo, leaf_bh->b_blocknr); + + ocfs2_dx_dir_name_hash(inode, "..", 2, &hinfo); + ocfs2_dx_entry_list_insert(entry_list, &hinfo, leaf_bh->b_blocknr); + +out: + brelse(dx_root_bh); + brelse(leaf_bh); + return ret; +} + int ocfs2_fill_new_dir(struct ocfs2_super *osb, handle_t *handle, struct inode *parent, struct inode *inode, struct buffer_head *fe_bh, - struct ocfs2_alloc_context *data_ac) + struct ocfs2_alloc_context *data_ac, + struct ocfs2_alloc_context *meta_ac) + { BUG_ON(!ocfs2_supports_inline_data(osb) && data_ac == NULL); if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) return ocfs2_fill_new_dir_id(osb, handle, parent, inode, fe_bh); + if (ocfs2_supports_indexed_dirs(osb)) + return ocfs2_fill_new_dir_dx(osb, handle, parent, inode, fe_bh, + data_ac, meta_ac); + return ocfs2_fill_new_dir_el(osb, handle, parent, inode, fe_bh, - data_ac); + data_ac, NULL); +} + +static int ocfs2_dx_dir_index_block(struct inode *dir, + handle_t *handle, + struct buffer_head **dx_leaves, + int num_dx_leaves, + u32 *num_dx_entries, + struct buffer_head *dirent_bh) +{ + int ret, namelen, i; + char *de_buf, *limit; + struct ocfs2_dir_entry *de; + struct buffer_head *dx_leaf_bh; + struct ocfs2_dx_hinfo hinfo; + u64 dirent_blk = dirent_bh->b_blocknr; + + de_buf = dirent_bh->b_data; + limit = de_buf + dir->i_sb->s_blocksize; + + while (de_buf < limit) { + de = (struct ocfs2_dir_entry *)de_buf; + + namelen = de->name_len; + if (!namelen || !de->inode) + goto inc; + + ocfs2_dx_dir_name_hash(dir, de->name, namelen, &hinfo); + + i = ocfs2_dx_dir_hash_idx(OCFS2_SB(dir->i_sb), &hinfo); + dx_leaf_bh = dx_leaves[i]; + + ret = __ocfs2_dx_dir_leaf_insert(dir, handle, &hinfo, + dirent_blk, dx_leaf_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + *num_dx_entries = *num_dx_entries + 1; + +inc: + de_buf += le16_to_cpu(de->rec_len); + } + +out: + return ret; +} + +/* + * XXX: This expects dx_root_bh to already be part of the transaction. + */ +static void ocfs2_dx_dir_index_root_block(struct inode *dir, + struct buffer_head *dx_root_bh, + struct buffer_head *dirent_bh) +{ + char *de_buf, *limit; + struct ocfs2_dx_root_block *dx_root; + struct ocfs2_dir_entry *de; + struct ocfs2_dx_hinfo hinfo; + u64 dirent_blk = dirent_bh->b_blocknr; + + dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data; + + de_buf = dirent_bh->b_data; + limit = de_buf + dir->i_sb->s_blocksize; + + while (de_buf < limit) { + de = (struct ocfs2_dir_entry *)de_buf; + + if (!de->name_len || !de->inode) + goto inc; + + ocfs2_dx_dir_name_hash(dir, de->name, de->name_len, &hinfo); + + mlog(0, + "dir: %llu, major: 0x%x minor: 0x%x, index: %u, name: %.*s\n", + (unsigned long long)dir->i_ino, hinfo.major_hash, + hinfo.minor_hash, + le16_to_cpu(dx_root->dr_entries.de_num_used), + de->name_len, de->name); + + ocfs2_dx_entry_list_insert(&dx_root->dr_entries, &hinfo, + dirent_blk); + + le32_add_cpu(&dx_root->dr_num_entries, 1); +inc: + de_buf += le16_to_cpu(de->rec_len); + } +} + +/* + * Count the number of inline directory entries in di_bh and compare + * them against the number of entries we can hold in an inline dx root + * block. + */ +static int ocfs2_new_dx_should_be_inline(struct inode *dir, + struct buffer_head *di_bh) +{ + int dirent_count = 0; + char *de_buf, *limit; + struct ocfs2_dir_entry *de; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + + de_buf = di->id2.i_data.id_data; + limit = de_buf + i_size_read(dir); + + while (de_buf < limit) { + de = (struct ocfs2_dir_entry *)de_buf; + + if (de->name_len && de->inode) + dirent_count++; + + de_buf += le16_to_cpu(de->rec_len); + } + + /* We are careful to leave room for one extra record. */ + return dirent_count < ocfs2_dx_entries_per_root(dir->i_sb); } /* @@ -1358,18 +2813,26 @@ int ocfs2_fill_new_dir(struct ocfs2_super *osb, * expansion from an inline directory to one with extents. The first dir block * in that case is taken from the inline data portion of the inode block. * + * This will also return the largest amount of contiguous space for a dirent + * in the block. That value is *not* necessarily the last dirent, even after + * expansion. The directory indexing code wants this value for free space + * accounting. We do this here since we're already walking the entire dir + * block. + * * We add the dir trailer if this filesystem wants it. */ -static void ocfs2_expand_last_dirent(char *start, unsigned int old_size, - struct super_block *sb) +static unsigned int ocfs2_expand_last_dirent(char *start, unsigned int old_size, + struct inode *dir) { + struct super_block *sb = dir->i_sb; struct ocfs2_dir_entry *de; struct ocfs2_dir_entry *prev_de; char *de_buf, *limit; unsigned int new_size = sb->s_blocksize; - unsigned int bytes; + unsigned int bytes, this_hole; + unsigned int largest_hole = 0; - if (ocfs2_supports_dir_trailer(OCFS2_SB(sb))) + if (ocfs2_new_dir_wants_trailer(dir)) new_size = ocfs2_dir_trailer_blk_off(sb); bytes = new_size - old_size; @@ -1378,12 +2841,26 @@ static void ocfs2_expand_last_dirent(char *start, unsigned int old_size, de_buf = start; de = (struct ocfs2_dir_entry *)de_buf; do { + this_hole = ocfs2_figure_dirent_hole(de); + if (this_hole > largest_hole) + largest_hole = this_hole; + prev_de = de; de_buf += le16_to_cpu(de->rec_len); de = (struct ocfs2_dir_entry *)de_buf; } while (de_buf < limit); le16_add_cpu(&prev_de->rec_len, bytes); + + /* We need to double check this after modification of the final + * dirent. */ + this_hole = ocfs2_figure_dirent_hole(prev_de); + if (this_hole > largest_hole) + largest_hole = this_hole; + + if (largest_hole >= OCFS2_DIR_MIN_REC_LEN) + return largest_hole; + return 0; } /* @@ -1396,29 +2873,61 @@ static void ocfs2_expand_last_dirent(char *start, unsigned int old_size, */ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, unsigned int blocks_wanted, + struct ocfs2_dir_lookup_result *lookup, struct buffer_head **first_block_bh) { - u32 alloc, bit_off, len; + u32 alloc, dx_alloc, bit_off, len, num_dx_entries = 0; struct super_block *sb = dir->i_sb; - int ret, credits = ocfs2_inline_to_extents_credits(sb); - u64 blkno, bytes = blocks_wanted << sb->s_blocksize_bits; + int ret, i, num_dx_leaves = 0, dx_inline = 0, + credits = ocfs2_inline_to_extents_credits(sb); + u64 dx_insert_blkno, blkno, + bytes = blocks_wanted << sb->s_blocksize_bits; struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); struct ocfs2_inode_info *oi = OCFS2_I(dir); struct ocfs2_alloc_context *data_ac; + struct ocfs2_alloc_context *meta_ac = NULL; struct buffer_head *dirdata_bh = NULL; + struct buffer_head *dx_root_bh = NULL; + struct buffer_head **dx_leaves = NULL; struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; handle_t *handle; struct ocfs2_extent_tree et; - int did_quota = 0; + struct ocfs2_extent_tree dx_et; + int did_quota = 0, bytes_allocated = 0; ocfs2_init_dinode_extent_tree(&et, dir, di_bh); alloc = ocfs2_clusters_for_bytes(sb, bytes); + dx_alloc = 0; + + if (ocfs2_supports_indexed_dirs(osb)) { + credits += ocfs2_add_dir_index_credits(sb); + + dx_inline = ocfs2_new_dx_should_be_inline(dir, di_bh); + if (!dx_inline) { + /* Add one more cluster for an index leaf */ + dx_alloc++; + dx_leaves = ocfs2_dx_dir_kmalloc_leaves(sb, + &num_dx_leaves); + if (!dx_leaves) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + } + + /* This gets us the dx_root */ + ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + } /* - * We should never need more than 2 clusters for this - - * maximum dirent size is far less than one block. In fact, - * the only time we'd need more than one cluster is if + * We should never need more than 2 clusters for the unindexed + * tree - maximum dirent size is far less than one block. In + * fact, the only time we'd need more than one cluster is if * blocksize == clustersize and the dirent won't fit in the * extra space that the expansion to a single block gives. As * of today, that only happens on 4k/4k file systems. @@ -1435,7 +2944,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, /* * Prepare for worst case allocation scenario of two separate - * extents. + * extents in the unindexed tree. */ if (alloc == 2) credits += OCFS2_SUBALLOC_ALLOC; @@ -1448,11 +2957,29 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, } if (vfs_dq_alloc_space_nodirty(dir, - ocfs2_clusters_to_bytes(osb->sb, alloc))) { + ocfs2_clusters_to_bytes(osb->sb, + alloc + dx_alloc))) { ret = -EDQUOT; goto out_commit; } did_quota = 1; + + if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) { + /* + * Allocate our index cluster first, to maximize the + * possibility that unindexed leaves grow + * contiguously. + */ + ret = __ocfs2_dx_dir_new_cluster(dir, 0, handle, data_ac, + dx_leaves, num_dx_leaves, + &dx_insert_blkno); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + bytes_allocated += ocfs2_clusters_to_bytes(dir->i_sb, 1); + } + /* * Try to claim as many clusters as the bitmap can give though * if we only get one now, that's enough to continue. The rest @@ -1463,6 +2990,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, mlog_errno(ret); goto out_commit; } + bytes_allocated += ocfs2_clusters_to_bytes(dir->i_sb, 1); /* * Operations are carefully ordered so that we set up the new @@ -1489,9 +3017,16 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, memcpy(dirdata_bh->b_data, di->id2.i_data.id_data, i_size_read(dir)); memset(dirdata_bh->b_data + i_size_read(dir), 0, sb->s_blocksize - i_size_read(dir)); - ocfs2_expand_last_dirent(dirdata_bh->b_data, i_size_read(dir), sb); - if (ocfs2_supports_dir_trailer(osb)) - ocfs2_init_dir_trailer(dir, dirdata_bh); + i = ocfs2_expand_last_dirent(dirdata_bh->b_data, i_size_read(dir), dir); + if (ocfs2_new_dir_wants_trailer(dir)) { + /* + * Prepare the dir trailer up front. It will otherwise look + * like a valid dirent. Even if inserting the index fails + * (unlikely), then all we'll have done is given first dir + * block a small amount of fragmentation. + */ + ocfs2_init_dir_trailer(dir, dirdata_bh, i); + } ret = ocfs2_journal_dirty(handle, dirdata_bh); if (ret) { @@ -1499,6 +3034,24 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, goto out_commit; } + if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) { + /* + * Dx dirs with an external cluster need to do this up + * front. Inline dx root's get handled later, after + * we've allocated our root block. We get passed back + * a total number of items so that dr_num_entries can + * be correctly set once the dx_root has been + * allocated. + */ + ret = ocfs2_dx_dir_index_block(dir, handle, dx_leaves, + num_dx_leaves, &num_dx_entries, + dirdata_bh); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + } + /* * Set extent, i_size, etc on the directory. After this, the * inode should contain the same exact dirents as before and @@ -1551,6 +3104,27 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, goto out_commit; } + if (ocfs2_supports_indexed_dirs(osb)) { + ret = ocfs2_dx_dir_attach_index(osb, handle, dir, di_bh, + dirdata_bh, meta_ac, dx_inline, + num_dx_entries, &dx_root_bh); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + if (dx_inline) { + ocfs2_dx_dir_index_root_block(dir, dx_root_bh, + dirdata_bh); + } else { + ocfs2_init_dx_root_extent_tree(&dx_et, dir, dx_root_bh); + ret = ocfs2_insert_extent(osb, handle, dir, &dx_et, 0, + dx_insert_blkno, 1, 0, NULL); + if (ret) + mlog_errno(ret); + } + } + /* * We asked for two clusters, but only got one in the 1st * pass. Claim the 2nd cluster as a separate extent. @@ -1570,15 +3144,32 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, mlog_errno(ret); goto out_commit; } + bytes_allocated += ocfs2_clusters_to_bytes(dir->i_sb, 1); } *first_block_bh = dirdata_bh; dirdata_bh = NULL; + if (ocfs2_supports_indexed_dirs(osb)) { + unsigned int off; + + if (!dx_inline) { + /* + * We need to return the correct block within the + * cluster which should hold our entry. + */ + off = ocfs2_dx_dir_hash_idx(OCFS2_SB(dir->i_sb), + &lookup->dl_hinfo); + get_bh(dx_leaves[off]); + lookup->dl_dx_leaf_bh = dx_leaves[off]; + } + lookup->dl_dx_root_bh = dx_root_bh; + dx_root_bh = NULL; + } out_commit: if (ret < 0 && did_quota) - vfs_dq_free_space_nodirty(dir, - ocfs2_clusters_to_bytes(osb->sb, 2)); + vfs_dq_free_space_nodirty(dir, bytes_allocated); + ocfs2_commit_trans(osb, handle); out_sem: @@ -1587,8 +3178,17 @@ out_sem: out: if (data_ac) ocfs2_free_alloc_context(data_ac); + if (meta_ac) + ocfs2_free_alloc_context(meta_ac); + + if (dx_leaves) { + for (i = 0; i < num_dx_leaves; i++) + brelse(dx_leaves[i]); + kfree(dx_leaves); + } brelse(dirdata_bh); + brelse(dx_root_bh); return ret; } @@ -1658,11 +3258,14 @@ bail: * is to be turned into an extent based one. The size of the dirent to * insert might be larger than the space gained by growing to just one * block, so we may have to grow the inode by two blocks in that case. + * + * If the directory is already indexed, dx_root_bh must be provided. */ static int ocfs2_extend_dir(struct ocfs2_super *osb, struct inode *dir, struct buffer_head *parent_fe_bh, unsigned int blocks_wanted, + struct ocfs2_dir_lookup_result *lookup, struct buffer_head **new_de_bh) { int status = 0; @@ -1677,17 +3280,29 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb, struct ocfs2_dir_entry * de; struct super_block *sb = osb->sb; struct ocfs2_extent_tree et; + struct buffer_head *dx_root_bh = lookup->dl_dx_root_bh; mlog_entry_void(); if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + /* + * This would be a code error as an inline directory should + * never have an index root. + */ + BUG_ON(dx_root_bh); + status = ocfs2_expand_inline_dir(dir, parent_fe_bh, - blocks_wanted, &new_bh); + blocks_wanted, lookup, + &new_bh); if (status) { mlog_errno(status); goto bail; } + /* Expansion from inline to an indexed directory will + * have given us this. */ + dx_root_bh = lookup->dl_dx_root_bh; + if (blocks_wanted == 1) { /* * If the new dirent will fit inside the space @@ -1751,6 +3366,10 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb, } do_extend: + if (ocfs2_dir_indexed(dir)) + credits++; /* For attaching the new dirent block to the + * dx_root */ + down_write(&OCFS2_I(dir)->ip_alloc_sem); drop_alloc_sem = 1; @@ -1781,9 +3400,19 @@ do_extend: de = (struct ocfs2_dir_entry *) new_bh->b_data; de->inode = 0; - if (ocfs2_dir_has_trailer(dir)) { + if (ocfs2_supports_dir_trailer(dir)) { de->rec_len = cpu_to_le16(ocfs2_dir_trailer_blk_off(sb)); - ocfs2_init_dir_trailer(dir, new_bh); + + ocfs2_init_dir_trailer(dir, new_bh, le16_to_cpu(de->rec_len)); + + if (ocfs2_dir_indexed(dir)) { + status = ocfs2_dx_dir_link_trailer(dir, handle, + dx_root_bh, new_bh); + if (status) { + mlog_errno(status); + goto bail; + } + } } else { de->rec_len = cpu_to_le16(sb->s_blocksize); } @@ -1839,7 +3468,7 @@ static int ocfs2_find_dir_space_id(struct inode *dir, struct buffer_head *di_bh, * This calculates how many free bytes we'd have in block zero, should * this function force expansion to an extent tree. */ - if (ocfs2_supports_dir_trailer(OCFS2_SB(sb))) + if (ocfs2_new_dir_wants_trailer(dir)) free_space = ocfs2_dir_trailer_blk_off(sb) - i_size_read(dir); else free_space = dir->i_sb->s_blocksize - i_size_read(dir); @@ -1970,12 +3599,766 @@ bail: return status; } +static int dx_leaf_sort_cmp(const void *a, const void *b) +{ + const struct ocfs2_dx_entry *entry1 = a; + const struct ocfs2_dx_entry *entry2 = b; + u32 major_hash1 = le32_to_cpu(entry1->dx_major_hash); + u32 major_hash2 = le32_to_cpu(entry2->dx_major_hash); + u32 minor_hash1 = le32_to_cpu(entry1->dx_minor_hash); + u32 minor_hash2 = le32_to_cpu(entry2->dx_minor_hash); + + if (major_hash1 > major_hash2) + return 1; + if (major_hash1 < major_hash2) + return -1; + + /* + * It is not strictly necessary to sort by minor + */ + if (minor_hash1 > minor_hash2) + return 1; + if (minor_hash1 < minor_hash2) + return -1; + return 0; +} + +static void dx_leaf_sort_swap(void *a, void *b, int size) +{ + struct ocfs2_dx_entry *entry1 = a; + struct ocfs2_dx_entry *entry2 = b; + struct ocfs2_dx_entry tmp; + + BUG_ON(size != sizeof(*entry1)); + + tmp = *entry1; + *entry1 = *entry2; + *entry2 = tmp; +} + +static int ocfs2_dx_leaf_same_major(struct ocfs2_dx_leaf *dx_leaf) +{ + struct ocfs2_dx_entry_list *dl_list = &dx_leaf->dl_list; + int i, num = le16_to_cpu(dl_list->de_num_used); + + for (i = 0; i < (num - 1); i++) { + if (le32_to_cpu(dl_list->de_entries[i].dx_major_hash) != + le32_to_cpu(dl_list->de_entries[i + 1].dx_major_hash)) + return 0; + } + + return 1; +} + +/* + * Find the optimal value to split this leaf on. This expects the leaf + * entries to be in sorted order. + * + * leaf_cpos is the cpos of the leaf we're splitting. insert_hash is + * the hash we want to insert. + * + * This function is only concerned with the major hash - that which + * determines which cluster an item belongs to. + */ +static int ocfs2_dx_dir_find_leaf_split(struct ocfs2_dx_leaf *dx_leaf, + u32 leaf_cpos, u32 insert_hash, + u32 *split_hash) +{ + struct ocfs2_dx_entry_list *dl_list = &dx_leaf->dl_list; + int i, num_used = le16_to_cpu(dl_list->de_num_used); + int allsame; + + /* + * There's a couple rare, but nasty corner cases we have to + * check for here. All of them involve a leaf where all value + * have the same hash, which is what we look for first. + * + * Most of the time, all of the above is false, and we simply + * pick the median value for a split. + */ + allsame = ocfs2_dx_leaf_same_major(dx_leaf); + if (allsame) { + u32 val = le32_to_cpu(dl_list->de_entries[0].dx_major_hash); + + if (val == insert_hash) { + /* + * No matter where we would choose to split, + * the new entry would want to occupy the same + * block as these. Since there's no space left + * in their existing block, we know there + * won't be space after the split. + */ + return -ENOSPC; + } + + if (val == leaf_cpos) { + /* + * Because val is the same as leaf_cpos (which + * is the smallest value this leaf can have), + * yet is not equal to insert_hash, then we + * know that insert_hash *must* be larger than + * val (and leaf_cpos). At least cpos+1 in value. + * + * We also know then, that there cannot be an + * adjacent extent (otherwise we'd be looking + * at it). Choosing this value gives us a + * chance to get some contiguousness. + */ + *split_hash = leaf_cpos + 1; + return 0; + } + + if (val > insert_hash) { + /* + * val can not be the same as insert hash, and + * also must be larger than leaf_cpos. Also, + * we know that there can't be a leaf between + * cpos and val, otherwise the entries with + * hash 'val' would be there. + */ + *split_hash = val; + return 0; + } + + *split_hash = insert_hash; + return 0; + } + + /* + * Since the records are sorted and the checks above + * guaranteed that not all records in this block are the same, + * we simple travel forward, from the median, and pick the 1st + * record whose value is larger than leaf_cpos. + */ + for (i = (num_used / 2); i < num_used; i++) + if (le32_to_cpu(dl_list->de_entries[i].dx_major_hash) > + leaf_cpos) + break; + + BUG_ON(i == num_used); /* Should be impossible */ + *split_hash = le32_to_cpu(dl_list->de_entries[i].dx_major_hash); + return 0; +} + +/* + * Transfer all entries in orig_dx_leaves whose major hash is equal to or + * larger than split_hash into new_dx_leaves. We use a temporary + * buffer (tmp_dx_leaf) to make the changes to the original leaf blocks. + * + * Since the block offset inside a leaf (cluster) is a constant mask + * of minor_hash, we can optimize - an item at block offset X within + * the original cluster, will be at offset X within the new cluster. + */ +static void ocfs2_dx_dir_transfer_leaf(struct inode *dir, u32 split_hash, + handle_t *handle, + struct ocfs2_dx_leaf *tmp_dx_leaf, + struct buffer_head **orig_dx_leaves, + struct buffer_head **new_dx_leaves, + int num_dx_leaves) +{ + int i, j, num_used; + u32 major_hash; + struct ocfs2_dx_leaf *orig_dx_leaf, *new_dx_leaf; + struct ocfs2_dx_entry_list *orig_list, *new_list, *tmp_list; + struct ocfs2_dx_entry *dx_entry; + + tmp_list = &tmp_dx_leaf->dl_list; + + for (i = 0; i < num_dx_leaves; i++) { + orig_dx_leaf = (struct ocfs2_dx_leaf *) orig_dx_leaves[i]->b_data; + orig_list = &orig_dx_leaf->dl_list; + new_dx_leaf = (struct ocfs2_dx_leaf *) new_dx_leaves[i]->b_data; + new_list = &new_dx_leaf->dl_list; + + num_used = le16_to_cpu(orig_list->de_num_used); + + memcpy(tmp_dx_leaf, orig_dx_leaf, dir->i_sb->s_blocksize); + tmp_list->de_num_used = cpu_to_le16(0); + memset(&tmp_list->de_entries, 0, sizeof(*dx_entry)*num_used); + + for (j = 0; j < num_used; j++) { + dx_entry = &orig_list->de_entries[j]; + major_hash = le32_to_cpu(dx_entry->dx_major_hash); + if (major_hash >= split_hash) + ocfs2_dx_dir_leaf_insert_tail(new_dx_leaf, + dx_entry); + else + ocfs2_dx_dir_leaf_insert_tail(tmp_dx_leaf, + dx_entry); + } + memcpy(orig_dx_leaf, tmp_dx_leaf, dir->i_sb->s_blocksize); + + ocfs2_journal_dirty(handle, orig_dx_leaves[i]); + ocfs2_journal_dirty(handle, new_dx_leaves[i]); + } +} + +static int ocfs2_dx_dir_rebalance_credits(struct ocfs2_super *osb, + struct ocfs2_dx_root_block *dx_root) +{ + int credits = ocfs2_clusters_to_blocks(osb->sb, 2); + + credits += ocfs2_calc_extend_credits(osb->sb, &dx_root->dr_list, 1); + credits += ocfs2_quota_trans_credits(osb->sb); + return credits; +} + +/* + * Find the median value in dx_leaf_bh and allocate a new leaf to move + * half our entries into. + */ +static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir, + struct buffer_head *dx_root_bh, + struct buffer_head *dx_leaf_bh, + struct ocfs2_dx_hinfo *hinfo, u32 leaf_cpos, + u64 leaf_blkno) +{ + struct ocfs2_dx_leaf *dx_leaf = (struct ocfs2_dx_leaf *)dx_leaf_bh->b_data; + int credits, ret, i, num_used, did_quota = 0; + u32 cpos, split_hash, insert_hash = hinfo->major_hash; + u64 orig_leaves_start; + int num_dx_leaves; + struct buffer_head **orig_dx_leaves = NULL; + struct buffer_head **new_dx_leaves = NULL; + struct ocfs2_alloc_context *data_ac = NULL, *meta_ac = NULL; + struct ocfs2_extent_tree et; + handle_t *handle = NULL; + struct ocfs2_dx_root_block *dx_root; + struct ocfs2_dx_leaf *tmp_dx_leaf = NULL; + + mlog(0, "DX Dir: %llu, rebalance leaf leaf_blkno: %llu insert: %u\n", + (unsigned long long)OCFS2_I(dir)->ip_blkno, + (unsigned long long)leaf_blkno, insert_hash); + + ocfs2_init_dx_root_extent_tree(&et, dir, dx_root_bh); + + dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data; + /* + * XXX: This is a rather large limit. We should use a more + * realistic value. + */ + if (le32_to_cpu(dx_root->dr_clusters) == UINT_MAX) + return -ENOSPC; + + num_used = le16_to_cpu(dx_leaf->dl_list.de_num_used); + if (num_used < le16_to_cpu(dx_leaf->dl_list.de_count)) { + mlog(ML_ERROR, "DX Dir: %llu, Asked to rebalance empty leaf: " + "%llu, %d\n", (unsigned long long)OCFS2_I(dir)->ip_blkno, + (unsigned long long)leaf_blkno, num_used); + ret = -EIO; + goto out; + } + + orig_dx_leaves = ocfs2_dx_dir_kmalloc_leaves(osb->sb, &num_dx_leaves); + if (!orig_dx_leaves) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + new_dx_leaves = ocfs2_dx_dir_kmalloc_leaves(osb->sb, NULL); + if (!new_dx_leaves) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_lock_allocators(dir, &et, 1, 0, &data_ac, &meta_ac); + if (ret) { + if (ret != -ENOSPC) + mlog_errno(ret); + goto out; + } + + credits = ocfs2_dx_dir_rebalance_credits(osb, dx_root); + handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + handle = NULL; + mlog_errno(ret); + goto out; + } + + if (vfs_dq_alloc_space_nodirty(dir, + ocfs2_clusters_to_bytes(dir->i_sb, 1))) { + ret = -EDQUOT; + goto out_commit; + } + did_quota = 1; + + ret = ocfs2_journal_access_dl(handle, dir, dx_leaf_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + /* + * This block is changing anyway, so we can sort it in place. + */ + sort(dx_leaf->dl_list.de_entries, num_used, + sizeof(struct ocfs2_dx_entry), dx_leaf_sort_cmp, + dx_leaf_sort_swap); + + ret = ocfs2_journal_dirty(handle, dx_leaf_bh); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ret = ocfs2_dx_dir_find_leaf_split(dx_leaf, leaf_cpos, insert_hash, + &split_hash); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + mlog(0, "Split leaf (%u) at %u, insert major hash is %u\n", + leaf_cpos, split_hash, insert_hash); + + /* + * We have to carefully order operations here. There are items + * which want to be in the new cluster before insert, but in + * order to put those items in the new cluster, we alter the + * old cluster. A failure to insert gets nasty. + * + * So, start by reserving writes to the old + * cluster. ocfs2_dx_dir_new_cluster will reserve writes on + * the new cluster for us, before inserting it. The insert + * won't happen if there's an error before that. Once the + * insert is done then, we can transfer from one leaf into the + * other without fear of hitting any error. + */ + + /* + * The leaf transfer wants some scratch space so that we don't + * wind up doing a bunch of expensive memmove(). + */ + tmp_dx_leaf = kmalloc(osb->sb->s_blocksize, GFP_NOFS); + if (!tmp_dx_leaf) { + ret = -ENOMEM; + mlog_errno(ret); + goto out_commit; + } + + orig_leaves_start = ocfs2_block_to_cluster_start(dir->i_sb, leaf_blkno); + ret = ocfs2_read_dx_leaves(dir, orig_leaves_start, num_dx_leaves, + orig_dx_leaves); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + for (i = 0; i < num_dx_leaves; i++) { + ret = ocfs2_journal_access_dl(handle, dir, orig_dx_leaves[i], + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + } + + cpos = split_hash; + ret = ocfs2_dx_dir_new_cluster(dir, &et, cpos, handle, + data_ac, meta_ac, new_dx_leaves, + num_dx_leaves); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ocfs2_dx_dir_transfer_leaf(dir, split_hash, handle, tmp_dx_leaf, + orig_dx_leaves, new_dx_leaves, num_dx_leaves); + +out_commit: + if (ret < 0 && did_quota) + vfs_dq_free_space_nodirty(dir, + ocfs2_clusters_to_bytes(dir->i_sb, 1)); + + ocfs2_commit_trans(osb, handle); + +out: + if (orig_dx_leaves || new_dx_leaves) { + for (i = 0; i < num_dx_leaves; i++) { + if (orig_dx_leaves) + brelse(orig_dx_leaves[i]); + if (new_dx_leaves) + brelse(new_dx_leaves[i]); + } + kfree(orig_dx_leaves); + kfree(new_dx_leaves); + } + + if (meta_ac) + ocfs2_free_alloc_context(meta_ac); + if (data_ac) + ocfs2_free_alloc_context(data_ac); + + kfree(tmp_dx_leaf); + return ret; +} + +static int ocfs2_find_dir_space_dx(struct ocfs2_super *osb, struct inode *dir, + struct buffer_head *di_bh, + struct buffer_head *dx_root_bh, + const char *name, int namelen, + struct ocfs2_dir_lookup_result *lookup) +{ + int ret, rebalanced = 0; + struct ocfs2_dx_root_block *dx_root; + struct buffer_head *dx_leaf_bh = NULL; + struct ocfs2_dx_leaf *dx_leaf; + u64 blkno; + u32 leaf_cpos; + + dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data; + +restart_search: + ret = ocfs2_dx_dir_lookup(dir, &dx_root->dr_list, &lookup->dl_hinfo, + &leaf_cpos, &blkno); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_read_dx_leaf(dir, blkno, &dx_leaf_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + dx_leaf = (struct ocfs2_dx_leaf *)dx_leaf_bh->b_data; + + if (le16_to_cpu(dx_leaf->dl_list.de_num_used) >= + le16_to_cpu(dx_leaf->dl_list.de_count)) { + if (rebalanced) { + /* + * Rebalancing should have provided us with + * space in an appropriate leaf. + * + * XXX: Is this an abnormal condition then? + * Should we print a message here? + */ + ret = -ENOSPC; + goto out; + } + + ret = ocfs2_dx_dir_rebalance(osb, dir, dx_root_bh, dx_leaf_bh, + &lookup->dl_hinfo, leaf_cpos, + blkno); + if (ret) { + if (ret != -ENOSPC) + mlog_errno(ret); + goto out; + } + + /* + * Restart the lookup. The rebalance might have + * changed which block our item fits into. Mark our + * progress, so we only execute this once. + */ + brelse(dx_leaf_bh); + dx_leaf_bh = NULL; + rebalanced = 1; + goto restart_search; + } + + lookup->dl_dx_leaf_bh = dx_leaf_bh; + dx_leaf_bh = NULL; + +out: + brelse(dx_leaf_bh); + return ret; +} + +static int ocfs2_search_dx_free_list(struct inode *dir, + struct buffer_head *dx_root_bh, + int namelen, + struct ocfs2_dir_lookup_result *lookup) +{ + int ret = -ENOSPC; + struct buffer_head *leaf_bh = NULL, *prev_leaf_bh = NULL; + struct ocfs2_dir_block_trailer *db; + u64 next_block; + int rec_len = OCFS2_DIR_REC_LEN(namelen); + struct ocfs2_dx_root_block *dx_root; + + dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data; + next_block = le64_to_cpu(dx_root->dr_free_blk); + + while (next_block) { + brelse(prev_leaf_bh); + prev_leaf_bh = leaf_bh; + leaf_bh = NULL; + + ret = ocfs2_read_dir_block_direct(dir, next_block, &leaf_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + db = ocfs2_trailer_from_bh(leaf_bh, dir->i_sb); + if (rec_len <= le16_to_cpu(db->db_free_rec_len)) { + lookup->dl_leaf_bh = leaf_bh; + lookup->dl_prev_leaf_bh = prev_leaf_bh; + leaf_bh = NULL; + prev_leaf_bh = NULL; + break; + } + + next_block = le64_to_cpu(db->db_free_next); + } + + if (!next_block) + ret = -ENOSPC; + +out: + + brelse(leaf_bh); + brelse(prev_leaf_bh); + return ret; +} + +static int ocfs2_expand_inline_dx_root(struct inode *dir, + struct buffer_head *dx_root_bh) +{ + int ret, num_dx_leaves, i, j, did_quota = 0; + struct buffer_head **dx_leaves = NULL; + struct ocfs2_extent_tree et; + u64 insert_blkno; + struct ocfs2_alloc_context *data_ac = NULL; + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); + handle_t *handle = NULL; + struct ocfs2_dx_root_block *dx_root; + struct ocfs2_dx_entry_list *entry_list; + struct ocfs2_dx_entry *dx_entry; + struct ocfs2_dx_leaf *target_leaf; + + ret = ocfs2_reserve_clusters(osb, 1, &data_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + + dx_leaves = ocfs2_dx_dir_kmalloc_leaves(osb->sb, &num_dx_leaves); + if (!dx_leaves) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + handle = ocfs2_start_trans(osb, ocfs2_calc_dxi_expand_credits(osb->sb)); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + if (vfs_dq_alloc_space_nodirty(dir, + ocfs2_clusters_to_bytes(osb->sb, 1))) { + ret = -EDQUOT; + goto out_commit; + } + did_quota = 1; + + /* + * We do this up front, before the allocation, so that a + * failure to add the dx_root_bh to the journal won't result + * us losing clusters. + */ + ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ret = __ocfs2_dx_dir_new_cluster(dir, 0, handle, data_ac, dx_leaves, + num_dx_leaves, &insert_blkno); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + /* + * Transfer the entries from our dx_root into the appropriate + * block + */ + dx_root = (struct ocfs2_dx_root_block *) dx_root_bh->b_data; + entry_list = &dx_root->dr_entries; + + for (i = 0; i < le16_to_cpu(entry_list->de_num_used); i++) { + dx_entry = &entry_list->de_entries[i]; + + j = __ocfs2_dx_dir_hash_idx(osb, + le32_to_cpu(dx_entry->dx_minor_hash)); + target_leaf = (struct ocfs2_dx_leaf *)dx_leaves[j]->b_data; + + ocfs2_dx_dir_leaf_insert_tail(target_leaf, dx_entry); + + /* Each leaf has been passed to the journal already + * via __ocfs2_dx_dir_new_cluster() */ + } + + dx_root->dr_flags &= ~OCFS2_DX_FLAG_INLINE; + memset(&dx_root->dr_list, 0, osb->sb->s_blocksize - + offsetof(struct ocfs2_dx_root_block, dr_list)); + dx_root->dr_list.l_count = + cpu_to_le16(ocfs2_extent_recs_per_dx_root(osb->sb)); + + /* This should never fail considering we start with an empty + * dx_root. */ + ocfs2_init_dx_root_extent_tree(&et, dir, dx_root_bh); + ret = ocfs2_insert_extent(osb, handle, dir, &et, 0, + insert_blkno, 1, 0, NULL); + if (ret) + mlog_errno(ret); + did_quota = 0; + + ocfs2_journal_dirty(handle, dx_root_bh); + +out_commit: + if (ret < 0 && did_quota) + vfs_dq_free_space_nodirty(dir, + ocfs2_clusters_to_bytes(dir->i_sb, 1)); + + ocfs2_commit_trans(osb, handle); + +out: + if (data_ac) + ocfs2_free_alloc_context(data_ac); + + if (dx_leaves) { + for (i = 0; i < num_dx_leaves; i++) + brelse(dx_leaves[i]); + kfree(dx_leaves); + } + return ret; +} + +static int ocfs2_inline_dx_has_space(struct buffer_head *dx_root_bh) +{ + struct ocfs2_dx_root_block *dx_root; + struct ocfs2_dx_entry_list *entry_list; + + dx_root = (struct ocfs2_dx_root_block *) dx_root_bh->b_data; + entry_list = &dx_root->dr_entries; + + if (le16_to_cpu(entry_list->de_num_used) >= + le16_to_cpu(entry_list->de_count)) + return -ENOSPC; + + return 0; +} + +static int ocfs2_prepare_dx_dir_for_insert(struct inode *dir, + struct buffer_head *di_bh, + const char *name, + int namelen, + struct ocfs2_dir_lookup_result *lookup) +{ + int ret, free_dx_root = 1; + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); + struct buffer_head *dx_root_bh = NULL; + struct buffer_head *leaf_bh = NULL; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_dx_root_block *dx_root; + + ret = ocfs2_read_dx_root(dir, di, &dx_root_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data; + if (le32_to_cpu(dx_root->dr_num_entries) == OCFS2_DX_ENTRIES_MAX) { + ret = -ENOSPC; + mlog_errno(ret); + goto out; + } + + if (ocfs2_dx_root_inline(dx_root)) { + ret = ocfs2_inline_dx_has_space(dx_root_bh); + + if (ret == 0) + goto search_el; + + /* + * We ran out of room in the root block. Expand it to + * an extent, then allow ocfs2_find_dir_space_dx to do + * the rest. + */ + ret = ocfs2_expand_inline_dx_root(dir, dx_root_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + /* + * Insert preparation for an indexed directory is split into two + * steps. The call to find_dir_space_dx reserves room in the index for + * an additional item. If we run out of space there, it's a real error + * we can't continue on. + */ + ret = ocfs2_find_dir_space_dx(osb, dir, di_bh, dx_root_bh, name, + namelen, lookup); + if (ret) { + mlog_errno(ret); + goto out; + } + +search_el: + /* + * Next, we need to find space in the unindexed tree. This call + * searches using the free space linked list. If the unindexed tree + * lacks sufficient space, we'll expand it below. The expansion code + * is smart enough to add any new blocks to the free space list. + */ + ret = ocfs2_search_dx_free_list(dir, dx_root_bh, namelen, lookup); + if (ret && ret != -ENOSPC) { + mlog_errno(ret); + goto out; + } + + /* Do this up here - ocfs2_extend_dir might need the dx_root */ + lookup->dl_dx_root_bh = dx_root_bh; + free_dx_root = 0; + + if (ret == -ENOSPC) { + ret = ocfs2_extend_dir(osb, dir, di_bh, 1, lookup, &leaf_bh); + + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * We make the assumption here that new leaf blocks are added + * to the front of our free list. + */ + lookup->dl_prev_leaf_bh = NULL; + lookup->dl_leaf_bh = leaf_bh; + } + +out: + if (free_dx_root) + brelse(dx_root_bh); + return ret; +} + +/* + * Get a directory ready for insert. Any directory allocation required + * happens here. Success returns zero, and enough context in the dir + * lookup result that ocfs2_add_entry() will be able complete the task + * with minimal performance impact. + */ int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb, struct inode *dir, struct buffer_head *parent_fe_bh, const char *name, int namelen, - struct buffer_head **ret_de_bh) + struct ocfs2_dir_lookup_result *lookup) { int ret; unsigned int blocks_wanted = 1; @@ -1984,14 +4367,34 @@ int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb, mlog(0, "getting ready to insert namelen %d into dir %llu\n", namelen, (unsigned long long)OCFS2_I(dir)->ip_blkno); - *ret_de_bh = NULL; - if (!namelen) { ret = -EINVAL; mlog_errno(ret); goto out; } + /* + * Do this up front to reduce confusion. + * + * The directory might start inline, then be turned into an + * indexed one, in which case we'd need to hash deep inside + * ocfs2_find_dir_space_id(). Since + * ocfs2_prepare_dx_dir_for_insert() also needs this hash + * done, there seems no point in spreading out the calls. We + * can optimize away the case where the file system doesn't + * support indexing. + */ + if (ocfs2_supports_indexed_dirs(osb)) + ocfs2_dx_dir_name_hash(dir, name, namelen, &lookup->dl_hinfo); + + if (ocfs2_dir_indexed(dir)) { + ret = ocfs2_prepare_dx_dir_for_insert(dir, parent_fe_bh, + name, namelen, lookup); + if (ret) + mlog_errno(ret); + goto out; + } + if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { ret = ocfs2_find_dir_space_id(dir, parent_fe_bh, name, namelen, &bh, &blocks_wanted); @@ -2010,7 +4413,7 @@ int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb, BUG_ON(bh); ret = ocfs2_extend_dir(osb, dir, parent_fe_bh, blocks_wanted, - &bh); + lookup, &bh); if (ret) { if (ret != -ENOSPC) mlog_errno(ret); @@ -2020,9 +4423,154 @@ int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb, BUG_ON(!bh); } - *ret_de_bh = bh; + lookup->dl_leaf_bh = bh; bh = NULL; out: brelse(bh); return ret; } + +static int ocfs2_dx_dir_remove_index(struct inode *dir, + struct buffer_head *di_bh, + struct buffer_head *dx_root_bh) +{ + int ret; + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_dx_root_block *dx_root; + struct inode *dx_alloc_inode = NULL; + struct buffer_head *dx_alloc_bh = NULL; + handle_t *handle; + u64 blk; + u16 bit; + u64 bg_blkno; + + dx_root = (struct ocfs2_dx_root_block *) dx_root_bh->b_data; + + dx_alloc_inode = ocfs2_get_system_file_inode(osb, + EXTENT_ALLOC_SYSTEM_INODE, + le16_to_cpu(dx_root->dr_suballoc_slot)); + if (!dx_alloc_inode) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + mutex_lock(&dx_alloc_inode->i_mutex); + + ret = ocfs2_inode_lock(dx_alloc_inode, &dx_alloc_bh, 1); + if (ret) { + mlog_errno(ret); + goto out_mutex; + } + + handle = ocfs2_start_trans(osb, OCFS2_DX_ROOT_REMOVE_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out_unlock; + } + + ret = ocfs2_journal_access_di(handle, dir, di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + OCFS2_I(dir)->ip_dyn_features &= ~OCFS2_INDEXED_DIR_FL; + di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features); + di->i_dx_root = cpu_to_le64(0ULL); + + ocfs2_journal_dirty(handle, di_bh); + + blk = le64_to_cpu(dx_root->dr_blkno); + bit = le16_to_cpu(dx_root->dr_suballoc_bit); + bg_blkno = ocfs2_which_suballoc_group(blk, bit); + ret = ocfs2_free_suballoc_bits(handle, dx_alloc_inode, dx_alloc_bh, + bit, bg_blkno, 1); + if (ret) + mlog_errno(ret); + +out_commit: + ocfs2_commit_trans(osb, handle); + +out_unlock: + ocfs2_inode_unlock(dx_alloc_inode, 1); + +out_mutex: + mutex_unlock(&dx_alloc_inode->i_mutex); + brelse(dx_alloc_bh); +out: + iput(dx_alloc_inode); + return ret; +} + +int ocfs2_dx_dir_truncate(struct inode *dir, struct buffer_head *di_bh) +{ + int ret; + unsigned int uninitialized_var(clen); + u32 major_hash = UINT_MAX, p_cpos, uninitialized_var(cpos); + u64 uninitialized_var(blkno); + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); + struct buffer_head *dx_root_bh = NULL; + struct ocfs2_dx_root_block *dx_root; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_cached_dealloc_ctxt dealloc; + struct ocfs2_extent_tree et; + + ocfs2_init_dealloc_ctxt(&dealloc); + + if (!ocfs2_dir_indexed(dir)) + return 0; + + ret = ocfs2_read_dx_root(dir, di, &dx_root_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data; + + if (ocfs2_dx_root_inline(dx_root)) + goto remove_index; + + ocfs2_init_dx_root_extent_tree(&et, dir, dx_root_bh); + + /* XXX: What if dr_clusters is too large? */ + while (le32_to_cpu(dx_root->dr_clusters)) { + ret = ocfs2_dx_dir_lookup_rec(dir, &dx_root->dr_list, + major_hash, &cpos, &blkno, &clen); + if (ret) { + mlog_errno(ret); + goto out; + } + + p_cpos = ocfs2_blocks_to_clusters(dir->i_sb, blkno); + + ret = ocfs2_remove_btree_range(dir, &et, cpos, p_cpos, clen, + &dealloc); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (cpos == 0) + break; + + major_hash = cpos - 1; + } + +remove_index: + ret = ocfs2_dx_dir_remove_index(dir, di_bh, dx_root_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + ocfs2_remove_from_cache(dir, dx_root_bh); +out: + ocfs2_schedule_truncate_log_flush(osb, 1); + ocfs2_run_deallocs(osb, &dealloc); + + brelse(dx_root_bh); + return ret; +} diff --git a/fs/ocfs2/dir.h b/fs/ocfs2/dir.h index c511e2e18e9f..e683f3deb645 100644 --- a/fs/ocfs2/dir.h +++ b/fs/ocfs2/dir.h @@ -26,44 +26,70 @@ #ifndef OCFS2_DIR_H #define OCFS2_DIR_H -struct buffer_head *ocfs2_find_entry(const char *name, - int namelen, - struct inode *dir, - struct ocfs2_dir_entry **res_dir); +struct ocfs2_dx_hinfo { + u32 major_hash; + u32 minor_hash; +}; + +struct ocfs2_dir_lookup_result { + struct buffer_head *dl_leaf_bh; /* Unindexed leaf + * block */ + struct ocfs2_dir_entry *dl_entry; /* Target dirent in + * unindexed leaf */ + + struct buffer_head *dl_dx_root_bh; /* Root of indexed + * tree */ + + struct buffer_head *dl_dx_leaf_bh; /* Indexed leaf block */ + struct ocfs2_dx_entry *dl_dx_entry; /* Target dx_entry in + * indexed leaf */ + struct ocfs2_dx_hinfo dl_hinfo; /* Name hash results */ + + struct buffer_head *dl_prev_leaf_bh;/* Previous entry in + * dir free space + * list. NULL if + * previous entry is + * dx root block. */ +}; + +void ocfs2_free_dir_lookup_result(struct ocfs2_dir_lookup_result *res); + +int ocfs2_find_entry(const char *name, int namelen, + struct inode *dir, + struct ocfs2_dir_lookup_result *lookup); int ocfs2_delete_entry(handle_t *handle, struct inode *dir, - struct ocfs2_dir_entry *de_del, - struct buffer_head *bh); + struct ocfs2_dir_lookup_result *res); int __ocfs2_add_entry(handle_t *handle, struct inode *dir, const char *name, int namelen, struct inode *inode, u64 blkno, struct buffer_head *parent_fe_bh, - struct buffer_head *insert_bh); + struct ocfs2_dir_lookup_result *lookup); static inline int ocfs2_add_entry(handle_t *handle, struct dentry *dentry, struct inode *inode, u64 blkno, struct buffer_head *parent_fe_bh, - struct buffer_head *insert_bh) + struct ocfs2_dir_lookup_result *lookup) { return __ocfs2_add_entry(handle, dentry->d_parent->d_inode, dentry->d_name.name, dentry->d_name.len, - inode, blkno, parent_fe_bh, insert_bh); + inode, blkno, parent_fe_bh, lookup); } int ocfs2_update_entry(struct inode *dir, handle_t *handle, - struct buffer_head *de_bh, struct ocfs2_dir_entry *de, + struct ocfs2_dir_lookup_result *res, struct inode *new_entry_inode); int ocfs2_check_dir_for_entry(struct inode *dir, const char *name, int namelen); int ocfs2_empty_dir(struct inode *inode); + int ocfs2_find_files_on_disk(const char *name, int namelen, u64 *blkno, struct inode *inode, - struct buffer_head **dirent_bh, - struct ocfs2_dir_entry **dirent); + struct ocfs2_dir_lookup_result *res); int ocfs2_lookup_ino_from_name(struct inode *dir, const char *name, int namelen, u64 *blkno); int ocfs2_readdir(struct file *filp, void *dirent, filldir_t filldir); @@ -74,14 +100,17 @@ int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb, struct buffer_head *parent_fe_bh, const char *name, int namelen, - struct buffer_head **ret_de_bh); + struct ocfs2_dir_lookup_result *lookup); struct ocfs2_alloc_context; int ocfs2_fill_new_dir(struct ocfs2_super *osb, handle_t *handle, struct inode *parent, struct inode *inode, struct buffer_head *fe_bh, - struct ocfs2_alloc_context *data_ac); + struct ocfs2_alloc_context *data_ac, + struct ocfs2_alloc_context *meta_ac); + +int ocfs2_dx_dir_truncate(struct inode *dir, struct buffer_head *di_bh); struct ocfs2_dir_block_trailer *ocfs2_dir_trailer_from_size(int blocksize, void *data); diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index bb53714813ab..0102be35980c 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h @@ -52,16 +52,12 @@ enum dlm_mle_type { DLM_MLE_BLOCK, DLM_MLE_MASTER, - DLM_MLE_MIGRATION -}; - -struct dlm_lock_name { - u8 len; - u8 name[DLM_LOCKID_NAME_MAX]; + DLM_MLE_MIGRATION, + DLM_MLE_NUM_TYPES }; struct dlm_master_list_entry { - struct list_head list; + struct hlist_node master_hash_node; struct list_head hb_events; struct dlm_ctxt *dlm; spinlock_t spinlock; @@ -78,10 +74,10 @@ struct dlm_master_list_entry { enum dlm_mle_type type; struct o2hb_callback_func mle_hb_up; struct o2hb_callback_func mle_hb_down; - union { - struct dlm_lock_resource *res; - struct dlm_lock_name name; - } u; + struct dlm_lock_resource *mleres; + unsigned char mname[DLM_LOCKID_NAME_MAX]; + unsigned int mnamelen; + unsigned int mnamehash; }; enum dlm_ast_type { @@ -151,13 +147,14 @@ struct dlm_ctxt unsigned long recovery_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; struct dlm_recovery_ctxt reco; spinlock_t master_lock; - struct list_head master_list; + struct hlist_head **master_hash; struct list_head mle_hb_events; /* these give a really vague idea of the system load */ - atomic_t local_resources; - atomic_t remote_resources; - atomic_t unknown_resources; + atomic_t mle_tot_count[DLM_MLE_NUM_TYPES]; + atomic_t mle_cur_count[DLM_MLE_NUM_TYPES]; + atomic_t res_tot_count; + atomic_t res_cur_count; struct dlm_debug_ctxt *dlm_debug_ctxt; struct dentry *dlm_debugfs_subroot; @@ -195,6 +192,13 @@ static inline struct hlist_head *dlm_lockres_hash(struct dlm_ctxt *dlm, unsigned return dlm->lockres_hash[(i / DLM_BUCKETS_PER_PAGE) % DLM_HASH_PAGES] + (i % DLM_BUCKETS_PER_PAGE); } +static inline struct hlist_head *dlm_master_hash(struct dlm_ctxt *dlm, + unsigned i) +{ + return dlm->master_hash[(i / DLM_BUCKETS_PER_PAGE) % DLM_HASH_PAGES] + + (i % DLM_BUCKETS_PER_PAGE); +} + /* these keventd work queue items are for less-frequently * called functions that cannot be directly called from the * net message handlers for some reason, usually because @@ -848,9 +852,7 @@ struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm, unsigned int len); int dlm_is_host_down(int errno); -void dlm_change_lockres_owner(struct dlm_ctxt *dlm, - struct dlm_lock_resource *res, - u8 owner); + struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm, const char *lockid, int namelen, @@ -1008,6 +1010,9 @@ static inline void __dlm_wait_on_lockres(struct dlm_lock_resource *res) DLM_LOCK_RES_MIGRATING)); } +void __dlm_unlink_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle); +void __dlm_insert_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle); + /* create/destroy slab caches */ int dlm_init_master_caches(void); void dlm_destroy_master_caches(void); @@ -1110,6 +1115,23 @@ static inline int dlm_node_iter_next(struct dlm_node_iter *iter) return bit; } +static inline void dlm_set_lockres_owner(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + u8 owner) +{ + assert_spin_locked(&res->spinlock); + + res->owner = owner; +} +static inline void dlm_change_lockres_owner(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + u8 owner) +{ + assert_spin_locked(&res->spinlock); + + if (owner != res->owner) + dlm_set_lockres_owner(dlm, res, owner); +} #endif /* DLMCOMMON_H */ diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c index b32f60a5acfb..df52f706f669 100644 --- a/fs/ocfs2/dlm/dlmdebug.c +++ b/fs/ocfs2/dlm/dlmdebug.c @@ -287,18 +287,8 @@ static int stringify_nodemap(unsigned long *nodemap, int maxnodes, static int dump_mle(struct dlm_master_list_entry *mle, char *buf, int len) { int out = 0; - unsigned int namelen; - const char *name; char *mle_type; - if (mle->type != DLM_MLE_MASTER) { - namelen = mle->u.name.len; - name = mle->u.name.name; - } else { - namelen = mle->u.res->lockname.len; - name = mle->u.res->lockname.name; - } - if (mle->type == DLM_MLE_BLOCK) mle_type = "BLK"; else if (mle->type == DLM_MLE_MASTER) @@ -306,7 +296,7 @@ static int dump_mle(struct dlm_master_list_entry *mle, char *buf, int len) else mle_type = "MIG"; - out += stringify_lockname(name, namelen, buf + out, len - out); + out += stringify_lockname(mle->mname, mle->mnamelen, buf + out, len - out); out += snprintf(buf + out, len - out, "\t%3s\tmas=%3u\tnew=%3u\tevt=%1d\tuse=%1d\tref=%3d\n", mle_type, mle->master, mle->new_master, @@ -501,23 +491,33 @@ static struct file_operations debug_purgelist_fops = { static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db) { struct dlm_master_list_entry *mle; - int out = 0; - unsigned long total = 0; + struct hlist_head *bucket; + struct hlist_node *list; + int i, out = 0; + unsigned long total = 0, longest = 0, bktcnt; out += snprintf(db->buf + out, db->len - out, "Dumping MLEs for Domain: %s\n", dlm->name); spin_lock(&dlm->master_lock); - list_for_each_entry(mle, &dlm->master_list, list) { - ++total; - if (db->len - out < 200) - continue; - out += dump_mle(mle, db->buf + out, db->len - out); + for (i = 0; i < DLM_HASH_BUCKETS; i++) { + bucket = dlm_master_hash(dlm, i); + hlist_for_each(list, bucket) { + mle = hlist_entry(list, struct dlm_master_list_entry, + master_hash_node); + ++total; + ++bktcnt; + if (db->len - out < 200) + continue; + out += dump_mle(mle, db->buf + out, db->len - out); + } + longest = max(longest, bktcnt); + bktcnt = 0; } spin_unlock(&dlm->master_lock); out += snprintf(db->buf + out, db->len - out, - "Total on list: %ld\n", total); + "Total: %ld, Longest: %ld\n", total, longest); return out; } @@ -756,12 +756,8 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db) int out = 0; struct dlm_reco_node_data *node; char *state; - int lres, rres, ures, tres; - - lres = atomic_read(&dlm->local_resources); - rres = atomic_read(&dlm->remote_resources); - ures = atomic_read(&dlm->unknown_resources); - tres = lres + rres + ures; + int cur_mles = 0, tot_mles = 0; + int i; spin_lock(&dlm->spinlock); @@ -804,21 +800,48 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db) db->buf + out, db->len - out); out += snprintf(db->buf + out, db->len - out, "\n"); - /* Mastered Resources Total: xxx Locally: xxx Remotely: ... */ + /* Lock Resources: xxx (xxx) */ + out += snprintf(db->buf + out, db->len - out, + "Lock Resources: %d (%d)\n", + atomic_read(&dlm->res_cur_count), + atomic_read(&dlm->res_tot_count)); + + for (i = 0; i < DLM_MLE_NUM_TYPES; ++i) + tot_mles += atomic_read(&dlm->mle_tot_count[i]); + + for (i = 0; i < DLM_MLE_NUM_TYPES; ++i) + cur_mles += atomic_read(&dlm->mle_cur_count[i]); + + /* MLEs: xxx (xxx) */ + out += snprintf(db->buf + out, db->len - out, + "MLEs: %d (%d)\n", cur_mles, tot_mles); + + /* Blocking: xxx (xxx) */ + out += snprintf(db->buf + out, db->len - out, + " Blocking: %d (%d)\n", + atomic_read(&dlm->mle_cur_count[DLM_MLE_BLOCK]), + atomic_read(&dlm->mle_tot_count[DLM_MLE_BLOCK])); + + /* Mastery: xxx (xxx) */ + out += snprintf(db->buf + out, db->len - out, + " Mastery: %d (%d)\n", + atomic_read(&dlm->mle_cur_count[DLM_MLE_MASTER]), + atomic_read(&dlm->mle_tot_count[DLM_MLE_MASTER])); + + /* Migration: xxx (xxx) */ out += snprintf(db->buf + out, db->len - out, - "Mastered Resources Total: %d Locally: %d " - "Remotely: %d Unknown: %d\n", - tres, lres, rres, ures); + " Migration: %d (%d)\n", + atomic_read(&dlm->mle_cur_count[DLM_MLE_MIGRATION]), + atomic_read(&dlm->mle_tot_count[DLM_MLE_MIGRATION])); /* Lists: Dirty=Empty Purge=InUse PendingASTs=Empty ... */ out += snprintf(db->buf + out, db->len - out, "Lists: Dirty=%s Purge=%s PendingASTs=%s " - "PendingBASTs=%s Master=%s\n", + "PendingBASTs=%s\n", (list_empty(&dlm->dirty_list) ? "Empty" : "InUse"), (list_empty(&dlm->purge_list) ? "Empty" : "InUse"), (list_empty(&dlm->pending_asts) ? "Empty" : "InUse"), - (list_empty(&dlm->pending_basts) ? "Empty" : "InUse"), - (list_empty(&dlm->master_list) ? "Empty" : "InUse")); + (list_empty(&dlm->pending_basts) ? "Empty" : "InUse")); /* Purge Count: xxx Refs: xxx */ out += snprintf(db->buf + out, db->len - out, diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index d8d578f45613..4d9e6b288dd8 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -304,6 +304,9 @@ static void dlm_free_ctxt_mem(struct dlm_ctxt *dlm) if (dlm->lockres_hash) dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES); + if (dlm->master_hash) + dlm_free_pagevec((void **)dlm->master_hash, DLM_HASH_PAGES); + if (dlm->name) kfree(dlm->name); @@ -1534,12 +1537,27 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain, for (i = 0; i < DLM_HASH_BUCKETS; i++) INIT_HLIST_HEAD(dlm_lockres_hash(dlm, i)); + dlm->master_hash = (struct hlist_head **) + dlm_alloc_pagevec(DLM_HASH_PAGES); + if (!dlm->master_hash) { + mlog_errno(-ENOMEM); + dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES); + kfree(dlm->name); + kfree(dlm); + dlm = NULL; + goto leave; + } + + for (i = 0; i < DLM_HASH_BUCKETS; i++) + INIT_HLIST_HEAD(dlm_master_hash(dlm, i)); + strcpy(dlm->name, domain); dlm->key = key; dlm->node_num = o2nm_this_node(); ret = dlm_create_debugfs_subroot(dlm); if (ret < 0) { + dlm_free_pagevec((void **)dlm->master_hash, DLM_HASH_PAGES); dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES); kfree(dlm->name); kfree(dlm); @@ -1579,7 +1597,6 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain, init_waitqueue_head(&dlm->reco.event); init_waitqueue_head(&dlm->ast_wq); init_waitqueue_head(&dlm->migration_wq); - INIT_LIST_HEAD(&dlm->master_list); INIT_LIST_HEAD(&dlm->mle_hb_events); dlm->joining_node = DLM_LOCK_RES_OWNER_UNKNOWN; @@ -1587,9 +1604,13 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain, dlm->reco.new_master = O2NM_INVALID_NODE_NUM; dlm->reco.dead_node = O2NM_INVALID_NODE_NUM; - atomic_set(&dlm->local_resources, 0); - atomic_set(&dlm->remote_resources, 0); - atomic_set(&dlm->unknown_resources, 0); + + atomic_set(&dlm->res_tot_count, 0); + atomic_set(&dlm->res_cur_count, 0); + for (i = 0; i < DLM_MLE_NUM_TYPES; ++i) { + atomic_set(&dlm->mle_tot_count[i], 0); + atomic_set(&dlm->mle_cur_count[i], 0); + } spin_lock_init(&dlm->work_lock); INIT_LIST_HEAD(&dlm->work_list); diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 0a2813947853..f8b653fcd4dd 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -73,22 +73,13 @@ static inline int dlm_mle_equal(struct dlm_ctxt *dlm, const char *name, unsigned int namelen) { - struct dlm_lock_resource *res; - if (dlm != mle->dlm) return 0; - if (mle->type == DLM_MLE_BLOCK || - mle->type == DLM_MLE_MIGRATION) { - if (namelen != mle->u.name.len || - memcmp(name, mle->u.name.name, namelen)!=0) - return 0; - } else { - res = mle->u.res; - if (namelen != res->lockname.len || - memcmp(res->lockname.name, name, namelen) != 0) - return 0; - } + if (namelen != mle->mnamelen || + memcmp(name, mle->mname, namelen) != 0) + return 0; + return 1; } @@ -283,7 +274,7 @@ static void dlm_init_mle(struct dlm_master_list_entry *mle, mle->dlm = dlm; mle->type = type; - INIT_LIST_HEAD(&mle->list); + INIT_HLIST_NODE(&mle->master_hash_node); INIT_LIST_HEAD(&mle->hb_events); memset(mle->maybe_map, 0, sizeof(mle->maybe_map)); spin_lock_init(&mle->spinlock); @@ -295,19 +286,27 @@ static void dlm_init_mle(struct dlm_master_list_entry *mle, mle->new_master = O2NM_MAX_NODES; mle->inuse = 0; + BUG_ON(mle->type != DLM_MLE_BLOCK && + mle->type != DLM_MLE_MASTER && + mle->type != DLM_MLE_MIGRATION); + if (mle->type == DLM_MLE_MASTER) { BUG_ON(!res); - mle->u.res = res; - } else if (mle->type == DLM_MLE_BLOCK) { - BUG_ON(!name); - memcpy(mle->u.name.name, name, namelen); - mle->u.name.len = namelen; - } else /* DLM_MLE_MIGRATION */ { + mle->mleres = res; + memcpy(mle->mname, res->lockname.name, res->lockname.len); + mle->mnamelen = res->lockname.len; + mle->mnamehash = res->lockname.hash; + } else { BUG_ON(!name); - memcpy(mle->u.name.name, name, namelen); - mle->u.name.len = namelen; + mle->mleres = NULL; + memcpy(mle->mname, name, namelen); + mle->mnamelen = namelen; + mle->mnamehash = dlm_lockid_hash(name, namelen); } + atomic_inc(&dlm->mle_tot_count[mle->type]); + atomic_inc(&dlm->mle_cur_count[mle->type]); + /* copy off the node_map and register hb callbacks on our copy */ memcpy(mle->node_map, dlm->domain_map, sizeof(mle->node_map)); memcpy(mle->vote_map, dlm->domain_map, sizeof(mle->vote_map)); @@ -318,6 +317,24 @@ static void dlm_init_mle(struct dlm_master_list_entry *mle, __dlm_mle_attach_hb_events(dlm, mle); } +void __dlm_unlink_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle) +{ + assert_spin_locked(&dlm->spinlock); + assert_spin_locked(&dlm->master_lock); + + if (!hlist_unhashed(&mle->master_hash_node)) + hlist_del_init(&mle->master_hash_node); +} + +void __dlm_insert_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle) +{ + struct hlist_head *bucket; + + assert_spin_locked(&dlm->master_lock); + + bucket = dlm_master_hash(dlm, mle->mnamehash); + hlist_add_head(&mle->master_hash_node, bucket); +} /* returns 1 if found, 0 if not */ static int dlm_find_mle(struct dlm_ctxt *dlm, @@ -325,10 +342,17 @@ static int dlm_find_mle(struct dlm_ctxt *dlm, char *name, unsigned int namelen) { struct dlm_master_list_entry *tmpmle; + struct hlist_head *bucket; + struct hlist_node *list; + unsigned int hash; assert_spin_locked(&dlm->master_lock); - list_for_each_entry(tmpmle, &dlm->master_list, list) { + hash = dlm_lockid_hash(name, namelen); + bucket = dlm_master_hash(dlm, hash); + hlist_for_each(list, bucket) { + tmpmle = hlist_entry(list, struct dlm_master_list_entry, + master_hash_node); if (!dlm_mle_equal(dlm, tmpmle, name, namelen)) continue; dlm_get_mle(tmpmle); @@ -408,24 +432,20 @@ static void dlm_mle_release(struct kref *kref) mle = container_of(kref, struct dlm_master_list_entry, mle_refs); dlm = mle->dlm; - if (mle->type != DLM_MLE_MASTER) { - mlog(0, "calling mle_release for %.*s, type %d\n", - mle->u.name.len, mle->u.name.name, mle->type); - } else { - mlog(0, "calling mle_release for %.*s, type %d\n", - mle->u.res->lockname.len, - mle->u.res->lockname.name, mle->type); - } assert_spin_locked(&dlm->spinlock); assert_spin_locked(&dlm->master_lock); + mlog(0, "Releasing mle for %.*s, type %d\n", mle->mnamelen, mle->mname, + mle->type); + /* remove from list if not already */ - if (!list_empty(&mle->list)) - list_del_init(&mle->list); + __dlm_unlink_mle(dlm, mle); /* detach the mle from the domain node up/down events */ __dlm_mle_detach_hb_events(dlm, mle); + atomic_dec(&dlm->mle_cur_count[mle->type]); + /* NOTE: kfree under spinlock here. * if this is bad, we can move this to a freelist. */ kmem_cache_free(dlm_mle_cache, mle); @@ -465,43 +485,6 @@ void dlm_destroy_master_caches(void) kmem_cache_destroy(dlm_lockres_cache); } -static void dlm_set_lockres_owner(struct dlm_ctxt *dlm, - struct dlm_lock_resource *res, - u8 owner) -{ - assert_spin_locked(&res->spinlock); - - mlog_entry("%.*s, %u\n", res->lockname.len, res->lockname.name, owner); - - if (owner == dlm->node_num) - atomic_inc(&dlm->local_resources); - else if (owner == DLM_LOCK_RES_OWNER_UNKNOWN) - atomic_inc(&dlm->unknown_resources); - else - atomic_inc(&dlm->remote_resources); - - res->owner = owner; -} - -void dlm_change_lockres_owner(struct dlm_ctxt *dlm, - struct dlm_lock_resource *res, u8 owner) -{ - assert_spin_locked(&res->spinlock); - - if (owner == res->owner) - return; - - if (res->owner == dlm->node_num) - atomic_dec(&dlm->local_resources); - else if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) - atomic_dec(&dlm->unknown_resources); - else - atomic_dec(&dlm->remote_resources); - - dlm_set_lockres_owner(dlm, res, owner); -} - - static void dlm_lockres_release(struct kref *kref) { struct dlm_lock_resource *res; @@ -527,6 +510,8 @@ static void dlm_lockres_release(struct kref *kref) } spin_unlock(&dlm->track_lock); + atomic_dec(&dlm->res_cur_count); + dlm_put(dlm); if (!hlist_unhashed(&res->hash_node) || @@ -607,6 +592,9 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm, kref_init(&res->refs); + atomic_inc(&dlm->res_tot_count); + atomic_inc(&dlm->res_cur_count); + /* just for consistency */ spin_lock(&res->spinlock); dlm_set_lockres_owner(dlm, res, DLM_LOCK_RES_OWNER_UNKNOWN); @@ -843,7 +831,7 @@ lookup: alloc_mle = NULL; dlm_init_mle(mle, DLM_MLE_MASTER, dlm, res, NULL, 0); set_bit(dlm->node_num, mle->maybe_map); - list_add(&mle->list, &dlm->master_list); + __dlm_insert_mle(dlm, mle); /* still holding the dlm spinlock, check the recovery map * to see if there are any nodes that still need to be @@ -1270,7 +1258,7 @@ static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm, res->lockname.len, res->lockname.name); mle->type = DLM_MLE_MASTER; - mle->u.res = res; + mle->mleres = res; } } } @@ -1315,14 +1303,8 @@ static int dlm_do_master_request(struct dlm_lock_resource *res, BUG_ON(mle->type == DLM_MLE_MIGRATION); - if (mle->type != DLM_MLE_MASTER) { - request.namelen = mle->u.name.len; - memcpy(request.name, mle->u.name.name, request.namelen); - } else { - request.namelen = mle->u.res->lockname.len; - memcpy(request.name, mle->u.res->lockname.name, - request.namelen); - } + request.namelen = (u8)mle->mnamelen; + memcpy(request.name, mle->mname, request.namelen); again: ret = o2net_send_message(DLM_MASTER_REQUEST_MSG, dlm->key, &request, @@ -1575,7 +1557,7 @@ way_up_top: // "add the block.\n"); dlm_init_mle(mle, DLM_MLE_BLOCK, dlm, NULL, name, namelen); set_bit(request->node_idx, mle->maybe_map); - list_add(&mle->list, &dlm->master_list); + __dlm_insert_mle(dlm, mle); response = DLM_MASTER_RESP_NO; } else { // mlog(0, "mle was found\n"); @@ -1967,7 +1949,7 @@ ok: assert->node_idx, rr, extra_ref, mle->inuse); dlm_print_one_mle(mle); } - list_del_init(&mle->list); + __dlm_unlink_mle(dlm, mle); __dlm_mle_detach_hb_events(dlm, mle); __dlm_put_mle(mle); if (extra_ref) { @@ -3159,10 +3141,8 @@ static int dlm_add_migration_mle(struct dlm_ctxt *dlm, tmp->master = master; atomic_set(&tmp->woken, 1); wake_up(&tmp->wq); - /* remove it from the list so that only one - * mle will be found */ - list_del_init(&tmp->list); - /* this was obviously WRONG. mle is uninited here. should be tmp. */ + /* remove it so that only one mle will be found */ + __dlm_unlink_mle(dlm, tmp); __dlm_mle_detach_hb_events(dlm, tmp); ret = DLM_MIGRATE_RESPONSE_MASTERY_REF; mlog(0, "%s:%.*s: master=%u, newmaster=%u, " @@ -3181,137 +3161,164 @@ static int dlm_add_migration_mle(struct dlm_ctxt *dlm, mle->master = master; /* do this for consistency with other mle types */ set_bit(new_master, mle->maybe_map); - list_add(&mle->list, &dlm->master_list); + __dlm_insert_mle(dlm, mle); return ret; } - -void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node) +/* + * Sets the owner of the lockres, associated to the mle, to UNKNOWN + */ +static struct dlm_lock_resource *dlm_reset_mleres_owner(struct dlm_ctxt *dlm, + struct dlm_master_list_entry *mle) { - struct dlm_master_list_entry *mle, *next; struct dlm_lock_resource *res; - unsigned int hash; - mlog_entry("dlm=%s, dead node=%u\n", dlm->name, dead_node); -top: - assert_spin_locked(&dlm->spinlock); + /* Find the lockres associated to the mle and set its owner to UNK */ + res = __dlm_lookup_lockres(dlm, mle->mname, mle->mnamelen, + mle->mnamehash); + if (res) { + spin_unlock(&dlm->master_lock); - /* clean the master list */ - spin_lock(&dlm->master_lock); - list_for_each_entry_safe(mle, next, &dlm->master_list, list) { - BUG_ON(mle->type != DLM_MLE_BLOCK && - mle->type != DLM_MLE_MASTER && - mle->type != DLM_MLE_MIGRATION); - - /* MASTER mles are initiated locally. the waiting - * process will notice the node map change - * shortly. let that happen as normal. */ - if (mle->type == DLM_MLE_MASTER) - continue; + /* move lockres onto recovery list */ + spin_lock(&res->spinlock); + dlm_set_lockres_owner(dlm, res, DLM_LOCK_RES_OWNER_UNKNOWN); + dlm_move_lockres_to_recovery_list(dlm, res); + spin_unlock(&res->spinlock); + dlm_lockres_put(res); + /* about to get rid of mle, detach from heartbeat */ + __dlm_mle_detach_hb_events(dlm, mle); - /* BLOCK mles are initiated by other nodes. - * need to clean up if the dead node would have - * been the master. */ - if (mle->type == DLM_MLE_BLOCK) { - int bit; + /* dump the mle */ + spin_lock(&dlm->master_lock); + __dlm_put_mle(mle); + spin_unlock(&dlm->master_lock); + } - spin_lock(&mle->spinlock); - bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0); - if (bit != dead_node) { - mlog(0, "mle found, but dead node %u would " - "not have been master\n", dead_node); - spin_unlock(&mle->spinlock); - } else { - /* must drop the refcount by one since the - * assert_master will never arrive. this - * may result in the mle being unlinked and - * freed, but there may still be a process - * waiting in the dlmlock path which is fine. */ - mlog(0, "node %u was expected master\n", - dead_node); - atomic_set(&mle->woken, 1); - spin_unlock(&mle->spinlock); - wake_up(&mle->wq); - /* do not need events any longer, so detach - * from heartbeat */ - __dlm_mle_detach_hb_events(dlm, mle); - __dlm_put_mle(mle); - } - continue; - } + return res; +} - /* everything else is a MIGRATION mle */ - - /* the rule for MIGRATION mles is that the master - * becomes UNKNOWN if *either* the original or - * the new master dies. all UNKNOWN lockreses - * are sent to whichever node becomes the recovery - * master. the new master is responsible for - * determining if there is still a master for - * this lockres, or if he needs to take over - * mastery. either way, this node should expect - * another message to resolve this. */ - if (mle->master != dead_node && - mle->new_master != dead_node) - continue; +static void dlm_clean_migration_mle(struct dlm_ctxt *dlm, + struct dlm_master_list_entry *mle) +{ + __dlm_mle_detach_hb_events(dlm, mle); - /* if we have reached this point, this mle needs to - * be removed from the list and freed. */ + spin_lock(&mle->spinlock); + __dlm_unlink_mle(dlm, mle); + atomic_set(&mle->woken, 1); + spin_unlock(&mle->spinlock); - /* remove from the list early. NOTE: unlinking - * list_head while in list_for_each_safe */ - __dlm_mle_detach_hb_events(dlm, mle); - spin_lock(&mle->spinlock); - list_del_init(&mle->list); + wake_up(&mle->wq); +} + +static void dlm_clean_block_mle(struct dlm_ctxt *dlm, + struct dlm_master_list_entry *mle, u8 dead_node) +{ + int bit; + + BUG_ON(mle->type != DLM_MLE_BLOCK); + + spin_lock(&mle->spinlock); + bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0); + if (bit != dead_node) { + mlog(0, "mle found, but dead node %u would not have been " + "master\n", dead_node); + spin_unlock(&mle->spinlock); + } else { + /* Must drop the refcount by one since the assert_master will + * never arrive. This may result in the mle being unlinked and + * freed, but there may still be a process waiting in the + * dlmlock path which is fine. */ + mlog(0, "node %u was expected master\n", dead_node); atomic_set(&mle->woken, 1); spin_unlock(&mle->spinlock); wake_up(&mle->wq); - mlog(0, "%s: node %u died during migration from " - "%u to %u!\n", dlm->name, dead_node, - mle->master, mle->new_master); - /* if there is a lockres associated with this - * mle, find it and set its owner to UNKNOWN */ - hash = dlm_lockid_hash(mle->u.name.name, mle->u.name.len); - res = __dlm_lookup_lockres(dlm, mle->u.name.name, - mle->u.name.len, hash); - if (res) { - /* unfortunately if we hit this rare case, our - * lock ordering is messed. we need to drop - * the master lock so that we can take the - * lockres lock, meaning that we will have to - * restart from the head of list. */ - spin_unlock(&dlm->master_lock); + /* Do not need events any longer, so detach from heartbeat */ + __dlm_mle_detach_hb_events(dlm, mle); + __dlm_put_mle(mle); + } +} - /* move lockres onto recovery list */ - spin_lock(&res->spinlock); - dlm_set_lockres_owner(dlm, res, - DLM_LOCK_RES_OWNER_UNKNOWN); - dlm_move_lockres_to_recovery_list(dlm, res); - spin_unlock(&res->spinlock); - dlm_lockres_put(res); +void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node) +{ + struct dlm_master_list_entry *mle; + struct dlm_lock_resource *res; + struct hlist_head *bucket; + struct hlist_node *list; + unsigned int i; - /* about to get rid of mle, detach from heartbeat */ - __dlm_mle_detach_hb_events(dlm, mle); + mlog_entry("dlm=%s, dead node=%u\n", dlm->name, dead_node); +top: + assert_spin_locked(&dlm->spinlock); - /* dump the mle */ - spin_lock(&dlm->master_lock); - __dlm_put_mle(mle); - spin_unlock(&dlm->master_lock); + /* clean the master list */ + spin_lock(&dlm->master_lock); + for (i = 0; i < DLM_HASH_BUCKETS; i++) { + bucket = dlm_master_hash(dlm, i); + hlist_for_each(list, bucket) { + mle = hlist_entry(list, struct dlm_master_list_entry, + master_hash_node); + + BUG_ON(mle->type != DLM_MLE_BLOCK && + mle->type != DLM_MLE_MASTER && + mle->type != DLM_MLE_MIGRATION); + + /* MASTER mles are initiated locally. The waiting + * process will notice the node map change shortly. + * Let that happen as normal. */ + if (mle->type == DLM_MLE_MASTER) + continue; + + /* BLOCK mles are initiated by other nodes. Need to + * clean up if the dead node would have been the + * master. */ + if (mle->type == DLM_MLE_BLOCK) { + dlm_clean_block_mle(dlm, mle, dead_node); + continue; + } - /* restart */ - goto top; - } + /* Everything else is a MIGRATION mle */ + + /* The rule for MIGRATION mles is that the master + * becomes UNKNOWN if *either* the original or the new + * master dies. All UNKNOWN lockres' are sent to + * whichever node becomes the recovery master. The new + * master is responsible for determining if there is + * still a master for this lockres, or if he needs to + * take over mastery. Either way, this node should + * expect another message to resolve this. */ + + if (mle->master != dead_node && + mle->new_master != dead_node) + continue; + + /* If we have reached this point, this mle needs to be + * removed from the list and freed. */ + dlm_clean_migration_mle(dlm, mle); + + mlog(0, "%s: node %u died during migration from " + "%u to %u!\n", dlm->name, dead_node, mle->master, + mle->new_master); + + /* If we find a lockres associated with the mle, we've + * hit this rare case that messes up our lock ordering. + * If so, we need to drop the master lock so that we can + * take the lockres lock, meaning that we will have to + * restart from the head of list. */ + res = dlm_reset_mleres_owner(dlm, mle); + if (res) + /* restart */ + goto top; - /* this may be the last reference */ - __dlm_put_mle(mle); + /* This may be the last reference */ + __dlm_put_mle(mle); + } } spin_unlock(&dlm->master_lock); } - int dlm_finish_migration(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, u8 old_master) { diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c index 4060bb328bc8..d490b66ad9d7 100644 --- a/fs/ocfs2/dlm/dlmthread.c +++ b/fs/ocfs2/dlm/dlmthread.c @@ -162,12 +162,28 @@ static int dlm_purge_lockres(struct dlm_ctxt *dlm, spin_lock(&res->spinlock); if (!__dlm_lockres_unused(res)) { - spin_unlock(&res->spinlock); mlog(0, "%s:%.*s: tried to purge but not unused\n", dlm->name, res->lockname.len, res->lockname.name); - return -ENOTEMPTY; + __dlm_print_one_lock_resource(res); + spin_unlock(&res->spinlock); + BUG(); } + + if (res->state & DLM_LOCK_RES_MIGRATING) { + mlog(0, "%s:%.*s: Delay dropref as this lockres is " + "being remastered\n", dlm->name, res->lockname.len, + res->lockname.name); + /* Re-add the lockres to the end of the purge list */ + if (!list_empty(&res->purge)) { + list_del_init(&res->purge); + list_add_tail(&res->purge, &dlm->purge_list); + } + spin_unlock(&res->spinlock); + return 0; + } + master = (res->owner == dlm->node_num); + if (!master) res->state |= DLM_LOCK_RES_DROPPING_REF; spin_unlock(&res->spinlock); diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 7219a86d34cc..e15fc7d50827 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -244,6 +244,10 @@ static struct ocfs2_lock_res_ops ocfs2_rename_lops = { .flags = 0, }; +static struct ocfs2_lock_res_ops ocfs2_nfs_sync_lops = { + .flags = 0, +}; + static struct ocfs2_lock_res_ops ocfs2_dentry_lops = { .get_osb = ocfs2_get_dentry_osb, .post_unlock = ocfs2_dentry_post_unlock, @@ -622,6 +626,17 @@ static void ocfs2_rename_lock_res_init(struct ocfs2_lock_res *res, &ocfs2_rename_lops, osb); } +static void ocfs2_nfs_sync_lock_res_init(struct ocfs2_lock_res *res, + struct ocfs2_super *osb) +{ + /* nfs_sync lockres doesn't come from a slab so we call init + * once on it manually. */ + ocfs2_lock_res_init_once(res); + ocfs2_build_lock_name(OCFS2_LOCK_TYPE_NFS_SYNC, 0, 0, res->l_name); + ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_NFS_SYNC, + &ocfs2_nfs_sync_lops, osb); +} + void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres, struct ocfs2_file_private *fp) { @@ -2417,6 +2432,34 @@ void ocfs2_rename_unlock(struct ocfs2_super *osb) ocfs2_cluster_unlock(osb, lockres, DLM_LOCK_EX); } +int ocfs2_nfs_sync_lock(struct ocfs2_super *osb, int ex) +{ + int status; + struct ocfs2_lock_res *lockres = &osb->osb_nfs_sync_lockres; + + if (ocfs2_is_hard_readonly(osb)) + return -EROFS; + + if (ocfs2_mount_local(osb)) + return 0; + + status = ocfs2_cluster_lock(osb, lockres, ex ? LKM_EXMODE : LKM_PRMODE, + 0, 0); + if (status < 0) + mlog(ML_ERROR, "lock on nfs sync lock failed %d\n", status); + + return status; +} + +void ocfs2_nfs_sync_unlock(struct ocfs2_super *osb, int ex) +{ + struct ocfs2_lock_res *lockres = &osb->osb_nfs_sync_lockres; + + if (!ocfs2_mount_local(osb)) + ocfs2_cluster_unlock(osb, lockres, + ex ? LKM_EXMODE : LKM_PRMODE); +} + int ocfs2_dentry_lock(struct dentry *dentry, int ex) { int ret; @@ -2798,6 +2841,7 @@ int ocfs2_dlm_init(struct ocfs2_super *osb) local: ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb); ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb); + ocfs2_nfs_sync_lock_res_init(&osb->osb_nfs_sync_lockres, osb); osb->cconn = conn; @@ -2833,6 +2877,7 @@ void ocfs2_dlm_shutdown(struct ocfs2_super *osb, ocfs2_lock_res_free(&osb->osb_super_lockres); ocfs2_lock_res_free(&osb->osb_rename_lockres); + ocfs2_lock_res_free(&osb->osb_nfs_sync_lockres); ocfs2_cluster_disconnect(osb->cconn, hangup_pending); osb->cconn = NULL; @@ -3015,6 +3060,7 @@ static void ocfs2_drop_osb_locks(struct ocfs2_super *osb) { ocfs2_simple_drop_lockres(osb, &osb->osb_super_lockres); ocfs2_simple_drop_lockres(osb, &osb->osb_rename_lockres); + ocfs2_simple_drop_lockres(osb, &osb->osb_nfs_sync_lockres); } int ocfs2_drop_inode_locks(struct inode *inode) diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h index 3f8d9986b8e0..e1fd5721cd7f 100644 --- a/fs/ocfs2/dlmglue.h +++ b/fs/ocfs2/dlmglue.h @@ -115,6 +115,8 @@ void ocfs2_super_unlock(struct ocfs2_super *osb, int ex); int ocfs2_rename_lock(struct ocfs2_super *osb); void ocfs2_rename_unlock(struct ocfs2_super *osb); +int ocfs2_nfs_sync_lock(struct ocfs2_super *osb, int ex); +void ocfs2_nfs_sync_unlock(struct ocfs2_super *osb, int ex); int ocfs2_dentry_lock(struct dentry *dentry, int ex); void ocfs2_dentry_unlock(struct dentry *dentry, int ex); int ocfs2_file_lock(struct file *file, int ex, int trylock); diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c index 2f27b332d8b3..de3da8eb558c 100644 --- a/fs/ocfs2/export.c +++ b/fs/ocfs2/export.c @@ -31,6 +31,7 @@ #include "ocfs2.h" +#include "alloc.h" #include "dir.h" #include "dlmglue.h" #include "dcache.h" @@ -38,6 +39,7 @@ #include "inode.h" #include "buffer_head_io.h" +#include "suballoc.h" struct ocfs2_inode_handle { @@ -49,29 +51,97 @@ static struct dentry *ocfs2_get_dentry(struct super_block *sb, struct ocfs2_inode_handle *handle) { struct inode *inode; + struct ocfs2_super *osb = OCFS2_SB(sb); + u64 blkno = handle->ih_blkno; + int status, set; struct dentry *result; mlog_entry("(0x%p, 0x%p)\n", sb, handle); - if (handle->ih_blkno == 0) { - mlog_errno(-ESTALE); - return ERR_PTR(-ESTALE); + if (blkno == 0) { + mlog(0, "nfs wants inode with blkno: 0\n"); + result = ERR_PTR(-ESTALE); + goto bail; + } + + inode = ocfs2_ilookup(sb, blkno); + /* + * If the inode exists in memory, we only need to check it's + * generation number + */ + if (inode) + goto check_gen; + + /* + * This will synchronize us against ocfs2_delete_inode() on + * all nodes + */ + status = ocfs2_nfs_sync_lock(osb, 1); + if (status < 0) { + mlog(ML_ERROR, "getting nfs sync lock(EX) failed %d\n", status); + goto check_err; + } + + status = ocfs2_test_inode_bit(osb, blkno, &set); + if (status < 0) { + if (status == -EINVAL) { + /* + * The blkno NFS gave us doesn't even show up + * as an inode, we return -ESTALE to be + * nice + */ + mlog(0, "test inode bit failed %d\n", status); + status = -ESTALE; + } else { + mlog(ML_ERROR, "test inode bit failed %d\n", status); + } + goto unlock_nfs_sync; + } + + /* If the inode allocator bit is clear, this inode must be stale */ + if (!set) { + mlog(0, "inode %llu suballoc bit is clear\n", blkno); + status = -ESTALE; + goto unlock_nfs_sync; } - inode = ocfs2_iget(OCFS2_SB(sb), handle->ih_blkno, 0, 0); + inode = ocfs2_iget(osb, blkno, 0, 0); - if (IS_ERR(inode)) - return (void *)inode; +unlock_nfs_sync: + ocfs2_nfs_sync_unlock(osb, 1); +check_err: + if (status < 0) { + if (status == -ESTALE) { + mlog(0, "stale inode ino: %llu generation: %u\n", + blkno, handle->ih_generation); + } + result = ERR_PTR(status); + goto bail; + } + + if (IS_ERR(inode)) { + mlog_errno(PTR_ERR(inode)); + result = (void *)inode; + goto bail; + } + +check_gen: if (handle->ih_generation != inode->i_generation) { iput(inode); - return ERR_PTR(-ESTALE); + mlog(0, "stale inode ino: %llu generation: %u\n", blkno, + handle->ih_generation); + result = ERR_PTR(-ESTALE); + goto bail; } result = d_obtain_alias(inode); if (!IS_ERR(result)) result->d_op = &ocfs2_dentry_ops; + else + mlog_errno(PTR_ERR(result)); +bail: mlog_exit_ptr(result); return result; } diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index a5887df2cd8a..8672b9536039 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1926,7 +1926,7 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe, out->f_path.dentry->d_name.len, out->f_path.dentry->d_name.name); - inode_double_lock(inode, pipe->inode); + mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT); ret = ocfs2_rw_lock(inode, 1); if (ret < 0) { @@ -1941,12 +1941,16 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe, goto out_unlock; } + if (pipe->inode) + mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_CHILD); ret = generic_file_splice_write_nolock(pipe, out, ppos, len, flags); + if (pipe->inode) + mutex_unlock(&pipe->inode->i_mutex); out_unlock: ocfs2_rw_unlock(inode, 1); out: - inode_double_unlock(inode, pipe->inode); + mutex_unlock(&inode->i_mutex); mlog_exit(ret); return ret; diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 229e707bc050..10e1fa87396a 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -38,6 +38,7 @@ #include "ocfs2.h" #include "alloc.h" +#include "dir.h" #include "blockcheck.h" #include "dlmglue.h" #include "extent_map.h" @@ -112,6 +113,17 @@ void ocfs2_get_inode_flags(struct ocfs2_inode_info *oi) oi->ip_attr |= OCFS2_DIRSYNC_FL; } +struct inode *ocfs2_ilookup(struct super_block *sb, u64 blkno) +{ + struct ocfs2_find_inode_args args; + + args.fi_blkno = blkno; + args.fi_flags = 0; + args.fi_ino = ino_from_blkno(sb, blkno); + args.fi_sysfile_type = 0; + + return ilookup5(sb, blkno, ocfs2_find_actor, &args); +} struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags, int sysfile_type) { @@ -275,7 +287,7 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, (unsigned long long)OCFS2_I(inode)->ip_blkno, (unsigned long long)le64_to_cpu(fe->i_blkno)); - inode->i_nlink = le16_to_cpu(fe->i_links_count); + inode->i_nlink = ocfs2_read_links_count(fe); if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) { OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SYSTEM_FILE; @@ -351,6 +363,8 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, ocfs2_set_inode_flags(inode); + OCFS2_I(inode)->ip_last_used_slot = 0; + OCFS2_I(inode)->ip_last_used_group = 0; mlog_exit_void(); } @@ -606,7 +620,7 @@ static int ocfs2_remove_inode(struct inode *inode, } handle = ocfs2_start_trans(osb, OCFS2_DELETE_INODE_CREDITS + - ocfs2_quota_trans_credits(inode->i_sb)); + ocfs2_quota_trans_credits(inode->i_sb)); if (IS_ERR(handle)) { status = PTR_ERR(handle); mlog_errno(status); @@ -740,6 +754,15 @@ static int ocfs2_wipe_inode(struct inode *inode, goto bail_unlock_dir; } + /* Remove any dir index tree */ + if (S_ISDIR(inode->i_mode)) { + status = ocfs2_dx_dir_truncate(inode, di_bh); + if (status) { + mlog_errno(status); + goto bail_unlock_dir; + } + } + /*Free extended attribute resources associated with this inode.*/ status = ocfs2_xattr_remove(inode, di_bh); if (status < 0) { @@ -949,6 +972,17 @@ void ocfs2_delete_inode(struct inode *inode) goto bail; } + /* + * Synchronize us against ocfs2_get_dentry. We take this in + * shared mode so that all nodes can still concurrently + * process deletes. + */ + status = ocfs2_nfs_sync_lock(OCFS2_SB(inode->i_sb), 0); + if (status < 0) { + mlog(ML_ERROR, "getting nfs sync lock(PR) failed %d\n", status); + ocfs2_cleanup_delete_inode(inode, 0); + goto bail_unblock; + } /* Lock down the inode. This gives us an up to date view of * it's metadata (for verification), and allows us to * serialize delete_inode on multiple nodes. @@ -962,7 +996,7 @@ void ocfs2_delete_inode(struct inode *inode) if (status != -ENOENT) mlog_errno(status); ocfs2_cleanup_delete_inode(inode, 0); - goto bail_unblock; + goto bail_unlock_nfs_sync; } /* Query the cluster. This will be the final decision made @@ -1005,6 +1039,10 @@ void ocfs2_delete_inode(struct inode *inode) bail_unlock_inode: ocfs2_inode_unlock(inode, 1); brelse(di_bh); + +bail_unlock_nfs_sync: + ocfs2_nfs_sync_unlock(OCFS2_SB(inode->i_sb), 0); + bail_unblock: status = sigprocmask(SIG_SETMASK, &oldset, NULL); if (status < 0) @@ -1205,7 +1243,7 @@ int ocfs2_mark_inode_dirty(handle_t *handle, spin_unlock(&OCFS2_I(inode)->ip_lock); fe->i_size = cpu_to_le64(i_size_read(inode)); - fe->i_links_count = cpu_to_le16(inode->i_nlink); + ocfs2_set_links_count(fe, inode->i_nlink); fe->i_uid = cpu_to_le32(inode->i_uid); fe->i_gid = cpu_to_le32(inode->i_gid); fe->i_mode = cpu_to_le16(inode->i_mode); @@ -1242,7 +1280,7 @@ void ocfs2_refresh_inode(struct inode *inode, OCFS2_I(inode)->ip_dyn_features = le16_to_cpu(fe->i_dyn_features); ocfs2_set_inode_flags(inode); i_size_write(inode, le64_to_cpu(fe->i_size)); - inode->i_nlink = le16_to_cpu(fe->i_links_count); + inode->i_nlink = ocfs2_read_links_count(fe); inode->i_uid = le32_to_cpu(fe->i_uid); inode->i_gid = le32_to_cpu(fe->i_gid); inode->i_mode = le16_to_cpu(fe->i_mode); diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index eb3c302b38d3..ea71525aad41 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h @@ -72,6 +72,10 @@ struct ocfs2_inode_info struct inode vfs_inode; struct jbd2_inode ip_jinode; + + /* Only valid if the inode is the dir. */ + u32 ip_last_used_slot; + u64 ip_last_used_group; }; /* @@ -124,6 +128,7 @@ void ocfs2_drop_inode(struct inode *inode); /* Flags for ocfs2_iget() */ #define OCFS2_FI_FLAG_SYSFILE 0x1 #define OCFS2_FI_FLAG_ORPHAN_RECOVERY 0x2 +struct inode *ocfs2_ilookup(struct super_block *sb, u64 feoff); struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, unsigned flags, int sysfile_type); int ocfs2_inode_init_private(struct inode *inode); diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 57d7d25a2b9a..a20a0f1e37fd 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -65,6 +65,11 @@ static int ocfs2_trylock_journal(struct ocfs2_super *osb, static int ocfs2_recover_orphans(struct ocfs2_super *osb, int slot); static int ocfs2_commit_thread(void *arg); +static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal, + int slot_num, + struct ocfs2_dinode *la_dinode, + struct ocfs2_dinode *tl_dinode, + struct ocfs2_quota_recovery *qrec); static inline int ocfs2_wait_on_mount(struct ocfs2_super *osb) { @@ -76,18 +81,97 @@ static inline int ocfs2_wait_on_quotas(struct ocfs2_super *osb) return __ocfs2_wait_on_mount(osb, 1); } - - /* - * The recovery_list is a simple linked list of node numbers to recover. - * It is protected by the recovery_lock. + * This replay_map is to track online/offline slots, so we could recover + * offline slots during recovery and mount */ -struct ocfs2_recovery_map { - unsigned int rm_used; - unsigned int *rm_entries; +enum ocfs2_replay_state { + REPLAY_UNNEEDED = 0, /* Replay is not needed, so ignore this map */ + REPLAY_NEEDED, /* Replay slots marked in rm_replay_slots */ + REPLAY_DONE /* Replay was already queued */ }; +struct ocfs2_replay_map { + unsigned int rm_slots; + enum ocfs2_replay_state rm_state; + unsigned char rm_replay_slots[0]; +}; + +void ocfs2_replay_map_set_state(struct ocfs2_super *osb, int state) +{ + if (!osb->replay_map) + return; + + /* If we've already queued the replay, we don't have any more to do */ + if (osb->replay_map->rm_state == REPLAY_DONE) + return; + + osb->replay_map->rm_state = state; +} + +int ocfs2_compute_replay_slots(struct ocfs2_super *osb) +{ + struct ocfs2_replay_map *replay_map; + int i, node_num; + + /* If replay map is already set, we don't do it again */ + if (osb->replay_map) + return 0; + + replay_map = kzalloc(sizeof(struct ocfs2_replay_map) + + (osb->max_slots * sizeof(char)), GFP_KERNEL); + + if (!replay_map) { + mlog_errno(-ENOMEM); + return -ENOMEM; + } + + spin_lock(&osb->osb_lock); + + replay_map->rm_slots = osb->max_slots; + replay_map->rm_state = REPLAY_UNNEEDED; + + /* set rm_replay_slots for offline slot(s) */ + for (i = 0; i < replay_map->rm_slots; i++) { + if (ocfs2_slot_to_node_num_locked(osb, i, &node_num) == -ENOENT) + replay_map->rm_replay_slots[i] = 1; + } + + osb->replay_map = replay_map; + spin_unlock(&osb->osb_lock); + return 0; +} + +void ocfs2_queue_replay_slots(struct ocfs2_super *osb) +{ + struct ocfs2_replay_map *replay_map = osb->replay_map; + int i; + + if (!replay_map) + return; + + if (replay_map->rm_state != REPLAY_NEEDED) + return; + + for (i = 0; i < replay_map->rm_slots; i++) + if (replay_map->rm_replay_slots[i]) + ocfs2_queue_recovery_completion(osb->journal, i, NULL, + NULL, NULL); + replay_map->rm_state = REPLAY_DONE; +} + +void ocfs2_free_replay_slots(struct ocfs2_super *osb) +{ + struct ocfs2_replay_map *replay_map = osb->replay_map; + + if (!osb->replay_map) + return; + + kfree(replay_map); + osb->replay_map = NULL; +} + int ocfs2_recovery_init(struct ocfs2_super *osb) { struct ocfs2_recovery_map *rm; @@ -496,6 +580,22 @@ static struct ocfs2_triggers dq_triggers = { }, }; +static struct ocfs2_triggers dr_triggers = { + .ot_triggers = { + .t_commit = ocfs2_commit_trigger, + .t_abort = ocfs2_abort_trigger, + }, + .ot_offset = offsetof(struct ocfs2_dx_root_block, dr_check), +}; + +static struct ocfs2_triggers dl_triggers = { + .ot_triggers = { + .t_commit = ocfs2_commit_trigger, + .t_abort = ocfs2_abort_trigger, + }, + .ot_offset = offsetof(struct ocfs2_dx_leaf, dl_check), +}; + static int __ocfs2_journal_access(handle_t *handle, struct inode *inode, struct buffer_head *bh, @@ -600,6 +700,20 @@ int ocfs2_journal_access_dq(handle_t *handle, struct inode *inode, type); } +int ocfs2_journal_access_dr(handle_t *handle, struct inode *inode, + struct buffer_head *bh, int type) +{ + return __ocfs2_journal_access(handle, inode, bh, &dr_triggers, + type); +} + +int ocfs2_journal_access_dl(handle_t *handle, struct inode *inode, + struct buffer_head *bh, int type) +{ + return __ocfs2_journal_access(handle, inode, bh, &dl_triggers, + type); +} + int ocfs2_journal_access(handle_t *handle, struct inode *inode, struct buffer_head *bh, int type) { @@ -1176,24 +1290,24 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal, } /* Called by the mount code to queue recovery the last part of - * recovery for it's own slot. */ + * recovery for it's own and offline slot(s). */ void ocfs2_complete_mount_recovery(struct ocfs2_super *osb) { struct ocfs2_journal *journal = osb->journal; - if (osb->dirty) { - /* No need to queue up our truncate_log as regular - * cleanup will catch that. */ - ocfs2_queue_recovery_completion(journal, - osb->slot_num, - osb->local_alloc_copy, - NULL, - NULL); - ocfs2_schedule_truncate_log_flush(osb, 0); + /* No need to queue up our truncate_log as regular cleanup will catch + * that */ + ocfs2_queue_recovery_completion(journal, osb->slot_num, + osb->local_alloc_copy, NULL, NULL); + ocfs2_schedule_truncate_log_flush(osb, 0); - osb->local_alloc_copy = NULL; - osb->dirty = 0; - } + osb->local_alloc_copy = NULL; + osb->dirty = 0; + + /* queue to recover orphan slots for all offline slots */ + ocfs2_replay_map_set_state(osb, REPLAY_NEEDED); + ocfs2_queue_replay_slots(osb); + ocfs2_free_replay_slots(osb); } void ocfs2_complete_quota_recovery(struct ocfs2_super *osb) @@ -1236,6 +1350,14 @@ restart: goto bail; } + status = ocfs2_compute_replay_slots(osb); + if (status < 0) + mlog_errno(status); + + /* queue recovery for our own slot */ + ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL, + NULL, NULL); + spin_lock(&osb->osb_lock); while (rm->rm_used) { /* It's always safe to remove entry zero, as we won't @@ -1301,11 +1423,8 @@ skip_recovery: ocfs2_super_unlock(osb, 1); - /* We always run recovery on our own orphan dir - the dead - * node(s) may have disallowd a previos inode delete. Re-processing - * is therefore required. */ - ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL, - NULL, NULL); + /* queue recovery for offline slots */ + ocfs2_queue_replay_slots(osb); bail: mutex_lock(&osb->recovery_lock); @@ -1314,6 +1433,7 @@ bail: goto restart; } + ocfs2_free_replay_slots(osb); osb->recovery_thread_task = NULL; mb(); /* sync with ocfs2_recovery_thread_running */ wake_up(&osb->recovery_event); @@ -1465,6 +1585,9 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb, goto done; } + /* we need to run complete recovery for offline orphan slots */ + ocfs2_replay_map_set_state(osb, REPLAY_NEEDED); + mlog(ML_NOTICE, "Recovering node %d from slot %d on device (%u,%u)\n", node_num, slot_num, MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev)); diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index 172850a9a12a..619dd7f6c053 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -38,6 +38,17 @@ enum ocfs2_journal_state { struct ocfs2_super; struct ocfs2_dinode; +/* + * The recovery_list is a simple linked list of node numbers to recover. + * It is protected by the recovery_lock. + */ + +struct ocfs2_recovery_map { + unsigned int rm_used; + unsigned int *rm_entries; +}; + + struct ocfs2_journal { enum ocfs2_journal_state j_state; /* Journals current state */ @@ -139,6 +150,7 @@ void ocfs2_wait_for_recovery(struct ocfs2_super *osb); int ocfs2_recovery_init(struct ocfs2_super *osb); void ocfs2_recovery_exit(struct ocfs2_super *osb); +int ocfs2_compute_replay_slots(struct ocfs2_super *osb); /* * Journal Control: * Initialize, Load, Shutdown, Wipe a journal. @@ -266,6 +278,12 @@ int ocfs2_journal_access_dq(handle_t *handle, struct inode *inode, /* dirblock */ int ocfs2_journal_access_db(handle_t *handle, struct inode *inode, struct buffer_head *bh, int type); +/* ocfs2_dx_root_block */ +int ocfs2_journal_access_dr(handle_t *handle, struct inode *inode, + struct buffer_head *bh, int type); +/* ocfs2_dx_leaf */ +int ocfs2_journal_access_dl(handle_t *handle, struct inode *inode, + struct buffer_head *bh, int type); /* Anything that has no ecc */ int ocfs2_journal_access(handle_t *handle, struct inode *inode, struct buffer_head *bh, int type); @@ -368,14 +386,29 @@ static inline int ocfs2_remove_extent_credits(struct super_block *sb) } /* data block for new dir/symlink, 2 for bitmap updates (bitmap fe + - * bitmap block for the new bit) */ -#define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + 2) + * bitmap block for the new bit) dx_root update for free list */ +#define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + 2 + 1) + +static inline int ocfs2_add_dir_index_credits(struct super_block *sb) +{ + /* 1 block for index, 2 allocs (data, metadata), 1 clusters + * worth of blocks for initial extent. */ + return 1 + 2 * OCFS2_SUBALLOC_ALLOC + + ocfs2_clusters_to_blocks(sb, 1); +} -/* parent fe, parent block, new file entry, inode alloc fe, inode alloc - * group descriptor + mkdir/symlink blocks + quota update */ -static inline int ocfs2_mknod_credits(struct super_block *sb) +/* parent fe, parent block, new file entry, index leaf, inode alloc fe, inode + * alloc group descriptor + mkdir/symlink blocks + dir blocks + xattr + * blocks + quota update */ +static inline int ocfs2_mknod_credits(struct super_block *sb, int is_dir, + int xattr_credits) { - return 3 + OCFS2_SUBALLOC_ALLOC + OCFS2_DIR_LINK_ADDITIONAL_CREDITS + + int dir_credits = OCFS2_DIR_LINK_ADDITIONAL_CREDITS; + + if (is_dir) + dir_credits += ocfs2_add_dir_index_credits(sb); + + return 4 + OCFS2_SUBALLOC_ALLOC + dir_credits + xattr_credits + ocfs2_quota_trans_credits(sb); } @@ -388,31 +421,31 @@ static inline int ocfs2_mknod_credits(struct super_block *sb) #define OCFS2_SIMPLE_DIR_EXTEND_CREDITS (2) /* file update (nlink, etc) + directory mtime/ctime + dir entry block + quota - * update on dir */ + * update on dir + index leaf + dx root update for free list */ static inline int ocfs2_link_credits(struct super_block *sb) { - return 2*OCFS2_INODE_UPDATE_CREDITS + 1 + + return 2*OCFS2_INODE_UPDATE_CREDITS + 3 + ocfs2_quota_trans_credits(sb); } /* inode + dir inode (if we unlink a dir), + dir entry block + orphan - * dir inode link */ + * dir inode link + dir inode index leaf + dir index root */ static inline int ocfs2_unlink_credits(struct super_block *sb) { /* The quota update from ocfs2_link_credits is unused here... */ - return 2 * OCFS2_INODE_UPDATE_CREDITS + 1 + ocfs2_link_credits(sb); + return 2 * OCFS2_INODE_UPDATE_CREDITS + 3 + ocfs2_link_credits(sb); } /* dinode + orphan dir dinode + inode alloc dinode + orphan dir entry + - * inode alloc group descriptor */ -#define OCFS2_DELETE_INODE_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 1 + 1) + * inode alloc group descriptor + orphan dir index leaf */ +#define OCFS2_DELETE_INODE_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 3) /* dinode update, old dir dinode update, new dir dinode update, old * dir dir entry, new dir dir entry, dir entry update for renaming - * directory + target unlink */ + * directory + target unlink + 3 x dir index leaves */ static inline int ocfs2_rename_credits(struct super_block *sb) { - return 3 * OCFS2_INODE_UPDATE_CREDITS + 3 + ocfs2_unlink_credits(sb); + return 3 * OCFS2_INODE_UPDATE_CREDITS + 6 + ocfs2_unlink_credits(sb); } /* global bitmap dinode, group desc., relinked group, @@ -422,6 +455,20 @@ static inline int ocfs2_rename_credits(struct super_block *sb) + OCFS2_INODE_UPDATE_CREDITS \ + OCFS2_XATTR_BLOCK_UPDATE_CREDITS) +/* inode update, removal of dx root block from allocator */ +#define OCFS2_DX_ROOT_REMOVE_CREDITS (OCFS2_INODE_UPDATE_CREDITS + \ + OCFS2_SUBALLOC_FREE) + +static inline int ocfs2_calc_dxi_expand_credits(struct super_block *sb) +{ + int credits = 1 + OCFS2_SUBALLOC_ALLOC; + + credits += ocfs2_clusters_to_blocks(sb, 1); + credits += ocfs2_quota_trans_credits(sb); + + return credits; +} + /* * Please note that the caller must make sure that root_el is the root * of extent tree. So for an inode, it should be &fe->id2.i_list. Otherwise @@ -457,7 +504,7 @@ static inline int ocfs2_calc_extend_credits(struct super_block *sb, static inline int ocfs2_calc_symlink_credits(struct super_block *sb) { - int blocks = ocfs2_mknod_credits(sb); + int blocks = ocfs2_mknod_credits(sb, 0, 0); /* links can be longer than one block so we may update many * within our single allocated extent. */ diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index ec70cdbe77fc..bac7e6abaf47 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -28,7 +28,6 @@ #include <linux/slab.h> #include <linux/highmem.h> #include <linux/bitops.h> -#include <linux/debugfs.h> #define MLOG_MASK_PREFIX ML_DISK_ALLOC #include <cluster/masklog.h> @@ -75,84 +74,6 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb, static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb, struct inode *local_alloc_inode); -#ifdef CONFIG_OCFS2_FS_STATS - -static int ocfs2_la_debug_open(struct inode *inode, struct file *file) -{ - file->private_data = inode->i_private; - return 0; -} - -#define LA_DEBUG_BUF_SZ PAGE_CACHE_SIZE -#define LA_DEBUG_VER 1 -static ssize_t ocfs2_la_debug_read(struct file *file, char __user *userbuf, - size_t count, loff_t *ppos) -{ - static DEFINE_MUTEX(la_debug_mutex); - struct ocfs2_super *osb = file->private_data; - int written, ret; - char *buf = osb->local_alloc_debug_buf; - - mutex_lock(&la_debug_mutex); - memset(buf, 0, LA_DEBUG_BUF_SZ); - - written = snprintf(buf, LA_DEBUG_BUF_SZ, - "0x%x\t0x%llx\t%u\t%u\t0x%x\n", - LA_DEBUG_VER, - (unsigned long long)osb->la_last_gd, - osb->local_alloc_default_bits, - osb->local_alloc_bits, osb->local_alloc_state); - - ret = simple_read_from_buffer(userbuf, count, ppos, buf, written); - - mutex_unlock(&la_debug_mutex); - return ret; -} - -static const struct file_operations ocfs2_la_debug_fops = { - .open = ocfs2_la_debug_open, - .read = ocfs2_la_debug_read, -}; - -static void ocfs2_init_la_debug(struct ocfs2_super *osb) -{ - osb->local_alloc_debug_buf = kmalloc(LA_DEBUG_BUF_SZ, GFP_NOFS); - if (!osb->local_alloc_debug_buf) - return; - - osb->local_alloc_debug = debugfs_create_file("local_alloc_stats", - S_IFREG|S_IRUSR, - osb->osb_debug_root, - osb, - &ocfs2_la_debug_fops); - if (!osb->local_alloc_debug) { - kfree(osb->local_alloc_debug_buf); - osb->local_alloc_debug_buf = NULL; - } -} - -static void ocfs2_shutdown_la_debug(struct ocfs2_super *osb) -{ - if (osb->local_alloc_debug) - debugfs_remove(osb->local_alloc_debug); - - if (osb->local_alloc_debug_buf) - kfree(osb->local_alloc_debug_buf); - - osb->local_alloc_debug_buf = NULL; - osb->local_alloc_debug = NULL; -} -#else /* CONFIG_OCFS2_FS_STATS */ -static void ocfs2_init_la_debug(struct ocfs2_super *osb) -{ - return; -} -static void ocfs2_shutdown_la_debug(struct ocfs2_super *osb) -{ - return; -} -#endif - static inline int ocfs2_la_state_enabled(struct ocfs2_super *osb) { return (osb->local_alloc_state == OCFS2_LA_THROTTLED || @@ -226,8 +147,6 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb) mlog_entry_void(); - ocfs2_init_la_debug(osb); - if (osb->local_alloc_bits == 0) goto bail; @@ -299,9 +218,6 @@ bail: if (inode) iput(inode); - if (status < 0) - ocfs2_shutdown_la_debug(osb); - mlog(0, "Local alloc window bits = %d\n", osb->local_alloc_bits); mlog_exit(status); @@ -331,8 +247,6 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb) cancel_delayed_work(&osb->la_enable_wq); flush_workqueue(ocfs2_wq); - ocfs2_shutdown_la_debug(osb); - if (osb->local_alloc_state == OCFS2_LA_UNUSED) goto out; diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 4b11762f249e..2220f93f668b 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -80,14 +80,14 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb, struct inode **ret_orphan_dir, struct inode *inode, char *name, - struct buffer_head **de_bh); + struct ocfs2_dir_lookup_result *lookup); static int ocfs2_orphan_add(struct ocfs2_super *osb, handle_t *handle, struct inode *inode, struct ocfs2_dinode *fe, char *name, - struct buffer_head *de_bh, + struct ocfs2_dir_lookup_result *lookup, struct inode *orphan_dir_inode); static int ocfs2_create_symlink_data(struct ocfs2_super *osb, @@ -228,17 +228,18 @@ static int ocfs2_mknod(struct inode *dir, struct ocfs2_super *osb; struct ocfs2_dinode *dirfe; struct buffer_head *new_fe_bh = NULL; - struct buffer_head *de_bh = NULL; struct inode *inode = NULL; struct ocfs2_alloc_context *inode_ac = NULL; struct ocfs2_alloc_context *data_ac = NULL; - struct ocfs2_alloc_context *xattr_ac = NULL; + struct ocfs2_alloc_context *meta_ac = NULL; int want_clusters = 0; + int want_meta = 0; int xattr_credits = 0; struct ocfs2_security_xattr_info si = { .enable = 1, }; int did_quota_inode = 0; + struct ocfs2_dir_lookup_result lookup = { NULL, }; mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode, (unsigned long)dev, dentry->d_name.len, @@ -254,13 +255,13 @@ static int ocfs2_mknod(struct inode *dir, return status; } - if (S_ISDIR(mode) && (dir->i_nlink >= OCFS2_LINK_MAX)) { + if (S_ISDIR(mode) && (dir->i_nlink >= ocfs2_link_max(osb))) { status = -EMLINK; goto leave; } dirfe = (struct ocfs2_dinode *) parent_fe_bh->b_data; - if (!dirfe->i_links_count) { + if (!ocfs2_read_links_count(dirfe)) { /* can't make a file in a deleted directory. */ status = -ENOENT; goto leave; @@ -274,7 +275,7 @@ static int ocfs2_mknod(struct inode *dir, /* get a spot inside the dir. */ status = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh, dentry->d_name.name, - dentry->d_name.len, &de_bh); + dentry->d_name.len, &lookup); if (status < 0) { mlog_errno(status); goto leave; @@ -308,17 +309,29 @@ static int ocfs2_mknod(struct inode *dir, /* calculate meta data/clusters for setting security and acl xattr */ status = ocfs2_calc_xattr_init(dir, parent_fe_bh, mode, - &si, &want_clusters, - &xattr_credits, &xattr_ac); + &si, &want_clusters, + &xattr_credits, &want_meta); if (status < 0) { mlog_errno(status); goto leave; } /* Reserve a cluster if creating an extent based directory. */ - if (S_ISDIR(mode) && !ocfs2_supports_inline_data(osb)) + if (S_ISDIR(mode) && !ocfs2_supports_inline_data(osb)) { want_clusters += 1; + /* Dir indexing requires extra space as well */ + if (ocfs2_supports_indexed_dirs(osb)) + want_meta++; + } + + status = ocfs2_reserve_new_metadata_blocks(osb, want_meta, &meta_ac); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto leave; + } + status = ocfs2_reserve_clusters(osb, want_clusters, &data_ac); if (status < 0) { if (status != -ENOSPC) @@ -326,8 +339,9 @@ static int ocfs2_mknod(struct inode *dir, goto leave; } - handle = ocfs2_start_trans(osb, ocfs2_mknod_credits(osb->sb) + - xattr_credits); + handle = ocfs2_start_trans(osb, ocfs2_mknod_credits(osb->sb, + S_ISDIR(mode), + xattr_credits)); if (IS_ERR(handle)) { status = PTR_ERR(handle); handle = NULL; @@ -355,7 +369,7 @@ static int ocfs2_mknod(struct inode *dir, if (S_ISDIR(mode)) { status = ocfs2_fill_new_dir(osb, handle, dir, inode, - new_fe_bh, data_ac); + new_fe_bh, data_ac, meta_ac); if (status < 0) { mlog_errno(status); goto leave; @@ -367,7 +381,7 @@ static int ocfs2_mknod(struct inode *dir, mlog_errno(status); goto leave; } - le16_add_cpu(&dirfe->i_links_count, 1); + ocfs2_add_links_count(dirfe, 1); status = ocfs2_journal_dirty(handle, parent_fe_bh); if (status < 0) { mlog_errno(status); @@ -377,7 +391,7 @@ static int ocfs2_mknod(struct inode *dir, } status = ocfs2_init_acl(handle, inode, dir, new_fe_bh, parent_fe_bh, - xattr_ac, data_ac); + meta_ac, data_ac); if (status < 0) { mlog_errno(status); goto leave; @@ -385,7 +399,7 @@ static int ocfs2_mknod(struct inode *dir, if (si.enable) { status = ocfs2_init_security_set(handle, inode, new_fe_bh, &si, - xattr_ac, data_ac); + meta_ac, data_ac); if (status < 0) { mlog_errno(status); goto leave; @@ -394,7 +408,7 @@ static int ocfs2_mknod(struct inode *dir, status = ocfs2_add_entry(handle, dentry, inode, OCFS2_I(inode)->ip_blkno, parent_fe_bh, - de_bh); + &lookup); if (status < 0) { mlog_errno(status); goto leave; @@ -423,11 +437,12 @@ leave: mlog(0, "Disk is full\n"); brelse(new_fe_bh); - brelse(de_bh); brelse(parent_fe_bh); kfree(si.name); kfree(si.value); + ocfs2_free_dir_lookup_result(&lookup); + if ((status < 0) && inode) { clear_nlink(inode); iput(inode); @@ -439,8 +454,8 @@ leave: if (data_ac) ocfs2_free_alloc_context(data_ac); - if (xattr_ac) - ocfs2_free_alloc_context(xattr_ac); + if (meta_ac) + ocfs2_free_alloc_context(meta_ac); mlog_exit(status); @@ -462,6 +477,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb, struct ocfs2_extent_list *fel; u64 fe_blkno = 0; u16 suballoc_bit; + u16 feat; mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, inode->i_mode, (unsigned long)dev, dentry->d_name.len, @@ -469,8 +485,8 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb, *new_fe_bh = NULL; - status = ocfs2_claim_new_inode(osb, handle, inode_ac, &suballoc_bit, - &fe_blkno); + status = ocfs2_claim_new_inode(osb, handle, dir, parent_fe_bh, + inode_ac, &suballoc_bit, &fe_blkno); if (status < 0) { mlog_errno(status); goto leave; @@ -513,7 +529,8 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb, fe->i_mode = cpu_to_le16(inode->i_mode); if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) fe->id1.dev1.i_rdev = cpu_to_le64(huge_encode_dev(dev)); - fe->i_links_count = cpu_to_le16(inode->i_nlink); + + ocfs2_set_links_count(fe, inode->i_nlink); fe->i_last_eb_blk = 0; strcpy(fe->i_signature, OCFS2_INODE_SIGNATURE); @@ -525,11 +542,11 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb, fe->i_dtime = 0; /* - * If supported, directories start with inline data. + * If supported, directories start with inline data. If inline + * isn't supported, but indexing is, we start them as indexed. */ + feat = le16_to_cpu(fe->i_dyn_features); if (S_ISDIR(inode->i_mode) && ocfs2_supports_inline_data(osb)) { - u16 feat = le16_to_cpu(fe->i_dyn_features); - fe->i_dyn_features = cpu_to_le16(feat | OCFS2_INLINE_DATA_FL); fe->id2.i_data.id_count = cpu_to_le16( @@ -608,9 +625,9 @@ static int ocfs2_link(struct dentry *old_dentry, int err; struct buffer_head *fe_bh = NULL; struct buffer_head *parent_fe_bh = NULL; - struct buffer_head *de_bh = NULL; struct ocfs2_dinode *fe = NULL; struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); + struct ocfs2_dir_lookup_result lookup = { NULL, }; mlog_entry("(inode=%lu, old='%.*s' new='%.*s')\n", inode->i_ino, old_dentry->d_name.len, old_dentry->d_name.name, @@ -638,7 +655,7 @@ static int ocfs2_link(struct dentry *old_dentry, err = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh, dentry->d_name.name, - dentry->d_name.len, &de_bh); + dentry->d_name.len, &lookup); if (err < 0) { mlog_errno(err); goto out; @@ -652,7 +669,7 @@ static int ocfs2_link(struct dentry *old_dentry, } fe = (struct ocfs2_dinode *) fe_bh->b_data; - if (le16_to_cpu(fe->i_links_count) >= OCFS2_LINK_MAX) { + if (ocfs2_read_links_count(fe) >= ocfs2_link_max(osb)) { err = -EMLINK; goto out_unlock_inode; } @@ -674,13 +691,13 @@ static int ocfs2_link(struct dentry *old_dentry, inc_nlink(inode); inode->i_ctime = CURRENT_TIME; - fe->i_links_count = cpu_to_le16(inode->i_nlink); + ocfs2_set_links_count(fe, inode->i_nlink); fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); err = ocfs2_journal_dirty(handle, fe_bh); if (err < 0) { - le16_add_cpu(&fe->i_links_count, -1); + ocfs2_add_links_count(fe, -1); drop_nlink(inode); mlog_errno(err); goto out_commit; @@ -688,9 +705,9 @@ static int ocfs2_link(struct dentry *old_dentry, err = ocfs2_add_entry(handle, dentry, inode, OCFS2_I(inode)->ip_blkno, - parent_fe_bh, de_bh); + parent_fe_bh, &lookup); if (err) { - le16_add_cpu(&fe->i_links_count, -1); + ocfs2_add_links_count(fe, -1); drop_nlink(inode); mlog_errno(err); goto out_commit; @@ -714,10 +731,11 @@ out_unlock_inode: out: ocfs2_inode_unlock(dir, 1); - brelse(de_bh); brelse(fe_bh); brelse(parent_fe_bh); + ocfs2_free_dir_lookup_result(&lookup); + mlog_exit(err); return err; @@ -766,10 +784,9 @@ static int ocfs2_unlink(struct inode *dir, struct buffer_head *fe_bh = NULL; struct buffer_head *parent_node_bh = NULL; handle_t *handle = NULL; - struct ocfs2_dir_entry *dirent = NULL; - struct buffer_head *dirent_bh = NULL; char orphan_name[OCFS2_ORPHAN_NAMELEN + 1]; - struct buffer_head *orphan_entry_bh = NULL; + struct ocfs2_dir_lookup_result lookup = { NULL, }; + struct ocfs2_dir_lookup_result orphan_insert = { NULL, }; mlog_entry("(0x%p, 0x%p, '%.*s')\n", dir, dentry, dentry->d_name.len, dentry->d_name.name); @@ -791,8 +808,8 @@ static int ocfs2_unlink(struct inode *dir, } status = ocfs2_find_files_on_disk(dentry->d_name.name, - dentry->d_name.len, &blkno, - dir, &dirent_bh, &dirent); + dentry->d_name.len, &blkno, dir, + &lookup); if (status < 0) { if (status != -ENOENT) mlog_errno(status); @@ -817,10 +834,7 @@ static int ocfs2_unlink(struct inode *dir, child_locked = 1; if (S_ISDIR(inode->i_mode)) { - if (!ocfs2_empty_dir(inode)) { - status = -ENOTEMPTY; - goto leave; - } else if (inode->i_nlink != 2) { + if (inode->i_nlink != 2 || !ocfs2_empty_dir(inode)) { status = -ENOTEMPTY; goto leave; } @@ -836,8 +850,7 @@ static int ocfs2_unlink(struct inode *dir, if (inode_is_unlinkable(inode)) { status = ocfs2_prepare_orphan_dir(osb, &orphan_dir, inode, - orphan_name, - &orphan_entry_bh); + orphan_name, &orphan_insert); if (status < 0) { mlog_errno(status); goto leave; @@ -863,7 +876,7 @@ static int ocfs2_unlink(struct inode *dir, if (inode_is_unlinkable(inode)) { status = ocfs2_orphan_add(osb, handle, inode, fe, orphan_name, - orphan_entry_bh, orphan_dir); + &orphan_insert, orphan_dir); if (status < 0) { mlog_errno(status); goto leave; @@ -871,7 +884,7 @@ static int ocfs2_unlink(struct inode *dir, } /* delete the name from the parent dir */ - status = ocfs2_delete_entry(handle, dir, dirent, dirent_bh); + status = ocfs2_delete_entry(handle, dir, &lookup); if (status < 0) { mlog_errno(status); goto leave; @@ -880,7 +893,7 @@ static int ocfs2_unlink(struct inode *dir, if (S_ISDIR(inode->i_mode)) drop_nlink(inode); drop_nlink(inode); - fe->i_links_count = cpu_to_le16(inode->i_nlink); + ocfs2_set_links_count(fe, inode->i_nlink); status = ocfs2_journal_dirty(handle, fe_bh); if (status < 0) { @@ -916,9 +929,10 @@ leave: } brelse(fe_bh); - brelse(dirent_bh); brelse(parent_node_bh); - brelse(orphan_entry_bh); + + ocfs2_free_dir_lookup_result(&orphan_insert); + ocfs2_free_dir_lookup_result(&lookup); mlog_exit(status); @@ -1004,8 +1018,8 @@ static int ocfs2_rename(struct inode *old_dir, struct inode *new_dir, struct dentry *new_dentry) { - int status = 0, rename_lock = 0, parents_locked = 0; - int old_child_locked = 0, new_child_locked = 0; + int status = 0, rename_lock = 0, parents_locked = 0, target_exists = 0; + int old_child_locked = 0, new_child_locked = 0, update_dot_dot = 0; struct inode *old_inode = old_dentry->d_inode; struct inode *new_inode = new_dentry->d_inode; struct inode *orphan_dir = NULL; @@ -1020,13 +1034,13 @@ static int ocfs2_rename(struct inode *old_dir, handle_t *handle = NULL; struct buffer_head *old_dir_bh = NULL; struct buffer_head *new_dir_bh = NULL; - struct ocfs2_dir_entry *old_inode_dot_dot_de = NULL, *old_de = NULL, - *new_de = NULL; - struct buffer_head *new_de_bh = NULL, *old_de_bh = NULL; // bhs for above - struct buffer_head *old_inode_de_bh = NULL; // if old_dentry is a dir, - // this is the 1st dirent bh nlink_t old_dir_nlink = old_dir->i_nlink; struct ocfs2_dinode *old_di; + struct ocfs2_dir_lookup_result old_inode_dot_dot_res = { NULL, }; + struct ocfs2_dir_lookup_result target_lookup_res = { NULL, }; + struct ocfs2_dir_lookup_result old_entry_lookup = { NULL, }; + struct ocfs2_dir_lookup_result orphan_insert = { NULL, }; + struct ocfs2_dir_lookup_result target_insert = { NULL, }; /* At some point it might be nice to break this function up a * bit. */ @@ -1108,9 +1122,10 @@ static int ocfs2_rename(struct inode *old_dir, if (S_ISDIR(old_inode->i_mode)) { u64 old_inode_parent; + update_dot_dot = 1; status = ocfs2_find_files_on_disk("..", 2, &old_inode_parent, - old_inode, &old_inode_de_bh, - &old_inode_dot_dot_de); + old_inode, + &old_inode_dot_dot_res); if (status) { status = -EIO; goto bail; @@ -1122,7 +1137,7 @@ static int ocfs2_rename(struct inode *old_dir, } if (!new_inode && new_dir != old_dir && - new_dir->i_nlink >= OCFS2_LINK_MAX) { + new_dir->i_nlink >= ocfs2_link_max(osb)) { status = -EMLINK; goto bail; } @@ -1151,8 +1166,8 @@ static int ocfs2_rename(struct inode *old_dir, * to delete it */ status = ocfs2_find_files_on_disk(new_dentry->d_name.name, new_dentry->d_name.len, - &newfe_blkno, new_dir, &new_de_bh, - &new_de); + &newfe_blkno, new_dir, + &target_lookup_res); /* The only error we allow here is -ENOENT because the new * file not existing is perfectly valid. */ if ((status < 0) && (status != -ENOENT)) { @@ -1161,8 +1176,10 @@ static int ocfs2_rename(struct inode *old_dir, mlog_errno(status); goto bail; } + if (status == 0) + target_exists = 1; - if (!new_de && new_inode) { + if (!target_exists && new_inode) { /* * Target was unlinked by another node while we were * waiting to get to ocfs2_rename(). There isn't @@ -1175,7 +1192,7 @@ static int ocfs2_rename(struct inode *old_dir, /* In case we need to overwrite an existing file, we blow it * away first */ - if (new_de) { + if (target_exists) { /* VFS didn't think there existed an inode here, but * someone else in the cluster must have raced our * rename to create one. Today we error cleanly, in @@ -1216,8 +1233,8 @@ static int ocfs2_rename(struct inode *old_dir, newfe = (struct ocfs2_dinode *) newfe_bh->b_data; - mlog(0, "aha rename over existing... new_de=%p new_blkno=%llu " - "newfebh=%p bhblocknr=%llu\n", new_de, + mlog(0, "aha rename over existing... new_blkno=%llu " + "newfebh=%p bhblocknr=%llu\n", (unsigned long long)newfe_blkno, newfe_bh, newfe_bh ? (unsigned long long)newfe_bh->b_blocknr : 0ULL); @@ -1225,7 +1242,7 @@ static int ocfs2_rename(struct inode *old_dir, status = ocfs2_prepare_orphan_dir(osb, &orphan_dir, new_inode, orphan_name, - &orphan_entry_bh); + &orphan_insert); if (status < 0) { mlog_errno(status); goto bail; @@ -1243,7 +1260,7 @@ static int ocfs2_rename(struct inode *old_dir, status = ocfs2_prepare_dir_for_insert(osb, new_dir, new_dir_bh, new_dentry->d_name.name, new_dentry->d_name.len, - &insert_entry_bh); + &target_insert); if (status < 0) { mlog_errno(status); goto bail; @@ -1258,10 +1275,10 @@ static int ocfs2_rename(struct inode *old_dir, goto bail; } - if (new_de) { + if (target_exists) { if (S_ISDIR(new_inode->i_mode)) { - if (!ocfs2_empty_dir(new_inode) || - new_inode->i_nlink != 2) { + if (new_inode->i_nlink != 2 || + !ocfs2_empty_dir(new_inode)) { status = -ENOTEMPTY; goto bail; } @@ -1274,10 +1291,10 @@ static int ocfs2_rename(struct inode *old_dir, } if (S_ISDIR(new_inode->i_mode) || - (newfe->i_links_count == cpu_to_le16(1))){ + (ocfs2_read_links_count(newfe) == 1)) { status = ocfs2_orphan_add(osb, handle, new_inode, newfe, orphan_name, - orphan_entry_bh, orphan_dir); + &orphan_insert, orphan_dir); if (status < 0) { mlog_errno(status); goto bail; @@ -1285,8 +1302,8 @@ static int ocfs2_rename(struct inode *old_dir, } /* change the dirent to point to the correct inode */ - status = ocfs2_update_entry(new_dir, handle, new_de_bh, - new_de, old_inode); + status = ocfs2_update_entry(new_dir, handle, &target_lookup_res, + old_inode); if (status < 0) { mlog_errno(status); goto bail; @@ -1294,9 +1311,9 @@ static int ocfs2_rename(struct inode *old_dir, new_dir->i_version++; if (S_ISDIR(new_inode->i_mode)) - newfe->i_links_count = 0; + ocfs2_set_links_count(newfe, 0); else - le16_add_cpu(&newfe->i_links_count, -1); + ocfs2_add_links_count(newfe, -1); status = ocfs2_journal_dirty(handle, newfe_bh); if (status < 0) { @@ -1307,7 +1324,7 @@ static int ocfs2_rename(struct inode *old_dir, /* if the name was not found in new_dir, add it now */ status = ocfs2_add_entry(handle, new_dentry, old_inode, OCFS2_I(old_inode)->ip_blkno, - new_dir_bh, insert_entry_bh); + new_dir_bh, &target_insert); } old_inode->i_ctime = CURRENT_TIME; @@ -1334,15 +1351,13 @@ static int ocfs2_rename(struct inode *old_dir, * because the insert might have changed the type of directory * we're dealing with. */ - old_de_bh = ocfs2_find_entry(old_dentry->d_name.name, - old_dentry->d_name.len, - old_dir, &old_de); - if (!old_de_bh) { - status = -EIO; + status = ocfs2_find_entry(old_dentry->d_name.name, + old_dentry->d_name.len, old_dir, + &old_entry_lookup); + if (status) goto bail; - } - status = ocfs2_delete_entry(handle, old_dir, old_de, old_de_bh); + status = ocfs2_delete_entry(handle, old_dir, &old_entry_lookup); if (status < 0) { mlog_errno(status); goto bail; @@ -1353,9 +1368,10 @@ static int ocfs2_rename(struct inode *old_dir, new_inode->i_ctime = CURRENT_TIME; } old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME; - if (old_inode_de_bh) { - status = ocfs2_update_entry(old_inode, handle, old_inode_de_bh, - old_inode_dot_dot_de, new_dir); + + if (update_dot_dot) { + status = ocfs2_update_entry(old_inode, handle, + &old_inode_dot_dot_res, new_dir); old_dir->i_nlink--; if (new_inode) { new_inode->i_nlink--; @@ -1391,14 +1407,13 @@ static int ocfs2_rename(struct inode *old_dir, } else { struct ocfs2_dinode *fe; status = ocfs2_journal_access_di(handle, old_dir, - old_dir_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + old_dir_bh, + OCFS2_JOURNAL_ACCESS_WRITE); fe = (struct ocfs2_dinode *) old_dir_bh->b_data; - fe->i_links_count = cpu_to_le16(old_dir->i_nlink); + ocfs2_set_links_count(fe, old_dir->i_nlink); status = ocfs2_journal_dirty(handle, old_dir_bh); } } - ocfs2_dentry_move(old_dentry, new_dentry, old_dir, new_dir); status = 0; bail: @@ -1429,13 +1444,17 @@ bail: if (new_inode) iput(new_inode); + + ocfs2_free_dir_lookup_result(&target_lookup_res); + ocfs2_free_dir_lookup_result(&old_entry_lookup); + ocfs2_free_dir_lookup_result(&old_inode_dot_dot_res); + ocfs2_free_dir_lookup_result(&orphan_insert); + ocfs2_free_dir_lookup_result(&target_insert); + brelse(newfe_bh); brelse(old_inode_bh); brelse(old_dir_bh); brelse(new_dir_bh); - brelse(new_de_bh); - brelse(old_de_bh); - brelse(old_inode_de_bh); brelse(orphan_entry_bh); brelse(insert_entry_bh); @@ -1558,7 +1577,6 @@ static int ocfs2_symlink(struct inode *dir, struct inode *inode = NULL; struct super_block *sb; struct buffer_head *new_fe_bh = NULL; - struct buffer_head *de_bh = NULL; struct buffer_head *parent_fe_bh = NULL; struct ocfs2_dinode *fe = NULL; struct ocfs2_dinode *dirfe; @@ -1572,6 +1590,7 @@ static int ocfs2_symlink(struct inode *dir, .enable = 1, }; int did_quota = 0, did_quota_inode = 0; + struct ocfs2_dir_lookup_result lookup = { NULL, }; mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir, dentry, symname, dentry->d_name.len, dentry->d_name.name); @@ -1592,7 +1611,7 @@ static int ocfs2_symlink(struct inode *dir, } dirfe = (struct ocfs2_dinode *) parent_fe_bh->b_data; - if (!dirfe->i_links_count) { + if (!ocfs2_read_links_count(dirfe)) { /* can't make a file in a deleted directory. */ status = -ENOENT; goto bail; @@ -1605,7 +1624,7 @@ static int ocfs2_symlink(struct inode *dir, status = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh, dentry->d_name.name, - dentry->d_name.len, &de_bh); + dentry->d_name.len, &lookup); if (status < 0) { mlog_errno(status); goto bail; @@ -1744,7 +1763,7 @@ static int ocfs2_symlink(struct inode *dir, status = ocfs2_add_entry(handle, dentry, inode, le64_to_cpu(fe->i_blkno), parent_fe_bh, - de_bh); + &lookup); if (status < 0) { mlog_errno(status); goto bail; @@ -1772,9 +1791,9 @@ bail: brelse(new_fe_bh); brelse(parent_fe_bh); - brelse(de_bh); kfree(si.name); kfree(si.value); + ocfs2_free_dir_lookup_result(&lookup); if (inode_ac) ocfs2_free_alloc_context(inode_ac); if (data_ac) @@ -1826,7 +1845,7 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb, struct inode **ret_orphan_dir, struct inode *inode, char *name, - struct buffer_head **de_bh) + struct ocfs2_dir_lookup_result *lookup) { struct inode *orphan_dir_inode; struct buffer_head *orphan_dir_bh = NULL; @@ -1857,7 +1876,7 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb, status = ocfs2_prepare_dir_for_insert(osb, orphan_dir_inode, orphan_dir_bh, name, - OCFS2_ORPHAN_NAMELEN, de_bh); + OCFS2_ORPHAN_NAMELEN, lookup); if (status < 0) { ocfs2_inode_unlock(orphan_dir_inode, 1); @@ -1884,7 +1903,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb, struct inode *inode, struct ocfs2_dinode *fe, char *name, - struct buffer_head *de_bh, + struct ocfs2_dir_lookup_result *lookup, struct inode *orphan_dir_inode) { struct buffer_head *orphan_dir_bh = NULL; @@ -1910,8 +1929,8 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb, * underneath us... */ orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data; if (S_ISDIR(inode->i_mode)) - le16_add_cpu(&orphan_fe->i_links_count, 1); - orphan_dir_inode->i_nlink = le16_to_cpu(orphan_fe->i_links_count); + ocfs2_add_links_count(orphan_fe, 1); + orphan_dir_inode->i_nlink = ocfs2_read_links_count(orphan_fe); status = ocfs2_journal_dirty(handle, orphan_dir_bh); if (status < 0) { @@ -1922,7 +1941,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb, status = __ocfs2_add_entry(handle, orphan_dir_inode, name, OCFS2_ORPHAN_NAMELEN, inode, OCFS2_I(inode)->ip_blkno, - orphan_dir_bh, de_bh); + orphan_dir_bh, lookup); if (status < 0) { mlog_errno(status); goto leave; @@ -1955,8 +1974,7 @@ int ocfs2_orphan_del(struct ocfs2_super *osb, char name[OCFS2_ORPHAN_NAMELEN + 1]; struct ocfs2_dinode *orphan_fe; int status = 0; - struct buffer_head *target_de_bh = NULL; - struct ocfs2_dir_entry *target_de = NULL; + struct ocfs2_dir_lookup_result lookup = { NULL, }; mlog_entry_void(); @@ -1971,17 +1989,15 @@ int ocfs2_orphan_del(struct ocfs2_super *osb, OCFS2_ORPHAN_NAMELEN); /* find it's spot in the orphan directory */ - target_de_bh = ocfs2_find_entry(name, OCFS2_ORPHAN_NAMELEN, - orphan_dir_inode, &target_de); - if (!target_de_bh) { - status = -ENOENT; + status = ocfs2_find_entry(name, OCFS2_ORPHAN_NAMELEN, orphan_dir_inode, + &lookup); + if (status) { mlog_errno(status); goto leave; } /* remove it from the orphan directory */ - status = ocfs2_delete_entry(handle, orphan_dir_inode, target_de, - target_de_bh); + status = ocfs2_delete_entry(handle, orphan_dir_inode, &lookup); if (status < 0) { mlog_errno(status); goto leave; @@ -1997,8 +2013,8 @@ int ocfs2_orphan_del(struct ocfs2_super *osb, /* do the i_nlink dance! :) */ orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data; if (S_ISDIR(inode->i_mode)) - le16_add_cpu(&orphan_fe->i_links_count, -1); - orphan_dir_inode->i_nlink = le16_to_cpu(orphan_fe->i_links_count); + ocfs2_add_links_count(orphan_fe, -1); + orphan_dir_inode->i_nlink = ocfs2_read_links_count(orphan_fe); status = ocfs2_journal_dirty(handle, orphan_dir_bh); if (status < 0) { @@ -2007,7 +2023,7 @@ int ocfs2_orphan_del(struct ocfs2_super *osb, } leave: - brelse(target_de_bh); + ocfs2_free_dir_lookup_result(&lookup); mlog_exit(status); return status; diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 946d3c34b90b..1386281950db 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -209,6 +209,7 @@ enum ocfs2_mount_options struct ocfs2_journal; struct ocfs2_slot_info; struct ocfs2_recovery_map; +struct ocfs2_replay_map; struct ocfs2_quota_recovery; struct ocfs2_dentry_lock; struct ocfs2_super @@ -264,6 +265,7 @@ struct ocfs2_super atomic_t vol_state; struct mutex recovery_lock; struct ocfs2_recovery_map *recovery_map; + struct ocfs2_replay_map *replay_map; struct task_struct *recovery_thread_task; int disable_recovery; wait_queue_head_t checkpoint_event; @@ -287,11 +289,6 @@ struct ocfs2_super u64 la_last_gd; -#ifdef CONFIG_OCFS2_FS_STATS - struct dentry *local_alloc_debug; - char *local_alloc_debug_buf; -#endif - /* Next three fields are for local node slot recovery during * mount. */ int dirty; @@ -305,9 +302,11 @@ struct ocfs2_super struct ocfs2_cluster_connection *cconn; struct ocfs2_lock_res osb_super_lockres; struct ocfs2_lock_res osb_rename_lockres; + struct ocfs2_lock_res osb_nfs_sync_lockres; struct ocfs2_dlm_debug *osb_dlm_debug; struct dentry *osb_debug_root; + struct dentry *osb_ctxt; wait_queue_head_t recovery_event; @@ -344,6 +343,12 @@ struct ocfs2_super /* used to protect metaecc calculation check of xattr. */ spinlock_t osb_xattr_lock; + + unsigned int osb_dx_mask; + u32 osb_dx_seed[4]; + + /* the group we used to allocate inodes. */ + u64 osb_inode_alloc_group; }; #define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info) @@ -402,6 +407,51 @@ static inline int ocfs2_meta_ecc(struct ocfs2_super *osb) return 0; } +static inline int ocfs2_supports_indexed_dirs(struct ocfs2_super *osb) +{ + if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS) + return 1; + return 0; +} + +static inline unsigned int ocfs2_link_max(struct ocfs2_super *osb) +{ + if (ocfs2_supports_indexed_dirs(osb)) + return OCFS2_DX_LINK_MAX; + return OCFS2_LINK_MAX; +} + +static inline unsigned int ocfs2_read_links_count(struct ocfs2_dinode *di) +{ + u32 nlink = le16_to_cpu(di->i_links_count); + u32 hi = le16_to_cpu(di->i_links_count_hi); + + if (di->i_dyn_features & cpu_to_le16(OCFS2_INDEXED_DIR_FL)) + nlink |= (hi << OCFS2_LINKS_HI_SHIFT); + + return nlink; +} + +static inline void ocfs2_set_links_count(struct ocfs2_dinode *di, u32 nlink) +{ + u16 lo, hi; + + lo = nlink; + hi = nlink >> OCFS2_LINKS_HI_SHIFT; + + di->i_links_count = cpu_to_le16(lo); + di->i_links_count_hi = cpu_to_le16(hi); +} + +static inline void ocfs2_add_links_count(struct ocfs2_dinode *di, int n) +{ + u32 links = ocfs2_read_links_count(di); + + links += n; + + ocfs2_set_links_count(di, links); +} + /* set / clear functions because cluster events can make these happen * in parallel so we want the transitions to be atomic. this also * means that any future flags osb_flags must be protected by spinlock @@ -482,6 +532,12 @@ static inline int ocfs2_uses_extended_slot_map(struct ocfs2_super *osb) #define OCFS2_IS_VALID_DIR_TRAILER(ptr) \ (!strcmp((ptr)->db_signature, OCFS2_DIR_TRAILER_SIGNATURE)) +#define OCFS2_IS_VALID_DX_ROOT(ptr) \ + (!strcmp((ptr)->dr_signature, OCFS2_DX_ROOT_SIGNATURE)) + +#define OCFS2_IS_VALID_DX_LEAF(ptr) \ + (!strcmp((ptr)->dl_signature, OCFS2_DX_LEAF_SIGNATURE)) + static inline unsigned long ino_from_blkno(struct super_block *sb, u64 blkno) { @@ -532,6 +588,16 @@ static inline u64 ocfs2_clusters_to_bytes(struct super_block *sb, return (u64)clusters << OCFS2_SB(sb)->s_clustersize_bits; } +static inline u64 ocfs2_block_to_cluster_start(struct super_block *sb, + u64 blocks) +{ + int bits = OCFS2_SB(sb)->s_clustersize_bits - sb->s_blocksize_bits; + unsigned int clusters; + + clusters = ocfs2_blocks_to_clusters(sb, blocks); + return (u64)clusters << bits; +} + static inline u64 ocfs2_align_bytes_to_clusters(struct super_block *sb, u64 bytes) { diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index 2332ef740f4f..7ab6e9e5e77c 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -66,6 +66,8 @@ #define OCFS2_GROUP_DESC_SIGNATURE "GROUP01" #define OCFS2_XATTR_BLOCK_SIGNATURE "XATTR01" #define OCFS2_DIR_TRAILER_SIGNATURE "DIRTRL1" +#define OCFS2_DX_ROOT_SIGNATURE "DXDIR01" +#define OCFS2_DX_LEAF_SIGNATURE "DXLEAF1" /* Compatibility flags */ #define OCFS2_HAS_COMPAT_FEATURE(sb,mask) \ @@ -95,7 +97,8 @@ | OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP \ | OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK \ | OCFS2_FEATURE_INCOMPAT_XATTR \ - | OCFS2_FEATURE_INCOMPAT_META_ECC) + | OCFS2_FEATURE_INCOMPAT_META_ECC \ + | OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS) #define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \ | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \ | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA) @@ -151,6 +154,9 @@ /* Support for extended attributes */ #define OCFS2_FEATURE_INCOMPAT_XATTR 0x0200 +/* Support for indexed directores */ +#define OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS 0x0400 + /* Metadata checksum and error correction */ #define OCFS2_FEATURE_INCOMPAT_META_ECC 0x0800 @@ -411,8 +417,12 @@ static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = { #define OCFS2_DIR_REC_LEN(name_len) (((name_len) + OCFS2_DIR_MEMBER_LEN + \ OCFS2_DIR_ROUND) & \ ~OCFS2_DIR_ROUND) +#define OCFS2_DIR_MIN_REC_LEN OCFS2_DIR_REC_LEN(1) #define OCFS2_LINK_MAX 32000 +#define OCFS2_DX_LINK_MAX ((1U << 31) - 1U) +#define OCFS2_LINKS_HI_SHIFT 16 +#define OCFS2_DX_ENTRIES_MAX (0xffffffffU) #define S_SHIFT 12 static unsigned char ocfs2_type_by_mode[S_IFMT >> S_SHIFT] = { @@ -628,8 +638,9 @@ struct ocfs2_super_block { /*B8*/ __le16 s_xattr_inline_size; /* extended attribute inline size for this fs*/ __le16 s_reserved0; - __le32 s_reserved1; -/*C0*/ __le64 s_reserved2[16]; /* Fill out superblock */ + __le32 s_dx_seed[3]; /* seed[0-2] for dx dir hash. + * s_uuid_hash serves as seed[3]. */ +/*C0*/ __le64 s_reserved2[15]; /* Fill out superblock */ /*140*/ /* @@ -679,7 +690,7 @@ struct ocfs2_dinode { belongs to */ __le16 i_suballoc_bit; /* Bit offset in suballocator block group */ -/*10*/ __le16 i_reserved0; +/*10*/ __le16 i_links_count_hi; /* High 16 bits of links count */ __le16 i_xattr_inline_size; __le32 i_clusters; /* Cluster count */ __le32 i_uid; /* Owner UID */ @@ -705,7 +716,8 @@ struct ocfs2_dinode { __le16 i_dyn_features; __le64 i_xattr_loc; /*80*/ struct ocfs2_block_check i_check; /* Error checking */ -/*88*/ __le64 i_reserved2[6]; +/*88*/ __le64 i_dx_root; /* Pointer to dir index root block */ + __le64 i_reserved2[5]; /*B8*/ union { __le64 i_pad1; /* Generic way to refer to this 64bit union */ @@ -781,6 +793,90 @@ struct ocfs2_dir_block_trailer { /*40*/ }; + /* + * A directory entry in the indexed tree. We don't store the full name here, + * but instead provide a pointer to the full dirent in the unindexed tree. + * + * We also store name_len here so as to reduce the number of leaf blocks we + * need to search in case of collisions. + */ +struct ocfs2_dx_entry { + __le32 dx_major_hash; /* Used to find logical + * cluster in index */ + __le32 dx_minor_hash; /* Lower bits used to find + * block in cluster */ + __le64 dx_dirent_blk; /* Physical block in unindexed + * tree holding this dirent. */ +}; + +struct ocfs2_dx_entry_list { + __le32 de_reserved; + __le16 de_count; /* Maximum number of entries + * possible in de_entries */ + __le16 de_num_used; /* Current number of + * de_entries entries */ + struct ocfs2_dx_entry de_entries[0]; /* Indexed dir entries + * in a packed array of + * length de_num_used */ +}; + +#define OCFS2_DX_FLAG_INLINE 0x01 + +/* + * A directory indexing block. Each indexed directory has one of these, + * pointed to by ocfs2_dinode. + * + * This block stores an indexed btree root, and a set of free space + * start-of-list pointers. + */ +struct ocfs2_dx_root_block { + __u8 dr_signature[8]; /* Signature for verification */ + struct ocfs2_block_check dr_check; /* Error checking */ + __le16 dr_suballoc_slot; /* Slot suballocator this + * block belongs to. */ + __le16 dr_suballoc_bit; /* Bit offset in suballocator + * block group */ + __le32 dr_fs_generation; /* Must match super block */ + __le64 dr_blkno; /* Offset on disk, in blocks */ + __le64 dr_last_eb_blk; /* Pointer to last + * extent block */ + __le32 dr_clusters; /* Clusters allocated + * to the indexed tree. */ + __u8 dr_flags; /* OCFS2_DX_FLAG_* flags */ + __u8 dr_reserved0; + __le16 dr_reserved1; + __le64 dr_dir_blkno; /* Pointer to parent inode */ + __le32 dr_num_entries; /* Total number of + * names stored in + * this directory.*/ + __le32 dr_reserved2; + __le64 dr_free_blk; /* Pointer to head of free + * unindexed block list. */ + __le64 dr_reserved3[15]; + union { + struct ocfs2_extent_list dr_list; /* Keep this aligned to 128 + * bits for maximum space + * efficiency. */ + struct ocfs2_dx_entry_list dr_entries; /* In-root-block list of + * entries. We grow out + * to extents if this + * gets too big. */ + }; +}; + +/* + * The header of a leaf block in the indexed tree. + */ +struct ocfs2_dx_leaf { + __u8 dl_signature[8];/* Signature for verification */ + struct ocfs2_block_check dl_check; /* Error checking */ + __le64 dl_blkno; /* Offset on disk, in blocks */ + __le32 dl_fs_generation;/* Must match super block */ + __le32 dl_reserved0; + __le64 dl_reserved1; + struct ocfs2_dx_entry_list dl_list; +}; + /* * On disk allocator group structure for OCFS2 */ @@ -1112,6 +1208,16 @@ static inline int ocfs2_extent_recs_per_inode_with_xattr( return size / sizeof(struct ocfs2_extent_rec); } +static inline int ocfs2_extent_recs_per_dx_root(struct super_block *sb) +{ + int size; + + size = sb->s_blocksize - + offsetof(struct ocfs2_dx_root_block, dr_list.l_recs); + + return size / sizeof(struct ocfs2_extent_rec); +} + static inline int ocfs2_chain_recs_per_inode(struct super_block *sb) { int size; @@ -1132,6 +1238,26 @@ static inline u16 ocfs2_extent_recs_per_eb(struct super_block *sb) return size / sizeof(struct ocfs2_extent_rec); } +static inline int ocfs2_dx_entries_per_leaf(struct super_block *sb) +{ + int size; + + size = sb->s_blocksize - + offsetof(struct ocfs2_dx_leaf, dl_list.de_entries); + + return size / sizeof(struct ocfs2_dx_entry); +} + +static inline int ocfs2_dx_entries_per_root(struct super_block *sb) +{ + int size; + + size = sb->s_blocksize - + offsetof(struct ocfs2_dx_root_block, dr_entries.de_entries); + + return size / sizeof(struct ocfs2_dx_entry); +} + static inline u16 ocfs2_local_alloc_size(struct super_block *sb) { u16 size; diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h index eb6f50c9ceca..a53ce87481bf 100644 --- a/fs/ocfs2/ocfs2_lockid.h +++ b/fs/ocfs2/ocfs2_lockid.h @@ -47,6 +47,7 @@ enum ocfs2_lock_type { OCFS2_LOCK_TYPE_OPEN, OCFS2_LOCK_TYPE_FLOCK, OCFS2_LOCK_TYPE_QINFO, + OCFS2_LOCK_TYPE_NFS_SYNC, OCFS2_NUM_LOCK_TYPES }; @@ -81,6 +82,9 @@ static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type) case OCFS2_LOCK_TYPE_QINFO: c = 'Q'; break; + case OCFS2_LOCK_TYPE_NFS_SYNC: + c = 'Y'; + break; default: c = '\0'; } diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index a69628603e18..b4ca5911caaf 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -48,7 +48,8 @@ #include "buffer_head_io.h" #define NOT_ALLOC_NEW_GROUP 0 -#define ALLOC_NEW_GROUP 1 +#define ALLOC_NEW_GROUP 0x1 +#define ALLOC_GROUPS_FROM_GLOBAL 0x2 #define OCFS2_MAX_INODES_TO_STEAL 1024 @@ -64,7 +65,9 @@ static int ocfs2_block_group_fill(handle_t *handle, static int ocfs2_block_group_alloc(struct ocfs2_super *osb, struct inode *alloc_inode, struct buffer_head *bh, - u64 max_block); + u64 max_block, + u64 *last_alloc_group, + int flags); static int ocfs2_cluster_group_search(struct inode *inode, struct buffer_head *group_bh, @@ -116,6 +119,7 @@ static inline void ocfs2_block_to_cluster_group(struct inode *inode, u16 *bg_bit_off); static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb, u32 bits_wanted, u64 max_block, + int flags, struct ocfs2_alloc_context **ac); void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac) @@ -403,7 +407,9 @@ static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl) static int ocfs2_block_group_alloc(struct ocfs2_super *osb, struct inode *alloc_inode, struct buffer_head *bh, - u64 max_block) + u64 max_block, + u64 *last_alloc_group, + int flags) { int status, credits; struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data; @@ -423,7 +429,7 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb, cl = &fe->id2.i_chain; status = ocfs2_reserve_clusters_with_limit(osb, le16_to_cpu(cl->cl_cpg), - max_block, &ac); + max_block, flags, &ac); if (status < 0) { if (status != -ENOSPC) mlog_errno(status); @@ -440,6 +446,11 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb, goto bail; } + if (last_alloc_group && *last_alloc_group != 0) { + mlog(0, "use old allocation group %llu for block group alloc\n", + (unsigned long long)*last_alloc_group); + ac->ac_last_group = *last_alloc_group; + } status = ocfs2_claim_clusters(osb, handle, ac, @@ -514,6 +525,11 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb, alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode); status = 0; + + /* save the new last alloc group so that the caller can cache it. */ + if (last_alloc_group) + *last_alloc_group = ac->ac_last_group; + bail: if (handle) ocfs2_commit_trans(osb, handle); @@ -531,7 +547,8 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb, struct ocfs2_alloc_context *ac, int type, u32 slot, - int alloc_new_group) + u64 *last_alloc_group, + int flags) { int status; u32 bits_wanted = ac->ac_bits_wanted; @@ -587,7 +604,7 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb, goto bail; } - if (alloc_new_group != ALLOC_NEW_GROUP) { + if (!(flags & ALLOC_NEW_GROUP)) { mlog(0, "Alloc File %u Full: wanted=%u, free_bits=%u, " "and we don't alloc a new group for it.\n", slot, bits_wanted, free_bits); @@ -596,7 +613,8 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb, } status = ocfs2_block_group_alloc(osb, alloc_inode, bh, - ac->ac_max_block); + ac->ac_max_block, + last_alloc_group, flags); if (status < 0) { if (status != -ENOSPC) mlog_errno(status); @@ -640,7 +658,7 @@ int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb, status = ocfs2_reserve_suballoc_bits(osb, (*ac), EXTENT_ALLOC_SYSTEM_INODE, - slot, ALLOC_NEW_GROUP); + slot, NULL, ALLOC_NEW_GROUP); if (status < 0) { if (status != -ENOSPC) mlog_errno(status); @@ -686,7 +704,8 @@ static int ocfs2_steal_inode_from_other_nodes(struct ocfs2_super *osb, status = ocfs2_reserve_suballoc_bits(osb, ac, INODE_ALLOC_SYSTEM_INODE, - slot, NOT_ALLOC_NEW_GROUP); + slot, NULL, + NOT_ALLOC_NEW_GROUP); if (status >= 0) { ocfs2_set_inode_steal_slot(osb, slot); break; @@ -703,6 +722,7 @@ int ocfs2_reserve_new_inode(struct ocfs2_super *osb, { int status; s16 slot = ocfs2_get_inode_steal_slot(osb); + u64 alloc_group; *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL); if (!(*ac)) { @@ -738,12 +758,22 @@ int ocfs2_reserve_new_inode(struct ocfs2_super *osb, goto inode_steal; atomic_set(&osb->s_num_inodes_stolen, 0); + alloc_group = osb->osb_inode_alloc_group; status = ocfs2_reserve_suballoc_bits(osb, *ac, INODE_ALLOC_SYSTEM_INODE, - osb->slot_num, ALLOC_NEW_GROUP); + osb->slot_num, + &alloc_group, + ALLOC_NEW_GROUP | + ALLOC_GROUPS_FROM_GLOBAL); if (status >= 0) { status = 0; + spin_lock(&osb->osb_lock); + osb->osb_inode_alloc_group = alloc_group; + spin_unlock(&osb->osb_lock); + mlog(0, "after reservation, new allocation group is " + "%llu\n", (unsigned long long)alloc_group); + /* * Some inodes must be freed by us, so try to allocate * from our own next time. @@ -790,7 +820,7 @@ int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb, status = ocfs2_reserve_suballoc_bits(osb, ac, GLOBAL_BITMAP_SYSTEM_INODE, - OCFS2_INVALID_SLOT, + OCFS2_INVALID_SLOT, NULL, ALLOC_NEW_GROUP); if (status < 0 && status != -ENOSPC) { mlog_errno(status); @@ -806,6 +836,7 @@ bail: * things a bit. */ static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb, u32 bits_wanted, u64 max_block, + int flags, struct ocfs2_alloc_context **ac) { int status; @@ -823,7 +854,8 @@ static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb, (*ac)->ac_max_block = max_block; status = -ENOSPC; - if (ocfs2_alloc_should_use_local(osb, bits_wanted)) { + if (!(flags & ALLOC_GROUPS_FROM_GLOBAL) && + ocfs2_alloc_should_use_local(osb, bits_wanted)) { status = ocfs2_reserve_local_alloc_bits(osb, bits_wanted, *ac); @@ -861,7 +893,8 @@ int ocfs2_reserve_clusters(struct ocfs2_super *osb, u32 bits_wanted, struct ocfs2_alloc_context **ac) { - return ocfs2_reserve_clusters_with_limit(osb, bits_wanted, 0, ac); + return ocfs2_reserve_clusters_with_limit(osb, bits_wanted, 0, + ALLOC_NEW_GROUP, ac); } /* @@ -1618,8 +1651,41 @@ bail: return status; } +static void ocfs2_init_inode_ac_group(struct inode *dir, + struct buffer_head *parent_fe_bh, + struct ocfs2_alloc_context *ac) +{ + struct ocfs2_dinode *fe = (struct ocfs2_dinode *)parent_fe_bh->b_data; + /* + * Try to allocate inodes from some specific group. + * + * If the parent dir has recorded the last group used in allocation, + * cool, use it. Otherwise if we try to allocate new inode from the + * same slot the parent dir belongs to, use the same chunk. + * + * We are very careful here to avoid the mistake of setting + * ac_last_group to a group descriptor from a different (unlocked) slot. + */ + if (OCFS2_I(dir)->ip_last_used_group && + OCFS2_I(dir)->ip_last_used_slot == ac->ac_alloc_slot) + ac->ac_last_group = OCFS2_I(dir)->ip_last_used_group; + else if (le16_to_cpu(fe->i_suballoc_slot) == ac->ac_alloc_slot) + ac->ac_last_group = ocfs2_which_suballoc_group( + le64_to_cpu(fe->i_blkno), + le16_to_cpu(fe->i_suballoc_bit)); +} + +static inline void ocfs2_save_inode_ac_group(struct inode *dir, + struct ocfs2_alloc_context *ac) +{ + OCFS2_I(dir)->ip_last_used_group = ac->ac_last_group; + OCFS2_I(dir)->ip_last_used_slot = ac->ac_alloc_slot; +} + int ocfs2_claim_new_inode(struct ocfs2_super *osb, handle_t *handle, + struct inode *dir, + struct buffer_head *parent_fe_bh, struct ocfs2_alloc_context *ac, u16 *suballoc_bit, u64 *fe_blkno) @@ -1635,6 +1701,8 @@ int ocfs2_claim_new_inode(struct ocfs2_super *osb, BUG_ON(ac->ac_bits_wanted != 1); BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE); + ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac); + status = ocfs2_claim_suballoc_bits(osb, ac, handle, @@ -1653,6 +1721,7 @@ int ocfs2_claim_new_inode(struct ocfs2_super *osb, *fe_blkno = bg_blkno + (u64) (*suballoc_bit); ac->ac_bits_given++; + ocfs2_save_inode_ac_group(dir, ac); status = 0; bail: mlog_exit(status); @@ -2116,3 +2185,162 @@ out: return ret; } + +/* + * Read the inode specified by blkno to get suballoc_slot and + * suballoc_bit. + */ +static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno, + u16 *suballoc_slot, u16 *suballoc_bit) +{ + int status; + struct buffer_head *inode_bh = NULL; + struct ocfs2_dinode *inode_fe; + + mlog_entry("blkno: %llu\n", blkno); + + /* dirty read disk */ + status = ocfs2_read_blocks_sync(osb, blkno, 1, &inode_bh); + if (status < 0) { + mlog(ML_ERROR, "read block %llu failed %d\n", blkno, status); + goto bail; + } + + inode_fe = (struct ocfs2_dinode *) inode_bh->b_data; + if (!OCFS2_IS_VALID_DINODE(inode_fe)) { + mlog(ML_ERROR, "invalid inode %llu requested\n", blkno); + status = -EINVAL; + goto bail; + } + + if (le16_to_cpu(inode_fe->i_suballoc_slot) != OCFS2_INVALID_SLOT && + (u32)le16_to_cpu(inode_fe->i_suballoc_slot) > osb->max_slots - 1) { + mlog(ML_ERROR, "inode %llu has invalid suballoc slot %u\n", + blkno, (u32)le16_to_cpu(inode_fe->i_suballoc_slot)); + status = -EINVAL; + goto bail; + } + + if (suballoc_slot) + *suballoc_slot = le16_to_cpu(inode_fe->i_suballoc_slot); + if (suballoc_bit) + *suballoc_bit = le16_to_cpu(inode_fe->i_suballoc_bit); + +bail: + brelse(inode_bh); + + mlog_exit(status); + return status; +} + +/* + * test whether bit is SET in allocator bitmap or not. on success, 0 + * is returned and *res is 1 for SET; 0 otherwise. when fails, errno + * is returned and *res is meaningless. Call this after you have + * cluster locked against suballoc, or you may get a result based on + * non-up2date contents + */ +static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb, + struct inode *suballoc, + struct buffer_head *alloc_bh, u64 blkno, + u16 bit, int *res) +{ + struct ocfs2_dinode *alloc_fe; + struct ocfs2_group_desc *group; + struct buffer_head *group_bh = NULL; + u64 bg_blkno; + int status; + + mlog_entry("blkno: %llu bit: %u\n", blkno, (unsigned int)bit); + + alloc_fe = (struct ocfs2_dinode *)alloc_bh->b_data; + if ((bit + 1) > ocfs2_bits_per_group(&alloc_fe->id2.i_chain)) { + mlog(ML_ERROR, "suballoc bit %u out of range of %u\n", + (unsigned int)bit, + ocfs2_bits_per_group(&alloc_fe->id2.i_chain)); + status = -EINVAL; + goto bail; + } + + bg_blkno = ocfs2_which_suballoc_group(blkno, bit); + status = ocfs2_read_group_descriptor(suballoc, alloc_fe, bg_blkno, + &group_bh); + if (status < 0) { + mlog(ML_ERROR, "read group %llu failed %d\n", bg_blkno, status); + goto bail; + } + + group = (struct ocfs2_group_desc *) group_bh->b_data; + *res = ocfs2_test_bit(bit, (unsigned long *)group->bg_bitmap); + +bail: + brelse(group_bh); + + mlog_exit(status); + return status; +} + +/* + * Test if the bit representing this inode (blkno) is set in the + * suballocator. + * + * On success, 0 is returned and *res is 1 for SET; 0 otherwise. + * + * In the event of failure, a negative value is returned and *res is + * meaningless. + * + * Callers must make sure to hold nfs_sync_lock to prevent + * ocfs2_delete_inode() on another node from accessing the same + * suballocator concurrently. + */ +int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res) +{ + int status; + u16 suballoc_bit = 0, suballoc_slot = 0; + struct inode *inode_alloc_inode; + struct buffer_head *alloc_bh = NULL; + + mlog_entry("blkno: %llu", blkno); + + status = ocfs2_get_suballoc_slot_bit(osb, blkno, &suballoc_slot, + &suballoc_bit); + if (status < 0) { + mlog(ML_ERROR, "get alloc slot and bit failed %d\n", status); + goto bail; + } + + inode_alloc_inode = + ocfs2_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE, + suballoc_slot); + if (!inode_alloc_inode) { + /* the error code could be inaccurate, but we are not able to + * get the correct one. */ + status = -EINVAL; + mlog(ML_ERROR, "unable to get alloc inode in slot %u\n", + (u32)suballoc_slot); + goto bail; + } + + mutex_lock(&inode_alloc_inode->i_mutex); + status = ocfs2_inode_lock(inode_alloc_inode, &alloc_bh, 0); + if (status < 0) { + mutex_unlock(&inode_alloc_inode->i_mutex); + mlog(ML_ERROR, "lock on alloc inode on slot %u failed %d\n", + (u32)suballoc_slot, status); + goto bail; + } + + status = ocfs2_test_suballoc_bit(osb, inode_alloc_inode, alloc_bh, + blkno, suballoc_bit, res); + if (status < 0) + mlog(ML_ERROR, "test suballoc bit failed %d\n", status); + + ocfs2_inode_unlock(inode_alloc_inode, 0); + mutex_unlock(&inode_alloc_inode->i_mutex); + + iput(inode_alloc_inode); + brelse(alloc_bh); +bail: + mlog_exit(status); + return status; +} diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h index e3c13c77f9e8..8c9a78a43164 100644 --- a/fs/ocfs2/suballoc.h +++ b/fs/ocfs2/suballoc.h @@ -88,6 +88,8 @@ int ocfs2_claim_metadata(struct ocfs2_super *osb, u64 *blkno_start); int ocfs2_claim_new_inode(struct ocfs2_super *osb, handle_t *handle, + struct inode *dir, + struct buffer_head *parent_fe_bh, struct ocfs2_alloc_context *ac, u16 *suballoc_bit, u64 *fe_blkno); @@ -186,4 +188,6 @@ int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_extent_tree *et, u32 clusters_to_add, u32 extents_to_split, struct ocfs2_alloc_context **data_ac, struct ocfs2_alloc_context **meta_ac); + +int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res); #endif /* _CHAINALLOC_H_ */ diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 7ac83a81ee55..79ff8d9d37e0 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -201,6 +201,170 @@ static const match_table_t tokens = { {Opt_err, NULL} }; +#ifdef CONFIG_DEBUG_FS +static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len) +{ + int out = 0; + int i; + struct ocfs2_cluster_connection *cconn = osb->cconn; + struct ocfs2_recovery_map *rm = osb->recovery_map; + + out += snprintf(buf + out, len - out, + "%10s => Id: %-s Uuid: %-s Gen: 0x%X Label: %-s\n", + "Device", osb->dev_str, osb->uuid_str, + osb->fs_generation, osb->vol_label); + + out += snprintf(buf + out, len - out, + "%10s => State: %d Flags: 0x%lX\n", "Volume", + atomic_read(&osb->vol_state), osb->osb_flags); + + out += snprintf(buf + out, len - out, + "%10s => Block: %lu Cluster: %d\n", "Sizes", + osb->sb->s_blocksize, osb->s_clustersize); + + out += snprintf(buf + out, len - out, + "%10s => Compat: 0x%X Incompat: 0x%X " + "ROcompat: 0x%X\n", + "Features", osb->s_feature_compat, + osb->s_feature_incompat, osb->s_feature_ro_compat); + + out += snprintf(buf + out, len - out, + "%10s => Opts: 0x%lX AtimeQuanta: %u\n", "Mount", + osb->s_mount_opt, osb->s_atime_quantum); + + out += snprintf(buf + out, len - out, + "%10s => Stack: %s Name: %*s Version: %d.%d\n", + "Cluster", + (*osb->osb_cluster_stack == '\0' ? + "o2cb" : osb->osb_cluster_stack), + cconn->cc_namelen, cconn->cc_name, + cconn->cc_version.pv_major, cconn->cc_version.pv_minor); + + spin_lock(&osb->dc_task_lock); + out += snprintf(buf + out, len - out, + "%10s => Pid: %d Count: %lu WakeSeq: %lu " + "WorkSeq: %lu\n", "DownCnvt", + task_pid_nr(osb->dc_task), osb->blocked_lock_count, + osb->dc_wake_sequence, osb->dc_work_sequence); + spin_unlock(&osb->dc_task_lock); + + spin_lock(&osb->osb_lock); + out += snprintf(buf + out, len - out, "%10s => Pid: %d Nodes:", + "Recovery", + (osb->recovery_thread_task ? + task_pid_nr(osb->recovery_thread_task) : -1)); + if (rm->rm_used == 0) + out += snprintf(buf + out, len - out, " None\n"); + else { + for (i = 0; i < rm->rm_used; i++) + out += snprintf(buf + out, len - out, " %d", + rm->rm_entries[i]); + out += snprintf(buf + out, len - out, "\n"); + } + spin_unlock(&osb->osb_lock); + + out += snprintf(buf + out, len - out, + "%10s => Pid: %d Interval: %lu Needs: %d\n", "Commit", + task_pid_nr(osb->commit_task), osb->osb_commit_interval, + atomic_read(&osb->needs_checkpoint)); + + out += snprintf(buf + out, len - out, + "%10s => State: %d NumTxns: %d TxnId: %lu\n", + "Journal", osb->journal->j_state, + atomic_read(&osb->journal->j_num_trans), + osb->journal->j_trans_id); + + out += snprintf(buf + out, len - out, + "%10s => GlobalAllocs: %d LocalAllocs: %d " + "SubAllocs: %d LAWinMoves: %d SAExtends: %d\n", + "Stats", + atomic_read(&osb->alloc_stats.bitmap_data), + atomic_read(&osb->alloc_stats.local_data), + atomic_read(&osb->alloc_stats.bg_allocs), + atomic_read(&osb->alloc_stats.moves), + atomic_read(&osb->alloc_stats.bg_extends)); + + out += snprintf(buf + out, len - out, + "%10s => State: %u Descriptor: %llu Size: %u bits " + "Default: %u bits\n", + "LocalAlloc", osb->local_alloc_state, + (unsigned long long)osb->la_last_gd, + osb->local_alloc_bits, osb->local_alloc_default_bits); + + spin_lock(&osb->osb_lock); + out += snprintf(buf + out, len - out, + "%10s => Slot: %d NumStolen: %d\n", "Steal", + osb->s_inode_steal_slot, + atomic_read(&osb->s_num_inodes_stolen)); + spin_unlock(&osb->osb_lock); + + out += snprintf(buf + out, len - out, "%10s => %3s %10s\n", + "Slots", "Num", "RecoGen"); + + for (i = 0; i < osb->max_slots; ++i) { + out += snprintf(buf + out, len - out, + "%10s %c %3d %10d\n", + " ", + (i == osb->slot_num ? '*' : ' '), + i, osb->slot_recovery_generations[i]); + } + + return out; +} + +static int ocfs2_osb_debug_open(struct inode *inode, struct file *file) +{ + struct ocfs2_super *osb = inode->i_private; + char *buf = NULL; + + buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!buf) + goto bail; + + i_size_write(inode, ocfs2_osb_dump(osb, buf, PAGE_SIZE)); + + file->private_data = buf; + + return 0; +bail: + return -ENOMEM; +} + +static int ocfs2_debug_release(struct inode *inode, struct file *file) +{ + kfree(file->private_data); + return 0; +} + +static ssize_t ocfs2_debug_read(struct file *file, char __user *buf, + size_t nbytes, loff_t *ppos) +{ + return simple_read_from_buffer(buf, nbytes, ppos, file->private_data, + i_size_read(file->f_mapping->host)); +} +#else +static int ocfs2_osb_debug_open(struct inode *inode, struct file *file) +{ + return 0; +} +static int ocfs2_debug_release(struct inode *inode, struct file *file) +{ + return 0; +} +static ssize_t ocfs2_debug_read(struct file *file, char __user *buf, + size_t nbytes, loff_t *ppos) +{ + return 0; +} +#endif /* CONFIG_DEBUG_FS */ + +static struct file_operations ocfs2_osb_debug_fops = { + .open = ocfs2_osb_debug_open, + .release = ocfs2_debug_release, + .read = ocfs2_debug_read, + .llseek = generic_file_llseek, +}; + /* * write_super and sync_fs ripped right out of ext3. */ @@ -926,6 +1090,16 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) goto read_super_error; } + osb->osb_ctxt = debugfs_create_file("fs_state", S_IFREG|S_IRUSR, + osb->osb_debug_root, + osb, + &ocfs2_osb_debug_fops); + if (!osb->osb_ctxt) { + status = -EINVAL; + mlog_errno(status); + goto read_super_error; + } + status = ocfs2_mount_volume(sb); if (osb->root_inode) inode = igrab(osb->root_inode); @@ -1620,6 +1794,8 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) osb = OCFS2_SB(sb); BUG_ON(!osb); + debugfs_remove(osb->osb_ctxt); + ocfs2_disable_quotas(osb); ocfs2_shutdown_local_alloc(osb); @@ -1742,6 +1918,12 @@ static int ocfs2_initialize_super(struct super_block *sb, bbits = le32_to_cpu(di->id2.i_super.s_blocksize_bits); sb->s_maxbytes = ocfs2_max_file_offset(bbits, cbits); + osb->osb_dx_mask = (1 << (cbits - bbits)) - 1; + + for (i = 0; i < 3; i++) + osb->osb_dx_seed[i] = le32_to_cpu(di->id2.i_super.s_dx_seed[i]); + osb->osb_dx_seed[3] = le32_to_cpu(di->id2.i_super.s_uuid_hash); + osb->sb = sb; /* Save off for ocfs2_rw_direct */ osb->s_sectsize_bits = blksize_bits(sector_size); @@ -2130,6 +2312,12 @@ static int ocfs2_check_volume(struct ocfs2_super *osb) * lock, and it's marked as dirty, set the bit in the recover * map and launch a recovery thread for it. */ status = ocfs2_mark_dead_nodes(osb); + if (status < 0) { + mlog_errno(status); + goto finally; + } + + status = ocfs2_compute_replay_slots(osb); if (status < 0) mlog_errno(status); diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 2563df89fc2a..15631019dc63 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -512,7 +512,7 @@ int ocfs2_calc_xattr_init(struct inode *dir, struct ocfs2_security_xattr_info *si, int *want_clusters, int *xattr_credits, - struct ocfs2_alloc_context **xattr_ac) + int *want_meta) { int ret = 0; struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); @@ -554,11 +554,7 @@ int ocfs2_calc_xattr_init(struct inode *dir, if (dir->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE || (S_ISDIR(mode) && ocfs2_supports_inline_data(osb)) || (s_size + a_size) > OCFS2_XATTR_FREE_IN_IBODY) { - ret = ocfs2_reserve_new_metadata_blocks(osb, 1, xattr_ac); - if (ret) { - mlog_errno(ret); - return ret; - } + *want_meta = *want_meta + 1; *xattr_credits += OCFS2_XATTR_BLOCK_CREATE_CREDITS; } diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h index 5a1ebc789f7e..1ca7e9a1b7bc 100644 --- a/fs/ocfs2/xattr.h +++ b/fs/ocfs2/xattr.h @@ -68,7 +68,7 @@ int ocfs2_calc_security_init(struct inode *, int *, int *, struct ocfs2_alloc_context **); int ocfs2_calc_xattr_init(struct inode *, struct buffer_head *, int, struct ocfs2_security_xattr_info *, - int *, int *, struct ocfs2_alloc_context **); + int *, int *, int *); /* * xattrs can live inside an inode, as part of an external xattr block, diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c index 633e9dc972bb..379ae5fb4411 100644 --- a/fs/omfs/inode.c +++ b/fs/omfs/inode.c @@ -262,14 +262,19 @@ static int omfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *s = dentry->d_sb; struct omfs_sb_info *sbi = OMFS_SB(s); + u64 id = huge_encode_dev(s->s_bdev->bd_dev); + buf->f_type = OMFS_MAGIC; buf->f_bsize = sbi->s_blocksize; buf->f_blocks = sbi->s_num_blocks; buf->f_files = sbi->s_num_blocks; buf->f_namelen = OMFS_NAMELEN; + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); buf->f_bfree = buf->f_bavail = buf->f_ffree = omfs_count_free(s); + return 0; } @@ -421,7 +426,7 @@ static int omfs_fill_super(struct super_block *sb, void *data, int silent) sbi->s_uid = current_uid(); sbi->s_gid = current_gid(); - sbi->s_dmask = sbi->s_fmask = current->fs->umask; + sbi->s_dmask = sbi->s_fmask = current_umask(); if (!parse_options((char *) data, sbi)) goto end; diff --git a/fs/open.c b/fs/open.c index 75b61677daaf..377eb25b6abf 100644 --- a/fs/open.c +++ b/fs/open.c @@ -29,6 +29,7 @@ #include <linux/rcupdate.h> #include <linux/audit.h> #include <linux/falloc.h> +#include <linux/fs_struct.h> int vfs_statfs(struct dentry *dentry, struct kstatfs *buf) { diff --git a/fs/partitions/check.c b/fs/partitions/check.c index 38e337d51ced..99e33ef40be4 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -19,6 +19,7 @@ #include <linux/kmod.h> #include <linux/ctype.h> #include <linux/genhd.h> +#include <linux/blktrace_api.h> #include "check.h" @@ -294,6 +295,9 @@ static struct attribute_group part_attr_group = { static struct attribute_group *part_attr_groups[] = { &part_attr_group, +#ifdef CONFIG_BLK_DEV_IO_TRACE + &blk_trace_attr_group, +#endif NULL }; diff --git a/fs/proc/base.c b/fs/proc/base.c index e0afd326b688..f71559784bfb 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -80,6 +80,7 @@ #include <linux/oom.h> #include <linux/elf.h> #include <linux/pid_namespace.h> +#include <linux/fs_struct.h> #include "internal.h" /* NOTE: diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 43d23948384a..74ea974f5ca6 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -120,7 +120,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) K(i.freeram-i.freehigh), #endif #ifndef CONFIG_MMU - K((unsigned long) atomic_read(&mmap_pages_allocated)), + K((unsigned long) atomic_long_read(&mmap_pages_allocated)), #endif K(i.totalswap), K(i.freeswap), diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c index b446d7ad0b0d..7e14d1a04001 100644 --- a/fs/proc/nommu.c +++ b/fs/proc/nommu.c @@ -76,7 +76,7 @@ static int nommu_region_show(struct seq_file *m, struct vm_region *region) /* * display a list of all the REGIONs the kernel knows about - * - nommu kernals have a single flat list + * - nommu kernels have a single flat list */ static int nommu_region_list_show(struct seq_file *m, void *_p) { diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index b0ae0be4801f..39e4ad4f59f4 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -204,6 +204,7 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma) struct file *file = vma->vm_file; int flags = vma->vm_flags; unsigned long ino = 0; + unsigned long long pgoff = 0; dev_t dev = 0; int len; @@ -211,6 +212,7 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma) struct inode *inode = vma->vm_file->f_path.dentry->d_inode; dev = inode->i_sb->s_dev; ino = inode->i_ino; + pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT; } seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n", @@ -220,7 +222,7 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma) flags & VM_WRITE ? 'w' : '-', flags & VM_EXEC ? 'x' : '-', flags & VM_MAYSHARE ? 's' : 'p', - ((loff_t)vma->vm_pgoff) << PAGE_SHIFT, + pgoff, MAJOR(dev), MINOR(dev), ino, &len); /* diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 343ea1216bc8..64a72e2e7650 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -2,6 +2,7 @@ #include <linux/mm.h> #include <linux/file.h> #include <linux/fdtable.h> +#include <linux/fs_struct.h> #include <linux/mount.h> #include <linux/ptrace.h> #include <linux/seq_file.h> @@ -49,7 +50,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) else bytes += kobjsize(mm); - if (current->fs && atomic_read(¤t->fs->count) > 1) + if (current->fs && current->fs->users > 1) sbytes += kobjsize(current->fs); else bytes += kobjsize(current->fs); @@ -125,6 +126,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma) struct file *file; dev_t dev = 0; int flags, len; + unsigned long long pgoff = 0; flags = vma->vm_flags; file = vma->vm_file; @@ -133,17 +135,18 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma) struct inode *inode = vma->vm_file->f_path.dentry->d_inode; dev = inode->i_sb->s_dev; ino = inode->i_ino; + pgoff = (loff_t)vma->vm_pgoff << PAGE_SHIFT; } seq_printf(m, - "%08lx-%08lx %c%c%c%c %08lx %02x:%02x %lu %n", + "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n", vma->vm_start, vma->vm_end, flags & VM_READ ? 'r' : '-', flags & VM_WRITE ? 'w' : '-', flags & VM_EXEC ? 'x' : '-', flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p', - vma->vm_pgoff << PAGE_SHIFT, + pgoff, MAJOR(dev), MINOR(dev), ino, &len); if (file) { diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c index 2aad1044b84c..fe1f0f31d11c 100644 --- a/fs/qnx4/inode.c +++ b/fs/qnx4/inode.c @@ -282,6 +282,7 @@ unsigned long qnx4_block_map( struct inode *inode, long iblock ) static int qnx4_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; + u64 id = huge_encode_dev(sb->s_bdev->bd_dev); lock_kernel(); @@ -291,6 +292,8 @@ static int qnx4_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_bfree = qnx4_count_free_blocks(sb); buf->f_bavail = buf->f_bfree; buf->f_namelen = QNX4_NAME_MAX; + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); unlock_kernel(); diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 2ca967a5ef77..607c579e5eca 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -823,7 +823,7 @@ static void add_dquot_ref(struct super_block *sb, int type) spin_lock(&inode_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { - if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) + if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW)) continue; if (!atomic_read(&inode->i_writecount)) continue; diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index a404fb88e456..3a6b193d8444 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -221,22 +221,23 @@ static int ramfs_fill_super(struct super_block * sb, void * data, int silent) save_mount_options(sb, data); fsi = kzalloc(sizeof(struct ramfs_fs_info), GFP_KERNEL); + sb->s_fs_info = fsi; if (!fsi) { err = -ENOMEM; goto fail; } - sb->s_fs_info = fsi; err = ramfs_parse_options(data, &fsi->mount_opts); if (err) goto fail; - sb->s_maxbytes = MAX_LFS_FILESIZE; - sb->s_blocksize = PAGE_CACHE_SIZE; - sb->s_blocksize_bits = PAGE_CACHE_SHIFT; - sb->s_magic = RAMFS_MAGIC; - sb->s_op = &ramfs_ops; - sb->s_time_gran = 1; + sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_blocksize = PAGE_CACHE_SIZE; + sb->s_blocksize_bits = PAGE_CACHE_SHIFT; + sb->s_magic = RAMFS_MAGIC; + sb->s_op = &ramfs_ops; + sb->s_time_gran = 1; + inode = ramfs_get_inode(sb, S_IFDIR | fsi->mount_opts.mode, 0); if (!inode) { err = -ENOMEM; @@ -244,14 +245,16 @@ static int ramfs_fill_super(struct super_block * sb, void * data, int silent) } root = d_alloc_root(inode); + sb->s_root = root; if (!root) { err = -ENOMEM; goto fail; } - sb->s_root = root; + return 0; fail: kfree(fsi); + sb->s_fs_info = NULL; iput(inode); return err; } diff --git a/fs/read_write.c b/fs/read_write.c index 400fe81c973e..9d1e76bb9ee1 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -731,6 +731,62 @@ SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec, return ret; } +static inline loff_t pos_from_hilo(unsigned long high, unsigned long low) +{ +#define HALF_LONG_BITS (BITS_PER_LONG / 2) + return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low; +} + +SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec, + unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h) +{ + loff_t pos = pos_from_hilo(pos_h, pos_l); + struct file *file; + ssize_t ret = -EBADF; + int fput_needed; + + if (pos < 0) + return -EINVAL; + + file = fget_light(fd, &fput_needed); + if (file) { + ret = -ESPIPE; + if (file->f_mode & FMODE_PREAD) + ret = vfs_readv(file, vec, vlen, &pos); + fput_light(file, fput_needed); + } + + if (ret > 0) + add_rchar(current, ret); + inc_syscr(current); + return ret; +} + +SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec, + unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h) +{ + loff_t pos = pos_from_hilo(pos_h, pos_l); + struct file *file; + ssize_t ret = -EBADF; + int fput_needed; + + if (pos < 0) + return -EINVAL; + + file = fget_light(fd, &fput_needed); + if (file) { + ret = -ESPIPE; + if (file->f_mode & FMODE_PWRITE) + ret = vfs_writev(file, vec, vlen, &pos); + fput_light(file, fput_needed); + } + + if (ret > 0) + add_wchar(current, ret); + inc_syscw(current); + return ret; +} + static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, size_t count, loff_t max) { diff --git a/fs/reiserfs/Kconfig b/fs/reiserfs/Kconfig index 949b8c6addc8..513f431038f9 100644 --- a/fs/reiserfs/Kconfig +++ b/fs/reiserfs/Kconfig @@ -1,5 +1,6 @@ config REISERFS_FS tristate "Reiserfs support" + select CRC32 help Stores not just filenames but the files themselves in a balanced tree. Uses journalling. diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 972250c62896..0ae6486d9046 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -27,6 +27,7 @@ #include <linux/mnt_namespace.h> #include <linux/mount.h> #include <linux/namei.h> +#include <linux/crc32.h> struct file_system_type reiserfs_fs_type; @@ -1904,6 +1905,10 @@ static int reiserfs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_bsize = dentry->d_sb->s_blocksize; /* changed to accommodate gcc folks. */ buf->f_type = REISERFS_SUPER_MAGIC; + buf->f_fsid.val[0] = (u32)crc32_le(0, rs->s_uuid, sizeof(rs->s_uuid)/2); + buf->f_fsid.val[1] = (u32)crc32_le(0, rs->s_uuid + sizeof(rs->s_uuid)/2, + sizeof(rs->s_uuid)/2); + return 0; } diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index d423416d93d1..c303c426fe2b 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c @@ -428,7 +428,7 @@ reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th, } else { apply_umask: /* no ACL, apply umask */ - inode->i_mode &= ~current->fs->umask; + inode->i_mode &= ~current_umask(); } return err; diff --git a/fs/romfs/Kconfig b/fs/romfs/Kconfig index 1a17020f9faf..ce2d6bcc6266 100644 --- a/fs/romfs/Kconfig +++ b/fs/romfs/Kconfig @@ -1,6 +1,6 @@ config ROMFS_FS tristate "ROM file system support" - depends on BLOCK + depends on BLOCK || MTD ---help--- This is a very small read-only file system mainly intended for initial ram disks of installation disks, but it could be used for @@ -14,3 +14,49 @@ config ROMFS_FS If you don't know whether you need it, then you don't need it: answer N. + +# +# Select the backing stores to be supported +# +choice + prompt "RomFS backing stores" + depends on ROMFS_FS + default ROMFS_BACKED_BY_BLOCK + help + Select the backing stores to be supported. + +config ROMFS_BACKED_BY_BLOCK + bool "Block device-backed ROM file system support" + depends on BLOCK + help + This permits ROMFS to use block devices buffered through the page + cache as the medium from which to retrieve data. It does not allow + direct mapping of the medium. + + If unsure, answer Y. + +config ROMFS_BACKED_BY_MTD + bool "MTD-backed ROM file system support" + depends on MTD=y || (ROMFS_FS=m && MTD) + help + This permits ROMFS to use MTD based devices directly, without the + intercession of the block layer (which may have been disabled). It + also allows direct mapping of MTD devices through romfs files under + NOMMU conditions if the underlying device is directly addressable by + the CPU. + + If unsure, answer Y. + +config ROMFS_BACKED_BY_BOTH + bool "Both the above" + depends on BLOCK && (MTD=y || (ROMFS_FS=m && MTD)) +endchoice + + +config ROMFS_ON_BLOCK + bool + default y if ROMFS_BACKED_BY_BLOCK || ROMFS_BACKED_BY_BOTH + +config ROMFS_ON_MTD + bool + default y if ROMFS_BACKED_BY_MTD || ROMFS_BACKED_BY_BOTH diff --git a/fs/romfs/Makefile b/fs/romfs/Makefile index c95b21cf49a3..420beb7d495c 100644 --- a/fs/romfs/Makefile +++ b/fs/romfs/Makefile @@ -1,7 +1,12 @@ # -# Makefile for the linux romfs filesystem routines. +# Makefile for the linux RomFS filesystem routines. # obj-$(CONFIG_ROMFS_FS) += romfs.o -romfs-objs := inode.o +romfs-y := storage.o super.o + +ifneq ($(CONFIG_MMU),y) +romfs-$(CONFIG_ROMFS_ON_MTD) += mmap-nommu.o +endif + diff --git a/fs/romfs/inode.c b/fs/romfs/inode.c deleted file mode 100644 index 98a232f7196b..000000000000 --- a/fs/romfs/inode.c +++ /dev/null @@ -1,665 +0,0 @@ -/* - * ROMFS file system, Linux implementation - * - * Copyright (C) 1997-1999 Janos Farkas <chexum@shadow.banki.hu> - * - * Using parts of the minix filesystem - * Copyright (C) 1991, 1992 Linus Torvalds - * - * and parts of the affs filesystem additionally - * Copyright (C) 1993 Ray Burr - * Copyright (C) 1996 Hans-Joachim Widmaier - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Changes - * Changed for 2.1.19 modules - * Jan 1997 Initial release - * Jun 1997 2.1.43+ changes - * Proper page locking in readpage - * Changed to work with 2.1.45+ fs - * Jul 1997 Fixed follow_link - * 2.1.47 - * lookup shouldn't return -ENOENT - * from Horst von Brand: - * fail on wrong checksum - * double unlock_super was possible - * correct namelen for statfs - * spotted by Bill Hawes: - * readlink shouldn't iput() - * Jun 1998 2.1.106 from Avery Pennarun: glibc scandir() - * exposed a problem in readdir - * 2.1.107 code-freeze spellchecker run - * Aug 1998 2.1.118+ VFS changes - * Sep 1998 2.1.122 another VFS change (follow_link) - * Apr 1999 2.2.7 no more EBADF checking in - * lookup/readdir, use ERR_PTR - * Jun 1999 2.3.6 d_alloc_root use changed - * 2.3.9 clean up usage of ENOENT/negative - * dentries in lookup - * clean up page flags setting - * (error, uptodate, locking) in - * in readpage - * use init_special_inode for - * fifos/sockets (and streamline) in - * read_inode, fix _ops table order - * Aug 1999 2.3.16 __initfunc() => __init change - * Oct 1999 2.3.24 page->owner hack obsoleted - * Nov 1999 2.3.27 2.3.25+ page->offset => index change - */ - -/* todo: - * - see Documentation/filesystems/romfs.txt - * - use allocated, not stack memory for file names? - * - considering write access... - * - network (tftp) files? - * - merge back some _op tables - */ - -/* - * Sorry about some optimizations and for some goto's. I just wanted - * to squeeze some more bytes out of this code.. :) - */ - -#include <linux/module.h> -#include <linux/types.h> -#include <linux/errno.h> -#include <linux/slab.h> -#include <linux/romfs_fs.h> -#include <linux/fs.h> -#include <linux/init.h> -#include <linux/pagemap.h> -#include <linux/smp_lock.h> -#include <linux/buffer_head.h> -#include <linux/vfs.h> - -#include <asm/uaccess.h> - -struct romfs_inode_info { - unsigned long i_metasize; /* size of non-data area */ - unsigned long i_dataoffset; /* from the start of fs */ - struct inode vfs_inode; -}; - -static struct inode *romfs_iget(struct super_block *, unsigned long); - -/* instead of private superblock data */ -static inline unsigned long romfs_maxsize(struct super_block *sb) -{ - return (unsigned long)sb->s_fs_info; -} - -static inline struct romfs_inode_info *ROMFS_I(struct inode *inode) -{ - return container_of(inode, struct romfs_inode_info, vfs_inode); -} - -static __u32 -romfs_checksum(void *data, int size) -{ - __u32 sum; - __be32 *ptr; - - sum = 0; ptr = data; - size>>=2; - while (size>0) { - sum += be32_to_cpu(*ptr++); - size--; - } - return sum; -} - -static const struct super_operations romfs_ops; - -static int romfs_fill_super(struct super_block *s, void *data, int silent) -{ - struct buffer_head *bh; - struct romfs_super_block *rsb; - struct inode *root; - int sz, ret = -EINVAL; - - /* I would parse the options here, but there are none.. :) */ - - sb_set_blocksize(s, ROMBSIZE); - s->s_maxbytes = 0xFFFFFFFF; - - bh = sb_bread(s, 0); - if (!bh) { - /* XXX merge with other printk? */ - printk ("romfs: unable to read superblock\n"); - goto outnobh; - } - - rsb = (struct romfs_super_block *)bh->b_data; - sz = be32_to_cpu(rsb->size); - if (rsb->word0 != ROMSB_WORD0 || rsb->word1 != ROMSB_WORD1 - || sz < ROMFH_SIZE) { - if (!silent) - printk ("VFS: Can't find a romfs filesystem on dev " - "%s.\n", s->s_id); - goto out; - } - if (romfs_checksum(rsb, min_t(int, sz, 512))) { - printk ("romfs: bad initial checksum on dev " - "%s.\n", s->s_id); - goto out; - } - - s->s_magic = ROMFS_MAGIC; - s->s_fs_info = (void *)(long)sz; - - s->s_flags |= MS_RDONLY; - - /* Find the start of the fs */ - sz = (ROMFH_SIZE + - strnlen(rsb->name, ROMFS_MAXFN) + 1 + ROMFH_PAD) - & ROMFH_MASK; - - s->s_op = &romfs_ops; - root = romfs_iget(s, sz); - if (IS_ERR(root)) { - ret = PTR_ERR(root); - goto out; - } - - ret = -ENOMEM; - s->s_root = d_alloc_root(root); - if (!s->s_root) - goto outiput; - - brelse(bh); - return 0; - -outiput: - iput(root); -out: - brelse(bh); -outnobh: - return ret; -} - -/* That's simple too. */ - -static int -romfs_statfs(struct dentry *dentry, struct kstatfs *buf) -{ - buf->f_type = ROMFS_MAGIC; - buf->f_bsize = ROMBSIZE; - buf->f_bfree = buf->f_bavail = buf->f_ffree; - buf->f_blocks = (romfs_maxsize(dentry->d_sb)+ROMBSIZE-1)>>ROMBSBITS; - buf->f_namelen = ROMFS_MAXFN; - return 0; -} - -/* some helper routines */ - -static int -romfs_strnlen(struct inode *i, unsigned long offset, unsigned long count) -{ - struct buffer_head *bh; - unsigned long avail, maxsize, res; - - maxsize = romfs_maxsize(i->i_sb); - if (offset >= maxsize) - return -1; - - /* strnlen is almost always valid */ - if (count > maxsize || offset+count > maxsize) - count = maxsize-offset; - - bh = sb_bread(i->i_sb, offset>>ROMBSBITS); - if (!bh) - return -1; /* error */ - - avail = ROMBSIZE - (offset & ROMBMASK); - maxsize = min_t(unsigned long, count, avail); - res = strnlen(((char *)bh->b_data)+(offset&ROMBMASK), maxsize); - brelse(bh); - - if (res < maxsize) - return res; /* found all of it */ - - while (res < count) { - offset += maxsize; - - bh = sb_bread(i->i_sb, offset>>ROMBSBITS); - if (!bh) - return -1; - maxsize = min_t(unsigned long, count - res, ROMBSIZE); - avail = strnlen(bh->b_data, maxsize); - res += avail; - brelse(bh); - if (avail < maxsize) - return res; - } - return res; -} - -static int -romfs_copyfrom(struct inode *i, void *dest, unsigned long offset, unsigned long count) -{ - struct buffer_head *bh; - unsigned long avail, maxsize, res; - - maxsize = romfs_maxsize(i->i_sb); - if (offset >= maxsize || count > maxsize || offset+count>maxsize) - return -1; - - bh = sb_bread(i->i_sb, offset>>ROMBSBITS); - if (!bh) - return -1; /* error */ - - avail = ROMBSIZE - (offset & ROMBMASK); - maxsize = min_t(unsigned long, count, avail); - memcpy(dest, ((char *)bh->b_data) + (offset & ROMBMASK), maxsize); - brelse(bh); - - res = maxsize; /* all of it */ - - while (res < count) { - offset += maxsize; - dest += maxsize; - - bh = sb_bread(i->i_sb, offset>>ROMBSBITS); - if (!bh) - return -1; - maxsize = min_t(unsigned long, count - res, ROMBSIZE); - memcpy(dest, bh->b_data, maxsize); - brelse(bh); - res += maxsize; - } - return res; -} - -static unsigned char romfs_dtype_table[] = { - DT_UNKNOWN, DT_DIR, DT_REG, DT_LNK, DT_BLK, DT_CHR, DT_SOCK, DT_FIFO -}; - -static int -romfs_readdir(struct file *filp, void *dirent, filldir_t filldir) -{ - struct inode *i = filp->f_path.dentry->d_inode; - struct romfs_inode ri; - unsigned long offset, maxoff; - int j, ino, nextfh; - int stored = 0; - char fsname[ROMFS_MAXFN]; /* XXX dynamic? */ - - lock_kernel(); - - maxoff = romfs_maxsize(i->i_sb); - - offset = filp->f_pos; - if (!offset) { - offset = i->i_ino & ROMFH_MASK; - if (romfs_copyfrom(i, &ri, offset, ROMFH_SIZE) <= 0) - goto out; - offset = be32_to_cpu(ri.spec) & ROMFH_MASK; - } - - /* Not really failsafe, but we are read-only... */ - for(;;) { - if (!offset || offset >= maxoff) { - offset = maxoff; - filp->f_pos = offset; - goto out; - } - filp->f_pos = offset; - - /* Fetch inode info */ - if (romfs_copyfrom(i, &ri, offset, ROMFH_SIZE) <= 0) - goto out; - - j = romfs_strnlen(i, offset+ROMFH_SIZE, sizeof(fsname)-1); - if (j < 0) - goto out; - - fsname[j]=0; - romfs_copyfrom(i, fsname, offset+ROMFH_SIZE, j); - - ino = offset; - nextfh = be32_to_cpu(ri.next); - if ((nextfh & ROMFH_TYPE) == ROMFH_HRD) - ino = be32_to_cpu(ri.spec); - if (filldir(dirent, fsname, j, offset, ino, - romfs_dtype_table[nextfh & ROMFH_TYPE]) < 0) { - goto out; - } - stored++; - offset = nextfh & ROMFH_MASK; - } -out: - unlock_kernel(); - return stored; -} - -static struct dentry * -romfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) -{ - unsigned long offset, maxoff; - long res; - int fslen; - struct inode *inode = NULL; - char fsname[ROMFS_MAXFN]; /* XXX dynamic? */ - struct romfs_inode ri; - const char *name; /* got from dentry */ - int len; - - res = -EACCES; /* placeholder for "no data here" */ - offset = dir->i_ino & ROMFH_MASK; - lock_kernel(); - if (romfs_copyfrom(dir, &ri, offset, ROMFH_SIZE) <= 0) - goto error; - - maxoff = romfs_maxsize(dir->i_sb); - offset = be32_to_cpu(ri.spec) & ROMFH_MASK; - - /* OK, now find the file whose name is in "dentry" in the - * directory specified by "dir". */ - - name = dentry->d_name.name; - len = dentry->d_name.len; - - for(;;) { - if (!offset || offset >= maxoff) - goto success; /* negative success */ - if (romfs_copyfrom(dir, &ri, offset, ROMFH_SIZE) <= 0) - goto error; - - /* try to match the first 16 bytes of name */ - fslen = romfs_strnlen(dir, offset+ROMFH_SIZE, ROMFH_SIZE); - if (len < ROMFH_SIZE) { - if (len == fslen) { - /* both are shorter, and same size */ - romfs_copyfrom(dir, fsname, offset+ROMFH_SIZE, len+1); - if (strncmp (name, fsname, len) == 0) - break; - } - } else if (fslen >= ROMFH_SIZE) { - /* both are longer; XXX optimize max size */ - fslen = romfs_strnlen(dir, offset+ROMFH_SIZE, sizeof(fsname)-1); - if (len == fslen) { - romfs_copyfrom(dir, fsname, offset+ROMFH_SIZE, len+1); - if (strncmp(name, fsname, len) == 0) - break; - } - } - /* next entry */ - offset = be32_to_cpu(ri.next) & ROMFH_MASK; - } - - /* Hard link handling */ - if ((be32_to_cpu(ri.next) & ROMFH_TYPE) == ROMFH_HRD) - offset = be32_to_cpu(ri.spec) & ROMFH_MASK; - - inode = romfs_iget(dir->i_sb, offset); - if (IS_ERR(inode)) { - res = PTR_ERR(inode); - goto error; - } - -success: - d_add(dentry, inode); - res = 0; -error: - unlock_kernel(); - return ERR_PTR(res); -} - -/* - * Ok, we do readpage, to be able to execute programs. Unfortunately, - * we can't use bmap, since we may have looser alignments. - */ - -static int -romfs_readpage(struct file *file, struct page * page) -{ - struct inode *inode = page->mapping->host; - loff_t offset, size; - unsigned long filled; - void *buf; - int result = -EIO; - - page_cache_get(page); - lock_kernel(); - buf = kmap(page); - if (!buf) - goto err_out; - - /* 32 bit warning -- but not for us :) */ - offset = page_offset(page); - size = i_size_read(inode); - filled = 0; - result = 0; - if (offset < size) { - unsigned long readlen; - - size -= offset; - readlen = size > PAGE_SIZE ? PAGE_SIZE : size; - - filled = romfs_copyfrom(inode, buf, ROMFS_I(inode)->i_dataoffset+offset, readlen); - - if (filled != readlen) { - SetPageError(page); - filled = 0; - result = -EIO; - } - } - - if (filled < PAGE_SIZE) - memset(buf + filled, 0, PAGE_SIZE-filled); - - if (!result) - SetPageUptodate(page); - flush_dcache_page(page); - - unlock_page(page); - - kunmap(page); -err_out: - page_cache_release(page); - unlock_kernel(); - - return result; -} - -/* Mapping from our types to the kernel */ - -static const struct address_space_operations romfs_aops = { - .readpage = romfs_readpage -}; - -static const struct file_operations romfs_dir_operations = { - .read = generic_read_dir, - .readdir = romfs_readdir, -}; - -static const struct inode_operations romfs_dir_inode_operations = { - .lookup = romfs_lookup, -}; - -static mode_t romfs_modemap[] = -{ - 0, S_IFDIR+0644, S_IFREG+0644, S_IFLNK+0777, - S_IFBLK+0600, S_IFCHR+0600, S_IFSOCK+0644, S_IFIFO+0644 -}; - -static struct inode * -romfs_iget(struct super_block *sb, unsigned long ino) -{ - int nextfh, ret; - struct romfs_inode ri; - struct inode *i; - - ino &= ROMFH_MASK; - i = iget_locked(sb, ino); - if (!i) - return ERR_PTR(-ENOMEM); - if (!(i->i_state & I_NEW)) - return i; - - i->i_mode = 0; - - /* Loop for finding the real hard link */ - for(;;) { - if (romfs_copyfrom(i, &ri, ino, ROMFH_SIZE) <= 0) { - printk(KERN_ERR "romfs: read error for inode 0x%lx\n", - ino); - iget_failed(i); - return ERR_PTR(-EIO); - } - /* XXX: do romfs_checksum here too (with name) */ - - nextfh = be32_to_cpu(ri.next); - if ((nextfh & ROMFH_TYPE) != ROMFH_HRD) - break; - - ino = be32_to_cpu(ri.spec) & ROMFH_MASK; - } - - i->i_nlink = 1; /* Hard to decide.. */ - i->i_size = be32_to_cpu(ri.size); - i->i_mtime.tv_sec = i->i_atime.tv_sec = i->i_ctime.tv_sec = 0; - i->i_mtime.tv_nsec = i->i_atime.tv_nsec = i->i_ctime.tv_nsec = 0; - - /* Precalculate the data offset */ - ret = romfs_strnlen(i, ino + ROMFH_SIZE, ROMFS_MAXFN); - if (ret >= 0) - ino = (ROMFH_SIZE + ret + 1 + ROMFH_PAD) & ROMFH_MASK; - else - ino = 0; - - ROMFS_I(i)->i_metasize = ino; - ROMFS_I(i)->i_dataoffset = ino+(i->i_ino&ROMFH_MASK); - - /* Compute permissions */ - ino = romfs_modemap[nextfh & ROMFH_TYPE]; - /* only "normal" files have ops */ - switch (nextfh & ROMFH_TYPE) { - case 1: - i->i_size = ROMFS_I(i)->i_metasize; - i->i_op = &romfs_dir_inode_operations; - i->i_fop = &romfs_dir_operations; - if (nextfh & ROMFH_EXEC) - ino |= S_IXUGO; - i->i_mode = ino; - break; - case 2: - i->i_fop = &generic_ro_fops; - i->i_data.a_ops = &romfs_aops; - if (nextfh & ROMFH_EXEC) - ino |= S_IXUGO; - i->i_mode = ino; - break; - case 3: - i->i_op = &page_symlink_inode_operations; - i->i_data.a_ops = &romfs_aops; - i->i_mode = ino | S_IRWXUGO; - break; - default: - /* depending on MBZ for sock/fifos */ - nextfh = be32_to_cpu(ri.spec); - init_special_inode(i, ino, - MKDEV(nextfh>>16,nextfh&0xffff)); - } - unlock_new_inode(i); - return i; -} - -static struct kmem_cache * romfs_inode_cachep; - -static struct inode *romfs_alloc_inode(struct super_block *sb) -{ - struct romfs_inode_info *ei; - ei = kmem_cache_alloc(romfs_inode_cachep, GFP_KERNEL); - if (!ei) - return NULL; - return &ei->vfs_inode; -} - -static void romfs_destroy_inode(struct inode *inode) -{ - kmem_cache_free(romfs_inode_cachep, ROMFS_I(inode)); -} - -static void init_once(void *foo) -{ - struct romfs_inode_info *ei = foo; - - inode_init_once(&ei->vfs_inode); -} - -static int init_inodecache(void) -{ - romfs_inode_cachep = kmem_cache_create("romfs_inode_cache", - sizeof(struct romfs_inode_info), - 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), - init_once); - if (romfs_inode_cachep == NULL) - return -ENOMEM; - return 0; -} - -static void destroy_inodecache(void) -{ - kmem_cache_destroy(romfs_inode_cachep); -} - -static int romfs_remount(struct super_block *sb, int *flags, char *data) -{ - *flags |= MS_RDONLY; - return 0; -} - -static const struct super_operations romfs_ops = { - .alloc_inode = romfs_alloc_inode, - .destroy_inode = romfs_destroy_inode, - .statfs = romfs_statfs, - .remount_fs = romfs_remount, -}; - -static int romfs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, struct vfsmount *mnt) -{ - return get_sb_bdev(fs_type, flags, dev_name, data, romfs_fill_super, - mnt); -} - -static struct file_system_type romfs_fs_type = { - .owner = THIS_MODULE, - .name = "romfs", - .get_sb = romfs_get_sb, - .kill_sb = kill_block_super, - .fs_flags = FS_REQUIRES_DEV, -}; - -static int __init init_romfs_fs(void) -{ - int err = init_inodecache(); - if (err) - goto out1; - err = register_filesystem(&romfs_fs_type); - if (err) - goto out; - return 0; -out: - destroy_inodecache(); -out1: - return err; -} - -static void __exit exit_romfs_fs(void) -{ - unregister_filesystem(&romfs_fs_type); - destroy_inodecache(); -} - -/* Yes, works even as a module... :) */ - -module_init(init_romfs_fs) -module_exit(exit_romfs_fs) -MODULE_LICENSE("GPL"); diff --git a/fs/romfs/internal.h b/fs/romfs/internal.h new file mode 100644 index 000000000000..06044a9dc62d --- /dev/null +++ b/fs/romfs/internal.h @@ -0,0 +1,47 @@ +/* RomFS internal definitions + * + * Copyright © 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/romfs_fs.h> + +struct romfs_inode_info { + struct inode vfs_inode; + unsigned long i_metasize; /* size of non-data area */ + unsigned long i_dataoffset; /* from the start of fs */ +}; + +static inline size_t romfs_maxsize(struct super_block *sb) +{ + return (size_t) (unsigned long) sb->s_fs_info; +} + +static inline struct romfs_inode_info *ROMFS_I(struct inode *inode) +{ + return container_of(inode, struct romfs_inode_info, vfs_inode); +} + +/* + * mmap-nommu.c + */ +#if !defined(CONFIG_MMU) && defined(CONFIG_ROMFS_ON_MTD) +extern const struct file_operations romfs_ro_fops; +#else +#define romfs_ro_fops generic_ro_fops +#endif + +/* + * storage.c + */ +extern int romfs_dev_read(struct super_block *sb, unsigned long pos, + void *buf, size_t buflen); +extern ssize_t romfs_dev_strnlen(struct super_block *sb, + unsigned long pos, size_t maxlen); +extern int romfs_dev_strncmp(struct super_block *sb, unsigned long pos, + const char *str, size_t size); diff --git a/fs/romfs/mmap-nommu.c b/fs/romfs/mmap-nommu.c new file mode 100644 index 000000000000..f0511e816967 --- /dev/null +++ b/fs/romfs/mmap-nommu.c @@ -0,0 +1,75 @@ +/* NOMMU mmap support for RomFS on MTD devices + * + * Copyright © 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/mm.h> +#include <linux/mtd/super.h> +#include "internal.h" + +/* + * try to determine where a shared mapping can be made + * - only supported for NOMMU at the moment (MMU can't doesn't copy private + * mappings) + * - attempts to map through to the underlying MTD device + */ +static unsigned long romfs_get_unmapped_area(struct file *file, + unsigned long addr, + unsigned long len, + unsigned long pgoff, + unsigned long flags) +{ + struct inode *inode = file->f_mapping->host; + struct mtd_info *mtd = inode->i_sb->s_mtd; + unsigned long isize, offset; + + if (!mtd) + goto cant_map_directly; + + isize = i_size_read(inode); + offset = pgoff << PAGE_SHIFT; + if (offset > isize || len > isize || offset > isize - len) + return (unsigned long) -EINVAL; + + /* we need to call down to the MTD layer to do the actual mapping */ + if (mtd->get_unmapped_area) { + if (addr != 0) + return (unsigned long) -EINVAL; + + if (len > mtd->size || pgoff >= (mtd->size >> PAGE_SHIFT)) + return (unsigned long) -EINVAL; + + offset += ROMFS_I(inode)->i_dataoffset; + if (offset > mtd->size - len) + return (unsigned long) -EINVAL; + + return mtd->get_unmapped_area(mtd, len, offset, flags); + } + +cant_map_directly: + return (unsigned long) -ENOSYS; +} + +/* + * permit a R/O mapping to be made directly through onto an MTD device if + * possible + */ +static int romfs_mmap(struct file *file, struct vm_area_struct *vma) +{ + return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -ENOSYS; +} + +const struct file_operations romfs_ro_fops = { + .llseek = generic_file_llseek, + .read = do_sync_read, + .aio_read = generic_file_aio_read, + .splice_read = generic_file_splice_read, + .mmap = romfs_mmap, + .get_unmapped_area = romfs_get_unmapped_area, +}; diff --git a/fs/romfs/storage.c b/fs/romfs/storage.c new file mode 100644 index 000000000000..7e3e1e12a081 --- /dev/null +++ b/fs/romfs/storage.c @@ -0,0 +1,261 @@ +/* RomFS storage access routines + * + * Copyright © 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/fs.h> +#include <linux/mtd/super.h> +#include <linux/buffer_head.h> +#include "internal.h" + +#if !defined(CONFIG_ROMFS_ON_MTD) && !defined(CONFIG_ROMFS_ON_BLOCK) +#error no ROMFS backing store interface configured +#endif + +#ifdef CONFIG_ROMFS_ON_MTD +#define ROMFS_MTD_READ(sb, ...) ((sb)->s_mtd->read((sb)->s_mtd, ##__VA_ARGS__)) + +/* + * read data from an romfs image on an MTD device + */ +static int romfs_mtd_read(struct super_block *sb, unsigned long pos, + void *buf, size_t buflen) +{ + size_t rlen; + int ret; + + ret = ROMFS_MTD_READ(sb, pos, buflen, &rlen, buf); + return (ret < 0 || rlen != buflen) ? -EIO : 0; +} + +/* + * determine the length of a string in a romfs image on an MTD device + */ +static ssize_t romfs_mtd_strnlen(struct super_block *sb, + unsigned long pos, size_t maxlen) +{ + ssize_t n = 0; + size_t segment; + u_char buf[16], *p; + size_t len; + int ret; + + /* scan the string up to 16 bytes at a time */ + while (maxlen > 0) { + segment = min_t(size_t, maxlen, 16); + ret = ROMFS_MTD_READ(sb, pos, segment, &len, buf); + if (ret < 0) + return ret; + p = memchr(buf, 0, len); + if (p) + return n + (p - buf); + maxlen -= len; + pos += len; + n += len; + } + + return n; +} + +/* + * compare a string to one in a romfs image on MTD + * - return 1 if matched, 0 if differ, -ve if error + */ +static int romfs_mtd_strncmp(struct super_block *sb, unsigned long pos, + const char *str, size_t size) +{ + u_char buf[16]; + size_t len, segment; + int ret; + + /* scan the string up to 16 bytes at a time */ + while (size > 0) { + segment = min_t(size_t, size, 16); + ret = ROMFS_MTD_READ(sb, pos, segment, &len, buf); + if (ret < 0) + return ret; + if (memcmp(buf, str, len) != 0) + return 0; + size -= len; + pos += len; + str += len; + } + + return 1; +} +#endif /* CONFIG_ROMFS_ON_MTD */ + +#ifdef CONFIG_ROMFS_ON_BLOCK +/* + * read data from an romfs image on a block device + */ +static int romfs_blk_read(struct super_block *sb, unsigned long pos, + void *buf, size_t buflen) +{ + struct buffer_head *bh; + unsigned long offset; + size_t segment; + + /* copy the string up to blocksize bytes at a time */ + while (buflen > 0) { + offset = pos & (ROMBSIZE - 1); + segment = min_t(size_t, buflen, ROMBSIZE - offset); + bh = sb_bread(sb, pos >> ROMBSBITS); + if (!bh) + return -EIO; + memcpy(buf, bh->b_data + offset, segment); + brelse(bh); + buflen -= segment; + pos += segment; + } + + return 0; +} + +/* + * determine the length of a string in romfs on a block device + */ +static ssize_t romfs_blk_strnlen(struct super_block *sb, + unsigned long pos, size_t limit) +{ + struct buffer_head *bh; + unsigned long offset; + ssize_t n = 0; + size_t segment; + u_char *buf, *p; + + /* scan the string up to blocksize bytes at a time */ + while (limit > 0) { + offset = pos & (ROMBSIZE - 1); + segment = min_t(size_t, limit, ROMBSIZE - offset); + bh = sb_bread(sb, pos >> ROMBSBITS); + if (!bh) + return -EIO; + buf = bh->b_data + offset; + p = memchr(buf, 0, segment); + brelse(bh); + if (p) + return n + (p - buf); + limit -= segment; + pos += segment; + n += segment; + } + + return n; +} + +/* + * compare a string to one in a romfs image on a block device + * - return 1 if matched, 0 if differ, -ve if error + */ +static int romfs_blk_strncmp(struct super_block *sb, unsigned long pos, + const char *str, size_t size) +{ + struct buffer_head *bh; + unsigned long offset; + size_t segment; + bool x; + + /* scan the string up to 16 bytes at a time */ + while (size > 0) { + offset = pos & (ROMBSIZE - 1); + segment = min_t(size_t, size, ROMBSIZE - offset); + bh = sb_bread(sb, pos >> ROMBSBITS); + if (!bh) + return -EIO; + x = (memcmp(bh->b_data + offset, str, segment) != 0); + brelse(bh); + if (x) + return 0; + size -= segment; + pos += segment; + str += segment; + } + + return 1; +} +#endif /* CONFIG_ROMFS_ON_BLOCK */ + +/* + * read data from the romfs image + */ +int romfs_dev_read(struct super_block *sb, unsigned long pos, + void *buf, size_t buflen) +{ + size_t limit; + + limit = romfs_maxsize(sb); + if (pos >= limit) + return -EIO; + if (buflen > limit - pos) + buflen = limit - pos; + +#ifdef CONFIG_ROMFS_ON_MTD + if (sb->s_mtd) + return romfs_mtd_read(sb, pos, buf, buflen); +#endif +#ifdef CONFIG_ROMFS_ON_BLOCK + if (sb->s_bdev) + return romfs_blk_read(sb, pos, buf, buflen); +#endif + return -EIO; +} + +/* + * determine the length of a string in romfs + */ +ssize_t romfs_dev_strnlen(struct super_block *sb, + unsigned long pos, size_t maxlen) +{ + size_t limit; + + limit = romfs_maxsize(sb); + if (pos >= limit) + return -EIO; + if (maxlen > limit - pos) + maxlen = limit - pos; + +#ifdef CONFIG_ROMFS_ON_MTD + if (sb->s_mtd) + return romfs_mtd_strnlen(sb, pos, limit); +#endif +#ifdef CONFIG_ROMFS_ON_BLOCK + if (sb->s_bdev) + return romfs_blk_strnlen(sb, pos, limit); +#endif + return -EIO; +} + +/* + * compare a string to one in romfs + * - return 1 if matched, 0 if differ, -ve if error + */ +int romfs_dev_strncmp(struct super_block *sb, unsigned long pos, + const char *str, size_t size) +{ + size_t limit; + + limit = romfs_maxsize(sb); + if (pos >= limit) + return -EIO; + if (size > ROMFS_MAXFN) + return -ENAMETOOLONG; + if (size > limit - pos) + return -EIO; + +#ifdef CONFIG_ROMFS_ON_MTD + if (sb->s_mtd) + return romfs_mtd_strncmp(sb, pos, str, size); +#endif +#ifdef CONFIG_ROMFS_ON_BLOCK + if (sb->s_bdev) + return romfs_blk_strncmp(sb, pos, str, size); +#endif + return -EIO; +} diff --git a/fs/romfs/super.c b/fs/romfs/super.c new file mode 100644 index 000000000000..10ca7d984a8b --- /dev/null +++ b/fs/romfs/super.c @@ -0,0 +1,653 @@ +/* Block- or MTD-based romfs + * + * Copyright © 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * Derived from: ROMFS file system, Linux implementation + * + * Copyright © 1997-1999 Janos Farkas <chexum@shadow.banki.hu> + * + * Using parts of the minix filesystem + * Copyright © 1991, 1992 Linus Torvalds + * + * and parts of the affs filesystem additionally + * Copyright © 1993 Ray Burr + * Copyright © 1996 Hans-Joachim Widmaier + * + * Changes + * Changed for 2.1.19 modules + * Jan 1997 Initial release + * Jun 1997 2.1.43+ changes + * Proper page locking in readpage + * Changed to work with 2.1.45+ fs + * Jul 1997 Fixed follow_link + * 2.1.47 + * lookup shouldn't return -ENOENT + * from Horst von Brand: + * fail on wrong checksum + * double unlock_super was possible + * correct namelen for statfs + * spotted by Bill Hawes: + * readlink shouldn't iput() + * Jun 1998 2.1.106 from Avery Pennarun: glibc scandir() + * exposed a problem in readdir + * 2.1.107 code-freeze spellchecker run + * Aug 1998 2.1.118+ VFS changes + * Sep 1998 2.1.122 another VFS change (follow_link) + * Apr 1999 2.2.7 no more EBADF checking in + * lookup/readdir, use ERR_PTR + * Jun 1999 2.3.6 d_alloc_root use changed + * 2.3.9 clean up usage of ENOENT/negative + * dentries in lookup + * clean up page flags setting + * (error, uptodate, locking) in + * in readpage + * use init_special_inode for + * fifos/sockets (and streamline) in + * read_inode, fix _ops table order + * Aug 1999 2.3.16 __initfunc() => __init change + * Oct 1999 2.3.24 page->owner hack obsoleted + * Nov 1999 2.3.27 2.3.25+ page->offset => index change + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include <linux/module.h> +#include <linux/string.h> +#include <linux/fs.h> +#include <linux/time.h> +#include <linux/slab.h> +#include <linux/init.h> +#include <linux/blkdev.h> +#include <linux/parser.h> +#include <linux/mount.h> +#include <linux/namei.h> +#include <linux/statfs.h> +#include <linux/mtd/super.h> +#include <linux/ctype.h> +#include <linux/highmem.h> +#include <linux/pagemap.h> +#include <linux/uaccess.h> +#include "internal.h" + +static struct kmem_cache *romfs_inode_cachep; + +static const umode_t romfs_modemap[8] = { + 0, /* hard link */ + S_IFDIR | 0644, /* directory */ + S_IFREG | 0644, /* regular file */ + S_IFLNK | 0777, /* symlink */ + S_IFBLK | 0600, /* blockdev */ + S_IFCHR | 0600, /* chardev */ + S_IFSOCK | 0644, /* socket */ + S_IFIFO | 0644 /* FIFO */ +}; + +static const unsigned char romfs_dtype_table[] = { + DT_UNKNOWN, DT_DIR, DT_REG, DT_LNK, DT_BLK, DT_CHR, DT_SOCK, DT_FIFO +}; + +static struct inode *romfs_iget(struct super_block *sb, unsigned long pos); + +/* + * read a page worth of data from the image + */ +static int romfs_readpage(struct file *file, struct page *page) +{ + struct inode *inode = page->mapping->host; + loff_t offset, size; + unsigned long fillsize, pos; + void *buf; + int ret; + + buf = kmap(page); + if (!buf) + return -ENOMEM; + + /* 32 bit warning -- but not for us :) */ + offset = page_offset(page); + size = i_size_read(inode); + fillsize = 0; + ret = 0; + if (offset < size) { + size -= offset; + fillsize = size > PAGE_SIZE ? PAGE_SIZE : size; + + pos = ROMFS_I(inode)->i_dataoffset + offset; + + ret = romfs_dev_read(inode->i_sb, pos, buf, fillsize); + if (ret < 0) { + SetPageError(page); + fillsize = 0; + ret = -EIO; + } + } + + if (fillsize < PAGE_SIZE) + memset(buf + fillsize, 0, PAGE_SIZE - fillsize); + if (ret == 0) + SetPageUptodate(page); + + flush_dcache_page(page); + kunmap(page); + unlock_page(page); + return ret; +} + +static const struct address_space_operations romfs_aops = { + .readpage = romfs_readpage +}; + +/* + * read the entries from a directory + */ +static int romfs_readdir(struct file *filp, void *dirent, filldir_t filldir) +{ + struct inode *i = filp->f_dentry->d_inode; + struct romfs_inode ri; + unsigned long offset, maxoff; + int j, ino, nextfh; + int stored = 0; + char fsname[ROMFS_MAXFN]; /* XXX dynamic? */ + int ret; + + maxoff = romfs_maxsize(i->i_sb); + + offset = filp->f_pos; + if (!offset) { + offset = i->i_ino & ROMFH_MASK; + ret = romfs_dev_read(i->i_sb, offset, &ri, ROMFH_SIZE); + if (ret < 0) + goto out; + offset = be32_to_cpu(ri.spec) & ROMFH_MASK; + } + + /* Not really failsafe, but we are read-only... */ + for (;;) { + if (!offset || offset >= maxoff) { + offset = maxoff; + filp->f_pos = offset; + goto out; + } + filp->f_pos = offset; + + /* Fetch inode info */ + ret = romfs_dev_read(i->i_sb, offset, &ri, ROMFH_SIZE); + if (ret < 0) + goto out; + + j = romfs_dev_strnlen(i->i_sb, offset + ROMFH_SIZE, + sizeof(fsname) - 1); + if (j < 0) + goto out; + + ret = romfs_dev_read(i->i_sb, offset + ROMFH_SIZE, fsname, j); + if (ret < 0) + goto out; + fsname[j] = '\0'; + + ino = offset; + nextfh = be32_to_cpu(ri.next); + if ((nextfh & ROMFH_TYPE) == ROMFH_HRD) + ino = be32_to_cpu(ri.spec); + if (filldir(dirent, fsname, j, offset, ino, + romfs_dtype_table[nextfh & ROMFH_TYPE]) < 0) + goto out; + + stored++; + offset = nextfh & ROMFH_MASK; + } + +out: + return stored; +} + +/* + * look up an entry in a directory + */ +static struct dentry *romfs_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + unsigned long offset, maxoff; + struct inode *inode; + struct romfs_inode ri; + const char *name; /* got from dentry */ + int len, ret; + + offset = dir->i_ino & ROMFH_MASK; + ret = romfs_dev_read(dir->i_sb, offset, &ri, ROMFH_SIZE); + if (ret < 0) + goto error; + + /* search all the file entries in the list starting from the one + * pointed to by the directory's special data */ + maxoff = romfs_maxsize(dir->i_sb); + offset = be32_to_cpu(ri.spec) & ROMFH_MASK; + + name = dentry->d_name.name; + len = dentry->d_name.len; + + for (;;) { + if (!offset || offset >= maxoff) + goto out0; + + ret = romfs_dev_read(dir->i_sb, offset, &ri, sizeof(ri)); + if (ret < 0) + goto error; + + /* try to match the first 16 bytes of name */ + ret = romfs_dev_strncmp(dir->i_sb, offset + ROMFH_SIZE, name, + len); + if (ret < 0) + goto error; + if (ret == 1) + break; + + /* next entry */ + offset = be32_to_cpu(ri.next) & ROMFH_MASK; + } + + /* Hard link handling */ + if ((be32_to_cpu(ri.next) & ROMFH_TYPE) == ROMFH_HRD) + offset = be32_to_cpu(ri.spec) & ROMFH_MASK; + + inode = romfs_iget(dir->i_sb, offset); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + goto error; + } + goto outi; + + /* + * it's a bit funky, _lookup needs to return an error code + * (negative) or a NULL, both as a dentry. ENOENT should not + * be returned, instead we need to create a negative dentry by + * d_add(dentry, NULL); and return 0 as no error. + * (Although as I see, it only matters on writable file + * systems). + */ +out0: + inode = NULL; +outi: + d_add(dentry, inode); + ret = 0; +error: + return ERR_PTR(ret); +} + +static const struct file_operations romfs_dir_operations = { + .read = generic_read_dir, + .readdir = romfs_readdir, +}; + +static struct inode_operations romfs_dir_inode_operations = { + .lookup = romfs_lookup, +}; + +/* + * get a romfs inode based on its position in the image (which doubles as the + * inode number) + */ +static struct inode *romfs_iget(struct super_block *sb, unsigned long pos) +{ + struct romfs_inode_info *inode; + struct romfs_inode ri; + struct inode *i; + unsigned long nlen; + unsigned nextfh, ret; + umode_t mode; + + /* we might have to traverse a chain of "hard link" file entries to get + * to the actual file */ + for (;;) { + ret = romfs_dev_read(sb, pos, &ri, sizeof(ri)); + if (ret < 0) + goto error; + + /* XXX: do romfs_checksum here too (with name) */ + + nextfh = be32_to_cpu(ri.next); + if ((nextfh & ROMFH_TYPE) != ROMFH_HRD) + break; + + pos = be32_to_cpu(ri.spec) & ROMFH_MASK; + } + + /* determine the length of the filename */ + nlen = romfs_dev_strnlen(sb, pos + ROMFH_SIZE, ROMFS_MAXFN); + if (IS_ERR_VALUE(nlen)) + goto eio; + + /* get an inode for this image position */ + i = iget_locked(sb, pos); + if (!i) + return ERR_PTR(-ENOMEM); + + if (!(i->i_state & I_NEW)) + return i; + + /* precalculate the data offset */ + inode = ROMFS_I(i); + inode->i_metasize = (ROMFH_SIZE + nlen + 1 + ROMFH_PAD) & ROMFH_MASK; + inode->i_dataoffset = pos + inode->i_metasize; + + i->i_nlink = 1; /* Hard to decide.. */ + i->i_size = be32_to_cpu(ri.size); + i->i_mtime.tv_sec = i->i_atime.tv_sec = i->i_ctime.tv_sec = 0; + i->i_mtime.tv_nsec = i->i_atime.tv_nsec = i->i_ctime.tv_nsec = 0; + + /* set up mode and ops */ + mode = romfs_modemap[nextfh & ROMFH_TYPE]; + + switch (nextfh & ROMFH_TYPE) { + case ROMFH_DIR: + i->i_size = ROMFS_I(i)->i_metasize; + i->i_op = &romfs_dir_inode_operations; + i->i_fop = &romfs_dir_operations; + if (nextfh & ROMFH_EXEC) + mode |= S_IXUGO; + break; + case ROMFH_REG: + i->i_fop = &romfs_ro_fops; + i->i_data.a_ops = &romfs_aops; + if (i->i_sb->s_mtd) + i->i_data.backing_dev_info = + i->i_sb->s_mtd->backing_dev_info; + if (nextfh & ROMFH_EXEC) + mode |= S_IXUGO; + break; + case ROMFH_SYM: + i->i_op = &page_symlink_inode_operations; + i->i_data.a_ops = &romfs_aops; + mode |= S_IRWXUGO; + break; + default: + /* depending on MBZ for sock/fifos */ + nextfh = be32_to_cpu(ri.spec); + init_special_inode(i, mode, MKDEV(nextfh >> 16, + nextfh & 0xffff)); + break; + } + + i->i_mode = mode; + + unlock_new_inode(i); + return i; + +eio: + ret = -EIO; +error: + printk(KERN_ERR "ROMFS: read error for inode 0x%lx\n", pos); + return ERR_PTR(ret); +} + +/* + * allocate a new inode + */ +static struct inode *romfs_alloc_inode(struct super_block *sb) +{ + struct romfs_inode_info *inode; + inode = kmem_cache_alloc(romfs_inode_cachep, GFP_KERNEL); + return inode ? &inode->vfs_inode : NULL; +} + +/* + * return a spent inode to the slab cache + */ +static void romfs_destroy_inode(struct inode *inode) +{ + kmem_cache_free(romfs_inode_cachep, ROMFS_I(inode)); +} + +/* + * get filesystem statistics + */ +static int romfs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + u64 id = huge_encode_dev(sb->s_bdev->bd_dev); + + buf->f_type = ROMFS_MAGIC; + buf->f_namelen = ROMFS_MAXFN; + buf->f_bsize = ROMBSIZE; + buf->f_bfree = buf->f_bavail = buf->f_ffree; + buf->f_blocks = + (romfs_maxsize(dentry->d_sb) + ROMBSIZE - 1) >> ROMBSBITS; + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); + return 0; +} + +/* + * remounting must involve read-only + */ +static int romfs_remount(struct super_block *sb, int *flags, char *data) +{ + *flags |= MS_RDONLY; + return 0; +} + +static const struct super_operations romfs_super_ops = { + .alloc_inode = romfs_alloc_inode, + .destroy_inode = romfs_destroy_inode, + .statfs = romfs_statfs, + .remount_fs = romfs_remount, +}; + +/* + * checksum check on part of a romfs filesystem + */ +static __u32 romfs_checksum(const void *data, int size) +{ + const __be32 *ptr = data; + __u32 sum; + + sum = 0; + size >>= 2; + while (size > 0) { + sum += be32_to_cpu(*ptr++); + size--; + } + return sum; +} + +/* + * fill in the superblock + */ +static int romfs_fill_super(struct super_block *sb, void *data, int silent) +{ + struct romfs_super_block *rsb; + struct inode *root; + unsigned long pos, img_size; + const char *storage; + size_t len; + int ret; + +#ifdef CONFIG_BLOCK + if (!sb->s_mtd) { + sb_set_blocksize(sb, ROMBSIZE); + } else { + sb->s_blocksize = ROMBSIZE; + sb->s_blocksize_bits = blksize_bits(ROMBSIZE); + } +#endif + + sb->s_maxbytes = 0xFFFFFFFF; + sb->s_magic = ROMFS_MAGIC; + sb->s_flags |= MS_RDONLY | MS_NOATIME; + sb->s_op = &romfs_super_ops; + + /* read the image superblock and check it */ + rsb = kmalloc(512, GFP_KERNEL); + if (!rsb) + return -ENOMEM; + + sb->s_fs_info = (void *) 512; + ret = romfs_dev_read(sb, 0, rsb, 512); + if (ret < 0) + goto error_rsb; + + img_size = be32_to_cpu(rsb->size); + + if (sb->s_mtd && img_size > sb->s_mtd->size) + goto error_rsb_inval; + + sb->s_fs_info = (void *) img_size; + + if (rsb->word0 != ROMSB_WORD0 || rsb->word1 != ROMSB_WORD1 || + img_size < ROMFH_SIZE) { + if (!silent) + printk(KERN_WARNING "VFS:" + " Can't find a romfs filesystem on dev %s.\n", + sb->s_id); + goto error_rsb_inval; + } + + if (romfs_checksum(rsb, min_t(size_t, img_size, 512))) { + printk(KERN_ERR "ROMFS: bad initial checksum on dev %s.\n", + sb->s_id); + goto error_rsb_inval; + } + + storage = sb->s_mtd ? "MTD" : "the block layer"; + + len = strnlen(rsb->name, ROMFS_MAXFN); + if (!silent) + printk(KERN_NOTICE "ROMFS: Mounting image '%*.*s' through %s\n", + (unsigned) len, (unsigned) len, rsb->name, storage); + + kfree(rsb); + rsb = NULL; + + /* find the root directory */ + pos = (ROMFH_SIZE + len + 1 + ROMFH_PAD) & ROMFH_MASK; + + root = romfs_iget(sb, pos); + if (!root) + goto error; + + sb->s_root = d_alloc_root(root); + if (!sb->s_root) + goto error_i; + + return 0; + +error_i: + iput(root); +error: + return -EINVAL; +error_rsb_inval: + ret = -EINVAL; +error_rsb: + return ret; +} + +/* + * get a superblock for mounting + */ +static int romfs_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, + void *data, struct vfsmount *mnt) +{ + int ret = -EINVAL; + +#ifdef CONFIG_ROMFS_ON_MTD + ret = get_sb_mtd(fs_type, flags, dev_name, data, romfs_fill_super, + mnt); +#endif +#ifdef CONFIG_ROMFS_ON_BLOCK + if (ret == -EINVAL) + ret = get_sb_bdev(fs_type, flags, dev_name, data, + romfs_fill_super, mnt); +#endif + return ret; +} + +/* + * destroy a romfs superblock in the appropriate manner + */ +static void romfs_kill_sb(struct super_block *sb) +{ +#ifdef CONFIG_ROMFS_ON_MTD + if (sb->s_mtd) { + kill_mtd_super(sb); + return; + } +#endif +#ifdef CONFIG_ROMFS_ON_BLOCK + if (sb->s_bdev) { + kill_block_super(sb); + return; + } +#endif +} + +static struct file_system_type romfs_fs_type = { + .owner = THIS_MODULE, + .name = "romfs", + .get_sb = romfs_get_sb, + .kill_sb = romfs_kill_sb, + .fs_flags = FS_REQUIRES_DEV, +}; + +/* + * inode storage initialiser + */ +static void romfs_i_init_once(void *_inode) +{ + struct romfs_inode_info *inode = _inode; + + inode_init_once(&inode->vfs_inode); +} + +/* + * romfs module initialisation + */ +static int __init init_romfs_fs(void) +{ + int ret; + + printk(KERN_INFO "ROMFS MTD (C) 2007 Red Hat, Inc.\n"); + + romfs_inode_cachep = + kmem_cache_create("romfs_i", + sizeof(struct romfs_inode_info), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, + romfs_i_init_once); + + if (!romfs_inode_cachep) { + printk(KERN_ERR + "ROMFS error: Failed to initialise inode cache\n"); + return -ENOMEM; + } + ret = register_filesystem(&romfs_fs_type); + if (ret) { + printk(KERN_ERR "ROMFS error: Failed to register filesystem\n"); + goto error_register; + } + return 0; + +error_register: + kmem_cache_destroy(romfs_inode_cachep); + return ret; +} + +/* + * romfs module removal + */ +static void __exit exit_romfs_fs(void) +{ + unregister_filesystem(&romfs_fs_type); + kmem_cache_destroy(romfs_inode_cachep); +} + +module_init(init_romfs_fs); +module_exit(exit_romfs_fs); + +MODULE_DESCRIPTION("Direct-MTD Capable RomFS"); +MODULE_AUTHOR("Red Hat, Inc."); +MODULE_LICENSE("GPL"); /* Actually dual-licensed, but it doesn't matter for */ diff --git a/fs/splice.c b/fs/splice.c index 4ed0ba44a966..c18aa7e03e2b 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -59,7 +59,8 @@ static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe, */ wait_on_page_writeback(page); - if (PagePrivate(page) && !try_to_release_page(page, GFP_KERNEL)) + if (page_has_private(page) && + !try_to_release_page(page, GFP_KERNEL)) goto out_unlock; /* @@ -736,10 +737,19 @@ ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out, * ->write_end. Most of the time, these expect i_mutex to * be held. Since this may result in an ABBA deadlock with * pipe->inode, we have to order lock acquiry here. + * + * Outer lock must be inode->i_mutex, as pipe_wait() will + * release and reacquire pipe->inode->i_mutex, AND inode must + * never be a pipe. */ - inode_double_lock(inode, pipe->inode); + WARN_ON(S_ISFIFO(inode->i_mode)); + mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT); + if (pipe->inode) + mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_CHILD); ret = __splice_from_pipe(pipe, &sd, actor); - inode_double_unlock(inode, pipe->inode); + if (pipe->inode) + mutex_unlock(&pipe->inode->i_mutex); + mutex_unlock(&inode->i_mutex); return ret; } @@ -830,11 +840,17 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, }; ssize_t ret; - inode_double_lock(inode, pipe->inode); + WARN_ON(S_ISFIFO(inode->i_mode)); + mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT); ret = file_remove_suid(out); - if (likely(!ret)) + if (likely(!ret)) { + if (pipe->inode) + mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_CHILD); ret = __splice_from_pipe(pipe, &sd, pipe_to_file); - inode_double_unlock(inode, pipe->inode); + if (pipe->inode) + mutex_unlock(&pipe->inode->i_mutex); + } + mutex_unlock(&inode->i_mutex); if (ret > 0) { unsigned long nr_pages; diff --git a/fs/squashfs/export.c b/fs/squashfs/export.c index 69e971d5ddc1..2b1b8fe5e037 100644 --- a/fs/squashfs/export.c +++ b/fs/squashfs/export.c @@ -40,6 +40,7 @@ #include <linux/dcache.h> #include <linux/exportfs.h> #include <linux/zlib.h> +#include <linux/slab.h> #include "squashfs_fs.h" #include "squashfs_fs_sb.h" diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index 681ec0d83799..ffa6edcd2d0c 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -301,6 +301,7 @@ failure: static int squashfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct squashfs_sb_info *msblk = dentry->d_sb->s_fs_info; + u64 id = huge_encode_dev(dentry->d_sb->s_bdev->bd_dev); TRACE("Entered squashfs_statfs\n"); @@ -311,6 +312,8 @@ static int squashfs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_files = msblk->inodes; buf->f_ffree = 0; buf->f_namelen = SQUASHFS_NAME_LEN; + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); return 0; } diff --git a/fs/super.c b/fs/super.c index 2ba481518ba7..786fe7d72790 100644 --- a/fs/super.c +++ b/fs/super.c @@ -287,6 +287,7 @@ int fsync_super(struct super_block *sb) __fsync_super(sb); return sync_blockdev(sb->s_bdev); } +EXPORT_SYMBOL_GPL(fsync_super); /** * generic_shutdown_super - common helper for ->kill_sb() @@ -770,6 +771,46 @@ void kill_litter_super(struct super_block *sb) EXPORT_SYMBOL(kill_litter_super); +static int ns_test_super(struct super_block *sb, void *data) +{ + return sb->s_fs_info == data; +} + +static int ns_set_super(struct super_block *sb, void *data) +{ + sb->s_fs_info = data; + return set_anon_super(sb, NULL); +} + +int get_sb_ns(struct file_system_type *fs_type, int flags, void *data, + int (*fill_super)(struct super_block *, void *, int), + struct vfsmount *mnt) +{ + struct super_block *sb; + + sb = sget(fs_type, ns_test_super, ns_set_super, data); + if (IS_ERR(sb)) + return PTR_ERR(sb); + + if (!sb->s_root) { + int err; + sb->s_flags = flags; + err = fill_super(sb, data, flags & MS_SILENT ? 1 : 0); + if (err) { + up_write(&sb->s_umount); + deactivate_super(sb); + return err; + } + + sb->s_flags |= MS_ACTIVE; + } + + simple_set_mnt(mnt, sb); + return 0; +} + +EXPORT_SYMBOL(get_sb_ns); + #ifdef CONFIG_BLOCK static int set_bdev_super(struct super_block *s, void *data) { diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c index 3d81bf58dae2..da20b48d350f 100644 --- a/fs/sysv/inode.c +++ b/fs/sysv/inode.c @@ -90,6 +90,7 @@ static int sysv_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; struct sysv_sb_info *sbi = SYSV_SB(sb); + u64 id = huge_encode_dev(sb->s_bdev->bd_dev); buf->f_type = sb->s_magic; buf->f_bsize = sb->s_blocksize; @@ -98,6 +99,8 @@ static int sysv_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_files = sbi->s_ninodes; buf->f_ffree = sysv_count_free_inodes(sb); buf->f_namelen = SYSV_NAMELEN; + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); return 0; } diff --git a/fs/ubifs/Kconfig b/fs/ubifs/Kconfig index e35b54d5059d..830e3f76f442 100644 --- a/fs/ubifs/Kconfig +++ b/fs/ubifs/Kconfig @@ -22,7 +22,7 @@ config UBIFS_FS_ADVANCED_COMPR depends on UBIFS_FS help This option allows to explicitly choose which compressions, if any, - are enabled in UBIFS. Removing compressors means inbility to read + are enabled in UBIFS. Removing compressors means inability to read existing file systems. If unsure, say 'N'. @@ -32,7 +32,7 @@ config UBIFS_FS_LZO depends on UBIFS_FS default y help - LZO compressor is generally faster then zlib but compresses worse. + LZO compressor is generally faster than zlib but compresses worse. Say 'Y' if unsure. config UBIFS_FS_ZLIB diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c index f393620890ee..af1914462f02 100644 --- a/fs/ubifs/budget.c +++ b/fs/ubifs/budget.c @@ -194,29 +194,26 @@ static int make_free_space(struct ubifs_info *c) } /** - * ubifs_calc_min_idx_lebs - calculate amount of eraseblocks for the index. + * ubifs_calc_min_idx_lebs - calculate amount of LEBs for the index. * @c: UBIFS file-system description object * - * This function calculates and returns the number of eraseblocks which should - * be kept for index usage. + * This function calculates and returns the number of LEBs which should be kept + * for index usage. */ int ubifs_calc_min_idx_lebs(struct ubifs_info *c) { - int idx_lebs, eff_leb_size = c->leb_size - c->max_idx_node_sz; + int idx_lebs; long long idx_size; idx_size = c->old_idx_sz + c->budg_idx_growth + c->budg_uncommitted_idx; - /* And make sure we have thrice the index size of space reserved */ - idx_size = idx_size + (idx_size << 1); - + idx_size += idx_size << 1; /* * We do not maintain 'old_idx_size' as 'old_idx_lebs'/'old_idx_bytes' * pair, nor similarly the two variables for the new index size, so we * have to do this costly 64-bit division on fast-path. */ - idx_size += eff_leb_size - 1; - idx_lebs = div_u64(idx_size, eff_leb_size); + idx_lebs = div_u64(idx_size + c->idx_leb_size - 1, c->idx_leb_size); /* * The index head is not available for the in-the-gaps method, so add an * extra LEB to compensate. @@ -310,23 +307,23 @@ static int can_use_rp(struct ubifs_info *c) * do_budget_space - reserve flash space for index and data growth. * @c: UBIFS file-system description object * - * This function makes sure UBIFS has enough free eraseblocks for index growth - * and data. + * This function makes sure UBIFS has enough free LEBs for index growth and + * data. * * When budgeting index space, UBIFS reserves thrice as many LEBs as the index * would take if it was consolidated and written to the flash. This guarantees * that the "in-the-gaps" commit method always succeeds and UBIFS will always * be able to commit dirty index. So this function basically adds amount of * budgeted index space to the size of the current index, multiplies this by 3, - * and makes sure this does not exceed the amount of free eraseblocks. + * and makes sure this does not exceed the amount of free LEBs. * * Notes about @c->min_idx_lebs and @c->lst.idx_lebs variables: * o @c->lst.idx_lebs is the number of LEBs the index currently uses. It might * be large, because UBIFS does not do any index consolidation as long as * there is free space. IOW, the index may take a lot of LEBs, but the LEBs * will contain a lot of dirt. - * o @c->min_idx_lebs is the the index presumably takes. IOW, the index may be - * consolidated to take up to @c->min_idx_lebs LEBs. + * o @c->min_idx_lebs is the number of LEBS the index presumably takes. IOW, + * the index may be consolidated to take up to @c->min_idx_lebs LEBs. * * This function returns zero in case of success, and %-ENOSPC in case of * failure. @@ -695,12 +692,12 @@ long long ubifs_reported_space(const struct ubifs_info *c, long long free) * This function calculates amount of free space to report to user-space. * * Because UBIFS may introduce substantial overhead (the index, node headers, - * alignment, wastage at the end of eraseblocks, etc), it cannot report real - * amount of free flash space it has (well, because not all dirty space is - * reclaimable, UBIFS does not actually know the real amount). If UBIFS did so, - * it would bread user expectations about what free space is. Users seem to - * accustomed to assume that if the file-system reports N bytes of free space, - * they would be able to fit a file of N bytes to the FS. This almost works for + * alignment, wastage at the end of LEBs, etc), it cannot report real amount of + * free flash space it has (well, because not all dirty space is reclaimable, + * UBIFS does not actually know the real amount). If UBIFS did so, it would + * bread user expectations about what free space is. Users seem to accustomed + * to assume that if the file-system reports N bytes of free space, they would + * be able to fit a file of N bytes to the FS. This almost works for * traditional file-systems, because they have way less overhead than UBIFS. * So, to keep users happy, UBIFS tries to take the overhead into account. */ diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index e975bd82f38b..ce2cd8343618 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -479,9 +479,9 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node) "bad or corrupted node)"); else { for (i = 0; i < nlen && dent->name[i]; i++) - printk("%c", dent->name[i]); + printk(KERN_CONT "%c", dent->name[i]); } - printk("\n"); + printk(KERN_CONT "\n"); break; } @@ -1214,7 +1214,7 @@ static int dbg_check_znode(struct ubifs_info *c, struct ubifs_zbranch *zbr) /* * Make sure the last key in our znode is less or - * equivalent than the the key in zbranch which goes + * equivalent than the key in the zbranch which goes * after our pointing zbranch. */ cmp = keys_cmp(c, max, diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 0ff89fe71e51..6d34dc7e33e1 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -430,6 +430,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping, struct ubifs_inode *ui = ubifs_inode(inode); pgoff_t index = pos >> PAGE_CACHE_SHIFT; int uninitialized_var(err), appending = !!(pos + len > inode->i_size); + int skipped_read = 0; struct page *page; ubifs_assert(ubifs_inode(inode)->ui_size == inode->i_size); @@ -444,7 +445,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping, if (!PageUptodate(page)) { /* The page is not loaded from the flash */ - if (!(pos & ~PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE) + if (!(pos & ~PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE) { /* * We change whole page so no need to load it. But we * have to set the @PG_checked flag to make the further @@ -453,7 +454,8 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping, * the media. */ SetPageChecked(page); - else { + skipped_read = 1; + } else { err = do_readpage(page); if (err) { unlock_page(page); @@ -470,6 +472,14 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping, if (unlikely(err)) { ubifs_assert(err == -ENOSPC); /* + * If we skipped reading the page because we were going to + * write all of it, then it is not up to date. + */ + if (skipped_read) { + ClearPageChecked(page); + ClearPageUptodate(page); + } + /* * Budgeting failed which means it would have to force * write-back but didn't, because we set the @fast flag in the * request. Write-back cannot be done now, while we have the @@ -949,7 +959,7 @@ static int do_writepage(struct page *page, int len) * whole index and correct all inode sizes, which is long an unacceptable. * * To prevent situations like this, UBIFS writes pages back only if they are - * within last synchronized inode size, i.e. the the size which has been + * within the last synchronized inode size, i.e. the size which has been * written to the flash media last time. Otherwise, UBIFS forces inode * write-back, thus making sure the on-flash inode contains current inode size, * and then keeps writing pages back. diff --git a/fs/ubifs/find.c b/fs/ubifs/find.c index 717d79c97c5e..1d54383d1269 100644 --- a/fs/ubifs/find.c +++ b/fs/ubifs/find.c @@ -478,7 +478,7 @@ const struct ubifs_lprops *do_find_free_space(struct ubifs_info *c, * ubifs_find_free_space - find a data LEB with free space. * @c: the UBIFS file-system description object * @min_space: minimum amount of required free space - * @free: contains amount of free space in the LEB on exit + * @offs: contains offset of where free space starts on exit * @squeeze: whether to try to find space in a non-empty LEB first * * This function looks for an LEB with at least @min_space bytes of free space. @@ -490,7 +490,7 @@ const struct ubifs_lprops *do_find_free_space(struct ubifs_info *c, * failed to find a LEB with @min_space bytes of free space and other a negative * error codes in case of failure. */ -int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *free, +int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *offs, int squeeze) { const struct ubifs_lprops *lprops; @@ -558,10 +558,10 @@ int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *free, spin_unlock(&c->space_lock); } - *free = lprops->free; + *offs = c->leb_size - lprops->free; ubifs_release_lprops(c); - if (*free == c->leb_size) { + if (*offs == 0) { /* * Ensure that empty LEBs have been unmapped. They may not have * been, for example, because of an unclean unmount. Also @@ -573,8 +573,8 @@ int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *free, return err; } - dbg_find("found LEB %d, free %d", lnum, *free); - ubifs_assert(*free >= min_space); + dbg_find("found LEB %d, free %d", lnum, c->leb_size - *offs); + ubifs_assert(*offs <= c->leb_size - min_space); return lnum; out: diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c index a711d33b3d3e..f0f5f15d384e 100644 --- a/fs/ubifs/gc.c +++ b/fs/ubifs/gc.c @@ -47,7 +47,7 @@ * have to waste large pieces of free space at the end of LEB B, because nodes * from LEB A would not fit. And the worst situation is when all nodes are of * maximum size. So dark watermark is the amount of free + dirty space in LEB - * which are guaranteed to be reclaimable. If LEB has less space, the GC migh + * which are guaranteed to be reclaimable. If LEB has less space, the GC might * be unable to reclaim it. So, LEBs with free + dirty greater than dark * watermark are "good" LEBs from GC's point of few. The other LEBs are not so * good, and GC takes extra care when moving them. @@ -57,14 +57,6 @@ #include "ubifs.h" /* - * GC tries to optimize the way it fit nodes to available space, and it sorts - * nodes a little. The below constants are watermarks which define "large", - * "medium", and "small" nodes. - */ -#define MEDIUM_NODE_WM (UBIFS_BLOCK_SIZE / 4) -#define SMALL_NODE_WM UBIFS_MAX_DENT_NODE_SZ - -/* * GC may need to move more than one LEB to make progress. The below constants * define "soft" and "hard" limits on the number of LEBs the garbage collector * may move. @@ -116,83 +108,222 @@ static int switch_gc_head(struct ubifs_info *c) } /** - * joinup - bring data nodes for an inode together. - * @c: UBIFS file-system description object - * @sleb: describes scanned LEB - * @inum: inode number - * @blk: block number - * @data: list to which to add data nodes + * list_sort - sort a list. + * @priv: private data, passed to @cmp + * @head: the list to sort + * @cmp: the elements comparison function * - * This function looks at the first few nodes in the scanned LEB @sleb and adds - * them to @data if they are data nodes from @inum and have a larger block - * number than @blk. This function returns %0 on success and a negative error - * code on failure. + * This function has been implemented by Mark J Roberts <mjr@znex.org>. It + * implements "merge sort" which has O(nlog(n)) complexity. The list is sorted + * in ascending order. + * + * The comparison function @cmp is supposed to return a negative value if @a is + * than @b, and a positive value if @a is greater than @b. If @a and @b are + * equivalent, then it does not matter what this function returns. */ -static int joinup(struct ubifs_info *c, struct ubifs_scan_leb *sleb, ino_t inum, - unsigned int blk, struct list_head *data) +static void list_sort(void *priv, struct list_head *head, + int (*cmp)(void *priv, struct list_head *a, + struct list_head *b)) { - int err, cnt = 6, lnum = sleb->lnum, offs; - struct ubifs_scan_node *snod, *tmp; - union ubifs_key *key; + struct list_head *p, *q, *e, *list, *tail, *oldhead; + int insize, nmerges, psize, qsize, i; + + if (list_empty(head)) + return; + + list = head->next; + list_del(head); + insize = 1; + for (;;) { + p = oldhead = list; + list = tail = NULL; + nmerges = 0; + + while (p) { + nmerges++; + q = p; + psize = 0; + for (i = 0; i < insize; i++) { + psize++; + q = q->next == oldhead ? NULL : q->next; + if (!q) + break; + } - list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) { - key = &snod->key; - if (key_inum(c, key) == inum && - key_type(c, key) == UBIFS_DATA_KEY && - key_block(c, key) > blk) { - offs = snod->offs; - err = ubifs_tnc_has_node(c, key, 0, lnum, offs, 0); - if (err < 0) - return err; - list_del(&snod->list); - if (err) { - list_add_tail(&snod->list, data); - blk = key_block(c, key); - } else - kfree(snod); - cnt = 6; - } else if (--cnt == 0) + qsize = insize; + while (psize > 0 || (qsize > 0 && q)) { + if (!psize) { + e = q; + q = q->next; + qsize--; + if (q == oldhead) + q = NULL; + } else if (!qsize || !q) { + e = p; + p = p->next; + psize--; + if (p == oldhead) + p = NULL; + } else if (cmp(priv, p, q) <= 0) { + e = p; + p = p->next; + psize--; + if (p == oldhead) + p = NULL; + } else { + e = q; + q = q->next; + qsize--; + if (q == oldhead) + q = NULL; + } + if (tail) + tail->next = e; + else + list = e; + e->prev = tail; + tail = e; + } + p = q; + } + + tail->next = list; + list->prev = tail; + + if (nmerges <= 1) break; + + insize *= 2; } - return 0; + + head->next = list; + head->prev = list->prev; + list->prev->next = head; + list->prev = head; } /** - * move_nodes - move nodes. + * data_nodes_cmp - compare 2 data nodes. + * @priv: UBIFS file-system description object + * @a: first data node + * @a: second data node + * + * This function compares data nodes @a and @b. Returns %1 if @a has greater + * inode or block number, and %-1 otherwise. + */ +int data_nodes_cmp(void *priv, struct list_head *a, struct list_head *b) +{ + ino_t inuma, inumb; + struct ubifs_info *c = priv; + struct ubifs_scan_node *sa, *sb; + + cond_resched(); + sa = list_entry(a, struct ubifs_scan_node, list); + sb = list_entry(b, struct ubifs_scan_node, list); + ubifs_assert(key_type(c, &sa->key) == UBIFS_DATA_KEY); + ubifs_assert(key_type(c, &sb->key) == UBIFS_DATA_KEY); + + inuma = key_inum(c, &sa->key); + inumb = key_inum(c, &sb->key); + + if (inuma == inumb) { + unsigned int blka = key_block(c, &sa->key); + unsigned int blkb = key_block(c, &sb->key); + + if (blka <= blkb) + return -1; + } else if (inuma <= inumb) + return -1; + + return 1; +} + +/* + * nondata_nodes_cmp - compare 2 non-data nodes. + * @priv: UBIFS file-system description object + * @a: first node + * @a: second node + * + * This function compares nodes @a and @b. It makes sure that inode nodes go + * first and sorted by length in descending order. Directory entry nodes go + * after inode nodes and are sorted in ascending hash valuer order. + */ +int nondata_nodes_cmp(void *priv, struct list_head *a, struct list_head *b) +{ + int typea, typeb; + ino_t inuma, inumb; + struct ubifs_info *c = priv; + struct ubifs_scan_node *sa, *sb; + + cond_resched(); + sa = list_entry(a, struct ubifs_scan_node, list); + sb = list_entry(b, struct ubifs_scan_node, list); + typea = key_type(c, &sa->key); + typeb = key_type(c, &sb->key); + ubifs_assert(typea != UBIFS_DATA_KEY && typeb != UBIFS_DATA_KEY); + + /* Inodes go before directory entries */ + if (typea == UBIFS_INO_KEY) { + if (typeb == UBIFS_INO_KEY) + return sb->len - sa->len; + return -1; + } + if (typeb == UBIFS_INO_KEY) + return 1; + + ubifs_assert(typea == UBIFS_DENT_KEY && typeb == UBIFS_DENT_KEY); + inuma = key_inum(c, &sa->key); + inumb = key_inum(c, &sb->key); + + if (inuma == inumb) { + uint32_t hasha = key_hash(c, &sa->key); + uint32_t hashb = key_hash(c, &sb->key); + + if (hasha <= hashb) + return -1; + } else if (inuma <= inumb) + return -1; + + return 1; +} + +/** + * sort_nodes - sort nodes for GC. * @c: UBIFS file-system description object - * @sleb: describes nodes to move + * @sleb: describes nodes to sort and contains the result on exit + * @nondata: contains non-data nodes on exit + * @min: minimum node size is returned here * - * This function moves valid nodes from data LEB described by @sleb to the GC - * journal head. The obsolete nodes are dropped. + * This function sorts the list of inodes to garbage collect. First of all, it + * kills obsolete nodes and separates data and non-data nodes to the + * @sleb->nodes and @nondata lists correspondingly. + * + * Data nodes are then sorted in block number order - this is important for + * bulk-read; data nodes with lower inode number go before data nodes with + * higher inode number, and data nodes with lower block number go before data + * nodes with higher block number; * - * When moving nodes we have to deal with classical bin-packing problem: the - * space in the current GC journal head LEB and in @c->gc_lnum are the "bins", - * where the nodes in the @sleb->nodes list are the elements which should be - * fit optimally to the bins. This function uses the "first fit decreasing" - * strategy, although it does not really sort the nodes but just split them on - * 3 classes - large, medium, and small, so they are roughly sorted. + * Non-data nodes are sorted as follows. + * o First go inode nodes - they are sorted in descending length order. + * o Then go directory entry nodes - they are sorted in hash order, which + * should supposedly optimize 'readdir()'. Direntry nodes with lower parent + * inode number go before direntry nodes with higher parent inode number, + * and direntry nodes with lower name hash values go before direntry nodes + * with higher name hash values. * - * This function returns zero in case of success, %-EAGAIN if commit is - * required, and other negative error codes in case of other failures. + * This function returns zero in case of success and a negative error code in + * case of failure. */ -static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb) +static int sort_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb, + struct list_head *nondata, int *min) { struct ubifs_scan_node *snod, *tmp; - struct list_head data, large, medium, small; - struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf; - int avail, err, min = INT_MAX; - unsigned int blk = 0; - ino_t inum = 0; - INIT_LIST_HEAD(&data); - INIT_LIST_HEAD(&large); - INIT_LIST_HEAD(&medium); - INIT_LIST_HEAD(&small); + *min = INT_MAX; - while (!list_empty(&sleb->nodes)) { - struct list_head *lst = sleb->nodes.next; - - snod = list_entry(lst, struct ubifs_scan_node, list); + /* Separate data nodes and non-data nodes */ + list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) { + int err; ubifs_assert(snod->type != UBIFS_IDX_NODE); ubifs_assert(snod->type != UBIFS_REF_NODE); @@ -201,53 +332,72 @@ static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb) err = ubifs_tnc_has_node(c, &snod->key, 0, sleb->lnum, snod->offs, 0); if (err < 0) - goto out; + return err; - list_del(lst); if (!err) { /* The node is obsolete, remove it from the list */ + list_del(&snod->list); kfree(snod); continue; } - /* - * Sort the list of nodes so that data nodes go first, large - * nodes go second, and small nodes go last. - */ - if (key_type(c, &snod->key) == UBIFS_DATA_KEY) { - if (inum != key_inum(c, &snod->key)) { - if (inum) { - /* - * Try to move data nodes from the same - * inode together. - */ - err = joinup(c, sleb, inum, blk, &data); - if (err) - goto out; - } - inum = key_inum(c, &snod->key); - blk = key_block(c, &snod->key); - } - list_add_tail(lst, &data); - } else if (snod->len > MEDIUM_NODE_WM) - list_add_tail(lst, &large); - else if (snod->len > SMALL_NODE_WM) - list_add_tail(lst, &medium); - else - list_add_tail(lst, &small); - - /* And find the smallest node */ - if (snod->len < min) - min = snod->len; + if (snod->len < *min) + *min = snod->len; + + if (key_type(c, &snod->key) != UBIFS_DATA_KEY) + list_move_tail(&snod->list, nondata); } - /* - * Join the tree lists so that we'd have one roughly sorted list - * ('large' will be the head of the joined list). - */ - list_splice(&data, &large); - list_splice(&medium, large.prev); - list_splice(&small, large.prev); + /* Sort data and non-data nodes */ + list_sort(c, &sleb->nodes, &data_nodes_cmp); + list_sort(c, nondata, &nondata_nodes_cmp); + return 0; +} + +/** + * move_node - move a node. + * @c: UBIFS file-system description object + * @sleb: describes the LEB to move nodes from + * @snod: the mode to move + * @wbuf: write-buffer to move node to + * + * This function moves node @snod to @wbuf, changes TNC correspondingly, and + * destroys @snod. Returns zero in case of success and a negative error code in + * case of failure. + */ +static int move_node(struct ubifs_info *c, struct ubifs_scan_leb *sleb, + struct ubifs_scan_node *snod, struct ubifs_wbuf *wbuf) +{ + int err, new_lnum = wbuf->lnum, new_offs = wbuf->offs + wbuf->used; + + cond_resched(); + err = ubifs_wbuf_write_nolock(wbuf, snod->node, snod->len); + if (err) + return err; + + err = ubifs_tnc_replace(c, &snod->key, sleb->lnum, + snod->offs, new_lnum, new_offs, + snod->len); + list_del(&snod->list); + kfree(snod); + return err; +} + +/** + * move_nodes - move nodes. + * @c: UBIFS file-system description object + * @sleb: describes the LEB to move nodes from + * + * This function moves valid nodes from data LEB described by @sleb to the GC + * journal head. This function returns zero in case of success, %-EAGAIN if + * commit is required, and other negative error codes in case of other + * failures. + */ +static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb) +{ + int err, min; + LIST_HEAD(nondata); + struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf; if (wbuf->lnum == -1) { /* @@ -256,42 +406,59 @@ static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb) */ err = switch_gc_head(c); if (err) - goto out; + return err; } + err = sort_nodes(c, sleb, &nondata, &min); + if (err) + goto out; + /* Write nodes to their new location. Use the first-fit strategy */ while (1) { - avail = c->leb_size - wbuf->offs - wbuf->used; - list_for_each_entry_safe(snod, tmp, &large, list) { - int new_lnum, new_offs; + int avail; + struct ubifs_scan_node *snod, *tmp; + + /* Move data nodes */ + list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) { + avail = c->leb_size - wbuf->offs - wbuf->used; + if (snod->len > avail) + /* + * Do not skip data nodes in order to optimize + * bulk-read. + */ + break; + + err = move_node(c, sleb, snod, wbuf); + if (err) + goto out; + } + /* Move non-data nodes */ + list_for_each_entry_safe(snod, tmp, &nondata, list) { + avail = c->leb_size - wbuf->offs - wbuf->used; if (avail < min) break; - if (snod->len > avail) - /* This node does not fit */ + if (snod->len > avail) { + /* + * Keep going only if this is an inode with + * some data. Otherwise stop and switch the GC + * head. IOW, we assume that data-less inode + * nodes and direntry nodes are roughly of the + * same size. + */ + if (key_type(c, &snod->key) == UBIFS_DENT_KEY || + snod->len == UBIFS_INO_NODE_SZ) + break; continue; + } - cond_resched(); - - new_lnum = wbuf->lnum; - new_offs = wbuf->offs + wbuf->used; - err = ubifs_wbuf_write_nolock(wbuf, snod->node, - snod->len); + err = move_node(c, sleb, snod, wbuf); if (err) goto out; - err = ubifs_tnc_replace(c, &snod->key, sleb->lnum, - snod->offs, new_lnum, new_offs, - snod->len); - if (err) - goto out; - - avail = c->leb_size - wbuf->offs - wbuf->used; - list_del(&snod->list); - kfree(snod); } - if (list_empty(&large)) + if (list_empty(&sleb->nodes) && list_empty(&nondata)) break; /* @@ -306,10 +473,7 @@ static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb) return 0; out: - list_for_each_entry_safe(snod, tmp, &large, list) { - list_del(&snod->list); - kfree(snod); - } + list_splice_tail(&nondata, &sleb->nodes); return err; } diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index a11ca0958a23..64b5f3a309f5 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -114,7 +114,7 @@ static inline void zero_trun_node_unused(struct ubifs_trun_node *trun) */ static int reserve_space(struct ubifs_info *c, int jhead, int len) { - int err = 0, err1, retries = 0, avail, lnum, offs, free, squeeze; + int err = 0, err1, retries = 0, avail, lnum, offs, squeeze; struct ubifs_wbuf *wbuf = &c->jheads[jhead].wbuf; /* @@ -139,10 +139,9 @@ again: * Write buffer wasn't seek'ed or there is no enough space - look for an * LEB with some empty space. */ - lnum = ubifs_find_free_space(c, len, &free, squeeze); + lnum = ubifs_find_free_space(c, len, &offs, squeeze); if (lnum >= 0) { /* Found an LEB, add it to the journal head */ - offs = c->leb_size - free; err = ubifs_add_bud_to_log(c, jhead, lnum, offs); if (err) goto out_return; @@ -1366,7 +1365,7 @@ out_ro: * @host: host inode * * This function writes the updated version of an extended attribute inode and - * the host inode tho the journal (to the base head). The host inode is written + * the host inode to the journal (to the base head). The host inode is written * after the extended attribute inode in order to guarantee that the extended * attribute will be flushed when the inode is synchronized by 'fsync()' and * consequently, the write-buffer is synchronized. This function returns zero diff --git a/fs/ubifs/key.h b/fs/ubifs/key.h index efb3430a2581..5fa27ea031ba 100644 --- a/fs/ubifs/key.h +++ b/fs/ubifs/key.h @@ -381,8 +381,8 @@ static inline ino_t key_inum_flash(const struct ubifs_info *c, const void *k) * @c: UBIFS file-system description object * @key: the key to get hash from */ -static inline int key_hash(const struct ubifs_info *c, - const union ubifs_key *key) +static inline uint32_t key_hash(const struct ubifs_info *c, + const union ubifs_key *key) { return key->u32[1] & UBIFS_S_KEY_HASH_MASK; } @@ -392,7 +392,7 @@ static inline int key_hash(const struct ubifs_info *c, * @c: UBIFS file-system description object * @k: the key to get hash from */ -static inline int key_hash_flash(const struct ubifs_info *c, const void *k) +static inline uint32_t key_hash_flash(const struct ubifs_info *c, const void *k) { const union ubifs_key *key = k; diff --git a/fs/ubifs/log.c b/fs/ubifs/log.c index 3e0aa7367556..56e33772a1ee 100644 --- a/fs/ubifs/log.c +++ b/fs/ubifs/log.c @@ -239,7 +239,7 @@ int ubifs_add_bud_to_log(struct ubifs_info *c, int jhead, int lnum, int offs) } /* - * Make sure the the amount of space in buds will not exceed + * Make sure the amount of space in buds will not exceed the * 'c->max_bud_bytes' limit, because we want to guarantee mount time * limits. * @@ -367,7 +367,6 @@ static void remove_buds(struct ubifs_info *c) bud->jhead, c->leb_size - bud->start, c->cmt_bud_bytes); rb_erase(p1, &c->buds); - list_del(&bud->list); /* * If the commit does not finish, the recovery will need * to replay the journal, in which case the old buds @@ -375,7 +374,7 @@ static void remove_buds(struct ubifs_info *c) * commit i.e. do not allow them to be garbage * collected. */ - list_add(&bud->list, &c->old_buds); + list_move(&bud->list, &c->old_buds); } } spin_unlock(&c->buds_lock); diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c index 3216a1f277f8..8cbfb8248025 100644 --- a/fs/ubifs/lpt_commit.c +++ b/fs/ubifs/lpt_commit.c @@ -229,7 +229,7 @@ static int layout_cnodes(struct ubifs_info *c) while (offs + len > c->leb_size) { alen = ALIGN(offs, c->min_io_size); upd_ltab(c, lnum, c->leb_size - alen, alen - offs); - dbg_chk_lpt_sz(c, 2, alen - offs); + dbg_chk_lpt_sz(c, 2, c->leb_size - offs); err = alloc_lpt_leb(c, &lnum); if (err) goto no_space; @@ -272,7 +272,7 @@ static int layout_cnodes(struct ubifs_info *c) if (offs + c->lsave_sz > c->leb_size) { alen = ALIGN(offs, c->min_io_size); upd_ltab(c, lnum, c->leb_size - alen, alen - offs); - dbg_chk_lpt_sz(c, 2, alen - offs); + dbg_chk_lpt_sz(c, 2, c->leb_size - offs); err = alloc_lpt_leb(c, &lnum); if (err) goto no_space; @@ -292,7 +292,7 @@ static int layout_cnodes(struct ubifs_info *c) if (offs + c->ltab_sz > c->leb_size) { alen = ALIGN(offs, c->min_io_size); upd_ltab(c, lnum, c->leb_size - alen, alen - offs); - dbg_chk_lpt_sz(c, 2, alen - offs); + dbg_chk_lpt_sz(c, 2, c->leb_size - offs); err = alloc_lpt_leb(c, &lnum); if (err) goto no_space; @@ -416,14 +416,12 @@ static int write_cnodes(struct ubifs_info *c) alen, UBI_SHORTTERM); if (err) return err; - dbg_chk_lpt_sz(c, 4, alen - wlen); } - dbg_chk_lpt_sz(c, 2, 0); + dbg_chk_lpt_sz(c, 2, c->leb_size - offs); err = realloc_lpt_leb(c, &lnum); if (err) goto no_space; - offs = 0; - from = 0; + offs = from = 0; ubifs_assert(lnum >= c->lpt_first && lnum <= c->lpt_last); err = ubifs_leb_unmap(c, lnum); @@ -477,11 +475,11 @@ static int write_cnodes(struct ubifs_info *c) UBI_SHORTTERM); if (err) return err; - dbg_chk_lpt_sz(c, 2, alen - wlen); + dbg_chk_lpt_sz(c, 2, c->leb_size - offs); err = realloc_lpt_leb(c, &lnum); if (err) goto no_space; - offs = 0; + offs = from = 0; ubifs_assert(lnum >= c->lpt_first && lnum <= c->lpt_last); err = ubifs_leb_unmap(c, lnum); @@ -504,11 +502,11 @@ static int write_cnodes(struct ubifs_info *c) UBI_SHORTTERM); if (err) return err; - dbg_chk_lpt_sz(c, 2, alen - wlen); + dbg_chk_lpt_sz(c, 2, c->leb_size - offs); err = realloc_lpt_leb(c, &lnum); if (err) goto no_space; - offs = 0; + offs = from = 0; ubifs_assert(lnum >= c->lpt_first && lnum <= c->lpt_last); err = ubifs_leb_unmap(c, lnum); @@ -1756,10 +1754,16 @@ int dbg_chk_lpt_free_spc(struct ubifs_info *c) /** * dbg_chk_lpt_sz - check LPT does not write more than LPT size. * @c: the UBIFS file-system description object - * @action: action + * @action: what to do * @len: length written * * This function returns %0 on success and a negative error code on failure. + * The @action argument may be one of: + * o %0 - LPT debugging checking starts, initialize debugging variables; + * o %1 - wrote an LPT node, increase LPT size by @len bytes; + * o %2 - switched to a different LEB and wasted @len bytes; + * o %3 - check that we've written the right number of bytes. + * o %4 - wasted @len bytes; */ int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len) { @@ -1917,12 +1921,12 @@ static void dump_lpt_leb(const struct ubifs_info *c, int lnum) lnum, offs); err = ubifs_unpack_nnode(c, buf, &nnode); for (i = 0; i < UBIFS_LPT_FANOUT; i++) { - printk("%d:%d", nnode.nbranch[i].lnum, + printk(KERN_CONT "%d:%d", nnode.nbranch[i].lnum, nnode.nbranch[i].offs); if (i != UBIFS_LPT_FANOUT - 1) - printk(", "); + printk(KERN_CONT ", "); } - printk("\n"); + printk(KERN_CONT "\n"); break; } case UBIFS_LPT_LTAB: diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c index 90acac603e63..10662975d2ef 100644 --- a/fs/ubifs/recovery.c +++ b/fs/ubifs/recovery.c @@ -425,59 +425,35 @@ static void clean_buf(const struct ubifs_info *c, void **buf, int lnum, * @lnum: LEB number of the LEB from which @buf was read * @offs: offset from which @buf was read * - * This function scans @buf for more nodes and returns %0 is a node is found and - * %1 if no more nodes are found. + * This function ensures that the corrupted node at @offs is the last thing + * written to a LEB. This function returns %1 if more data is not found and + * %0 if more data is found. */ static int no_more_nodes(const struct ubifs_info *c, void *buf, int len, int lnum, int offs) { - int skip, next_offs = 0; + struct ubifs_ch *ch = buf; + int skip, dlen = le32_to_cpu(ch->len); - if (len > UBIFS_DATA_NODE_SZ) { - struct ubifs_ch *ch = buf; - int dlen = le32_to_cpu(ch->len); - - if (ch->node_type == UBIFS_DATA_NODE && dlen >= UBIFS_CH_SZ && - dlen <= UBIFS_MAX_DATA_NODE_SZ) - /* The corrupt node looks like a data node */ - next_offs = ALIGN(offs + dlen, 8); - } - - if (c->min_io_size == 1) - skip = 8; - else - skip = ALIGN(offs + 1, c->min_io_size) - offs; - - offs += skip; - buf += skip; - len -= skip; - while (len > 8) { - struct ubifs_ch *ch = buf; - uint32_t magic = le32_to_cpu(ch->magic); - int ret; - - if (magic == UBIFS_NODE_MAGIC) { - ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 1); - if (ret == SCANNED_A_NODE || ret > 0) { - /* - * There is a small chance this is just data in - * a data node, so check that possibility. e.g. - * this is part of a file that itself contains - * a UBIFS image. - */ - if (next_offs && offs + le32_to_cpu(ch->len) <= - next_offs) - continue; - dbg_rcvry("unexpected node at %d:%d", lnum, - offs); - return 0; - } - } - offs += 8; - buf += 8; - len -= 8; + /* Check for empty space after the corrupt node's common header */ + skip = ALIGN(offs + UBIFS_CH_SZ, c->min_io_size) - offs; + if (is_empty(buf + skip, len - skip)) + return 1; + /* + * The area after the common header size is not empty, so the common + * header must be intact. Check it. + */ + if (ubifs_check_node(c, buf, lnum, offs, 1, 0) != -EUCLEAN) { + dbg_rcvry("unexpected bad common header at %d:%d", lnum, offs); + return 0; } - return 1; + /* Now we know the corrupt node's length we can skip over it */ + skip = ALIGN(offs + dlen, c->min_io_size) - offs; + /* After which there should be empty space */ + if (is_empty(buf + skip, len - skip)) + return 1; + dbg_rcvry("unexpected data at %d:%d", lnum, offs + skip); + return 0; } /** diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c index ce42a7b0ca5a..11cc80125a49 100644 --- a/fs/ubifs/replay.c +++ b/fs/ubifs/replay.c @@ -143,7 +143,7 @@ static int set_bud_lprops(struct ubifs_info *c, struct replay_entry *r) dirty -= c->leb_size - lp->free; /* * If the replay order was perfect the dirty space would now be - * zero. The order is not perfect because the the journal heads + * zero. The order is not perfect because the journal heads * race with each other. This is not a problem but is does mean * that the dirty space may temporarily exceed c->leb_size * during the replay. diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c index e070c643d1bb..57085e43320f 100644 --- a/fs/ubifs/sb.c +++ b/fs/ubifs/sb.c @@ -193,6 +193,7 @@ static int create_default_filesystem(struct ubifs_info *c) if (tmp64 > DEFAULT_MAX_RP_SIZE) tmp64 = DEFAULT_MAX_RP_SIZE; sup->rp_size = cpu_to_le64(tmp64); + sup->ro_compat_version = cpu_to_le32(UBIFS_RO_COMPAT_VERSION); err = ubifs_write_node(c, sup, UBIFS_SB_NODE_SZ, 0, 0, UBI_LONGTERM); kfree(sup); @@ -532,17 +533,39 @@ int ubifs_read_superblock(struct ubifs_info *c) if (IS_ERR(sup)) return PTR_ERR(sup); + c->fmt_version = le32_to_cpu(sup->fmt_version); + c->ro_compat_version = le32_to_cpu(sup->ro_compat_version); + /* * The software supports all previous versions but not future versions, * due to the unavailability of time-travelling equipment. */ - c->fmt_version = le32_to_cpu(sup->fmt_version); if (c->fmt_version > UBIFS_FORMAT_VERSION) { - ubifs_err("on-flash format version is %d, but software only " - "supports up to version %d", c->fmt_version, - UBIFS_FORMAT_VERSION); - err = -EINVAL; - goto out; + struct super_block *sb = c->vfs_sb; + int mounting_ro = sb->s_flags & MS_RDONLY; + + ubifs_assert(!c->ro_media || mounting_ro); + if (!mounting_ro || + c->ro_compat_version > UBIFS_RO_COMPAT_VERSION) { + ubifs_err("on-flash format version is w%d/r%d, but " + "software only supports up to version " + "w%d/r%d", c->fmt_version, + c->ro_compat_version, UBIFS_FORMAT_VERSION, + UBIFS_RO_COMPAT_VERSION); + if (c->ro_compat_version <= UBIFS_RO_COMPAT_VERSION) { + ubifs_msg("only R/O mounting is possible"); + err = -EROFS; + } else + err = -EINVAL; + goto out; + } + + /* + * The FS is mounted R/O, and the media format is + * R/O-compatible with the UBIFS implementation, so we can + * mount. + */ + c->rw_incompat = 1; } if (c->fmt_version < 3) { @@ -623,7 +646,6 @@ int ubifs_read_superblock(struct ubifs_info *c) c->main_lebs = c->leb_cnt - UBIFS_SB_LEBS - UBIFS_MST_LEBS; c->main_lebs -= c->log_lebs + c->lpt_lebs + c->orph_lebs; c->main_first = c->leb_cnt - c->main_lebs; - c->report_rp_size = ubifs_reported_space(c, c->rp_size); err = validate_sb(c, sup); out: diff --git a/fs/ubifs/shrinker.c b/fs/ubifs/shrinker.c index e7bab52a1410..02feb59cefca 100644 --- a/fs/ubifs/shrinker.c +++ b/fs/ubifs/shrinker.c @@ -206,8 +206,7 @@ static int shrink_tnc_trees(int nr, int age, int *contention) * Move this one to the end of the list to provide some * fairness. */ - list_del(&c->infos_list); - list_add_tail(&c->infos_list, &ubifs_infos); + list_move_tail(&c->infos_list, &ubifs_infos); mutex_unlock(&c->umount_mutex); if (freed >= nr) break; @@ -263,8 +262,7 @@ static int kick_a_thread(void) } if (i == 1) { - list_del(&c->infos_list); - list_add_tail(&c->infos_list, &ubifs_infos); + list_move_tail(&c->infos_list, &ubifs_infos); spin_unlock(&ubifs_infos_lock); ubifs_request_bg_commit(c); diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index c5c98355459a..faa44f90608a 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -421,8 +421,8 @@ static int ubifs_show_options(struct seq_file *s, struct vfsmount *mnt) seq_printf(s, ",no_chk_data_crc"); if (c->mount_opts.override_compr) { - seq_printf(s, ",compr="); - seq_printf(s, ubifs_compr_name(c->mount_opts.compr_type)); + seq_printf(s, ",compr=%s", + ubifs_compr_name(c->mount_opts.compr_type)); } return 0; @@ -700,6 +700,8 @@ static int init_constants_sb(struct ubifs_info *c) if (err) return err; + /* Initialize effective LEB size used in budgeting calculations */ + c->idx_leb_size = c->leb_size - c->max_idx_node_sz; return 0; } @@ -716,6 +718,7 @@ static void init_constants_master(struct ubifs_info *c) long long tmp64; c->min_idx_lebs = ubifs_calc_min_idx_lebs(c); + c->report_rp_size = ubifs_reported_space(c, c->rp_size); /* * Calculate total amount of FS blocks. This number is not used @@ -1201,7 +1204,7 @@ static int mount_ubifs(struct ubifs_info *c) goto out_cbuf; /* Create background thread */ - c->bgt = kthread_create(ubifs_bg_thread, c, c->bgt_name); + c->bgt = kthread_create(ubifs_bg_thread, c, "%s", c->bgt_name); if (IS_ERR(c->bgt)) { err = PTR_ERR(c->bgt); c->bgt = NULL; @@ -1318,11 +1321,15 @@ static int mount_ubifs(struct ubifs_info *c) else { c->need_recovery = 0; ubifs_msg("recovery completed"); - /* GC LEB has to be empty and taken at this point */ - ubifs_assert(c->lst.taken_empty_lebs == 1); + /* + * GC LEB has to be empty and taken at this point. But + * the journal head LEBs may also be accounted as + * "empty taken" if they are empty. + */ + ubifs_assert(c->lst.taken_empty_lebs > 0); } } else - ubifs_assert(c->lst.taken_empty_lebs == 1); + ubifs_assert(c->lst.taken_empty_lebs > 0); err = dbg_check_filesystem(c); if (err) @@ -1344,8 +1351,9 @@ static int mount_ubifs(struct ubifs_info *c) x = (long long)c->log_lebs * c->leb_size + c->max_bud_bytes; ubifs_msg("journal size: %lld bytes (%lld KiB, %lld MiB, %d " "LEBs)", x, x >> 10, x >> 20, c->log_lebs + c->max_bud_cnt); - ubifs_msg("media format: %d (latest is %d)", - c->fmt_version, UBIFS_FORMAT_VERSION); + ubifs_msg("media format: w%d/r%d (latest is w%d/r%d)", + c->fmt_version, c->ro_compat_version, + UBIFS_FORMAT_VERSION, UBIFS_RO_COMPAT_VERSION); ubifs_msg("default compressor: %s", ubifs_compr_name(c->default_compr)); ubifs_msg("reserved for root: %llu bytes (%llu KiB)", c->report_rp_size, c->report_rp_size >> 10); @@ -1485,6 +1493,15 @@ static int ubifs_remount_rw(struct ubifs_info *c) { int err, lnum; + if (c->rw_incompat) { + ubifs_err("the file-system is not R/W-compatible"); + ubifs_msg("on-flash format version is w%d/r%d, but software " + "only supports up to version w%d/r%d", c->fmt_version, + c->ro_compat_version, UBIFS_FORMAT_VERSION, + UBIFS_RO_COMPAT_VERSION); + return -EROFS; + } + mutex_lock(&c->umount_mutex); dbg_save_space_info(c); c->remounting_rw = 1; @@ -1554,7 +1571,7 @@ static int ubifs_remount_rw(struct ubifs_info *c) ubifs_create_buds_lists(c); /* Create background thread */ - c->bgt = kthread_create(ubifs_bg_thread, c, c->bgt_name); + c->bgt = kthread_create(ubifs_bg_thread, c, "%s", c->bgt_name); if (IS_ERR(c->bgt)) { err = PTR_ERR(c->bgt); c->bgt = NULL; @@ -1775,7 +1792,7 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data) c->bu.buf = NULL; } - ubifs_assert(c->lst.taken_empty_lebs == 1); + ubifs_assert(c->lst.taken_empty_lebs > 0); return 0; } diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c index fa28a84c6a1b..f249f7b0d656 100644 --- a/fs/ubifs/tnc.c +++ b/fs/ubifs/tnc.c @@ -1252,7 +1252,7 @@ int ubifs_lookup_level0(struct ubifs_info *c, const union ubifs_key *key, * splitting in the middle of the colliding sequence. Also, when * removing the leftmost key, we would have to correct the key of the * parent node, which would introduce additional complications. Namely, - * if we changed the the leftmost key of the parent znode, the garbage + * if we changed the leftmost key of the parent znode, the garbage * collector would be unable to find it (GC is doing this when GC'ing * indexing LEBs). Although we already have an additional RB-tree where * we save such changed znodes (see 'ins_clr_old_idx_znode()') until diff --git a/fs/ubifs/ubifs-media.h b/fs/ubifs/ubifs-media.h index b25fc36cf72f..3eee07e0c495 100644 --- a/fs/ubifs/ubifs-media.h +++ b/fs/ubifs/ubifs-media.h @@ -36,9 +36,31 @@ /* UBIFS node magic number (must not have the padding byte first or last) */ #define UBIFS_NODE_MAGIC 0x06101831 -/* UBIFS on-flash format version */ +/* + * UBIFS on-flash format version. This version is increased when the on-flash + * format is changing. If this happens, UBIFS is will support older versions as + * well. But older UBIFS code will not support newer formats. Format changes + * will be rare and only when absolutely necessary, e.g. to fix a bug or to add + * a new feature. + * + * UBIFS went into mainline kernel with format version 4. The older formats + * were development formats. + */ #define UBIFS_FORMAT_VERSION 4 +/* + * Read-only compatibility version. If the UBIFS format is changed, older UBIFS + * implementations will not be able to mount newer formats in read-write mode. + * However, depending on the change, it may be possible to mount newer formats + * in R/O mode. This is indicated by the R/O compatibility version which is + * stored in the super-block. + * + * This is needed to support boot-loaders which only need R/O mounting. With + * this flag it is possible to do UBIFS format changes without a need to update + * boot-loaders. + */ +#define UBIFS_RO_COMPAT_VERSION 0 + /* Minimum logical eraseblock size in bytes */ #define UBIFS_MIN_LEB_SZ (15*1024) @@ -53,7 +75,7 @@ /* * If compressed data length is less than %UBIFS_MIN_COMPRESS_DIFF bytes - * shorter than uncompressed data length, UBIFS preferes to leave this data + * shorter than uncompressed data length, UBIFS prefers to leave this data * node uncompress, because it'll be read faster. */ #define UBIFS_MIN_COMPRESS_DIFF 64 @@ -586,6 +608,7 @@ struct ubifs_pad_node { * @padding2: reserved for future, zeroes * @time_gran: time granularity in nanoseconds * @uuid: UUID generated when the file system image was created + * @ro_compat_version: UBIFS R/O compatibility version */ struct ubifs_sb_node { struct ubifs_ch ch; @@ -612,7 +635,8 @@ struct ubifs_sb_node { __le64 rp_size; __le32 time_gran; __u8 uuid[16]; - __u8 padding2[3972]; + __le32 ro_compat_version; + __u8 padding2[3968]; } __attribute__ ((packed)); /** diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 039a68bee29a..0a8341e14088 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -934,6 +934,7 @@ struct ubifs_debug_info; * by @commit_sem * @cnt_lock: protects @highest_inum and @max_sqnum counters * @fmt_version: UBIFS on-flash format version + * @ro_compat_version: R/O compatibility version * @uuid: UUID from super block * * @lhead_lnum: log head logical eraseblock number @@ -966,6 +967,7 @@ struct ubifs_debug_info; * recovery) * @bulk_read: enable bulk-reads * @default_compr: default compression algorithm (%UBIFS_COMPR_LZO, etc) + * @rw_incompat: the media is not R/W compatible * * @tnc_mutex: protects the Tree Node Cache (TNC), @zroot, @cnext, @enext, and * @calc_idx_sz @@ -1015,6 +1017,8 @@ struct ubifs_debug_info; * @min_io_shift: number of bits in @min_io_size minus one * @leb_size: logical eraseblock size in bytes * @half_leb_size: half LEB size + * @idx_leb_size: how many bytes of an LEB are effectively available when it is + * used to store indexing nodes (@leb_size - @max_idx_node_sz) * @leb_cnt: count of logical eraseblocks * @max_leb_cnt: maximum count of logical eraseblocks * @old_leb_cnt: count of logical eraseblocks before re-size @@ -1132,8 +1136,8 @@ struct ubifs_debug_info; * previous commit start * @uncat_list: list of un-categorized LEBs * @empty_list: list of empty LEBs - * @freeable_list: list of freeable non-index LEBs (free + dirty == leb_size) - * @frdi_idx_list: list of freeable index LEBs (free + dirty == leb_size) + * @freeable_list: list of freeable non-index LEBs (free + dirty == @leb_size) + * @frdi_idx_list: list of freeable index LEBs (free + dirty == @leb_size) * @freeable_cnt: number of freeable LEBs in @freeable_list * * @ltab_lnum: LEB number of LPT's own lprops table @@ -1177,6 +1181,7 @@ struct ubifs_info { unsigned long long cmt_no; spinlock_t cnt_lock; int fmt_version; + int ro_compat_version; unsigned char uuid[16]; int lhead_lnum; @@ -1205,6 +1210,7 @@ struct ubifs_info { unsigned int no_chk_data_crc:1; unsigned int bulk_read:1; unsigned int default_compr:2; + unsigned int rw_incompat:1; struct mutex tnc_mutex; struct ubifs_zbranch zroot; @@ -1253,6 +1259,7 @@ struct ubifs_info { int min_io_shift; int leb_size; int half_leb_size; + int idx_leb_size; int leb_cnt; int max_leb_cnt; int old_leb_cnt; @@ -1500,7 +1507,7 @@ long long ubifs_reported_space(const struct ubifs_info *c, long long free); long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs); /* find.c */ -int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *free, +int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *offs, int squeeze); int ubifs_find_free_leb_for_idx(struct ubifs_info *c); int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp, diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c index 2bb788a2acb1..e48e9a3af763 100644 --- a/fs/udf/balloc.c +++ b/fs/udf/balloc.c @@ -87,12 +87,12 @@ static int read_block_bitmap(struct super_block *sb, { struct buffer_head *bh = NULL; int retval = 0; - kernel_lb_addr loc; + struct kernel_lb_addr loc; loc.logicalBlockNum = bitmap->s_extPosition; loc.partitionReferenceNum = UDF_SB(sb)->s_partition; - bh = udf_tread(sb, udf_get_lb_pblock(sb, loc, block)); + bh = udf_tread(sb, udf_get_lb_pblock(sb, &loc, block)); if (!bh) retval = -EIO; @@ -140,27 +140,29 @@ static inline int load_block_bitmap(struct super_block *sb, return slot; } -static bool udf_add_free_space(struct udf_sb_info *sbi, - u16 partition, u32 cnt) +static void udf_add_free_space(struct super_block *sb, u16 partition, u32 cnt) { + struct udf_sb_info *sbi = UDF_SB(sb); struct logicalVolIntegrityDesc *lvid; - if (sbi->s_lvid_bh == NULL) - return false; + if (!sbi->s_lvid_bh) + return; lvid = (struct logicalVolIntegrityDesc *)sbi->s_lvid_bh->b_data; le32_add_cpu(&lvid->freeSpaceTable[partition], cnt); - return true; + udf_updated_lvid(sb); } static void udf_bitmap_free_blocks(struct super_block *sb, struct inode *inode, struct udf_bitmap *bitmap, - kernel_lb_addr bloc, uint32_t offset, + struct kernel_lb_addr *bloc, + uint32_t offset, uint32_t count) { struct udf_sb_info *sbi = UDF_SB(sb); struct buffer_head *bh = NULL; + struct udf_part_map *partmap; unsigned long block; unsigned long block_group; unsigned long bit; @@ -169,17 +171,17 @@ static void udf_bitmap_free_blocks(struct super_block *sb, unsigned long overflow; mutex_lock(&sbi->s_alloc_mutex); - if (bloc.logicalBlockNum < 0 || - (bloc.logicalBlockNum + count) > - sbi->s_partmaps[bloc.partitionReferenceNum].s_partition_len) { + partmap = &sbi->s_partmaps[bloc->partitionReferenceNum]; + if (bloc->logicalBlockNum < 0 || + (bloc->logicalBlockNum + count) > + partmap->s_partition_len) { udf_debug("%d < %d || %d + %d > %d\n", - bloc.logicalBlockNum, 0, bloc.logicalBlockNum, count, - sbi->s_partmaps[bloc.partitionReferenceNum]. - s_partition_len); + bloc->logicalBlockNum, 0, bloc->logicalBlockNum, + count, partmap->s_partition_len); goto error_return; } - block = bloc.logicalBlockNum + offset + + block = bloc->logicalBlockNum + offset + (sizeof(struct spaceBitmapDesc) << 3); do { @@ -207,7 +209,7 @@ static void udf_bitmap_free_blocks(struct super_block *sb, } else { if (inode) vfs_dq_free_block(inode, 1); - udf_add_free_space(sbi, sbi->s_partition, 1); + udf_add_free_space(sb, sbi->s_partition, 1); } } mark_buffer_dirty(bh); @@ -218,9 +220,6 @@ static void udf_bitmap_free_blocks(struct super_block *sb, } while (overflow); error_return: - sb->s_dirt = 1; - if (sbi->s_lvid_bh) - mark_buffer_dirty(sbi->s_lvid_bh); mutex_unlock(&sbi->s_alloc_mutex); } @@ -277,9 +276,7 @@ static int udf_bitmap_prealloc_blocks(struct super_block *sb, } while (block_count > 0); out: - if (udf_add_free_space(sbi, partition, -alloc_count)) - mark_buffer_dirty(sbi->s_lvid_bh); - sb->s_dirt = 1; + udf_add_free_space(sb, partition, -alloc_count); mutex_unlock(&sbi->s_alloc_mutex); return alloc_count; } @@ -409,9 +406,7 @@ got_block: mark_buffer_dirty(bh); - if (udf_add_free_space(sbi, partition, -1)) - mark_buffer_dirty(sbi->s_lvid_bh); - sb->s_dirt = 1; + udf_add_free_space(sb, partition, -1); mutex_unlock(&sbi->s_alloc_mutex); *err = 0; return newblock; @@ -425,26 +420,28 @@ error_return: static void udf_table_free_blocks(struct super_block *sb, struct inode *inode, struct inode *table, - kernel_lb_addr bloc, uint32_t offset, + struct kernel_lb_addr *bloc, + uint32_t offset, uint32_t count) { struct udf_sb_info *sbi = UDF_SB(sb); + struct udf_part_map *partmap; uint32_t start, end; uint32_t elen; - kernel_lb_addr eloc; + struct kernel_lb_addr eloc; struct extent_position oepos, epos; int8_t etype; int i; struct udf_inode_info *iinfo; mutex_lock(&sbi->s_alloc_mutex); - if (bloc.logicalBlockNum < 0 || - (bloc.logicalBlockNum + count) > - sbi->s_partmaps[bloc.partitionReferenceNum].s_partition_len) { + partmap = &sbi->s_partmaps[bloc->partitionReferenceNum]; + if (bloc->logicalBlockNum < 0 || + (bloc->logicalBlockNum + count) > + partmap->s_partition_len) { udf_debug("%d < %d || %d + %d > %d\n", bloc.logicalBlockNum, 0, bloc.logicalBlockNum, count, - sbi->s_partmaps[bloc.partitionReferenceNum]. - s_partition_len); + partmap->s_partition_len); goto error_return; } @@ -453,11 +450,10 @@ static void udf_table_free_blocks(struct super_block *sb, could occure, but.. oh well */ if (inode) vfs_dq_free_block(inode, count); - if (udf_add_free_space(sbi, sbi->s_partition, count)) - mark_buffer_dirty(sbi->s_lvid_bh); + udf_add_free_space(sb, sbi->s_partition, count); - start = bloc.logicalBlockNum + offset; - end = bloc.logicalBlockNum + offset + count - 1; + start = bloc->logicalBlockNum + offset; + end = bloc->logicalBlockNum + offset + count - 1; epos.offset = oepos.offset = sizeof(struct unallocSpaceEntry); elen = 0; @@ -483,7 +479,7 @@ static void udf_table_free_blocks(struct super_block *sb, start += count; count = 0; } - udf_write_aext(table, &oepos, eloc, elen, 1); + udf_write_aext(table, &oepos, &eloc, elen, 1); } else if (eloc.logicalBlockNum == (end + 1)) { if ((0x3FFFFFFF - elen) < (count << sb->s_blocksize_bits)) { @@ -502,7 +498,7 @@ static void udf_table_free_blocks(struct super_block *sb, end -= count; count = 0; } - udf_write_aext(table, &oepos, eloc, elen, 1); + udf_write_aext(table, &oepos, &eloc, elen, 1); } if (epos.bh != oepos.bh) { @@ -532,8 +528,8 @@ static void udf_table_free_blocks(struct super_block *sb, */ int adsize; - short_ad *sad = NULL; - long_ad *lad = NULL; + struct short_ad *sad = NULL; + struct long_ad *lad = NULL; struct allocExtDesc *aed; eloc.logicalBlockNum = start; @@ -541,9 +537,9 @@ static void udf_table_free_blocks(struct super_block *sb, (count << sb->s_blocksize_bits); if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) - adsize = sizeof(short_ad); + adsize = sizeof(struct short_ad); else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) - adsize = sizeof(long_ad); + adsize = sizeof(struct long_ad); else { brelse(oepos.bh); brelse(epos.bh); @@ -563,7 +559,7 @@ static void udf_table_free_blocks(struct super_block *sb, elen -= sb->s_blocksize; epos.bh = udf_tread(sb, - udf_get_lb_pblock(sb, epos.block, 0)); + udf_get_lb_pblock(sb, &epos.block, 0)); if (!epos.bh) { brelse(oepos.bh); goto error_return; @@ -601,15 +597,15 @@ static void udf_table_free_blocks(struct super_block *sb, if (sbi->s_udfrev >= 0x0200) udf_new_tag(epos.bh->b_data, TAG_IDENT_AED, 3, 1, epos.block.logicalBlockNum, - sizeof(tag)); + sizeof(struct tag)); else udf_new_tag(epos.bh->b_data, TAG_IDENT_AED, 2, 1, epos.block.logicalBlockNum, - sizeof(tag)); + sizeof(struct tag)); switch (iinfo->i_alloc_type) { case ICBTAG_FLAG_AD_SHORT: - sad = (short_ad *)sptr; + sad = (struct short_ad *)sptr; sad->extLength = cpu_to_le32( EXT_NEXT_EXTENT_ALLOCDECS | sb->s_blocksize); @@ -617,7 +613,7 @@ static void udf_table_free_blocks(struct super_block *sb, cpu_to_le32(epos.block.logicalBlockNum); break; case ICBTAG_FLAG_AD_LONG: - lad = (long_ad *)sptr; + lad = (struct long_ad *)sptr; lad->extLength = cpu_to_le32( EXT_NEXT_EXTENT_ALLOCDECS | sb->s_blocksize); @@ -635,7 +631,7 @@ static void udf_table_free_blocks(struct super_block *sb, /* It's possible that stealing the block emptied the extent */ if (elen) { - udf_write_aext(table, &epos, eloc, elen, 1); + udf_write_aext(table, &epos, &eloc, elen, 1); if (!epos.bh) { iinfo->i_lenAlloc += adsize; @@ -653,7 +649,6 @@ static void udf_table_free_blocks(struct super_block *sb, brelse(oepos.bh); error_return: - sb->s_dirt = 1; mutex_unlock(&sbi->s_alloc_mutex); return; } @@ -666,7 +661,7 @@ static int udf_table_prealloc_blocks(struct super_block *sb, struct udf_sb_info *sbi = UDF_SB(sb); int alloc_count = 0; uint32_t elen, adsize; - kernel_lb_addr eloc; + struct kernel_lb_addr eloc; struct extent_position epos; int8_t etype = -1; struct udf_inode_info *iinfo; @@ -677,9 +672,9 @@ static int udf_table_prealloc_blocks(struct super_block *sb, iinfo = UDF_I(table); if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) - adsize = sizeof(short_ad); + adsize = sizeof(struct short_ad); else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) - adsize = sizeof(long_ad); + adsize = sizeof(struct long_ad); else return 0; @@ -707,7 +702,7 @@ static int udf_table_prealloc_blocks(struct super_block *sb, alloc_count = block_count; eloc.logicalBlockNum += alloc_count; elen -= (alloc_count << sb->s_blocksize_bits); - udf_write_aext(table, &epos, eloc, + udf_write_aext(table, &epos, &eloc, (etype << 30) | elen, 1); } else udf_delete_aext(table, epos, eloc, @@ -718,10 +713,8 @@ static int udf_table_prealloc_blocks(struct super_block *sb, brelse(epos.bh); - if (alloc_count && udf_add_free_space(sbi, partition, -alloc_count)) { - mark_buffer_dirty(sbi->s_lvid_bh); - sb->s_dirt = 1; - } + if (alloc_count) + udf_add_free_space(sb, partition, -alloc_count); mutex_unlock(&sbi->s_alloc_mutex); return alloc_count; } @@ -735,7 +728,7 @@ static int udf_table_new_block(struct super_block *sb, uint32_t spread = 0xFFFFFFFF, nspread = 0xFFFFFFFF; uint32_t newblock = 0, adsize; uint32_t elen, goal_elen = 0; - kernel_lb_addr eloc, uninitialized_var(goal_eloc); + struct kernel_lb_addr eloc, uninitialized_var(goal_eloc); struct extent_position epos, goal_epos; int8_t etype; struct udf_inode_info *iinfo = UDF_I(table); @@ -743,9 +736,9 @@ static int udf_table_new_block(struct super_block *sb, *err = -ENOSPC; if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) - adsize = sizeof(short_ad); + adsize = sizeof(struct short_ad); else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) - adsize = sizeof(long_ad); + adsize = sizeof(struct long_ad); else return newblock; @@ -814,46 +807,37 @@ static int udf_table_new_block(struct super_block *sb, } if (goal_elen) - udf_write_aext(table, &goal_epos, goal_eloc, goal_elen, 1); + udf_write_aext(table, &goal_epos, &goal_eloc, goal_elen, 1); else udf_delete_aext(table, goal_epos, goal_eloc, goal_elen); brelse(goal_epos.bh); - if (udf_add_free_space(sbi, partition, -1)) - mark_buffer_dirty(sbi->s_lvid_bh); + udf_add_free_space(sb, partition, -1); - sb->s_dirt = 1; mutex_unlock(&sbi->s_alloc_mutex); *err = 0; return newblock; } -inline void udf_free_blocks(struct super_block *sb, - struct inode *inode, - kernel_lb_addr bloc, uint32_t offset, - uint32_t count) +void udf_free_blocks(struct super_block *sb, struct inode *inode, + struct kernel_lb_addr *bloc, uint32_t offset, + uint32_t count) { - uint16_t partition = bloc.partitionReferenceNum; + uint16_t partition = bloc->partitionReferenceNum; struct udf_part_map *map = &UDF_SB(sb)->s_partmaps[partition]; if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_BITMAP) { - return udf_bitmap_free_blocks(sb, inode, - map->s_uspace.s_bitmap, - bloc, offset, count); + udf_bitmap_free_blocks(sb, inode, map->s_uspace.s_bitmap, + bloc, offset, count); } else if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_TABLE) { - return udf_table_free_blocks(sb, inode, - map->s_uspace.s_table, - bloc, offset, count); + udf_table_free_blocks(sb, inode, map->s_uspace.s_table, + bloc, offset, count); } else if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP) { - return udf_bitmap_free_blocks(sb, inode, - map->s_fspace.s_bitmap, - bloc, offset, count); + udf_bitmap_free_blocks(sb, inode, map->s_fspace.s_bitmap, + bloc, offset, count); } else if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE) { - return udf_table_free_blocks(sb, inode, - map->s_fspace.s_table, - bloc, offset, count); - } else { - return; + udf_table_free_blocks(sb, inode, map->s_fspace.s_table, + bloc, offset, count); } } diff --git a/fs/udf/dir.c b/fs/udf/dir.c index 62dc270c69d1..2efd4d5291b6 100644 --- a/fs/udf/dir.c +++ b/fs/udf/dir.c @@ -51,7 +51,7 @@ static int do_udf_readdir(struct inode *dir, struct file *filp, uint8_t lfi; loff_t size = udf_ext0_offset(dir) + dir->i_size; struct buffer_head *tmp, *bha[16]; - kernel_lb_addr eloc; + struct kernel_lb_addr eloc; uint32_t elen; sector_t offset; int i, num, ret = 0; @@ -80,13 +80,13 @@ static int do_udf_readdir(struct inode *dir, struct file *filp, ret = -ENOENT; goto out; } - block = udf_get_lb_pblock(dir->i_sb, eloc, offset); + block = udf_get_lb_pblock(dir->i_sb, &eloc, offset); if ((++offset << dir->i_sb->s_blocksize_bits) < elen) { if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) - epos.offset -= sizeof(short_ad); + epos.offset -= sizeof(struct short_ad); else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) - epos.offset -= sizeof(long_ad); + epos.offset -= sizeof(struct long_ad); } else { offset = 0; } @@ -101,7 +101,7 @@ static int do_udf_readdir(struct inode *dir, struct file *filp, if (i + offset > (elen >> dir->i_sb->s_blocksize_bits)) i = (elen >> dir->i_sb->s_blocksize_bits) - offset; for (num = 0; i > 0; i--) { - block = udf_get_lb_pblock(dir->i_sb, eloc, offset + i); + block = udf_get_lb_pblock(dir->i_sb, &eloc, offset + i); tmp = udf_tgetblk(dir->i_sb, block); if (tmp && !buffer_uptodate(tmp) && !buffer_locked(tmp)) bha[num++] = tmp; @@ -161,9 +161,9 @@ static int do_udf_readdir(struct inode *dir, struct file *filp, memcpy(fname, "..", flen); dt_type = DT_DIR; } else { - kernel_lb_addr tloc = lelb_to_cpu(cfi.icb.extLocation); + struct kernel_lb_addr tloc = lelb_to_cpu(cfi.icb.extLocation); - iblock = udf_get_lb_pblock(dir->i_sb, tloc, 0); + iblock = udf_get_lb_pblock(dir->i_sb, &tloc, 0); flen = udf_get_filename(dir->i_sb, nameptr, fname, lfi); dt_type = DT_UNKNOWN; } diff --git a/fs/udf/directory.c b/fs/udf/directory.c index 2820f8fcf4cc..1d2c570704c8 100644 --- a/fs/udf/directory.c +++ b/fs/udf/directory.c @@ -20,7 +20,7 @@ #if 0 static uint8_t *udf_filead_read(struct inode *dir, uint8_t *tmpad, - uint8_t ad_size, kernel_lb_addr fe_loc, + uint8_t ad_size, struct kernel_lb_addr fe_loc, int *pos, int *offset, struct buffer_head **bh, int *error) { @@ -75,7 +75,7 @@ struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos, struct udf_fileident_bh *fibh, struct fileIdentDesc *cfi, struct extent_position *epos, - kernel_lb_addr *eloc, uint32_t *elen, + struct kernel_lb_addr *eloc, uint32_t *elen, sector_t *offset) { struct fileIdentDesc *fi; @@ -111,7 +111,7 @@ struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos, (EXT_RECORDED_ALLOCATED >> 30)) return NULL; - block = udf_get_lb_pblock(dir->i_sb, *eloc, *offset); + block = udf_get_lb_pblock(dir->i_sb, eloc, *offset); (*offset)++; @@ -131,7 +131,7 @@ struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos, if (i + *offset > (*elen >> blocksize_bits)) i = (*elen >> blocksize_bits)-*offset; for (num = 0; i > 0; i--) { - block = udf_get_lb_pblock(dir->i_sb, *eloc, + block = udf_get_lb_pblock(dir->i_sb, eloc, *offset + i); tmp = udf_tgetblk(dir->i_sb, block); if (tmp && !buffer_uptodate(tmp) && @@ -169,7 +169,7 @@ struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos, (EXT_RECORDED_ALLOCATED >> 30)) return NULL; - block = udf_get_lb_pblock(dir->i_sb, *eloc, *offset); + block = udf_get_lb_pblock(dir->i_sb, eloc, *offset); (*offset)++; @@ -249,9 +249,9 @@ struct fileIdentDesc *udf_get_fileident(void *buffer, int bufsize, int *offset) } #if 0 -static extent_ad *udf_get_fileextent(void *buffer, int bufsize, int *offset) +static struct extent_ad *udf_get_fileextent(void *buffer, int bufsize, int *offset) { - extent_ad *ext; + struct extent_ad *ext; struct fileEntry *fe; uint8_t *ptr; @@ -274,54 +274,54 @@ static extent_ad *udf_get_fileextent(void *buffer, int bufsize, int *offset) if ((*offset > 0) && (*offset < le32_to_cpu(fe->lengthAllocDescs))) ptr += *offset; - ext = (extent_ad *)ptr; + ext = (struct extent_ad *)ptr; - *offset = *offset + sizeof(extent_ad); + *offset = *offset + sizeof(struct extent_ad); return ext; } #endif -short_ad *udf_get_fileshortad(uint8_t *ptr, int maxoffset, uint32_t *offset, +struct short_ad *udf_get_fileshortad(uint8_t *ptr, int maxoffset, uint32_t *offset, int inc) { - short_ad *sa; + struct short_ad *sa; if ((!ptr) || (!offset)) { printk(KERN_ERR "udf: udf_get_fileshortad() invalidparms\n"); return NULL; } - if ((*offset + sizeof(short_ad)) > maxoffset) + if ((*offset + sizeof(struct short_ad)) > maxoffset) return NULL; else { - sa = (short_ad *)ptr; + sa = (struct short_ad *)ptr; if (sa->extLength == 0) return NULL; } if (inc) - *offset += sizeof(short_ad); + *offset += sizeof(struct short_ad); return sa; } -long_ad *udf_get_filelongad(uint8_t *ptr, int maxoffset, uint32_t *offset, int inc) +struct long_ad *udf_get_filelongad(uint8_t *ptr, int maxoffset, uint32_t *offset, int inc) { - long_ad *la; + struct long_ad *la; if ((!ptr) || (!offset)) { printk(KERN_ERR "udf: udf_get_filelongad() invalidparms\n"); return NULL; } - if ((*offset + sizeof(long_ad)) > maxoffset) + if ((*offset + sizeof(struct long_ad)) > maxoffset) return NULL; else { - la = (long_ad *)ptr; + la = (struct long_ad *)ptr; if (la->extLength == 0) return NULL; } if (inc) - *offset += sizeof(long_ad); + *offset += sizeof(struct long_ad); return la; } diff --git a/fs/udf/ecma_167.h b/fs/udf/ecma_167.h index a0974df82b31..4792b771aa80 100644 --- a/fs/udf/ecma_167.h +++ b/fs/udf/ecma_167.h @@ -38,10 +38,10 @@ #define _ECMA_167_H 1 /* Character set specification (ECMA 167r3 1/7.2.1) */ -typedef struct { +struct charspec { uint8_t charSetType; uint8_t charSetInfo[63]; -} __attribute__ ((packed)) charspec; +} __attribute__ ((packed)); /* Character Set Type (ECMA 167r3 1/7.2.1.1) */ #define CHARSPEC_TYPE_CS0 0x00 /* (1/7.2.2) */ @@ -57,7 +57,7 @@ typedef struct { typedef uint8_t dstring; /* Timestamp (ECMA 167r3 1/7.3) */ -typedef struct { +struct timestamp { __le16 typeAndTimezone; __le16 year; uint8_t month; @@ -68,7 +68,7 @@ typedef struct { uint8_t centiseconds; uint8_t hundredsOfMicroseconds; uint8_t microseconds; -} __attribute__ ((packed)) timestamp; +} __attribute__ ((packed)); /* Type and Time Zone (ECMA 167r3 1/7.3.1) */ #define TIMESTAMP_TYPE_MASK 0xF000 @@ -78,11 +78,11 @@ typedef struct { #define TIMESTAMP_TIMEZONE_MASK 0x0FFF /* Entity identifier (ECMA 167r3 1/7.4) */ -typedef struct { +struct regid { uint8_t flags; uint8_t ident[23]; uint8_t identSuffix[8]; -} __attribute__ ((packed)) regid; +} __attribute__ ((packed)); /* Flags (ECMA 167r3 1/7.4.1) */ #define ENTITYID_FLAGS_DIRTY 0x00 @@ -126,38 +126,38 @@ struct terminatingExtendedAreaDesc { /* Boot Descriptor (ECMA 167r3 2/9.4) */ struct bootDesc { - uint8_t structType; - uint8_t stdIdent[VSD_STD_ID_LEN]; - uint8_t structVersion; - uint8_t reserved1; - regid archType; - regid bootIdent; - __le32 bootExtLocation; - __le32 bootExtLength; - __le64 loadAddress; - __le64 startAddress; - timestamp descCreationDateAndTime; - __le16 flags; - uint8_t reserved2[32]; - uint8_t bootUse[1906]; + uint8_t structType; + uint8_t stdIdent[VSD_STD_ID_LEN]; + uint8_t structVersion; + uint8_t reserved1; + struct regid archType; + struct regid bootIdent; + __le32 bootExtLocation; + __le32 bootExtLength; + __le64 loadAddress; + __le64 startAddress; + struct timestamp descCreationDateAndTime; + __le16 flags; + uint8_t reserved2[32]; + uint8_t bootUse[1906]; } __attribute__ ((packed)); /* Flags (ECMA 167r3 2/9.4.12) */ #define BOOT_FLAGS_ERASE 0x01 /* Extent Descriptor (ECMA 167r3 3/7.1) */ -typedef struct { +struct extent_ad { __le32 extLength; __le32 extLocation; -} __attribute__ ((packed)) extent_ad; +} __attribute__ ((packed)); -typedef struct { +struct kernel_extent_ad { uint32_t extLength; uint32_t extLocation; -} kernel_extent_ad; +}; /* Descriptor Tag (ECMA 167r3 3/7.2) */ -typedef struct { +struct tag { __le16 tagIdent; __le16 descVersion; uint8_t tagChecksum; @@ -166,7 +166,7 @@ typedef struct { __le16 descCRC; __le16 descCRCLength; __le32 tagLocation; -} __attribute__ ((packed)) tag; +} __attribute__ ((packed)); /* Tag Identifier (ECMA 167r3 3/7.2.1) */ #define TAG_IDENT_PVD 0x0001 @@ -190,28 +190,28 @@ struct NSRDesc { /* Primary Volume Descriptor (ECMA 167r3 3/10.1) */ struct primaryVolDesc { - tag descTag; - __le32 volDescSeqNum; - __le32 primaryVolDescNum; - dstring volIdent[32]; - __le16 volSeqNum; - __le16 maxVolSeqNum; - __le16 interchangeLvl; - __le16 maxInterchangeLvl; - __le32 charSetList; - __le32 maxCharSetList; - dstring volSetIdent[128]; - charspec descCharSet; - charspec explanatoryCharSet; - extent_ad volAbstract; - extent_ad volCopyright; - regid appIdent; - timestamp recordingDateAndTime; - regid impIdent; - uint8_t impUse[64]; - __le32 predecessorVolDescSeqLocation; - __le16 flags; - uint8_t reserved[22]; + struct tag descTag; + __le32 volDescSeqNum; + __le32 primaryVolDescNum; + dstring volIdent[32]; + __le16 volSeqNum; + __le16 maxVolSeqNum; + __le16 interchangeLvl; + __le16 maxInterchangeLvl; + __le32 charSetList; + __le32 maxCharSetList; + dstring volSetIdent[128]; + struct charspec descCharSet; + struct charspec explanatoryCharSet; + struct extent_ad volAbstract; + struct extent_ad volCopyright; + struct regid appIdent; + struct timestamp recordingDateAndTime; + struct regid impIdent; + uint8_t impUse[64]; + __le32 predecessorVolDescSeqLocation; + __le16 flags; + uint8_t reserved[22]; } __attribute__ ((packed)); /* Flags (ECMA 167r3 3/10.1.21) */ @@ -219,40 +219,40 @@ struct primaryVolDesc { /* Anchor Volume Descriptor Pointer (ECMA 167r3 3/10.2) */ struct anchorVolDescPtr { - tag descTag; - extent_ad mainVolDescSeqExt; - extent_ad reserveVolDescSeqExt; - uint8_t reserved[480]; + struct tag descTag; + struct extent_ad mainVolDescSeqExt; + struct extent_ad reserveVolDescSeqExt; + uint8_t reserved[480]; } __attribute__ ((packed)); /* Volume Descriptor Pointer (ECMA 167r3 3/10.3) */ struct volDescPtr { - tag descTag; - __le32 volDescSeqNum; - extent_ad nextVolDescSeqExt; - uint8_t reserved[484]; + struct tag descTag; + __le32 volDescSeqNum; + struct extent_ad nextVolDescSeqExt; + uint8_t reserved[484]; } __attribute__ ((packed)); /* Implementation Use Volume Descriptor (ECMA 167r3 3/10.4) */ struct impUseVolDesc { - tag descTag; + struct tag descTag; __le32 volDescSeqNum; - regid impIdent; + struct regid impIdent; uint8_t impUse[460]; } __attribute__ ((packed)); /* Partition Descriptor (ECMA 167r3 3/10.5) */ struct partitionDesc { - tag descTag; + struct tag descTag; __le32 volDescSeqNum; __le16 partitionFlags; __le16 partitionNumber; - regid partitionContents; + struct regid partitionContents; uint8_t partitionContentsUse[128]; __le32 accessType; __le32 partitionStartingLocation; __le32 partitionLength; - regid impIdent; + struct regid impIdent; uint8_t impUse[128]; uint8_t reserved[156]; } __attribute__ ((packed)); @@ -278,19 +278,19 @@ struct partitionDesc { /* Logical Volume Descriptor (ECMA 167r3 3/10.6) */ struct logicalVolDesc { - tag descTag; - __le32 volDescSeqNum; - charspec descCharSet; - dstring logicalVolIdent[128]; - __le32 logicalBlockSize; - regid domainIdent; - uint8_t logicalVolContentsUse[16]; - __le32 mapTableLength; - __le32 numPartitionMaps; - regid impIdent; - uint8_t impUse[128]; - extent_ad integritySeqExt; - uint8_t partitionMaps[0]; + struct tag descTag; + __le32 volDescSeqNum; + struct charspec descCharSet; + dstring logicalVolIdent[128]; + __le32 logicalBlockSize; + struct regid domainIdent; + uint8_t logicalVolContentsUse[16]; + __le32 mapTableLength; + __le32 numPartitionMaps; + struct regid impIdent; + uint8_t impUse[128]; + struct extent_ad integritySeqExt; + uint8_t partitionMaps[0]; } __attribute__ ((packed)); /* Generic Partition Map (ECMA 167r3 3/10.7.1) */ @@ -322,30 +322,30 @@ struct genericPartitionMap2 { /* Unallocated Space Descriptor (ECMA 167r3 3/10.8) */ struct unallocSpaceDesc { - tag descTag; - __le32 volDescSeqNum; - __le32 numAllocDescs; - extent_ad allocDescs[0]; + struct tag descTag; + __le32 volDescSeqNum; + __le32 numAllocDescs; + struct extent_ad allocDescs[0]; } __attribute__ ((packed)); /* Terminating Descriptor (ECMA 167r3 3/10.9) */ struct terminatingDesc { - tag descTag; + struct tag descTag; uint8_t reserved[496]; } __attribute__ ((packed)); /* Logical Volume Integrity Descriptor (ECMA 167r3 3/10.10) */ struct logicalVolIntegrityDesc { - tag descTag; - timestamp recordingDateAndTime; - __le32 integrityType; - extent_ad nextIntegrityExt; - uint8_t logicalVolContentsUse[32]; - __le32 numOfPartitions; - __le32 lengthOfImpUse; - __le32 freeSpaceTable[0]; - __le32 sizeTable[0]; - uint8_t impUse[0]; + struct tag descTag; + struct timestamp recordingDateAndTime; + __le32 integrityType; + struct extent_ad nextIntegrityExt; + uint8_t logicalVolContentsUse[32]; + __le32 numOfPartitions; + __le32 lengthOfImpUse; + __le32 freeSpaceTable[0]; + __le32 sizeTable[0]; + uint8_t impUse[0]; } __attribute__ ((packed)); /* Integrity Type (ECMA 167r3 3/10.10.3) */ @@ -353,50 +353,50 @@ struct logicalVolIntegrityDesc { #define LVID_INTEGRITY_TYPE_CLOSE 0x00000001 /* Recorded Address (ECMA 167r3 4/7.1) */ -typedef struct { +struct lb_addr { __le32 logicalBlockNum; __le16 partitionReferenceNum; -} __attribute__ ((packed)) lb_addr; +} __attribute__ ((packed)); /* ... and its in-core analog */ -typedef struct { +struct kernel_lb_addr { uint32_t logicalBlockNum; uint16_t partitionReferenceNum; -} kernel_lb_addr; +}; /* Short Allocation Descriptor (ECMA 167r3 4/14.14.1) */ -typedef struct { +struct short_ad { __le32 extLength; __le32 extPosition; -} __attribute__ ((packed)) short_ad; +} __attribute__ ((packed)); /* Long Allocation Descriptor (ECMA 167r3 4/14.14.2) */ -typedef struct { +struct long_ad { __le32 extLength; - lb_addr extLocation; + struct lb_addr extLocation; uint8_t impUse[6]; -} __attribute__ ((packed)) long_ad; +} __attribute__ ((packed)); -typedef struct { - uint32_t extLength; - kernel_lb_addr extLocation; - uint8_t impUse[6]; -} kernel_long_ad; +struct kernel_long_ad { + uint32_t extLength; + struct kernel_lb_addr extLocation; + uint8_t impUse[6]; +}; /* Extended Allocation Descriptor (ECMA 167r3 4/14.14.3) */ -typedef struct { +struct ext_ad { __le32 extLength; __le32 recordedLength; __le32 informationLength; - lb_addr extLocation; -} __attribute__ ((packed)) ext_ad; + struct lb_addr extLocation; +} __attribute__ ((packed)); -typedef struct { - uint32_t extLength; - uint32_t recordedLength; - uint32_t informationLength; - kernel_lb_addr extLocation; -} kernel_ext_ad; +struct kernel_ext_ad { + uint32_t extLength; + uint32_t recordedLength; + uint32_t informationLength; + struct kernel_lb_addr extLocation; +}; /* Descriptor Tag (ECMA 167r3 4/7.2 - See 3/7.2) */ @@ -415,44 +415,44 @@ typedef struct { /* File Set Descriptor (ECMA 167r3 4/14.1) */ struct fileSetDesc { - tag descTag; - timestamp recordingDateAndTime; - __le16 interchangeLvl; - __le16 maxInterchangeLvl; - __le32 charSetList; - __le32 maxCharSetList; - __le32 fileSetNum; - __le32 fileSetDescNum; - charspec logicalVolIdentCharSet; - dstring logicalVolIdent[128]; - charspec fileSetCharSet; - dstring fileSetIdent[32]; - dstring copyrightFileIdent[32]; - dstring abstractFileIdent[32]; - long_ad rootDirectoryICB; - regid domainIdent; - long_ad nextExt; - long_ad streamDirectoryICB; - uint8_t reserved[32]; + struct tag descTag; + struct timestamp recordingDateAndTime; + __le16 interchangeLvl; + __le16 maxInterchangeLvl; + __le32 charSetList; + __le32 maxCharSetList; + __le32 fileSetNum; + __le32 fileSetDescNum; + struct charspec logicalVolIdentCharSet; + dstring logicalVolIdent[128]; + struct charspec fileSetCharSet; + dstring fileSetIdent[32]; + dstring copyrightFileIdent[32]; + dstring abstractFileIdent[32]; + struct long_ad rootDirectoryICB; + struct regid domainIdent; + struct long_ad nextExt; + struct long_ad streamDirectoryICB; + uint8_t reserved[32]; } __attribute__ ((packed)); /* Partition Header Descriptor (ECMA 167r3 4/14.3) */ struct partitionHeaderDesc { - short_ad unallocSpaceTable; - short_ad unallocSpaceBitmap; - short_ad partitionIntegrityTable; - short_ad freedSpaceTable; - short_ad freedSpaceBitmap; + struct short_ad unallocSpaceTable; + struct short_ad unallocSpaceBitmap; + struct short_ad partitionIntegrityTable; + struct short_ad freedSpaceTable; + struct short_ad freedSpaceBitmap; uint8_t reserved[88]; } __attribute__ ((packed)); /* File Identifier Descriptor (ECMA 167r3 4/14.4) */ struct fileIdentDesc { - tag descTag; + struct tag descTag; __le16 fileVersionNum; uint8_t fileCharacteristics; uint8_t lengthFileIdent; - long_ad icb; + struct long_ad icb; __le16 lengthOfImpUse; uint8_t impUse[0]; uint8_t fileIdent[0]; @@ -468,22 +468,22 @@ struct fileIdentDesc { /* Allocation Ext Descriptor (ECMA 167r3 4/14.5) */ struct allocExtDesc { - tag descTag; + struct tag descTag; __le32 previousAllocExtLocation; __le32 lengthAllocDescs; } __attribute__ ((packed)); /* ICB Tag (ECMA 167r3 4/14.6) */ -typedef struct { +struct icbtag { __le32 priorRecordedNumDirectEntries; __le16 strategyType; __le16 strategyParameter; __le16 numEntries; uint8_t reserved; uint8_t fileType; - lb_addr parentICBLocation; + struct lb_addr parentICBLocation; __le16 flags; -} __attribute__ ((packed)) icbtag; +} __attribute__ ((packed)); /* Strategy Type (ECMA 167r3 4/14.6.2) */ #define ICBTAG_STRATEGY_TYPE_UNDEF 0x0000 @@ -528,41 +528,41 @@ typedef struct { /* Indirect Entry (ECMA 167r3 4/14.7) */ struct indirectEntry { - tag descTag; - icbtag icbTag; - long_ad indirectICB; + struct tag descTag; + struct icbtag icbTag; + struct long_ad indirectICB; } __attribute__ ((packed)); /* Terminal Entry (ECMA 167r3 4/14.8) */ struct terminalEntry { - tag descTag; - icbtag icbTag; + struct tag descTag; + struct icbtag icbTag; } __attribute__ ((packed)); /* File Entry (ECMA 167r3 4/14.9) */ struct fileEntry { - tag descTag; - icbtag icbTag; - __le32 uid; - __le32 gid; - __le32 permissions; - __le16 fileLinkCount; - uint8_t recordFormat; - uint8_t recordDisplayAttr; - __le32 recordLength; - __le64 informationLength; - __le64 logicalBlocksRecorded; - timestamp accessTime; - timestamp modificationTime; - timestamp attrTime; - __le32 checkpoint; - long_ad extendedAttrICB; - regid impIdent; - __le64 uniqueID; - __le32 lengthExtendedAttr; - __le32 lengthAllocDescs; - uint8_t extendedAttr[0]; - uint8_t allocDescs[0]; + struct tag descTag; + struct icbtag icbTag; + __le32 uid; + __le32 gid; + __le32 permissions; + __le16 fileLinkCount; + uint8_t recordFormat; + uint8_t recordDisplayAttr; + __le32 recordLength; + __le64 informationLength; + __le64 logicalBlocksRecorded; + struct timestamp accessTime; + struct timestamp modificationTime; + struct timestamp attrTime; + __le32 checkpoint; + struct long_ad extendedAttrICB; + struct regid impIdent; + __le64 uniqueID; + __le32 lengthExtendedAttr; + __le32 lengthAllocDescs; + uint8_t extendedAttr[0]; + uint8_t allocDescs[0]; } __attribute__ ((packed)); /* Permissions (ECMA 167r3 4/14.9.5) */ @@ -604,7 +604,7 @@ struct fileEntry { /* Extended Attribute Header Descriptor (ECMA 167r3 4/14.10.1) */ struct extendedAttrHeaderDesc { - tag descTag; + struct tag descTag; __le32 impAttrLocation; __le32 appAttrLocation; } __attribute__ ((packed)); @@ -687,7 +687,7 @@ struct impUseExtAttr { uint8_t reserved[3]; __le32 attrLength; __le32 impUseLength; - regid impIdent; + struct regid impIdent; uint8_t impUse[0]; } __attribute__ ((packed)); @@ -698,7 +698,7 @@ struct appUseExtAttr { uint8_t reserved[3]; __le32 attrLength; __le32 appUseLength; - regid appIdent; + struct regid appIdent; uint8_t appUse[0]; } __attribute__ ((packed)); @@ -712,15 +712,15 @@ struct appUseExtAttr { /* Unallocated Space Entry (ECMA 167r3 4/14.11) */ struct unallocSpaceEntry { - tag descTag; - icbtag icbTag; + struct tag descTag; + struct icbtag icbTag; __le32 lengthAllocDescs; uint8_t allocDescs[0]; } __attribute__ ((packed)); /* Space Bitmap Descriptor (ECMA 167r3 4/14.12) */ struct spaceBitmapDesc { - tag descTag; + struct tag descTag; __le32 numOfBits; __le32 numOfBytes; uint8_t bitmap[0]; @@ -728,13 +728,13 @@ struct spaceBitmapDesc { /* Partition Integrity Entry (ECMA 167r3 4/14.13) */ struct partitionIntegrityEntry { - tag descTag; - icbtag icbTag; - timestamp recordingDateAndTime; - uint8_t integrityType; - uint8_t reserved[175]; - regid impIdent; - uint8_t impUse[256]; + struct tag descTag; + struct icbtag icbTag; + struct timestamp recordingDateAndTime; + uint8_t integrityType; + uint8_t reserved[175]; + struct regid impIdent; + uint8_t impUse[256]; } __attribute__ ((packed)); /* Short Allocation Descriptor (ECMA 167r3 4/14.14.1) */ @@ -765,32 +765,32 @@ struct pathComponent { /* File Entry (ECMA 167r3 4/14.17) */ struct extendedFileEntry { - tag descTag; - icbtag icbTag; - __le32 uid; - __le32 gid; - __le32 permissions; - __le16 fileLinkCount; - uint8_t recordFormat; - uint8_t recordDisplayAttr; - __le32 recordLength; - __le64 informationLength; - __le64 objectSize; - __le64 logicalBlocksRecorded; - timestamp accessTime; - timestamp modificationTime; - timestamp createTime; - timestamp attrTime; - __le32 checkpoint; - __le32 reserved; - long_ad extendedAttrICB; - long_ad streamDirectoryICB; - regid impIdent; - __le64 uniqueID; - __le32 lengthExtendedAttr; - __le32 lengthAllocDescs; - uint8_t extendedAttr[0]; - uint8_t allocDescs[0]; + struct tag descTag; + struct icbtag icbTag; + __le32 uid; + __le32 gid; + __le32 permissions; + __le16 fileLinkCount; + uint8_t recordFormat; + uint8_t recordDisplayAttr; + __le32 recordLength; + __le64 informationLength; + __le64 objectSize; + __le64 logicalBlocksRecorded; + struct timestamp accessTime; + struct timestamp modificationTime; + struct timestamp createTime; + struct timestamp attrTime; + __le32 checkpoint; + __le32 reserved; + struct long_ad extendedAttrICB; + struct long_ad streamDirectoryICB; + struct regid impIdent; + __le64 uniqueID; + __le32 lengthExtendedAttr; + __le32 lengthAllocDescs; + uint8_t extendedAttr[0]; + uint8_t allocDescs[0]; } __attribute__ ((packed)); #endif /* _ECMA_167_H */ diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c index 47dbe5613f90..c10fa39f97e2 100644 --- a/fs/udf/ialloc.c +++ b/fs/udf/ialloc.c @@ -49,12 +49,11 @@ void udf_free_inode(struct inode *inode) le32_add_cpu(&lvidiu->numDirs, -1); else le32_add_cpu(&lvidiu->numFiles, -1); - - mark_buffer_dirty(sbi->s_lvid_bh); + udf_updated_lvid(sb); } mutex_unlock(&sbi->s_alloc_mutex); - udf_free_blocks(sb, NULL, UDF_I(inode)->i_location, 0, 1); + udf_free_blocks(sb, NULL, &UDF_I(inode)->i_location, 0, 1); } struct inode *udf_new_inode(struct inode *dir, int mode, int *err) @@ -122,7 +121,7 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err) if (!(++uniqueID & 0x00000000FFFFFFFFUL)) uniqueID += 16; lvhd->uniqueID = cpu_to_le64(uniqueID); - mark_buffer_dirty(sbi->s_lvid_bh); + udf_updated_lvid(sb); } mutex_unlock(&sbi->s_alloc_mutex); inode->i_mode = mode; @@ -138,7 +137,7 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err) iinfo->i_location.logicalBlockNum = block; iinfo->i_location.partitionReferenceNum = dinfo->i_location.partitionReferenceNum; - inode->i_ino = udf_get_lb_pblock(sb, iinfo->i_location, 0); + inode->i_ino = udf_get_lb_pblock(sb, &iinfo->i_location, 0); inode->i_blocks = 0; iinfo->i_lenEAttr = 0; iinfo->i_lenAlloc = 0; diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 30ebde490f7f..e7533f785636 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -55,15 +55,15 @@ static int udf_alloc_i_data(struct inode *inode, size_t size); static struct buffer_head *inode_getblk(struct inode *, sector_t, int *, sector_t *, int *); static int8_t udf_insert_aext(struct inode *, struct extent_position, - kernel_lb_addr, uint32_t); + struct kernel_lb_addr, uint32_t); static void udf_split_extents(struct inode *, int *, int, int, - kernel_long_ad[EXTENT_MERGE_SIZE], int *); + struct kernel_long_ad[EXTENT_MERGE_SIZE], int *); static void udf_prealloc_extents(struct inode *, int, int, - kernel_long_ad[EXTENT_MERGE_SIZE], int *); + struct kernel_long_ad[EXTENT_MERGE_SIZE], int *); static void udf_merge_extents(struct inode *, - kernel_long_ad[EXTENT_MERGE_SIZE], int *); + struct kernel_long_ad[EXTENT_MERGE_SIZE], int *); static void udf_update_extents(struct inode *, - kernel_long_ad[EXTENT_MERGE_SIZE], int, int, + struct kernel_long_ad[EXTENT_MERGE_SIZE], int, int, struct extent_position *); static int udf_get_block(struct inode *, sector_t, struct buffer_head *, int); @@ -200,7 +200,7 @@ struct buffer_head *udf_expand_dir_adinicb(struct inode *inode, int *block, { int newblock; struct buffer_head *dbh = NULL; - kernel_lb_addr eloc; + struct kernel_lb_addr eloc; uint32_t elen; uint8_t alloctype; struct extent_position epos; @@ -281,7 +281,7 @@ struct buffer_head *udf_expand_dir_adinicb(struct inode *inode, int *block, epos.bh = NULL; epos.block = iinfo->i_location; epos.offset = udf_file_entry_alloc_offset(inode); - udf_add_aext(inode, &epos, eloc, elen, 0); + udf_add_aext(inode, &epos, &eloc, elen, 0); /* UniqueID stuff */ brelse(epos.bh); @@ -359,12 +359,12 @@ static struct buffer_head *udf_getblk(struct inode *inode, long block, /* Extend the file by 'blocks' blocks, return the number of extents added */ int udf_extend_file(struct inode *inode, struct extent_position *last_pos, - kernel_long_ad *last_ext, sector_t blocks) + struct kernel_long_ad *last_ext, sector_t blocks) { sector_t add; int count = 0, fake = !(last_ext->extLength & UDF_EXTENT_LENGTH_MASK); struct super_block *sb = inode->i_sb; - kernel_lb_addr prealloc_loc = {}; + struct kernel_lb_addr prealloc_loc = {}; int prealloc_len = 0; struct udf_inode_info *iinfo; @@ -411,11 +411,11 @@ int udf_extend_file(struct inode *inode, struct extent_position *last_pos, } if (fake) { - udf_add_aext(inode, last_pos, last_ext->extLocation, + udf_add_aext(inode, last_pos, &last_ext->extLocation, last_ext->extLength, 1); count++; } else - udf_write_aext(inode, last_pos, last_ext->extLocation, + udf_write_aext(inode, last_pos, &last_ext->extLocation, last_ext->extLength, 1); /* Managed to do everything necessary? */ @@ -432,7 +432,7 @@ int udf_extend_file(struct inode *inode, struct extent_position *last_pos, /* Create enough extents to cover the whole hole */ while (blocks > add) { blocks -= add; - if (udf_add_aext(inode, last_pos, last_ext->extLocation, + if (udf_add_aext(inode, last_pos, &last_ext->extLocation, last_ext->extLength, 1) == -1) return -1; count++; @@ -440,7 +440,7 @@ int udf_extend_file(struct inode *inode, struct extent_position *last_pos, if (blocks) { last_ext->extLength = EXT_NOT_RECORDED_NOT_ALLOCATED | (blocks << sb->s_blocksize_bits); - if (udf_add_aext(inode, last_pos, last_ext->extLocation, + if (udf_add_aext(inode, last_pos, &last_ext->extLocation, last_ext->extLength, 1) == -1) return -1; count++; @@ -449,7 +449,7 @@ int udf_extend_file(struct inode *inode, struct extent_position *last_pos, out: /* Do we have some preallocated blocks saved? */ if (prealloc_len) { - if (udf_add_aext(inode, last_pos, prealloc_loc, + if (udf_add_aext(inode, last_pos, &prealloc_loc, prealloc_len, 1) == -1) return -1; last_ext->extLocation = prealloc_loc; @@ -459,9 +459,9 @@ out: /* last_pos should point to the last written extent... */ if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) - last_pos->offset -= sizeof(short_ad); + last_pos->offset -= sizeof(struct short_ad); else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) - last_pos->offset -= sizeof(long_ad); + last_pos->offset -= sizeof(struct long_ad); else return -1; @@ -473,11 +473,11 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block, { static sector_t last_block; struct buffer_head *result = NULL; - kernel_long_ad laarr[EXTENT_MERGE_SIZE]; + struct kernel_long_ad laarr[EXTENT_MERGE_SIZE]; struct extent_position prev_epos, cur_epos, next_epos; int count = 0, startnum = 0, endnum = 0; uint32_t elen = 0, tmpelen; - kernel_lb_addr eloc, tmpeloc; + struct kernel_lb_addr eloc, tmpeloc; int c = 1; loff_t lbcount = 0, b_off = 0; uint32_t newblocknum, newblock; @@ -550,12 +550,12 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block, elen = EXT_RECORDED_ALLOCATED | ((elen + inode->i_sb->s_blocksize - 1) & ~(inode->i_sb->s_blocksize - 1)); - etype = udf_write_aext(inode, &cur_epos, eloc, elen, 1); + etype = udf_write_aext(inode, &cur_epos, &eloc, elen, 1); } brelse(prev_epos.bh); brelse(cur_epos.bh); brelse(next_epos.bh); - newblock = udf_get_lb_pblock(inode->i_sb, eloc, offset); + newblock = udf_get_lb_pblock(inode->i_sb, &eloc, offset); *phys = newblock; return NULL; } @@ -572,7 +572,7 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block, } else { /* Create a fake extent when there's not one */ memset(&laarr[0].extLocation, 0x00, - sizeof(kernel_lb_addr)); + sizeof(struct kernel_lb_addr)); laarr[0].extLength = EXT_NOT_RECORDED_NOT_ALLOCATED; /* Will udf_extend_file() create real extent from a fake one? */ @@ -602,7 +602,7 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block, laarr[c].extLength = EXT_NOT_RECORDED_NOT_ALLOCATED | inode->i_sb->s_blocksize; memset(&laarr[c].extLocation, 0x00, - sizeof(kernel_lb_addr)); + sizeof(struct kernel_lb_addr)); count++; endnum++; } @@ -699,7 +699,7 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block, static void udf_split_extents(struct inode *inode, int *c, int offset, int newblocknum, - kernel_long_ad laarr[EXTENT_MERGE_SIZE], + struct kernel_long_ad laarr[EXTENT_MERGE_SIZE], int *endnum) { unsigned long blocksize = inode->i_sb->s_blocksize; @@ -726,7 +726,7 @@ static void udf_split_extents(struct inode *inode, int *c, int offset, if (offset) { if (etype == (EXT_NOT_RECORDED_ALLOCATED >> 30)) { udf_free_blocks(inode->i_sb, inode, - laarr[curr].extLocation, + &laarr[curr].extLocation, 0, offset); laarr[curr].extLength = EXT_NOT_RECORDED_NOT_ALLOCATED | @@ -763,7 +763,7 @@ static void udf_split_extents(struct inode *inode, int *c, int offset, } static void udf_prealloc_extents(struct inode *inode, int c, int lastblock, - kernel_long_ad laarr[EXTENT_MERGE_SIZE], + struct kernel_long_ad laarr[EXTENT_MERGE_SIZE], int *endnum) { int start, length = 0, currlength = 0, i; @@ -817,7 +817,7 @@ static void udf_prealloc_extents(struct inode *inode, int c, int lastblock, inode->i_sb->s_blocksize_bits); else { memmove(&laarr[c + 2], &laarr[c + 1], - sizeof(long_ad) * (*endnum - (c + 1))); + sizeof(struct long_ad) * (*endnum - (c + 1))); (*endnum)++; laarr[c + 1].extLocation.logicalBlockNum = next; laarr[c + 1].extLocation.partitionReferenceNum = @@ -846,7 +846,7 @@ static void udf_prealloc_extents(struct inode *inode, int c, int lastblock, if (*endnum > (i + 1)) memmove(&laarr[i], &laarr[i + 1], - sizeof(long_ad) * + sizeof(struct long_ad) * (*endnum - (i + 1))); i--; (*endnum)--; @@ -859,7 +859,7 @@ static void udf_prealloc_extents(struct inode *inode, int c, int lastblock, } static void udf_merge_extents(struct inode *inode, - kernel_long_ad laarr[EXTENT_MERGE_SIZE], + struct kernel_long_ad laarr[EXTENT_MERGE_SIZE], int *endnum) { int i; @@ -867,8 +867,8 @@ static void udf_merge_extents(struct inode *inode, unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits; for (i = 0; i < (*endnum - 1); i++) { - kernel_long_ad *li /*l[i]*/ = &laarr[i]; - kernel_long_ad *lip1 /*l[i plus 1]*/ = &laarr[i + 1]; + struct kernel_long_ad *li /*l[i]*/ = &laarr[i]; + struct kernel_long_ad *lip1 /*l[i plus 1]*/ = &laarr[i + 1]; if (((li->extLength >> 30) == (lip1->extLength >> 30)) && (((li->extLength >> 30) == @@ -902,7 +902,7 @@ static void udf_merge_extents(struct inode *inode, blocksize - 1) & ~(blocksize - 1)); if (*endnum > (i + 2)) memmove(&laarr[i + 1], &laarr[i + 2], - sizeof(long_ad) * + sizeof(struct long_ad) * (*endnum - (i + 2))); i--; (*endnum)--; @@ -911,7 +911,7 @@ static void udf_merge_extents(struct inode *inode, (EXT_NOT_RECORDED_ALLOCATED >> 30)) && ((lip1->extLength >> 30) == (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30))) { - udf_free_blocks(inode->i_sb, inode, li->extLocation, 0, + udf_free_blocks(inode->i_sb, inode, &li->extLocation, 0, ((li->extLength & UDF_EXTENT_LENGTH_MASK) + blocksize - 1) >> blocksize_bits); @@ -937,7 +937,7 @@ static void udf_merge_extents(struct inode *inode, blocksize - 1) & ~(blocksize - 1)); if (*endnum > (i + 2)) memmove(&laarr[i + 1], &laarr[i + 2], - sizeof(long_ad) * + sizeof(struct long_ad) * (*endnum - (i + 2))); i--; (*endnum)--; @@ -945,7 +945,7 @@ static void udf_merge_extents(struct inode *inode, } else if ((li->extLength >> 30) == (EXT_NOT_RECORDED_ALLOCATED >> 30)) { udf_free_blocks(inode->i_sb, inode, - li->extLocation, 0, + &li->extLocation, 0, ((li->extLength & UDF_EXTENT_LENGTH_MASK) + blocksize - 1) >> blocksize_bits); @@ -959,12 +959,12 @@ static void udf_merge_extents(struct inode *inode, } static void udf_update_extents(struct inode *inode, - kernel_long_ad laarr[EXTENT_MERGE_SIZE], + struct kernel_long_ad laarr[EXTENT_MERGE_SIZE], int startnum, int endnum, struct extent_position *epos) { int start = 0, i; - kernel_lb_addr tmploc; + struct kernel_lb_addr tmploc; uint32_t tmplen; if (startnum > endnum) { @@ -983,7 +983,7 @@ static void udf_update_extents(struct inode *inode, for (i = start; i < endnum; i++) { udf_next_aext(inode, epos, &tmploc, &tmplen, 0); - udf_write_aext(inode, epos, laarr[i].extLocation, + udf_write_aext(inode, epos, &laarr[i].extLocation, laarr[i].extLength, 1); } } @@ -1076,7 +1076,7 @@ static void __udf_read_inode(struct inode *inode) * i_nlink = 1 * i_op = NULL; */ - bh = udf_read_ptagged(inode->i_sb, iinfo->i_location, 0, &ident); + bh = udf_read_ptagged(inode->i_sb, &iinfo->i_location, 0, &ident); if (!bh) { printk(KERN_ERR "udf: udf_read_inode(ino %ld) failed !bh\n", inode->i_ino); @@ -1098,24 +1098,24 @@ static void __udf_read_inode(struct inode *inode) if (fe->icbTag.strategyType == cpu_to_le16(4096)) { struct buffer_head *ibh; - ibh = udf_read_ptagged(inode->i_sb, iinfo->i_location, 1, + ibh = udf_read_ptagged(inode->i_sb, &iinfo->i_location, 1, &ident); if (ident == TAG_IDENT_IE && ibh) { struct buffer_head *nbh = NULL; - kernel_lb_addr loc; + struct kernel_lb_addr loc; struct indirectEntry *ie; ie = (struct indirectEntry *)ibh->b_data; loc = lelb_to_cpu(ie->indirectICB.extLocation); if (ie->indirectICB.extLength && - (nbh = udf_read_ptagged(inode->i_sb, loc, 0, + (nbh = udf_read_ptagged(inode->i_sb, &loc, 0, &ident))) { if (ident == TAG_IDENT_FE || ident == TAG_IDENT_EFE) { memcpy(&iinfo->i_location, &loc, - sizeof(kernel_lb_addr)); + sizeof(struct kernel_lb_addr)); brelse(bh); brelse(ibh); brelse(nbh); @@ -1222,8 +1222,15 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh) inode->i_size = le64_to_cpu(fe->informationLength); iinfo->i_lenExtents = inode->i_size; - inode->i_mode = udf_convert_permissions(fe); - inode->i_mode &= ~UDF_SB(inode->i_sb)->s_umask; + if (fe->icbTag.fileType != ICBTAG_FILE_TYPE_DIRECTORY && + sbi->s_fmode != UDF_INVALID_MODE) + inode->i_mode = sbi->s_fmode; + else if (fe->icbTag.fileType == ICBTAG_FILE_TYPE_DIRECTORY && + sbi->s_dmode != UDF_INVALID_MODE) + inode->i_mode = sbi->s_dmode; + else + inode->i_mode = udf_convert_permissions(fe); + inode->i_mode &= ~sbi->s_umask; if (iinfo->i_efe == 0) { inode->i_blocks = le64_to_cpu(fe->logicalBlocksRecorded) << @@ -1396,7 +1403,7 @@ static int udf_update_inode(struct inode *inode, int do_sync) bh = udf_tread(inode->i_sb, udf_get_lb_pblock(inode->i_sb, - iinfo->i_location, 0)); + &iinfo->i_location, 0)); if (!bh) { udf_debug("bread failure\n"); return -EIO; @@ -1416,13 +1423,13 @@ static int udf_update_inode(struct inode *inode, int do_sync) iinfo->i_ext.i_data, inode->i_sb->s_blocksize - sizeof(struct unallocSpaceEntry)); crclen = sizeof(struct unallocSpaceEntry) + - iinfo->i_lenAlloc - sizeof(tag); + iinfo->i_lenAlloc - sizeof(struct tag); use->descTag.tagLocation = cpu_to_le32( iinfo->i_location. logicalBlockNum); use->descTag.descCRCLength = cpu_to_le16(crclen); use->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)use + - sizeof(tag), + sizeof(struct tag), crclen)); use->descTag.tagChecksum = udf_tag_checksum(&use->descTag); @@ -1459,23 +1466,23 @@ static int udf_update_inode(struct inode *inode, int do_sync) fe->informationLength = cpu_to_le64(inode->i_size); if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { - regid *eid; + struct regid *eid; struct deviceSpec *dsea = (struct deviceSpec *)udf_get_extendedattr(inode, 12, 1); if (!dsea) { dsea = (struct deviceSpec *) udf_add_extendedattr(inode, sizeof(struct deviceSpec) + - sizeof(regid), 12, 0x3); + sizeof(struct regid), 12, 0x3); dsea->attrType = cpu_to_le32(12); dsea->attrSubtype = 1; dsea->attrLength = cpu_to_le32( sizeof(struct deviceSpec) + - sizeof(regid)); - dsea->impUseLength = cpu_to_le32(sizeof(regid)); + sizeof(struct regid)); + dsea->impUseLength = cpu_to_le32(sizeof(struct regid)); } - eid = (regid *)dsea->impUse; - memset(eid, 0, sizeof(regid)); + eid = (struct regid *)dsea->impUse; + memset(eid, 0, sizeof(struct regid)); strcpy(eid->ident, UDF_ID_DEVELOPER); eid->identSuffix[0] = UDF_OS_CLASS_UNIX; eid->identSuffix[1] = UDF_OS_ID_LINUX; @@ -1494,7 +1501,7 @@ static int udf_update_inode(struct inode *inode, int do_sync) udf_time_to_disk_stamp(&fe->accessTime, inode->i_atime); udf_time_to_disk_stamp(&fe->modificationTime, inode->i_mtime); udf_time_to_disk_stamp(&fe->attrTime, inode->i_ctime); - memset(&(fe->impIdent), 0, sizeof(regid)); + memset(&(fe->impIdent), 0, sizeof(struct regid)); strcpy(fe->impIdent.ident, UDF_ID_DEVELOPER); fe->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX; fe->impIdent.identSuffix[1] = UDF_OS_ID_LINUX; @@ -1533,7 +1540,7 @@ static int udf_update_inode(struct inode *inode, int do_sync) udf_time_to_disk_stamp(&efe->createTime, iinfo->i_crtime); udf_time_to_disk_stamp(&efe->attrTime, inode->i_ctime); - memset(&(efe->impIdent), 0, sizeof(regid)); + memset(&(efe->impIdent), 0, sizeof(struct regid)); strcpy(efe->impIdent.ident, UDF_ID_DEVELOPER); efe->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX; efe->impIdent.identSuffix[1] = UDF_OS_ID_LINUX; @@ -1584,9 +1591,9 @@ static int udf_update_inode(struct inode *inode, int do_sync) fe->descTag.tagLocation = cpu_to_le32( iinfo->i_location.logicalBlockNum); crclen += iinfo->i_lenEAttr + iinfo->i_lenAlloc - - sizeof(tag); + sizeof(struct tag); fe->descTag.descCRCLength = cpu_to_le16(crclen); - fe->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)fe + sizeof(tag), + fe->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)fe + sizeof(struct tag), crclen)); fe->descTag.tagChecksum = udf_tag_checksum(&fe->descTag); @@ -1606,7 +1613,7 @@ static int udf_update_inode(struct inode *inode, int do_sync) return err; } -struct inode *udf_iget(struct super_block *sb, kernel_lb_addr ino) +struct inode *udf_iget(struct super_block *sb, struct kernel_lb_addr *ino) { unsigned long block = udf_get_lb_pblock(sb, ino, 0); struct inode *inode = iget_locked(sb, block); @@ -1615,7 +1622,7 @@ struct inode *udf_iget(struct super_block *sb, kernel_lb_addr ino) return NULL; if (inode->i_state & I_NEW) { - memcpy(&UDF_I(inode)->i_location, &ino, sizeof(kernel_lb_addr)); + memcpy(&UDF_I(inode)->i_location, ino, sizeof(struct kernel_lb_addr)); __udf_read_inode(inode); unlock_new_inode(inode); } @@ -1623,10 +1630,10 @@ struct inode *udf_iget(struct super_block *sb, kernel_lb_addr ino) if (is_bad_inode(inode)) goto out_iput; - if (ino.logicalBlockNum >= UDF_SB(sb)-> - s_partmaps[ino.partitionReferenceNum].s_partition_len) { + if (ino->logicalBlockNum >= UDF_SB(sb)-> + s_partmaps[ino->partitionReferenceNum].s_partition_len) { udf_debug("block=%d, partition=%d out of range\n", - ino.logicalBlockNum, ino.partitionReferenceNum); + ino->logicalBlockNum, ino->partitionReferenceNum); make_bad_inode(inode); goto out_iput; } @@ -1639,11 +1646,11 @@ struct inode *udf_iget(struct super_block *sb, kernel_lb_addr ino) } int8_t udf_add_aext(struct inode *inode, struct extent_position *epos, - kernel_lb_addr eloc, uint32_t elen, int inc) + struct kernel_lb_addr *eloc, uint32_t elen, int inc) { int adsize; - short_ad *sad = NULL; - long_ad *lad = NULL; + struct short_ad *sad = NULL; + struct long_ad *lad = NULL; struct allocExtDesc *aed; int8_t etype; uint8_t *ptr; @@ -1657,9 +1664,9 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos, ptr = epos->bh->b_data + epos->offset; if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) - adsize = sizeof(short_ad); + adsize = sizeof(struct short_ad); else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) - adsize = sizeof(long_ad); + adsize = sizeof(struct long_ad); else return -1; @@ -1667,7 +1674,7 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos, char *sptr, *dptr; struct buffer_head *nbh; int err, loffset; - kernel_lb_addr obloc = epos->block; + struct kernel_lb_addr obloc = epos->block; epos->block.logicalBlockNum = udf_new_block(inode->i_sb, NULL, obloc.partitionReferenceNum, @@ -1675,7 +1682,7 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos, if (!epos->block.logicalBlockNum) return -1; nbh = udf_tgetblk(inode->i_sb, udf_get_lb_pblock(inode->i_sb, - epos->block, + &epos->block, 0)); if (!nbh) return -1; @@ -1712,20 +1719,20 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos, } if (UDF_SB(inode->i_sb)->s_udfrev >= 0x0200) udf_new_tag(nbh->b_data, TAG_IDENT_AED, 3, 1, - epos->block.logicalBlockNum, sizeof(tag)); + epos->block.logicalBlockNum, sizeof(struct tag)); else udf_new_tag(nbh->b_data, TAG_IDENT_AED, 2, 1, - epos->block.logicalBlockNum, sizeof(tag)); + epos->block.logicalBlockNum, sizeof(struct tag)); switch (iinfo->i_alloc_type) { case ICBTAG_FLAG_AD_SHORT: - sad = (short_ad *)sptr; + sad = (struct short_ad *)sptr; sad->extLength = cpu_to_le32(EXT_NEXT_EXTENT_ALLOCDECS | inode->i_sb->s_blocksize); sad->extPosition = cpu_to_le32(epos->block.logicalBlockNum); break; case ICBTAG_FLAG_AD_LONG: - lad = (long_ad *)sptr; + lad = (struct long_ad *)sptr; lad->extLength = cpu_to_le32(EXT_NEXT_EXTENT_ALLOCDECS | inode->i_sb->s_blocksize); lad->extLocation = cpu_to_lelb(epos->block); @@ -1769,12 +1776,12 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos, } int8_t udf_write_aext(struct inode *inode, struct extent_position *epos, - kernel_lb_addr eloc, uint32_t elen, int inc) + struct kernel_lb_addr *eloc, uint32_t elen, int inc) { int adsize; uint8_t *ptr; - short_ad *sad; - long_ad *lad; + struct short_ad *sad; + struct long_ad *lad; struct udf_inode_info *iinfo = UDF_I(inode); if (!epos->bh) @@ -1786,17 +1793,17 @@ int8_t udf_write_aext(struct inode *inode, struct extent_position *epos, switch (iinfo->i_alloc_type) { case ICBTAG_FLAG_AD_SHORT: - sad = (short_ad *)ptr; + sad = (struct short_ad *)ptr; sad->extLength = cpu_to_le32(elen); - sad->extPosition = cpu_to_le32(eloc.logicalBlockNum); - adsize = sizeof(short_ad); + sad->extPosition = cpu_to_le32(eloc->logicalBlockNum); + adsize = sizeof(struct short_ad); break; case ICBTAG_FLAG_AD_LONG: - lad = (long_ad *)ptr; + lad = (struct long_ad *)ptr; lad->extLength = cpu_to_le32(elen); - lad->extLocation = cpu_to_lelb(eloc); + lad->extLocation = cpu_to_lelb(*eloc); memset(lad->impUse, 0x00, sizeof(lad->impUse)); - adsize = sizeof(long_ad); + adsize = sizeof(struct long_ad); break; default: return -1; @@ -1823,7 +1830,7 @@ int8_t udf_write_aext(struct inode *inode, struct extent_position *epos, } int8_t udf_next_aext(struct inode *inode, struct extent_position *epos, - kernel_lb_addr *eloc, uint32_t *elen, int inc) + struct kernel_lb_addr *eloc, uint32_t *elen, int inc) { int8_t etype; @@ -1833,7 +1840,7 @@ int8_t udf_next_aext(struct inode *inode, struct extent_position *epos, epos->block = *eloc; epos->offset = sizeof(struct allocExtDesc); brelse(epos->bh); - block = udf_get_lb_pblock(inode->i_sb, epos->block, 0); + block = udf_get_lb_pblock(inode->i_sb, &epos->block, 0); epos->bh = udf_tread(inode->i_sb, block); if (!epos->bh) { udf_debug("reading block %d failed!\n", block); @@ -1845,13 +1852,13 @@ int8_t udf_next_aext(struct inode *inode, struct extent_position *epos, } int8_t udf_current_aext(struct inode *inode, struct extent_position *epos, - kernel_lb_addr *eloc, uint32_t *elen, int inc) + struct kernel_lb_addr *eloc, uint32_t *elen, int inc) { int alen; int8_t etype; uint8_t *ptr; - short_ad *sad; - long_ad *lad; + struct short_ad *sad; + struct long_ad *lad; struct udf_inode_info *iinfo = UDF_I(inode); if (!epos->bh) { @@ -1900,9 +1907,9 @@ int8_t udf_current_aext(struct inode *inode, struct extent_position *epos, } static int8_t udf_insert_aext(struct inode *inode, struct extent_position epos, - kernel_lb_addr neloc, uint32_t nelen) + struct kernel_lb_addr neloc, uint32_t nelen) { - kernel_lb_addr oeloc; + struct kernel_lb_addr oeloc; uint32_t oelen; int8_t etype; @@ -1910,18 +1917,18 @@ static int8_t udf_insert_aext(struct inode *inode, struct extent_position epos, get_bh(epos.bh); while ((etype = udf_next_aext(inode, &epos, &oeloc, &oelen, 0)) != -1) { - udf_write_aext(inode, &epos, neloc, nelen, 1); + udf_write_aext(inode, &epos, &neloc, nelen, 1); neloc = oeloc; nelen = (etype << 30) | oelen; } - udf_add_aext(inode, &epos, neloc, nelen, 1); + udf_add_aext(inode, &epos, &neloc, nelen, 1); brelse(epos.bh); return (nelen >> 30); } int8_t udf_delete_aext(struct inode *inode, struct extent_position epos, - kernel_lb_addr eloc, uint32_t elen) + struct kernel_lb_addr eloc, uint32_t elen) { struct extent_position oepos; int adsize; @@ -1936,9 +1943,9 @@ int8_t udf_delete_aext(struct inode *inode, struct extent_position epos, iinfo = UDF_I(inode); if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) - adsize = sizeof(short_ad); + adsize = sizeof(struct short_ad); else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) - adsize = sizeof(long_ad); + adsize = sizeof(struct long_ad); else adsize = 0; @@ -1947,7 +1954,7 @@ int8_t udf_delete_aext(struct inode *inode, struct extent_position epos, return -1; while ((etype = udf_next_aext(inode, &epos, &eloc, &elen, 1)) != -1) { - udf_write_aext(inode, &oepos, eloc, (etype << 30) | elen, 1); + udf_write_aext(inode, &oepos, &eloc, (etype << 30) | elen, 1); if (oepos.bh != epos.bh) { oepos.block = epos.block; brelse(oepos.bh); @@ -1956,13 +1963,13 @@ int8_t udf_delete_aext(struct inode *inode, struct extent_position epos, oepos.offset = epos.offset - adsize; } } - memset(&eloc, 0x00, sizeof(kernel_lb_addr)); + memset(&eloc, 0x00, sizeof(struct kernel_lb_addr)); elen = 0; if (epos.bh != oepos.bh) { - udf_free_blocks(inode->i_sb, inode, epos.block, 0, 1); - udf_write_aext(inode, &oepos, eloc, elen, 1); - udf_write_aext(inode, &oepos, eloc, elen, 1); + udf_free_blocks(inode->i_sb, inode, &epos.block, 0, 1); + udf_write_aext(inode, &oepos, &eloc, elen, 1); + udf_write_aext(inode, &oepos, &eloc, elen, 1); if (!oepos.bh) { iinfo->i_lenAlloc -= (adsize * 2); mark_inode_dirty(inode); @@ -1979,7 +1986,7 @@ int8_t udf_delete_aext(struct inode *inode, struct extent_position epos, mark_buffer_dirty_inode(oepos.bh, inode); } } else { - udf_write_aext(inode, &oepos, eloc, elen, 1); + udf_write_aext(inode, &oepos, &eloc, elen, 1); if (!oepos.bh) { iinfo->i_lenAlloc -= adsize; mark_inode_dirty(inode); @@ -2004,7 +2011,7 @@ int8_t udf_delete_aext(struct inode *inode, struct extent_position epos, } int8_t inode_bmap(struct inode *inode, sector_t block, - struct extent_position *pos, kernel_lb_addr *eloc, + struct extent_position *pos, struct kernel_lb_addr *eloc, uint32_t *elen, sector_t *offset) { unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits; @@ -2036,7 +2043,7 @@ int8_t inode_bmap(struct inode *inode, sector_t block, long udf_block_map(struct inode *inode, sector_t block) { - kernel_lb_addr eloc; + struct kernel_lb_addr eloc; uint32_t elen; sector_t offset; struct extent_position epos = {}; @@ -2046,7 +2053,7 @@ long udf_block_map(struct inode *inode, sector_t block) if (inode_bmap(inode, block, &epos, &eloc, &elen, &offset) == (EXT_RECORDED_ALLOCATED >> 30)) - ret = udf_get_lb_pblock(inode->i_sb, eloc, offset); + ret = udf_get_lb_pblock(inode->i_sb, &eloc, offset); else ret = 0; diff --git a/fs/udf/misc.c b/fs/udf/misc.c index 84bf0fd4a4f1..9215700c00a4 100644 --- a/fs/udf/misc.c +++ b/fs/udf/misc.c @@ -134,10 +134,10 @@ struct genericFormat *udf_add_extendedattr(struct inode *inode, uint32_t size, } } /* rewrite CRC + checksum of eahd */ - crclen = sizeof(struct extendedAttrHeaderDesc) - sizeof(tag); + crclen = sizeof(struct extendedAttrHeaderDesc) - sizeof(struct tag); eahd->descTag.descCRCLength = cpu_to_le16(crclen); eahd->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)eahd + - sizeof(tag), crclen)); + sizeof(struct tag), crclen)); eahd->descTag.tagChecksum = udf_tag_checksum(&eahd->descTag); iinfo->i_lenEAttr += size; return (struct genericFormat *)&ea[offset]; @@ -202,7 +202,7 @@ struct genericFormat *udf_get_extendedattr(struct inode *inode, uint32_t type, struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block, uint32_t location, uint16_t *ident) { - tag *tag_p; + struct tag *tag_p; struct buffer_head *bh = NULL; /* Read the block */ @@ -216,7 +216,7 @@ struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block, return NULL; } - tag_p = (tag *)(bh->b_data); + tag_p = (struct tag *)(bh->b_data); *ident = le16_to_cpu(tag_p->tagIdent); @@ -241,9 +241,9 @@ struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block, } /* Verify the descriptor CRC */ - if (le16_to_cpu(tag_p->descCRCLength) + sizeof(tag) > sb->s_blocksize || + if (le16_to_cpu(tag_p->descCRCLength) + sizeof(struct tag) > sb->s_blocksize || le16_to_cpu(tag_p->descCRC) == crc_itu_t(0, - bh->b_data + sizeof(tag), + bh->b_data + sizeof(struct tag), le16_to_cpu(tag_p->descCRCLength))) return bh; @@ -255,27 +255,28 @@ error_out: return NULL; } -struct buffer_head *udf_read_ptagged(struct super_block *sb, kernel_lb_addr loc, +struct buffer_head *udf_read_ptagged(struct super_block *sb, + struct kernel_lb_addr *loc, uint32_t offset, uint16_t *ident) { return udf_read_tagged(sb, udf_get_lb_pblock(sb, loc, offset), - loc.logicalBlockNum + offset, ident); + loc->logicalBlockNum + offset, ident); } void udf_update_tag(char *data, int length) { - tag *tptr = (tag *)data; - length -= sizeof(tag); + struct tag *tptr = (struct tag *)data; + length -= sizeof(struct tag); tptr->descCRCLength = cpu_to_le16(length); - tptr->descCRC = cpu_to_le16(crc_itu_t(0, data + sizeof(tag), length)); + tptr->descCRC = cpu_to_le16(crc_itu_t(0, data + sizeof(struct tag), length)); tptr->tagChecksum = udf_tag_checksum(tptr); } void udf_new_tag(char *data, uint16_t ident, uint16_t version, uint16_t snum, uint32_t loc, int length) { - tag *tptr = (tag *)data; + struct tag *tptr = (struct tag *)data; tptr->tagIdent = cpu_to_le16(ident); tptr->descVersion = cpu_to_le16(version); tptr->tagSerialNum = cpu_to_le16(snum); @@ -283,12 +284,12 @@ void udf_new_tag(char *data, uint16_t ident, uint16_t version, uint16_t snum, udf_update_tag(data, length); } -u8 udf_tag_checksum(const tag *t) +u8 udf_tag_checksum(const struct tag *t) { u8 *data = (u8 *)t; u8 checksum = 0; int i; - for (i = 0; i < sizeof(tag); ++i) + for (i = 0; i < sizeof(struct tag); ++i) if (i != 4) /* position of checksum */ checksum += data[i]; return checksum; diff --git a/fs/udf/namei.c b/fs/udf/namei.c index f84bfaa8d941..6a29fa34c478 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -47,7 +47,7 @@ int udf_write_fi(struct inode *inode, struct fileIdentDesc *cfi, struct fileIdentDesc *sfi, struct udf_fileident_bh *fibh, uint8_t *impuse, uint8_t *fileident) { - uint16_t crclen = fibh->eoffset - fibh->soffset - sizeof(tag); + uint16_t crclen = fibh->eoffset - fibh->soffset - sizeof(struct tag); uint16_t crc; int offset; uint16_t liu = le16_to_cpu(cfi->lengthOfImpUse); @@ -99,18 +99,18 @@ int udf_write_fi(struct inode *inode, struct fileIdentDesc *cfi, memset(fibh->ebh->b_data, 0x00, padlen + offset); } - crc = crc_itu_t(0, (uint8_t *)cfi + sizeof(tag), - sizeof(struct fileIdentDesc) - sizeof(tag)); + crc = crc_itu_t(0, (uint8_t *)cfi + sizeof(struct tag), + sizeof(struct fileIdentDesc) - sizeof(struct tag)); if (fibh->sbh == fibh->ebh) { crc = crc_itu_t(crc, (uint8_t *)sfi->impUse, - crclen + sizeof(tag) - + crclen + sizeof(struct tag) - sizeof(struct fileIdentDesc)); } else if (sizeof(struct fileIdentDesc) >= -fibh->soffset) { crc = crc_itu_t(crc, fibh->ebh->b_data + sizeof(struct fileIdentDesc) + fibh->soffset, - crclen + sizeof(tag) - + crclen + sizeof(struct tag) - sizeof(struct fileIdentDesc)); } else { crc = crc_itu_t(crc, (uint8_t *)sfi->impUse, @@ -154,7 +154,7 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir, uint8_t lfi; uint16_t liu; loff_t size; - kernel_lb_addr eloc; + struct kernel_lb_addr eloc; uint32_t elen; sector_t offset; struct extent_position epos = {}; @@ -171,12 +171,12 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir, if (inode_bmap(dir, f_pos >> dir->i_sb->s_blocksize_bits, &epos, &eloc, &elen, &offset) != (EXT_RECORDED_ALLOCATED >> 30)) goto out_err; - block = udf_get_lb_pblock(dir->i_sb, eloc, offset); + block = udf_get_lb_pblock(dir->i_sb, &eloc, offset); if ((++offset << dir->i_sb->s_blocksize_bits) < elen) { if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) - epos.offset -= sizeof(short_ad); + epos.offset -= sizeof(struct short_ad); else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) - epos.offset -= sizeof(long_ad); + epos.offset -= sizeof(struct long_ad); } else offset = 0; @@ -268,7 +268,7 @@ static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry, #ifdef UDF_RECOVERY /* temporary shorthand for specifying files by inode number */ if (!strncmp(dentry->d_name.name, ".B=", 3)) { - kernel_lb_addr lb = { + struct kernel_lb_addr lb = { .logicalBlockNum = 0, .partitionReferenceNum = simple_strtoul(dentry->d_name.name + 3, @@ -283,11 +283,14 @@ static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry, #endif /* UDF_RECOVERY */ if (udf_find_entry(dir, &dentry->d_name, &fibh, &cfi)) { + struct kernel_lb_addr loc; + if (fibh.sbh != fibh.ebh) brelse(fibh.ebh); brelse(fibh.sbh); - inode = udf_iget(dir->i_sb, lelb_to_cpu(cfi.icb.extLocation)); + loc = lelb_to_cpu(cfi.icb.extLocation); + inode = udf_iget(dir->i_sb, &loc); if (!inode) { unlock_kernel(); return ERR_PTR(-EACCES); @@ -313,7 +316,7 @@ static struct fileIdentDesc *udf_add_entry(struct inode *dir, uint8_t lfi; uint16_t liu; int block; - kernel_lb_addr eloc; + struct kernel_lb_addr eloc; uint32_t elen = 0; sector_t offset; struct extent_position epos = {}; @@ -351,16 +354,16 @@ static struct fileIdentDesc *udf_add_entry(struct inode *dir, if (inode_bmap(dir, f_pos >> dir->i_sb->s_blocksize_bits, &epos, &eloc, &elen, &offset) != (EXT_RECORDED_ALLOCATED >> 30)) { block = udf_get_lb_pblock(dir->i_sb, - dinfo->i_location, 0); + &dinfo->i_location, 0); fibh->soffset = fibh->eoffset = sb->s_blocksize; goto add; } - block = udf_get_lb_pblock(dir->i_sb, eloc, offset); + block = udf_get_lb_pblock(dir->i_sb, &eloc, offset); if ((++offset << dir->i_sb->s_blocksize_bits) < elen) { if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) - epos.offset -= sizeof(short_ad); + epos.offset -= sizeof(struct short_ad); else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) - epos.offset -= sizeof(long_ad); + epos.offset -= sizeof(struct long_ad); } else offset = 0; @@ -409,10 +412,10 @@ add: if (dinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB && elen) { elen = (elen + sb->s_blocksize - 1) & ~(sb->s_blocksize - 1); if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) - epos.offset -= sizeof(short_ad); + epos.offset -= sizeof(struct short_ad); else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) - epos.offset -= sizeof(long_ad); - udf_write_aext(dir, &epos, eloc, elen, 1); + epos.offset -= sizeof(struct long_ad); + udf_write_aext(dir, &epos, &eloc, elen, 1); } f_pos += nfidlen; @@ -494,10 +497,10 @@ add: memset(cfi, 0, sizeof(struct fileIdentDesc)); if (UDF_SB(sb)->s_udfrev >= 0x0200) udf_new_tag((char *)cfi, TAG_IDENT_FID, 3, 1, block, - sizeof(tag)); + sizeof(struct tag)); else udf_new_tag((char *)cfi, TAG_IDENT_FID, 2, 1, block, - sizeof(tag)); + sizeof(struct tag)); cfi->fileVersionNum = cpu_to_le16(1); cfi->lengthFileIdent = namelen; cfi->lengthOfImpUse = cpu_to_le16(0); @@ -530,7 +533,7 @@ static int udf_delete_entry(struct inode *inode, struct fileIdentDesc *fi, cfi->fileCharacteristics |= FID_FILE_CHAR_DELETED; if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT)) - memset(&(cfi->icb), 0x00, sizeof(long_ad)); + memset(&(cfi->icb), 0x00, sizeof(struct long_ad)); return udf_write_fi(inode, cfi, fi, fibh, NULL, NULL); } @@ -710,7 +713,7 @@ static int empty_dir(struct inode *dir) loff_t f_pos; loff_t size = udf_ext0_offset(dir) + dir->i_size; int block; - kernel_lb_addr eloc; + struct kernel_lb_addr eloc; uint32_t elen; sector_t offset; struct extent_position epos = {}; @@ -724,12 +727,12 @@ static int empty_dir(struct inode *dir) else if (inode_bmap(dir, f_pos >> dir->i_sb->s_blocksize_bits, &epos, &eloc, &elen, &offset) == (EXT_RECORDED_ALLOCATED >> 30)) { - block = udf_get_lb_pblock(dir->i_sb, eloc, offset); + block = udf_get_lb_pblock(dir->i_sb, &eloc, offset); if ((++offset << dir->i_sb->s_blocksize_bits) < elen) { if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) - epos.offset -= sizeof(short_ad); + epos.offset -= sizeof(struct short_ad); else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) - epos.offset -= sizeof(long_ad); + epos.offset -= sizeof(struct long_ad); } else offset = 0; @@ -778,7 +781,7 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry) struct inode *inode = dentry->d_inode; struct udf_fileident_bh fibh; struct fileIdentDesc *fi, cfi; - kernel_lb_addr tloc; + struct kernel_lb_addr tloc; retval = -ENOENT; lock_kernel(); @@ -788,7 +791,7 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry) retval = -EIO; tloc = lelb_to_cpu(cfi.icb.extLocation); - if (udf_get_lb_pblock(dir->i_sb, tloc, 0) != inode->i_ino) + if (udf_get_lb_pblock(dir->i_sb, &tloc, 0) != inode->i_ino) goto end_rmdir; retval = -ENOTEMPTY; if (!empty_dir(inode)) @@ -824,7 +827,7 @@ static int udf_unlink(struct inode *dir, struct dentry *dentry) struct udf_fileident_bh fibh; struct fileIdentDesc *fi; struct fileIdentDesc cfi; - kernel_lb_addr tloc; + struct kernel_lb_addr tloc; retval = -ENOENT; lock_kernel(); @@ -834,7 +837,7 @@ static int udf_unlink(struct inode *dir, struct dentry *dentry) retval = -EIO; tloc = lelb_to_cpu(cfi.icb.extLocation); - if (udf_get_lb_pblock(dir->i_sb, tloc, 0) != inode->i_ino) + if (udf_get_lb_pblock(dir->i_sb, &tloc, 0) != inode->i_ino) goto end_unlink; if (!inode->i_nlink) { @@ -897,7 +900,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry, inode->i_op = &page_symlink_inode_operations; if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { - kernel_lb_addr eloc; + struct kernel_lb_addr eloc; uint32_t bsize; block = udf_new_block(inode->i_sb, inode, @@ -913,7 +916,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry, iinfo->i_location.partitionReferenceNum; bsize = inode->i_sb->s_blocksize; iinfo->i_lenExtents = bsize; - udf_add_aext(inode, &epos, eloc, bsize, 0); + udf_add_aext(inode, &epos, &eloc, bsize, 0); brelse(epos.bh); block = udf_get_pblock(inode->i_sb, block, @@ -1108,7 +1111,7 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry, struct fileIdentDesc ocfi, ncfi; struct buffer_head *dir_bh = NULL; int retval = -ENOENT; - kernel_lb_addr tloc; + struct kernel_lb_addr tloc; struct udf_inode_info *old_iinfo = UDF_I(old_inode); lock_kernel(); @@ -1119,7 +1122,7 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry, brelse(ofibh.sbh); } tloc = lelb_to_cpu(ocfi.icb.extLocation); - if (!ofi || udf_get_lb_pblock(old_dir->i_sb, tloc, 0) + if (!ofi || udf_get_lb_pblock(old_dir->i_sb, &tloc, 0) != old_inode->i_ino) goto end_rename; @@ -1158,7 +1161,7 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry, if (!dir_fi) goto end_rename; tloc = lelb_to_cpu(dir_fi->icb.extLocation); - if (udf_get_lb_pblock(old_inode->i_sb, tloc, 0) != + if (udf_get_lb_pblock(old_inode->i_sb, &tloc, 0) != old_dir->i_ino) goto end_rename; @@ -1187,7 +1190,7 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry, */ ncfi.fileVersionNum = ocfi.fileVersionNum; ncfi.fileCharacteristics = ocfi.fileCharacteristics; - memcpy(&(ncfi.icb), &(ocfi.icb), sizeof(long_ad)); + memcpy(&(ncfi.icb), &(ocfi.icb), sizeof(struct long_ad)); udf_write_fi(new_dir, &ncfi, nfi, &nfibh, NULL, NULL); /* The old fid may have moved - find it again */ @@ -1242,6 +1245,7 @@ end_rename: static struct dentry *udf_get_parent(struct dentry *child) { + struct kernel_lb_addr tloc; struct inode *inode = NULL; struct qstr dotdot = {.name = "..", .len = 2}; struct fileIdentDesc cfi; @@ -1255,8 +1259,8 @@ static struct dentry *udf_get_parent(struct dentry *child) brelse(fibh.ebh); brelse(fibh.sbh); - inode = udf_iget(child->d_inode->i_sb, - lelb_to_cpu(cfi.icb.extLocation)); + tloc = lelb_to_cpu(cfi.icb.extLocation); + inode = udf_iget(child->d_inode->i_sb, &tloc); if (!inode) goto out_unlock; unlock_kernel(); @@ -1272,14 +1276,14 @@ static struct dentry *udf_nfs_get_inode(struct super_block *sb, u32 block, u16 partref, __u32 generation) { struct inode *inode; - kernel_lb_addr loc; + struct kernel_lb_addr loc; if (block == 0) return ERR_PTR(-ESTALE); loc.logicalBlockNum = block; loc.partitionReferenceNum = partref; - inode = udf_iget(sb, loc); + inode = udf_iget(sb, &loc); if (inode == NULL) return ERR_PTR(-ENOMEM); @@ -1318,7 +1322,7 @@ static int udf_encode_fh(struct dentry *de, __u32 *fh, int *lenp, { int len = *lenp; struct inode *inode = de->d_inode; - kernel_lb_addr location = UDF_I(inode)->i_location; + struct kernel_lb_addr location = UDF_I(inode)->i_location; struct fid *fid = (struct fid *)fh; int type = FILEID_UDF_WITHOUT_PARENT; diff --git a/fs/udf/osta_udf.h b/fs/udf/osta_udf.h index 65ff47902bd2..fbff74654df2 100644 --- a/fs/udf/osta_udf.h +++ b/fs/udf/osta_udf.h @@ -85,7 +85,7 @@ struct appIdentSuffix { /* Logical Volume Integrity Descriptor (UDF 2.50 2.2.6) */ /* Implementation Use (UDF 2.50 2.2.6.4) */ struct logicalVolIntegrityDescImpUse { - regid impIdent; + struct regid impIdent; __le32 numFiles; __le32 numDirs; __le16 minUDFReadRev; @@ -97,12 +97,12 @@ struct logicalVolIntegrityDescImpUse { /* Implementation Use Volume Descriptor (UDF 2.50 2.2.7) */ /* Implementation Use (UDF 2.50 2.2.7.2) */ struct impUseVolDescImpUse { - charspec LVICharset; + struct charspec LVICharset; dstring logicalVolIdent[128]; dstring LVInfo1[36]; dstring LVInfo2[36]; dstring LVInfo3[36]; - regid impIdent; + struct regid impIdent; uint8_t impUse[128]; } __attribute__ ((packed)); @@ -110,7 +110,7 @@ struct udfPartitionMap2 { uint8_t partitionMapType; uint8_t partitionMapLength; uint8_t reserved1[2]; - regid partIdent; + struct regid partIdent; __le16 volSeqNum; __le16 partitionNum; } __attribute__ ((packed)); @@ -120,7 +120,7 @@ struct virtualPartitionMap { uint8_t partitionMapType; uint8_t partitionMapLength; uint8_t reserved1[2]; - regid partIdent; + struct regid partIdent; __le16 volSeqNum; __le16 partitionNum; uint8_t reserved2[24]; @@ -131,7 +131,7 @@ struct sparablePartitionMap { uint8_t partitionMapType; uint8_t partitionMapLength; uint8_t reserved1[2]; - regid partIdent; + struct regid partIdent; __le16 volSeqNum; __le16 partitionNum; __le16 packetLength; @@ -146,7 +146,7 @@ struct metadataPartitionMap { uint8_t partitionMapType; uint8_t partitionMapLength; uint8_t reserved1[2]; - regid partIdent; + struct regid partIdent; __le16 volSeqNum; __le16 partitionNum; __le32 metadataFileLoc; @@ -161,7 +161,7 @@ struct metadataPartitionMap { /* Virtual Allocation Table (UDF 1.5 2.2.10) */ struct virtualAllocationTable15 { __le32 VirtualSector[0]; - regid vatIdent; + struct regid vatIdent; __le32 previousVATICBLoc; } __attribute__ ((packed)); @@ -192,8 +192,8 @@ struct sparingEntry { } __attribute__ ((packed)); struct sparingTable { - tag descTag; - regid sparingIdent; + struct tag descTag; + struct regid sparingIdent; __le16 reallocationTableLen; __le16 reserved; __le32 sequenceNum; @@ -206,7 +206,7 @@ struct sparingTable { #define ICBTAG_FILE_TYPE_MIRROR 0xFB #define ICBTAG_FILE_TYPE_BITMAP 0xFC -/* struct long_ad ICB - ADImpUse (UDF 2.50 2.2.4.3) */ +/* struct struct long_ad ICB - ADImpUse (UDF 2.50 2.2.4.3) */ struct allocDescImpUse { __le16 flags; uint8_t impUse[4]; diff --git a/fs/udf/partition.c b/fs/udf/partition.c index 96dfd207c3d6..4b540ee632d5 100644 --- a/fs/udf/partition.c +++ b/fs/udf/partition.c @@ -273,7 +273,7 @@ static uint32_t udf_try_read_meta(struct inode *inode, uint32_t block, { struct super_block *sb = inode->i_sb; struct udf_part_map *map; - kernel_lb_addr eloc; + struct kernel_lb_addr eloc; uint32_t elen; sector_t ext_offset; struct extent_position epos = {}; diff --git a/fs/udf/super.c b/fs/udf/super.c index e25e7010627b..72348cc855a4 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -81,16 +81,13 @@ static char error_buf[1024]; /* These are the "meat" - everything else is stuffing */ static int udf_fill_super(struct super_block *, void *, int); static void udf_put_super(struct super_block *); -static void udf_write_super(struct super_block *); +static int udf_sync_fs(struct super_block *, int); static int udf_remount_fs(struct super_block *, int *, char *); -static int udf_check_valid(struct super_block *, int, int); -static int udf_vrs(struct super_block *sb, int silent); -static void udf_load_logicalvolint(struct super_block *, kernel_extent_ad); -static void udf_find_anchor(struct super_block *); -static int udf_find_fileset(struct super_block *, kernel_lb_addr *, - kernel_lb_addr *); +static void udf_load_logicalvolint(struct super_block *, struct kernel_extent_ad); +static int udf_find_fileset(struct super_block *, struct kernel_lb_addr *, + struct kernel_lb_addr *); static void udf_load_fileset(struct super_block *, struct buffer_head *, - kernel_lb_addr *); + struct kernel_lb_addr *); static void udf_open_lvid(struct super_block *); static void udf_close_lvid(struct super_block *); static unsigned int udf_count_free(struct super_block *); @@ -181,7 +178,7 @@ static const struct super_operations udf_sb_ops = { .delete_inode = udf_delete_inode, .clear_inode = udf_clear_inode, .put_super = udf_put_super, - .write_super = udf_write_super, + .sync_fs = udf_sync_fs, .statfs = udf_statfs, .remount_fs = udf_remount_fs, .show_options = udf_show_options, @@ -201,6 +198,8 @@ struct udf_options { mode_t umask; gid_t gid; uid_t uid; + mode_t fmode; + mode_t dmode; struct nls_table *nls_map; }; @@ -258,7 +257,7 @@ static int udf_show_options(struct seq_file *seq, struct vfsmount *mnt) if (!UDF_QUERY_FLAG(sb, UDF_FLAG_STRICT)) seq_puts(seq, ",nostrict"); - if (sb->s_blocksize != UDF_DEFAULT_BLOCKSIZE) + if (UDF_QUERY_FLAG(sb, UDF_FLAG_BLOCKSIZE_SET)) seq_printf(seq, ",bs=%lu", sb->s_blocksize); if (UDF_QUERY_FLAG(sb, UDF_FLAG_UNHIDE)) seq_puts(seq, ",unhide"); @@ -282,18 +281,16 @@ static int udf_show_options(struct seq_file *seq, struct vfsmount *mnt) seq_printf(seq, ",gid=%u", sbi->s_gid); if (sbi->s_umask != 0) seq_printf(seq, ",umask=%o", sbi->s_umask); + if (sbi->s_fmode != UDF_INVALID_MODE) + seq_printf(seq, ",mode=%o", sbi->s_fmode); + if (sbi->s_dmode != UDF_INVALID_MODE) + seq_printf(seq, ",dmode=%o", sbi->s_dmode); if (UDF_QUERY_FLAG(sb, UDF_FLAG_SESSION_SET)) seq_printf(seq, ",session=%u", sbi->s_session); if (UDF_QUERY_FLAG(sb, UDF_FLAG_LASTBLOCK_SET)) seq_printf(seq, ",lastblock=%u", sbi->s_last_block); - /* - * s_anchor[2] could be zeroed out in case there is no anchor - * in the specified block, but then the "anchor=N" option - * originally given by the user wasn't effective, so it's OK - * if we don't show it. - */ - if (sbi->s_anchor[2] != 0) - seq_printf(seq, ",anchor=%u", sbi->s_anchor[2]); + if (sbi->s_anchor != 0) + seq_printf(seq, ",anchor=%u", sbi->s_anchor); /* * volume, partition, fileset and rootdir seem to be ignored * currently @@ -317,6 +314,8 @@ static int udf_show_options(struct seq_file *seq, struct vfsmount *mnt) * * gid= Set the default group. * umask= Set the default umask. + * mode= Set the default file permissions. + * dmode= Set the default directory permissions. * uid= Set the default user. * bs= Set the block size. * unhide Show otherwise hidden files. @@ -366,7 +365,8 @@ enum { Opt_gid, Opt_uid, Opt_umask, Opt_session, Opt_lastblock, Opt_anchor, Opt_volume, Opt_partition, Opt_fileset, Opt_rootdir, Opt_utf8, Opt_iocharset, - Opt_err, Opt_uforget, Opt_uignore, Opt_gforget, Opt_gignore + Opt_err, Opt_uforget, Opt_uignore, Opt_gforget, Opt_gignore, + Opt_fmode, Opt_dmode }; static const match_table_t tokens = { @@ -395,6 +395,8 @@ static const match_table_t tokens = { {Opt_rootdir, "rootdir=%u"}, {Opt_utf8, "utf8"}, {Opt_iocharset, "iocharset=%s"}, + {Opt_fmode, "mode=%o"}, + {Opt_dmode, "dmode=%o"}, {Opt_err, NULL} }; @@ -405,7 +407,6 @@ static int udf_parse_options(char *options, struct udf_options *uopt, int option; uopt->novrs = 0; - uopt->blocksize = UDF_DEFAULT_BLOCKSIZE; uopt->partition = 0xFFFF; uopt->session = 0xFFFFFFFF; uopt->lastblock = 0; @@ -428,10 +429,12 @@ static int udf_parse_options(char *options, struct udf_options *uopt, switch (token) { case Opt_novrs: uopt->novrs = 1; + break; case Opt_bs: if (match_int(&args[0], &option)) return 0; uopt->blocksize = option; + uopt->flags |= (1 << UDF_FLAG_BLOCKSIZE_SET); break; case Opt_unhide: uopt->flags |= (1 << UDF_FLAG_UNHIDE); @@ -531,6 +534,16 @@ static int udf_parse_options(char *options, struct udf_options *uopt, case Opt_gforget: uopt->flags |= (1 << UDF_FLAG_GID_FORGET); break; + case Opt_fmode: + if (match_octal(args, &option)) + return 0; + uopt->fmode = option & 0777; + break; + case Opt_dmode: + if (match_octal(args, &option)) + return 0; + uopt->dmode = option & 0777; + break; default: printk(KERN_ERR "udf: bad mount option \"%s\" " "or missing value\n", p); @@ -540,17 +553,6 @@ static int udf_parse_options(char *options, struct udf_options *uopt, return 1; } -static void udf_write_super(struct super_block *sb) -{ - lock_kernel(); - - if (!(sb->s_flags & MS_RDONLY)) - udf_open_lvid(sb); - sb->s_dirt = 0; - - unlock_kernel(); -} - static int udf_remount_fs(struct super_block *sb, int *flags, char *options) { struct udf_options uopt; @@ -560,6 +562,8 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options) uopt.uid = sbi->s_uid; uopt.gid = sbi->s_gid; uopt.umask = sbi->s_umask; + uopt.fmode = sbi->s_fmode; + uopt.dmode = sbi->s_dmode; if (!udf_parse_options(options, &uopt, true)) return -EINVAL; @@ -568,6 +572,8 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options) sbi->s_uid = uopt.uid; sbi->s_gid = uopt.gid; sbi->s_umask = uopt.umask; + sbi->s_fmode = uopt.fmode; + sbi->s_dmode = uopt.dmode; if (sbi->s_lvid_bh) { int write_rev = le16_to_cpu(udf_sb_lvidiu(sbi)->minUDFWriteRev); @@ -585,22 +591,19 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options) return 0; } -static int udf_vrs(struct super_block *sb, int silent) +/* Check Volume Structure Descriptors (ECMA 167 2/9.1) */ +/* We also check any "CD-ROM Volume Descriptor Set" (ECMA 167 2/8.3.1) */ +static loff_t udf_check_vsd(struct super_block *sb) { struct volStructDesc *vsd = NULL; loff_t sector = 32768; int sectorsize; struct buffer_head *bh = NULL; - int iso9660 = 0; int nsr02 = 0; int nsr03 = 0; struct udf_sb_info *sbi; - /* Block size must be a multiple of 512 */ - if (sb->s_blocksize & 511) - return 0; sbi = UDF_SB(sb); - if (sb->s_blocksize < sizeof(struct volStructDesc)) sectorsize = sizeof(struct volStructDesc); else @@ -627,7 +630,6 @@ static int udf_vrs(struct super_block *sb, int silent) break; } else if (!strncmp(vsd->stdIdent, VSD_STD_ID_CD001, VSD_STD_ID_LEN)) { - iso9660 = sector; switch (vsd->structType) { case 0: udf_debug("ISO9660 Boot Record found\n"); @@ -679,139 +681,9 @@ static int udf_vrs(struct super_block *sb, int silent) return 0; } -/* - * Check whether there is an anchor block in the given block - */ -static int udf_check_anchor_block(struct super_block *sb, sector_t block) -{ - struct buffer_head *bh; - uint16_t ident; - - if (UDF_QUERY_FLAG(sb, UDF_FLAG_VARCONV) && - udf_fixed_to_variable(block) >= - sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits) - return 0; - - bh = udf_read_tagged(sb, block, block, &ident); - if (!bh) - return 0; - brelse(bh); - - return ident == TAG_IDENT_AVDP; -} - -/* Search for an anchor volume descriptor pointer */ -static sector_t udf_scan_anchors(struct super_block *sb, sector_t lastblock) -{ - sector_t last[6]; - int i; - struct udf_sb_info *sbi = UDF_SB(sb); - - last[0] = lastblock; - last[1] = last[0] - 1; - last[2] = last[0] + 1; - last[3] = last[0] - 2; - last[4] = last[0] - 150; - last[5] = last[0] - 152; - - /* according to spec, anchor is in either: - * block 256 - * lastblock-256 - * lastblock - * however, if the disc isn't closed, it could be 512 */ - - for (i = 0; i < ARRAY_SIZE(last); i++) { - if (last[i] < 0) - continue; - if (last[i] >= sb->s_bdev->bd_inode->i_size >> - sb->s_blocksize_bits) - continue; - - if (udf_check_anchor_block(sb, last[i])) { - sbi->s_anchor[0] = last[i]; - sbi->s_anchor[1] = last[i] - 256; - return last[i]; - } - - if (last[i] < 256) - continue; - - if (udf_check_anchor_block(sb, last[i] - 256)) { - sbi->s_anchor[1] = last[i] - 256; - return last[i]; - } - } - - if (udf_check_anchor_block(sb, sbi->s_session + 256)) { - sbi->s_anchor[0] = sbi->s_session + 256; - return last[0]; - } - if (udf_check_anchor_block(sb, sbi->s_session + 512)) { - sbi->s_anchor[0] = sbi->s_session + 512; - return last[0]; - } - return 0; -} - -/* - * Find an anchor volume descriptor. The function expects sbi->s_lastblock to - * be the last block on the media. - * - * Return 1 if not found, 0 if ok - * - */ -static void udf_find_anchor(struct super_block *sb) -{ - sector_t lastblock; - struct buffer_head *bh = NULL; - uint16_t ident; - int i; - struct udf_sb_info *sbi = UDF_SB(sb); - - lastblock = udf_scan_anchors(sb, sbi->s_last_block); - if (lastblock) - goto check_anchor; - - /* No anchor found? Try VARCONV conversion of block numbers */ - UDF_SET_FLAG(sb, UDF_FLAG_VARCONV); - /* Firstly, we try to not convert number of the last block */ - lastblock = udf_scan_anchors(sb, - udf_variable_to_fixed(sbi->s_last_block)); - if (lastblock) - goto check_anchor; - - /* Secondly, we try with converted number of the last block */ - lastblock = udf_scan_anchors(sb, sbi->s_last_block); - if (!lastblock) { - /* VARCONV didn't help. Clear it. */ - UDF_CLEAR_FLAG(sb, UDF_FLAG_VARCONV); - } - -check_anchor: - /* - * Check located anchors and the anchor block supplied via - * mount options - */ - for (i = 0; i < ARRAY_SIZE(sbi->s_anchor); i++) { - if (!sbi->s_anchor[i]) - continue; - bh = udf_read_tagged(sb, sbi->s_anchor[i], - sbi->s_anchor[i], &ident); - if (!bh) - sbi->s_anchor[i] = 0; - else { - brelse(bh); - if (ident != TAG_IDENT_AVDP) - sbi->s_anchor[i] = 0; - } - } - - sbi->s_last_block = lastblock; -} - static int udf_find_fileset(struct super_block *sb, - kernel_lb_addr *fileset, - kernel_lb_addr *root) + struct kernel_lb_addr *fileset, + struct kernel_lb_addr *root) { struct buffer_head *bh = NULL; long lastblock; @@ -820,7 +692,7 @@ static int udf_find_fileset(struct super_block *sb, if (fileset->logicalBlockNum != 0xFFFFFFFF || fileset->partitionReferenceNum != 0xFFFF) { - bh = udf_read_ptagged(sb, *fileset, 0, &ident); + bh = udf_read_ptagged(sb, fileset, 0, &ident); if (!bh) { return 1; @@ -834,7 +706,7 @@ static int udf_find_fileset(struct super_block *sb, sbi = UDF_SB(sb); if (!bh) { /* Search backwards through the partitions */ - kernel_lb_addr newfileset; + struct kernel_lb_addr newfileset; /* --> cvg: FIXME - is it reasonable? */ return 1; @@ -850,7 +722,7 @@ static int udf_find_fileset(struct super_block *sb, newfileset.logicalBlockNum = 0; do { - bh = udf_read_ptagged(sb, newfileset, 0, + bh = udf_read_ptagged(sb, &newfileset, 0, &ident); if (!bh) { newfileset.logicalBlockNum++; @@ -902,14 +774,23 @@ static int udf_find_fileset(struct super_block *sb, static int udf_load_pvoldesc(struct super_block *sb, sector_t block) { struct primaryVolDesc *pvoldesc; - struct ustr instr; - struct ustr outstr; + struct ustr *instr, *outstr; struct buffer_head *bh; uint16_t ident; + int ret = 1; + + instr = kmalloc(sizeof(struct ustr), GFP_NOFS); + if (!instr) + return 1; + + outstr = kmalloc(sizeof(struct ustr), GFP_NOFS); + if (!outstr) + goto out1; bh = udf_read_tagged(sb, block, block, &ident); if (!bh) - return 1; + goto out2; + BUG_ON(ident != TAG_IDENT_PVD); pvoldesc = (struct primaryVolDesc *)bh->b_data; @@ -917,7 +798,7 @@ static int udf_load_pvoldesc(struct super_block *sb, sector_t block) if (udf_disk_stamp_to_time(&UDF_SB(sb)->s_record_time, pvoldesc->recordingDateAndTime)) { #ifdef UDFFS_DEBUG - timestamp *ts = &pvoldesc->recordingDateAndTime; + struct timestamp *ts = &pvoldesc->recordingDateAndTime; udf_debug("recording time %04u/%02u/%02u" " %02u:%02u (%x)\n", le16_to_cpu(ts->year), ts->month, ts->day, ts->hour, @@ -925,20 +806,25 @@ static int udf_load_pvoldesc(struct super_block *sb, sector_t block) #endif } - if (!udf_build_ustr(&instr, pvoldesc->volIdent, 32)) - if (udf_CS0toUTF8(&outstr, &instr)) { - strncpy(UDF_SB(sb)->s_volume_ident, outstr.u_name, - outstr.u_len > 31 ? 31 : outstr.u_len); + if (!udf_build_ustr(instr, pvoldesc->volIdent, 32)) + if (udf_CS0toUTF8(outstr, instr)) { + strncpy(UDF_SB(sb)->s_volume_ident, outstr->u_name, + outstr->u_len > 31 ? 31 : outstr->u_len); udf_debug("volIdent[] = '%s'\n", UDF_SB(sb)->s_volume_ident); } - if (!udf_build_ustr(&instr, pvoldesc->volSetIdent, 128)) - if (udf_CS0toUTF8(&outstr, &instr)) - udf_debug("volSetIdent[] = '%s'\n", outstr.u_name); + if (!udf_build_ustr(instr, pvoldesc->volSetIdent, 128)) + if (udf_CS0toUTF8(outstr, instr)) + udf_debug("volSetIdent[] = '%s'\n", outstr->u_name); brelse(bh); - return 0; + ret = 0; +out2: + kfree(outstr); +out1: + kfree(instr); + return ret; } static int udf_load_metadata_files(struct super_block *sb, int partition) @@ -946,7 +832,7 @@ static int udf_load_metadata_files(struct super_block *sb, int partition) struct udf_sb_info *sbi = UDF_SB(sb); struct udf_part_map *map; struct udf_meta_data *mdata; - kernel_lb_addr addr; + struct kernel_lb_addr addr; int fe_error = 0; map = &sbi->s_partmaps[partition]; @@ -959,7 +845,7 @@ static int udf_load_metadata_files(struct super_block *sb, int partition) udf_debug("Metadata file location: block = %d part = %d\n", addr.logicalBlockNum, addr.partitionReferenceNum); - mdata->s_metadata_fe = udf_iget(sb, addr); + mdata->s_metadata_fe = udf_iget(sb, &addr); if (mdata->s_metadata_fe == NULL) { udf_warning(sb, __func__, "metadata inode efe not found, " @@ -981,7 +867,7 @@ static int udf_load_metadata_files(struct super_block *sb, int partition) udf_debug("Mirror metadata file location: block = %d part = %d\n", addr.logicalBlockNum, addr.partitionReferenceNum); - mdata->s_mirror_fe = udf_iget(sb, addr); + mdata->s_mirror_fe = udf_iget(sb, &addr); if (mdata->s_mirror_fe == NULL) { if (fe_error) { @@ -1013,7 +899,7 @@ static int udf_load_metadata_files(struct super_block *sb, int partition) udf_debug("Bitmap file location: block = %d part = %d\n", addr.logicalBlockNum, addr.partitionReferenceNum); - mdata->s_bitmap_fe = udf_iget(sb, addr); + mdata->s_bitmap_fe = udf_iget(sb, &addr); if (mdata->s_bitmap_fe == NULL) { if (sb->s_flags & MS_RDONLY) @@ -1037,7 +923,7 @@ error_exit: } static void udf_load_fileset(struct super_block *sb, struct buffer_head *bh, - kernel_lb_addr *root) + struct kernel_lb_addr *root) { struct fileSetDesc *fset; @@ -1119,13 +1005,13 @@ static int udf_fill_partdesc_info(struct super_block *sb, phd = (struct partitionHeaderDesc *)p->partitionContentsUse; if (phd->unallocSpaceTable.extLength) { - kernel_lb_addr loc = { + struct kernel_lb_addr loc = { .logicalBlockNum = le32_to_cpu( phd->unallocSpaceTable.extPosition), .partitionReferenceNum = p_index, }; - map->s_uspace.s_table = udf_iget(sb, loc); + map->s_uspace.s_table = udf_iget(sb, &loc); if (!map->s_uspace.s_table) { udf_debug("cannot load unallocSpaceTable (part %d)\n", p_index); @@ -1154,13 +1040,13 @@ static int udf_fill_partdesc_info(struct super_block *sb, udf_debug("partitionIntegrityTable (part %d)\n", p_index); if (phd->freedSpaceTable.extLength) { - kernel_lb_addr loc = { + struct kernel_lb_addr loc = { .logicalBlockNum = le32_to_cpu( phd->freedSpaceTable.extPosition), .partitionReferenceNum = p_index, }; - map->s_fspace.s_table = udf_iget(sb, loc); + map->s_fspace.s_table = udf_iget(sb, &loc); if (!map->s_fspace.s_table) { udf_debug("cannot load freedSpaceTable (part %d)\n", p_index); @@ -1192,7 +1078,7 @@ static int udf_load_vat(struct super_block *sb, int p_index, int type1_index) { struct udf_sb_info *sbi = UDF_SB(sb); struct udf_part_map *map = &sbi->s_partmaps[p_index]; - kernel_lb_addr ino; + struct kernel_lb_addr ino; struct buffer_head *bh = NULL; struct udf_inode_info *vati; uint32_t pos; @@ -1201,7 +1087,7 @@ static int udf_load_vat(struct super_block *sb, int p_index, int type1_index) /* VAT file entry is in the last recorded block */ ino.partitionReferenceNum = type1_index; ino.logicalBlockNum = sbi->s_last_block - map->s_partition_root; - sbi->s_vat_inode = udf_iget(sb, ino); + sbi->s_vat_inode = udf_iget(sb, &ino); if (!sbi->s_vat_inode) return 1; @@ -1322,7 +1208,7 @@ out_bh: } static int udf_load_logicalvol(struct super_block *sb, sector_t block, - kernel_lb_addr *fileset) + struct kernel_lb_addr *fileset) { struct logicalVolDesc *lvd; int i, j, offset; @@ -1471,7 +1357,7 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block, } if (fileset) { - long_ad *la = (long_ad *)&(lvd->logicalVolContentsUse[0]); + struct long_ad *la = (struct long_ad *)&(lvd->logicalVolContentsUse[0]); *fileset = lelb_to_cpu(la->extLocation); udf_debug("FileSet found in LogicalVolDesc at block=%d, " @@ -1490,7 +1376,7 @@ out_bh: * udf_load_logicalvolint * */ -static void udf_load_logicalvolint(struct super_block *sb, kernel_extent_ad loc) +static void udf_load_logicalvolint(struct super_block *sb, struct kernel_extent_ad loc) { struct buffer_head *bh = NULL; uint16_t ident; @@ -1533,7 +1419,7 @@ static void udf_load_logicalvolint(struct super_block *sb, kernel_extent_ad loc) * Written, tested, and released. */ static noinline int udf_process_sequence(struct super_block *sb, long block, - long lastblock, kernel_lb_addr *fileset) + long lastblock, struct kernel_lb_addr *fileset) { struct buffer_head *bh = NULL; struct udf_vds_record vds[VDS_POS_LENGTH]; @@ -1655,85 +1541,199 @@ static noinline int udf_process_sequence(struct super_block *sb, long block, return 0; } +static int udf_load_sequence(struct super_block *sb, struct buffer_head *bh, + struct kernel_lb_addr *fileset) +{ + struct anchorVolDescPtr *anchor; + long main_s, main_e, reserve_s, reserve_e; + struct udf_sb_info *sbi; + + sbi = UDF_SB(sb); + anchor = (struct anchorVolDescPtr *)bh->b_data; + + /* Locate the main sequence */ + main_s = le32_to_cpu(anchor->mainVolDescSeqExt.extLocation); + main_e = le32_to_cpu(anchor->mainVolDescSeqExt.extLength); + main_e = main_e >> sb->s_blocksize_bits; + main_e += main_s; + + /* Locate the reserve sequence */ + reserve_s = le32_to_cpu(anchor->reserveVolDescSeqExt.extLocation); + reserve_e = le32_to_cpu(anchor->reserveVolDescSeqExt.extLength); + reserve_e = reserve_e >> sb->s_blocksize_bits; + reserve_e += reserve_s; + + /* Process the main & reserve sequences */ + /* responsible for finding the PartitionDesc(s) */ + if (!udf_process_sequence(sb, main_s, main_e, fileset)) + return 1; + return !udf_process_sequence(sb, reserve_s, reserve_e, fileset); +} + /* - * udf_check_valid() + * Check whether there is an anchor block in the given block and + * load Volume Descriptor Sequence if so. */ -static int udf_check_valid(struct super_block *sb, int novrs, int silent) +static int udf_check_anchor_block(struct super_block *sb, sector_t block, + struct kernel_lb_addr *fileset) { - long block; - struct udf_sb_info *sbi = UDF_SB(sb); + struct buffer_head *bh; + uint16_t ident; + int ret; - if (novrs) { - udf_debug("Validity check skipped because of novrs option\n"); + if (UDF_QUERY_FLAG(sb, UDF_FLAG_VARCONV) && + udf_fixed_to_variable(block) >= + sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits) + return 0; + + bh = udf_read_tagged(sb, block, block, &ident); + if (!bh) + return 0; + if (ident != TAG_IDENT_AVDP) { + brelse(bh); return 0; } - /* Check that it is NSR02 compliant */ - /* Process any "CD-ROM Volume Descriptor Set" (ECMA 167 2/8.3.1) */ - block = udf_vrs(sb, silent); - if (block == -1) - udf_debug("Failed to read byte 32768. Assuming open " - "disc. Skipping validity check\n"); - if (block && !sbi->s_last_block) - sbi->s_last_block = udf_get_last_block(sb); - return !block; + ret = udf_load_sequence(sb, bh, fileset); + brelse(bh); + return ret; } -static int udf_load_sequence(struct super_block *sb, kernel_lb_addr *fileset) +/* Search for an anchor volume descriptor pointer */ +static sector_t udf_scan_anchors(struct super_block *sb, sector_t lastblock, + struct kernel_lb_addr *fileset) { - struct anchorVolDescPtr *anchor; - uint16_t ident; - struct buffer_head *bh; - long main_s, main_e, reserve_s, reserve_e; + sector_t last[6]; int i; - struct udf_sb_info *sbi; - - if (!sb) - return 1; - sbi = UDF_SB(sb); + struct udf_sb_info *sbi = UDF_SB(sb); + int last_count = 0; - for (i = 0; i < ARRAY_SIZE(sbi->s_anchor); i++) { - if (!sbi->s_anchor[i]) + /* First try user provided anchor */ + if (sbi->s_anchor) { + if (udf_check_anchor_block(sb, sbi->s_anchor, fileset)) + return lastblock; + } + /* + * according to spec, anchor is in either: + * block 256 + * lastblock-256 + * lastblock + * however, if the disc isn't closed, it could be 512. + */ + if (udf_check_anchor_block(sb, sbi->s_session + 256, fileset)) + return lastblock; + /* + * The trouble is which block is the last one. Drives often misreport + * this so we try various possibilities. + */ + last[last_count++] = lastblock; + if (lastblock >= 1) + last[last_count++] = lastblock - 1; + last[last_count++] = lastblock + 1; + if (lastblock >= 2) + last[last_count++] = lastblock - 2; + if (lastblock >= 150) + last[last_count++] = lastblock - 150; + if (lastblock >= 152) + last[last_count++] = lastblock - 152; + + for (i = 0; i < last_count; i++) { + if (last[i] >= sb->s_bdev->bd_inode->i_size >> + sb->s_blocksize_bits) continue; - - bh = udf_read_tagged(sb, sbi->s_anchor[i], sbi->s_anchor[i], - &ident); - if (!bh) + if (udf_check_anchor_block(sb, last[i], fileset)) + return last[i]; + if (last[i] < 256) continue; + if (udf_check_anchor_block(sb, last[i] - 256, fileset)) + return last[i]; + } - anchor = (struct anchorVolDescPtr *)bh->b_data; + /* Finally try block 512 in case media is open */ + if (udf_check_anchor_block(sb, sbi->s_session + 512, fileset)) + return last[0]; + return 0; +} - /* Locate the main sequence */ - main_s = le32_to_cpu(anchor->mainVolDescSeqExt.extLocation); - main_e = le32_to_cpu(anchor->mainVolDescSeqExt.extLength); - main_e = main_e >> sb->s_blocksize_bits; - main_e += main_s; +/* + * Find an anchor volume descriptor and load Volume Descriptor Sequence from + * area specified by it. The function expects sbi->s_lastblock to be the last + * block on the media. + * + * Return 1 if ok, 0 if not found. + * + */ +static int udf_find_anchor(struct super_block *sb, + struct kernel_lb_addr *fileset) +{ + sector_t lastblock; + struct udf_sb_info *sbi = UDF_SB(sb); - /* Locate the reserve sequence */ - reserve_s = le32_to_cpu( - anchor->reserveVolDescSeqExt.extLocation); - reserve_e = le32_to_cpu( - anchor->reserveVolDescSeqExt.extLength); - reserve_e = reserve_e >> sb->s_blocksize_bits; - reserve_e += reserve_s; + lastblock = udf_scan_anchors(sb, sbi->s_last_block, fileset); + if (lastblock) + goto out; - brelse(bh); + /* No anchor found? Try VARCONV conversion of block numbers */ + UDF_SET_FLAG(sb, UDF_FLAG_VARCONV); + /* Firstly, we try to not convert number of the last block */ + lastblock = udf_scan_anchors(sb, + udf_variable_to_fixed(sbi->s_last_block), + fileset); + if (lastblock) + goto out; - /* Process the main & reserve sequences */ - /* responsible for finding the PartitionDesc(s) */ - if (!(udf_process_sequence(sb, main_s, main_e, - fileset) && - udf_process_sequence(sb, reserve_s, reserve_e, - fileset))) - break; + /* Secondly, we try with converted number of the last block */ + lastblock = udf_scan_anchors(sb, sbi->s_last_block, fileset); + if (!lastblock) { + /* VARCONV didn't help. Clear it. */ + UDF_CLEAR_FLAG(sb, UDF_FLAG_VARCONV); + return 0; } +out: + sbi->s_last_block = lastblock; + return 1; +} - if (i == ARRAY_SIZE(sbi->s_anchor)) { - udf_debug("No Anchor block found\n"); - return 1; +/* + * Check Volume Structure Descriptor, find Anchor block and load Volume + * Descriptor Sequence + */ +static int udf_load_vrs(struct super_block *sb, struct udf_options *uopt, + int silent, struct kernel_lb_addr *fileset) +{ + struct udf_sb_info *sbi = UDF_SB(sb); + loff_t nsr_off; + + if (!sb_set_blocksize(sb, uopt->blocksize)) { + if (!silent) + printk(KERN_WARNING "UDF-fs: Bad block size\n"); + return 0; + } + sbi->s_last_block = uopt->lastblock; + if (!uopt->novrs) { + /* Check that it is NSR02 compliant */ + nsr_off = udf_check_vsd(sb); + if (!nsr_off) { + if (!silent) + printk(KERN_WARNING "UDF-fs: No VRS found\n"); + return 0; + } + if (nsr_off == -1) + udf_debug("Failed to read byte 32768. Assuming open " + "disc. Skipping validity check\n"); + if (!sbi->s_last_block) + sbi->s_last_block = udf_get_last_block(sb); + } else { + udf_debug("Validity check skipped because of novrs option\n"); } - udf_debug("Using anchor in block %d\n", sbi->s_anchor[i]); - return 0; + /* Look for anchor block and load Volume Descriptor Sequence */ + sbi->s_anchor = uopt->anchor; + if (!udf_find_anchor(sb, fileset)) { + if (!silent) + printk(KERN_WARNING "UDF-fs: No anchor found\n"); + return 0; + } + return 1; } static void udf_open_lvid(struct super_block *sb) @@ -1742,9 +1742,9 @@ static void udf_open_lvid(struct super_block *sb) struct buffer_head *bh = sbi->s_lvid_bh; struct logicalVolIntegrityDesc *lvid; struct logicalVolIntegrityDescImpUse *lvidiu; + if (!bh) return; - lvid = (struct logicalVolIntegrityDesc *)bh->b_data; lvidiu = udf_sb_lvidiu(sbi); @@ -1752,14 +1752,15 @@ static void udf_open_lvid(struct super_block *sb) lvidiu->impIdent.identSuffix[1] = UDF_OS_ID_LINUX; udf_time_to_disk_stamp(&lvid->recordingDateAndTime, CURRENT_TIME); - lvid->integrityType = LVID_INTEGRITY_TYPE_OPEN; + lvid->integrityType = cpu_to_le32(LVID_INTEGRITY_TYPE_OPEN); lvid->descTag.descCRC = cpu_to_le16( - crc_itu_t(0, (char *)lvid + sizeof(tag), + crc_itu_t(0, (char *)lvid + sizeof(struct tag), le16_to_cpu(lvid->descTag.descCRCLength))); lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag); mark_buffer_dirty(bh); + sbi->s_lvid_dirty = 0; } static void udf_close_lvid(struct super_block *sb) @@ -1773,10 +1774,6 @@ static void udf_close_lvid(struct super_block *sb) return; lvid = (struct logicalVolIntegrityDesc *)bh->b_data; - - if (lvid->integrityType != LVID_INTEGRITY_TYPE_OPEN) - return; - lvidiu = udf_sb_lvidiu(sbi); lvidiu->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX; lvidiu->impIdent.identSuffix[1] = UDF_OS_ID_LINUX; @@ -1790,11 +1787,12 @@ static void udf_close_lvid(struct super_block *sb) lvid->integrityType = cpu_to_le32(LVID_INTEGRITY_TYPE_CLOSE); lvid->descTag.descCRC = cpu_to_le16( - crc_itu_t(0, (char *)lvid + sizeof(tag), + crc_itu_t(0, (char *)lvid + sizeof(struct tag), le16_to_cpu(lvid->descTag.descCRCLength))); lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag); mark_buffer_dirty(bh); + sbi->s_lvid_dirty = 0; } static void udf_sb_free_bitmap(struct udf_bitmap *bitmap) @@ -1846,15 +1844,18 @@ static void udf_free_partition(struct udf_part_map *map) static int udf_fill_super(struct super_block *sb, void *options, int silent) { int i; + int ret; struct inode *inode = NULL; struct udf_options uopt; - kernel_lb_addr rootdir, fileset; + struct kernel_lb_addr rootdir, fileset; struct udf_sb_info *sbi; uopt.flags = (1 << UDF_FLAG_USE_AD_IN_ICB) | (1 << UDF_FLAG_STRICT); uopt.uid = -1; uopt.gid = -1; uopt.umask = 0; + uopt.fmode = UDF_INVALID_MODE; + uopt.dmode = UDF_INVALID_MODE; sbi = kzalloc(sizeof(struct udf_sb_info), GFP_KERNEL); if (!sbi) @@ -1892,15 +1893,10 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) sbi->s_uid = uopt.uid; sbi->s_gid = uopt.gid; sbi->s_umask = uopt.umask; + sbi->s_fmode = uopt.fmode; + sbi->s_dmode = uopt.dmode; sbi->s_nls_map = uopt.nls_map; - /* Set the block size for all transfers */ - if (!sb_min_blocksize(sb, uopt.blocksize)) { - udf_debug("Bad block size (%d)\n", uopt.blocksize); - printk(KERN_ERR "udf: bad block size (%d)\n", uopt.blocksize); - goto error_out; - } - if (uopt.session == 0xFFFFFFFF) sbi->s_session = udf_get_last_session(sb); else @@ -1908,18 +1904,6 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) udf_debug("Multi-session=%d\n", sbi->s_session); - sbi->s_last_block = uopt.lastblock; - sbi->s_anchor[0] = sbi->s_anchor[1] = 0; - sbi->s_anchor[2] = uopt.anchor; - - if (udf_check_valid(sb, uopt.novrs, silent)) { - /* read volume recognition sequences */ - printk(KERN_WARNING "UDF-fs: No VRS found\n"); - goto error_out; - } - - udf_find_anchor(sb); - /* Fill in the rest of the superblock */ sb->s_op = &udf_sb_ops; sb->s_export_op = &udf_export_ops; @@ -1928,7 +1912,21 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) sb->s_magic = UDF_SUPER_MAGIC; sb->s_time_gran = 1000; - if (udf_load_sequence(sb, &fileset)) { + if (uopt.flags & (1 << UDF_FLAG_BLOCKSIZE_SET)) { + ret = udf_load_vrs(sb, &uopt, silent, &fileset); + } else { + uopt.blocksize = bdev_hardsect_size(sb->s_bdev); + ret = udf_load_vrs(sb, &uopt, silent, &fileset); + if (!ret && uopt.blocksize != UDF_DEFAULT_BLOCKSIZE) { + if (!silent) + printk(KERN_NOTICE + "UDF-fs: Rescanning with blocksize " + "%d\n", UDF_DEFAULT_BLOCKSIZE); + uopt.blocksize = UDF_DEFAULT_BLOCKSIZE; + ret = udf_load_vrs(sb, &uopt, silent, &fileset); + } + } + if (!ret) { printk(KERN_WARNING "UDF-fs: No partition found (1)\n"); goto error_out; } @@ -1978,7 +1976,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) } if (!silent) { - timestamp ts; + struct timestamp ts; udf_time_to_disk_stamp(&ts, sbi->s_record_time); udf_info("UDF: Mounting volume '%s', " "timestamp %04u/%02u/%02u %02u:%02u (%x)\n", @@ -1991,7 +1989,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) /* Assign the root inode */ /* assign inodes by physical block number */ /* perhaps it's not extensible enough, but for now ... */ - inode = udf_iget(sb, rootdir); + inode = udf_iget(sb, &rootdir); if (!inode) { printk(KERN_ERR "UDF-fs: Error in udf_iget, block=%d, " "partition=%d\n", @@ -2081,11 +2079,31 @@ static void udf_put_super(struct super_block *sb) sb->s_fs_info = NULL; } +static int udf_sync_fs(struct super_block *sb, int wait) +{ + struct udf_sb_info *sbi = UDF_SB(sb); + + mutex_lock(&sbi->s_alloc_mutex); + if (sbi->s_lvid_dirty) { + /* + * Blockdevice will be synced later so we don't have to submit + * the buffer for IO + */ + mark_buffer_dirty(sbi->s_lvid_bh); + sb->s_dirt = 0; + sbi->s_lvid_dirty = 0; + } + mutex_unlock(&sbi->s_alloc_mutex); + + return 0; +} + static int udf_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; struct udf_sb_info *sbi = UDF_SB(sb); struct logicalVolIntegrityDescImpUse *lvidiu; + u64 id = huge_encode_dev(sb->s_bdev->bd_dev); if (sbi->s_lvid_bh != NULL) lvidiu = udf_sb_lvidiu(sbi); @@ -2101,8 +2119,9 @@ static int udf_statfs(struct dentry *dentry, struct kstatfs *buf) le32_to_cpu(lvidiu->numDirs)) : 0) + buf->f_bfree; buf->f_ffree = buf->f_bfree; - /* __kernel_fsid_t f_fsid */ buf->f_namelen = UDF_NAME_LEN - 2; + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); return 0; } @@ -2114,7 +2133,7 @@ static unsigned int udf_count_free_bitmap(struct super_block *sb, unsigned int accum = 0; int index; int block = 0, newblock; - kernel_lb_addr loc; + struct kernel_lb_addr loc; uint32_t bytes; uint8_t *ptr; uint16_t ident; @@ -2124,7 +2143,7 @@ static unsigned int udf_count_free_bitmap(struct super_block *sb, loc.logicalBlockNum = bitmap->s_extPosition; loc.partitionReferenceNum = UDF_SB(sb)->s_partition; - bh = udf_read_ptagged(sb, loc, 0, &ident); + bh = udf_read_ptagged(sb, &loc, 0, &ident); if (!bh) { printk(KERN_ERR "udf: udf_count_free failed\n"); @@ -2147,7 +2166,7 @@ static unsigned int udf_count_free_bitmap(struct super_block *sb, bytes -= cur_bytes; if (bytes) { brelse(bh); - newblock = udf_get_lb_pblock(sb, loc, ++block); + newblock = udf_get_lb_pblock(sb, &loc, ++block); bh = udf_tread(sb, newblock); if (!bh) { udf_debug("read failed\n"); @@ -2170,7 +2189,7 @@ static unsigned int udf_count_free_table(struct super_block *sb, { unsigned int accum = 0; uint32_t elen; - kernel_lb_addr eloc; + struct kernel_lb_addr eloc; int8_t etype; struct extent_position epos; diff --git a/fs/udf/truncate.c b/fs/udf/truncate.c index 65e19b4f9424..225527cdc885 100644 --- a/fs/udf/truncate.c +++ b/fs/udf/truncate.c @@ -28,10 +28,10 @@ #include "udf_sb.h" static void extent_trunc(struct inode *inode, struct extent_position *epos, - kernel_lb_addr eloc, int8_t etype, uint32_t elen, + struct kernel_lb_addr *eloc, int8_t etype, uint32_t elen, uint32_t nelen) { - kernel_lb_addr neloc = {}; + struct kernel_lb_addr neloc = {}; int last_block = (elen + inode->i_sb->s_blocksize - 1) >> inode->i_sb->s_blocksize_bits; int first_block = (nelen + inode->i_sb->s_blocksize - 1) >> @@ -43,12 +43,12 @@ static void extent_trunc(struct inode *inode, struct extent_position *epos, last_block); etype = (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30); } else - neloc = eloc; + neloc = *eloc; nelen = (etype << 30) | nelen; } if (elen != nelen) { - udf_write_aext(inode, epos, neloc, nelen, 0); + udf_write_aext(inode, epos, &neloc, nelen, 0); if (last_block - first_block > 0) { if (etype == (EXT_RECORDED_ALLOCATED >> 30)) mark_inode_dirty(inode); @@ -68,7 +68,7 @@ static void extent_trunc(struct inode *inode, struct extent_position *epos, void udf_truncate_tail_extent(struct inode *inode) { struct extent_position epos = {}; - kernel_lb_addr eloc; + struct kernel_lb_addr eloc; uint32_t elen, nelen; uint64_t lbcount = 0; int8_t etype = -1, netype; @@ -83,9 +83,9 @@ void udf_truncate_tail_extent(struct inode *inode) return; if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) - adsize = sizeof(short_ad); + adsize = sizeof(struct short_ad); else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) - adsize = sizeof(long_ad); + adsize = sizeof(struct long_ad); else BUG(); @@ -106,7 +106,7 @@ void udf_truncate_tail_extent(struct inode *inode) (unsigned)elen); nelen = elen - (lbcount - inode->i_size); epos.offset -= adsize; - extent_trunc(inode, &epos, eloc, etype, elen, nelen); + extent_trunc(inode, &epos, &eloc, etype, elen, nelen); epos.offset += adsize; if (udf_next_aext(inode, &epos, &eloc, &elen, 1) != -1) printk(KERN_ERR "udf_truncate_tail_extent(): " @@ -124,7 +124,7 @@ void udf_truncate_tail_extent(struct inode *inode) void udf_discard_prealloc(struct inode *inode) { struct extent_position epos = { NULL, 0, {0, 0} }; - kernel_lb_addr eloc; + struct kernel_lb_addr eloc; uint32_t elen; uint64_t lbcount = 0; int8_t etype = -1, netype; @@ -136,9 +136,9 @@ void udf_discard_prealloc(struct inode *inode) return; if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) - adsize = sizeof(short_ad); + adsize = sizeof(struct short_ad); else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) - adsize = sizeof(long_ad); + adsize = sizeof(struct long_ad); else adsize = 0; @@ -152,7 +152,7 @@ void udf_discard_prealloc(struct inode *inode) if (etype == (EXT_NOT_RECORDED_ALLOCATED >> 30)) { epos.offset -= adsize; lbcount -= elen; - extent_trunc(inode, &epos, eloc, etype, elen, 0); + extent_trunc(inode, &epos, &eloc, etype, elen, 0); if (!epos.bh) { iinfo->i_lenAlloc = epos.offset - @@ -200,7 +200,7 @@ static void udf_update_alloc_ext_desc(struct inode *inode, void udf_truncate_extents(struct inode *inode) { struct extent_position epos; - kernel_lb_addr eloc, neloc = {}; + struct kernel_lb_addr eloc, neloc = {}; uint32_t elen, nelen = 0, indirect_ext_len = 0, lenalloc; int8_t etype; struct super_block *sb = inode->i_sb; @@ -210,9 +210,9 @@ void udf_truncate_extents(struct inode *inode) struct udf_inode_info *iinfo = UDF_I(inode); if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) - adsize = sizeof(short_ad); + adsize = sizeof(struct short_ad); else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) - adsize = sizeof(long_ad); + adsize = sizeof(struct long_ad); else BUG(); @@ -221,7 +221,7 @@ void udf_truncate_extents(struct inode *inode) (inode->i_size & (sb->s_blocksize - 1)); if (etype != -1) { epos.offset -= adsize; - extent_trunc(inode, &epos, eloc, etype, elen, byte_offset); + extent_trunc(inode, &epos, &eloc, etype, elen, byte_offset); epos.offset += adsize; if (byte_offset) lenalloc = epos.offset; @@ -236,12 +236,12 @@ void udf_truncate_extents(struct inode *inode) while ((etype = udf_current_aext(inode, &epos, &eloc, &elen, 0)) != -1) { if (etype == (EXT_NEXT_EXTENT_ALLOCDECS >> 30)) { - udf_write_aext(inode, &epos, neloc, nelen, 0); + udf_write_aext(inode, &epos, &neloc, nelen, 0); if (indirect_ext_len) { /* We managed to free all extents in the * indirect extent - free it too */ BUG_ON(!epos.bh); - udf_free_blocks(sb, inode, epos.block, + udf_free_blocks(sb, inode, &epos.block, 0, indirect_ext_len); } else if (!epos.bh) { iinfo->i_lenAlloc = lenalloc; @@ -253,7 +253,7 @@ void udf_truncate_extents(struct inode *inode) epos.offset = sizeof(struct allocExtDesc); epos.block = eloc; epos.bh = udf_tread(sb, - udf_get_lb_pblock(sb, eloc, 0)); + udf_get_lb_pblock(sb, &eloc, 0)); if (elen) indirect_ext_len = (elen + sb->s_blocksize - 1) >> @@ -261,7 +261,7 @@ void udf_truncate_extents(struct inode *inode) else indirect_ext_len = 1; } else { - extent_trunc(inode, &epos, eloc, etype, + extent_trunc(inode, &epos, &eloc, etype, elen, 0); epos.offset += adsize; } @@ -269,7 +269,7 @@ void udf_truncate_extents(struct inode *inode) if (indirect_ext_len) { BUG_ON(!epos.bh); - udf_free_blocks(sb, inode, epos.block, 0, + udf_free_blocks(sb, inode, &epos.block, 0, indirect_ext_len); } else if (!epos.bh) { iinfo->i_lenAlloc = lenalloc; @@ -278,7 +278,7 @@ void udf_truncate_extents(struct inode *inode) udf_update_alloc_ext_desc(inode, &epos, lenalloc); } else if (inode->i_size) { if (byte_offset) { - kernel_long_ad extent; + struct kernel_long_ad extent; /* * OK, there is not extent covering inode->i_size and diff --git a/fs/udf/udf_i.h b/fs/udf/udf_i.h index 4f86b1d98a5d..e58d1de41073 100644 --- a/fs/udf/udf_i.h +++ b/fs/udf/udf_i.h @@ -4,7 +4,7 @@ struct udf_inode_info { struct timespec i_crtime; /* Physical address of inode */ - kernel_lb_addr i_location; + struct kernel_lb_addr i_location; __u64 i_unique; __u32 i_lenEAttr; __u32 i_lenAlloc; @@ -17,8 +17,8 @@ struct udf_inode_info { unsigned i_strat4096 : 1; unsigned reserved : 26; union { - short_ad *i_sad; - long_ad *i_lad; + struct short_ad *i_sad; + struct long_ad *i_lad; __u8 *i_data; } i_ext; struct inode vfs_inode; diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h index 1c1c514a9725..d113b72c2768 100644 --- a/fs/udf/udf_sb.h +++ b/fs/udf/udf_sb.h @@ -30,6 +30,7 @@ #define UDF_FLAG_GID_SET 16 #define UDF_FLAG_SESSION_SET 17 #define UDF_FLAG_LASTBLOCK_SET 18 +#define UDF_FLAG_BLOCKSIZE_SET 19 #define UDF_PART_FLAG_UNALLOC_BITMAP 0x0001 #define UDF_PART_FLAG_UNALLOC_TABLE 0x0002 @@ -48,6 +49,8 @@ #define UDF_SPARABLE_MAP15 0x1522U #define UDF_METADATA_MAP25 0x2511U +#define UDF_INVALID_MODE ((mode_t)-1) + #pragma pack(1) /* XXX(hch): Why? This file just defines in-core structures */ struct udf_meta_data { @@ -114,7 +117,7 @@ struct udf_sb_info { /* Sector headers */ __s32 s_session; - __u32 s_anchor[3]; + __u32 s_anchor; __u32 s_last_block; struct buffer_head *s_lvid_bh; @@ -123,6 +126,8 @@ struct udf_sb_info { mode_t s_umask; gid_t s_gid; uid_t s_uid; + mode_t s_fmode; + mode_t s_dmode; /* Root Info */ struct timespec s_record_time; @@ -143,6 +148,8 @@ struct udf_sb_info { struct inode *s_vat_inode; struct mutex s_alloc_mutex; + /* Protected by s_alloc_mutex */ + unsigned int s_lvid_dirty; }; static inline struct udf_sb_info *UDF_SB(struct super_block *sb) diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h index 8ec865de5f13..cac51b77a5d1 100644 --- a/fs/udf/udfdecl.h +++ b/fs/udf/udfdecl.h @@ -62,10 +62,8 @@ static inline size_t udf_ext0_offset(struct inode *inode) return 0; } -#define udf_get_lb_pblock(sb,loc,offset) udf_get_pblock((sb), (loc).logicalBlockNum, (loc).partitionReferenceNum, (offset)) - /* computes tag checksum */ -u8 udf_tag_checksum(const tag *t); +u8 udf_tag_checksum(const struct tag *t); struct dentry; struct inode; @@ -95,7 +93,7 @@ struct udf_vds_record { }; struct generic_desc { - tag descTag; + struct tag descTag; __le32 volDescSeqNum; }; @@ -108,11 +106,22 @@ struct ustr { struct extent_position { struct buffer_head *bh; uint32_t offset; - kernel_lb_addr block; + struct kernel_lb_addr block; }; /* super.c */ extern void udf_warning(struct super_block *, const char *, const char *, ...); +static inline void udf_updated_lvid(struct super_block *sb) +{ + struct buffer_head *bh = UDF_SB(sb)->s_lvid_bh; + + BUG_ON(!bh); + WARN_ON_ONCE(((struct logicalVolIntegrityDesc *) + bh->b_data)->integrityType != + cpu_to_le32(LVID_INTEGRITY_TYPE_OPEN)); + sb->s_dirt = 1; + UDF_SB(sb)->s_lvid_dirty = 1; +} /* namei.c */ extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *, @@ -124,7 +133,7 @@ extern int udf_ioctl(struct inode *, struct file *, unsigned int, unsigned long); /* inode.c */ -extern struct inode *udf_iget(struct super_block *, kernel_lb_addr); +extern struct inode *udf_iget(struct super_block *, struct kernel_lb_addr *); extern int udf_sync_inode(struct inode *); extern void udf_expand_file_adinicb(struct inode *, int, int *); extern struct buffer_head *udf_expand_dir_adinicb(struct inode *, int *, int *); @@ -136,19 +145,19 @@ extern void udf_clear_inode(struct inode *); extern int udf_write_inode(struct inode *, int); extern long udf_block_map(struct inode *, sector_t); extern int udf_extend_file(struct inode *, struct extent_position *, - kernel_long_ad *, sector_t); + struct kernel_long_ad *, sector_t); extern int8_t inode_bmap(struct inode *, sector_t, struct extent_position *, - kernel_lb_addr *, uint32_t *, sector_t *); + struct kernel_lb_addr *, uint32_t *, sector_t *); extern int8_t udf_add_aext(struct inode *, struct extent_position *, - kernel_lb_addr, uint32_t, int); + struct kernel_lb_addr *, uint32_t, int); extern int8_t udf_write_aext(struct inode *, struct extent_position *, - kernel_lb_addr, uint32_t, int); + struct kernel_lb_addr *, uint32_t, int); extern int8_t udf_delete_aext(struct inode *, struct extent_position, - kernel_lb_addr, uint32_t); + struct kernel_lb_addr, uint32_t); extern int8_t udf_next_aext(struct inode *, struct extent_position *, - kernel_lb_addr *, uint32_t *, int); + struct kernel_lb_addr *, uint32_t *, int); extern int8_t udf_current_aext(struct inode *, struct extent_position *, - kernel_lb_addr *, uint32_t *, int); + struct kernel_lb_addr *, uint32_t *, int); /* misc.c */ extern struct buffer_head *udf_tgetblk(struct super_block *, int); @@ -160,7 +169,7 @@ extern struct genericFormat *udf_get_extendedattr(struct inode *, uint32_t, extern struct buffer_head *udf_read_tagged(struct super_block *, uint32_t, uint32_t, uint16_t *); extern struct buffer_head *udf_read_ptagged(struct super_block *, - kernel_lb_addr, uint32_t, + struct kernel_lb_addr *, uint32_t, uint16_t *); extern void udf_update_tag(char *, int); extern void udf_new_tag(char *, uint16_t, uint16_t, uint16_t, uint32_t, int); @@ -182,6 +191,14 @@ extern uint32_t udf_get_pblock_meta25(struct super_block *, uint32_t, uint16_t, uint32_t); extern int udf_relocate_blocks(struct super_block *, long, long *); +static inline uint32_t +udf_get_lb_pblock(struct super_block *sb, struct kernel_lb_addr *loc, + uint32_t offset) +{ + return udf_get_pblock(sb, loc->logicalBlockNum, + loc->partitionReferenceNum, offset); +} + /* unicode.c */ extern int udf_get_filename(struct super_block *, uint8_t *, uint8_t *, int); extern int udf_put_filename(struct super_block *, const uint8_t *, uint8_t *, @@ -200,7 +217,7 @@ extern void udf_truncate_extents(struct inode *); /* balloc.c */ extern void udf_free_blocks(struct super_block *, struct inode *, - kernel_lb_addr, uint32_t, uint32_t); + struct kernel_lb_addr *, uint32_t, uint32_t); extern int udf_prealloc_blocks(struct super_block *, struct inode *, uint16_t, uint32_t, uint32_t); extern int udf_new_block(struct super_block *, struct inode *, uint16_t, @@ -214,16 +231,16 @@ extern struct fileIdentDesc *udf_fileident_read(struct inode *, loff_t *, struct udf_fileident_bh *, struct fileIdentDesc *, struct extent_position *, - kernel_lb_addr *, uint32_t *, + struct kernel_lb_addr *, uint32_t *, sector_t *); extern struct fileIdentDesc *udf_get_fileident(void *buffer, int bufsize, int *offset); -extern long_ad *udf_get_filelongad(uint8_t *, int, uint32_t *, int); -extern short_ad *udf_get_fileshortad(uint8_t *, int, uint32_t *, int); +extern struct long_ad *udf_get_filelongad(uint8_t *, int, uint32_t *, int); +extern struct short_ad *udf_get_fileshortad(uint8_t *, int, uint32_t *, int); /* udftime.c */ extern struct timespec *udf_disk_stamp_to_time(struct timespec *dest, - timestamp src); -extern timestamp *udf_time_to_disk_stamp(timestamp *dest, struct timespec src); + struct timestamp src); +extern struct timestamp *udf_time_to_disk_stamp(struct timestamp *dest, struct timespec src); #endif /* __UDF_DECL_H */ diff --git a/fs/udf/udfend.h b/fs/udf/udfend.h index 489f52fb428c..6a9f3a9cc428 100644 --- a/fs/udf/udfend.h +++ b/fs/udf/udfend.h @@ -4,9 +4,9 @@ #include <asm/byteorder.h> #include <linux/string.h> -static inline kernel_lb_addr lelb_to_cpu(lb_addr in) +static inline struct kernel_lb_addr lelb_to_cpu(struct lb_addr in) { - kernel_lb_addr out; + struct kernel_lb_addr out; out.logicalBlockNum = le32_to_cpu(in.logicalBlockNum); out.partitionReferenceNum = le16_to_cpu(in.partitionReferenceNum); @@ -14,9 +14,9 @@ static inline kernel_lb_addr lelb_to_cpu(lb_addr in) return out; } -static inline lb_addr cpu_to_lelb(kernel_lb_addr in) +static inline struct lb_addr cpu_to_lelb(struct kernel_lb_addr in) { - lb_addr out; + struct lb_addr out; out.logicalBlockNum = cpu_to_le32(in.logicalBlockNum); out.partitionReferenceNum = cpu_to_le16(in.partitionReferenceNum); @@ -24,9 +24,9 @@ static inline lb_addr cpu_to_lelb(kernel_lb_addr in) return out; } -static inline short_ad lesa_to_cpu(short_ad in) +static inline struct short_ad lesa_to_cpu(struct short_ad in) { - short_ad out; + struct short_ad out; out.extLength = le32_to_cpu(in.extLength); out.extPosition = le32_to_cpu(in.extPosition); @@ -34,9 +34,9 @@ static inline short_ad lesa_to_cpu(short_ad in) return out; } -static inline short_ad cpu_to_lesa(short_ad in) +static inline struct short_ad cpu_to_lesa(struct short_ad in) { - short_ad out; + struct short_ad out; out.extLength = cpu_to_le32(in.extLength); out.extPosition = cpu_to_le32(in.extPosition); @@ -44,9 +44,9 @@ static inline short_ad cpu_to_lesa(short_ad in) return out; } -static inline kernel_long_ad lela_to_cpu(long_ad in) +static inline struct kernel_long_ad lela_to_cpu(struct long_ad in) { - kernel_long_ad out; + struct kernel_long_ad out; out.extLength = le32_to_cpu(in.extLength); out.extLocation = lelb_to_cpu(in.extLocation); @@ -54,9 +54,9 @@ static inline kernel_long_ad lela_to_cpu(long_ad in) return out; } -static inline long_ad cpu_to_lela(kernel_long_ad in) +static inline struct long_ad cpu_to_lela(struct kernel_long_ad in) { - long_ad out; + struct long_ad out; out.extLength = cpu_to_le32(in.extLength); out.extLocation = cpu_to_lelb(in.extLocation); @@ -64,9 +64,9 @@ static inline long_ad cpu_to_lela(kernel_long_ad in) return out; } -static inline kernel_extent_ad leea_to_cpu(extent_ad in) +static inline struct kernel_extent_ad leea_to_cpu(struct extent_ad in) { - kernel_extent_ad out; + struct kernel_extent_ad out; out.extLength = le32_to_cpu(in.extLength); out.extLocation = le32_to_cpu(in.extLocation); diff --git a/fs/udf/udftime.c b/fs/udf/udftime.c index 5f811655c9b5..b8c828c4d200 100644 --- a/fs/udf/udftime.c +++ b/fs/udf/udftime.c @@ -85,7 +85,8 @@ extern struct timezone sys_tz; #define SECS_PER_HOUR (60 * 60) #define SECS_PER_DAY (SECS_PER_HOUR * 24) -struct timespec *udf_disk_stamp_to_time(struct timespec *dest, timestamp src) +struct timespec * +udf_disk_stamp_to_time(struct timespec *dest, struct timestamp src) { int yday; u16 typeAndTimezone = le16_to_cpu(src.typeAndTimezone); @@ -116,7 +117,8 @@ struct timespec *udf_disk_stamp_to_time(struct timespec *dest, timestamp src) return dest; } -timestamp *udf_time_to_disk_stamp(timestamp *dest, struct timespec ts) +struct timestamp * +udf_time_to_disk_stamp(struct timestamp *dest, struct timespec ts) { long int days, rem, y; const unsigned short int *ip; diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c index 9fdf8c93c58e..cefa8c8913e6 100644 --- a/fs/udf/unicode.c +++ b/fs/udf/unicode.c @@ -254,7 +254,7 @@ static int udf_CS0toNLS(struct nls_table *nls, struct ustr *utf_o, { const uint8_t *ocu; uint8_t cmp_id, ocu_len; - int i; + int i, len; ocu_len = ocu_i->u_len; @@ -279,8 +279,13 @@ static int udf_CS0toNLS(struct nls_table *nls, struct ustr *utf_o, if (cmp_id == 16) c = (c << 8) | ocu[i++]; - utf_o->u_len += nls->uni2char(c, &utf_o->u_name[utf_o->u_len], - UDF_NAME_LEN - utf_o->u_len); + len = nls->uni2char(c, &utf_o->u_name[utf_o->u_len], + UDF_NAME_LEN - utf_o->u_len); + /* Valid character? */ + if (len >= 0) + utf_o->u_len += len; + else + utf_o->u_name[utf_o->u_len++] = '?'; } utf_o->u_cmpID = 8; @@ -290,7 +295,8 @@ static int udf_CS0toNLS(struct nls_table *nls, struct ustr *utf_o, static int udf_NLStoCS0(struct nls_table *nls, dstring *ocu, struct ustr *uni, int length) { - unsigned len, i, max_val; + int len; + unsigned i, max_val; uint16_t uni_char; int u_len; @@ -302,8 +308,13 @@ try_again: u_len = 0U; for (i = 0U; i < uni->u_len; i++) { len = nls->char2uni(&uni->u_name[i], uni->u_len - i, &uni_char); - if (len <= 0) + if (!len) continue; + /* Invalid character, deal with it */ + if (len < 0) { + len = 1; + uni_char = '?'; + } if (uni_char > max_val) { max_val = 0xffffU; @@ -324,34 +335,43 @@ try_again: int udf_get_filename(struct super_block *sb, uint8_t *sname, uint8_t *dname, int flen) { - struct ustr filename, unifilename; - int len; + struct ustr *filename, *unifilename; + int len = 0; - if (udf_build_ustr_exact(&unifilename, sname, flen)) + filename = kmalloc(sizeof(struct ustr), GFP_NOFS); + if (!filename) return 0; + unifilename = kmalloc(sizeof(struct ustr), GFP_NOFS); + if (!unifilename) + goto out1; + + if (udf_build_ustr_exact(unifilename, sname, flen)) + goto out2; + if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) { - if (!udf_CS0toUTF8(&filename, &unifilename)) { + if (!udf_CS0toUTF8(filename, unifilename)) { udf_debug("Failed in udf_get_filename: sname = %s\n", sname); - return 0; + goto out2; } } else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) { - if (!udf_CS0toNLS(UDF_SB(sb)->s_nls_map, &filename, - &unifilename)) { + if (!udf_CS0toNLS(UDF_SB(sb)->s_nls_map, filename, + unifilename)) { udf_debug("Failed in udf_get_filename: sname = %s\n", sname); - return 0; + goto out2; } } else - return 0; - - len = udf_translate_to_linux(dname, filename.u_name, filename.u_len, - unifilename.u_name, unifilename.u_len); - if (len) - return len; - - return 0; + goto out2; + + len = udf_translate_to_linux(dname, filename->u_name, filename->u_len, + unifilename->u_name, unifilename->u_len); +out2: + kfree(unifilename); +out1: + kfree(filename); + return len; } int udf_put_filename(struct super_block *sb, const uint8_t *sname, diff --git a/fs/ufs/super.c b/fs/ufs/super.c index e1c1fc5ee239..60359291761f 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -1268,6 +1268,7 @@ static int ufs_statfs(struct dentry *dentry, struct kstatfs *buf) struct ufs_super_block_first *usb1; struct ufs_super_block_second *usb2; struct ufs_super_block_third *usb3; + u64 id = huge_encode_dev(sb->s_bdev->bd_dev); lock_kernel(); @@ -1290,6 +1291,8 @@ static int ufs_statfs(struct dentry *dentry, struct kstatfs *buf) ? (buf->f_bfree - (((long)buf->f_blocks / 100) * uspi->s_minfree)) : 0; buf->f_files = uspi->s_ncg * uspi->s_ipg; buf->f_namelen = UFS_MAXNAMLEN; + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); unlock_kernel(); diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index 631d0137551e..6075382336d7 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -230,7 +230,7 @@ xfs_vn_mknod( } if (IS_POSIXACL(dir) && !default_acl) - mode &= ~current->fs->umask; + mode &= ~current_umask(); xfs_dentry_to_name(&name, dentry); error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip, NULL); @@ -404,7 +404,7 @@ xfs_vn_symlink( mode_t mode; mode = S_IFLNK | - (irix_symlink_mode ? 0777 & ~current->fs->umask : S_IRWXUGO); + (irix_symlink_mode ? 0777 & ~current_umask() : S_IRWXUGO); xfs_dentry_to_name(&name, dentry); error = xfs_symlink(XFS_I(dir), &name, symname, mode, &cip, NULL); |