summaryrefslogtreecommitdiff
path: root/Documentation/filesystems
diff options
context:
space:
mode:
Diffstat (limited to 'Documentation/filesystems')
-rw-r--r--Documentation/filesystems/Locking32
-rw-r--r--Documentation/filesystems/autofs4-mount-control.txt71
-rw-r--r--Documentation/filesystems/autofs4.txt8
-rw-r--r--Documentation/filesystems/ceph.txt4
-rw-r--r--Documentation/filesystems/dax.txt15
-rw-r--r--Documentation/filesystems/directory-locking2
-rw-r--r--Documentation/filesystems/f2fs.txt1
-rw-r--r--Documentation/filesystems/overlayfs.txt8
-rw-r--r--Documentation/filesystems/porting8
-rw-r--r--Documentation/filesystems/proc.txt14
-rw-r--r--Documentation/filesystems/vfs.txt56
-rw-r--r--Documentation/filesystems/xfs.txt123
12 files changed, 259 insertions, 83 deletions
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index d30fb2cb5066..14cdc101d165 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -51,8 +51,6 @@ prototypes:
int (*rmdir) (struct inode *,struct dentry *);
int (*mknod) (struct inode *,struct dentry *,umode_t,dev_t);
int (*rename) (struct inode *, struct dentry *,
- struct inode *, struct dentry *);
- int (*rename2) (struct inode *, struct dentry *,
struct inode *, struct dentry *, unsigned int);
int (*readlink) (struct dentry *, char __user *,int);
const char *(*get_link) (struct dentry *, struct inode *, void **);
@@ -61,10 +59,7 @@ prototypes:
int (*get_acl)(struct inode *, int);
int (*setattr) (struct dentry *, struct iattr *);
int (*getattr) (struct vfsmount *, struct dentry *, struct kstat *);
- int (*setxattr) (struct dentry *, const char *,const void *,size_t,int);
- ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t);
ssize_t (*listxattr) (struct dentry *, char *, size_t);
- int (*removexattr) (struct dentry *, const char *);
int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start, u64 len);
void (*update_time)(struct inode *, struct timespec *, int);
int (*atomic_open)(struct inode *, struct dentry *,
@@ -83,31 +78,44 @@ symlink: yes
mkdir: yes
unlink: yes (both)
rmdir: yes (both) (see below)
-rename: yes (all) (see below)
-rename2: yes (all) (see below)
+rename: yes (all) (see below)
readlink: no
get_link: no
setattr: yes
permission: no (may not block if called in rcu-walk mode)
get_acl: no
getattr: no
-setxattr: yes
-getxattr: no
listxattr: no
-removexattr: yes
fiemap: no
update_time: no
atomic_open: yes
tmpfile: no
+
Additionally, ->rmdir(), ->unlink() and ->rename() have ->i_mutex on
victim.
- cross-directory ->rename() and rename2() has (per-superblock)
-->s_vfs_rename_sem.
+ cross-directory ->rename() has (per-superblock) ->s_vfs_rename_sem.
See Documentation/filesystems/directory-locking for more detailed discussion
of the locking scheme for directory operations.
+----------------------- xattr_handler operations -----------------------
+prototypes:
+ bool (*list)(struct dentry *dentry);
+ int (*get)(const struct xattr_handler *handler, struct dentry *dentry,
+ struct inode *inode, const char *name, void *buffer,
+ size_t size);
+ int (*set)(const struct xattr_handler *handler, struct dentry *dentry,
+ struct inode *inode, const char *name, const void *buffer,
+ size_t size, int flags);
+
+locking rules:
+ all may block
+ i_mutex(inode)
+list: no
+get: no
+set: yes
+
--------------------------- super_operations ---------------------------
prototypes:
struct inode *(*alloc_inode)(struct super_block *sb);
diff --git a/Documentation/filesystems/autofs4-mount-control.txt b/Documentation/filesystems/autofs4-mount-control.txt
index aff22113a986..50a3e01a36f8 100644
--- a/Documentation/filesystems/autofs4-mount-control.txt
+++ b/Documentation/filesystems/autofs4-mount-control.txt
@@ -179,8 +179,19 @@ struct autofs_dev_ioctl {
* including this struct */
__s32 ioctlfd; /* automount command fd */
- __u32 arg1; /* Command parameters */
- __u32 arg2;
+ union {
+ struct args_protover protover;
+ struct args_protosubver protosubver;
+ struct args_openmount openmount;
+ struct args_ready ready;
+ struct args_fail fail;
+ struct args_setpipefd setpipefd;
+ struct args_timeout timeout;
+ struct args_requester requester;
+ struct args_expire expire;
+ struct args_askumount askumount;
+ struct args_ismountpoint ismountpoint;
+ };
char path[0];
};
@@ -192,8 +203,8 @@ optionally be used to check a specific mount corresponding to a given
mount point file descriptor, and when requesting the uid and gid of the
last successful mount on a directory within the autofs file system.
-The fields arg1 and arg2 are used to communicate parameters and results of
-calls made as described below.
+The union is used to communicate parameters and results of calls made
+as described below.
The path field is used to pass a path where it is needed and the size field
is used account for the increased structure length when translating the
@@ -245,9 +256,9 @@ AUTOFS_DEV_IOCTL_PROTOVER_CMD and AUTOFS_DEV_IOCTL_PROTOSUBVER_CMD
Get the major and minor version of the autofs4 protocol version understood
by loaded module. This call requires an initialized struct autofs_dev_ioctl
with the ioctlfd field set to a valid autofs mount point descriptor
-and sets the requested version number in structure field arg1. These
-commands return 0 on success or one of the negative error codes if
-validation fails.
+and sets the requested version number in version field of struct args_protover
+or sub_version field of struct args_protosubver. These commands return
+0 on success or one of the negative error codes if validation fails.
AUTOFS_DEV_IOCTL_OPENMOUNT and AUTOFS_DEV_IOCTL_CLOSEMOUNT
@@ -256,9 +267,9 @@ AUTOFS_DEV_IOCTL_OPENMOUNT and AUTOFS_DEV_IOCTL_CLOSEMOUNT
Obtain and release a file descriptor for an autofs managed mount point
path. The open call requires an initialized struct autofs_dev_ioctl with
the path field set and the size field adjusted appropriately as well
-as the arg1 field set to the device number of the autofs mount. The
-device number can be obtained from the mount options shown in
-/proc/mounts. The close call requires an initialized struct
+as the devid field of struct args_openmount set to the device number of
+the autofs mount. The device number can be obtained from the mount options
+shown in /proc/mounts. The close call requires an initialized struct
autofs_dev_ioct with the ioctlfd field set to the descriptor obtained
from the open call. The release of the file descriptor can also be done
with close(2) so any open descriptors will also be closed at process exit.
@@ -272,10 +283,10 @@ AUTOFS_DEV_IOCTL_READY_CMD and AUTOFS_DEV_IOCTL_FAIL_CMD
Return mount and expire result status from user space to the kernel.
Both of these calls require an initialized struct autofs_dev_ioctl
with the ioctlfd field set to the descriptor obtained from the open
-call and the arg1 field set to the wait queue token number, received
-by user space in the foregoing mount or expire request. The arg2 field
-is set to the status to be returned. For the ready call this is always
-0 and for the fail call it is set to the errno of the operation.
+call and the token field of struct args_ready or struct args_fail set
+to the wait queue token number, received by user space in the foregoing
+mount or expire request. The status field of struct args_fail is set to
+the errno of the operation. It is set to 0 on success.
AUTOFS_DEV_IOCTL_SETPIPEFD_CMD
@@ -290,9 +301,10 @@ mount be catatonic (see next call).
The call requires an initialized struct autofs_dev_ioctl with the
ioctlfd field set to the descriptor obtained from the open call and
-the arg1 field set to descriptor of the pipe. On success the call
-also sets the process group id used to identify the controlling process
-(eg. the owning automount(8) daemon) to the process group of the caller.
+the pipefd field of struct args_setpipefd set to descriptor of the pipe.
+On success the call also sets the process group id used to identify the
+controlling process (eg. the owning automount(8) daemon) to the process
+group of the caller.
AUTOFS_DEV_IOCTL_CATATONIC_CMD
@@ -323,9 +335,8 @@ mount on the given path dentry.
The call requires an initialized struct autofs_dev_ioctl with the path
field set to the mount point in question and the size field adjusted
-appropriately as well as the arg1 field set to the device number of the
-containing autofs mount. Upon return the struct field arg1 contains the
-uid and arg2 the gid.
+appropriately. Upon return the uid field of struct args_requester contains
+the uid and gid field the gid.
When reconstructing an autofs mount tree with active mounts we need to
re-connect to mounts that may have used the original process uid and
@@ -343,8 +354,9 @@ this ioctl is called until no further expire candidates are found.
The call requires an initialized struct autofs_dev_ioctl with the
ioctlfd field set to the descriptor obtained from the open call. In
addition an immediate expire, independent of the mount timeout, can be
-requested by setting the arg1 field to 1. If no expire candidates can
-be found the ioctl returns -1 with errno set to EAGAIN.
+requested by setting the how field of struct args_expire to 1. If no
+expire candidates can be found the ioctl returns -1 with errno set to
+EAGAIN.
This call causes the kernel module to check the mount corresponding
to the given ioctlfd for mounts that can be expired, issues an expire
@@ -357,7 +369,8 @@ Checks if an autofs mount point is in use.
The call requires an initialized struct autofs_dev_ioctl with the
ioctlfd field set to the descriptor obtained from the open call and
-it returns the result in the arg1 field, 1 for busy and 0 otherwise.
+it returns the result in the may_umount field of struct args_askumount,
+1 for busy and 0 otherwise.
AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD
@@ -369,12 +382,12 @@ The call requires an initialized struct autofs_dev_ioctl. There are two
possible variations. Both use the path field set to the path of the mount
point to check and the size field adjusted appropriately. One uses the
ioctlfd field to identify a specific mount point to check while the other
-variation uses the path and optionally arg1 set to an autofs mount type.
-The call returns 1 if this is a mount point and sets arg1 to the device
-number of the mount and field arg2 to the relevant super block magic
-number (described below) or 0 if it isn't a mountpoint. In both cases
-the the device number (as returned by new_encode_dev()) is returned
-in field arg1.
+variation uses the path and optionally in.type field of struct args_ismountpoint
+set to an autofs mount type. The call returns 1 if this is a mount point
+and sets out.devid field to the device number of the mount and out.magic
+field to the relevant super block magic number (described below) or 0 if
+it isn't a mountpoint. In both cases the the device number (as returned
+by new_encode_dev()) is returned in out.devid field.
If supplied with a file descriptor we're looking for a specific mount,
not necessarily at the top of the mounted stack. In this case the path
diff --git a/Documentation/filesystems/autofs4.txt b/Documentation/filesystems/autofs4.txt
index 39d02e19fb62..8fac3fe7b8c9 100644
--- a/Documentation/filesystems/autofs4.txt
+++ b/Documentation/filesystems/autofs4.txt
@@ -203,9 +203,9 @@ initiated or is being considered, otherwise it returns 0.
Mountpoint expiry
-----------------
-The VFS has a mechansim for automatically expiring unused mounts,
+The VFS has a mechanism for automatically expiring unused mounts,
much as it can expire any unused dentry information from the dcache.
-This is guided by the MNT_SHRINKABLE flag. This only applies to
+This is guided by the MNT_SHRINKABLE flag. This only applies to
mounts that were created by `d_automount()` returning a filesystem to be
mounted. As autofs doesn't return such a filesystem but leaves the
mounting to the automount daemon, it must involve the automount daemon
@@ -298,7 +298,7 @@ remove directories and symlinks using normal filesystem operations.
autofs knows whether a process requesting some operation is the daemon
or not based on its process-group id number (see getpgid(1)).
-When an autofs filesystem it mounted the pgid of the mounting
+When an autofs filesystem is mounted the pgid of the mounting
processes is recorded unless the "pgrp=" option is given, in which
case that number is recorded instead. Any request arriving from a
process in that process group is considered to come from the daemon.
@@ -450,7 +450,7 @@ Commands are:
numbers for existing filesystems can be found in
`/proc/self/mountinfo`.
- **AUTOFS_DEV_IOCTL_CLOSEMOUNT_CMD**: same as `close(ioctlfd)`.
-- **AUTOFS_DEV_IOCTL_SETPIPEFD_CMD**: if the filesystem is in
+- **AUTOFS_DEV_IOCTL_SETPIPEFD_CMD**: if the filesystem is in
catatonic mode, this can provide the write end of a new pipe
in `arg1` to re-establish communication with a daemon. The
process group of the calling process is used to identify the
diff --git a/Documentation/filesystems/ceph.txt b/Documentation/filesystems/ceph.txt
index d6030aa33376..f5306ee40ea9 100644
--- a/Documentation/filesystems/ceph.txt
+++ b/Documentation/filesystems/ceph.txt
@@ -98,6 +98,10 @@ Mount Options
size.
rsize=X
+ Specify the maximum read size in bytes. By default there is no
+ maximum.
+
+ rasize=X
Specify the maximum readahead.
mount_timeout=X
diff --git a/Documentation/filesystems/dax.txt b/Documentation/filesystems/dax.txt
index 0c16a22521a8..23d18b8a49d5 100644
--- a/Documentation/filesystems/dax.txt
+++ b/Documentation/filesystems/dax.txt
@@ -123,9 +123,12 @@ The DAX code does not work correctly on architectures which have virtually
mapped caches such as ARM, MIPS and SPARC.
Calling get_user_pages() on a range of user memory that has been mmaped
-from a DAX file will fail as there are no 'struct page' to describe
-those pages. This problem is being worked on. That means that O_DIRECT
-reads/writes to those memory ranges from a non-DAX file will fail (note
-that O_DIRECT reads/writes _of a DAX file_ do work, it is the memory
-that is being accessed that is key here). Other things that will not
-work include RDMA, sendfile() and splice().
+from a DAX file will fail when there are no 'struct page' to describe
+those pages. This problem has been addressed in some device drivers
+by adding optional struct page support for pages under the control of
+the driver (see CONFIG_NVDIMM_PFN in drivers/nvdimm for an example of
+how to do this). In the non struct page cases O_DIRECT reads/writes to
+those memory ranges from a non-DAX file will fail (note that O_DIRECT
+reads/writes _of a DAX file_ do work, it is the memory that is being
+accessed that is key here). Other things that will not work in the
+non struct page case include RDMA, sendfile() and splice().
diff --git a/Documentation/filesystems/directory-locking b/Documentation/filesystems/directory-locking
index c314badbcfc6..4e32cb961e5b 100644
--- a/Documentation/filesystems/directory-locking
+++ b/Documentation/filesystems/directory-locking
@@ -19,7 +19,7 @@ locks victim and calls the method. Locks are exclusive.
4) rename() that is _not_ cross-directory. Locking rules: caller locks
the parent and finds source and target. In case of exchange (with
-RENAME_EXCHANGE in rename2() flags argument) lock both. In any case,
+RENAME_EXCHANGE in flags argument) lock both. In any case,
if the target already exists, lock it. If the source is a non-directory,
lock it. If we need to lock both, lock them in inode pointer order.
Then call the method. All locks are exclusive.
diff --git a/Documentation/filesystems/f2fs.txt b/Documentation/filesystems/f2fs.txt
index ecd808088362..753dd4f96afe 100644
--- a/Documentation/filesystems/f2fs.txt
+++ b/Documentation/filesystems/f2fs.txt
@@ -131,6 +131,7 @@ inline_dentry Enable the inline dir feature: data in new created
directory entries can be written into inode block. The
space of inode block which is used to store inline
dentries is limited to ~3.4k.
+noinline_dentry Diable the inline dentry feature.
flush_merge Merge concurrent cache_flush commands as much as possible
to eliminate redundant command issues. If the underlying
device handles the cache_flush command relatively slowly,
diff --git a/Documentation/filesystems/overlayfs.txt b/Documentation/filesystems/overlayfs.txt
index d6259c786316..bcbf9710e4af 100644
--- a/Documentation/filesystems/overlayfs.txt
+++ b/Documentation/filesystems/overlayfs.txt
@@ -183,12 +183,10 @@ The copy_up operation essentially creates a new, identical file and
moves it over to the old name. The new file may be on a different
filesystem, so both st_dev and st_ino of the file may change.
-Any open files referring to this inode will access the old data and
-metadata. Similarly any file locks obtained before copy_up will not
-apply to the copied up file.
+Any open files referring to this inode will access the old data.
-On a file opened with O_RDONLY fchmod(2), fchown(2), futimesat(2) and
-fsetxattr(2) will fail with EROFS.
+Any file locks (and leases) obtained before copy_up will not apply
+to the copied up file.
If a file with multiple hard links is copied up, then this will
"break" the link. Changes will not be propagated to other names
diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting
index b1bd05ea66b2..bdd025ceb763 100644
--- a/Documentation/filesystems/porting
+++ b/Documentation/filesystems/porting
@@ -287,8 +287,8 @@ implementing on-disk size changes. Start with a copy of the old inode_setattr
and vmtruncate, and the reorder the vmtruncate + foofs_vmtruncate sequence to
be in order of zeroing blocks using block_truncate_page or similar helpers,
size update and on finally on-disk truncation which should not fail.
-inode_change_ok now includes the size checks for ATTR_SIZE and must be called
-in the beginning of ->setattr unconditionally.
+setattr_prepare (which used to be inode_change_ok) now includes the size checks
+for ATTR_SIZE and must be called in the beginning of ->setattr unconditionally.
[mandatory]
@@ -592,3 +592,7 @@ in your dentry operations instead.
work just as well; if it's something more complicated, use dentry->d_parent.
Just be careful not to assume that fetching it more than once will yield
the same value - in RCU mode it could change under you.
+--
+[mandatory]
+ ->rename() has an added flags argument. Any flags not handled by the
+ filesystem should result in EINVAL being returned.
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 68080ad6a75e..219ffd41a911 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -145,7 +145,7 @@ Table 1-1: Process specific entries in /proc
symbol the task is blocked in - or "0" if not blocked.
pagemap Page table
stack Report full stack trace, enable via CONFIG_STACKTRACE
- smaps a extension based on maps, showing the memory consumption of
+ smaps an extension based on maps, showing the memory consumption of
each mapping and flags associated with it
numa_maps an extension based on maps, showing the memory locality and
binding policy as well as mem usage (in pages) of each mapping.
@@ -515,6 +515,18 @@ be vanished or the reverse -- new added.
This file is only present if the CONFIG_MMU kernel configuration option is
enabled.
+Note: reading /proc/PID/maps or /proc/PID/smaps is inherently racy (consistent
+output can be achieved only in the single read call).
+This typically manifests when doing partial reads of these files while the
+memory map is being modified. Despite the races, we do provide the following
+guarantees:
+
+1) The mapped addresses never go backwards, which implies no two
+ regions will ever overlap.
+2) If there is something at a given vaddr during the entirety of the
+ life of the smaps/maps walk, there will be some output for it.
+
+
The /proc/PID/clear_refs is used to reset the PG_Referenced and ACCESSED/YOUNG
bits on both physical and virtual pages associated with a process, and the
soft-dirty bit on pte (see Documentation/vm/soft-dirty.txt for details).
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index 9ace359d6cc5..d619c8d71966 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -323,6 +323,35 @@ Whoever sets up the inode is responsible for filling in the "i_op" field. This
is a pointer to a "struct inode_operations" which describes the methods that
can be performed on individual inodes.
+struct xattr_handlers
+---------------------
+
+On filesystems that support extended attributes (xattrs), the s_xattr
+superblock field points to a NULL-terminated array of xattr handlers. Extended
+attributes are name:value pairs.
+
+ name: Indicates that the handler matches attributes with the specified name
+ (such as "system.posix_acl_access"); the prefix field must be NULL.
+
+ prefix: Indicates that the handler matches all attributes with the specified
+ name prefix (such as "user."); the name field must be NULL.
+
+ list: Determine if attributes matching this xattr handler should be listed
+ for a particular dentry. Used by some listxattr implementations like
+ generic_listxattr.
+
+ get: Called by the VFS to get the value of a particular extended attribute.
+ This method is called by the getxattr(2) system call.
+
+ set: Called by the VFS to set the value of a particular extended attribute.
+ When the new value is NULL, called to remove a particular extended
+ attribute. This method is called by the the setxattr(2) and
+ removexattr(2) system calls.
+
+When none of the xattr handlers of a filesystem match the specified attribute
+name or when a filesystem doesn't support extended attributes, the various
+*xattr(2) system calls return -EOPNOTSUPP.
+
The Inode Object
================
@@ -346,8 +375,6 @@ struct inode_operations {
int (*rmdir) (struct inode *,struct dentry *);
int (*mknod) (struct inode *,struct dentry *,umode_t,dev_t);
int (*rename) (struct inode *, struct dentry *,
- struct inode *, struct dentry *);
- int (*rename2) (struct inode *, struct dentry *,
struct inode *, struct dentry *, unsigned int);
int (*readlink) (struct dentry *, char __user *,int);
const char *(*get_link) (struct dentry *, struct inode *,
@@ -356,10 +383,7 @@ struct inode_operations {
int (*get_acl)(struct inode *, int);
int (*setattr) (struct dentry *, struct iattr *);
int (*getattr) (struct vfsmount *mnt, struct dentry *, struct kstat *);
- int (*setxattr) (struct dentry *, const char *,const void *,size_t,int);
- ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t);
ssize_t (*listxattr) (struct dentry *, char *, size_t);
- int (*removexattr) (struct dentry *, const char *);
void (*update_time)(struct inode *, struct timespec *, int);
int (*atomic_open)(struct inode *, struct dentry *, struct file *,
unsigned open_flag, umode_t create_mode, int *opened);
@@ -416,11 +440,8 @@ otherwise noted.
rename: called by the rename(2) system call to rename the object to
have the parent and name given by the second inode and dentry.
- rename2: this has an additional flags argument compared to rename.
- If no flags are supported by the filesystem then this method
- need not be implemented. If some flags are supported then the
- filesystem must return -EINVAL for any unsupported or unknown
- flags. Currently the following flags are implemented:
+ The filesystem must return -EINVAL for any unsupported or
+ unknown flags. Currently the following flags are implemented:
(1) RENAME_NOREPLACE: this flag indicates that if the target
of the rename exists the rename should fail with -EEXIST
instead of replacing the target. The VFS already checks for
@@ -463,19 +484,8 @@ otherwise noted.
getattr: called by the VFS to get attributes of a file. This method
is called by stat(2) and related system calls.
- setxattr: called by the VFS to set an extended attribute for a file.
- Extended attribute is a name:value pair associated with an
- inode. This method is called by setxattr(2) system call.
-
- getxattr: called by the VFS to retrieve the value of an extended
- attribute name. This method is called by getxattr(2) function
- call.
-
listxattr: called by the VFS to list all extended attributes for a
- given file. This method is called by listxattr(2) system call.
-
- removexattr: called by the VFS to remove an extended attribute from
- a file. This method is called by removexattr(2) system call.
+ given file. This method is called by the listxattr(2) system call.
update_time: called by the VFS to update a specific time or the i_version of
an inode. If this is not defined the VFS will update the inode itself
@@ -722,7 +732,7 @@ struct address_space_operations {
The second case is when a request has been made to invalidate
some or all pages in an address_space. This can happen
- through the fadvice(POSIX_FADV_DONTNEED) system call or by the
+ through the fadvise(POSIX_FADV_DONTNEED) system call or by the
filesystem explicitly requesting it as nfs and 9fs do (when
they believe the cache may be out of date with storage) by
calling invalidate_inode_pages2().
diff --git a/Documentation/filesystems/xfs.txt b/Documentation/filesystems/xfs.txt
index 8146e9fd5ffc..c2d44e6e117b 100644
--- a/Documentation/filesystems/xfs.txt
+++ b/Documentation/filesystems/xfs.txt
@@ -348,3 +348,126 @@ Removed Sysctls
---- -------
fs.xfs.xfsbufd_centisec v4.0
fs.xfs.age_buffer_centisecs v4.0
+
+
+Error handling
+==============
+
+XFS can act differently according to the type of error found during its
+operation. The implementation introduces the following concepts to the error
+handler:
+
+ -failure speed:
+ Defines how fast XFS should propagate an error upwards when a specific
+ error is found during the filesystem operation. It can propagate
+ immediately, after a defined number of retries, after a set time period,
+ or simply retry forever.
+
+ -error classes:
+ Specifies the subsystem the error configuration will apply to, such as
+ metadata IO or memory allocation. Different subsystems will have
+ different error handlers for which behaviour can be configured.
+
+ -error handlers:
+ Defines the behavior for a specific error.
+
+The filesystem behavior during an error can be set via sysfs files. Each
+error handler works independently - the first condition met by an error handler
+for a specific class will cause the error to be propagated rather than reset and
+retried.
+
+The action taken by the filesystem when the error is propagated is context
+dependent - it may cause a shut down in the case of an unrecoverable error,
+it may be reported back to userspace, or it may even be ignored because
+there's nothing useful we can with the error or anyone we can report it to (e.g.
+during unmount).
+
+The configuration files are organized into the following hierarchy for each
+mounted filesystem:
+
+ /sys/fs/xfs/<dev>/error/<class>/<error>/
+
+Where:
+ <dev>
+ The short device name of the mounted filesystem. This is the same device
+ name that shows up in XFS kernel error messages as "XFS(<dev>): ..."
+
+ <class>
+ The subsystem the error configuration belongs to. As of 4.9, the defined
+ classes are:
+
+ - "metadata": applies metadata buffer write IO
+
+ <error>
+ The individual error handler configurations.
+
+
+Each filesystem has "global" error configuration options defined in their top
+level directory:
+
+ /sys/fs/xfs/<dev>/error/
+
+ fail_at_unmount (Min: 0 Default: 1 Max: 1)
+ Defines the filesystem error behavior at unmount time.
+
+ If set to a value of 1, XFS will override all other error configurations
+ during unmount and replace them with "immediate fail" characteristics.
+ i.e. no retries, no retry timeout. This will always allow unmount to
+ succeed when there are persistent errors present.
+
+ If set to 0, the configured retry behaviour will continue until all
+ retries and/or timeouts have been exhausted. This will delay unmount
+ completion when there are persistent errors, and it may prevent the
+ filesystem from ever unmounting fully in the case of "retry forever"
+ handler configurations.
+
+ Note: there is no guarantee that fail_at_unmount can be set whilst an
+ unmount is in progress. It is possible that the sysfs entries are
+ removed by the unmounting filesystem before a "retry forever" error
+ handler configuration causes unmount to hang, and hence the filesystem
+ must be configured appropriately before unmount begins to prevent
+ unmount hangs.
+
+Each filesystem has specific error class handlers that define the error
+propagation behaviour for specific errors. There is also a "default" error
+handler defined, which defines the behaviour for all errors that don't have
+specific handlers defined. Where multiple retry constraints are configuredi for
+a single error, the first retry configuration that expires will cause the error
+to be propagated. The handler configurations are found in the directory:
+
+ /sys/fs/xfs/<dev>/error/<class>/<error>/
+
+ max_retries (Min: -1 Default: Varies Max: INTMAX)
+ Defines the allowed number of retries of a specific error before
+ the filesystem will propagate the error. The retry count for a given
+ error context (e.g. a specific metadata buffer) is reset every time
+ there is a successful completion of the operation.
+
+ Setting the value to "-1" will cause XFS to retry forever for this
+ specific error.
+
+ Setting the value to "0" will cause XFS to fail immediately when the
+ specific error is reported.
+
+ Setting the value to "N" (where 0 < N < Max) will make XFS retry the
+ operation "N" times before propagating the error.
+
+ retry_timeout_seconds (Min: -1 Default: Varies Max: 1 day)
+ Define the amount of time (in seconds) that the filesystem is
+ allowed to retry its operations when the specific error is
+ found.
+
+ Setting the value to "-1" will allow XFS to retry forever for this
+ specific error.
+
+ Setting the value to "0" will cause XFS to fail immediately when the
+ specific error is reported.
+
+ Setting the value to "N" (where 0 < N < Max) will allow XFS to retry the
+ operation for up to "N" seconds before propagating the error.
+
+Note: The default behaviour for a specific error handler is dependent on both
+the class and error context. For example, the default values for
+"metadata/ENODEV" are "0" rather than "-1" so that this error handler defaults
+to "fail immediately" behaviour. This is done because ENODEV is a fatal,
+unrecoverable error no matter how many times the metadata IO is retried.