summaryrefslogtreecommitdiff
path: root/Documentation/filesystems
diff options
context:
space:
mode:
Diffstat (limited to 'Documentation/filesystems')
-rw-r--r--Documentation/filesystems/Locking18
-rw-r--r--Documentation/filesystems/api-summary.rst147
-rw-r--r--Documentation/filesystems/autofs-mount-control.txt6
-rw-r--r--Documentation/filesystems/autofs.txt66
-rw-r--r--Documentation/filesystems/ceph.txt14
-rw-r--r--Documentation/filesystems/cifs/TODO3
-rw-r--r--Documentation/filesystems/cifs/cifs.txt34
-rw-r--r--Documentation/filesystems/coda.txt11
-rw-r--r--Documentation/filesystems/dax.txt2
-rw-r--r--Documentation/filesystems/debugfs.txt18
-rw-r--r--Documentation/filesystems/exofs.txt185
-rw-r--r--Documentation/filesystems/ext2.txt8
-rw-r--r--Documentation/filesystems/ext4/index.rst8
-rw-r--r--Documentation/filesystems/f2fs.txt135
-rw-r--r--Documentation/filesystems/fscrypt.rst59
-rw-r--r--Documentation/filesystems/index.rst390
-rw-r--r--Documentation/filesystems/journalling.rst184
-rw-r--r--Documentation/filesystems/mount_api.txt732
-rw-r--r--Documentation/filesystems/nfs/nfsroot.txt2
-rw-r--r--Documentation/filesystems/overlayfs.txt16
-rw-r--r--Documentation/filesystems/path-lookup.rst39
-rw-r--r--Documentation/filesystems/porting45
-rw-r--r--Documentation/filesystems/proc.txt83
-rw-r--r--Documentation/filesystems/ramfs-rootfs-initramfs.txt4
-rw-r--r--Documentation/filesystems/splice.rst22
-rw-r--r--Documentation/filesystems/sysfs.txt23
-rw-r--r--Documentation/filesystems/tmpfs.txt2
-rw-r--r--Documentation/filesystems/ubifs-authentication.md4
-rw-r--r--Documentation/filesystems/vfs.rst1428
-rw-r--r--Documentation/filesystems/vfs.txt1261
-rw-r--r--Documentation/filesystems/xfs-delayed-logging-design.txt2
-rw-r--r--Documentation/filesystems/xfs-self-describing-metadata.txt8
-rw-r--r--Documentation/filesystems/xfs.txt469
33 files changed, 3015 insertions, 2413 deletions
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index efea228ccd8a..204dd3ea36bb 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -52,7 +52,7 @@ prototypes:
int (*rename) (struct inode *, struct dentry *,
struct inode *, struct dentry *, unsigned int);
int (*readlink) (struct dentry *, char __user *,int);
- const char *(*get_link) (struct dentry *, struct inode *, void **);
+ const char *(*get_link) (struct dentry *, struct inode *, struct delayed_call *);
void (*truncate) (struct inode *);
int (*permission) (struct inode *, int, unsigned int);
int (*get_acl)(struct inode *, int);
@@ -118,6 +118,7 @@ set: exclusive
--------------------------- super_operations ---------------------------
prototypes:
struct inode *(*alloc_inode)(struct super_block *sb);
+ void (*free_inode)(struct inode *);
void (*destroy_inode)(struct inode *);
void (*dirty_inode) (struct inode *, int flags);
int (*write_inode) (struct inode *, struct writeback_control *wbc);
@@ -139,6 +140,7 @@ locking rules:
All may block [not true, see below]
s_umount
alloc_inode:
+free_inode: called from RCU callback
destroy_inode:
dirty_inode:
write_inode:
@@ -359,8 +361,6 @@ so fl_release_private called on a lease should not block.
----------------------- lock_manager_operations ---------------------------
prototypes:
- int (*lm_compare_owner)(struct file_lock *, struct file_lock *);
- unsigned long (*lm_owner_key)(struct file_lock *);
void (*lm_notify)(struct file_lock *); /* unblock callback */
int (*lm_grant)(struct file_lock *, struct file_lock *, int);
void (*lm_break)(struct file_lock *); /* break_lease callback */
@@ -369,23 +369,11 @@ prototypes:
locking rules:
inode->i_lock blocked_lock_lock may block
-lm_compare_owner: yes[1] maybe no
-lm_owner_key yes[1] yes no
lm_notify: yes yes no
lm_grant: no no no
lm_break: yes no no
lm_change yes no no
-[1]: ->lm_compare_owner and ->lm_owner_key are generally called with
-*an* inode->i_lock held. It may not be the i_lock of the inode
-associated with either file_lock argument! This is the case with deadlock
-detection, since the code has to chase down the owners of locks that may
-be entirely unrelated to the one on which the lock is being acquired.
-For deadlock detection however, the blocked_lock_lock is also held. The
-fact that these locks are held ensures that the file_locks do not
-disappear out from under you while doing the comparison or generating an
-owner key.
-
--------------------------- buffer_head -----------------------------------
prototypes:
void (*b_end_io)(struct buffer_head *bh, int uptodate);
diff --git a/Documentation/filesystems/api-summary.rst b/Documentation/filesystems/api-summary.rst
new file mode 100644
index 000000000000..bbb0c1c0e5cf
--- /dev/null
+++ b/Documentation/filesystems/api-summary.rst
@@ -0,0 +1,147 @@
+=============================
+Linux Filesystems API summary
+=============================
+
+This section contains API-level documentation, mostly taken from the source
+code itself.
+
+The Linux VFS
+=============
+
+The Filesystem types
+--------------------
+
+.. kernel-doc:: include/linux/fs.h
+ :internal:
+
+The Directory Cache
+-------------------
+
+.. kernel-doc:: fs/dcache.c
+ :export:
+
+.. kernel-doc:: include/linux/dcache.h
+ :internal:
+
+Inode Handling
+--------------
+
+.. kernel-doc:: fs/inode.c
+ :export:
+
+.. kernel-doc:: fs/bad_inode.c
+ :export:
+
+Registration and Superblocks
+----------------------------
+
+.. kernel-doc:: fs/super.c
+ :export:
+
+File Locks
+----------
+
+.. kernel-doc:: fs/locks.c
+ :export:
+
+.. kernel-doc:: fs/locks.c
+ :internal:
+
+Other Functions
+---------------
+
+.. kernel-doc:: fs/mpage.c
+ :export:
+
+.. kernel-doc:: fs/namei.c
+ :export:
+
+.. kernel-doc:: fs/buffer.c
+ :export:
+
+.. kernel-doc:: block/bio.c
+ :export:
+
+.. kernel-doc:: fs/seq_file.c
+ :export:
+
+.. kernel-doc:: fs/filesystems.c
+ :export:
+
+.. kernel-doc:: fs/fs-writeback.c
+ :export:
+
+.. kernel-doc:: fs/block_dev.c
+ :export:
+
+.. kernel-doc:: fs/anon_inodes.c
+ :export:
+
+.. kernel-doc:: fs/attr.c
+ :export:
+
+.. kernel-doc:: fs/d_path.c
+ :export:
+
+.. kernel-doc:: fs/dax.c
+ :export:
+
+.. kernel-doc:: fs/direct-io.c
+ :export:
+
+.. kernel-doc:: fs/libfs.c
+ :export:
+
+.. kernel-doc:: fs/posix_acl.c
+ :export:
+
+.. kernel-doc:: fs/stat.c
+ :export:
+
+.. kernel-doc:: fs/sync.c
+ :export:
+
+.. kernel-doc:: fs/xattr.c
+ :export:
+
+The proc filesystem
+===================
+
+sysctl interface
+----------------
+
+.. kernel-doc:: kernel/sysctl.c
+ :export:
+
+proc filesystem interface
+-------------------------
+
+.. kernel-doc:: fs/proc/base.c
+ :internal:
+
+Events based on file descriptors
+================================
+
+.. kernel-doc:: fs/eventfd.c
+ :export:
+
+The Filesystem for Exporting Kernel Objects
+===========================================
+
+.. kernel-doc:: fs/sysfs/file.c
+ :export:
+
+.. kernel-doc:: fs/sysfs/symlink.c
+ :export:
+
+The debugfs filesystem
+======================
+
+debugfs interface
+-----------------
+
+.. kernel-doc:: fs/debugfs/inode.c
+ :export:
+
+.. kernel-doc:: fs/debugfs/file.c
+ :export:
diff --git a/Documentation/filesystems/autofs-mount-control.txt b/Documentation/filesystems/autofs-mount-control.txt
index 45edad6933cc..acc02fc57993 100644
--- a/Documentation/filesystems/autofs-mount-control.txt
+++ b/Documentation/filesystems/autofs-mount-control.txt
@@ -354,8 +354,10 @@ this ioctl is called until no further expire candidates are found.
The call requires an initialized struct autofs_dev_ioctl with the
ioctlfd field set to the descriptor obtained from the open call. In
-addition an immediate expire, independent of the mount timeout, can be
-requested by setting the how field of struct args_expire to 1. If no
+addition an immediate expire that's independent of the mount timeout,
+and a forced expire that's independent of whether the mount is busy,
+can be requested by setting the how field of struct args_expire to
+AUTOFS_EXP_IMMEDIATE or AUTOFS_EXP_FORCED, respectively . If no
expire candidates can be found the ioctl returns -1 with errno set to
EAGAIN.
diff --git a/Documentation/filesystems/autofs.txt b/Documentation/filesystems/autofs.txt
index 373ad25852d3..3af38c7fd26d 100644
--- a/Documentation/filesystems/autofs.txt
+++ b/Documentation/filesystems/autofs.txt
@@ -116,7 +116,7 @@ that purpose there is another flag.
**DCACHE_MANAGE_TRANSIT**
If a dentry has DCACHE_MANAGE_TRANSIT set then two very different but
-related behaviors are invoked, both using the `d_op->d_manage()`
+related behaviours are invoked, both using the `d_op->d_manage()`
dentry operation.
Firstly, before checking to see if any filesystem is mounted on the
@@ -193,8 +193,8 @@ VFS remain in RCU-walk mode, but can only tell it to get out of
RCU-walk mode by returning `-ECHILD`.
So `d_manage()`, when called with `rcu_walk` set, should either return
--ECHILD if there is any reason to believe it is unsafe to end the
-mounted filesystem, and otherwise should return 0.
+-ECHILD if there is any reason to believe it is unsafe to enter the
+mounted filesystem, otherwise it should return 0.
autofs will return `-ECHILD` if an expiry of the filesystem has been
initiated or is being considered, otherwise it returns 0.
@@ -210,7 +210,7 @@ mounts that were created by `d_automount()` returning a filesystem to be
mounted. As autofs doesn't return such a filesystem but leaves the
mounting to the automount daemon, it must involve the automount daemon
in unmounting as well. This also means that autofs has more control
-of expiry.
+over expiry.
The VFS also supports "expiry" of mounts using the MNT_EXPIRE flag to
the `umount` system call. Unmounting with MNT_EXPIRE will fail unless
@@ -225,7 +225,7 @@ unmount any filesystems mounted on the autofs filesystem or remove any
symbolic links or empty directories any time it likes. If the unmount
or removal is successful the filesystem will be returned to the state
it was before the mount or creation, so that any access of the name
-will trigger normal auto-mount processing. In particlar, `rmdir` and
+will trigger normal auto-mount processing. In particular, `rmdir` and
`unlink` do not leave negative entries in the dcache as a normal
filesystem would, so an attempt to access a recently-removed object is
passed to autofs for handling.
@@ -240,11 +240,18 @@ Normally the daemon only wants to remove entries which haven't been
used for a while. For this purpose autofs maintains a "`last_used`"
time stamp on each directory or symlink. For symlinks it genuinely
does record the last time the symlink was "used" or followed to find
-out where it points to. For directories the field is a slight
-misnomer. It actually records the last time that autofs checked if
-the directory or one of its descendents was busy and found that it
-was. This is just as useful and doesn't require updating the field so
-often.
+out where it points to. For directories the field is used slightly
+differently. The field is updated at mount time and during expire
+checks if it is found to be in use (ie. open file descriptor or
+process working directory) and during path walks. The update done
+during path walks prevents frequent expire and immediate mount of
+frequently accessed automounts. But in the case where a GUI continually
+access or an application frequently scans an autofs directory tree
+there can be an accumulation of mounts that aren't actually being
+used. To cater for this case the "`strictexpire`" autofs mount option
+can be used to avoid the "`last_used`" update on path walk thereby
+preventing this apparent inability to expire mounts that aren't
+really in use.
The daemon is able to ask autofs if anything is due to be expired,
using an `ioctl` as discussed later. For a *direct* mount, autofs
@@ -255,8 +262,12 @@ up.
There is an option with indirect mounts to consider each of the leaves
that has been mounted on instead of considering the top-level names.
-This is intended for compatability with version 4 of autofs and should
-be considered as deprecated.
+This was originally intended for compatibility with version 4 of autofs
+and should be considered as deprecated for Sun Format automount maps.
+However, it may be used again for amd format mount maps (which are
+generally indirect maps) because the amd automounter allows for the
+setting of an expire timeout for individual mounts. But there are
+some difficulties in making the needed changes for this.
When autofs considers a directory it checks the `last_used` time and
compares it with the "timeout" value set when the filesystem was
@@ -273,7 +284,7 @@ mounts. If it finds something in the root directory to expire it will
return the name of that thing. Once a name has been returned the
automount daemon needs to unmount any filesystems mounted below the
name normally. As described above, this is unsafe for non-toplevel
-mounts in a version-5 autofs. For this reason the current `automountd`
+mounts in a version-5 autofs. For this reason the current `automount(8)`
does not use this ioctl.
The second mechanism uses either the **AUTOFS_DEV_IOCTL_EXPIRE_CMD** or
@@ -345,7 +356,7 @@ The `wait_queue_token` is a unique number which can identify a
particular request to be acknowledged. When a message is sent over
the pipe the affected dentry is marked as either "active" or
"expiring" and other accesses to it block until the message is
-acknowledged using one of the ioctls below and the relevant
+acknowledged using one of the ioctls below with the relevant
`wait_queue_token`.
Communicating with autofs: root directory ioctls
@@ -367,15 +378,14 @@ The available ioctl commands are:
This mode is also entered if a write to the pipe fails.
- **AUTOFS_IOC_PROTOVER**: This returns the protocol version in use.
- **AUTOFS_IOC_PROTOSUBVER**: Returns the protocol sub-version which
- is really a version number for the implementation. It is
- currently 2.
+ is really a version number for the implementation.
- **AUTOFS_IOC_SETTIMEOUT**: This passes a pointer to an unsigned
long. The value is used to set the timeout for expiry, and
the current timeout value is stored back through the pointer.
- **AUTOFS_IOC_ASKUMOUNT**: Returns, in the pointed-to `int`, 1 if
the filesystem could be unmounted. This is only a hint as
the situation could change at any instant. This call can be
- use to avoid a more expensive full unmount attempt.
+ used to avoid a more expensive full unmount attempt.
- **AUTOFS_IOC_EXPIRE**: as described above, this asks if there is
anything suitable to expire. A pointer to a packet:
@@ -400,6 +410,11 @@ The available ioctl commands are:
**AUTOFS_EXP_IMMEDIATE** causes `last_used` time to be ignored
and objects are expired if the are not in use.
+ **AUTOFS_EXP_FORCED** causes the in use status to be ignored
+ and objects are expired ieven if they are in use. This assumes
+ that the daemon has requested this because it is capable of
+ performing the umount.
+
**AUTOFS_EXP_LEAVES** will select a leaf rather than a top-level
name to expire. This is only safe when *maxproto* is 4.
@@ -415,7 +430,7 @@ which can be used to communicate directly with the autofs filesystem.
It requires CAP_SYS_ADMIN for access.
The `ioctl`s that can be used on this device are described in a separate
-document `autofs-mount-control.txt`, and are summarized briefly here.
+document `autofs-mount-control.txt`, and are summarised briefly here.
Each ioctl is passed a pointer to an `autofs_dev_ioctl` structure:
struct autofs_dev_ioctl {
@@ -511,6 +526,21 @@ directories.
Catatonic mode can only be left via the
**AUTOFS_DEV_IOCTL_OPENMOUNT_CMD** ioctl on the `/dev/autofs`.
+The "ignore" mount option
+-------------------------
+
+The "ignore" mount option can be used to provide a generic indicator
+to applications that the mount entry should be ignored when displaying
+mount information.
+
+In other OSes that provide autofs and that provide a mount list to user
+space based on the kernel mount list a no-op mount option ("ignore" is
+the one use on the most common OSes) is allowed so that autofs file
+system users can optionally use it.
+
+This is intended to be used by user space programs to exclude autofs
+mounts from consideration when reading the mounts list.
+
autofs, name spaces, and shared mounts
--------------------------------------
diff --git a/Documentation/filesystems/ceph.txt b/Documentation/filesystems/ceph.txt
index 1177052701e1..d2c6a5ccf0f5 100644
--- a/Documentation/filesystems/ceph.txt
+++ b/Documentation/filesystems/ceph.txt
@@ -22,9 +22,7 @@ In contrast to cluster filesystems like GFS, OCFS2, and GPFS that rely
on symmetric access by all clients to shared block devices, Ceph
separates data and metadata management into independent server
clusters, similar to Lustre. Unlike Lustre, however, metadata and
-storage nodes run entirely as user space daemons. Storage nodes
-utilize btrfs to store data objects, leveraging its advanced features
-(checksumming, metadata replication, etc.). File data is striped
+storage nodes run entirely as user space daemons. File data is striped
across storage nodes in large chunks to distribute workload and
facilitate high throughputs. When storage nodes fail, data is
re-replicated in a distributed fashion by the storage nodes themselves
@@ -118,6 +116,10 @@ Mount Options
of a non-responsive Ceph file system. The default is 30
seconds.
+ caps_max=X
+ Specify the maximum number of caps to hold. Unused caps are released
+ when number of caps exceeds the limit. The default is 0 (no limit)
+
rbytes
When stat() is called on a directory, set st_size to 'rbytes',
the summation of file sizes over all files nested beneath that
@@ -160,11 +162,11 @@ More Information
================
For more information on Ceph, see the home page at
- http://ceph.newdream.net/
+ https://ceph.com/
The Linux kernel client source tree is available at
- git://ceph.newdream.net/git/ceph-client.git
+ https://github.com/ceph/ceph-client.git
git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client.git
and the source for the full system is at
- git://ceph.newdream.net/git/ceph.git
+ https://github.com/ceph/ceph.git
diff --git a/Documentation/filesystems/cifs/TODO b/Documentation/filesystems/cifs/TODO
index 66b3f54aa6dc..9267f3fb131f 100644
--- a/Documentation/filesystems/cifs/TODO
+++ b/Documentation/filesystems/cifs/TODO
@@ -111,7 +111,8 @@ negotiated size) and send larger write sizes to modern servers.
5) Continue to extend the smb3 "buildbot" which does automated xfstesting
against Windows, Samba and Azure currently - to add additional tests and
-to allow the buildbot to execute the tests faster.
+to allow the buildbot to execute the tests faster. The URL for the
+buildbot is: http://smb3-test-rhel-75.southcentralus.cloudapp.azure.com
6) Address various coverity warnings (most are not bugs per-se, but
the more warnings are addressed, the easier it is to spot real
diff --git a/Documentation/filesystems/cifs/cifs.txt b/Documentation/filesystems/cifs/cifs.txt
index 67756607246e..1be3d21c286e 100644
--- a/Documentation/filesystems/cifs/cifs.txt
+++ b/Documentation/filesystems/cifs/cifs.txt
@@ -1,16 +1,21 @@
This is the client VFS module for the SMB3 NAS protocol as well
- older dialects such as the Common Internet File System (CIFS)
+ as for older dialects such as the Common Internet File System (CIFS)
protocol which was the successor to the Server Message Block
(SMB) protocol, the native file sharing mechanism for most early
PC operating systems. New and improved versions of CIFS are now
- called SMB2 and SMB3. These dialects are also supported by the
- CIFS VFS module. CIFS is fully supported by network
- file servers such as Windows 2000, 2003, 2008, 2012 and 2016
- as well by Samba (which provides excellent CIFS
- server support for Linux and many other operating systems), Apple
- systems, as well as most Network Attached Storage vendors, so
- this network filesystem client can mount to a wide variety of
- servers.
+ called SMB2 and SMB3. Use of SMB3 (and later, including SMB3.1.1)
+ is strongly preferred over using older dialects like CIFS due to
+ security reaasons. All modern dialects, including the most recent,
+ SMB3.1.1 are supported by the CIFS VFS module. The SMB3 protocol
+ is implemented and supported by all major file servers
+ such as all modern versions of Windows (including Windows 2016
+ Server), as well as by Samba (which provides excellent
+ CIFS/SMB2/SMB3 server support and tools for Linux and many other
+ operating systems). Apple systems also support SMB3 well, as
+ do most Network Attached Storage vendors, so this network
+ filesystem client can mount to a wide variety of systems.
+ It also supports mounting to the cloud (for example
+ Microsoft Azure), including the necessary security features.
The intent of this module is to provide the most advanced network
file system function for SMB3 compliant servers, including advanced
@@ -24,12 +29,17 @@
cluster file systems for fileserving in some Linux to Linux environments,
not just in Linux to Windows (or Linux to Mac) environments.
- This filesystem has an mount utility (mount.cifs) that can be obtained from
+ This filesystem has a mount utility (mount.cifs) and various user space
+ tools (including smbinfo and setcifsacl) that can be obtained from
- https://ftp.samba.org/pub/linux-cifs/cifs-utils/
+ https://git.samba.org/?p=cifs-utils.git
+ or
+ git://git.samba.org/cifs-utils.git
- It must be installed in the directory with the other mount helpers.
+ mount.cifs should be installed in the directory with the other mount helpers.
For more information on the module see the project wiki page at
+ https://wiki.samba.org/index.php/LinuxCIFS
+ and
https://wiki.samba.org/index.php/LinuxCIFS_utils
diff --git a/Documentation/filesystems/coda.txt b/Documentation/filesystems/coda.txt
index 61311356025d..545262c167c3 100644
--- a/Documentation/filesystems/coda.txt
+++ b/Documentation/filesystems/coda.txt
@@ -481,7 +481,10 @@ kernel support.
-
+ struct coda_timespec {
+ int64_t tv_sec; /* seconds */
+ long tv_nsec; /* nanoseconds */
+ };
struct coda_vattr {
enum coda_vtype va_type; /* vnode type (for create) */
@@ -493,9 +496,9 @@ kernel support.
long va_fileid; /* file id */
u_quad_t va_size; /* file size in bytes */
long va_blocksize; /* blocksize preferred for i/o */
- struct timespec va_atime; /* time of last access */
- struct timespec va_mtime; /* time of last modification */
- struct timespec va_ctime; /* time file changed */
+ struct coda_timespec va_atime; /* time of last access */
+ struct coda_timespec va_mtime; /* time of last modification */
+ struct coda_timespec va_ctime; /* time file changed */
u_long va_gen; /* generation number of file */
u_long va_flags; /* flags defined for file */
dev_t va_rdev; /* device special file represents */
diff --git a/Documentation/filesystems/dax.txt b/Documentation/filesystems/dax.txt
index 6d2c0d340dea..679729442fd2 100644
--- a/Documentation/filesystems/dax.txt
+++ b/Documentation/filesystems/dax.txt
@@ -76,7 +76,7 @@ exposure of uninitialized data through mmap.
These filesystems may be used for inspiration:
- ext2: see Documentation/filesystems/ext2.txt
- ext4: see Documentation/filesystems/ext4/
-- xfs: see Documentation/filesystems/xfs.txt
+- xfs: see Documentation/admin-guide/xfs.rst
Handling Media Errors
diff --git a/Documentation/filesystems/debugfs.txt b/Documentation/filesystems/debugfs.txt
index 4f45f71149cb..9e27c843d00e 100644
--- a/Documentation/filesystems/debugfs.txt
+++ b/Documentation/filesystems/debugfs.txt
@@ -31,10 +31,10 @@ This call, if successful, will make a directory called name underneath the
indicated parent directory. If parent is NULL, the directory will be
created in the debugfs root. On success, the return value is a struct
dentry pointer which can be used to create files in the directory (and to
-clean it up at the end). A NULL return value indicates that something went
-wrong. If ERR_PTR(-ENODEV) is returned, that is an indication that the
-kernel has been built without debugfs support and none of the functions
-described below will work.
+clean it up at the end). An ERR_PTR(-ERROR) return value indicates that
+something went wrong. If ERR_PTR(-ENODEV) is returned, that is an
+indication that the kernel has been built without debugfs support and none
+of the functions described below will work.
The most general way to create a file within a debugfs directory is with:
@@ -48,8 +48,9 @@ should hold the file, data will be stored in the i_private field of the
resulting inode structure, and fops is a set of file operations which
implement the file's behavior. At a minimum, the read() and/or write()
operations should be provided; others can be included as needed. Again,
-the return value will be a dentry pointer to the created file, NULL for
-error, or ERR_PTR(-ENODEV) if debugfs support is missing.
+the return value will be a dentry pointer to the created file,
+ERR_PTR(-ERROR) on error, or ERR_PTR(-ENODEV) if debugfs support is
+missing.
Create a file with an initial size, the following function can be used
instead:
@@ -168,7 +169,7 @@ byte offsets over a base for the register block.
If you want to dump an u32 array in debugfs, you can create file with:
- struct dentry *debugfs_create_u32_array(const char *name, umode_t mode,
+ void debugfs_create_u32_array(const char *name, umode_t mode,
struct dentry *parent,
u32 *array, u32 elements);
@@ -214,7 +215,8 @@ can be removed with:
void debugfs_remove(struct dentry *dentry);
-The dentry value can be NULL, in which case nothing will be removed.
+The dentry value can be NULL or an error value, in which case nothing will
+be removed.
Once upon a time, debugfs users were required to remember the dentry
pointer for every debugfs file they created so that all files could be
diff --git a/Documentation/filesystems/exofs.txt b/Documentation/filesystems/exofs.txt
deleted file mode 100644
index 23583a136975..000000000000
--- a/Documentation/filesystems/exofs.txt
+++ /dev/null
@@ -1,185 +0,0 @@
-===============================================================================
-WHAT IS EXOFS?
-===============================================================================
-
-exofs is a file system that uses an OSD and exports the API of a normal Linux
-file system. Users access exofs like any other local file system, and exofs
-will in turn issue commands to the local OSD initiator.
-
-OSD is a new T10 command set that views storage devices not as a large/flat
-array of sectors but as a container of objects, each having a length, quota,
-time attributes and more. Each object is addressed by a 64bit ID, and is
-contained in a 64bit ID partition. Each object has associated attributes
-attached to it, which are integral part of the object and provide metadata about
-the object. The standard defines some common obligatory attributes, but user
-attributes can be added as needed.
-
-===============================================================================
-ENVIRONMENT
-===============================================================================
-
-To use this file system, you need to have an object store to run it on. You
-may download a target from:
-http://open-osd.org
-
-See Documentation/scsi/osd.txt for how to setup a working osd environment.
-
-===============================================================================
-USAGE
-===============================================================================
-
-1. Download and compile exofs and open-osd initiator:
- You need an external Kernel source tree or kernel headers from your
- distribution. (anything based on 2.6.26 or later).
-
- a. download open-osd including exofs source using:
- [parent-directory]$ git clone git://git.open-osd.org/open-osd.git
-
- b. Build the library module like this:
- [parent-directory]$ make -C KSRC=$(KER_DIR) open-osd
-
- This will build both the open-osd initiator as well as the exofs kernel
- module. Use whatever parameters you compiled your Kernel with and
- $(KER_DIR) above pointing to the Kernel you compile against. See the file
- open-osd/top-level-Makefile for an example.
-
-2. Get the OSD initiator and target set up properly, and login to the target.
- See Documentation/scsi/osd.txt for farther instructions. Also see ./do-osd
- for example script that does all these steps.
-
-3. Insmod the exofs.ko module:
- [exofs]$ insmod exofs.ko
-
-4. Make sure the directory where you want to mount exists. If not, create it.
- (For example, mkdir /mnt/exofs)
-
-5. At first run you will need to invoke the mkfs.exofs application
-
- As an example, this will create the file system on:
- /dev/osd0 partition ID 65536
-
- mkfs.exofs --pid=65536 --format /dev/osd0
-
- The --format is optional. If not specified, no OSD_FORMAT will be
- performed and a clean file system will be created in the specified pid,
- in the available space of the target. (Use --format=size_in_meg to limit
- the total LUN space available)
-
- If pid already exists, it will be deleted and a new one will be created in
- its place. Be careful.
-
- An exofs lives inside a single OSD partition. You can create multiple exofs
- filesystems on the same device using multiple pids.
-
- (run mkfs.exofs without any parameters for usage help message)
-
-6. Mount the file system.
-
- For example, to mount /dev/osd0, partition ID 0x10000 on /mnt/exofs:
-
- mount -t exofs -o pid=65536 /dev/osd0 /mnt/exofs/
-
-7. For reference (See do-exofs example script):
- do-exofs start - an example of how to perform the above steps.
- do-exofs stop - an example of how to unmount the file system.
- do-exofs format - an example of how to format and mkfs a new exofs.
-
-8. Extra compilation flags (uncomment in fs/exofs/Kbuild):
- CONFIG_EXOFS_DEBUG - for debug messages and extra checks.
-
-===============================================================================
-exofs mount options
-===============================================================================
-Similar to any mount command:
- mount -t exofs -o exofs_options /dev/osdX mount_exofs_directory
-
-Where:
- -t exofs: specifies the exofs file system
-
- /dev/osdX: X is a decimal number. /dev/osdX was created after a successful
- login into an OSD target.
-
- mount_exofs_directory: The directory to mount the file system on
-
- exofs specific options: Options are separated by commas (,)
- pid=<integer> - The partition number to mount/create as
- container of the filesystem.
- This option is mandatory. integer can be
- Hex by pre-pending an 0x to the number.
- osdname=<id> - Mount by a device's osdname.
- osdname is usually a 36 character uuid of the
- form "d2683732-c906-4ee1-9dbd-c10c27bb40df".
- It is one of the device's uuid specified in the
- mkfs.exofs format command.
- If this option is specified then the /dev/osdX
- above can be empty and is ignored.
- to=<integer> - Timeout in ticks for a single command.
- default is (60 * HZ) [for debugging only]
-
-===============================================================================
-DESIGN
-===============================================================================
-
-* The file system control block (AKA on-disk superblock) resides in an object
- with a special ID (defined in common.h).
- Information included in the file system control block is used to fill the
- in-memory superblock structure at mount time. This object is created before
- the file system is used by mkexofs.c. It contains information such as:
- - The file system's magic number
- - The next inode number to be allocated
-
-* Each file resides in its own object and contains the data (and it will be
- possible to extend the file over multiple objects, though this has not been
- implemented yet).
-
-* A directory is treated as a file, and essentially contains a list of <file
- name, inode #> pairs for files that are found in that directory. The object
- IDs correspond to the files' inode numbers and will be allocated according to
- a bitmap (stored in a separate object). Now they are allocated using a
- counter.
-
-* Each file's control block (AKA on-disk inode) is stored in its object's
- attributes. This applies to both regular files and other types (directories,
- device files, symlinks, etc.).
-
-* Credentials are generated per object (inode and superblock) when they are
- created in memory (read from disk or created). The credential works for all
- operations and is used as long as the object remains in memory.
-
-* Async OSD operations are used whenever possible, but the target may execute
- them out of order. The operations that concern us are create, delete,
- readpage, writepage, update_inode, and truncate. The following pairs of
- operations should execute in the order written, and we need to prevent them
- from executing in reverse order:
- - The following are handled with the OBJ_CREATED and OBJ_2BCREATED
- flags. OBJ_CREATED is set when we know the object exists on the OSD -
- in create's callback function, and when we successfully do a
- read_inode.
- OBJ_2BCREATED is set in the beginning of the create function, so we
- know that we should wait.
- - create/delete: delete should wait until the object is created
- on the OSD.
- - create/readpage: readpage should be able to return a page
- full of zeroes in this case. If there was a write already
- en-route (i.e. create, writepage, readpage) then the page
- would be locked, and so it would really be the same as
- create/writepage.
- - create/writepage: if writepage is called for a sync write, it
- should wait until the object is created on the OSD.
- Otherwise, it should just return.
- - create/truncate: truncate should wait until the object is
- created on the OSD.
- - create/update_inode: update_inode should wait until the
- object is created on the OSD.
- - Handled by VFS locks:
- - readpage/delete: shouldn't happen because of page lock.
- - writepage/delete: shouldn't happen because of page lock.
- - readpage/writepage: shouldn't happen because of page lock.
-
-===============================================================================
-LICENSE/COPYRIGHT
-===============================================================================
-The exofs file system is based on ext2 v0.5b (distributed with the Linux kernel
-version 2.6.10). All files include the original copyrights, and the license
-is GPL version 2 (only version 2, as is true for the Linux kernel). The
-Linux kernel can be downloaded from www.kernel.org.
diff --git a/Documentation/filesystems/ext2.txt b/Documentation/filesystems/ext2.txt
index a19973a4dd1e..94c2cf0292f5 100644
--- a/Documentation/filesystems/ext2.txt
+++ b/Documentation/filesystems/ext2.txt
@@ -57,7 +57,13 @@ noacl Don't support POSIX ACLs.
nobh Do not attach buffer_heads to file pagecache.
-grpquota,noquota,quota,usrquota Quota options are silently ignored by ext2.
+quota, usrquota Enable user disk quota support
+ (requires CONFIG_QUOTA).
+
+grpquota Enable group disk quota support
+ (requires CONFIG_QUOTA).
+
+noquota option ls silently ignored by ext2.
Specification
diff --git a/Documentation/filesystems/ext4/index.rst b/Documentation/filesystems/ext4/index.rst
index 3be3e54d480d..705d813d558f 100644
--- a/Documentation/filesystems/ext4/index.rst
+++ b/Documentation/filesystems/ext4/index.rst
@@ -8,7 +8,7 @@ ext4 Data Structures and Algorithms
:maxdepth: 6
:numbered:
- about.rst
- overview.rst
- globals.rst
- dynamic.rst
+ about
+ overview
+ globals
+ dynamic
diff --git a/Documentation/filesystems/f2fs.txt b/Documentation/filesystems/f2fs.txt
index e46c2147ddf8..496fa28b2492 100644
--- a/Documentation/filesystems/f2fs.txt
+++ b/Documentation/filesystems/f2fs.txt
@@ -126,6 +126,8 @@ disable_ext_identify Disable the extension list configured by mkfs, so f2fs
does not aware of cold files such as media files.
inline_xattr Enable the inline xattrs feature.
noinline_xattr Disable the inline xattrs feature.
+inline_xattr_size=%u Support configuring inline xattr size, it depends on
+ flexible inline xattr feature.
inline_data Enable the inline data feature: New created small(<~3.4k)
files can be written into inode block.
inline_dentry Enable the inline dir feature: data in new created
@@ -212,11 +214,22 @@ fsync_mode=%s Control the policy of fsync. Currently supports "posix",
non-atomic files likewise "nobarrier" mount option.
test_dummy_encryption Enable dummy encryption, which provides a fake fscrypt
context. The fake fscrypt context is used by xfstests.
-checkpoint=%s Set to "disable" to turn off checkpointing. Set to "enable"
+checkpoint=%s[:%u[%]] Set to "disable" to turn off checkpointing. Set to "enable"
to reenable checkpointing. Is enabled by default. While
disabled, any unmounting or unexpected shutdowns will cause
the filesystem contents to appear as they did when the
filesystem was mounted with that option.
+ While mounting with checkpoint=disabled, the filesystem must
+ run garbage collection to ensure that all available space can
+ be used. If this takes too much time, the mount may return
+ EAGAIN. You may optionally add a value to indicate how much
+ of the disk you would be willing to temporarily give up to
+ avoid additional garbage collection. This can be given as a
+ number of blocks, or as a percent. For instance, mounting
+ with checkpoint=disable:100% would always succeed, but it may
+ hide up to all remaining free space. The actual space that
+ would be unusable can be viewed at /sys/fs/f2fs/<disk>/unusable
+ This space is reclaimed once checkpoint=enable.
================================================================================
DEBUGFS ENTRIES
@@ -244,11 +257,14 @@ Files in /sys/fs/f2fs/<devname>
..............................................................................
File Content
- gc_max_sleep_time This tuning parameter controls the maximum sleep
+ gc_urgent_sleep_time This parameter controls sleep time for gc_urgent.
+ 500 ms is set by default. See above gc_urgent.
+
+ gc_min_sleep_time This tuning parameter controls the minimum sleep
time for the garbage collection thread. Time is
in milliseconds.
- gc_min_sleep_time This tuning parameter controls the minimum sleep
+ gc_max_sleep_time This tuning parameter controls the maximum sleep
time for the garbage collection thread. Time is
in milliseconds.
@@ -268,9 +284,6 @@ Files in /sys/fs/f2fs/<devname>
to 1, background thread starts to do GC by given
gc_urgent_sleep_time interval.
- gc_urgent_sleep_time This parameter controls sleep time for gc_urgent.
- 500 ms is set by default. See above gc_urgent.
-
reclaim_segments This parameter controls the number of prefree
segments to be reclaimed. If the number of prefree
segments is larger than the number of segments
@@ -285,7 +298,16 @@ Files in /sys/fs/f2fs/<devname>
checkpoint is triggered, and issued during the
checkpoint. By default, it is disabled with 0.
- trim_sections This parameter controls the number of sections
+ discard_granularity This parameter controls the granularity of discard
+ command size. It will issue discard commands iif
+ the size is larger than given granularity. Its
+ unit size is 4KB, and 4 (=16KB) is set by default.
+ The maximum value is 128 (=512KB).
+
+ reserved_blocks This parameter indicates the number of blocks that
+ f2fs reserves internally for root.
+
+ batched_trim_sections This parameter controls the number of sections
to be trimmed out in batch mode when FITRIM
conducts. 32 sections is set by default.
@@ -307,11 +329,35 @@ Files in /sys/fs/f2fs/<devname>
the number is less than this value, it triggers
in-place-updates.
+ min_seq_blocks This parameter controls the threshold to serialize
+ write IOs issued by multiple threads in parallel.
+
+ min_hot_blocks This parameter controls the threshold to allocate
+ a hot data log for pending data blocks to write.
+
+ min_ssr_sections This parameter adds the threshold when deciding
+ SSR block allocation. If this is large, SSR mode
+ will be enabled early.
+
+ ram_thresh This parameter controls the memory footprint used
+ by free nids and cached nat entries. By default,
+ 10 is set, which indicates 10 MB / 1 GB RAM.
+
+ ra_nid_pages When building free nids, F2FS reads NAT blocks
+ ahead for speed up. Default is 0.
+
+ dirty_nats_ratio Given dirty ratio of cached nat entries, F2FS
+ determines flushing them in background.
+
max_victim_search This parameter controls the number of trials to
find a victim segment when conducting SSR and
cleaning operations. The default value is 4096
which covers 8GB block address range.
+ migration_granularity For large-sized sections, F2FS can stop GC given
+ this granularity instead of reclaiming entire
+ section.
+
dir_level This parameter controls the directory level to
support large directory. If a directory has a
number of files, it can reduce the file lookup
@@ -319,9 +365,53 @@ Files in /sys/fs/f2fs/<devname>
Otherwise, it needs to decrease this value to
reduce the space overhead. The default value is 0.
- ram_thresh This parameter controls the memory footprint used
- by free nids and cached nat entries. By default,
- 10 is set, which indicates 10 MB / 1 GB RAM.
+ cp_interval F2FS tries to do checkpoint periodically, 60 secs
+ by default.
+
+ idle_interval F2FS detects system is idle, if there's no F2FS
+ operations during given interval, 5 secs by
+ default.
+
+ discard_idle_interval F2FS detects the discard thread is idle, given
+ time interval. Default is 5 secs.
+
+ gc_idle_interval F2FS detects the GC thread is idle, given time
+ interval. Default is 5 secs.
+
+ umount_discard_timeout When unmounting the disk, F2FS waits for finishing
+ queued discard commands which can take huge time.
+ This gives time out for it, 5 secs by default.
+
+ iostat_enable This controls to enable/disable iostat in F2FS.
+
+ readdir_ra This enables/disabled readahead of inode blocks
+ in readdir, and default is enabled.
+
+ gc_pin_file_thresh This indicates how many GC can be failed for the
+ pinned file. If it exceeds this, F2FS doesn't
+ guarantee its pinning state. 2048 trials is set
+ by default.
+
+ extension_list This enables to change extension_list for hot/cold
+ files in runtime.
+
+ inject_rate This controls injection rate of arbitrary faults.
+
+ inject_type This controls injection type of arbitrary faults.
+
+ dirty_segments This shows # of dirty segments.
+
+ lifetime_write_kbytes This shows # of data written to the disk.
+
+ features This shows current features enabled on F2FS.
+
+ current_reserved_blocks This shows # of blocks currently reserved.
+
+ unusable If checkpoint=disable, this shows the number of
+ blocks that are unusable.
+ If checkpoint=enable it shows the number of blocks
+ that would be unusable if checkpoint=disable were
+ to be set.
================================================================================
USAGE
@@ -714,3 +804,28 @@ WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_NOT_SET
WRITE_LIFE_NONE " WRITE_LIFE_NONE
WRITE_LIFE_MEDIUM " WRITE_LIFE_MEDIUM
WRITE_LIFE_LONG " WRITE_LIFE_LONG
+
+Fallocate(2) Policy
+-------------------
+
+The default policy follows the below posix rule.
+
+Allocating disk space
+ The default operation (i.e., mode is zero) of fallocate() allocates
+ the disk space within the range specified by offset and len. The
+ file size (as reported by stat(2)) will be changed if offset+len is
+ greater than the file size. Any subregion within the range specified
+ by offset and len that did not contain data before the call will be
+ initialized to zero. This default behavior closely resembles the
+ behavior of the posix_fallocate(3) library function, and is intended
+ as a method of optimally implementing that function.
+
+However, once F2FS receives ioctl(fd, F2FS_IOC_SET_PIN_FILE) in prior to
+fallocate(fd, DEFAULT_MODE), it allocates on-disk blocks addressess having
+zero or random data, which is useful to the below scenario where:
+ 1. create(fd)
+ 2. ioctl(fd, F2FS_IOC_SET_PIN_FILE)
+ 3. fallocate(fd, 0, 0, size)
+ 4. address = fibmap(fd, offset)
+ 5. open(blkdev)
+ 6. write(blkdev, address)
diff --git a/Documentation/filesystems/fscrypt.rst b/Documentation/filesystems/fscrypt.rst
index 3a7b60521b94..82efa41b0e6c 100644
--- a/Documentation/filesystems/fscrypt.rst
+++ b/Documentation/filesystems/fscrypt.rst
@@ -191,7 +191,9 @@ Currently, the following pairs of encryption modes are supported:
If unsure, you should use the (AES-256-XTS, AES-256-CTS-CBC) pair.
AES-128-CBC was added only for low-powered embedded devices with
-crypto accelerators such as CAAM or CESA that do not support XTS.
+crypto accelerators such as CAAM or CESA that do not support XTS. To
+use AES-128-CBC, CONFIG_CRYPTO_SHA256 (or another SHA-256
+implementation) must be enabled so that ESSIV can be used.
Adiantum is a (primarily) stream cipher-based mode that is fast even
on CPUs without dedicated crypto instructions. It's also a true
@@ -343,9 +345,9 @@ FS_IOC_SET_ENCRYPTION_POLICY can fail with the following errors:
- ``ENOTEMPTY``: the file is unencrypted and is a nonempty directory
- ``ENOTTY``: this type of filesystem does not implement encryption
- ``EOPNOTSUPP``: the kernel was not configured with encryption
- support for this filesystem, or the filesystem superblock has not
+ support for filesystems, or the filesystem superblock has not
had encryption enabled on it. (For example, to use encryption on an
- ext4 filesystem, CONFIG_EXT4_ENCRYPTION must be enabled in the
+ ext4 filesystem, CONFIG_FS_ENCRYPTION must be enabled in the
kernel config, and the superblock must have had the "encrypt"
feature flag enabled using ``tune2fs -O encrypt`` or ``mkfs.ext4 -O
encrypt``.)
@@ -451,10 +453,18 @@ astute users may notice some differences in behavior:
- Unencrypted files, or files encrypted with a different encryption
policy (i.e. different key, modes, or flags), cannot be renamed or
linked into an encrypted directory; see `Encryption policy
- enforcement`_. Attempts to do so will fail with EPERM. However,
+ enforcement`_. Attempts to do so will fail with EXDEV. However,
encrypted files can be renamed within an encrypted directory, or
into an unencrypted directory.
+ Note: "moving" an unencrypted file into an encrypted directory, e.g.
+ with the `mv` program, is implemented in userspace by a copy
+ followed by a delete. Be aware that the original unencrypted data
+ may remain recoverable from free space on the disk; prefer to keep
+ all files encrypted from the very beginning. The `shred` program
+ may be used to overwrite the source files but isn't guaranteed to be
+ effective on all filesystems and storage devices.
+
- Direct I/O is not supported on encrypted files. Attempts to use
direct I/O on such files will fall back to buffered I/O.
@@ -541,7 +551,7 @@ not be encrypted.
Except for those special files, it is forbidden to have unencrypted
files, or files encrypted with a different encryption policy, in an
encrypted directory tree. Attempts to link or rename such a file into
-an encrypted directory will fail with EPERM. This is also enforced
+an encrypted directory will fail with EXDEV. This is also enforced
during ->lookup() to provide limited protection against offline
attacks that try to disable or downgrade encryption in known locations
where applications may later write sensitive data. It is recommended
@@ -639,3 +649,42 @@ Note that the precise way that filenames are presented to userspace
without the key is subject to change in the future. It is only meant
as a way to temporarily present valid filenames so that commands like
``rm -r`` work as expected on encrypted directories.
+
+Tests
+=====
+
+To test fscrypt, use xfstests, which is Linux's de facto standard
+filesystem test suite. First, run all the tests in the "encrypt"
+group on the relevant filesystem(s). For example, to test ext4 and
+f2fs encryption using `kvm-xfstests
+<https://github.com/tytso/xfstests-bld/blob/master/Documentation/kvm-quickstart.md>`_::
+
+ kvm-xfstests -c ext4,f2fs -g encrypt
+
+UBIFS encryption can also be tested this way, but it should be done in
+a separate command, and it takes some time for kvm-xfstests to set up
+emulated UBI volumes::
+
+ kvm-xfstests -c ubifs -g encrypt
+
+No tests should fail. However, tests that use non-default encryption
+modes (e.g. generic/549 and generic/550) will be skipped if the needed
+algorithms were not built into the kernel's crypto API. Also, tests
+that access the raw block device (e.g. generic/399, generic/548,
+generic/549, generic/550) will be skipped on UBIFS.
+
+Besides running the "encrypt" group tests, for ext4 and f2fs it's also
+possible to run most xfstests with the "test_dummy_encryption" mount
+option. This option causes all new files to be automatically
+encrypted with a dummy key, without having to make any API calls.
+This tests the encrypted I/O paths more thoroughly. To do this with
+kvm-xfstests, use the "encrypt" filesystem configuration::
+
+ kvm-xfstests -c ext4/encrypt,f2fs/encrypt -g auto
+
+Because this runs many more tests than "-g encrypt" does, it takes
+much longer to run; so also consider using `gce-xfstests
+<https://github.com/tytso/xfstests-bld/blob/master/Documentation/gce-xfstests.md>`_
+instead of kvm-xfstests::
+
+ gce-xfstests -c ext4/encrypt,f2fs/encrypt -g auto
diff --git a/Documentation/filesystems/index.rst b/Documentation/filesystems/index.rst
index 605befab300b..2de2fe2ab078 100644
--- a/Documentation/filesystems/index.rst
+++ b/Documentation/filesystems/index.rst
@@ -1,382 +1,34 @@
-=====================
-Linux Filesystems API
-=====================
+===============================
+Filesystems in the Linux kernel
+===============================
-The Linux VFS
-=============
+This under-development manual will, some glorious day, provide
+comprehensive information on how the Linux virtual filesystem (VFS) layer
+works, along with the filesystems that sit below it. For now, what we have
+can be found below.
-The Filesystem types
---------------------
-
-.. kernel-doc:: include/linux/fs.h
- :internal:
-
-The Directory Cache
--------------------
-
-.. kernel-doc:: fs/dcache.c
- :export:
-
-.. kernel-doc:: include/linux/dcache.h
- :internal:
-
-Inode Handling
---------------
-
-.. kernel-doc:: fs/inode.c
- :export:
-
-.. kernel-doc:: fs/bad_inode.c
- :export:
-
-Registration and Superblocks
-----------------------------
-
-.. kernel-doc:: fs/super.c
- :export:
-
-File Locks
-----------
-
-.. kernel-doc:: fs/locks.c
- :export:
-
-.. kernel-doc:: fs/locks.c
- :internal:
-
-Other Functions
----------------
-
-.. kernel-doc:: fs/mpage.c
- :export:
-
-.. kernel-doc:: fs/namei.c
- :export:
-
-.. kernel-doc:: fs/buffer.c
- :export:
-
-.. kernel-doc:: block/bio.c
- :export:
-
-.. kernel-doc:: fs/seq_file.c
- :export:
-
-.. kernel-doc:: fs/filesystems.c
- :export:
-
-.. kernel-doc:: fs/fs-writeback.c
- :export:
-
-.. kernel-doc:: fs/block_dev.c
- :export:
-
-.. kernel-doc:: fs/anon_inodes.c
- :export:
-
-.. kernel-doc:: fs/attr.c
- :export:
-
-.. kernel-doc:: fs/d_path.c
- :export:
-
-.. kernel-doc:: fs/dax.c
- :export:
-
-.. kernel-doc:: fs/direct-io.c
- :export:
-
-.. kernel-doc:: fs/file_table.c
- :export:
-
-.. kernel-doc:: fs/libfs.c
- :export:
-
-.. kernel-doc:: fs/posix_acl.c
- :export:
-
-.. kernel-doc:: fs/stat.c
- :export:
-
-.. kernel-doc:: fs/sync.c
- :export:
-
-.. kernel-doc:: fs/xattr.c
- :export:
-
-The proc filesystem
-===================
-
-sysctl interface
-----------------
-
-.. kernel-doc:: kernel/sysctl.c
- :export:
-
-proc filesystem interface
--------------------------
-
-.. kernel-doc:: fs/proc/base.c
- :internal:
-
-Events based on file descriptors
-================================
-
-.. kernel-doc:: fs/eventfd.c
- :export:
-
-The Filesystem for Exporting Kernel Objects
-===========================================
-
-.. kernel-doc:: fs/sysfs/file.c
- :export:
-
-.. kernel-doc:: fs/sysfs/symlink.c
- :export:
-
-The debugfs filesystem
+Core VFS documentation
======================
-debugfs interface
------------------
-
-.. kernel-doc:: fs/debugfs/inode.c
- :export:
-
-.. kernel-doc:: fs/debugfs/file.c
- :export:
-
-The Linux Journalling API
-=========================
-
-Overview
---------
-
-Details
-~~~~~~~
-
-The journalling layer is easy to use. You need to first of all create a
-journal_t data structure. There are two calls to do this dependent on
-how you decide to allocate the physical media on which the journal
-resides. The :c:func:`jbd2_journal_init_inode` call is for journals stored in
-filesystem inodes, or the :c:func:`jbd2_journal_init_dev` call can be used
-for journal stored on a raw device (in a continuous range of blocks). A
-journal_t is a typedef for a struct pointer, so when you are finally
-finished make sure you call :c:func:`jbd2_journal_destroy` on it to free up
-any used kernel memory.
-
-Once you have got your journal_t object you need to 'mount' or load the
-journal file. The journalling layer expects the space for the journal
-was already allocated and initialized properly by the userspace tools.
-When loading the journal you must call :c:func:`jbd2_journal_load` to process
-journal contents. If the client file system detects the journal contents
-does not need to be processed (or even need not have valid contents), it
-may call :c:func:`jbd2_journal_wipe` to clear the journal contents before
-calling :c:func:`jbd2_journal_load`.
-
-Note that jbd2_journal_wipe(..,0) calls
-:c:func:`jbd2_journal_skip_recovery` for you if it detects any outstanding
-transactions in the journal and similarly :c:func:`jbd2_journal_load` will
-call :c:func:`jbd2_journal_recover` if necessary. I would advise reading
-:c:func:`ext4_load_journal` in fs/ext4/super.c for examples on this stage.
-
-Now you can go ahead and start modifying the underlying filesystem.
-Almost.
-
-You still need to actually journal your filesystem changes, this is done
-by wrapping them into transactions. Additionally you also need to wrap
-the modification of each of the buffers with calls to the journal layer,
-so it knows what the modifications you are actually making are. To do
-this use :c:func:`jbd2_journal_start` which returns a transaction handle.
-
-:c:func:`jbd2_journal_start` and its counterpart :c:func:`jbd2_journal_stop`,
-which indicates the end of a transaction are nestable calls, so you can
-reenter a transaction if necessary, but remember you must call
-:c:func:`jbd2_journal_stop` the same number of times as
-:c:func:`jbd2_journal_start` before the transaction is completed (or more
-accurately leaves the update phase). Ext4/VFS makes use of this feature to
-simplify handling of inode dirtying, quota support, etc.
-
-Inside each transaction you need to wrap the modifications to the
-individual buffers (blocks). Before you start to modify a buffer you
-need to call :c:func:`jbd2_journal_get_create_access()` /
-:c:func:`jbd2_journal_get_write_access()` /
-:c:func:`jbd2_journal_get_undo_access()` as appropriate, this allows the
-journalling layer to copy the unmodified
-data if it needs to. After all the buffer may be part of a previously
-uncommitted transaction. At this point you are at last ready to modify a
-buffer, and once you are have done so you need to call
-:c:func:`jbd2_journal_dirty_metadata`. Or if you've asked for access to a
-buffer you now know is now longer required to be pushed back on the
-device you can call :c:func:`jbd2_journal_forget` in much the same way as you
-might have used :c:func:`bforget` in the past.
-
-A :c:func:`jbd2_journal_flush` may be called at any time to commit and
-checkpoint all your transactions.
-
-Then at umount time , in your :c:func:`put_super` you can then call
-:c:func:`jbd2_journal_destroy` to clean up your in-core journal object.
-
-Unfortunately there a couple of ways the journal layer can cause a
-deadlock. The first thing to note is that each task can only have a
-single outstanding transaction at any one time, remember nothing commits
-until the outermost :c:func:`jbd2_journal_stop`. This means you must complete
-the transaction at the end of each file/inode/address etc. operation you
-perform, so that the journalling system isn't re-entered on another
-journal. Since transactions can't be nested/batched across differing
-journals, and another filesystem other than yours (say ext4) may be
-modified in a later syscall.
-
-The second case to bear in mind is that :c:func:`jbd2_journal_start` can block
-if there isn't enough space in the journal for your transaction (based
-on the passed nblocks param) - when it blocks it merely(!) needs to wait
-for transactions to complete and be committed from other tasks, so
-essentially we are waiting for :c:func:`jbd2_journal_stop`. So to avoid
-deadlocks you must treat :c:func:`jbd2_journal_start` /
-:c:func:`jbd2_journal_stop` as if they were semaphores and include them in
-your semaphore ordering rules to prevent
-deadlocks. Note that :c:func:`jbd2_journal_extend` has similar blocking
-behaviour to :c:func:`jbd2_journal_start` so you can deadlock here just as
-easily as on :c:func:`jbd2_journal_start`.
-
-Try to reserve the right number of blocks the first time. ;-). This will
-be the maximum number of blocks you are going to touch in this
-transaction. I advise having a look at at least ext4_jbd.h to see the
-basis on which ext4 uses to make these decisions.
-
-Another wriggle to watch out for is your on-disk block allocation
-strategy. Why? Because, if you do a delete, you need to ensure you
-haven't reused any of the freed blocks until the transaction freeing
-these blocks commits. If you reused these blocks and crash happens,
-there is no way to restore the contents of the reallocated blocks at the
-end of the last fully committed transaction. One simple way of doing
-this is to mark blocks as free in internal in-memory block allocation
-structures only after the transaction freeing them commits. Ext4 uses
-journal commit callback for this purpose.
-
-With journal commit callbacks you can ask the journalling layer to call
-a callback function when the transaction is finally committed to disk,
-so that you can do some of your own management. You ask the journalling
-layer for calling the callback by simply setting
-``journal->j_commit_callback`` function pointer and that function is
-called after each transaction commit. You can also use
-``transaction->t_private_list`` for attaching entries to a transaction
-that need processing when the transaction commits.
-
-JBD2 also provides a way to block all transaction updates via
-:c:func:`jbd2_journal_lock_updates()` /
-:c:func:`jbd2_journal_unlock_updates()`. Ext4 uses this when it wants a
-window with a clean and stable fs for a moment. E.g.
-
-::
-
-
- jbd2_journal_lock_updates() //stop new stuff happening..
- jbd2_journal_flush() // checkpoint everything.
- ..do stuff on stable fs
- jbd2_journal_unlock_updates() // carry on with filesystem use.
-
-The opportunities for abuse and DOS attacks with this should be obvious,
-if you allow unprivileged userspace to trigger codepaths containing
-these calls.
-
-Summary
-~~~~~~~
-
-Using the journal is a matter of wrapping the different context changes,
-being each mount, each modification (transaction) and each changed
-buffer to tell the journalling layer about them.
-
-Data Types
-----------
-
-The journalling layer uses typedefs to 'hide' the concrete definitions
-of the structures used. As a client of the JBD2 layer you can just rely
-on the using the pointer as a magic cookie of some sort. Obviously the
-hiding is not enforced as this is 'C'.
-
-Structures
-~~~~~~~~~~
-
-.. kernel-doc:: include/linux/jbd2.h
- :internal:
-
-Functions
----------
-
-The functions here are split into two groups those that affect a journal
-as a whole, and those which are used to manage transactions
-
-Journal Level
-~~~~~~~~~~~~~
-
-.. kernel-doc:: fs/jbd2/journal.c
- :export:
-
-.. kernel-doc:: fs/jbd2/recovery.c
- :internal:
-
-Transasction Level
-~~~~~~~~~~~~~~~~~~
-
-.. kernel-doc:: fs/jbd2/transaction.c
-
-See also
---------
-
-`Journaling the Linux ext2fs Filesystem, LinuxExpo 98, Stephen
-Tweedie <http://kernel.org/pub/linux/kernel/people/sct/ext3/journal-design.ps.gz>`__
-
-`Ext3 Journalling FileSystem, OLS 2000, Dr. Stephen
-Tweedie <http://olstrans.sourceforge.net/release/OLS2000-ext3/OLS2000-ext3.html>`__
-
-splice API
-==========
-
-splice is a method for moving blocks of data around inside the kernel,
-without continually transferring them between the kernel and user space.
-
-.. kernel-doc:: fs/splice.c
-
-pipes API
-=========
-
-Pipe interfaces are all for in-kernel (builtin image) use. They are not
-exported for use by modules.
-
-.. kernel-doc:: include/linux/pipe_fs_i.h
- :internal:
-
-.. kernel-doc:: fs/pipe.c
-
-Encryption API
-==============
-
-A library which filesystems can hook into to support transparent
-encryption of files and directories.
+See these manuals for documentation about the VFS layer itself and how its
+algorithms work.
.. toctree::
- :maxdepth: 2
-
- fscrypt
-
-Pathname lookup
-===============
-
-
-This write-up is based on three articles published at lwn.net:
+ :maxdepth: 2
-- <https://lwn.net/Articles/649115/> Pathname lookup in Linux
-- <https://lwn.net/Articles/649729/> RCU-walk: faster pathname lookup in Linux
-- <https://lwn.net/Articles/650786/> A walk among the symlinks
+ vfs
+ path-lookup
+ api-summary
+ splice
-Written by Neil Brown with help from Al Viro and Jon Corbet.
-It has subsequently been updated to reflect changes in the kernel
-including:
+Filesystem support layers
+=========================
-- per-directory parallel name lookup.
+Documentation for the support code within the filesystem layer for use in
+filesystem implementations.
.. toctree::
:maxdepth: 2
- path-lookup.rst
+ journalling
+ fscrypt
diff --git a/Documentation/filesystems/journalling.rst b/Documentation/filesystems/journalling.rst
new file mode 100644
index 000000000000..58ce6b395206
--- /dev/null
+++ b/Documentation/filesystems/journalling.rst
@@ -0,0 +1,184 @@
+The Linux Journalling API
+=========================
+
+Overview
+--------
+
+Details
+~~~~~~~
+
+The journalling layer is easy to use. You need to first of all create a
+journal_t data structure. There are two calls to do this dependent on
+how you decide to allocate the physical media on which the journal
+resides. The :c:func:`jbd2_journal_init_inode` call is for journals stored in
+filesystem inodes, or the :c:func:`jbd2_journal_init_dev` call can be used
+for journal stored on a raw device (in a continuous range of blocks). A
+journal_t is a typedef for a struct pointer, so when you are finally
+finished make sure you call :c:func:`jbd2_journal_destroy` on it to free up
+any used kernel memory.
+
+Once you have got your journal_t object you need to 'mount' or load the
+journal file. The journalling layer expects the space for the journal
+was already allocated and initialized properly by the userspace tools.
+When loading the journal you must call :c:func:`jbd2_journal_load` to process
+journal contents. If the client file system detects the journal contents
+does not need to be processed (or even need not have valid contents), it
+may call :c:func:`jbd2_journal_wipe` to clear the journal contents before
+calling :c:func:`jbd2_journal_load`.
+
+Note that jbd2_journal_wipe(..,0) calls
+:c:func:`jbd2_journal_skip_recovery` for you if it detects any outstanding
+transactions in the journal and similarly :c:func:`jbd2_journal_load` will
+call :c:func:`jbd2_journal_recover` if necessary. I would advise reading
+:c:func:`ext4_load_journal` in fs/ext4/super.c for examples on this stage.
+
+Now you can go ahead and start modifying the underlying filesystem.
+Almost.
+
+You still need to actually journal your filesystem changes, this is done
+by wrapping them into transactions. Additionally you also need to wrap
+the modification of each of the buffers with calls to the journal layer,
+so it knows what the modifications you are actually making are. To do
+this use :c:func:`jbd2_journal_start` which returns a transaction handle.
+
+:c:func:`jbd2_journal_start` and its counterpart :c:func:`jbd2_journal_stop`,
+which indicates the end of a transaction are nestable calls, so you can
+reenter a transaction if necessary, but remember you must call
+:c:func:`jbd2_journal_stop` the same number of times as
+:c:func:`jbd2_journal_start` before the transaction is completed (or more
+accurately leaves the update phase). Ext4/VFS makes use of this feature to
+simplify handling of inode dirtying, quota support, etc.
+
+Inside each transaction you need to wrap the modifications to the
+individual buffers (blocks). Before you start to modify a buffer you
+need to call :c:func:`jbd2_journal_get_create_access()` /
+:c:func:`jbd2_journal_get_write_access()` /
+:c:func:`jbd2_journal_get_undo_access()` as appropriate, this allows the
+journalling layer to copy the unmodified
+data if it needs to. After all the buffer may be part of a previously
+uncommitted transaction. At this point you are at last ready to modify a
+buffer, and once you are have done so you need to call
+:c:func:`jbd2_journal_dirty_metadata`. Or if you've asked for access to a
+buffer you now know is now longer required to be pushed back on the
+device you can call :c:func:`jbd2_journal_forget` in much the same way as you
+might have used :c:func:`bforget` in the past.
+
+A :c:func:`jbd2_journal_flush` may be called at any time to commit and
+checkpoint all your transactions.
+
+Then at umount time , in your :c:func:`put_super` you can then call
+:c:func:`jbd2_journal_destroy` to clean up your in-core journal object.
+
+Unfortunately there a couple of ways the journal layer can cause a
+deadlock. The first thing to note is that each task can only have a
+single outstanding transaction at any one time, remember nothing commits
+until the outermost :c:func:`jbd2_journal_stop`. This means you must complete
+the transaction at the end of each file/inode/address etc. operation you
+perform, so that the journalling system isn't re-entered on another
+journal. Since transactions can't be nested/batched across differing
+journals, and another filesystem other than yours (say ext4) may be
+modified in a later syscall.
+
+The second case to bear in mind is that :c:func:`jbd2_journal_start` can block
+if there isn't enough space in the journal for your transaction (based
+on the passed nblocks param) - when it blocks it merely(!) needs to wait
+for transactions to complete and be committed from other tasks, so
+essentially we are waiting for :c:func:`jbd2_journal_stop`. So to avoid
+deadlocks you must treat :c:func:`jbd2_journal_start` /
+:c:func:`jbd2_journal_stop` as if they were semaphores and include them in
+your semaphore ordering rules to prevent
+deadlocks. Note that :c:func:`jbd2_journal_extend` has similar blocking
+behaviour to :c:func:`jbd2_journal_start` so you can deadlock here just as
+easily as on :c:func:`jbd2_journal_start`.
+
+Try to reserve the right number of blocks the first time. ;-). This will
+be the maximum number of blocks you are going to touch in this
+transaction. I advise having a look at at least ext4_jbd.h to see the
+basis on which ext4 uses to make these decisions.
+
+Another wriggle to watch out for is your on-disk block allocation
+strategy. Why? Because, if you do a delete, you need to ensure you
+haven't reused any of the freed blocks until the transaction freeing
+these blocks commits. If you reused these blocks and crash happens,
+there is no way to restore the contents of the reallocated blocks at the
+end of the last fully committed transaction. One simple way of doing
+this is to mark blocks as free in internal in-memory block allocation
+structures only after the transaction freeing them commits. Ext4 uses
+journal commit callback for this purpose.
+
+With journal commit callbacks you can ask the journalling layer to call
+a callback function when the transaction is finally committed to disk,
+so that you can do some of your own management. You ask the journalling
+layer for calling the callback by simply setting
+``journal->j_commit_callback`` function pointer and that function is
+called after each transaction commit. You can also use
+``transaction->t_private_list`` for attaching entries to a transaction
+that need processing when the transaction commits.
+
+JBD2 also provides a way to block all transaction updates via
+:c:func:`jbd2_journal_lock_updates()` /
+:c:func:`jbd2_journal_unlock_updates()`. Ext4 uses this when it wants a
+window with a clean and stable fs for a moment. E.g.
+
+::
+
+
+ jbd2_journal_lock_updates() //stop new stuff happening..
+ jbd2_journal_flush() // checkpoint everything.
+ ..do stuff on stable fs
+ jbd2_journal_unlock_updates() // carry on with filesystem use.
+
+The opportunities for abuse and DOS attacks with this should be obvious,
+if you allow unprivileged userspace to trigger codepaths containing
+these calls.
+
+Summary
+~~~~~~~
+
+Using the journal is a matter of wrapping the different context changes,
+being each mount, each modification (transaction) and each changed
+buffer to tell the journalling layer about them.
+
+Data Types
+----------
+
+The journalling layer uses typedefs to 'hide' the concrete definitions
+of the structures used. As a client of the JBD2 layer you can just rely
+on the using the pointer as a magic cookie of some sort. Obviously the
+hiding is not enforced as this is 'C'.
+
+Structures
+~~~~~~~~~~
+
+.. kernel-doc:: include/linux/jbd2.h
+ :internal:
+
+Functions
+---------
+
+The functions here are split into two groups those that affect a journal
+as a whole, and those which are used to manage transactions
+
+Journal Level
+~~~~~~~~~~~~~
+
+.. kernel-doc:: fs/jbd2/journal.c
+ :export:
+
+.. kernel-doc:: fs/jbd2/recovery.c
+ :internal:
+
+Transasction Level
+~~~~~~~~~~~~~~~~~~
+
+.. kernel-doc:: fs/jbd2/transaction.c
+
+See also
+--------
+
+`Journaling the Linux ext2fs Filesystem, LinuxExpo 98, Stephen
+Tweedie <http://kernel.org/pub/linux/kernel/people/sct/ext3/journal-design.ps.gz>`__
+
+`Ext3 Journalling FileSystem, OLS 2000, Dr. Stephen
+Tweedie <http://olstrans.sourceforge.net/release/OLS2000-ext3/OLS2000-ext3.html>`__
+
diff --git a/Documentation/filesystems/mount_api.txt b/Documentation/filesystems/mount_api.txt
new file mode 100644
index 000000000000..00ff0cfccfa7
--- /dev/null
+++ b/Documentation/filesystems/mount_api.txt
@@ -0,0 +1,732 @@
+ ====================
+ FILESYSTEM MOUNT API
+ ====================
+
+CONTENTS
+
+ (1) Overview.
+
+ (2) The filesystem context.
+
+ (3) The filesystem context operations.
+
+ (4) Filesystem context security.
+
+ (5) VFS filesystem context API.
+
+ (6) Superblock creation helpers.
+
+ (7) Parameter description.
+
+ (8) Parameter helper functions.
+
+
+========
+OVERVIEW
+========
+
+The creation of new mounts is now to be done in a multistep process:
+
+ (1) Create a filesystem context.
+
+ (2) Parse the parameters and attach them to the context. Parameters are
+ expected to be passed individually from userspace, though legacy binary
+ parameters can also be handled.
+
+ (3) Validate and pre-process the context.
+
+ (4) Get or create a superblock and mountable root.
+
+ (5) Perform the mount.
+
+ (6) Return an error message attached to the context.
+
+ (7) Destroy the context.
+
+To support this, the file_system_type struct gains two new fields:
+
+ int (*init_fs_context)(struct fs_context *fc);
+ const struct fs_parameter_description *parameters;
+
+The first is invoked to set up the filesystem-specific parts of a filesystem
+context, including the additional space, and the second points to the
+parameter description for validation at registration time and querying by a
+future system call.
+
+Note that security initialisation is done *after* the filesystem is called so
+that the namespaces may be adjusted first.
+
+
+======================
+THE FILESYSTEM CONTEXT
+======================
+
+The creation and reconfiguration of a superblock is governed by a filesystem
+context. This is represented by the fs_context structure:
+
+ struct fs_context {
+ const struct fs_context_operations *ops;
+ struct file_system_type *fs_type;
+ void *fs_private;
+ struct dentry *root;
+ struct user_namespace *user_ns;
+ struct net *net_ns;
+ const struct cred *cred;
+ char *source;
+ char *subtype;
+ void *security;
+ void *s_fs_info;
+ unsigned int sb_flags;
+ unsigned int sb_flags_mask;
+ unsigned int s_iflags;
+ unsigned int lsm_flags;
+ enum fs_context_purpose purpose:8;
+ ...
+ };
+
+The fs_context fields are as follows:
+
+ (*) const struct fs_context_operations *ops
+
+ These are operations that can be done on a filesystem context (see
+ below). This must be set by the ->init_fs_context() file_system_type
+ operation.
+
+ (*) struct file_system_type *fs_type
+
+ A pointer to the file_system_type of the filesystem that is being
+ constructed or reconfigured. This retains a reference on the type owner.
+
+ (*) void *fs_private
+
+ A pointer to the file system's private data. This is where the filesystem
+ will need to store any options it parses.
+
+ (*) struct dentry *root
+
+ A pointer to the root of the mountable tree (and indirectly, the
+ superblock thereof). This is filled in by the ->get_tree() op. If this
+ is set, an active reference on root->d_sb must also be held.
+
+ (*) struct user_namespace *user_ns
+ (*) struct net *net_ns
+
+ There are a subset of the namespaces in use by the invoking process. They
+ retain references on each namespace. The subscribed namespaces may be
+ replaced by the filesystem to reflect other sources, such as the parent
+ mount superblock on an automount.
+
+ (*) const struct cred *cred
+
+ The mounter's credentials. This retains a reference on the credentials.
+
+ (*) char *source
+
+ This specifies the source. It may be a block device (e.g. /dev/sda1) or
+ something more exotic, such as the "host:/path" that NFS desires.
+
+ (*) char *subtype
+
+ This is a string to be added to the type displayed in /proc/mounts to
+ qualify it (used by FUSE). This is available for the filesystem to set if
+ desired.
+
+ (*) void *security
+
+ A place for the LSMs to hang their security data for the superblock. The
+ relevant security operations are described below.
+
+ (*) void *s_fs_info
+
+ The proposed s_fs_info for a new superblock, set in the superblock by
+ sget_fc(). This can be used to distinguish superblocks.
+
+ (*) unsigned int sb_flags
+ (*) unsigned int sb_flags_mask
+
+ Which bits SB_* flags are to be set/cleared in super_block::s_flags.
+
+ (*) unsigned int s_iflags
+
+ These will be bitwise-OR'd with s->s_iflags when a superblock is created.
+
+ (*) enum fs_context_purpose
+
+ This indicates the purpose for which the context is intended. The
+ available values are:
+
+ FS_CONTEXT_FOR_MOUNT, -- New superblock for explicit mount
+ FS_CONTEXT_FOR_SUBMOUNT -- New automatic submount of extant mount
+ FS_CONTEXT_FOR_RECONFIGURE -- Change an existing mount
+
+The mount context is created by calling vfs_new_fs_context() or
+vfs_dup_fs_context() and is destroyed with put_fs_context(). Note that the
+structure is not refcounted.
+
+VFS, security and filesystem mount options are set individually with
+vfs_parse_mount_option(). Options provided by the old mount(2) system call as
+a page of data can be parsed with generic_parse_monolithic().
+
+When mounting, the filesystem is allowed to take data from any of the pointers
+and attach it to the superblock (or whatever), provided it clears the pointer
+in the mount context.
+
+The filesystem is also allowed to allocate resources and pin them with the
+mount context. For instance, NFS might pin the appropriate protocol version
+module.
+
+
+=================================
+THE FILESYSTEM CONTEXT OPERATIONS
+=================================
+
+The filesystem context points to a table of operations:
+
+ struct fs_context_operations {
+ void (*free)(struct fs_context *fc);
+ int (*dup)(struct fs_context *fc, struct fs_context *src_fc);
+ int (*parse_param)(struct fs_context *fc,
+ struct struct fs_parameter *param);
+ int (*parse_monolithic)(struct fs_context *fc, void *data);
+ int (*get_tree)(struct fs_context *fc);
+ int (*reconfigure)(struct fs_context *fc);
+ };
+
+These operations are invoked by the various stages of the mount procedure to
+manage the filesystem context. They are as follows:
+
+ (*) void (*free)(struct fs_context *fc);
+
+ Called to clean up the filesystem-specific part of the filesystem context
+ when the context is destroyed. It should be aware that parts of the
+ context may have been removed and NULL'd out by ->get_tree().
+
+ (*) int (*dup)(struct fs_context *fc, struct fs_context *src_fc);
+
+ Called when a filesystem context has been duplicated to duplicate the
+ filesystem-private data. An error may be returned to indicate failure to
+ do this.
+
+ [!] Note that even if this fails, put_fs_context() will be called
+ immediately thereafter, so ->dup() *must* make the
+ filesystem-private data safe for ->free().
+
+ (*) int (*parse_param)(struct fs_context *fc,
+ struct struct fs_parameter *param);
+
+ Called when a parameter is being added to the filesystem context. param
+ points to the key name and maybe a value object. VFS-specific options
+ will have been weeded out and fc->sb_flags updated in the context.
+ Security options will also have been weeded out and fc->security updated.
+
+ The parameter can be parsed with fs_parse() and fs_lookup_param(). Note
+ that the source(s) are presented as parameters named "source".
+
+ If successful, 0 should be returned or a negative error code otherwise.
+
+ (*) int (*parse_monolithic)(struct fs_context *fc, void *data);
+
+ Called when the mount(2) system call is invoked to pass the entire data
+ page in one go. If this is expected to be just a list of "key[=val]"
+ items separated by commas, then this may be set to NULL.
+
+ The return value is as for ->parse_param().
+
+ If the filesystem (e.g. NFS) needs to examine the data first and then
+ finds it's the standard key-val list then it may pass it off to
+ generic_parse_monolithic().
+
+ (*) int (*get_tree)(struct fs_context *fc);
+
+ Called to get or create the mountable root and superblock, using the
+ information stored in the filesystem context (reconfiguration goes via a
+ different vector). It may detach any resources it desires from the
+ filesystem context and transfer them to the superblock it creates.
+
+ On success it should set fc->root to the mountable root and return 0. In
+ the case of an error, it should return a negative error code.
+
+ The phase on a userspace-driven context will be set to only allow this to
+ be called once on any particular context.
+
+ (*) int (*reconfigure)(struct fs_context *fc);
+
+ Called to effect reconfiguration of a superblock using information stored
+ in the filesystem context. It may detach any resources it desires from
+ the filesystem context and transfer them to the superblock. The
+ superblock can be found from fc->root->d_sb.
+
+ On success it should return 0. In the case of an error, it should return
+ a negative error code.
+
+ [NOTE] reconfigure is intended as a replacement for remount_fs.
+
+
+===========================
+FILESYSTEM CONTEXT SECURITY
+===========================
+
+The filesystem context contains a security pointer that the LSMs can use for
+building up a security context for the superblock to be mounted. There are a
+number of operations used by the new mount code for this purpose:
+
+ (*) int security_fs_context_alloc(struct fs_context *fc,
+ struct dentry *reference);
+
+ Called to initialise fc->security (which is preset to NULL) and allocate
+ any resources needed. It should return 0 on success or a negative error
+ code on failure.
+
+ reference will be non-NULL if the context is being created for superblock
+ reconfiguration (FS_CONTEXT_FOR_RECONFIGURE) in which case it indicates
+ the root dentry of the superblock to be reconfigured. It will also be
+ non-NULL in the case of a submount (FS_CONTEXT_FOR_SUBMOUNT) in which case
+ it indicates the automount point.
+
+ (*) int security_fs_context_dup(struct fs_context *fc,
+ struct fs_context *src_fc);
+
+ Called to initialise fc->security (which is preset to NULL) and allocate
+ any resources needed. The original filesystem context is pointed to by
+ src_fc and may be used for reference. It should return 0 on success or a
+ negative error code on failure.
+
+ (*) void security_fs_context_free(struct fs_context *fc);
+
+ Called to clean up anything attached to fc->security. Note that the
+ contents may have been transferred to a superblock and the pointer cleared
+ during get_tree.
+
+ (*) int security_fs_context_parse_param(struct fs_context *fc,
+ struct fs_parameter *param);
+
+ Called for each mount parameter, including the source. The arguments are
+ as for the ->parse_param() method. It should return 0 to indicate that
+ the parameter should be passed on to the filesystem, 1 to indicate that
+ the parameter should be discarded or an error to indicate that the
+ parameter should be rejected.
+
+ The value pointed to by param may be modified (if a string) or stolen
+ (provided the value pointer is NULL'd out). If it is stolen, 1 must be
+ returned to prevent it being passed to the filesystem.
+
+ (*) int security_fs_context_validate(struct fs_context *fc);
+
+ Called after all the options have been parsed to validate the collection
+ as a whole and to do any necessary allocation so that
+ security_sb_get_tree() and security_sb_reconfigure() are less likely to
+ fail. It should return 0 or a negative error code.
+
+ In the case of reconfiguration, the target superblock will be accessible
+ via fc->root.
+
+ (*) int security_sb_get_tree(struct fs_context *fc);
+
+ Called during the mount procedure to verify that the specified superblock
+ is allowed to be mounted and to transfer the security data there. It
+ should return 0 or a negative error code.
+
+ (*) void security_sb_reconfigure(struct fs_context *fc);
+
+ Called to apply any reconfiguration to an LSM's context. It must not
+ fail. Error checking and resource allocation must be done in advance by
+ the parameter parsing and validation hooks.
+
+ (*) int security_sb_mountpoint(struct fs_context *fc, struct path *mountpoint,
+ unsigned int mnt_flags);
+
+ Called during the mount procedure to verify that the root dentry attached
+ to the context is permitted to be attached to the specified mountpoint.
+ It should return 0 on success or a negative error code on failure.
+
+
+==========================
+VFS FILESYSTEM CONTEXT API
+==========================
+
+There are four operations for creating a filesystem context and one for
+destroying a context:
+
+ (*) struct fs_context *fs_context_for_mount(
+ struct file_system_type *fs_type,
+ unsigned int sb_flags);
+
+ Allocate a filesystem context for the purpose of setting up a new mount,
+ whether that be with a new superblock or sharing an existing one. This
+ sets the superblock flags, initialises the security and calls
+ fs_type->init_fs_context() to initialise the filesystem private data.
+
+ fs_type specifies the filesystem type that will manage the context and
+ sb_flags presets the superblock flags stored therein.
+
+ (*) struct fs_context *fs_context_for_reconfigure(
+ struct dentry *dentry,
+ unsigned int sb_flags,
+ unsigned int sb_flags_mask);
+
+ Allocate a filesystem context for the purpose of reconfiguring an
+ existing superblock. dentry provides a reference to the superblock to be
+ configured. sb_flags and sb_flags_mask indicate which superblock flags
+ need changing and to what.
+
+ (*) struct fs_context *fs_context_for_submount(
+ struct file_system_type *fs_type,
+ struct dentry *reference);
+
+ Allocate a filesystem context for the purpose of creating a new mount for
+ an automount point or other derived superblock. fs_type specifies the
+ filesystem type that will manage the context and the reference dentry
+ supplies the parameters. Namespaces are propagated from the reference
+ dentry's superblock also.
+
+ Note that it's not a requirement that the reference dentry be of the same
+ filesystem type as fs_type.
+
+ (*) struct fs_context *vfs_dup_fs_context(struct fs_context *src_fc);
+
+ Duplicate a filesystem context, copying any options noted and duplicating
+ or additionally referencing any resources held therein. This is available
+ for use where a filesystem has to get a mount within a mount, such as NFS4
+ does by internally mounting the root of the target server and then doing a
+ private pathwalk to the target directory.
+
+ The purpose in the new context is inherited from the old one.
+
+ (*) void put_fs_context(struct fs_context *fc);
+
+ Destroy a filesystem context, releasing any resources it holds. This
+ calls the ->free() operation. This is intended to be called by anyone who
+ created a filesystem context.
+
+ [!] filesystem contexts are not refcounted, so this causes unconditional
+ destruction.
+
+In all the above operations, apart from the put op, the return is a mount
+context pointer or a negative error code.
+
+For the remaining operations, if an error occurs, a negative error code will be
+returned.
+
+ (*) int vfs_parse_fs_param(struct fs_context *fc,
+ struct fs_parameter *param);
+
+ Supply a single mount parameter to the filesystem context. This include
+ the specification of the source/device which is specified as the "source"
+ parameter (which may be specified multiple times if the filesystem
+ supports that).
+
+ param specifies the parameter key name and the value. The parameter is
+ first checked to see if it corresponds to a standard mount flag (in which
+ case it is used to set an SB_xxx flag and consumed) or a security option
+ (in which case the LSM consumes it) before it is passed on to the
+ filesystem.
+
+ The parameter value is typed and can be one of:
+
+ fs_value_is_flag, Parameter not given a value.
+ fs_value_is_string, Value is a string
+ fs_value_is_blob, Value is a binary blob
+ fs_value_is_filename, Value is a filename* + dirfd
+ fs_value_is_filename_empty, Value is a filename* + dirfd + AT_EMPTY_PATH
+ fs_value_is_file, Value is an open file (file*)
+
+ If there is a value, that value is stored in a union in the struct in one
+ of param->{string,blob,name,file}. Note that the function may steal and
+ clear the pointer, but then becomes responsible for disposing of the
+ object.
+
+ (*) int vfs_parse_fs_string(struct fs_context *fc, const char *key,
+ const char *value, size_t v_size);
+
+ A wrapper around vfs_parse_fs_param() that copies the value string it is
+ passed.
+
+ (*) int generic_parse_monolithic(struct fs_context *fc, void *data);
+
+ Parse a sys_mount() data page, assuming the form to be a text list
+ consisting of key[=val] options separated by commas. Each item in the
+ list is passed to vfs_mount_option(). This is the default when the
+ ->parse_monolithic() method is NULL.
+
+ (*) int vfs_get_tree(struct fs_context *fc);
+
+ Get or create the mountable root and superblock, using the parameters in
+ the filesystem context to select/configure the superblock. This invokes
+ the ->get_tree() method.
+
+ (*) struct vfsmount *vfs_create_mount(struct fs_context *fc);
+
+ Create a mount given the parameters in the specified filesystem context.
+ Note that this does not attach the mount to anything.
+
+
+===========================
+SUPERBLOCK CREATION HELPERS
+===========================
+
+A number of VFS helpers are available for use by filesystems for the creation
+or looking up of superblocks.
+
+ (*) struct super_block *
+ sget_fc(struct fs_context *fc,
+ int (*test)(struct super_block *sb, struct fs_context *fc),
+ int (*set)(struct super_block *sb, struct fs_context *fc));
+
+ This is the core routine. If test is non-NULL, it searches for an
+ existing superblock matching the criteria held in the fs_context, using
+ the test function to match them. If no match is found, a new superblock
+ is created and the set function is called to set it up.
+
+ Prior to the set function being called, fc->s_fs_info will be transferred
+ to sb->s_fs_info - and fc->s_fs_info will be cleared if set returns
+ success (ie. 0).
+
+The following helpers all wrap sget_fc():
+
+ (*) int vfs_get_super(struct fs_context *fc,
+ enum vfs_get_super_keying keying,
+ int (*fill_super)(struct super_block *sb,
+ struct fs_context *fc))
+
+ This creates/looks up a deviceless superblock. The keying indicates how
+ many superblocks of this type may exist and in what manner they may be
+ shared:
+
+ (1) vfs_get_single_super
+
+ Only one such superblock may exist in the system. Any further
+ attempt to get a new superblock gets this one (and any parameter
+ differences are ignored).
+
+ (2) vfs_get_keyed_super
+
+ Multiple superblocks of this type may exist and they're keyed on
+ their s_fs_info pointer (for example this may refer to a
+ namespace).
+
+ (3) vfs_get_independent_super
+
+ Multiple independent superblocks of this type may exist. This
+ function never matches an existing one and always creates a new
+ one.
+
+
+=====================
+PARAMETER DESCRIPTION
+=====================
+
+Parameters are described using structures defined in linux/fs_parser.h.
+There's a core description struct that links everything together:
+
+ struct fs_parameter_description {
+ const char name[16];
+ const struct fs_parameter_spec *specs;
+ const struct fs_parameter_enum *enums;
+ };
+
+For example:
+
+ enum {
+ Opt_autocell,
+ Opt_bar,
+ Opt_dyn,
+ Opt_foo,
+ Opt_source,
+ };
+
+ static const struct fs_parameter_description afs_fs_parameters = {
+ .name = "kAFS",
+ .specs = afs_param_specs,
+ .enums = afs_param_enums,
+ };
+
+The members are as follows:
+
+ (1) const char name[16];
+
+ The name to be used in error messages generated by the parse helper
+ functions.
+
+ (2) const struct fs_parameter_specification *specs;
+
+ Table of parameter specifications, terminated with a null entry, where the
+ entries are of type:
+
+ struct fs_parameter_spec {
+ const char *name;
+ u8 opt;
+ enum fs_parameter_type type:8;
+ unsigned short flags;
+ };
+
+ The 'name' field is a string to match exactly to the parameter key (no
+ wildcards, patterns and no case-independence) and 'opt' is the value that
+ will be returned by the fs_parser() function in the case of a successful
+ match.
+
+ The 'type' field indicates the desired value type and must be one of:
+
+ TYPE NAME EXPECTED VALUE RESULT IN
+ ======================= ======================= =====================
+ fs_param_is_flag No value n/a
+ fs_param_is_bool Boolean value result->boolean
+ fs_param_is_u32 32-bit unsigned int result->uint_32
+ fs_param_is_u32_octal 32-bit octal int result->uint_32
+ fs_param_is_u32_hex 32-bit hex int result->uint_32
+ fs_param_is_s32 32-bit signed int result->int_32
+ fs_param_is_u64 64-bit unsigned int result->uint_64
+ fs_param_is_enum Enum value name result->uint_32
+ fs_param_is_string Arbitrary string param->string
+ fs_param_is_blob Binary blob param->blob
+ fs_param_is_blockdev Blockdev path * Needs lookup
+ fs_param_is_path Path * Needs lookup
+ fs_param_is_fd File descriptor result->int_32
+
+ Note that if the value is of fs_param_is_bool type, fs_parse() will try
+ to match any string value against "0", "1", "no", "yes", "false", "true".
+
+ Each parameter can also be qualified with 'flags':
+
+ fs_param_v_optional The value is optional
+ fs_param_neg_with_no result->negated set if key is prefixed with "no"
+ fs_param_neg_with_empty result->negated set if value is ""
+ fs_param_deprecated The parameter is deprecated.
+
+ These are wrapped with a number of convenience wrappers:
+
+ MACRO SPECIFIES
+ ======================= ===============================================
+ fsparam_flag() fs_param_is_flag
+ fsparam_flag_no() fs_param_is_flag, fs_param_neg_with_no
+ fsparam_bool() fs_param_is_bool
+ fsparam_u32() fs_param_is_u32
+ fsparam_u32oct() fs_param_is_u32_octal
+ fsparam_u32hex() fs_param_is_u32_hex
+ fsparam_s32() fs_param_is_s32
+ fsparam_u64() fs_param_is_u64
+ fsparam_enum() fs_param_is_enum
+ fsparam_string() fs_param_is_string
+ fsparam_blob() fs_param_is_blob
+ fsparam_bdev() fs_param_is_blockdev
+ fsparam_path() fs_param_is_path
+ fsparam_fd() fs_param_is_fd
+
+ all of which take two arguments, name string and option number - for
+ example:
+
+ static const struct fs_parameter_spec afs_param_specs[] = {
+ fsparam_flag ("autocell", Opt_autocell),
+ fsparam_flag ("dyn", Opt_dyn),
+ fsparam_string ("source", Opt_source),
+ fsparam_flag_no ("foo", Opt_foo),
+ {}
+ };
+
+ An addition macro, __fsparam() is provided that takes an additional pair
+ of arguments to specify the type and the flags for anything that doesn't
+ match one of the above macros.
+
+ (6) const struct fs_parameter_enum *enums;
+
+ Table of enum value names to integer mappings, terminated with a null
+ entry. This is of type:
+
+ struct fs_parameter_enum {
+ u8 opt;
+ char name[14];
+ u8 value;
+ };
+
+ Where the array is an unsorted list of { parameter ID, name }-keyed
+ elements that indicate the value to map to, e.g.:
+
+ static const struct fs_parameter_enum afs_param_enums[] = {
+ { Opt_bar, "x", 1},
+ { Opt_bar, "y", 23},
+ { Opt_bar, "z", 42},
+ };
+
+ If a parameter of type fs_param_is_enum is encountered, fs_parse() will
+ try to look the value up in the enum table and the result will be stored
+ in the parse result.
+
+The parser should be pointed to by the parser pointer in the file_system_type
+struct as this will provide validation on registration (if
+CONFIG_VALIDATE_FS_PARSER=y) and will allow the description to be queried from
+userspace using the fsinfo() syscall.
+
+
+==========================
+PARAMETER HELPER FUNCTIONS
+==========================
+
+A number of helper functions are provided to help a filesystem or an LSM
+process the parameters it is given.
+
+ (*) int lookup_constant(const struct constant_table tbl[],
+ const char *name, int not_found);
+
+ Look up a constant by name in a table of name -> integer mappings. The
+ table is an array of elements of the following type:
+
+ struct constant_table {
+ const char *name;
+ int value;
+ };
+
+ If a match is found, the corresponding value is returned. If a match
+ isn't found, the not_found value is returned instead.
+
+ (*) bool validate_constant_table(const struct constant_table *tbl,
+ size_t tbl_size,
+ int low, int high, int special);
+
+ Validate a constant table. Checks that all the elements are appropriately
+ ordered, that there are no duplicates and that the values are between low
+ and high inclusive, though provision is made for one allowable special
+ value outside of that range. If no special value is required, special
+ should just be set to lie inside the low-to-high range.
+
+ If all is good, true is returned. If the table is invalid, errors are
+ logged to dmesg and false is returned.
+
+ (*) bool fs_validate_description(const struct fs_parameter_description *desc);
+
+ This performs some validation checks on a parameter description. It
+ returns true if the description is good and false if it is not. It will
+ log errors to dmesg if validation fails.
+
+ (*) int fs_parse(struct fs_context *fc,
+ const struct fs_parameter_description *desc,
+ struct fs_parameter *param,
+ struct fs_parse_result *result);
+
+ This is the main interpreter of parameters. It uses the parameter
+ description to look up a parameter by key name and to convert that to an
+ option number (which it returns).
+
+ If successful, and if the parameter type indicates the result is a
+ boolean, integer or enum type, the value is converted by this function and
+ the result stored in result->{boolean,int_32,uint_32,uint_64}.
+
+ If a match isn't initially made, the key is prefixed with "no" and no
+ value is present then an attempt will be made to look up the key with the
+ prefix removed. If this matches a parameter for which the type has flag
+ fs_param_neg_with_no set, then a match will be made and result->negated
+ will be set to true.
+
+ If the parameter isn't matched, -ENOPARAM will be returned; if the
+ parameter is matched, but the value is erroneous, -EINVAL will be
+ returned; otherwise the parameter's option number will be returned.
+
+ (*) int fs_lookup_param(struct fs_context *fc,
+ struct fs_parameter *value,
+ bool want_bdev,
+ struct path *_path);
+
+ This takes a parameter that carries a string or filename type and attempts
+ to do a path lookup on it. If the parameter expects a blockdev, a check
+ is made that the inode actually represents one.
+
+ Returns 0 if successful and *_path will be set; returns a negative error
+ code if not.
diff --git a/Documentation/filesystems/nfs/nfsroot.txt b/Documentation/filesystems/nfs/nfsroot.txt
index d2963123eb1c..ae4332464560 100644
--- a/Documentation/filesystems/nfs/nfsroot.txt
+++ b/Documentation/filesystems/nfs/nfsroot.txt
@@ -239,7 +239,7 @@ rdinit=<executable file>
A description of the process of mounting the root file system can be
found in:
- Documentation/early-userspace/README
+ Documentation/driver-api/early-userspace/early_userspace_support.rst
diff --git a/Documentation/filesystems/overlayfs.txt b/Documentation/filesystems/overlayfs.txt
index eef7d9d259e8..1da2f1668f08 100644
--- a/Documentation/filesystems/overlayfs.txt
+++ b/Documentation/filesystems/overlayfs.txt
@@ -336,8 +336,20 @@ the copied layers will fail the verification of the lower root file handle.
Non-standard behavior
---------------------
-Overlayfs can now act as a POSIX compliant filesystem with the following
-features turned on:
+Current version of overlayfs can act as a mostly POSIX compliant
+filesystem.
+
+This is the list of cases that overlayfs doesn't currently handle:
+
+a) POSIX mandates updating st_atime for reads. This is currently not
+done in the case when the file resides on a lower layer.
+
+b) If a file residing on a lower layer is opened for read-only and then
+memory mapped with MAP_SHARED, then subsequent changes to the file are not
+reflected in the memory mapping.
+
+The following options allow overlayfs to act more like a standards
+compliant filesystem:
1) "redirect_dir"
diff --git a/Documentation/filesystems/path-lookup.rst b/Documentation/filesystems/path-lookup.rst
index 9d6b68853f5b..434a07b0002b 100644
--- a/Documentation/filesystems/path-lookup.rst
+++ b/Documentation/filesystems/path-lookup.rst
@@ -1,3 +1,18 @@
+===============
+Pathname lookup
+===============
+
+This write-up is based on three articles published at lwn.net:
+
+- <https://lwn.net/Articles/649115/> Pathname lookup in Linux
+- <https://lwn.net/Articles/649729/> RCU-walk: faster pathname lookup in Linux
+- <https://lwn.net/Articles/650786/> A walk among the symlinks
+
+Written by Neil Brown with help from Al Viro and Jon Corbet.
+It has subsequently been updated to reflect changes in the kernel
+including:
+
+- per-directory parallel name lookup.
Introduction to pathname lookup
===============================
@@ -344,7 +359,7 @@ In particular it is held while scanning chains in the dcache hash
table, and the mount point hash table.
Bringing it together with ``struct nameidata``
---------------------------------------------
+----------------------------------------------
.. _First edition Unix: http://minnie.tuhs.org/cgi-bin/utree.pl?file=V1/u2.s
@@ -355,7 +370,7 @@ converts a "name" to an "inode". ``struct nameidata`` contains (among
other fields):
``struct path path``
-~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~
A ``path`` contains a ``struct vfsmount`` (which is
embedded in a ``struct mount``) and a ``struct dentry``. Together these
@@ -366,13 +381,13 @@ step. A reference through ``d_lockref`` and ``mnt_count`` is always
held.
``struct qstr last``
-~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~
This is a string together with a length (i.e. _not_ ``nul`` terminated)
that is the "next" component in the pathname.
``int last_type``
-~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~
This is one of ``LAST_NORM``, ``LAST_ROOT``, ``LAST_DOT``, ``LAST_DOTDOT``, or
``LAST_BIND``. The ``last`` field is only valid if the type is
@@ -381,7 +396,7 @@ components of the symlink have been processed yet. Others should be
fairly self-explanatory.
``struct path root``
-~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~
This is used to hold a reference to the effective root of the
filesystem. Often that reference won't be needed, so this field is
@@ -510,7 +525,7 @@ potentially interesting things about these dentries corresponding
to three different flags that might be set in ``dentry->d_flags``:
``DCACHE_MANAGE_TRANSIT``
-~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~
If this flag has been set, then the filesystem has requested that the
``d_manage()`` dentry operation be called before handling any possible
@@ -529,7 +544,7 @@ filesystem, which will then give it a special pass through
``d_manage()`` by returning ``-EISDIR``.
``DCACHE_MOUNTED``
-~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~
This flag is set on every dentry that is mounted on. As Linux
supports multiple filesystem namespaces, it is possible that the
@@ -542,7 +557,7 @@ If this flag is set, and ``d_manage()`` didn't return ``-EISDIR``,
and a new ``dentry`` (both with counted references).
``DCACHE_NEED_AUTOMOUNT``
-~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~
If ``d_manage()`` allowed us to get this far, and ``lookup_mnt()`` didn't
find a mount point, then this flag causes the ``d_automount()`` dentry
@@ -698,7 +713,7 @@ With that little refresher on seqlocks out of the way we can look at
the bigger picture of how RCU-walk uses seqlocks.
``mount_lock`` and ``nd->m_seq``
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
We already met the ``mount_lock`` seqlock when REF-walk used it to
ensure that crossing a mount point is performed safely. RCU-walk uses
@@ -727,7 +742,7 @@ results would have been the same. This ensures the invariant holds,
at least for vfsmount structures.
``dentry->d_seq`` and ``nd->seq``
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
In place of taking a count or lock on ``d_reflock``, RCU-walk samples
the per-dentry ``d_seq`` seqlock, and stores the sequence number in the
@@ -774,7 +789,7 @@ getting a counted reference to the new dentry before dropping that for
the old dentry which we saw in REF-walk.
No ``inode->i_rwsem`` or even ``rename_lock``
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
A semaphore is a fairly heavyweight lock that can only be taken when it is
permissible to sleep. As ``rcu_read_lock()`` forbids sleeping,
@@ -796,7 +811,7 @@ locking. This neatly handles all cases, so adding extra checks on
rename_lock would bring no significant value.
``unlazy walk()`` and ``complete_walk()``
--------------------------------------
+-----------------------------------------
That "dropping down to REF-walk" typically involves a call to
``unlazy_walk()``, so named because "RCU-walk" is also sometimes
diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting
index cf43bc4dbf31..2813a19389fe 100644
--- a/Documentation/filesystems/porting
+++ b/Documentation/filesystems/porting
@@ -330,14 +330,14 @@ unreferenced dentries, and is now only called when the dentry refcount goes to
[mandatory]
.d_compare() calling convention and locking rules are significantly
-changed. Read updated documentation in Documentation/filesystems/vfs.txt (and
+changed. Read updated documentation in Documentation/filesystems/vfs.rst (and
look at examples of other filesystems) for guidance.
---
[mandatory]
.d_hash() calling convention and locking rules are significantly
-changed. Read updated documentation in Documentation/filesystems/vfs.txt (and
+changed. Read updated documentation in Documentation/filesystems/vfs.rst (and
look at examples of other filesystems) for guidance.
---
@@ -377,12 +377,12 @@ where possible.
the filesystem provides it), which requires dropping out of rcu-walk mode. This
may now be called in rcu-walk mode (nd->flags & LOOKUP_RCU). -ECHILD should be
returned if the filesystem cannot handle rcu-walk. See
-Documentation/filesystems/vfs.txt for more details.
+Documentation/filesystems/vfs.rst for more details.
permission is an inode permission check that is called on many or all
directory inodes on the way down a path walk (to check for exec permission). It
must now be rcu-walk aware (mask & MAY_NOT_BLOCK). See
-Documentation/filesystems/vfs.txt for more details.
+Documentation/filesystems/vfs.rst for more details.
--
[mandatory]
@@ -625,7 +625,7 @@ in your dentry operations instead.
--
[mandatory]
->clone_file_range() and ->dedupe_file_range have been replaced with
- ->remap_file_range(). See Documentation/filesystems/vfs.txt for more
+ ->remap_file_range(). See Documentation/filesystems/vfs.rst for more
information.
--
[recommended]
@@ -638,3 +638,38 @@ in your dentry operations instead.
inode to d_splice_alias() will also do the right thing (equivalent of
d_add(dentry, NULL); return NULL;), so that kind of special cases
also doesn't need a separate treatment.
+--
+[strongly recommended]
+ take the RCU-delayed parts of ->destroy_inode() into a new method -
+ ->free_inode(). If ->destroy_inode() becomes empty - all the better,
+ just get rid of it. Synchronous work (e.g. the stuff that can't
+ be done from an RCU callback, or any WARN_ON() where we want the
+ stack trace) *might* be movable to ->evict_inode(); however,
+ that goes only for the things that are not needed to balance something
+ done by ->alloc_inode(). IOW, if it's cleaning up the stuff that
+ might have accumulated over the life of in-core inode, ->evict_inode()
+ might be a fit.
+
+ Rules for inode destruction:
+ * if ->destroy_inode() is non-NULL, it gets called
+ * if ->free_inode() is non-NULL, it gets scheduled by call_rcu()
+ * combination of NULL ->destroy_inode and NULL ->free_inode is
+ treated as NULL/free_inode_nonrcu, to preserve the compatibility.
+
+ Note that the callback (be it via ->free_inode() or explicit call_rcu()
+ in ->destroy_inode()) is *NOT* ordered wrt superblock destruction;
+ as the matter of fact, the superblock and all associated structures
+ might be already gone. The filesystem driver is guaranteed to be still
+ there, but that's it. Freeing memory in the callback is fine; doing
+ more than that is possible, but requires a lot of care and is best
+ avoided.
+--
+[mandatory]
+ DCACHE_RCUACCESS is gone; having an RCU delay on dentry freeing is the
+ default. DCACHE_NORCU opts out, and only d_alloc_pseudo() has any
+ business doing so.
+--
+[mandatory]
+ d_alloc_pseudo() is internal-only; uses outside of alloc_file_pseudo() are
+ very suspect (and won't work in modules). Such uses are very likely to
+ be misspelled d_alloc_anon().
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 66cad5c86171..fb4735fd73b0 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -45,6 +45,7 @@ Table of Contents
3.9 /proc/<pid>/map_files - Information about memory mapped files
3.10 /proc/<pid>/timerslack_ns - Task timerslack value
3.11 /proc/<pid>/patch_state - Livepatch patch operation state
+ 3.12 /proc/<pid>/arch_status - Task architecture specific information
4 Configuring procfs
4.1 Mount options
@@ -153,9 +154,11 @@ Table 1-1: Process specific entries in /proc
symbol the task is blocked in - or "0" if not blocked.
pagemap Page table
stack Report full stack trace, enable via CONFIG_STACKTRACE
- smaps an extension based on maps, showing the memory consumption of
+ smaps An extension based on maps, showing the memory consumption of
each mapping and flags associated with it
- numa_maps an extension based on maps, showing the memory locality and
+ smaps_rollup Accumulated smaps stats for all mappings of the process. This
+ can be derived from smaps, but is faster and more convenient
+ numa_maps An extension based on maps, showing the memory locality and
binding policy as well as mem usage (in pages) of each mapping.
..............................................................................
@@ -365,7 +368,7 @@ Table 1-4: Contents of the stat files (as of 2.6.30-rc7)
exit_code the thread's exit_code in the form reported by the waitpid system call
..............................................................................
-The /proc/PID/maps file containing the currently mapped memory regions and
+The /proc/PID/maps file contains the currently mapped memory regions and
their access permissions.
The format is:
@@ -416,11 +419,14 @@ is not associated with a file:
or if empty, the mapping is anonymous.
The /proc/PID/smaps is an extension based on maps, showing the memory
-consumption for each of the process's mappings. For each of mappings there
-is a series of lines such as the following:
+consumption for each of the process's mappings. For each mapping (aka Virtual
+Memory Area, or VMA) there is a series of lines such as the following:
08048000-080bc000 r-xp 00000000 03:02 13130 /bin/bash
+
Size: 1084 kB
+KernelPageSize: 4 kB
+MMUPageSize: 4 kB
Rss: 892 kB
Pss: 374 kB
Shared_Clean: 892 kB
@@ -442,11 +448,14 @@ Locked: 0 kB
THPeligible: 0
VmFlags: rd ex mr mw me dw
-the first of these lines shows the same information as is displayed for the
-mapping in /proc/PID/maps. The remaining lines show the size of the mapping
-(size), the amount of the mapping that is currently resident in RAM (RSS), the
-process' proportional share of this mapping (PSS), the number of clean and
-dirty private pages in the mapping.
+The first of these lines shows the same information as is displayed for the
+mapping in /proc/PID/maps. Following lines show the size of the mapping
+(size); the size of each page allocated when backing a VMA (KernelPageSize),
+which is usually the same as the size in the page table entries; the page size
+used by the MMU when backing a VMA (in most cases, the same as KernelPageSize);
+the amount of the mapping that is currently resident in RAM (RSS); the
+process' proportional share of this mapping (PSS); and the number of clean and
+dirty shared and private pages in the mapping.
The "proportional set size" (PSS) of a process is the count of pages it has
in memory, where each page is divided by the number of processes sharing it.
@@ -531,6 +540,19 @@ guarantees:
2) If there is something at a given vaddr during the entirety of the
life of the smaps/maps walk, there will be some output for it.
+The /proc/PID/smaps_rollup file includes the same fields as /proc/PID/smaps,
+but their values are the sums of the corresponding values for all mappings of
+the process. Additionally, it contains these fields:
+
+Pss_Anon
+Pss_File
+Pss_Shmem
+
+They represent the proportional shares of anonymous, file, and shmem pages, as
+described for smaps above. These fields are omitted in smaps since each
+mapping identifies the type (anon, file, or shmem) of all pages it contains.
+Thus all information in smaps_rollup can be derived from smaps, but at a
+significantly higher cost.
The /proc/PID/clear_refs is used to reset the PG_Referenced and ACCESSED/YOUNG
bits on both physical and virtual pages associated with a process, and the
@@ -1478,7 +1500,7 @@ review the kernel documentation in the directory /usr/src/linux/Documentation.
This chapter is heavily based on the documentation included in the pre 2.2
kernels, and became part of it in version 2.2.1 of the Linux kernel.
-Please see: Documentation/sysctl/ directory for descriptions of these
+Please see: Documentation/admin-guide/sysctl/ directory for descriptions of these
entries.
------------------------------------------------------------------------------
@@ -1948,6 +1970,45 @@ patched. If the patch is being enabled, then the task has already been
patched. If the patch is being disabled, then the task hasn't been
unpatched yet.
+3.12 /proc/<pid>/arch_status - task architecture specific status
+-------------------------------------------------------------------
+When CONFIG_PROC_PID_ARCH_STATUS is enabled, this file displays the
+architecture specific status of the task.
+
+Example
+-------
+ $ cat /proc/6753/arch_status
+ AVX512_elapsed_ms: 8
+
+Description
+-----------
+
+x86 specific entries:
+---------------------
+ AVX512_elapsed_ms:
+ ------------------
+ If AVX512 is supported on the machine, this entry shows the milliseconds
+ elapsed since the last time AVX512 usage was recorded. The recording
+ happens on a best effort basis when a task is scheduled out. This means
+ that the value depends on two factors:
+
+ 1) The time which the task spent on the CPU without being scheduled
+ out. With CPU isolation and a single runnable task this can take
+ several seconds.
+
+ 2) The time since the task was scheduled out last. Depending on the
+ reason for being scheduled out (time slice exhausted, syscall ...)
+ this can be arbitrary long time.
+
+ As a consequence the value cannot be considered precise and authoritative
+ information. The application which uses this information has to be aware
+ of the overall scenario on the system in order to determine whether a
+ task is a real AVX512 user or not. Precise information can be obtained
+ with performance counters.
+
+ A special value of '-1' indicates that no AVX512 usage was recorded, thus
+ the task is unlikely an AVX512 user, but depends on the workload and the
+ scheduling scenario, it also could be a false negative mentioned above.
------------------------------------------------------------------------------
Configuring procfs
diff --git a/Documentation/filesystems/ramfs-rootfs-initramfs.txt b/Documentation/filesystems/ramfs-rootfs-initramfs.txt
index 79637d227e85..97d42ccaa92d 100644
--- a/Documentation/filesystems/ramfs-rootfs-initramfs.txt
+++ b/Documentation/filesystems/ramfs-rootfs-initramfs.txt
@@ -105,7 +105,7 @@ All this differs from the old initrd in several ways:
- The old initrd file was a gzipped filesystem image (in some file format,
such as ext2, that needed a driver built into the kernel), while the new
initramfs archive is a gzipped cpio archive (like tar only simpler,
- see cpio(1) and Documentation/early-userspace/buffer-format.txt). The
+ see cpio(1) and Documentation/driver-api/early-userspace/buffer-format.rst). The
kernel's cpio extraction code is not only extremely small, it's also
__init text and data that can be discarded during the boot process.
@@ -159,7 +159,7 @@ One advantage of the configuration file is that root access is not required to
set permissions or create device nodes in the new archive. (Note that those
two example "file" entries expect to find files named "init.sh" and "busybox" in
a directory called "initramfs", under the linux-2.6.* directory. See
-Documentation/early-userspace/README for more details.)
+Documentation/driver-api/early-userspace/early_userspace_support.rst for more details.)
The kernel does not depend on external cpio tools. If you specify a
directory instead of a configuration file, the kernel's build infrastructure
diff --git a/Documentation/filesystems/splice.rst b/Documentation/filesystems/splice.rst
new file mode 100644
index 000000000000..edd874808472
--- /dev/null
+++ b/Documentation/filesystems/splice.rst
@@ -0,0 +1,22 @@
+================
+splice and pipes
+================
+
+splice API
+==========
+
+splice is a method for moving blocks of data around inside the kernel,
+without continually transferring them between the kernel and user space.
+
+.. kernel-doc:: fs/splice.c
+
+pipes API
+=========
+
+Pipe interfaces are all for in-kernel (builtin image) use. They are not
+exported for use by modules.
+
+.. kernel-doc:: include/linux/pipe_fs_i.h
+ :internal:
+
+.. kernel-doc:: fs/pipe.c
diff --git a/Documentation/filesystems/sysfs.txt b/Documentation/filesystems/sysfs.txt
index 41411b0c60a3..ddf15b1b0d5a 100644
--- a/Documentation/filesystems/sysfs.txt
+++ b/Documentation/filesystems/sysfs.txt
@@ -116,6 +116,27 @@ static struct device_attribute dev_attr_foo = {
.store = store_foo,
};
+Note as stated in include/linux/kernel.h "OTHER_WRITABLE? Generally
+considered a bad idea." so trying to set a sysfs file writable for
+everyone will fail reverting to RO mode for "Others".
+
+For the common cases sysfs.h provides convenience macros to make
+defining attributes easier as well as making code more concise and
+readable. The above case could be shortened to:
+
+static struct device_attribute dev_attr_foo = __ATTR_RW(foo);
+
+the list of helpers available to define your wrapper function is:
+__ATTR_RO(name): assumes default name_show and mode 0444
+__ATTR_WO(name): assumes a name_store only and is restricted to mode
+ 0200 that is root write access only.
+__ATTR_RO_MODE(name, mode): fore more restrictive RO access currently
+ only use case is the EFI System Resource Table
+ (see drivers/firmware/efi/esrt.c)
+__ATTR_RW(name): assumes default name_show, name_store and setting
+ mode to 0644.
+__ATTR_NULL: which sets the name to NULL and is used as end of list
+ indicator (see: kernel/workqueue.c)
Subsystem-Specific Callbacks
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -298,7 +319,7 @@ quick way to lookup the sysfs interface for a device from the result of
a stat(2) operation.
More information can driver-model specific features can be found in
-Documentation/driver-model/.
+Documentation/driver-api/driver-model/.
TODO: Finish this section.
diff --git a/Documentation/filesystems/tmpfs.txt b/Documentation/filesystems/tmpfs.txt
index d06e9a59a9f4..5ecbc03e6b2f 100644
--- a/Documentation/filesystems/tmpfs.txt
+++ b/Documentation/filesystems/tmpfs.txt
@@ -98,7 +98,7 @@ A memory policy with a valid NodeList will be saved, as specified, for
use at file creation time. When a task allocates a file in the file
system, the mount option memory policy will be applied with a NodeList,
if any, modified by the calling task's cpuset constraints
-[See Documentation/cgroup-v1/cpusets.txt] and any optional flags, listed
+[See Documentation/admin-guide/cgroup-v1/cpusets.rst] and any optional flags, listed
below. If the resulting NodeLists is the empty set, the effective memory
policy for the file will revert to "default" policy.
diff --git a/Documentation/filesystems/ubifs-authentication.md b/Documentation/filesystems/ubifs-authentication.md
index 028b3e2e25f9..23e698167141 100644
--- a/Documentation/filesystems/ubifs-authentication.md
+++ b/Documentation/filesystems/ubifs-authentication.md
@@ -417,9 +417,9 @@ will then have to be provided beforehand in the normal way.
[DMC-CBC-ATTACK] http://www.jakoblell.com/blog/2013/12/22/practical-malleability-attack-against-cbc-encrypted-luks-partitions/
-[DM-INTEGRITY] https://www.kernel.org/doc/Documentation/device-mapper/dm-integrity.txt
+[DM-INTEGRITY] https://www.kernel.org/doc/Documentation/device-mapper/dm-integrity.rst
-[DM-VERITY] https://www.kernel.org/doc/Documentation/device-mapper/verity.txt
+[DM-VERITY] https://www.kernel.org/doc/Documentation/device-mapper/verity.rst
[FSCRYPT-POLICY2] https://www.spinics.net/lists/linux-ext4/msg58710.html
diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst
new file mode 100644
index 000000000000..0f85ab21c2ca
--- /dev/null
+++ b/Documentation/filesystems/vfs.rst
@@ -0,0 +1,1428 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=========================================
+Overview of the Linux Virtual File System
+=========================================
+
+Original author: Richard Gooch <rgooch@atnf.csiro.au>
+
+- Copyright (C) 1999 Richard Gooch
+- Copyright (C) 2005 Pekka Enberg
+
+
+Introduction
+============
+
+The Virtual File System (also known as the Virtual Filesystem Switch) is
+the software layer in the kernel that provides the filesystem interface
+to userspace programs. It also provides an abstraction within the
+kernel which allows different filesystem implementations to coexist.
+
+VFS system calls open(2), stat(2), read(2), write(2), chmod(2) and so on
+are called from a process context. Filesystem locking is described in
+the document Documentation/filesystems/Locking.
+
+
+Directory Entry Cache (dcache)
+------------------------------
+
+The VFS implements the open(2), stat(2), chmod(2), and similar system
+calls. The pathname argument that is passed to them is used by the VFS
+to search through the directory entry cache (also known as the dentry
+cache or dcache). This provides a very fast look-up mechanism to
+translate a pathname (filename) into a specific dentry. Dentries live
+in RAM and are never saved to disc: they exist only for performance.
+
+The dentry cache is meant to be a view into your entire filespace. As
+most computers cannot fit all dentries in the RAM at the same time, some
+bits of the cache are missing. In order to resolve your pathname into a
+dentry, the VFS may have to resort to creating dentries along the way,
+and then loading the inode. This is done by looking up the inode.
+
+
+The Inode Object
+----------------
+
+An individual dentry usually has a pointer to an inode. Inodes are
+filesystem objects such as regular files, directories, FIFOs and other
+beasts. They live either on the disc (for block device filesystems) or
+in the memory (for pseudo filesystems). Inodes that live on the disc
+are copied into the memory when required and changes to the inode are
+written back to disc. A single inode can be pointed to by multiple
+dentries (hard links, for example, do this).
+
+To look up an inode requires that the VFS calls the lookup() method of
+the parent directory inode. This method is installed by the specific
+filesystem implementation that the inode lives in. Once the VFS has the
+required dentry (and hence the inode), we can do all those boring things
+like open(2) the file, or stat(2) it to peek at the inode data. The
+stat(2) operation is fairly simple: once the VFS has the dentry, it
+peeks at the inode data and passes some of it back to userspace.
+
+
+The File Object
+---------------
+
+Opening a file requires another operation: allocation of a file
+structure (this is the kernel-side implementation of file descriptors).
+The freshly allocated file structure is initialized with a pointer to
+the dentry and a set of file operation member functions. These are
+taken from the inode data. The open() file method is then called so the
+specific filesystem implementation can do its work. You can see that
+this is another switch performed by the VFS. The file structure is
+placed into the file descriptor table for the process.
+
+Reading, writing and closing files (and other assorted VFS operations)
+is done by using the userspace file descriptor to grab the appropriate
+file structure, and then calling the required file structure method to
+do whatever is required. For as long as the file is open, it keeps the
+dentry in use, which in turn means that the VFS inode is still in use.
+
+
+Registering and Mounting a Filesystem
+=====================================
+
+To register and unregister a filesystem, use the following API
+functions:
+
+.. code-block:: c
+
+ #include <linux/fs.h>
+
+ extern int register_filesystem(struct file_system_type *);
+ extern int unregister_filesystem(struct file_system_type *);
+
+The passed struct file_system_type describes your filesystem. When a
+request is made to mount a filesystem onto a directory in your
+namespace, the VFS will call the appropriate mount() method for the
+specific filesystem. New vfsmount referring to the tree returned by
+->mount() will be attached to the mountpoint, so that when pathname
+resolution reaches the mountpoint it will jump into the root of that
+vfsmount.
+
+You can see all filesystems that are registered to the kernel in the
+file /proc/filesystems.
+
+
+struct file_system_type
+-----------------------
+
+This describes the filesystem. As of kernel 2.6.39, the following
+members are defined:
+
+.. code-block:: c
+
+ struct file_system_operations {
+ const char *name;
+ int fs_flags;
+ struct dentry *(*mount) (struct file_system_type *, int,
+ const char *, void *);
+ void (*kill_sb) (struct super_block *);
+ struct module *owner;
+ struct file_system_type * next;
+ struct list_head fs_supers;
+ struct lock_class_key s_lock_key;
+ struct lock_class_key s_umount_key;
+ };
+
+``name``
+ the name of the filesystem type, such as "ext2", "iso9660",
+ "msdos" and so on
+
+``fs_flags``
+ various flags (i.e. FS_REQUIRES_DEV, FS_NO_DCACHE, etc.)
+
+``mount``
+ the method to call when a new instance of this filesystem should
+ be mounted
+
+``kill_sb``
+ the method to call when an instance of this filesystem should be
+ shut down
+
+
+``owner``
+ for internal VFS use: you should initialize this to THIS_MODULE
+ in most cases.
+
+``next``
+ for internal VFS use: you should initialize this to NULL
+
+ s_lock_key, s_umount_key: lockdep-specific
+
+The mount() method has the following arguments:
+
+``struct file_system_type *fs_type``
+ describes the filesystem, partly initialized by the specific
+ filesystem code
+
+``int flags``
+ mount flags
+
+``const char *dev_name``
+ the device name we are mounting.
+
+``void *data``
+ arbitrary mount options, usually comes as an ASCII string (see
+ "Mount Options" section)
+
+The mount() method must return the root dentry of the tree requested by
+caller. An active reference to its superblock must be grabbed and the
+superblock must be locked. On failure it should return ERR_PTR(error).
+
+The arguments match those of mount(2) and their interpretation depends
+on filesystem type. E.g. for block filesystems, dev_name is interpreted
+as block device name, that device is opened and if it contains a
+suitable filesystem image the method creates and initializes struct
+super_block accordingly, returning its root dentry to caller.
+
+->mount() may choose to return a subtree of existing filesystem - it
+doesn't have to create a new one. The main result from the caller's
+point of view is a reference to dentry at the root of (sub)tree to be
+attached; creation of new superblock is a common side effect.
+
+The most interesting member of the superblock structure that the mount()
+method fills in is the "s_op" field. This is a pointer to a "struct
+super_operations" which describes the next level of the filesystem
+implementation.
+
+Usually, a filesystem uses one of the generic mount() implementations
+and provides a fill_super() callback instead. The generic variants are:
+
+``mount_bdev``
+ mount a filesystem residing on a block device
+
+``mount_nodev``
+ mount a filesystem that is not backed by a device
+
+``mount_single``
+ mount a filesystem which shares the instance between all mounts
+
+A fill_super() callback implementation has the following arguments:
+
+``struct super_block *sb``
+ the superblock structure. The callback must initialize this
+ properly.
+
+``void *data``
+ arbitrary mount options, usually comes as an ASCII string (see
+ "Mount Options" section)
+
+``int silent``
+ whether or not to be silent on error
+
+
+The Superblock Object
+=====================
+
+A superblock object represents a mounted filesystem.
+
+
+struct super_operations
+-----------------------
+
+This describes how the VFS can manipulate the superblock of your
+filesystem. As of kernel 2.6.22, the following members are defined:
+
+.. code-block:: c
+
+ struct super_operations {
+ struct inode *(*alloc_inode)(struct super_block *sb);
+ void (*destroy_inode)(struct inode *);
+
+ void (*dirty_inode) (struct inode *, int flags);
+ int (*write_inode) (struct inode *, int);
+ void (*drop_inode) (struct inode *);
+ void (*delete_inode) (struct inode *);
+ void (*put_super) (struct super_block *);
+ int (*sync_fs)(struct super_block *sb, int wait);
+ int (*freeze_fs) (struct super_block *);
+ int (*unfreeze_fs) (struct super_block *);
+ int (*statfs) (struct dentry *, struct kstatfs *);
+ int (*remount_fs) (struct super_block *, int *, char *);
+ void (*clear_inode) (struct inode *);
+ void (*umount_begin) (struct super_block *);
+
+ int (*show_options)(struct seq_file *, struct dentry *);
+
+ ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t);
+ ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t);
+ int (*nr_cached_objects)(struct super_block *);
+ void (*free_cached_objects)(struct super_block *, int);
+ };
+
+All methods are called without any locks being held, unless otherwise
+noted. This means that most methods can block safely. All methods are
+only called from a process context (i.e. not from an interrupt handler
+or bottom half).
+
+``alloc_inode``
+ this method is called by alloc_inode() to allocate memory for
+ struct inode and initialize it. If this function is not
+ defined, a simple 'struct inode' is allocated. Normally
+ alloc_inode will be used to allocate a larger structure which
+ contains a 'struct inode' embedded within it.
+
+``destroy_inode``
+ this method is called by destroy_inode() to release resources
+ allocated for struct inode. It is only required if
+ ->alloc_inode was defined and simply undoes anything done by
+ ->alloc_inode.
+
+``dirty_inode``
+ this method is called by the VFS to mark an inode dirty.
+
+``write_inode``
+ this method is called when the VFS needs to write an inode to
+ disc. The second parameter indicates whether the write should
+ be synchronous or not, not all filesystems check this flag.
+
+``drop_inode``
+ called when the last access to the inode is dropped, with the
+ inode->i_lock spinlock held.
+
+ This method should be either NULL (normal UNIX filesystem
+ semantics) or "generic_delete_inode" (for filesystems that do
+ not want to cache inodes - causing "delete_inode" to always be
+ called regardless of the value of i_nlink)
+
+ The "generic_delete_inode()" behavior is equivalent to the old
+ practice of using "force_delete" in the put_inode() case, but
+ does not have the races that the "force_delete()" approach had.
+
+``delete_inode``
+ called when the VFS wants to delete an inode
+
+``put_super``
+ called when the VFS wishes to free the superblock
+ (i.e. unmount). This is called with the superblock lock held
+
+``sync_fs``
+ called when VFS is writing out all dirty data associated with a
+ superblock. The second parameter indicates whether the method
+ should wait until the write out has been completed. Optional.
+
+``freeze_fs``
+ called when VFS is locking a filesystem and forcing it into a
+ consistent state. This method is currently used by the Logical
+ Volume Manager (LVM).
+
+``unfreeze_fs``
+ called when VFS is unlocking a filesystem and making it writable
+ again.
+
+``statfs``
+ called when the VFS needs to get filesystem statistics.
+
+``remount_fs``
+ called when the filesystem is remounted. This is called with
+ the kernel lock held
+
+``clear_inode``
+ called then the VFS clears the inode. Optional
+
+``umount_begin``
+ called when the VFS is unmounting a filesystem.
+
+``show_options``
+ called by the VFS to show mount options for /proc/<pid>/mounts.
+ (see "Mount Options" section)
+
+``quota_read``
+ called by the VFS to read from filesystem quota file.
+
+``quota_write``
+ called by the VFS to write to filesystem quota file.
+
+``nr_cached_objects``
+ called by the sb cache shrinking function for the filesystem to
+ return the number of freeable cached objects it contains.
+ Optional.
+
+``free_cache_objects``
+ called by the sb cache shrinking function for the filesystem to
+ scan the number of objects indicated to try to free them.
+ Optional, but any filesystem implementing this method needs to
+ also implement ->nr_cached_objects for it to be called
+ correctly.
+
+ We can't do anything with any errors that the filesystem might
+ encountered, hence the void return type. This will never be
+ called if the VM is trying to reclaim under GFP_NOFS conditions,
+ hence this method does not need to handle that situation itself.
+
+ Implementations must include conditional reschedule calls inside
+ any scanning loop that is done. This allows the VFS to
+ determine appropriate scan batch sizes without having to worry
+ about whether implementations will cause holdoff problems due to
+ large scan batch sizes.
+
+Whoever sets up the inode is responsible for filling in the "i_op"
+field. This is a pointer to a "struct inode_operations" which describes
+the methods that can be performed on individual inodes.
+
+
+struct xattr_handlers
+---------------------
+
+On filesystems that support extended attributes (xattrs), the s_xattr
+superblock field points to a NULL-terminated array of xattr handlers.
+Extended attributes are name:value pairs.
+
+``name``
+ Indicates that the handler matches attributes with the specified
+ name (such as "system.posix_acl_access"); the prefix field must
+ be NULL.
+
+``prefix``
+ Indicates that the handler matches all attributes with the
+ specified name prefix (such as "user."); the name field must be
+ NULL.
+
+``list``
+ Determine if attributes matching this xattr handler should be
+ listed for a particular dentry. Used by some listxattr
+ implementations like generic_listxattr.
+
+``get``
+ Called by the VFS to get the value of a particular extended
+ attribute. This method is called by the getxattr(2) system
+ call.
+
+``set``
+ Called by the VFS to set the value of a particular extended
+ attribute. When the new value is NULL, called to remove a
+ particular extended attribute. This method is called by the the
+ setxattr(2) and removexattr(2) system calls.
+
+When none of the xattr handlers of a filesystem match the specified
+attribute name or when a filesystem doesn't support extended attributes,
+the various ``*xattr(2)`` system calls return -EOPNOTSUPP.
+
+
+The Inode Object
+================
+
+An inode object represents an object within the filesystem.
+
+
+struct inode_operations
+-----------------------
+
+This describes how the VFS can manipulate an inode in your filesystem.
+As of kernel 2.6.22, the following members are defined:
+
+.. code-block:: c
+
+ struct inode_operations {
+ int (*create) (struct inode *,struct dentry *, umode_t, bool);
+ struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int);
+ int (*link) (struct dentry *,struct inode *,struct dentry *);
+ int (*unlink) (struct inode *,struct dentry *);
+ int (*symlink) (struct inode *,struct dentry *,const char *);
+ int (*mkdir) (struct inode *,struct dentry *,umode_t);
+ int (*rmdir) (struct inode *,struct dentry *);
+ int (*mknod) (struct inode *,struct dentry *,umode_t,dev_t);
+ int (*rename) (struct inode *, struct dentry *,
+ struct inode *, struct dentry *, unsigned int);
+ int (*readlink) (struct dentry *, char __user *,int);
+ const char *(*get_link) (struct dentry *, struct inode *,
+ struct delayed_call *);
+ int (*permission) (struct inode *, int);
+ int (*get_acl)(struct inode *, int);
+ int (*setattr) (struct dentry *, struct iattr *);
+ int (*getattr) (const struct path *, struct kstat *, u32, unsigned int);
+ ssize_t (*listxattr) (struct dentry *, char *, size_t);
+ void (*update_time)(struct inode *, struct timespec *, int);
+ int (*atomic_open)(struct inode *, struct dentry *, struct file *,
+ unsigned open_flag, umode_t create_mode);
+ int (*tmpfile) (struct inode *, struct dentry *, umode_t);
+ };
+
+Again, all methods are called without any locks being held, unless
+otherwise noted.
+
+``create``
+ called by the open(2) and creat(2) system calls. Only required
+ if you want to support regular files. The dentry you get should
+ not have an inode (i.e. it should be a negative dentry). Here
+ you will probably call d_instantiate() with the dentry and the
+ newly created inode
+
+``lookup``
+ called when the VFS needs to look up an inode in a parent
+ directory. The name to look for is found in the dentry. This
+ method must call d_add() to insert the found inode into the
+ dentry. The "i_count" field in the inode structure should be
+ incremented. If the named inode does not exist a NULL inode
+ should be inserted into the dentry (this is called a negative
+ dentry). Returning an error code from this routine must only be
+ done on a real error, otherwise creating inodes with system
+ calls like create(2), mknod(2), mkdir(2) and so on will fail.
+ If you wish to overload the dentry methods then you should
+ initialise the "d_dop" field in the dentry; this is a pointer to
+ a struct "dentry_operations". This method is called with the
+ directory inode semaphore held
+
+``link``
+ called by the link(2) system call. Only required if you want to
+ support hard links. You will probably need to call
+ d_instantiate() just as you would in the create() method
+
+``unlink``
+ called by the unlink(2) system call. Only required if you want
+ to support deleting inodes
+
+``symlink``
+ called by the symlink(2) system call. Only required if you want
+ to support symlinks. You will probably need to call
+ d_instantiate() just as you would in the create() method
+
+``mkdir``
+ called by the mkdir(2) system call. Only required if you want
+ to support creating subdirectories. You will probably need to
+ call d_instantiate() just as you would in the create() method
+
+``rmdir``
+ called by the rmdir(2) system call. Only required if you want
+ to support deleting subdirectories
+
+``mknod``
+ called by the mknod(2) system call to create a device (char,
+ block) inode or a named pipe (FIFO) or socket. Only required if
+ you want to support creating these types of inodes. You will
+ probably need to call d_instantiate() just as you would in the
+ create() method
+
+``rename``
+ called by the rename(2) system call to rename the object to have
+ the parent and name given by the second inode and dentry.
+
+ The filesystem must return -EINVAL for any unsupported or
+ unknown flags. Currently the following flags are implemented:
+ (1) RENAME_NOREPLACE: this flag indicates that if the target of
+ the rename exists the rename should fail with -EEXIST instead of
+ replacing the target. The VFS already checks for existence, so
+ for local filesystems the RENAME_NOREPLACE implementation is
+ equivalent to plain rename.
+ (2) RENAME_EXCHANGE: exchange source and target. Both must
+ exist; this is checked by the VFS. Unlike plain rename, source
+ and target may be of different type.
+
+``get_link``
+ called by the VFS to follow a symbolic link to the inode it
+ points to. Only required if you want to support symbolic links.
+ This method returns the symlink body to traverse (and possibly
+ resets the current position with nd_jump_link()). If the body
+ won't go away until the inode is gone, nothing else is needed;
+ if it needs to be otherwise pinned, arrange for its release by
+ having get_link(..., ..., done) do set_delayed_call(done,
+ destructor, argument). In that case destructor(argument) will
+ be called once VFS is done with the body you've returned. May
+ be called in RCU mode; that is indicated by NULL dentry
+ argument. If request can't be handled without leaving RCU mode,
+ have it return ERR_PTR(-ECHILD).
+
+ If the filesystem stores the symlink target in ->i_link, the
+ VFS may use it directly without calling ->get_link(); however,
+ ->get_link() must still be provided. ->i_link must not be
+ freed until after an RCU grace period. Writing to ->i_link
+ post-iget() time requires a 'release' memory barrier.
+
+``readlink``
+ this is now just an override for use by readlink(2) for the
+ cases when ->get_link uses nd_jump_link() or object is not in
+ fact a symlink. Normally filesystems should only implement
+ ->get_link for symlinks and readlink(2) will automatically use
+ that.
+
+``permission``
+ called by the VFS to check for access rights on a POSIX-like
+ filesystem.
+
+ May be called in rcu-walk mode (mask & MAY_NOT_BLOCK). If in
+ rcu-walk mode, the filesystem must check the permission without
+ blocking or storing to the inode.
+
+ If a situation is encountered that rcu-walk cannot handle,
+ return
+ -ECHILD and it will be called again in ref-walk mode.
+
+``setattr``
+ called by the VFS to set attributes for a file. This method is
+ called by chmod(2) and related system calls.
+
+``getattr``
+ called by the VFS to get attributes of a file. This method is
+ called by stat(2) and related system calls.
+
+``listxattr``
+ called by the VFS to list all extended attributes for a given
+ file. This method is called by the listxattr(2) system call.
+
+``update_time``
+ called by the VFS to update a specific time or the i_version of
+ an inode. If this is not defined the VFS will update the inode
+ itself and call mark_inode_dirty_sync.
+
+``atomic_open``
+ called on the last component of an open. Using this optional
+ method the filesystem can look up, possibly create and open the
+ file in one atomic operation. If it wants to leave actual
+ opening to the caller (e.g. if the file turned out to be a
+ symlink, device, or just something filesystem won't do atomic
+ open for), it may signal this by returning finish_no_open(file,
+ dentry). This method is only called if the last component is
+ negative or needs lookup. Cached positive dentries are still
+ handled by f_op->open(). If the file was created, FMODE_CREATED
+ flag should be set in file->f_mode. In case of O_EXCL the
+ method must only succeed if the file didn't exist and hence
+ FMODE_CREATED shall always be set on success.
+
+``tmpfile``
+ called in the end of O_TMPFILE open(). Optional, equivalent to
+ atomically creating, opening and unlinking a file in given
+ directory.
+
+
+The Address Space Object
+========================
+
+The address space object is used to group and manage pages in the page
+cache. It can be used to keep track of the pages in a file (or anything
+else) and also track the mapping of sections of the file into process
+address spaces.
+
+There are a number of distinct yet related services that an
+address-space can provide. These include communicating memory pressure,
+page lookup by address, and keeping track of pages tagged as Dirty or
+Writeback.
+
+The first can be used independently to the others. The VM can try to
+either write dirty pages in order to clean them, or release clean pages
+in order to reuse them. To do this it can call the ->writepage method
+on dirty pages, and ->releasepage on clean pages with PagePrivate set.
+Clean pages without PagePrivate and with no external references will be
+released without notice being given to the address_space.
+
+To achieve this functionality, pages need to be placed on an LRU with
+lru_cache_add and mark_page_active needs to be called whenever the page
+is used.
+
+Pages are normally kept in a radix tree index by ->index. This tree
+maintains information about the PG_Dirty and PG_Writeback status of each
+page, so that pages with either of these flags can be found quickly.
+
+The Dirty tag is primarily used by mpage_writepages - the default
+->writepages method. It uses the tag to find dirty pages to call
+->writepage on. If mpage_writepages is not used (i.e. the address
+provides its own ->writepages) , the PAGECACHE_TAG_DIRTY tag is almost
+unused. write_inode_now and sync_inode do use it (through
+__sync_single_inode) to check if ->writepages has been successful in
+writing out the whole address_space.
+
+The Writeback tag is used by filemap*wait* and sync_page* functions, via
+filemap_fdatawait_range, to wait for all writeback to complete.
+
+An address_space handler may attach extra information to a page,
+typically using the 'private' field in the 'struct page'. If such
+information is attached, the PG_Private flag should be set. This will
+cause various VM routines to make extra calls into the address_space
+handler to deal with that data.
+
+An address space acts as an intermediate between storage and
+application. Data is read into the address space a whole page at a
+time, and provided to the application either by copying of the page, or
+by memory-mapping the page. Data is written into the address space by
+the application, and then written-back to storage typically in whole
+pages, however the address_space has finer control of write sizes.
+
+The read process essentially only requires 'readpage'. The write
+process is more complicated and uses write_begin/write_end or
+set_page_dirty to write data into the address_space, and writepage and
+writepages to writeback data to storage.
+
+Adding and removing pages to/from an address_space is protected by the
+inode's i_mutex.
+
+When data is written to a page, the PG_Dirty flag should be set. It
+typically remains set until writepage asks for it to be written. This
+should clear PG_Dirty and set PG_Writeback. It can be actually written
+at any point after PG_Dirty is clear. Once it is known to be safe,
+PG_Writeback is cleared.
+
+Writeback makes use of a writeback_control structure to direct the
+operations. This gives the the writepage and writepages operations some
+information about the nature of and reason for the writeback request,
+and the constraints under which it is being done. It is also used to
+return information back to the caller about the result of a writepage or
+writepages request.
+
+
+Handling errors during writeback
+--------------------------------
+
+Most applications that do buffered I/O will periodically call a file
+synchronization call (fsync, fdatasync, msync or sync_file_range) to
+ensure that data written has made it to the backing store. When there
+is an error during writeback, they expect that error to be reported when
+a file sync request is made. After an error has been reported on one
+request, subsequent requests on the same file descriptor should return
+0, unless further writeback errors have occurred since the previous file
+syncronization.
+
+Ideally, the kernel would report errors only on file descriptions on
+which writes were done that subsequently failed to be written back. The
+generic pagecache infrastructure does not track the file descriptions
+that have dirtied each individual page however, so determining which
+file descriptors should get back an error is not possible.
+
+Instead, the generic writeback error tracking infrastructure in the
+kernel settles for reporting errors to fsync on all file descriptions
+that were open at the time that the error occurred. In a situation with
+multiple writers, all of them will get back an error on a subsequent
+fsync, even if all of the writes done through that particular file
+descriptor succeeded (or even if there were no writes on that file
+descriptor at all).
+
+Filesystems that wish to use this infrastructure should call
+mapping_set_error to record the error in the address_space when it
+occurs. Then, after writing back data from the pagecache in their
+file->fsync operation, they should call file_check_and_advance_wb_err to
+ensure that the struct file's error cursor has advanced to the correct
+point in the stream of errors emitted by the backing device(s).
+
+
+struct address_space_operations
+-------------------------------
+
+This describes how the VFS can manipulate mapping of a file to page
+cache in your filesystem. The following members are defined:
+
+.. code-block:: c
+
+ struct address_space_operations {
+ int (*writepage)(struct page *page, struct writeback_control *wbc);
+ int (*readpage)(struct file *, struct page *);
+ int (*writepages)(struct address_space *, struct writeback_control *);
+ int (*set_page_dirty)(struct page *page);
+ int (*readpages)(struct file *filp, struct address_space *mapping,
+ struct list_head *pages, unsigned nr_pages);
+ int (*write_begin)(struct file *, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned flags,
+ struct page **pagep, void **fsdata);
+ int (*write_end)(struct file *, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *page, void *fsdata);
+ sector_t (*bmap)(struct address_space *, sector_t);
+ void (*invalidatepage) (struct page *, unsigned int, unsigned int);
+ int (*releasepage) (struct page *, int);
+ void (*freepage)(struct page *);
+ ssize_t (*direct_IO)(struct kiocb *, struct iov_iter *iter);
+ /* isolate a page for migration */
+ bool (*isolate_page) (struct page *, isolate_mode_t);
+ /* migrate the contents of a page to the specified target */
+ int (*migratepage) (struct page *, struct page *);
+ /* put migration-failed page back to right list */
+ void (*putback_page) (struct page *);
+ int (*launder_page) (struct page *);
+
+ int (*is_partially_uptodate) (struct page *, unsigned long,
+ unsigned long);
+ void (*is_dirty_writeback) (struct page *, bool *, bool *);
+ int (*error_remove_page) (struct mapping *mapping, struct page *page);
+ int (*swap_activate)(struct file *);
+ int (*swap_deactivate)(struct file *);
+ };
+
+``writepage``
+ called by the VM to write a dirty page to backing store. This
+ may happen for data integrity reasons (i.e. 'sync'), or to free
+ up memory (flush). The difference can be seen in
+ wbc->sync_mode. The PG_Dirty flag has been cleared and
+ PageLocked is true. writepage should start writeout, should set
+ PG_Writeback, and should make sure the page is unlocked, either
+ synchronously or asynchronously when the write operation
+ completes.
+
+ If wbc->sync_mode is WB_SYNC_NONE, ->writepage doesn't have to
+ try too hard if there are problems, and may choose to write out
+ other pages from the mapping if that is easier (e.g. due to
+ internal dependencies). If it chooses not to start writeout, it
+ should return AOP_WRITEPAGE_ACTIVATE so that the VM will not
+ keep calling ->writepage on that page.
+
+ See the file "Locking" for more details.
+
+``readpage``
+ called by the VM to read a page from backing store. The page
+ will be Locked when readpage is called, and should be unlocked
+ and marked uptodate once the read completes. If ->readpage
+ discovers that it needs to unlock the page for some reason, it
+ can do so, and then return AOP_TRUNCATED_PAGE. In this case,
+ the page will be relocated, relocked and if that all succeeds,
+ ->readpage will be called again.
+
+``writepages``
+ called by the VM to write out pages associated with the
+ address_space object. If wbc->sync_mode is WBC_SYNC_ALL, then
+ the writeback_control will specify a range of pages that must be
+ written out. If it is WBC_SYNC_NONE, then a nr_to_write is
+ given and that many pages should be written if possible. If no
+ ->writepages is given, then mpage_writepages is used instead.
+ This will choose pages from the address space that are tagged as
+ DIRTY and will pass them to ->writepage.
+
+``set_page_dirty``
+ called by the VM to set a page dirty. This is particularly
+ needed if an address space attaches private data to a page, and
+ that data needs to be updated when a page is dirtied. This is
+ called, for example, when a memory mapped page gets modified.
+ If defined, it should set the PageDirty flag, and the
+ PAGECACHE_TAG_DIRTY tag in the radix tree.
+
+``readpages``
+ called by the VM to read pages associated with the address_space
+ object. This is essentially just a vector version of readpage.
+ Instead of just one page, several pages are requested.
+ readpages is only used for read-ahead, so read errors are
+ ignored. If anything goes wrong, feel free to give up.
+
+``write_begin``
+ Called by the generic buffered write code to ask the filesystem
+ to prepare to write len bytes at the given offset in the file.
+ The address_space should check that the write will be able to
+ complete, by allocating space if necessary and doing any other
+ internal housekeeping. If the write will update parts of any
+ basic-blocks on storage, then those blocks should be pre-read
+ (if they haven't been read already) so that the updated blocks
+ can be written out properly.
+
+ The filesystem must return the locked pagecache page for the
+ specified offset, in ``*pagep``, for the caller to write into.
+
+ It must be able to cope with short writes (where the length
+ passed to write_begin is greater than the number of bytes copied
+ into the page).
+
+ flags is a field for AOP_FLAG_xxx flags, described in
+ include/linux/fs.h.
+
+ A void * may be returned in fsdata, which then gets passed into
+ write_end.
+
+ Returns 0 on success; < 0 on failure (which is the error code),
+ in which case write_end is not called.
+
+``write_end``
+ After a successful write_begin, and data copy, write_end must be
+ called. len is the original len passed to write_begin, and
+ copied is the amount that was able to be copied.
+
+ The filesystem must take care of unlocking the page and
+ releasing it refcount, and updating i_size.
+
+ Returns < 0 on failure, otherwise the number of bytes (<=
+ 'copied') that were able to be copied into pagecache.
+
+``bmap``
+ called by the VFS to map a logical block offset within object to
+ physical block number. This method is used by the FIBMAP ioctl
+ and for working with swap-files. To be able to swap to a file,
+ the file must have a stable mapping to a block device. The swap
+ system does not go through the filesystem but instead uses bmap
+ to find out where the blocks in the file are and uses those
+ addresses directly.
+
+``invalidatepage``
+ If a page has PagePrivate set, then invalidatepage will be
+ called when part or all of the page is to be removed from the
+ address space. This generally corresponds to either a
+ truncation, punch hole or a complete invalidation of the address
+ space (in the latter case 'offset' will always be 0 and 'length'
+ will be PAGE_SIZE). Any private data associated with the page
+ should be updated to reflect this truncation. If offset is 0
+ and length is PAGE_SIZE, then the private data should be
+ released, because the page must be able to be completely
+ discarded. This may be done by calling the ->releasepage
+ function, but in this case the release MUST succeed.
+
+``releasepage``
+ releasepage is called on PagePrivate pages to indicate that the
+ page should be freed if possible. ->releasepage should remove
+ any private data from the page and clear the PagePrivate flag.
+ If releasepage() fails for some reason, it must indicate failure
+ with a 0 return value. releasepage() is used in two distinct
+ though related cases. The first is when the VM finds a clean
+ page with no active users and wants to make it a free page. If
+ ->releasepage succeeds, the page will be removed from the
+ address_space and become free.
+
+ The second case is when a request has been made to invalidate
+ some or all pages in an address_space. This can happen through
+ the fadvise(POSIX_FADV_DONTNEED) system call or by the
+ filesystem explicitly requesting it as nfs and 9fs do (when they
+ believe the cache may be out of date with storage) by calling
+ invalidate_inode_pages2(). If the filesystem makes such a call,
+ and needs to be certain that all pages are invalidated, then its
+ releasepage will need to ensure this. Possibly it can clear the
+ PageUptodate bit if it cannot free private data yet.
+
+``freepage``
+ freepage is called once the page is no longer visible in the
+ page cache in order to allow the cleanup of any private data.
+ Since it may be called by the memory reclaimer, it should not
+ assume that the original address_space mapping still exists, and
+ it should not block.
+
+``direct_IO``
+ called by the generic read/write routines to perform direct_IO -
+ that is IO requests which bypass the page cache and transfer
+ data directly between the storage and the application's address
+ space.
+
+``isolate_page``
+ Called by the VM when isolating a movable non-lru page. If page
+ is successfully isolated, VM marks the page as PG_isolated via
+ __SetPageIsolated.
+
+``migrate_page``
+ This is used to compact the physical memory usage. If the VM
+ wants to relocate a page (maybe off a memory card that is
+ signalling imminent failure) it will pass a new page and an old
+ page to this function. migrate_page should transfer any private
+ data across and update any references that it has to the page.
+
+``putback_page``
+ Called by the VM when isolated page's migration fails.
+
+``launder_page``
+ Called before freeing a page - it writes back the dirty page.
+ To prevent redirtying the page, it is kept locked during the
+ whole operation.
+
+``is_partially_uptodate``
+ Called by the VM when reading a file through the pagecache when
+ the underlying blocksize != pagesize. If the required block is
+ up to date then the read can complete without needing the IO to
+ bring the whole page up to date.
+
+``is_dirty_writeback``
+ Called by the VM when attempting to reclaim a page. The VM uses
+ dirty and writeback information to determine if it needs to
+ stall to allow flushers a chance to complete some IO.
+ Ordinarily it can use PageDirty and PageWriteback but some
+ filesystems have more complex state (unstable pages in NFS
+ prevent reclaim) or do not set those flags due to locking
+ problems. This callback allows a filesystem to indicate to the
+ VM if a page should be treated as dirty or writeback for the
+ purposes of stalling.
+
+``error_remove_page``
+ normally set to generic_error_remove_page if truncation is ok
+ for this address space. Used for memory failure handling.
+ Setting this implies you deal with pages going away under you,
+ unless you have them locked or reference counts increased.
+
+``swap_activate``
+ Called when swapon is used on a file to allocate space if
+ necessary and pin the block lookup information in memory. A
+ return value of zero indicates success, in which case this file
+ can be used to back swapspace.
+
+``swap_deactivate``
+ Called during swapoff on files where swap_activate was
+ successful.
+
+
+The File Object
+===============
+
+A file object represents a file opened by a process. This is also known
+as an "open file description" in POSIX parlance.
+
+
+struct file_operations
+----------------------
+
+This describes how the VFS can manipulate an open file. As of kernel
+4.18, the following members are defined:
+
+.. code-block:: c
+
+ struct file_operations {
+ struct module *owner;
+ loff_t (*llseek) (struct file *, loff_t, int);
+ ssize_t (*read) (struct file *, char __user *, size_t, loff_t *);
+ ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
+ ssize_t (*read_iter) (struct kiocb *, struct iov_iter *);
+ ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
+ int (*iopoll)(struct kiocb *kiocb, bool spin);
+ int (*iterate) (struct file *, struct dir_context *);
+ int (*iterate_shared) (struct file *, struct dir_context *);
+ __poll_t (*poll) (struct file *, struct poll_table_struct *);
+ long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
+ long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
+ int (*mmap) (struct file *, struct vm_area_struct *);
+ int (*open) (struct inode *, struct file *);
+ int (*flush) (struct file *, fl_owner_t id);
+ int (*release) (struct inode *, struct file *);
+ int (*fsync) (struct file *, loff_t, loff_t, int datasync);
+ int (*fasync) (int, struct file *, int);
+ int (*lock) (struct file *, int, struct file_lock *);
+ ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int);
+ unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
+ int (*check_flags)(int);
+ int (*flock) (struct file *, int, struct file_lock *);
+ ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int);
+ ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int);
+ int (*setlease)(struct file *, long, struct file_lock **, void **);
+ long (*fallocate)(struct file *file, int mode, loff_t offset,
+ loff_t len);
+ void (*show_fdinfo)(struct seq_file *m, struct file *f);
+ #ifndef CONFIG_MMU
+ unsigned (*mmap_capabilities)(struct file *);
+ #endif
+ ssize_t (*copy_file_range)(struct file *, loff_t, struct file *, loff_t, size_t, unsigned int);
+ loff_t (*remap_file_range)(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ loff_t len, unsigned int remap_flags);
+ int (*fadvise)(struct file *, loff_t, loff_t, int);
+ };
+
+Again, all methods are called without any locks being held, unless
+otherwise noted.
+
+``llseek``
+ called when the VFS needs to move the file position index
+
+``read``
+ called by read(2) and related system calls
+
+``read_iter``
+ possibly asynchronous read with iov_iter as destination
+
+``write``
+ called by write(2) and related system calls
+
+``write_iter``
+ possibly asynchronous write with iov_iter as source
+
+``iopoll``
+ called when aio wants to poll for completions on HIPRI iocbs
+
+``iterate``
+ called when the VFS needs to read the directory contents
+
+``iterate_shared``
+ called when the VFS needs to read the directory contents when
+ filesystem supports concurrent dir iterators
+
+``poll``
+ called by the VFS when a process wants to check if there is
+ activity on this file and (optionally) go to sleep until there
+ is activity. Called by the select(2) and poll(2) system calls
+
+``unlocked_ioctl``
+ called by the ioctl(2) system call.
+
+``compat_ioctl``
+ called by the ioctl(2) system call when 32 bit system calls are
+ used on 64 bit kernels.
+
+``mmap``
+ called by the mmap(2) system call
+
+``open``
+ called by the VFS when an inode should be opened. When the VFS
+ opens a file, it creates a new "struct file". It then calls the
+ open method for the newly allocated file structure. You might
+ think that the open method really belongs in "struct
+ inode_operations", and you may be right. I think it's done the
+ way it is because it makes filesystems simpler to implement.
+ The open() method is a good place to initialize the
+ "private_data" member in the file structure if you want to point
+ to a device structure
+
+``flush``
+ called by the close(2) system call to flush a file
+
+``release``
+ called when the last reference to an open file is closed
+
+``fsync``
+ called by the fsync(2) system call. Also see the section above
+ entitled "Handling errors during writeback".
+
+``fasync``
+ called by the fcntl(2) system call when asynchronous
+ (non-blocking) mode is enabled for a file
+
+``lock``
+ called by the fcntl(2) system call for F_GETLK, F_SETLK, and
+ F_SETLKW commands
+
+``get_unmapped_area``
+ called by the mmap(2) system call
+
+``check_flags``
+ called by the fcntl(2) system call for F_SETFL command
+
+``flock``
+ called by the flock(2) system call
+
+``splice_write``
+ called by the VFS to splice data from a pipe to a file. This
+ method is used by the splice(2) system call
+
+``splice_read``
+ called by the VFS to splice data from file to a pipe. This
+ method is used by the splice(2) system call
+
+``setlease``
+ called by the VFS to set or release a file lock lease. setlease
+ implementations should call generic_setlease to record or remove
+ the lease in the inode after setting it.
+
+``fallocate``
+ called by the VFS to preallocate blocks or punch a hole.
+
+``copy_file_range``
+ called by the copy_file_range(2) system call.
+
+``remap_file_range``
+ called by the ioctl(2) system call for FICLONERANGE and FICLONE
+ and FIDEDUPERANGE commands to remap file ranges. An
+ implementation should remap len bytes at pos_in of the source
+ file into the dest file at pos_out. Implementations must handle
+ callers passing in len == 0; this means "remap to the end of the
+ source file". The return value should the number of bytes
+ remapped, or the usual negative error code if errors occurred
+ before any bytes were remapped. The remap_flags parameter
+ accepts REMAP_FILE_* flags. If REMAP_FILE_DEDUP is set then the
+ implementation must only remap if the requested file ranges have
+ identical contents. If REMAP_CAN_SHORTEN is set, the caller is
+ ok with the implementation shortening the request length to
+ satisfy alignment or EOF requirements (or any other reason).
+
+``fadvise``
+ possibly called by the fadvise64() system call.
+
+Note that the file operations are implemented by the specific
+filesystem in which the inode resides. When opening a device node
+(character or block special) most filesystems will call special
+support routines in the VFS which will locate the required device
+driver information. These support routines replace the filesystem file
+operations with those for the device driver, and then proceed to call
+the new open() method for the file. This is how opening a device file
+in the filesystem eventually ends up calling the device driver open()
+method.
+
+
+Directory Entry Cache (dcache)
+==============================
+
+
+struct dentry_operations
+------------------------
+
+This describes how a filesystem can overload the standard dentry
+operations. Dentries and the dcache are the domain of the VFS and the
+individual filesystem implementations. Device drivers have no business
+here. These methods may be set to NULL, as they are either optional or
+the VFS uses a default. As of kernel 2.6.22, the following members are
+defined:
+
+.. code-block:: c
+
+ struct dentry_operations {
+ int (*d_revalidate)(struct dentry *, unsigned int);
+ int (*d_weak_revalidate)(struct dentry *, unsigned int);
+ int (*d_hash)(const struct dentry *, struct qstr *);
+ int (*d_compare)(const struct dentry *,
+ unsigned int, const char *, const struct qstr *);
+ int (*d_delete)(const struct dentry *);
+ int (*d_init)(struct dentry *);
+ void (*d_release)(struct dentry *);
+ void (*d_iput)(struct dentry *, struct inode *);
+ char *(*d_dname)(struct dentry *, char *, int);
+ struct vfsmount *(*d_automount)(struct path *);
+ int (*d_manage)(const struct path *, bool);
+ struct dentry *(*d_real)(struct dentry *, const struct inode *);
+ };
+
+``d_revalidate``
+ called when the VFS needs to revalidate a dentry. This is
+ called whenever a name look-up finds a dentry in the dcache.
+ Most local filesystems leave this as NULL, because all their
+ dentries in the dcache are valid. Network filesystems are
+ different since things can change on the server without the
+ client necessarily being aware of it.
+
+ This function should return a positive value if the dentry is
+ still valid, and zero or a negative error code if it isn't.
+
+ d_revalidate may be called in rcu-walk mode (flags &
+ LOOKUP_RCU). If in rcu-walk mode, the filesystem must
+ revalidate the dentry without blocking or storing to the dentry,
+ d_parent and d_inode should not be used without care (because
+ they can change and, in d_inode case, even become NULL under
+ us).
+
+ If a situation is encountered that rcu-walk cannot handle,
+ return
+ -ECHILD and it will be called again in ref-walk mode.
+
+``_weak_revalidate``
+ called when the VFS needs to revalidate a "jumped" dentry. This
+ is called when a path-walk ends at dentry that was not acquired
+ by doing a lookup in the parent directory. This includes "/",
+ "." and "..", as well as procfs-style symlinks and mountpoint
+ traversal.
+
+ In this case, we are less concerned with whether the dentry is
+ still fully correct, but rather that the inode is still valid.
+ As with d_revalidate, most local filesystems will set this to
+ NULL since their dcache entries are always valid.
+
+ This function has the same return code semantics as
+ d_revalidate.
+
+ d_weak_revalidate is only called after leaving rcu-walk mode.
+
+``d_hash``
+ called when the VFS adds a dentry to the hash table. The first
+ dentry passed to d_hash is the parent directory that the name is
+ to be hashed into.
+
+ Same locking and synchronisation rules as d_compare regarding
+ what is safe to dereference etc.
+
+``d_compare``
+ called to compare a dentry name with a given name. The first
+ dentry is the parent of the dentry to be compared, the second is
+ the child dentry. len and name string are properties of the
+ dentry to be compared. qstr is the name to compare it with.
+
+ Must be constant and idempotent, and should not take locks if
+ possible, and should not or store into the dentry. Should not
+ dereference pointers outside the dentry without lots of care
+ (eg. d_parent, d_inode, d_name should not be used).
+
+ However, our vfsmount is pinned, and RCU held, so the dentries
+ and inodes won't disappear, neither will our sb or filesystem
+ module. ->d_sb may be used.
+
+ It is a tricky calling convention because it needs to be called
+ under "rcu-walk", ie. without any locks or references on things.
+
+``d_delete``
+ called when the last reference to a dentry is dropped and the
+ dcache is deciding whether or not to cache it. Return 1 to
+ delete immediately, or 0 to cache the dentry. Default is NULL
+ which means to always cache a reachable dentry. d_delete must
+ be constant and idempotent.
+
+``d_init``
+ called when a dentry is allocated
+
+``d_release``
+ called when a dentry is really deallocated
+
+``d_iput``
+ called when a dentry loses its inode (just prior to its being
+ deallocated). The default when this is NULL is that the VFS
+ calls iput(). If you define this method, you must call iput()
+ yourself
+
+``d_dname``
+ called when the pathname of a dentry should be generated.
+ Useful for some pseudo filesystems (sockfs, pipefs, ...) to
+ delay pathname generation. (Instead of doing it when dentry is
+ created, it's done only when the path is needed.). Real
+ filesystems probably dont want to use it, because their dentries
+ are present in global dcache hash, so their hash should be an
+ invariant. As no lock is held, d_dname() should not try to
+ modify the dentry itself, unless appropriate SMP safety is used.
+ CAUTION : d_path() logic is quite tricky. The correct way to
+ return for example "Hello" is to put it at the end of the
+ buffer, and returns a pointer to the first char.
+ dynamic_dname() helper function is provided to take care of
+ this.
+
+ Example :
+
+.. code-block:: c
+
+ static char *pipefs_dname(struct dentry *dent, char *buffer, int buflen)
+ {
+ return dynamic_dname(dentry, buffer, buflen, "pipe:[%lu]",
+ dentry->d_inode->i_ino);
+ }
+
+``d_automount``
+ called when an automount dentry is to be traversed (optional).
+ This should create a new VFS mount record and return the record
+ to the caller. The caller is supplied with a path parameter
+ giving the automount directory to describe the automount target
+ and the parent VFS mount record to provide inheritable mount
+ parameters. NULL should be returned if someone else managed to
+ make the automount first. If the vfsmount creation failed, then
+ an error code should be returned. If -EISDIR is returned, then
+ the directory will be treated as an ordinary directory and
+ returned to pathwalk to continue walking.
+
+ If a vfsmount is returned, the caller will attempt to mount it
+ on the mountpoint and will remove the vfsmount from its
+ expiration list in the case of failure. The vfsmount should be
+ returned with 2 refs on it to prevent automatic expiration - the
+ caller will clean up the additional ref.
+
+ This function is only used if DCACHE_NEED_AUTOMOUNT is set on
+ the dentry. This is set by __d_instantiate() if S_AUTOMOUNT is
+ set on the inode being added.
+
+``d_manage``
+ called to allow the filesystem to manage the transition from a
+ dentry (optional). This allows autofs, for example, to hold up
+ clients waiting to explore behind a 'mountpoint' while letting
+ the daemon go past and construct the subtree there. 0 should be
+ returned to let the calling process continue. -EISDIR can be
+ returned to tell pathwalk to use this directory as an ordinary
+ directory and to ignore anything mounted on it and not to check
+ the automount flag. Any other error code will abort pathwalk
+ completely.
+
+ If the 'rcu_walk' parameter is true, then the caller is doing a
+ pathwalk in RCU-walk mode. Sleeping is not permitted in this
+ mode, and the caller can be asked to leave it and call again by
+ returning -ECHILD. -EISDIR may also be returned to tell
+ pathwalk to ignore d_automount or any mounts.
+
+ This function is only used if DCACHE_MANAGE_TRANSIT is set on
+ the dentry being transited from.
+
+``d_real``
+ overlay/union type filesystems implement this method to return
+ one of the underlying dentries hidden by the overlay. It is
+ used in two different modes:
+
+ Called from file_dentry() it returns the real dentry matching
+ the inode argument. The real dentry may be from a lower layer
+ already copied up, but still referenced from the file. This
+ mode is selected with a non-NULL inode argument.
+
+ With NULL inode the topmost real underlying dentry is returned.
+
+Each dentry has a pointer to its parent dentry, as well as a hash list
+of child dentries. Child dentries are basically like files in a
+directory.
+
+
+Directory Entry Cache API
+--------------------------
+
+There are a number of functions defined which permit a filesystem to
+manipulate dentries:
+
+``dget``
+ open a new handle for an existing dentry (this just increments
+ the usage count)
+
+``dput``
+ close a handle for a dentry (decrements the usage count). If
+ the usage count drops to 0, and the dentry is still in its
+ parent's hash, the "d_delete" method is called to check whether
+ it should be cached. If it should not be cached, or if the
+ dentry is not hashed, it is deleted. Otherwise cached dentries
+ are put into an LRU list to be reclaimed on memory shortage.
+
+``d_drop``
+ this unhashes a dentry from its parents hash list. A subsequent
+ call to dput() will deallocate the dentry if its usage count
+ drops to 0
+
+``d_delete``
+ delete a dentry. If there are no other open references to the
+ dentry then the dentry is turned into a negative dentry (the
+ d_iput() method is called). If there are other references, then
+ d_drop() is called instead
+
+``d_add``
+ add a dentry to its parents hash list and then calls
+ d_instantiate()
+
+``d_instantiate``
+ add a dentry to the alias hash list for the inode and updates
+ the "d_inode" member. The "i_count" member in the inode
+ structure should be set/incremented. If the inode pointer is
+ NULL, the dentry is called a "negative dentry". This function
+ is commonly called when an inode is created for an existing
+ negative dentry
+
+``d_lookup``
+ look up a dentry given its parent and path name component It
+ looks up the child of that given name from the dcache hash
+ table. If it is found, the reference count is incremented and
+ the dentry is returned. The caller must use dput() to free the
+ dentry when it finishes using it.
+
+
+Mount Options
+=============
+
+
+Parsing options
+---------------
+
+On mount and remount the filesystem is passed a string containing a
+comma separated list of mount options. The options can have either of
+these forms:
+
+ option
+ option=value
+
+The <linux/parser.h> header defines an API that helps parse these
+options. There are plenty of examples on how to use it in existing
+filesystems.
+
+
+Showing options
+---------------
+
+If a filesystem accepts mount options, it must define show_options() to
+show all the currently active options. The rules are:
+
+ - options MUST be shown which are not default or their values differ
+ from the default
+
+ - options MAY be shown which are enabled by default or have their
+ default value
+
+Options used only internally between a mount helper and the kernel (such
+as file descriptors), or which only have an effect during the mounting
+(such as ones controlling the creation of a journal) are exempt from the
+above rules.
+
+The underlying reason for the above rules is to make sure, that a mount
+can be accurately replicated (e.g. umounting and mounting again) based
+on the information found in /proc/mounts.
+
+
+Resources
+=========
+
+(Note some of these resources are not up-to-date with the latest kernel
+ version.)
+
+Creating Linux virtual filesystems. 2002
+ <http://lwn.net/Articles/13325/>
+
+The Linux Virtual File-system Layer by Neil Brown. 1999
+ <http://www.cse.unsw.edu.au/~neilb/oss/linux-commentary/vfs.html>
+
+A tour of the Linux VFS by Michael K. Johnson. 1996
+ <http://www.tldp.org/LDP/khg/HyperNews/get/fs/vfstour.html>
+
+A small trail through the Linux kernel by Andries Brouwer. 2001
+ <http://www.win.tue.nl/~aeb/linux/vfs/trail.html>
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
deleted file mode 100644
index 8dc8e9c2913f..000000000000
--- a/Documentation/filesystems/vfs.txt
+++ /dev/null
@@ -1,1261 +0,0 @@
-
- Overview of the Linux Virtual File System
-
- Original author: Richard Gooch <rgooch@atnf.csiro.au>
-
- Last updated on June 24, 2007.
-
- Copyright (C) 1999 Richard Gooch
- Copyright (C) 2005 Pekka Enberg
-
- This file is released under the GPLv2.
-
-
-Introduction
-============
-
-The Virtual File System (also known as the Virtual Filesystem Switch)
-is the software layer in the kernel that provides the filesystem
-interface to userspace programs. It also provides an abstraction
-within the kernel which allows different filesystem implementations to
-coexist.
-
-VFS system calls open(2), stat(2), read(2), write(2), chmod(2) and so
-on are called from a process context. Filesystem locking is described
-in the document Documentation/filesystems/Locking.
-
-
-Directory Entry Cache (dcache)
-------------------------------
-
-The VFS implements the open(2), stat(2), chmod(2), and similar system
-calls. The pathname argument that is passed to them is used by the VFS
-to search through the directory entry cache (also known as the dentry
-cache or dcache). This provides a very fast look-up mechanism to
-translate a pathname (filename) into a specific dentry. Dentries live
-in RAM and are never saved to disc: they exist only for performance.
-
-The dentry cache is meant to be a view into your entire filespace. As
-most computers cannot fit all dentries in the RAM at the same time,
-some bits of the cache are missing. In order to resolve your pathname
-into a dentry, the VFS may have to resort to creating dentries along
-the way, and then loading the inode. This is done by looking up the
-inode.
-
-
-The Inode Object
-----------------
-
-An individual dentry usually has a pointer to an inode. Inodes are
-filesystem objects such as regular files, directories, FIFOs and other
-beasts. They live either on the disc (for block device filesystems)
-or in the memory (for pseudo filesystems). Inodes that live on the
-disc are copied into the memory when required and changes to the inode
-are written back to disc. A single inode can be pointed to by multiple
-dentries (hard links, for example, do this).
-
-To look up an inode requires that the VFS calls the lookup() method of
-the parent directory inode. This method is installed by the specific
-filesystem implementation that the inode lives in. Once the VFS has
-the required dentry (and hence the inode), we can do all those boring
-things like open(2) the file, or stat(2) it to peek at the inode
-data. The stat(2) operation is fairly simple: once the VFS has the
-dentry, it peeks at the inode data and passes some of it back to
-userspace.
-
-
-The File Object
----------------
-
-Opening a file requires another operation: allocation of a file
-structure (this is the kernel-side implementation of file
-descriptors). The freshly allocated file structure is initialized with
-a pointer to the dentry and a set of file operation member functions.
-These are taken from the inode data. The open() file method is then
-called so the specific filesystem implementation can do its work. You
-can see that this is another switch performed by the VFS. The file
-structure is placed into the file descriptor table for the process.
-
-Reading, writing and closing files (and other assorted VFS operations)
-is done by using the userspace file descriptor to grab the appropriate
-file structure, and then calling the required file structure method to
-do whatever is required. For as long as the file is open, it keeps the
-dentry in use, which in turn means that the VFS inode is still in use.
-
-
-Registering and Mounting a Filesystem
-=====================================
-
-To register and unregister a filesystem, use the following API
-functions:
-
- #include <linux/fs.h>
-
- extern int register_filesystem(struct file_system_type *);
- extern int unregister_filesystem(struct file_system_type *);
-
-The passed struct file_system_type describes your filesystem. When a
-request is made to mount a filesystem onto a directory in your namespace,
-the VFS will call the appropriate mount() method for the specific
-filesystem. New vfsmount referring to the tree returned by ->mount()
-will be attached to the mountpoint, so that when pathname resolution
-reaches the mountpoint it will jump into the root of that vfsmount.
-
-You can see all filesystems that are registered to the kernel in the
-file /proc/filesystems.
-
-
-struct file_system_type
------------------------
-
-This describes the filesystem. As of kernel 2.6.39, the following
-members are defined:
-
-struct file_system_type {
- const char *name;
- int fs_flags;
- struct dentry *(*mount) (struct file_system_type *, int,
- const char *, void *);
- void (*kill_sb) (struct super_block *);
- struct module *owner;
- struct file_system_type * next;
- struct list_head fs_supers;
- struct lock_class_key s_lock_key;
- struct lock_class_key s_umount_key;
-};
-
- name: the name of the filesystem type, such as "ext2", "iso9660",
- "msdos" and so on
-
- fs_flags: various flags (i.e. FS_REQUIRES_DEV, FS_NO_DCACHE, etc.)
-
- mount: the method to call when a new instance of this
- filesystem should be mounted
-
- kill_sb: the method to call when an instance of this filesystem
- should be shut down
-
- owner: for internal VFS use: you should initialize this to THIS_MODULE in
- most cases.
-
- next: for internal VFS use: you should initialize this to NULL
-
- s_lock_key, s_umount_key: lockdep-specific
-
-The mount() method has the following arguments:
-
- struct file_system_type *fs_type: describes the filesystem, partly initialized
- by the specific filesystem code
-
- int flags: mount flags
-
- const char *dev_name: the device name we are mounting.
-
- void *data: arbitrary mount options, usually comes as an ASCII
- string (see "Mount Options" section)
-
-The mount() method must return the root dentry of the tree requested by
-caller. An active reference to its superblock must be grabbed and the
-superblock must be locked. On failure it should return ERR_PTR(error).
-
-The arguments match those of mount(2) and their interpretation
-depends on filesystem type. E.g. for block filesystems, dev_name is
-interpreted as block device name, that device is opened and if it
-contains a suitable filesystem image the method creates and initializes
-struct super_block accordingly, returning its root dentry to caller.
-
-->mount() may choose to return a subtree of existing filesystem - it
-doesn't have to create a new one. The main result from the caller's
-point of view is a reference to dentry at the root of (sub)tree to
-be attached; creation of new superblock is a common side effect.
-
-The most interesting member of the superblock structure that the
-mount() method fills in is the "s_op" field. This is a pointer to
-a "struct super_operations" which describes the next level of the
-filesystem implementation.
-
-Usually, a filesystem uses one of the generic mount() implementations
-and provides a fill_super() callback instead. The generic variants are:
-
- mount_bdev: mount a filesystem residing on a block device
-
- mount_nodev: mount a filesystem that is not backed by a device
-
- mount_single: mount a filesystem which shares the instance between
- all mounts
-
-A fill_super() callback implementation has the following arguments:
-
- struct super_block *sb: the superblock structure. The callback
- must initialize this properly.
-
- void *data: arbitrary mount options, usually comes as an ASCII
- string (see "Mount Options" section)
-
- int silent: whether or not to be silent on error
-
-
-The Superblock Object
-=====================
-
-A superblock object represents a mounted filesystem.
-
-
-struct super_operations
------------------------
-
-This describes how the VFS can manipulate the superblock of your
-filesystem. As of kernel 2.6.22, the following members are defined:
-
-struct super_operations {
- struct inode *(*alloc_inode)(struct super_block *sb);
- void (*destroy_inode)(struct inode *);
-
- void (*dirty_inode) (struct inode *, int flags);
- int (*write_inode) (struct inode *, int);
- void (*drop_inode) (struct inode *);
- void (*delete_inode) (struct inode *);
- void (*put_super) (struct super_block *);
- int (*sync_fs)(struct super_block *sb, int wait);
- int (*freeze_fs) (struct super_block *);
- int (*unfreeze_fs) (struct super_block *);
- int (*statfs) (struct dentry *, struct kstatfs *);
- int (*remount_fs) (struct super_block *, int *, char *);
- void (*clear_inode) (struct inode *);
- void (*umount_begin) (struct super_block *);
-
- int (*show_options)(struct seq_file *, struct dentry *);
-
- ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t);
- ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t);
- int (*nr_cached_objects)(struct super_block *);
- void (*free_cached_objects)(struct super_block *, int);
-};
-
-All methods are called without any locks being held, unless otherwise
-noted. This means that most methods can block safely. All methods are
-only called from a process context (i.e. not from an interrupt handler
-or bottom half).
-
- alloc_inode: this method is called by alloc_inode() to allocate memory
- for struct inode and initialize it. If this function is not
- defined, a simple 'struct inode' is allocated. Normally
- alloc_inode will be used to allocate a larger structure which
- contains a 'struct inode' embedded within it.
-
- destroy_inode: this method is called by destroy_inode() to release
- resources allocated for struct inode. It is only required if
- ->alloc_inode was defined and simply undoes anything done by
- ->alloc_inode.
-
- dirty_inode: this method is called by the VFS to mark an inode dirty.
-
- write_inode: this method is called when the VFS needs to write an
- inode to disc. The second parameter indicates whether the write
- should be synchronous or not, not all filesystems check this flag.
-
- drop_inode: called when the last access to the inode is dropped,
- with the inode->i_lock spinlock held.
-
- This method should be either NULL (normal UNIX filesystem
- semantics) or "generic_delete_inode" (for filesystems that do not
- want to cache inodes - causing "delete_inode" to always be
- called regardless of the value of i_nlink)
-
- The "generic_delete_inode()" behavior is equivalent to the
- old practice of using "force_delete" in the put_inode() case,
- but does not have the races that the "force_delete()" approach
- had.
-
- delete_inode: called when the VFS wants to delete an inode
-
- put_super: called when the VFS wishes to free the superblock
- (i.e. unmount). This is called with the superblock lock held
-
- sync_fs: called when VFS is writing out all dirty data associated with
- a superblock. The second parameter indicates whether the method
- should wait until the write out has been completed. Optional.
-
- freeze_fs: called when VFS is locking a filesystem and
- forcing it into a consistent state. This method is currently
- used by the Logical Volume Manager (LVM).
-
- unfreeze_fs: called when VFS is unlocking a filesystem and making it writable
- again.
-
- statfs: called when the VFS needs to get filesystem statistics.
-
- remount_fs: called when the filesystem is remounted. This is called
- with the kernel lock held
-
- clear_inode: called then the VFS clears the inode. Optional
-
- umount_begin: called when the VFS is unmounting a filesystem.
-
- show_options: called by the VFS to show mount options for
- /proc/<pid>/mounts. (see "Mount Options" section)
-
- quota_read: called by the VFS to read from filesystem quota file.
-
- quota_write: called by the VFS to write to filesystem quota file.
-
- nr_cached_objects: called by the sb cache shrinking function for the
- filesystem to return the number of freeable cached objects it contains.
- Optional.
-
- free_cache_objects: called by the sb cache shrinking function for the
- filesystem to scan the number of objects indicated to try to free them.
- Optional, but any filesystem implementing this method needs to also
- implement ->nr_cached_objects for it to be called correctly.
-
- We can't do anything with any errors that the filesystem might
- encountered, hence the void return type. This will never be called if
- the VM is trying to reclaim under GFP_NOFS conditions, hence this
- method does not need to handle that situation itself.
-
- Implementations must include conditional reschedule calls inside any
- scanning loop that is done. This allows the VFS to determine
- appropriate scan batch sizes without having to worry about whether
- implementations will cause holdoff problems due to large scan batch
- sizes.
-
-Whoever sets up the inode is responsible for filling in the "i_op" field. This
-is a pointer to a "struct inode_operations" which describes the methods that
-can be performed on individual inodes.
-
-struct xattr_handlers
----------------------
-
-On filesystems that support extended attributes (xattrs), the s_xattr
-superblock field points to a NULL-terminated array of xattr handlers. Extended
-attributes are name:value pairs.
-
- name: Indicates that the handler matches attributes with the specified name
- (such as "system.posix_acl_access"); the prefix field must be NULL.
-
- prefix: Indicates that the handler matches all attributes with the specified
- name prefix (such as "user."); the name field must be NULL.
-
- list: Determine if attributes matching this xattr handler should be listed
- for a particular dentry. Used by some listxattr implementations like
- generic_listxattr.
-
- get: Called by the VFS to get the value of a particular extended attribute.
- This method is called by the getxattr(2) system call.
-
- set: Called by the VFS to set the value of a particular extended attribute.
- When the new value is NULL, called to remove a particular extended
- attribute. This method is called by the the setxattr(2) and
- removexattr(2) system calls.
-
-When none of the xattr handlers of a filesystem match the specified attribute
-name or when a filesystem doesn't support extended attributes, the various
-*xattr(2) system calls return -EOPNOTSUPP.
-
-
-The Inode Object
-================
-
-An inode object represents an object within the filesystem.
-
-
-struct inode_operations
------------------------
-
-This describes how the VFS can manipulate an inode in your
-filesystem. As of kernel 2.6.22, the following members are defined:
-
-struct inode_operations {
- int (*create) (struct inode *,struct dentry *, umode_t, bool);
- struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int);
- int (*link) (struct dentry *,struct inode *,struct dentry *);
- int (*unlink) (struct inode *,struct dentry *);
- int (*symlink) (struct inode *,struct dentry *,const char *);
- int (*mkdir) (struct inode *,struct dentry *,umode_t);
- int (*rmdir) (struct inode *,struct dentry *);
- int (*mknod) (struct inode *,struct dentry *,umode_t,dev_t);
- int (*rename) (struct inode *, struct dentry *,
- struct inode *, struct dentry *, unsigned int);
- int (*readlink) (struct dentry *, char __user *,int);
- const char *(*get_link) (struct dentry *, struct inode *,
- struct delayed_call *);
- int (*permission) (struct inode *, int);
- int (*get_acl)(struct inode *, int);
- int (*setattr) (struct dentry *, struct iattr *);
- int (*getattr) (const struct path *, struct kstat *, u32, unsigned int);
- ssize_t (*listxattr) (struct dentry *, char *, size_t);
- void (*update_time)(struct inode *, struct timespec *, int);
- int (*atomic_open)(struct inode *, struct dentry *, struct file *,
- unsigned open_flag, umode_t create_mode);
- int (*tmpfile) (struct inode *, struct dentry *, umode_t);
-};
-
-Again, all methods are called without any locks being held, unless
-otherwise noted.
-
- create: called by the open(2) and creat(2) system calls. Only
- required if you want to support regular files. The dentry you
- get should not have an inode (i.e. it should be a negative
- dentry). Here you will probably call d_instantiate() with the
- dentry and the newly created inode
-
- lookup: called when the VFS needs to look up an inode in a parent
- directory. The name to look for is found in the dentry. This
- method must call d_add() to insert the found inode into the
- dentry. The "i_count" field in the inode structure should be
- incremented. If the named inode does not exist a NULL inode
- should be inserted into the dentry (this is called a negative
- dentry). Returning an error code from this routine must only
- be done on a real error, otherwise creating inodes with system
- calls like create(2), mknod(2), mkdir(2) and so on will fail.
- If you wish to overload the dentry methods then you should
- initialise the "d_dop" field in the dentry; this is a pointer
- to a struct "dentry_operations".
- This method is called with the directory inode semaphore held
-
- link: called by the link(2) system call. Only required if you want
- to support hard links. You will probably need to call
- d_instantiate() just as you would in the create() method
-
- unlink: called by the unlink(2) system call. Only required if you
- want to support deleting inodes
-
- symlink: called by the symlink(2) system call. Only required if you
- want to support symlinks. You will probably need to call
- d_instantiate() just as you would in the create() method
-
- mkdir: called by the mkdir(2) system call. Only required if you want
- to support creating subdirectories. You will probably need to
- call d_instantiate() just as you would in the create() method
-
- rmdir: called by the rmdir(2) system call. Only required if you want
- to support deleting subdirectories
-
- mknod: called by the mknod(2) system call to create a device (char,
- block) inode or a named pipe (FIFO) or socket. Only required
- if you want to support creating these types of inodes. You
- will probably need to call d_instantiate() just as you would
- in the create() method
-
- rename: called by the rename(2) system call to rename the object to
- have the parent and name given by the second inode and dentry.
-
- The filesystem must return -EINVAL for any unsupported or
- unknown flags. Currently the following flags are implemented:
- (1) RENAME_NOREPLACE: this flag indicates that if the target
- of the rename exists the rename should fail with -EEXIST
- instead of replacing the target. The VFS already checks for
- existence, so for local filesystems the RENAME_NOREPLACE
- implementation is equivalent to plain rename.
- (2) RENAME_EXCHANGE: exchange source and target. Both must
- exist; this is checked by the VFS. Unlike plain rename,
- source and target may be of different type.
-
- get_link: called by the VFS to follow a symbolic link to the
- inode it points to. Only required if you want to support
- symbolic links. This method returns the symlink body
- to traverse (and possibly resets the current position with
- nd_jump_link()). If the body won't go away until the inode
- is gone, nothing else is needed; if it needs to be otherwise
- pinned, arrange for its release by having get_link(..., ..., done)
- do set_delayed_call(done, destructor, argument).
- In that case destructor(argument) will be called once VFS is
- done with the body you've returned.
- May be called in RCU mode; that is indicated by NULL dentry
- argument. If request can't be handled without leaving RCU mode,
- have it return ERR_PTR(-ECHILD).
-
- readlink: this is now just an override for use by readlink(2) for the
- cases when ->get_link uses nd_jump_link() or object is not in
- fact a symlink. Normally filesystems should only implement
- ->get_link for symlinks and readlink(2) will automatically use
- that.
-
- permission: called by the VFS to check for access rights on a POSIX-like
- filesystem.
-
- May be called in rcu-walk mode (mask & MAY_NOT_BLOCK). If in rcu-walk
- mode, the filesystem must check the permission without blocking or
- storing to the inode.
-
- If a situation is encountered that rcu-walk cannot handle, return
- -ECHILD and it will be called again in ref-walk mode.
-
- setattr: called by the VFS to set attributes for a file. This method
- is called by chmod(2) and related system calls.
-
- getattr: called by the VFS to get attributes of a file. This method
- is called by stat(2) and related system calls.
-
- listxattr: called by the VFS to list all extended attributes for a
- given file. This method is called by the listxattr(2) system call.
-
- update_time: called by the VFS to update a specific time or the i_version of
- an inode. If this is not defined the VFS will update the inode itself
- and call mark_inode_dirty_sync.
-
- atomic_open: called on the last component of an open. Using this optional
- method the filesystem can look up, possibly create and open the file in
- one atomic operation. If it wants to leave actual opening to the
- caller (e.g. if the file turned out to be a symlink, device, or just
- something filesystem won't do atomic open for), it may signal this by
- returning finish_no_open(file, dentry). This method is only called if
- the last component is negative or needs lookup. Cached positive dentries
- are still handled by f_op->open(). If the file was created,
- FMODE_CREATED flag should be set in file->f_mode. In case of O_EXCL
- the method must only succeed if the file didn't exist and hence FMODE_CREATED
- shall always be set on success.
-
- tmpfile: called in the end of O_TMPFILE open(). Optional, equivalent to
- atomically creating, opening and unlinking a file in given directory.
-
-The Address Space Object
-========================
-
-The address space object is used to group and manage pages in the page
-cache. It can be used to keep track of the pages in a file (or
-anything else) and also track the mapping of sections of the file into
-process address spaces.
-
-There are a number of distinct yet related services that an
-address-space can provide. These include communicating memory
-pressure, page lookup by address, and keeping track of pages tagged as
-Dirty or Writeback.
-
-The first can be used independently to the others. The VM can try to
-either write dirty pages in order to clean them, or release clean
-pages in order to reuse them. To do this it can call the ->writepage
-method on dirty pages, and ->releasepage on clean pages with
-PagePrivate set. Clean pages without PagePrivate and with no external
-references will be released without notice being given to the
-address_space.
-
-To achieve this functionality, pages need to be placed on an LRU with
-lru_cache_add and mark_page_active needs to be called whenever the
-page is used.
-
-Pages are normally kept in a radix tree index by ->index. This tree
-maintains information about the PG_Dirty and PG_Writeback status of
-each page, so that pages with either of these flags can be found
-quickly.
-
-The Dirty tag is primarily used by mpage_writepages - the default
-->writepages method. It uses the tag to find dirty pages to call
-->writepage on. If mpage_writepages is not used (i.e. the address
-provides its own ->writepages) , the PAGECACHE_TAG_DIRTY tag is
-almost unused. write_inode_now and sync_inode do use it (through
-__sync_single_inode) to check if ->writepages has been successful in
-writing out the whole address_space.
-
-The Writeback tag is used by filemap*wait* and sync_page* functions,
-via filemap_fdatawait_range, to wait for all writeback to complete.
-
-An address_space handler may attach extra information to a page,
-typically using the 'private' field in the 'struct page'. If such
-information is attached, the PG_Private flag should be set. This will
-cause various VM routines to make extra calls into the address_space
-handler to deal with that data.
-
-An address space acts as an intermediate between storage and
-application. Data is read into the address space a whole page at a
-time, and provided to the application either by copying of the page,
-or by memory-mapping the page.
-Data is written into the address space by the application, and then
-written-back to storage typically in whole pages, however the
-address_space has finer control of write sizes.
-
-The read process essentially only requires 'readpage'. The write
-process is more complicated and uses write_begin/write_end or
-set_page_dirty to write data into the address_space, and writepage
-and writepages to writeback data to storage.
-
-Adding and removing pages to/from an address_space is protected by the
-inode's i_mutex.
-
-When data is written to a page, the PG_Dirty flag should be set. It
-typically remains set until writepage asks for it to be written. This
-should clear PG_Dirty and set PG_Writeback. It can be actually
-written at any point after PG_Dirty is clear. Once it is known to be
-safe, PG_Writeback is cleared.
-
-Writeback makes use of a writeback_control structure to direct the
-operations. This gives the the writepage and writepages operations some
-information about the nature of and reason for the writeback request,
-and the constraints under which it is being done. It is also used to
-return information back to the caller about the result of a writepage or
-writepages request.
-
-Handling errors during writeback
---------------------------------
-Most applications that do buffered I/O will periodically call a file
-synchronization call (fsync, fdatasync, msync or sync_file_range) to
-ensure that data written has made it to the backing store. When there
-is an error during writeback, they expect that error to be reported when
-a file sync request is made. After an error has been reported on one
-request, subsequent requests on the same file descriptor should return
-0, unless further writeback errors have occurred since the previous file
-syncronization.
-
-Ideally, the kernel would report errors only on file descriptions on
-which writes were done that subsequently failed to be written back. The
-generic pagecache infrastructure does not track the file descriptions
-that have dirtied each individual page however, so determining which
-file descriptors should get back an error is not possible.
-
-Instead, the generic writeback error tracking infrastructure in the
-kernel settles for reporting errors to fsync on all file descriptions
-that were open at the time that the error occurred. In a situation with
-multiple writers, all of them will get back an error on a subsequent fsync,
-even if all of the writes done through that particular file descriptor
-succeeded (or even if there were no writes on that file descriptor at all).
-
-Filesystems that wish to use this infrastructure should call
-mapping_set_error to record the error in the address_space when it
-occurs. Then, after writing back data from the pagecache in their
-file->fsync operation, they should call file_check_and_advance_wb_err to
-ensure that the struct file's error cursor has advanced to the correct
-point in the stream of errors emitted by the backing device(s).
-
-struct address_space_operations
--------------------------------
-
-This describes how the VFS can manipulate mapping of a file to page cache in
-your filesystem. The following members are defined:
-
-struct address_space_operations {
- int (*writepage)(struct page *page, struct writeback_control *wbc);
- int (*readpage)(struct file *, struct page *);
- int (*writepages)(struct address_space *, struct writeback_control *);
- int (*set_page_dirty)(struct page *page);
- int (*readpages)(struct file *filp, struct address_space *mapping,
- struct list_head *pages, unsigned nr_pages);
- int (*write_begin)(struct file *, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
- struct page **pagep, void **fsdata);
- int (*write_end)(struct file *, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata);
- sector_t (*bmap)(struct address_space *, sector_t);
- void (*invalidatepage) (struct page *, unsigned int, unsigned int);
- int (*releasepage) (struct page *, int);
- void (*freepage)(struct page *);
- ssize_t (*direct_IO)(struct kiocb *, struct iov_iter *iter);
- /* isolate a page for migration */
- bool (*isolate_page) (struct page *, isolate_mode_t);
- /* migrate the contents of a page to the specified target */
- int (*migratepage) (struct page *, struct page *);
- /* put migration-failed page back to right list */
- void (*putback_page) (struct page *);
- int (*launder_page) (struct page *);
-
- int (*is_partially_uptodate) (struct page *, unsigned long,
- unsigned long);
- void (*is_dirty_writeback) (struct page *, bool *, bool *);
- int (*error_remove_page) (struct mapping *mapping, struct page *page);
- int (*swap_activate)(struct file *);
- int (*swap_deactivate)(struct file *);
-};
-
- writepage: called by the VM to write a dirty page to backing store.
- This may happen for data integrity reasons (i.e. 'sync'), or
- to free up memory (flush). The difference can be seen in
- wbc->sync_mode.
- The PG_Dirty flag has been cleared and PageLocked is true.
- writepage should start writeout, should set PG_Writeback,
- and should make sure the page is unlocked, either synchronously
- or asynchronously when the write operation completes.
-
- If wbc->sync_mode is WB_SYNC_NONE, ->writepage doesn't have to
- try too hard if there are problems, and may choose to write out
- other pages from the mapping if that is easier (e.g. due to
- internal dependencies). If it chooses not to start writeout, it
- should return AOP_WRITEPAGE_ACTIVATE so that the VM will not keep
- calling ->writepage on that page.
-
- See the file "Locking" for more details.
-
- readpage: called by the VM to read a page from backing store.
- The page will be Locked when readpage is called, and should be
- unlocked and marked uptodate once the read completes.
- If ->readpage discovers that it needs to unlock the page for
- some reason, it can do so, and then return AOP_TRUNCATED_PAGE.
- In this case, the page will be relocated, relocked and if
- that all succeeds, ->readpage will be called again.
-
- writepages: called by the VM to write out pages associated with the
- address_space object. If wbc->sync_mode is WBC_SYNC_ALL, then
- the writeback_control will specify a range of pages that must be
- written out. If it is WBC_SYNC_NONE, then a nr_to_write is given
- and that many pages should be written if possible.
- If no ->writepages is given, then mpage_writepages is used
- instead. This will choose pages from the address space that are
- tagged as DIRTY and will pass them to ->writepage.
-
- set_page_dirty: called by the VM to set a page dirty.
- This is particularly needed if an address space attaches
- private data to a page, and that data needs to be updated when
- a page is dirtied. This is called, for example, when a memory
- mapped page gets modified.
- If defined, it should set the PageDirty flag, and the
- PAGECACHE_TAG_DIRTY tag in the radix tree.
-
- readpages: called by the VM to read pages associated with the address_space
- object. This is essentially just a vector version of
- readpage. Instead of just one page, several pages are
- requested.
- readpages is only used for read-ahead, so read errors are
- ignored. If anything goes wrong, feel free to give up.
-
- write_begin:
- Called by the generic buffered write code to ask the filesystem to
- prepare to write len bytes at the given offset in the file. The
- address_space should check that the write will be able to complete,
- by allocating space if necessary and doing any other internal
- housekeeping. If the write will update parts of any basic-blocks on
- storage, then those blocks should be pre-read (if they haven't been
- read already) so that the updated blocks can be written out properly.
-
- The filesystem must return the locked pagecache page for the specified
- offset, in *pagep, for the caller to write into.
-
- It must be able to cope with short writes (where the length passed to
- write_begin is greater than the number of bytes copied into the page).
-
- flags is a field for AOP_FLAG_xxx flags, described in
- include/linux/fs.h.
-
- A void * may be returned in fsdata, which then gets passed into
- write_end.
-
- Returns 0 on success; < 0 on failure (which is the error code), in
- which case write_end is not called.
-
- write_end: After a successful write_begin, and data copy, write_end must
- be called. len is the original len passed to write_begin, and copied
- is the amount that was able to be copied.
-
- The filesystem must take care of unlocking the page and releasing it
- refcount, and updating i_size.
-
- Returns < 0 on failure, otherwise the number of bytes (<= 'copied')
- that were able to be copied into pagecache.
-
- bmap: called by the VFS to map a logical block offset within object to
- physical block number. This method is used by the FIBMAP
- ioctl and for working with swap-files. To be able to swap to
- a file, the file must have a stable mapping to a block
- device. The swap system does not go through the filesystem
- but instead uses bmap to find out where the blocks in the file
- are and uses those addresses directly.
-
- invalidatepage: If a page has PagePrivate set, then invalidatepage
- will be called when part or all of the page is to be removed
- from the address space. This generally corresponds to either a
- truncation, punch hole or a complete invalidation of the address
- space (in the latter case 'offset' will always be 0 and 'length'
- will be PAGE_SIZE). Any private data associated with the page
- should be updated to reflect this truncation. If offset is 0 and
- length is PAGE_SIZE, then the private data should be released,
- because the page must be able to be completely discarded. This may
- be done by calling the ->releasepage function, but in this case the
- release MUST succeed.
-
- releasepage: releasepage is called on PagePrivate pages to indicate
- that the page should be freed if possible. ->releasepage
- should remove any private data from the page and clear the
- PagePrivate flag. If releasepage() fails for some reason, it must
- indicate failure with a 0 return value.
- releasepage() is used in two distinct though related cases. The
- first is when the VM finds a clean page with no active users and
- wants to make it a free page. If ->releasepage succeeds, the
- page will be removed from the address_space and become free.
-
- The second case is when a request has been made to invalidate
- some or all pages in an address_space. This can happen
- through the fadvise(POSIX_FADV_DONTNEED) system call or by the
- filesystem explicitly requesting it as nfs and 9fs do (when
- they believe the cache may be out of date with storage) by
- calling invalidate_inode_pages2().
- If the filesystem makes such a call, and needs to be certain
- that all pages are invalidated, then its releasepage will
- need to ensure this. Possibly it can clear the PageUptodate
- bit if it cannot free private data yet.
-
- freepage: freepage is called once the page is no longer visible in
- the page cache in order to allow the cleanup of any private
- data. Since it may be called by the memory reclaimer, it
- should not assume that the original address_space mapping still
- exists, and it should not block.
-
- direct_IO: called by the generic read/write routines to perform
- direct_IO - that is IO requests which bypass the page cache
- and transfer data directly between the storage and the
- application's address space.
-
- isolate_page: Called by the VM when isolating a movable non-lru page.
- If page is successfully isolated, VM marks the page as PG_isolated
- via __SetPageIsolated.
-
- migrate_page: This is used to compact the physical memory usage.
- If the VM wants to relocate a page (maybe off a memory card
- that is signalling imminent failure) it will pass a new page
- and an old page to this function. migrate_page should
- transfer any private data across and update any references
- that it has to the page.
-
- putback_page: Called by the VM when isolated page's migration fails.
-
- launder_page: Called before freeing a page - it writes back the dirty page. To
- prevent redirtying the page, it is kept locked during the whole
- operation.
-
- is_partially_uptodate: Called by the VM when reading a file through the
- pagecache when the underlying blocksize != pagesize. If the required
- block is up to date then the read can complete without needing the IO
- to bring the whole page up to date.
-
- is_dirty_writeback: Called by the VM when attempting to reclaim a page.
- The VM uses dirty and writeback information to determine if it needs
- to stall to allow flushers a chance to complete some IO. Ordinarily
- it can use PageDirty and PageWriteback but some filesystems have
- more complex state (unstable pages in NFS prevent reclaim) or
- do not set those flags due to locking problems. This callback
- allows a filesystem to indicate to the VM if a page should be
- treated as dirty or writeback for the purposes of stalling.
-
- error_remove_page: normally set to generic_error_remove_page if truncation
- is ok for this address space. Used for memory failure handling.
- Setting this implies you deal with pages going away under you,
- unless you have them locked or reference counts increased.
-
- swap_activate: Called when swapon is used on a file to allocate
- space if necessary and pin the block lookup information in
- memory. A return value of zero indicates success,
- in which case this file can be used to back swapspace.
-
- swap_deactivate: Called during swapoff on files where swap_activate
- was successful.
-
-
-The File Object
-===============
-
-A file object represents a file opened by a process. This is also known
-as an "open file description" in POSIX parlance.
-
-
-struct file_operations
-----------------------
-
-This describes how the VFS can manipulate an open file. As of kernel
-4.18, the following members are defined:
-
-struct file_operations {
- struct module *owner;
- loff_t (*llseek) (struct file *, loff_t, int);
- ssize_t (*read) (struct file *, char __user *, size_t, loff_t *);
- ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
- ssize_t (*read_iter) (struct kiocb *, struct iov_iter *);
- ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
- int (*iterate) (struct file *, struct dir_context *);
- int (*iterate_shared) (struct file *, struct dir_context *);
- __poll_t (*poll) (struct file *, struct poll_table_struct *);
- long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
- long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
- int (*mmap) (struct file *, struct vm_area_struct *);
- int (*open) (struct inode *, struct file *);
- int (*flush) (struct file *, fl_owner_t id);
- int (*release) (struct inode *, struct file *);
- int (*fsync) (struct file *, loff_t, loff_t, int datasync);
- int (*fasync) (int, struct file *, int);
- int (*lock) (struct file *, int, struct file_lock *);
- ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int);
- unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
- int (*check_flags)(int);
- int (*flock) (struct file *, int, struct file_lock *);
- ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int);
- ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int);
- int (*setlease)(struct file *, long, struct file_lock **, void **);
- long (*fallocate)(struct file *file, int mode, loff_t offset,
- loff_t len);
- void (*show_fdinfo)(struct seq_file *m, struct file *f);
-#ifndef CONFIG_MMU
- unsigned (*mmap_capabilities)(struct file *);
-#endif
- ssize_t (*copy_file_range)(struct file *, loff_t, struct file *, loff_t, size_t, unsigned int);
- loff_t (*remap_file_range)(struct file *file_in, loff_t pos_in,
- struct file *file_out, loff_t pos_out,
- loff_t len, unsigned int remap_flags);
- int (*fadvise)(struct file *, loff_t, loff_t, int);
-};
-
-Again, all methods are called without any locks being held, unless
-otherwise noted.
-
- llseek: called when the VFS needs to move the file position index
-
- read: called by read(2) and related system calls
-
- read_iter: possibly asynchronous read with iov_iter as destination
-
- write: called by write(2) and related system calls
-
- write_iter: possibly asynchronous write with iov_iter as source
-
- iterate: called when the VFS needs to read the directory contents
-
- iterate_shared: called when the VFS needs to read the directory contents
- when filesystem supports concurrent dir iterators
-
- poll: called by the VFS when a process wants to check if there is
- activity on this file and (optionally) go to sleep until there
- is activity. Called by the select(2) and poll(2) system calls
-
- unlocked_ioctl: called by the ioctl(2) system call.
-
- compat_ioctl: called by the ioctl(2) system call when 32 bit system calls
- are used on 64 bit kernels.
-
- mmap: called by the mmap(2) system call
-
- open: called by the VFS when an inode should be opened. When the VFS
- opens a file, it creates a new "struct file". It then calls the
- open method for the newly allocated file structure. You might
- think that the open method really belongs in
- "struct inode_operations", and you may be right. I think it's
- done the way it is because it makes filesystems simpler to
- implement. The open() method is a good place to initialize the
- "private_data" member in the file structure if you want to point
- to a device structure
-
- flush: called by the close(2) system call to flush a file
-
- release: called when the last reference to an open file is closed
-
- fsync: called by the fsync(2) system call. Also see the section above
- entitled "Handling errors during writeback".
-
- fasync: called by the fcntl(2) system call when asynchronous
- (non-blocking) mode is enabled for a file
-
- lock: called by the fcntl(2) system call for F_GETLK, F_SETLK, and F_SETLKW
- commands
-
- get_unmapped_area: called by the mmap(2) system call
-
- check_flags: called by the fcntl(2) system call for F_SETFL command
-
- flock: called by the flock(2) system call
-
- splice_write: called by the VFS to splice data from a pipe to a file. This
- method is used by the splice(2) system call
-
- splice_read: called by the VFS to splice data from file to a pipe. This
- method is used by the splice(2) system call
-
- setlease: called by the VFS to set or release a file lock lease. setlease
- implementations should call generic_setlease to record or remove
- the lease in the inode after setting it.
-
- fallocate: called by the VFS to preallocate blocks or punch a hole.
-
- copy_file_range: called by the copy_file_range(2) system call.
-
- remap_file_range: called by the ioctl(2) system call for FICLONERANGE and
- FICLONE and FIDEDUPERANGE commands to remap file ranges. An
- implementation should remap len bytes at pos_in of the source file into
- the dest file at pos_out. Implementations must handle callers passing
- in len == 0; this means "remap to the end of the source file". The
- return value should the number of bytes remapped, or the usual
- negative error code if errors occurred before any bytes were remapped.
- The remap_flags parameter accepts REMAP_FILE_* flags. If
- REMAP_FILE_DEDUP is set then the implementation must only remap if the
- requested file ranges have identical contents. If REMAP_CAN_SHORTEN is
- set, the caller is ok with the implementation shortening the request
- length to satisfy alignment or EOF requirements (or any other reason).
-
- fadvise: possibly called by the fadvise64() system call.
-
-Note that the file operations are implemented by the specific
-filesystem in which the inode resides. When opening a device node
-(character or block special) most filesystems will call special
-support routines in the VFS which will locate the required device
-driver information. These support routines replace the filesystem file
-operations with those for the device driver, and then proceed to call
-the new open() method for the file. This is how opening a device file
-in the filesystem eventually ends up calling the device driver open()
-method.
-
-
-Directory Entry Cache (dcache)
-==============================
-
-
-struct dentry_operations
-------------------------
-
-This describes how a filesystem can overload the standard dentry
-operations. Dentries and the dcache are the domain of the VFS and the
-individual filesystem implementations. Device drivers have no business
-here. These methods may be set to NULL, as they are either optional or
-the VFS uses a default. As of kernel 2.6.22, the following members are
-defined:
-
-struct dentry_operations {
- int (*d_revalidate)(struct dentry *, unsigned int);
- int (*d_weak_revalidate)(struct dentry *, unsigned int);
- int (*d_hash)(const struct dentry *, struct qstr *);
- int (*d_compare)(const struct dentry *,
- unsigned int, const char *, const struct qstr *);
- int (*d_delete)(const struct dentry *);
- int (*d_init)(struct dentry *);
- void (*d_release)(struct dentry *);
- void (*d_iput)(struct dentry *, struct inode *);
- char *(*d_dname)(struct dentry *, char *, int);
- struct vfsmount *(*d_automount)(struct path *);
- int (*d_manage)(const struct path *, bool);
- struct dentry *(*d_real)(struct dentry *, const struct inode *);
-};
-
- d_revalidate: called when the VFS needs to revalidate a dentry. This
- is called whenever a name look-up finds a dentry in the
- dcache. Most local filesystems leave this as NULL, because all their
- dentries in the dcache are valid. Network filesystems are different
- since things can change on the server without the client necessarily
- being aware of it.
-
- This function should return a positive value if the dentry is still
- valid, and zero or a negative error code if it isn't.
-
- d_revalidate may be called in rcu-walk mode (flags & LOOKUP_RCU).
- If in rcu-walk mode, the filesystem must revalidate the dentry without
- blocking or storing to the dentry, d_parent and d_inode should not be
- used without care (because they can change and, in d_inode case, even
- become NULL under us).
-
- If a situation is encountered that rcu-walk cannot handle, return
- -ECHILD and it will be called again in ref-walk mode.
-
- d_weak_revalidate: called when the VFS needs to revalidate a "jumped" dentry.
- This is called when a path-walk ends at dentry that was not acquired by
- doing a lookup in the parent directory. This includes "/", "." and "..",
- as well as procfs-style symlinks and mountpoint traversal.
-
- In this case, we are less concerned with whether the dentry is still
- fully correct, but rather that the inode is still valid. As with
- d_revalidate, most local filesystems will set this to NULL since their
- dcache entries are always valid.
-
- This function has the same return code semantics as d_revalidate.
-
- d_weak_revalidate is only called after leaving rcu-walk mode.
-
- d_hash: called when the VFS adds a dentry to the hash table. The first
- dentry passed to d_hash is the parent directory that the name is
- to be hashed into.
-
- Same locking and synchronisation rules as d_compare regarding
- what is safe to dereference etc.
-
- d_compare: called to compare a dentry name with a given name. The first
- dentry is the parent of the dentry to be compared, the second is
- the child dentry. len and name string are properties of the dentry
- to be compared. qstr is the name to compare it with.
-
- Must be constant and idempotent, and should not take locks if
- possible, and should not or store into the dentry.
- Should not dereference pointers outside the dentry without
- lots of care (eg. d_parent, d_inode, d_name should not be used).
-
- However, our vfsmount is pinned, and RCU held, so the dentries and
- inodes won't disappear, neither will our sb or filesystem module.
- ->d_sb may be used.
-
- It is a tricky calling convention because it needs to be called under
- "rcu-walk", ie. without any locks or references on things.
-
- d_delete: called when the last reference to a dentry is dropped and the
- dcache is deciding whether or not to cache it. Return 1 to delete
- immediately, or 0 to cache the dentry. Default is NULL which means to
- always cache a reachable dentry. d_delete must be constant and
- idempotent.
-
- d_init: called when a dentry is allocated
-
- d_release: called when a dentry is really deallocated
-
- d_iput: called when a dentry loses its inode (just prior to its
- being deallocated). The default when this is NULL is that the
- VFS calls iput(). If you define this method, you must call
- iput() yourself
-
- d_dname: called when the pathname of a dentry should be generated.
- Useful for some pseudo filesystems (sockfs, pipefs, ...) to delay
- pathname generation. (Instead of doing it when dentry is created,
- it's done only when the path is needed.). Real filesystems probably
- dont want to use it, because their dentries are present in global
- dcache hash, so their hash should be an invariant. As no lock is
- held, d_dname() should not try to modify the dentry itself, unless
- appropriate SMP safety is used. CAUTION : d_path() logic is quite
- tricky. The correct way to return for example "Hello" is to put it
- at the end of the buffer, and returns a pointer to the first char.
- dynamic_dname() helper function is provided to take care of this.
-
- Example :
-
- static char *pipefs_dname(struct dentry *dent, char *buffer, int buflen)
- {
- return dynamic_dname(dentry, buffer, buflen, "pipe:[%lu]",
- dentry->d_inode->i_ino);
- }
-
- d_automount: called when an automount dentry is to be traversed (optional).
- This should create a new VFS mount record and return the record to the
- caller. The caller is supplied with a path parameter giving the
- automount directory to describe the automount target and the parent
- VFS mount record to provide inheritable mount parameters. NULL should
- be returned if someone else managed to make the automount first. If
- the vfsmount creation failed, then an error code should be returned.
- If -EISDIR is returned, then the directory will be treated as an
- ordinary directory and returned to pathwalk to continue walking.
-
- If a vfsmount is returned, the caller will attempt to mount it on the
- mountpoint and will remove the vfsmount from its expiration list in
- the case of failure. The vfsmount should be returned with 2 refs on
- it to prevent automatic expiration - the caller will clean up the
- additional ref.
-
- This function is only used if DCACHE_NEED_AUTOMOUNT is set on the
- dentry. This is set by __d_instantiate() if S_AUTOMOUNT is set on the
- inode being added.
-
- d_manage: called to allow the filesystem to manage the transition from a
- dentry (optional). This allows autofs, for example, to hold up clients
- waiting to explore behind a 'mountpoint' while letting the daemon go
- past and construct the subtree there. 0 should be returned to let the
- calling process continue. -EISDIR can be returned to tell pathwalk to
- use this directory as an ordinary directory and to ignore anything
- mounted on it and not to check the automount flag. Any other error
- code will abort pathwalk completely.
-
- If the 'rcu_walk' parameter is true, then the caller is doing a
- pathwalk in RCU-walk mode. Sleeping is not permitted in this mode,
- and the caller can be asked to leave it and call again by returning
- -ECHILD. -EISDIR may also be returned to tell pathwalk to
- ignore d_automount or any mounts.
-
- This function is only used if DCACHE_MANAGE_TRANSIT is set on the
- dentry being transited from.
-
- d_real: overlay/union type filesystems implement this method to return one of
- the underlying dentries hidden by the overlay. It is used in two
- different modes:
-
- Called from file_dentry() it returns the real dentry matching the inode
- argument. The real dentry may be from a lower layer already copied up,
- but still referenced from the file. This mode is selected with a
- non-NULL inode argument.
-
- With NULL inode the topmost real underlying dentry is returned.
-
-Each dentry has a pointer to its parent dentry, as well as a hash list
-of child dentries. Child dentries are basically like files in a
-directory.
-
-
-Directory Entry Cache API
---------------------------
-
-There are a number of functions defined which permit a filesystem to
-manipulate dentries:
-
- dget: open a new handle for an existing dentry (this just increments
- the usage count)
-
- dput: close a handle for a dentry (decrements the usage count). If
- the usage count drops to 0, and the dentry is still in its
- parent's hash, the "d_delete" method is called to check whether
- it should be cached. If it should not be cached, or if the dentry
- is not hashed, it is deleted. Otherwise cached dentries are put
- into an LRU list to be reclaimed on memory shortage.
-
- d_drop: this unhashes a dentry from its parents hash list. A
- subsequent call to dput() will deallocate the dentry if its
- usage count drops to 0
-
- d_delete: delete a dentry. If there are no other open references to
- the dentry then the dentry is turned into a negative dentry
- (the d_iput() method is called). If there are other
- references, then d_drop() is called instead
-
- d_add: add a dentry to its parents hash list and then calls
- d_instantiate()
-
- d_instantiate: add a dentry to the alias hash list for the inode and
- updates the "d_inode" member. The "i_count" member in the
- inode structure should be set/incremented. If the inode
- pointer is NULL, the dentry is called a "negative
- dentry". This function is commonly called when an inode is
- created for an existing negative dentry
-
- d_lookup: look up a dentry given its parent and path name component
- It looks up the child of that given name from the dcache
- hash table. If it is found, the reference count is incremented
- and the dentry is returned. The caller must use dput()
- to free the dentry when it finishes using it.
-
-Mount Options
-=============
-
-Parsing options
----------------
-
-On mount and remount the filesystem is passed a string containing a
-comma separated list of mount options. The options can have either of
-these forms:
-
- option
- option=value
-
-The <linux/parser.h> header defines an API that helps parse these
-options. There are plenty of examples on how to use it in existing
-filesystems.
-
-Showing options
----------------
-
-If a filesystem accepts mount options, it must define show_options()
-to show all the currently active options. The rules are:
-
- - options MUST be shown which are not default or their values differ
- from the default
-
- - options MAY be shown which are enabled by default or have their
- default value
-
-Options used only internally between a mount helper and the kernel
-(such as file descriptors), or which only have an effect during the
-mounting (such as ones controlling the creation of a journal) are exempt
-from the above rules.
-
-The underlying reason for the above rules is to make sure, that a
-mount can be accurately replicated (e.g. umounting and mounting again)
-based on the information found in /proc/mounts.
-
-Resources
-=========
-
-(Note some of these resources are not up-to-date with the latest kernel
- version.)
-
-Creating Linux virtual filesystems. 2002
- <http://lwn.net/Articles/13325/>
-
-The Linux Virtual File-system Layer by Neil Brown. 1999
- <http://www.cse.unsw.edu.au/~neilb/oss/linux-commentary/vfs.html>
-
-A tour of the Linux VFS by Michael K. Johnson. 1996
- <http://www.tldp.org/LDP/khg/HyperNews/get/fs/vfstour.html>
-
-A small trail through the Linux kernel by Andries Brouwer. 2001
- <http://www.win.tue.nl/~aeb/linux/vfs/trail.html>
diff --git a/Documentation/filesystems/xfs-delayed-logging-design.txt b/Documentation/filesystems/xfs-delayed-logging-design.txt
index 2ce36439c09f..9a6dd289b17b 100644
--- a/Documentation/filesystems/xfs-delayed-logging-design.txt
+++ b/Documentation/filesystems/xfs-delayed-logging-design.txt
@@ -34,7 +34,7 @@ transaction:
D A+B+C+D X+n+m+o
<object written to disk>
E E Y (> X+n+m+o)
- F E+F Yٍ+p
+ F E+F Y+p
In other words, each time an object is relogged, the new transaction contains
the aggregation of all the previous changes currently held only in the log.
diff --git a/Documentation/filesystems/xfs-self-describing-metadata.txt b/Documentation/filesystems/xfs-self-describing-metadata.txt
index 68604e67a495..8db0121d0980 100644
--- a/Documentation/filesystems/xfs-self-describing-metadata.txt
+++ b/Documentation/filesystems/xfs-self-describing-metadata.txt
@@ -222,7 +222,7 @@ static void
xfs_foo_read_verify(
struct xfs_buf *bp)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
if ((xfs_sb_version_hascrc(&mp->m_sb) &&
!xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length),
@@ -245,7 +245,7 @@ static bool
xfs_foo_verify(
struct xfs_buf *bp)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
struct xfs_ondisk_hdr *hdr = bp->b_addr;
if (hdr->magic != cpu_to_be32(XFS_FOO_MAGIC))
@@ -272,7 +272,7 @@ static bool
xfs_foo_verify(
struct xfs_buf *bp)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
struct xfs_ondisk_hdr *hdr = bp->b_addr;
if (hdr->magic == cpu_to_be32(XFS_FOO_CRC_MAGIC)) {
@@ -297,7 +297,7 @@ static void
xfs_foo_write_verify(
struct xfs_buf *bp)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
struct xfs_buf_log_item *bip = bp->b_fspriv;
if (!xfs_foo_verify(bp)) {
diff --git a/Documentation/filesystems/xfs.txt b/Documentation/filesystems/xfs.txt
deleted file mode 100644
index 9ccfd1bc6201..000000000000
--- a/Documentation/filesystems/xfs.txt
+++ /dev/null
@@ -1,469 +0,0 @@
-
-The SGI XFS Filesystem
-======================
-
-XFS is a high performance journaling filesystem which originated
-on the SGI IRIX platform. It is completely multi-threaded, can
-support large files and large filesystems, extended attributes,
-variable block sizes, is extent based, and makes extensive use of
-Btrees (directories, extents, free space) to aid both performance
-and scalability.
-
-Refer to the documentation at https://xfs.wiki.kernel.org/
-for further details. This implementation is on-disk compatible
-with the IRIX version of XFS.
-
-
-Mount Options
-=============
-
-When mounting an XFS filesystem, the following options are accepted.
-For boolean mount options, the names with the (*) suffix is the
-default behaviour.
-
- allocsize=size
- Sets the buffered I/O end-of-file preallocation size when
- doing delayed allocation writeout (default size is 64KiB).
- Valid values for this option are page size (typically 4KiB)
- through to 1GiB, inclusive, in power-of-2 increments.
-
- The default behaviour is for dynamic end-of-file
- preallocation size, which uses a set of heuristics to
- optimise the preallocation size based on the current
- allocation patterns within the file and the access patterns
- to the file. Specifying a fixed allocsize value turns off
- the dynamic behaviour.
-
- attr2
- noattr2
- The options enable/disable an "opportunistic" improvement to
- be made in the way inline extended attributes are stored
- on-disk. When the new form is used for the first time when
- attr2 is selected (either when setting or removing extended
- attributes) the on-disk superblock feature bit field will be
- updated to reflect this format being in use.
-
- The default behaviour is determined by the on-disk feature
- bit indicating that attr2 behaviour is active. If either
- mount option it set, then that becomes the new default used
- by the filesystem.
-
- CRC enabled filesystems always use the attr2 format, and so
- will reject the noattr2 mount option if it is set.
-
- discard
- nodiscard (*)
- Enable/disable the issuing of commands to let the block
- device reclaim space freed by the filesystem. This is
- useful for SSD devices, thinly provisioned LUNs and virtual
- machine images, but may have a performance impact.
-
- Note: It is currently recommended that you use the fstrim
- application to discard unused blocks rather than the discard
- mount option because the performance impact of this option
- is quite severe.
-
- grpid/bsdgroups
- nogrpid/sysvgroups (*)
- These options define what group ID a newly created file
- gets. When grpid is set, it takes the group ID of the
- directory in which it is created; otherwise it takes the
- fsgid of the current process, unless the directory has the
- setgid bit set, in which case it takes the gid from the
- parent directory, and also gets the setgid bit set if it is
- a directory itself.
-
- filestreams
- Make the data allocator use the filestreams allocation mode
- across the entire filesystem rather than just on directories
- configured to use it.
-
- ikeep
- noikeep (*)
- When ikeep is specified, XFS does not delete empty inode
- clusters and keeps them around on disk. When noikeep is
- specified, empty inode clusters are returned to the free
- space pool.
-
- inode32
- inode64 (*)
- When inode32 is specified, it indicates that XFS limits
- inode creation to locations which will not result in inode
- numbers with more than 32 bits of significance.
-
- When inode64 is specified, it indicates that XFS is allowed
- to create inodes at any location in the filesystem,
- including those which will result in inode numbers occupying
- more than 32 bits of significance.
-
- inode32 is provided for backwards compatibility with older
- systems and applications, since 64 bits inode numbers might
- cause problems for some applications that cannot handle
- large inode numbers. If applications are in use which do
- not handle inode numbers bigger than 32 bits, the inode32
- option should be specified.
-
-
- largeio
- nolargeio (*)
- If "nolargeio" is specified, the optimal I/O reported in
- st_blksize by stat(2) will be as small as possible to allow
- user applications to avoid inefficient read/modify/write
- I/O. This is typically the page size of the machine, as
- this is the granularity of the page cache.
-
- If "largeio" specified, a filesystem that was created with a
- "swidth" specified will return the "swidth" value (in bytes)
- in st_blksize. If the filesystem does not have a "swidth"
- specified but does specify an "allocsize" then "allocsize"
- (in bytes) will be returned instead. Otherwise the behaviour
- is the same as if "nolargeio" was specified.
-
- logbufs=value
- Set the number of in-memory log buffers. Valid numbers
- range from 2-8 inclusive.
-
- The default value is 8 buffers.
-
- If the memory cost of 8 log buffers is too high on small
- systems, then it may be reduced at some cost to performance
- on metadata intensive workloads. The logbsize option below
- controls the size of each buffer and so is also relevant to
- this case.
-
- logbsize=value
- Set the size of each in-memory log buffer. The size may be
- specified in bytes, or in kilobytes with a "k" suffix.
- Valid sizes for version 1 and version 2 logs are 16384 (16k)
- and 32768 (32k). Valid sizes for version 2 logs also
- include 65536 (64k), 131072 (128k) and 262144 (256k). The
- logbsize must be an integer multiple of the log
- stripe unit configured at mkfs time.
-
- The default value for for version 1 logs is 32768, while the
- default value for version 2 logs is MAX(32768, log_sunit).
-
- logdev=device and rtdev=device
- Use an external log (metadata journal) and/or real-time device.
- An XFS filesystem has up to three parts: a data section, a log
- section, and a real-time section. The real-time section is
- optional, and the log section can be separate from the data
- section or contained within it.
-
- noalign
- Data allocations will not be aligned at stripe unit
- boundaries. This is only relevant to filesystems created
- with non-zero data alignment parameters (sunit, swidth) by
- mkfs.
-
- norecovery
- The filesystem will be mounted without running log recovery.
- If the filesystem was not cleanly unmounted, it is likely to
- be inconsistent when mounted in "norecovery" mode.
- Some files or directories may not be accessible because of this.
- Filesystems mounted "norecovery" must be mounted read-only or
- the mount will fail.
-
- nouuid
- Don't check for double mounted file systems using the file
- system uuid. This is useful to mount LVM snapshot volumes,
- and often used in combination with "norecovery" for mounting
- read-only snapshots.
-
- noquota
- Forcibly turns off all quota accounting and enforcement
- within the filesystem.
-
- uquota/usrquota/uqnoenforce/quota
- User disk quota accounting enabled, and limits (optionally)
- enforced. Refer to xfs_quota(8) for further details.
-
- gquota/grpquota/gqnoenforce
- Group disk quota accounting enabled and limits (optionally)
- enforced. Refer to xfs_quota(8) for further details.
-
- pquota/prjquota/pqnoenforce
- Project disk quota accounting enabled and limits (optionally)
- enforced. Refer to xfs_quota(8) for further details.
-
- sunit=value and swidth=value
- Used to specify the stripe unit and width for a RAID device
- or a stripe volume. "value" must be specified in 512-byte
- block units. These options are only relevant to filesystems
- that were created with non-zero data alignment parameters.
-
- The sunit and swidth parameters specified must be compatible
- with the existing filesystem alignment characteristics. In
- general, that means the only valid changes to sunit are
- increasing it by a power-of-2 multiple. Valid swidth values
- are any integer multiple of a valid sunit value.
-
- Typically the only time these mount options are necessary if
- after an underlying RAID device has had it's geometry
- modified, such as adding a new disk to a RAID5 lun and
- reshaping it.
-
- swalloc
- Data allocations will be rounded up to stripe width boundaries
- when the current end of file is being extended and the file
- size is larger than the stripe width size.
-
- wsync
- When specified, all filesystem namespace operations are
- executed synchronously. This ensures that when the namespace
- operation (create, unlink, etc) completes, the change to the
- namespace is on stable storage. This is useful in HA setups
- where failover must not result in clients seeing
- inconsistent namespace presentation during or after a
- failover event.
-
-
-Deprecated Mount Options
-========================
-
- Name Removal Schedule
- ---- ----------------
-
-
-Removed Mount Options
-=====================
-
- Name Removed
- ---- -------
- delaylog/nodelaylog v4.0
- ihashsize v4.0
- irixsgid v4.0
- osyncisdsync/osyncisosync v4.0
- barrier v4.19
- nobarrier v4.19
-
-
-sysctls
-=======
-
-The following sysctls are available for the XFS filesystem:
-
- fs.xfs.stats_clear (Min: 0 Default: 0 Max: 1)
- Setting this to "1" clears accumulated XFS statistics
- in /proc/fs/xfs/stat. It then immediately resets to "0".
-
- fs.xfs.xfssyncd_centisecs (Min: 100 Default: 3000 Max: 720000)
- The interval at which the filesystem flushes metadata
- out to disk and runs internal cache cleanup routines.
-
- fs.xfs.filestream_centisecs (Min: 1 Default: 3000 Max: 360000)
- The interval at which the filesystem ages filestreams cache
- references and returns timed-out AGs back to the free stream
- pool.
-
- fs.xfs.speculative_prealloc_lifetime
- (Units: seconds Min: 1 Default: 300 Max: 86400)
- The interval at which the background scanning for inodes
- with unused speculative preallocation runs. The scan
- removes unused preallocation from clean inodes and releases
- the unused space back to the free pool.
-
- fs.xfs.error_level (Min: 0 Default: 3 Max: 11)
- A volume knob for error reporting when internal errors occur.
- This will generate detailed messages & backtraces for filesystem
- shutdowns, for example. Current threshold values are:
-
- XFS_ERRLEVEL_OFF: 0
- XFS_ERRLEVEL_LOW: 1
- XFS_ERRLEVEL_HIGH: 5
-
- fs.xfs.panic_mask (Min: 0 Default: 0 Max: 255)
- Causes certain error conditions to call BUG(). Value is a bitmask;
- OR together the tags which represent errors which should cause panics:
-
- XFS_NO_PTAG 0
- XFS_PTAG_IFLUSH 0x00000001
- XFS_PTAG_LOGRES 0x00000002
- XFS_PTAG_AILDELETE 0x00000004
- XFS_PTAG_ERROR_REPORT 0x00000008
- XFS_PTAG_SHUTDOWN_CORRUPT 0x00000010
- XFS_PTAG_SHUTDOWN_IOERROR 0x00000020
- XFS_PTAG_SHUTDOWN_LOGERROR 0x00000040
- XFS_PTAG_FSBLOCK_ZERO 0x00000080
-
- This option is intended for debugging only.
-
- fs.xfs.irix_symlink_mode (Min: 0 Default: 0 Max: 1)
- Controls whether symlinks are created with mode 0777 (default)
- or whether their mode is affected by the umask (irix mode).
-
- fs.xfs.irix_sgid_inherit (Min: 0 Default: 0 Max: 1)
- Controls files created in SGID directories.
- If the group ID of the new file does not match the effective group
- ID or one of the supplementary group IDs of the parent dir, the
- ISGID bit is cleared if the irix_sgid_inherit compatibility sysctl
- is set.
-
- fs.xfs.inherit_sync (Min: 0 Default: 1 Max: 1)
- Setting this to "1" will cause the "sync" flag set
- by the xfs_io(8) chattr command on a directory to be
- inherited by files in that directory.
-
- fs.xfs.inherit_nodump (Min: 0 Default: 1 Max: 1)
- Setting this to "1" will cause the "nodump" flag set
- by the xfs_io(8) chattr command on a directory to be
- inherited by files in that directory.
-
- fs.xfs.inherit_noatime (Min: 0 Default: 1 Max: 1)
- Setting this to "1" will cause the "noatime" flag set
- by the xfs_io(8) chattr command on a directory to be
- inherited by files in that directory.
-
- fs.xfs.inherit_nosymlinks (Min: 0 Default: 1 Max: 1)
- Setting this to "1" will cause the "nosymlinks" flag set
- by the xfs_io(8) chattr command on a directory to be
- inherited by files in that directory.
-
- fs.xfs.inherit_nodefrag (Min: 0 Default: 1 Max: 1)
- Setting this to "1" will cause the "nodefrag" flag set
- by the xfs_io(8) chattr command on a directory to be
- inherited by files in that directory.
-
- fs.xfs.rotorstep (Min: 1 Default: 1 Max: 256)
- In "inode32" allocation mode, this option determines how many
- files the allocator attempts to allocate in the same allocation
- group before moving to the next allocation group. The intent
- is to control the rate at which the allocator moves between
- allocation groups when allocating extents for new files.
-
-Deprecated Sysctls
-==================
-
-None at present.
-
-
-Removed Sysctls
-===============
-
- Name Removed
- ---- -------
- fs.xfs.xfsbufd_centisec v4.0
- fs.xfs.age_buffer_centisecs v4.0
-
-
-Error handling
-==============
-
-XFS can act differently according to the type of error found during its
-operation. The implementation introduces the following concepts to the error
-handler:
-
- -failure speed:
- Defines how fast XFS should propagate an error upwards when a specific
- error is found during the filesystem operation. It can propagate
- immediately, after a defined number of retries, after a set time period,
- or simply retry forever.
-
- -error classes:
- Specifies the subsystem the error configuration will apply to, such as
- metadata IO or memory allocation. Different subsystems will have
- different error handlers for which behaviour can be configured.
-
- -error handlers:
- Defines the behavior for a specific error.
-
-The filesystem behavior during an error can be set via sysfs files. Each
-error handler works independently - the first condition met by an error handler
-for a specific class will cause the error to be propagated rather than reset and
-retried.
-
-The action taken by the filesystem when the error is propagated is context
-dependent - it may cause a shut down in the case of an unrecoverable error,
-it may be reported back to userspace, or it may even be ignored because
-there's nothing useful we can with the error or anyone we can report it to (e.g.
-during unmount).
-
-The configuration files are organized into the following hierarchy for each
-mounted filesystem:
-
- /sys/fs/xfs/<dev>/error/<class>/<error>/
-
-Where:
- <dev>
- The short device name of the mounted filesystem. This is the same device
- name that shows up in XFS kernel error messages as "XFS(<dev>): ..."
-
- <class>
- The subsystem the error configuration belongs to. As of 4.9, the defined
- classes are:
-
- - "metadata": applies metadata buffer write IO
-
- <error>
- The individual error handler configurations.
-
-
-Each filesystem has "global" error configuration options defined in their top
-level directory:
-
- /sys/fs/xfs/<dev>/error/
-
- fail_at_unmount (Min: 0 Default: 1 Max: 1)
- Defines the filesystem error behavior at unmount time.
-
- If set to a value of 1, XFS will override all other error configurations
- during unmount and replace them with "immediate fail" characteristics.
- i.e. no retries, no retry timeout. This will always allow unmount to
- succeed when there are persistent errors present.
-
- If set to 0, the configured retry behaviour will continue until all
- retries and/or timeouts have been exhausted. This will delay unmount
- completion when there are persistent errors, and it may prevent the
- filesystem from ever unmounting fully in the case of "retry forever"
- handler configurations.
-
- Note: there is no guarantee that fail_at_unmount can be set while an
- unmount is in progress. It is possible that the sysfs entries are
- removed by the unmounting filesystem before a "retry forever" error
- handler configuration causes unmount to hang, and hence the filesystem
- must be configured appropriately before unmount begins to prevent
- unmount hangs.
-
-Each filesystem has specific error class handlers that define the error
-propagation behaviour for specific errors. There is also a "default" error
-handler defined, which defines the behaviour for all errors that don't have
-specific handlers defined. Where multiple retry constraints are configuredi for
-a single error, the first retry configuration that expires will cause the error
-to be propagated. The handler configurations are found in the directory:
-
- /sys/fs/xfs/<dev>/error/<class>/<error>/
-
- max_retries (Min: -1 Default: Varies Max: INTMAX)
- Defines the allowed number of retries of a specific error before
- the filesystem will propagate the error. The retry count for a given
- error context (e.g. a specific metadata buffer) is reset every time
- there is a successful completion of the operation.
-
- Setting the value to "-1" will cause XFS to retry forever for this
- specific error.
-
- Setting the value to "0" will cause XFS to fail immediately when the
- specific error is reported.
-
- Setting the value to "N" (where 0 < N < Max) will make XFS retry the
- operation "N" times before propagating the error.
-
- retry_timeout_seconds (Min: -1 Default: Varies Max: 1 day)
- Define the amount of time (in seconds) that the filesystem is
- allowed to retry its operations when the specific error is
- found.
-
- Setting the value to "-1" will allow XFS to retry forever for this
- specific error.
-
- Setting the value to "0" will cause XFS to fail immediately when the
- specific error is reported.
-
- Setting the value to "N" (where 0 < N < Max) will allow XFS to retry the
- operation for up to "N" seconds before propagating the error.
-
-Note: The default behaviour for a specific error handler is dependent on both
-the class and error context. For example, the default values for
-"metadata/ENODEV" are "0" rather than "-1" so that this error handler defaults
-to "fail immediately" behaviour. This is done because ENODEV is a fatal,
-unrecoverable error no matter how many times the metadata IO is retried.