diff options
Diffstat (limited to 'Documentation/filesystems')
-rw-r--r-- | Documentation/filesystems/Locking | 36 | ||||
-rw-r--r-- | Documentation/filesystems/cifs/README | 2 | ||||
-rw-r--r-- | Documentation/filesystems/dax.txt | 53 | ||||
-rw-r--r-- | Documentation/filesystems/devpts.txt | 145 | ||||
-rw-r--r-- | Documentation/filesystems/directory-locking | 32 | ||||
-rw-r--r-- | Documentation/filesystems/f2fs.txt | 8 | ||||
-rw-r--r-- | Documentation/filesystems/nilfs2.txt | 8 | ||||
-rw-r--r-- | Documentation/filesystems/ocfs2-online-filecheck.txt | 10 | ||||
-rw-r--r-- | Documentation/filesystems/orangefs.txt | 50 | ||||
-rw-r--r-- | Documentation/filesystems/overlayfs.txt | 17 | ||||
-rw-r--r-- | Documentation/filesystems/pohmelfs/design_notes.txt | 2 | ||||
-rw-r--r-- | Documentation/filesystems/porting | 14 | ||||
-rw-r--r-- | Documentation/filesystems/proc.txt | 14 | ||||
-rw-r--r-- | Documentation/filesystems/qnx6.txt | 2 | ||||
-rw-r--r-- | Documentation/filesystems/tmpfs.txt | 2 | ||||
-rw-r--r-- | Documentation/filesystems/vfs.txt | 64 | ||||
-rw-r--r-- | Documentation/filesystems/xfs.txt | 123 |
17 files changed, 363 insertions, 219 deletions
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 75eea7ce3d7c..d30fb2cb5066 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -12,14 +12,17 @@ prototypes: int (*d_revalidate)(struct dentry *, unsigned int); int (*d_weak_revalidate)(struct dentry *, unsigned int); int (*d_hash)(const struct dentry *, struct qstr *); - int (*d_compare)(const struct dentry *, const struct dentry *, + int (*d_compare)(const struct dentry *, unsigned int, const char *, const struct qstr *); int (*d_delete)(struct dentry *); + int (*d_init)(struct dentry *); void (*d_release)(struct dentry *); void (*d_iput)(struct dentry *, struct inode *); char *(*d_dname)((struct dentry *dentry, char *buffer, int buflen); struct vfsmount *(*d_automount)(struct path *path); int (*d_manage)(struct dentry *, bool); + struct dentry *(*d_real)(struct dentry *, const struct inode *, + unsigned int); locking rules: rename_lock ->d_lock may block rcu-walk @@ -28,12 +31,14 @@ d_weak_revalidate:no no yes no d_hash no no no maybe d_compare: yes no no maybe d_delete: no yes no no +d_init: no no yes no d_release: no no yes no d_prune: no yes no no d_iput: no no yes no d_dname: no no no no d_automount: no no yes no d_manage: no no yes (ref-walk) maybe +d_real no no yes no --------------------------- inode_operations --------------------------- prototypes: @@ -66,7 +71,6 @@ prototypes: struct file *, unsigned open_flag, umode_t create_mode, int *opened); int (*tmpfile) (struct inode *, struct dentry *, umode_t); - int (*dentry_open)(struct dentry *, struct file *, const struct cred *); locking rules: all may block @@ -95,7 +99,6 @@ fiemap: no update_time: no atomic_open: yes tmpfile: no -dentry_open: no Additionally, ->rmdir(), ->unlink() and ->rename() have ->i_mutex on victim. @@ -179,7 +182,6 @@ unlocks and drops the reference. prototypes: int (*writepage)(struct page *page, struct writeback_control *wbc); int (*readpage)(struct file *, struct page *); - int (*sync_page)(struct page *); int (*writepages)(struct address_space *, struct writeback_control *); int (*set_page_dirty)(struct page *page); int (*readpages)(struct file *filp, struct address_space *mapping, @@ -195,7 +197,9 @@ prototypes: int (*releasepage) (struct page *, int); void (*freepage)(struct page *); int (*direct_IO)(struct kiocb *, struct iov_iter *iter); + bool (*isolate_page) (struct page *, isolate_mode_t); int (*migratepage)(struct address_space *, struct page *, struct page *); + void (*putback_page) (struct page *); int (*launder_page)(struct page *); int (*is_partially_uptodate)(struct page *, unsigned long, unsigned long); int (*error_remove_page)(struct address_space *, struct page *); @@ -208,7 +212,6 @@ locking rules: PageLocked(page) i_mutex writepage: yes, unlocks (see below) readpage: yes, unlocks -sync_page: maybe writepages: set_page_dirty no readpages: @@ -219,15 +222,17 @@ invalidatepage: yes releasepage: yes freepage: yes direct_IO: +isolate_page: yes migratepage: yes (both) +putback_page: yes launder_page: yes is_partially_uptodate: yes error_remove_page: yes swap_activate: no swap_deactivate: no - ->write_begin(), ->write_end(), ->sync_page() and ->readpage() -may be called from the request handler (/dev/loop). + ->write_begin(), ->write_end() and ->readpage() may be called from +the request handler (/dev/loop). ->readpage() unlocks the page, either synchronously or via I/O completion. @@ -283,11 +288,6 @@ will leave the page itself marked clean but it will be tagged as dirty in the radix tree. This incoherency can lead to all sorts of hard-to-debug problems in the filesystem like having dirty inodes at umount and losing written data. - ->sync_page() locking rules are not well-defined - usually it is called -with lock on page, but that is not guaranteed. Considering the currently -existing instances of this method ->sync_page() itself doesn't look -well-defined... - ->writepages() is used for periodic writeback and for syscall-initiated sync operations. The address_space should start I/O against at least *nr_to_write pages. *nr_to_write must be decremented for each page which is @@ -395,7 +395,7 @@ prototypes: int (*release) (struct gendisk *, fmode_t); int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); - int (*direct_access) (struct block_device *, sector_t, void __pmem **, + int (*direct_access) (struct block_device *, sector_t, void **, unsigned long *); int (*media_changed) (struct gendisk *); void (*unlock_native_capacity) (struct gendisk *); @@ -544,13 +544,13 @@ subsequent truncate), and then return with VM_FAULT_LOCKED, and the page locked. The VM will unlock the page. ->map_pages() is called when VM asks to map easy accessible pages. -Filesystem should find and map pages associated with offsets from "pgoff" -till "max_pgoff". ->map_pages() is called with page table locked and must +Filesystem should find and map pages associated with offsets from "start_pgoff" +till "end_pgoff". ->map_pages() is called with page table locked and must not block. If it's not possible to reach a page without blocking, filesystem should skip it. Filesystem should use do_set_pte() to setup -page table entry. Pointer to entry associated with offset "pgoff" is -passed in "pte" field in vm_fault structure. Pointers to entries for other -offsets should be calculated relative to "pte". +page table entry. Pointer to entry associated with the page is passed in +"pte" field in fault_env structure. Pointers to entries for other offsets +should be calculated relative to "pte". ->page_mkwrite() is called when a previously read-only pte is about to become writeable. The filesystem again must ensure that there are diff --git a/Documentation/filesystems/cifs/README b/Documentation/filesystems/cifs/README index 2d5622f60e11..a54788405429 100644 --- a/Documentation/filesystems/cifs/README +++ b/Documentation/filesystems/cifs/README @@ -272,7 +272,7 @@ A partial list of the supported mount options follows: same domain (e.g. running winbind or nss_ldap) and the server supports the Unix Extensions then the uid and gid can be retrieved from the server (and uid - and gid would not have to be specifed on the mount. + and gid would not have to be specified on the mount. For servers which do not support the CIFS Unix extensions, the default uid (and gid) returned on lookup of existing files will be the uid (gid) of the person diff --git a/Documentation/filesystems/dax.txt b/Documentation/filesystems/dax.txt index 7bde64014a89..23d18b8a49d5 100644 --- a/Documentation/filesystems/dax.txt +++ b/Documentation/filesystems/dax.txt @@ -49,6 +49,7 @@ These block devices may be used for inspiration: - axonram: Axon DDR2 device driver - brd: RAM backed block device driver - dcssblk: s390 dcss block device driver +- pmem: NVDIMM persistent memory driver Implementation Tips for Filesystem Writers @@ -75,8 +76,41 @@ calls to get_block() (for example by a page-fault racing with a read() or a write()) work correctly. These filesystems may be used for inspiration: -- ext2: the second extended filesystem, see Documentation/filesystems/ext2.txt -- ext4: the fourth extended filesystem, see Documentation/filesystems/ext4.txt +- ext2: see Documentation/filesystems/ext2.txt +- ext4: see Documentation/filesystems/ext4.txt +- xfs: see Documentation/filesystems/xfs.txt + + +Handling Media Errors +--------------------- + +The libnvdimm subsystem stores a record of known media error locations for +each pmem block device (in gendisk->badblocks). If we fault at such location, +or one with a latent error not yet discovered, the application can expect +to receive a SIGBUS. Libnvdimm also allows clearing of these errors by simply +writing the affected sectors (through the pmem driver, and if the underlying +NVDIMM supports the clear_poison DSM defined by ACPI). + +Since DAX IO normally doesn't go through the driver/bio path, applications or +sysadmins have an option to restore the lost data from a prior backup/inbuilt +redundancy in the following ways: + +1. Delete the affected file, and restore from a backup (sysadmin route): + This will free the file system blocks that were being used by the file, + and the next time they're allocated, they will be zeroed first, which + happens through the driver, and will clear bad sectors. + +2. Truncate or hole-punch the part of the file that has a bad-block (at least + an entire aligned sector has to be hole-punched, but not necessarily an + entire filesystem block). + +These are the two basic paths that allow DAX filesystems to continue operating +in the presence of media errors. More robust error recovery mechanisms can be +built on top of this in the future, for example, involving redundancy/mirroring +provided at the block layer through DM, or additionally, at the filesystem +level. These would have to rely on the above two tenets, that error clearing +can happen either by sending an IO through the driver, or zeroing (also through +the driver). Shortcomings @@ -89,9 +123,12 @@ The DAX code does not work correctly on architectures which have virtually mapped caches such as ARM, MIPS and SPARC. Calling get_user_pages() on a range of user memory that has been mmaped -from a DAX file will fail as there are no 'struct page' to describe -those pages. This problem is being worked on. That means that O_DIRECT -reads/writes to those memory ranges from a non-DAX file will fail (note -that O_DIRECT reads/writes _of a DAX file_ do work, it is the memory -that is being accessed that is key here). Other things that will not -work include RDMA, sendfile() and splice(). +from a DAX file will fail when there are no 'struct page' to describe +those pages. This problem has been addressed in some device drivers +by adding optional struct page support for pages under the control of +the driver (see CONFIG_NVDIMM_PFN in drivers/nvdimm for an example of +how to do this). In the non struct page cases O_DIRECT reads/writes to +those memory ranges from a non-DAX file will fail (note that O_DIRECT +reads/writes _of a DAX file_ do work, it is the memory that is being +accessed that is key here). Other things that will not work in the +non struct page case include RDMA, sendfile() and splice(). diff --git a/Documentation/filesystems/devpts.txt b/Documentation/filesystems/devpts.txt index 30d2fcb32f72..9f94fe276dea 100644 --- a/Documentation/filesystems/devpts.txt +++ b/Documentation/filesystems/devpts.txt @@ -1,141 +1,26 @@ +Each mount of the devpts filesystem is now distinct such that ptys +and their indicies allocated in one mount are independent from ptys +and their indicies in all other mounts. -To support containers, we now allow multiple instances of devpts filesystem, -such that indices of ptys allocated in one instance are independent of indices -allocated in other instances of devpts. +All mounts of the devpts filesystem now create a /dev/pts/ptmx node +with permissions 0000. -To preserve backward compatibility, this support for multiple instances is -enabled only if: +To retain backwards compatibility the a ptmx device node (aka any node +created with "mknod name c 5 2") when opened will look for an instance +of devpts under the name "pts" in the same directory as the ptmx device +node. - - CONFIG_DEVPTS_MULTIPLE_INSTANCES=y, and - - '-o newinstance' mount option is specified while mounting devpts - -IOW, devpts now supports both single-instance and multi-instance semantics. - -If CONFIG_DEVPTS_MULTIPLE_INSTANCES=n, there is no change in behavior and -this referred to as the "legacy" mode. In this mode, the new mount options -(-o newinstance and -o ptmxmode) will be ignored with a 'bogus option' message -on console. - -If CONFIG_DEVPTS_MULTIPLE_INSTANCES=y and devpts is mounted without the -'newinstance' option (as in current start-up scripts) the new mount binds -to the initial kernel mount of devpts. This mode is referred to as the -'single-instance' mode and the current, single-instance semantics are -preserved, i.e PTYs are common across the system. - -The only difference between this single-instance mode and the legacy mode -is the presence of new, '/dev/pts/ptmx' node with permissions 0000, which -can safely be ignored. - -If CONFIG_DEVPTS_MULTIPLE_INSTANCES=y and 'newinstance' option is specified, -the mount is considered to be in the multi-instance mode and a new instance -of the devpts fs is created. Any ptys created in this instance are independent -of ptys in other instances of devpts. Like in the single-instance mode, the -/dev/pts/ptmx node is present. To effectively use the multi-instance mode, -open of /dev/ptmx must be a redirected to '/dev/pts/ptmx' using a symlink or -bind-mount. - -Eg: A container startup script could do the following: - - $ chmod 0666 /dev/pts/ptmx - $ rm /dev/ptmx - $ ln -s pts/ptmx /dev/ptmx - $ ns_exec -cm /bin/bash - - # We are now in new container - - $ umount /dev/pts - $ mount -t devpts -o newinstance lxcpts /dev/pts - $ sshd -p 1234 - -where 'ns_exec -cm /bin/bash' calls clone() with CLONE_NEWNS flag and execs -/bin/bash in the child process. A pty created by the sshd is not visible in -the original mount of /dev/pts. +As an option instead of placing a /dev/ptmx device node at /dev/ptmx +it is possible to place a symlink to /dev/pts/ptmx at /dev/ptmx or +to bind mount /dev/ptx/ptmx to /dev/ptmx. If you opt for using +the devpts filesystem in this manner devpts should be mounted with +the ptmxmode=0666, or chmod 0666 /dev/pts/ptmx should be called. Total count of pty pairs in all instances is limited by sysctls: kernel.pty.max = 4096 - global limit -kernel.pty.reserve = 1024 - reserve for initial instance +kernel.pty.reserve = 1024 - reserved for filesystems mounted from the initial mount namespace kernel.pty.nr - current count of ptys Per-instance limit could be set by adding mount option "max=<count>". This feature was added in kernel 3.4 together with sysctl kernel.pty.reserve. In kernels older than 3.4 sysctl kernel.pty.max works as per-instance limit. - -User-space changes ------------------- - -In multi-instance mode (i.e '-o newinstance' mount option is specified at least -once), following user-space issues should be noted. - -1. If -o newinstance mount option is never used, /dev/pts/ptmx can be ignored - and no change is needed to system-startup scripts. - -2. To effectively use multi-instance mode (i.e -o newinstance is specified) - administrators or startup scripts should "redirect" open of /dev/ptmx to - /dev/pts/ptmx using either a bind mount or symlink. - - $ mount -t devpts -o newinstance devpts /dev/pts - - followed by either - - $ rm /dev/ptmx - $ ln -s pts/ptmx /dev/ptmx - $ chmod 666 /dev/pts/ptmx - or - $ mount -o bind /dev/pts/ptmx /dev/ptmx - -3. The '/dev/ptmx -> pts/ptmx' symlink is the preferred method since it - enables better error-reporting and treats both single-instance and - multi-instance mounts similarly. - - But this method requires that system-startup scripts set the mode of - /dev/pts/ptmx correctly (default mode is 0000). The scripts can set the - mode by, either - - - adding ptmxmode mount option to devpts entry in /etc/fstab, or - - using 'chmod 0666 /dev/pts/ptmx' - -4. If multi-instance mode mount is needed for containers, but the system - startup scripts have not yet been updated, container-startup scripts - should bind mount /dev/ptmx to /dev/pts/ptmx to avoid breaking single- - instance mounts. - - Or, in general, container-startup scripts should use: - - mount -t devpts -o newinstance -o ptmxmode=0666 devpts /dev/pts - if [ ! -L /dev/ptmx ]; then - mount -o bind /dev/pts/ptmx /dev/ptmx - fi - - When all devpts mounts are multi-instance, /dev/ptmx can permanently be - a symlink to pts/ptmx and the bind mount can be ignored. - -5. A multi-instance mount that is not accompanied by the /dev/ptmx to - /dev/pts/ptmx redirection would result in an unusable/unreachable pty. - - mount -t devpts -o newinstance lxcpts /dev/pts - - immediately followed by: - - open("/dev/ptmx") - - would create a pty, say /dev/pts/7, in the initial kernel mount. - But /dev/pts/7 would be invisible in the new mount. - -6. The permissions for /dev/pts/ptmx node should be specified when mounting - /dev/pts, using the '-o ptmxmode=%o' mount option (default is 0000). - - mount -t devpts -o newinstance -o ptmxmode=0644 devpts /dev/pts - - The permissions can be later be changed as usual with 'chmod'. - - chmod 666 /dev/pts/ptmx - -7. A mount of devpts without the 'newinstance' option results in binding to - initial kernel mount. This behavior while preserving legacy semantics, - does not provide strict isolation in a container environment. i.e by - mounting devpts without the 'newinstance' option, a container could - get visibility into the 'host' or root container's devpts. - - To workaround this and have strict isolation, all mounts of devpts, - including the mount in the root container, should use the newinstance - option. diff --git a/Documentation/filesystems/directory-locking b/Documentation/filesystems/directory-locking index 09bbf9a54f80..c314badbcfc6 100644 --- a/Documentation/filesystems/directory-locking +++ b/Documentation/filesystems/directory-locking @@ -1,30 +1,37 @@ Locking scheme used for directory operations is based on two -kinds of locks - per-inode (->i_mutex) and per-filesystem +kinds of locks - per-inode (->i_rwsem) and per-filesystem (->s_vfs_rename_mutex). - When taking the i_mutex on multiple non-directory objects, we + When taking the i_rwsem on multiple non-directory objects, we always acquire the locks in order by increasing address. We'll call that "inode pointer" order in the following. For our purposes all operations fall in 5 classes: 1) read access. Locking rules: caller locks directory we are accessing. +The lock is taken shared. -2) object creation. Locking rules: same as above. +2) object creation. Locking rules: same as above, but the lock is taken +exclusive. 3) object removal. Locking rules: caller locks parent, finds victim, -locks victim and calls the method. +locks victim and calls the method. Locks are exclusive. 4) rename() that is _not_ cross-directory. Locking rules: caller locks -the parent and finds source and target. If target already exists, lock -it. If source is a non-directory, lock it. If that means we need to -lock both, lock them in inode pointer order. +the parent and finds source and target. In case of exchange (with +RENAME_EXCHANGE in rename2() flags argument) lock both. In any case, +if the target already exists, lock it. If the source is a non-directory, +lock it. If we need to lock both, lock them in inode pointer order. +Then call the method. All locks are exclusive. +NB: we might get away with locking the the source (and target in exchange +case) shared. 5) link creation. Locking rules: * lock parent * check that source is not a directory * lock source * call the method. +All locks are exclusive. 6) cross-directory rename. The trickiest in the whole bunch. Locking rules: @@ -35,11 +42,12 @@ rules: fail with -ENOTEMPTY * if new parent is equal to or is a descendent of source fail with -ELOOP - * If target exists, lock it. If source is a non-directory, lock - it. In case that means we need to lock both source and target, - do so in inode pointer order. + * If it's an exchange, lock both the source and the target. + * If the target exists, lock it. If the source is a non-directory, + lock it. If we need to lock both, do so in inode pointer order. * call the method. - +All ->i_rwsem are taken exclusive. Again, we might get away with locking +the the source (and target in exchange case) shared. The rules above obviously guarantee that all directories that are going to be read, modified or removed by method will be locked by caller. @@ -73,7 +81,7 @@ objects - A < B iff A is an ancestor of B. attempt to acquire some lock and already holds at least one lock. Let's consider the set of contended locks. First of all, filesystem lock is not contended, since any process blocked on it is not holding any locks. -Thus all processes are blocked on ->i_mutex. +Thus all processes are blocked on ->i_rwsem. By (3), any process holding a non-directory lock can only be waiting on another non-directory lock with a larger address. Therefore diff --git a/Documentation/filesystems/f2fs.txt b/Documentation/filesystems/f2fs.txt index e1c9f0849da6..753dd4f96afe 100644 --- a/Documentation/filesystems/f2fs.txt +++ b/Documentation/filesystems/f2fs.txt @@ -109,7 +109,9 @@ background_gc=%s Turn on/off cleaning operations, namely garbage disable_roll_forward Disable the roll-forward recovery routine norecovery Disable the roll-forward recovery routine, mounted read- only (i.e., -o ro,disable_roll_forward) -discard Issue discard/TRIM commands when a segment is cleaned. +discard/nodiscard Enable/disable real-time discard in f2fs, if discard is + enabled, f2fs will issue discard/TRIM commands when a + segment is cleaned. no_heap Disable heap-style segment allocation which finds free segments for data from the beginning of main area, while for node from the end of main area. @@ -129,6 +131,7 @@ inline_dentry Enable the inline dir feature: data in new created directory entries can be written into inode block. The space of inode block which is used to store inline dentries is limited to ~3.4k. +noinline_dentry Diable the inline dentry feature. flush_merge Merge concurrent cache_flush commands as much as possible to eliminate redundant command issues. If the underlying device handles the cache_flush command relatively slowly, @@ -151,6 +154,9 @@ noinline_data Disable the inline data feature, inline data feature is enabled by default. data_flush Enable data flushing before checkpoint in order to persist data of regular and symlink. +mode=%s Control block allocation mode which supports "adaptive" + and "lfs". In "lfs" mode, there should be no random + writes towards main area. ================================================================================ DEBUGFS ENTRIES diff --git a/Documentation/filesystems/nilfs2.txt b/Documentation/filesystems/nilfs2.txt index 41c3d332acc9..c0727dc36271 100644 --- a/Documentation/filesystems/nilfs2.txt +++ b/Documentation/filesystems/nilfs2.txt @@ -267,4 +267,10 @@ among NILFS2 files can be depicted as follows: `-- file (ino=yy) ( regular file, directory, or symlink ) -For detail on the format of each file, please see include/linux/nilfs2_fs.h. +For detail on the format of each file, please see nilfs2_ondisk.h +located at include/uapi/linux directory. + +There are no patents or other intellectual property that we protect +with regard to the design of NILFS2. It is allowed to replicate the +design in hopes that other operating systems could share (mount, read, +write, etc.) data stored in this format. diff --git a/Documentation/filesystems/ocfs2-online-filecheck.txt b/Documentation/filesystems/ocfs2-online-filecheck.txt index 1ab07860430d..139fab175c8a 100644 --- a/Documentation/filesystems/ocfs2-online-filecheck.txt +++ b/Documentation/filesystems/ocfs2-online-filecheck.txt @@ -5,12 +5,12 @@ This document will describe OCFS2 online file check feature. Introduction ============ -OCFS2 is often used in high-availaibility systems. However, OCFS2 usually +OCFS2 is often used in high-availability systems. However, OCFS2 usually converts the filesystem to read-only when encounters an error. This may not be necessary, since turning the filesystem read-only would affect other running processes as well, decreasing availability. Then, a mount option (errors=continue) is introduced, which would return the --EIO errno to the calling process and terminate furhter processing so that the +-EIO errno to the calling process and terminate further processing so that the filesystem is not corrupted further. The filesystem is not converted to read-only, and the problematic file's inode number is reported in the kernel log. The user can try to check/fix this file via online filecheck feature. @@ -44,7 +44,7 @@ There is a sysfs directory for each OCFS2 file system mounting: /sys/fs/ocfs2/<devname>/filecheck -Here, <devname> indicates the name of OCFS2 volumn device which has been already +Here, <devname> indicates the name of OCFS2 volume device which has been already mounted. The file above would accept inode numbers. This could be used to communicate with kernel space, tell which file(inode number) will be checked or fixed. Currently, three operations are supported, which includes checking @@ -76,14 +76,14 @@ The output is like this: This time, the <ERROR> column indicates whether this fix is successful or not. 3. The record cache is used to store the history of check/fix results. It's -defalut size is 10, and can be adjust between the range of 10 ~ 100. You can +default size is 10, and can be adjust between the range of 10 ~ 100. You can adjust the size like this: # echo "<size>" > /sys/fs/ocfs2/<devname>/filecheck/set Fixing stuff ============ -On receivng the inode, the filesystem would read the inode and the +On receiving the inode, the filesystem would read the inode and the file metadata. In case of errors, the filesystem would fix the errors and report the problems it fixed in the kernel log. As a precautionary measure, the inode must first be checked for errors before performing a final fix. diff --git a/Documentation/filesystems/orangefs.txt b/Documentation/filesystems/orangefs.txt index e1a0056a365f..1dfdec790946 100644 --- a/Documentation/filesystems/orangefs.txt +++ b/Documentation/filesystems/orangefs.txt @@ -281,7 +281,7 @@ on the wait queue and one attempt is made to recycle them. Obviously, if the client-core stays dead too long, the arbitrary userspace processes trying to use Orangefs will be negatively affected. Waiting ops that can't be serviced will be removed from the request list and -have their states set to "given up". In-progress ops that can't +have their states set to "given up". In-progress ops that can't be serviced will be removed from the in_progress hash table and have their states set to "given up". @@ -338,7 +338,7 @@ particular response. PVFS2_VFS_OP_STATFS fill a pvfs2_statfs_response_t with useless info <g>. It is hard for us to know, in a timely fashion, these statistics about our - distributed network filesystem. + distributed network filesystem. PVFS2_VFS_OP_FS_MOUNT fill a pvfs2_fs_mount_response_t which is just like a PVFS_object_kref @@ -386,7 +386,7 @@ responses: io_array[1].iov_base = address of global variable "pdev_magic" (int32_t) io_array[1].iov_len = sizeof(int32_t) - + io_array[2].iov_base = address of parameter "tag" (PVFS_id_gen_t) io_array[2].iov_len = sizeof(int64_t) @@ -402,5 +402,47 @@ Readdir responses initialize the fifth element io_array like this: io_array[4].iov_len = contents of member trailer_size (PVFS_size) from out_downcall member of global variable vfs_request - + +Orangefs exploits the dcache in order to avoid sending redundant +requests to userspace. We keep object inode attributes up-to-date with +orangefs_inode_getattr. Orangefs_inode_getattr uses two arguments to +help it decide whether or not to update an inode: "new" and "bypass". +Orangefs keeps private data in an object's inode that includes a short +timeout value, getattr_time, which allows any iteration of +orangefs_inode_getattr to know how long it has been since the inode was +updated. When the object is not new (new == 0) and the bypass flag is not +set (bypass == 0) orangefs_inode_getattr returns without updating the inode +if getattr_time has not timed out. Getattr_time is updated each time the +inode is updated. + +Creation of a new object (file, dir, sym-link) includes the evaluation of +its pathname, resulting in a negative directory entry for the object. +A new inode is allocated and associated with the dentry, turning it from +a negative dentry into a "productive full member of society". Orangefs +obtains the new inode from Linux with new_inode() and associates +the inode with the dentry by sending the pair back to Linux with +d_instantiate(). + +The evaluation of a pathname for an object resolves to its corresponding +dentry. If there is no corresponding dentry, one is created for it in +the dcache. Whenever a dentry is modified or verified Orangefs stores a +short timeout value in the dentry's d_time, and the dentry will be trusted +for that amount of time. Orangefs is a network filesystem, and objects +can potentially change out-of-band with any particular Orangefs kernel module +instance, so trusting a dentry is risky. The alternative to trusting +dentries is to always obtain the needed information from userspace - at +least a trip to the client-core, maybe to the servers. Obtaining information +from a dentry is cheap, obtaining it from userspace is relatively expensive, +hence the motivation to use the dentry when possible. + +The timeout values d_time and getattr_time are jiffy based, and the +code is designed to avoid the jiffy-wrap problem: + +"In general, if the clock may have wrapped around more than once, there +is no way to tell how much time has elapsed. However, if the times t1 +and t2 are known to be fairly close, we can reliably compute the +difference in a way that takes into account the possibility that the +clock may have wrapped between times." + + from course notes by instructor Andy Wang diff --git a/Documentation/filesystems/overlayfs.txt b/Documentation/filesystems/overlayfs.txt index 28091457b71a..bcbf9710e4af 100644 --- a/Documentation/filesystems/overlayfs.txt +++ b/Documentation/filesystems/overlayfs.txt @@ -183,26 +183,15 @@ The copy_up operation essentially creates a new, identical file and moves it over to the old name. The new file may be on a different filesystem, so both st_dev and st_ino of the file may change. -Any open files referring to this inode will access the old data and -metadata. Similarly any file locks obtained before copy_up will not -apply to the copied up file. +Any open files referring to this inode will access the old data. -On a file opened with O_RDONLY fchmod(2), fchown(2), futimesat(2) and -fsetxattr(2) will fail with EROFS. +Any file locks (and leases) obtained before copy_up will not apply +to the copied up file. If a file with multiple hard links is copied up, then this will "break" the link. Changes will not be propagated to other names referring to the same inode. -Symlinks in /proc/PID/ and /proc/PID/fd which point to a non-directory -object in overlayfs will not contain valid absolute paths, only -relative paths leading up to the filesystem's root. This will be -fixed in the future. - -Some operations are not atomic, for example a crash during copy_up or -rename will leave the filesystem in an inconsistent state. This will -be addressed in the future. - Changes to underlying filesystems --------------------------------- diff --git a/Documentation/filesystems/pohmelfs/design_notes.txt b/Documentation/filesystems/pohmelfs/design_notes.txt index 8aef91335701..106d17fbb05f 100644 --- a/Documentation/filesystems/pohmelfs/design_notes.txt +++ b/Documentation/filesystems/pohmelfs/design_notes.txt @@ -29,7 +29,7 @@ Main features of this FS include: * Read request (data read, directory listing, lookup requests) balancing between multiple servers. * Write requests are replicated to multiple servers and completed only when all of them are acked. * Ability to add and/or remove servers from the working set at run-time. - * Strong authentification and possible data encryption in network channel. + * Strong authentication and possible data encryption in network channel. * Extended attributes support. POHMELFS is based on transactions, which are potentially long-standing objects that live diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting index 46f3bb7a02f5..b1bd05ea66b2 100644 --- a/Documentation/filesystems/porting +++ b/Documentation/filesystems/porting @@ -578,3 +578,17 @@ in your dentry operations instead. -- [mandatory] ->atomic_open() calls without O_CREAT may happen in parallel. +-- +[mandatory] + ->setxattr() and xattr_handler.set() get dentry and inode passed separately. + dentry might be yet to be attached to inode, so do _not_ use its ->d_inode + in the instances. Rationale: !@#!@# security_d_instantiate() needs to be + called before we attach dentry to inode and !@#!@##!@$!$#!@#$!@$!@$ smack + ->d_instantiate() uses not just ->getxattr() but ->setxattr() as well. +-- +[mandatory] + ->d_compare() doesn't get parent as a separate argument anymore. If you + used it for finding the struct super_block involved, dentry->d_sb will + work just as well; if it's something more complicated, use dentry->d_parent. + Just be careful not to assume that fetching it more than once will yield + the same value - in RCU mode it could change under you. diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 7f5607a089b4..fcc1ac094282 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -145,7 +145,7 @@ Table 1-1: Process specific entries in /proc symbol the task is blocked in - or "0" if not blocked. pagemap Page table stack Report full stack trace, enable via CONFIG_STACKTRACE - smaps a extension based on maps, showing the memory consumption of + smaps an extension based on maps, showing the memory consumption of each mapping and flags associated with it numa_maps an extension based on maps, showing the memory locality and binding policy as well as mem usage (in pages) of each mapping. @@ -225,6 +225,7 @@ Table 1-2: Contents of the status files (as of 4.1) TracerPid PID of process tracing this process (0 if not) Uid Real, effective, saved set, and file system UIDs Gid Real, effective, saved set, and file system GIDs + Umask file mode creation mask FDSize number of file descriptor slots currently allocated Groups supplementary group list NStgid descendant namespace thread group ID hierarchy @@ -435,6 +436,7 @@ Private_Dirty: 0 kB Referenced: 892 kB Anonymous: 0 kB AnonHugePages: 0 kB +ShmemPmdMapped: 0 kB Shared_Hugetlb: 0 kB Private_Hugetlb: 0 kB Swap: 0 kB @@ -463,6 +465,8 @@ accessed. a mapping associated with a file may contain anonymous pages: when MAP_PRIVATE and a page is modified, the file page is replaced by a private anonymous copy. "AnonHugePages" shows the ammount of memory backed by transparent hugepage. +"ShmemPmdMapped" shows the ammount of shared (shmem/tmpfs) memory backed by +huge pages. "Shared_Hugetlb" and "Private_Hugetlb" show the ammounts of memory backed by hugetlbfs page which is *not* counted in "RSS" or "PSS" field for historical reasons. And these are not included in {Shared,Private}_{Clean,Dirty} field. @@ -724,7 +728,7 @@ IRQ, you can set it by doing: > echo 1 > /proc/irq/10/smp_affinity This means that only the first CPU will handle the IRQ, but you can also echo -5 which means that only the first and fourth CPU can handle the IRQ. +5 which means that only the first and third CPU can handle the IRQ. The contents of each smp_affinity file is the same by default: @@ -867,6 +871,9 @@ VmallocTotal: 112216 kB VmallocUsed: 428 kB VmallocChunk: 111088 kB AnonHugePages: 49152 kB +ShmemHugePages: 0 kB +ShmemPmdMapped: 0 kB + MemTotal: Total usable ram (i.e. physical ram minus a few reserved bits and the kernel binary code) @@ -911,6 +918,9 @@ MemAvailable: An estimate of how much memory is available for starting new AnonHugePages: Non-file backed huge pages mapped into userspace page tables Mapped: files which have been mmaped, such as libraries Shmem: Total memory used by shared memory (shmem) and tmpfs +ShmemHugePages: Memory used by shared memory (shmem) and tmpfs allocated + with huge pages +ShmemPmdMapped: Shared memory mapped into userspace with huge pages Slab: in-kernel data structures cache SReclaimable: Part of Slab, that might be reclaimed, such as caches SUnreclaim: Part of Slab, that cannot be reclaimed on memory pressure diff --git a/Documentation/filesystems/qnx6.txt b/Documentation/filesystems/qnx6.txt index 408679789136..4f3d6a882bdc 100644 --- a/Documentation/filesystems/qnx6.txt +++ b/Documentation/filesystems/qnx6.txt @@ -16,7 +16,7 @@ qnx6fs shares many properties with traditional Unix filesystems. It has the concepts of blocks, inodes and directories. On QNX it is possible to create little endian and big endian qnx6 filesystems. This feature makes it possible to create and use a different endianness fs -for the target (QNX is used on quite a range of embedded systems) plattform +for the target (QNX is used on quite a range of embedded systems) platform running on a different endianness. The Linux driver handles endianness transparently. (LE and BE) diff --git a/Documentation/filesystems/tmpfs.txt b/Documentation/filesystems/tmpfs.txt index d9c11d25bf02..a85355cf85f4 100644 --- a/Documentation/filesystems/tmpfs.txt +++ b/Documentation/filesystems/tmpfs.txt @@ -98,7 +98,7 @@ A memory policy with a valid NodeList will be saved, as specified, for use at file creation time. When a task allocates a file in the file system, the mount option memory policy will be applied with a NodeList, if any, modified by the calling task's cpuset constraints -[See Documentation/cgroups/cpusets.txt] and any optional flags, listed +[See Documentation/cgroup-v1/cpusets.txt] and any optional flags, listed below. If the resulting NodeLists is the empty set, the effective memory policy for the file will revert to "default" policy. diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index bb1c91726922..cbec006e10e4 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -364,7 +364,6 @@ struct inode_operations { int (*atomic_open)(struct inode *, struct dentry *, struct file *, unsigned open_flag, umode_t create_mode, int *opened); int (*tmpfile) (struct inode *, struct dentry *, umode_t); - int (*dentry_open)(struct dentry *, struct file *, const struct cred *); }; Again, all methods are called without any locks being held, unless @@ -534,9 +533,7 @@ __sync_single_inode) to check if ->writepages has been successful in writing out the whole address_space. The Writeback tag is used by filemap*wait* and sync_page* functions, -via filemap_fdatawait_range, to wait for all writeback to -complete. While waiting ->sync_page (if defined) will be called on -each page that is found to require writeback. +via filemap_fdatawait_range, to wait for all writeback to complete. An address_space handler may attach extra information to a page, typically using the 'private' field in the 'struct page'. If such @@ -554,8 +551,8 @@ address_space has finer control of write sizes. The read process essentially only requires 'readpage'. The write process is more complicated and uses write_begin/write_end or -set_page_dirty to write data into the address_space, and writepage, -sync_page, and writepages to writeback data to storage. +set_page_dirty to write data into the address_space, and writepage +and writepages to writeback data to storage. Adding and removing pages to/from an address_space is protected by the inode's i_mutex. @@ -592,9 +589,14 @@ struct address_space_operations { int (*releasepage) (struct page *, int); void (*freepage)(struct page *); ssize_t (*direct_IO)(struct kiocb *, struct iov_iter *iter); + /* isolate a page for migration */ + bool (*isolate_page) (struct page *, isolate_mode_t); /* migrate the contents of a page to the specified target */ int (*migratepage) (struct page *, struct page *); + /* put migration-failed page back to right list */ + void (*putback_page) (struct page *); int (*launder_page) (struct page *); + int (*is_partially_uptodate) (struct page *, unsigned long, unsigned long); void (*is_dirty_writeback) (struct page *, bool *, bool *); @@ -696,13 +698,6 @@ struct address_space_operations { but instead uses bmap to find out where the blocks in the file are and uses those addresses directly. - dentry_open: *WARNING: probably going away soon, do not use!* This is an - alternative to f_op->open(), the difference is that this method may open - a file not necessarily originating from the same filesystem as the one - i_op->open() was called on. It may be useful for stacking filesystems - which want to allow native I/O directly on underlying files. - - invalidatepage: If a page has PagePrivate set, then invalidatepage will be called when part or all of the page is to be removed from the address space. This generally corresponds to either a @@ -747,6 +742,10 @@ struct address_space_operations { and transfer data directly between the storage and the application's address space. + isolate_page: Called by the VM when isolating a movable non-lru page. + If page is successfully isolated, VM marks the page as PG_isolated + via __SetPageIsolated. + migrate_page: This is used to compact the physical memory usage. If the VM wants to relocate a page (maybe off a memory card that is signalling imminent failure) it will pass a new page @@ -754,6 +753,8 @@ struct address_space_operations { transfer any private data across and update any references that it has to the page. + putback_page: Called by the VM when isolated page's migration fails. + launder_page: Called before freeing a page - it writes back the dirty page. To prevent redirtying the page, it is kept locked during the whole operation. @@ -930,14 +931,17 @@ struct dentry_operations { int (*d_revalidate)(struct dentry *, unsigned int); int (*d_weak_revalidate)(struct dentry *, unsigned int); int (*d_hash)(const struct dentry *, struct qstr *); - int (*d_compare)(const struct dentry *, const struct dentry *, + int (*d_compare)(const struct dentry *, unsigned int, const char *, const struct qstr *); int (*d_delete)(const struct dentry *); + int (*d_init)(struct dentry *); void (*d_release)(struct dentry *); void (*d_iput)(struct dentry *, struct inode *); char *(*d_dname)(struct dentry *, char *, int); struct vfsmount *(*d_automount)(struct path *); int (*d_manage)(struct dentry *, bool); + struct dentry *(*d_real)(struct dentry *, const struct inode *, + unsigned int); }; d_revalidate: called when the VFS needs to revalidate a dentry. This @@ -1003,6 +1007,8 @@ struct dentry_operations { always cache a reachable dentry. d_delete must be constant and idempotent. + d_init: called when a dentry is allocated + d_release: called when a dentry is really deallocated d_iput: called when a dentry loses its inode (just prior to its @@ -1022,6 +1028,14 @@ struct dentry_operations { at the end of the buffer, and returns a pointer to the first char. dynamic_dname() helper function is provided to take care of this. + Example : + + static char *pipefs_dname(struct dentry *dent, char *buffer, int buflen) + { + return dynamic_dname(dentry, buffer, buflen, "pipe:[%lu]", + dentry->d_inode->i_ino); + } + d_automount: called when an automount dentry is to be traversed (optional). This should create a new VFS mount record and return the record to the caller. The caller is supplied with a path parameter giving the @@ -1060,13 +1074,23 @@ struct dentry_operations { This function is only used if DCACHE_MANAGE_TRANSIT is set on the dentry being transited from. -Example : + d_real: overlay/union type filesystems implement this method to return one of + the underlying dentries hidden by the overlay. It is used in three + different modes: -static char *pipefs_dname(struct dentry *dent, char *buffer, int buflen) -{ - return dynamic_dname(dentry, buffer, buflen, "pipe:[%lu]", - dentry->d_inode->i_ino); -} + Called from open it may need to copy-up the file depending on the + supplied open flags. This mode is selected with a non-zero flags + argument. In this mode the d_real method can return an error. + + Called from file_dentry() it returns the real dentry matching the inode + argument. The real dentry may be from a lower layer already copied up, + but still referenced from the file. This mode is selected with a + non-NULL inode argument. This will always succeed. + + With NULL inode and zero flags the topmost real underlying dentry is + returned. This will always succeed. + + This method is never called with both non-NULL inode and non-zero flags. Each dentry has a pointer to its parent dentry, as well as a hash list of child dentries. Child dentries are basically like files in a diff --git a/Documentation/filesystems/xfs.txt b/Documentation/filesystems/xfs.txt index 8146e9fd5ffc..c2d44e6e117b 100644 --- a/Documentation/filesystems/xfs.txt +++ b/Documentation/filesystems/xfs.txt @@ -348,3 +348,126 @@ Removed Sysctls ---- ------- fs.xfs.xfsbufd_centisec v4.0 fs.xfs.age_buffer_centisecs v4.0 + + +Error handling +============== + +XFS can act differently according to the type of error found during its +operation. The implementation introduces the following concepts to the error +handler: + + -failure speed: + Defines how fast XFS should propagate an error upwards when a specific + error is found during the filesystem operation. It can propagate + immediately, after a defined number of retries, after a set time period, + or simply retry forever. + + -error classes: + Specifies the subsystem the error configuration will apply to, such as + metadata IO or memory allocation. Different subsystems will have + different error handlers for which behaviour can be configured. + + -error handlers: + Defines the behavior for a specific error. + +The filesystem behavior during an error can be set via sysfs files. Each +error handler works independently - the first condition met by an error handler +for a specific class will cause the error to be propagated rather than reset and +retried. + +The action taken by the filesystem when the error is propagated is context +dependent - it may cause a shut down in the case of an unrecoverable error, +it may be reported back to userspace, or it may even be ignored because +there's nothing useful we can with the error or anyone we can report it to (e.g. +during unmount). + +The configuration files are organized into the following hierarchy for each +mounted filesystem: + + /sys/fs/xfs/<dev>/error/<class>/<error>/ + +Where: + <dev> + The short device name of the mounted filesystem. This is the same device + name that shows up in XFS kernel error messages as "XFS(<dev>): ..." + + <class> + The subsystem the error configuration belongs to. As of 4.9, the defined + classes are: + + - "metadata": applies metadata buffer write IO + + <error> + The individual error handler configurations. + + +Each filesystem has "global" error configuration options defined in their top +level directory: + + /sys/fs/xfs/<dev>/error/ + + fail_at_unmount (Min: 0 Default: 1 Max: 1) + Defines the filesystem error behavior at unmount time. + + If set to a value of 1, XFS will override all other error configurations + during unmount and replace them with "immediate fail" characteristics. + i.e. no retries, no retry timeout. This will always allow unmount to + succeed when there are persistent errors present. + + If set to 0, the configured retry behaviour will continue until all + retries and/or timeouts have been exhausted. This will delay unmount + completion when there are persistent errors, and it may prevent the + filesystem from ever unmounting fully in the case of "retry forever" + handler configurations. + + Note: there is no guarantee that fail_at_unmount can be set whilst an + unmount is in progress. It is possible that the sysfs entries are + removed by the unmounting filesystem before a "retry forever" error + handler configuration causes unmount to hang, and hence the filesystem + must be configured appropriately before unmount begins to prevent + unmount hangs. + +Each filesystem has specific error class handlers that define the error +propagation behaviour for specific errors. There is also a "default" error +handler defined, which defines the behaviour for all errors that don't have +specific handlers defined. Where multiple retry constraints are configuredi for +a single error, the first retry configuration that expires will cause the error +to be propagated. The handler configurations are found in the directory: + + /sys/fs/xfs/<dev>/error/<class>/<error>/ + + max_retries (Min: -1 Default: Varies Max: INTMAX) + Defines the allowed number of retries of a specific error before + the filesystem will propagate the error. The retry count for a given + error context (e.g. a specific metadata buffer) is reset every time + there is a successful completion of the operation. + + Setting the value to "-1" will cause XFS to retry forever for this + specific error. + + Setting the value to "0" will cause XFS to fail immediately when the + specific error is reported. + + Setting the value to "N" (where 0 < N < Max) will make XFS retry the + operation "N" times before propagating the error. + + retry_timeout_seconds (Min: -1 Default: Varies Max: 1 day) + Define the amount of time (in seconds) that the filesystem is + allowed to retry its operations when the specific error is + found. + + Setting the value to "-1" will allow XFS to retry forever for this + specific error. + + Setting the value to "0" will cause XFS to fail immediately when the + specific error is reported. + + Setting the value to "N" (where 0 < N < Max) will allow XFS to retry the + operation for up to "N" seconds before propagating the error. + +Note: The default behaviour for a specific error handler is dependent on both +the class and error context. For example, the default values for +"metadata/ENODEV" are "0" rather than "-1" so that this error handler defaults +to "fail immediately" behaviour. This is done because ENODEV is a fatal, +unrecoverable error no matter how many times the metadata IO is retried. |