8 files changed, 201 insertions, 31 deletions
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index fe15682e8acd..14cdc101d165 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -59,10 +59,7 @@ prototypes:
 	int (*get_acl)(struct inode *, int);
 	int (*setattr) (struct dentry *, struct iattr *);
 	int (*getattr) (struct vfsmount *, struct dentry *, struct kstat *);
-	int (*setxattr) (struct dentry *, const char *,const void *,size_t,int);
-	ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t);
 	ssize_t (*listxattr) (struct dentry *, char *, size_t);
-	int (*removexattr) (struct dentry *, const char *);
 	int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start, u64 len);
 	void (*update_time)(struct inode *, struct timespec *, int);
 	int (*atomic_open)(struct inode *, struct dentry *,
@@ -88,15 +85,13 @@ setattr:	yes
 permission:	no (may not block if called in rcu-walk mode)
 get_acl:	no
 getattr:	no
-setxattr:	yes
-getxattr:	no
 listxattr:	no
-removexattr:	yes
 fiemap:		no
 update_time:	no
 atomic_open:	yes
 tmpfile:	no
 
+
 	Additionally, ->rmdir(), ->unlink() and ->rename() have ->i_mutex on
 victim.
 	cross-directory ->rename() has (per-superblock) ->s_vfs_rename_sem.
@@ -104,6 +99,23 @@ victim.
 See Documentation/filesystems/directory-locking for more detailed discussion
 of the locking scheme for directory operations.
 
+----------------------- xattr_handler operations -----------------------
+prototypes:
+	bool (*list)(struct dentry *dentry);
+	int (*get)(const struct xattr_handler *handler, struct dentry *dentry,
+		   struct inode *inode, const char *name, void *buffer,
+		   size_t size);
+	int (*set)(const struct xattr_handler *handler, struct dentry *dentry,
+		   struct inode *inode, const char *name, const void *buffer,
+		   size_t size, int flags);
+
+locking rules:
+	all may block
+		i_mutex(inode)
+list:		no
+get:		no
+set:		yes
+
 --------------------------- super_operations ---------------------------
 prototypes:
 	struct inode *(*alloc_inode)(struct super_block *sb);
diff --git a/Documentation/filesystems/ceph.txt b/Documentation/filesystems/ceph.txt
index d6030aa33376..f5306ee40ea9 100644
--- a/Documentation/filesystems/ceph.txt
+++ b/Documentation/filesystems/ceph.txt
@@ -98,6 +98,10 @@ Mount Options
 	size.
 
   rsize=X
+	Specify the maximum read size in bytes.  By default there is no
+	maximum.
+
+  rasize=X
 	Specify the maximum readahead.
 
   mount_timeout=X
diff --git a/Documentation/filesystems/dax.txt b/Documentation/filesystems/dax.txt
index 0c16a22521a8..23d18b8a49d5 100644
--- a/Documentation/filesystems/dax.txt
+++ b/Documentation/filesystems/dax.txt
@@ -123,9 +123,12 @@ The DAX code does not work correctly on architectures which have virtually
 mapped caches such as ARM, MIPS and SPARC.
 
 Calling get_user_pages() on a range of user memory that has been mmaped
-from a DAX file will fail as there are no 'struct page' to describe
-those pages.  This problem is being worked on.  That means that O_DIRECT
-reads/writes to those memory ranges from a non-DAX file will fail (note
-that O_DIRECT reads/writes _of a DAX file_ do work, it is the memory
-that is being accessed that is key here).  Other things that will not
-work include RDMA, sendfile() and splice().
+from a DAX file will fail when there are no 'struct page' to describe
+those pages.  This problem has been addressed in some device drivers
+by adding optional struct page support for pages under the control of
+the driver (see CONFIG_NVDIMM_PFN in drivers/nvdimm for an example of
+how to do this). In the non struct page cases O_DIRECT reads/writes to
+those memory ranges from a non-DAX file will fail (note that O_DIRECT
+reads/writes _of a DAX file_ do work, it is the memory that is being
+accessed that is key here).  Other things that will not work in the
+non struct page case include RDMA, sendfile() and splice().
diff --git a/Documentation/filesystems/f2fs.txt b/Documentation/filesystems/f2fs.txt
index ecd808088362..753dd4f96afe 100644
--- a/Documentation/filesystems/f2fs.txt
+++ b/Documentation/filesystems/f2fs.txt
@@ -131,6 +131,7 @@ inline_dentry          Enable the inline dir feature: data in new created
                        directory entries can be written into inode block. The
                        space of inode block which is used to store inline
                        dentries is limited to ~3.4k.
+noinline_dentry        Diable the inline dentry feature.
 flush_merge	       Merge concurrent cache_flush commands as much as possible
                        to eliminate redundant command issues. If the underlying
 		       device handles the cache_flush command relatively slowly,
diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting
index 1ab28d9b612a..bdd025ceb763 100644
--- a/Documentation/filesystems/porting
+++ b/Documentation/filesystems/porting
@@ -287,8 +287,8 @@ implementing on-disk size changes.  Start with a copy of the old inode_setattr
 and vmtruncate, and the reorder the vmtruncate + foofs_vmtruncate sequence to
 be in order of zeroing blocks using block_truncate_page or similar helpers,
 size update and on finally on-disk truncation which should not fail.
-inode_change_ok now includes the size checks for ATTR_SIZE and must be called
-in the beginning of ->setattr unconditionally.
+setattr_prepare (which used to be inode_change_ok) now includes the size checks
+for ATTR_SIZE and must be called in the beginning of ->setattr unconditionally.
 
 [mandatory]
 
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 68080ad6a75e..219ffd41a911 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -145,7 +145,7 @@ Table 1-1: Process specific entries in /proc
 		symbol the task is blocked in - or "0" if not blocked.
  pagemap	Page table
  stack		Report full stack trace, enable via CONFIG_STACKTRACE
- smaps		a extension based on maps, showing the memory consumption of
+ smaps		an extension based on maps, showing the memory consumption of
 		each mapping and flags associated with it
  numa_maps	an extension based on maps, showing the memory locality and
 		binding policy as well as mem usage (in pages) of each mapping.
@@ -515,6 +515,18 @@ be vanished or the reverse -- new added.
 This file is only present if the CONFIG_MMU kernel configuration option is
 enabled.
 
+Note: reading /proc/PID/maps or /proc/PID/smaps is inherently racy (consistent
+output can be achieved only in the single read call).
+This typically manifests when doing partial reads of these files while the
+memory map is being modified.  Despite the races, we do provide the following
+guarantees:
+
+1) The mapped addresses never go backwards, which implies no two
+   regions will ever overlap.
+2) If there is something at a given vaddr during the entirety of the
+   life of the smaps/maps walk, there will be some output for it.
+
+
 The /proc/PID/clear_refs is used to reset the PG_Referenced and ACCESSED/YOUNG
 bits on both physical and virtual pages associated with a process, and the
 soft-dirty bit on pte (see Documentation/vm/soft-dirty.txt for details).
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index b6bfa0bc02f8..d619c8d71966 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -323,6 +323,35 @@ Whoever sets up the inode is responsible for filling in the "i_op" field. This
 is a pointer to a "struct inode_operations" which describes the methods that
 can be performed on individual inodes.
 
+struct xattr_handlers
+---------------------
+
+On filesystems that support extended attributes (xattrs), the s_xattr
+superblock field points to a NULL-terminated array of xattr handlers.  Extended
+attributes are name:value pairs.
+
+  name: Indicates that the handler matches attributes with the specified name
+	(such as "system.posix_acl_access"); the prefix field must be NULL.
+
+  prefix: Indicates that the handler matches all attributes with the specified
+	name prefix (such as "user."); the name field must be NULL.
+
+  list: Determine if attributes matching this xattr handler should be listed
+	for a particular dentry.  Used by some listxattr implementations like
+	generic_listxattr.
+
+  get: Called by the VFS to get the value of a particular extended attribute.
+	This method is called by the getxattr(2) system call.
+
+  set: Called by the VFS to set the value of a particular extended attribute.
+	When the new value is NULL, called to remove a particular extended
+	attribute.  This method is called by the the setxattr(2) and
+	removexattr(2) system calls.
+
+When none of the xattr handlers of a filesystem match the specified attribute
+name or when a filesystem doesn't support extended attributes, the various
+*xattr(2) system calls return -EOPNOTSUPP.
+
 
 The Inode Object
 ================
@@ -354,10 +383,7 @@ struct inode_operations {
 	int (*get_acl)(struct inode *, int);
 	int (*setattr) (struct dentry *, struct iattr *);
 	int (*getattr) (struct vfsmount *mnt, struct dentry *, struct kstat *);
-	int (*setxattr) (struct dentry *, const char *,const void *,size_t,int);
-	ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t);
 	ssize_t (*listxattr) (struct dentry *, char *, size_t);
-	int (*removexattr) (struct dentry *, const char *);
 	void (*update_time)(struct inode *, struct timespec *, int);
 	int (*atomic_open)(struct inode *, struct dentry *, struct file *,
 			unsigned open_flag, umode_t create_mode, int *opened);
@@ -458,19 +484,8 @@ otherwise noted.
   getattr: called by the VFS to get attributes of a file. This method
   	is called by stat(2) and related system calls.
 
-  setxattr: called by the VFS to set an extended attribute for a file.
-  	Extended attribute is a name:value pair associated with an
-  	inode. This method is called by setxattr(2) system call.
-
-  getxattr: called by the VFS to retrieve the value of an extended
-  	attribute name. This method is called by getxattr(2) function
-  	call.
-
   listxattr: called by the VFS to list all extended attributes for a
-  	given file. This method is called by listxattr(2) system call.
-
-  removexattr: called by the VFS to remove an extended attribute from
-  	a file. This method is called by removexattr(2) system call.
+	given file. This method is called by the listxattr(2) system call.
 
   update_time: called by the VFS to update a specific time or the i_version of
   	an inode.  If this is not defined the VFS will update the inode itself
@@ -717,7 +732,7 @@ struct address_space_operations {
 
 	The second case is when a request has been made to invalidate
         some or all pages in an address_space.  This can happen
-        through the fadvice(POSIX_FADV_DONTNEED) system call or by the
+        through the fadvise(POSIX_FADV_DONTNEED) system call or by the
         filesystem explicitly requesting it as nfs and 9fs do (when
         they believe the cache may be out of date with storage) by
         calling invalidate_inode_pages2().
diff --git a/Documentation/filesystems/xfs.txt b/Documentation/filesystems/xfs.txt
index 8146e9fd5ffc..c2d44e6e117b 100644
--- a/Documentation/filesystems/xfs.txt
+++ b/Documentation/filesystems/xfs.txt
@@ -348,3 +348,126 @@ Removed Sysctls
   ----				-------
   fs.xfs.xfsbufd_centisec	v4.0
   fs.xfs.age_buffer_centisecs	v4.0
+
+
+Error handling
+==============
+
+XFS can act differently according to the type of error found during its
+operation. The implementation introduces the following concepts to the error
+handler:
+
+ -failure speed:
+	Defines how fast XFS should propagate an error upwards when a specific
+	error is found during the filesystem operation. It can propagate
+	immediately, after a defined number of retries, after a set time period,
+	or simply retry forever.
+
+ -error classes:
+	Specifies the subsystem the error configuration will apply to, such as
+	metadata IO or memory allocation. Different subsystems will have
+	different error handlers for which behaviour can be configured.
+
+ -error handlers:
+	Defines the behavior for a specific error.
+
+The filesystem behavior during an error can be set via sysfs files. Each
+error handler works independently - the first condition met by an error handler
+for a specific class will cause the error to be propagated rather than reset and
+retried.
+
+The action taken by the filesystem when the error is propagated is context
+dependent - it may cause a shut down in the case of an unrecoverable error,
+it may be reported back to userspace, or it may even be ignored because
+there's nothing useful we can with the error or anyone we can report it to (e.g.
+during unmount).
+
+The configuration files are organized into the following hierarchy for each
+mounted filesystem:
+
+  /sys/fs/xfs/<dev>/error/<class>/<error>/
+
+Where:
+  <dev>
+	The short device name of the mounted filesystem. This is the same device
+	name that shows up in XFS kernel error messages as "XFS(<dev>): ..."
+
+  <class>
+	The subsystem the error configuration belongs to. As of 4.9, the defined
+	classes are:
+
+		- "metadata": applies metadata buffer write IO
+
+  <error>
+	The individual error handler configurations.
+
+
+Each filesystem has "global" error configuration options defined in their top
+level directory:
+
+  /sys/fs/xfs/<dev>/error/
+
+  fail_at_unmount		(Min:  0  Default:  1  Max: 1)
+	Defines the filesystem error behavior at unmount time.
+
+	If set to a value of 1, XFS will override all other error configurations
+	during unmount and replace them with "immediate fail" characteristics.
+	i.e. no retries, no retry timeout. This will always allow unmount to
+	succeed when there are persistent errors present.
+
+	If set to 0, the configured retry behaviour will continue until all
+	retries and/or timeouts have been exhausted. This will delay unmount
+	completion when there are persistent errors, and it may prevent the
+	filesystem from ever unmounting fully in the case of "retry forever"
+	handler configurations.
+
+	Note: there is no guarantee that fail_at_unmount can be set whilst an
+	unmount is in progress. It is possible that the sysfs entries are
+	removed by the unmounting filesystem before a "retry forever" error
+	handler configuration causes unmount to hang, and hence the filesystem
+	must be configured appropriately before unmount begins to prevent
+	unmount hangs.
+
+Each filesystem has specific error class handlers that define the error
+propagation behaviour for specific errors. There is also a "default" error
+handler defined, which defines the behaviour for all errors that don't have
+specific handlers defined. Where multiple retry constraints are configuredi for
+a single error, the first retry configuration that expires will cause the error
+to be propagated. The handler configurations are found in the directory:
+
+  /sys/fs/xfs/<dev>/error/<class>/<error>/
+
+  max_retries			(Min: -1  Default: Varies  Max: INTMAX)
+	Defines the allowed number of retries of a specific error before
+	the filesystem will propagate the error. The retry count for a given
+	error context (e.g. a specific metadata buffer) is reset every time
+	there is a successful completion of the operation.
+
+	Setting the value to "-1" will cause XFS to retry forever for this
+	specific error.
+
+	Setting the value to "0" will cause XFS to fail immediately when the
+	specific error is reported.
+
+	Setting the value to "N" (where 0 < N < Max) will make XFS retry the
+	operation "N" times before propagating the error.
+
+  retry_timeout_seconds		(Min:  -1  Default:  Varies  Max: 1 day)
+	Define the amount of time (in seconds) that the filesystem is
+	allowed to retry its operations when the specific error is
+	found.
+
+	Setting the value to "-1" will allow XFS to retry forever for this
+	specific error.
+
+	Setting the value to "0" will cause XFS to fail immediately when the
+	specific error is reported.
+
+	Setting the value to "N" (where 0 < N < Max) will allow XFS to retry the
+	operation for up to "N" seconds before propagating the error.
+
+Note: The default behaviour for a specific error handler is dependent on both
+the class and error context. For example, the default values for
+"metadata/ENODEV" are "0" rather than "-1" so that this error handler defaults
+to "fail immediately" behaviour. This is done because ENODEV is a fatal,
+unrecoverable error no matter how many times the metadata IO is retried.