diff options
42 files changed, 4036 insertions, 150 deletions
diff --git a/Documentation/conf.py b/Documentation/conf.py index 62ac5a9f3a9f..b691af4831fa 100644 --- a/Documentation/conf.py +++ b/Documentation/conf.py @@ -34,7 +34,7 @@ needs_sphinx = '1.3' # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. -extensions = ['kerneldoc', 'rstFlatTable', 'kernel_include', 'cdomain', 'kfigure'] +extensions = ['kerneldoc', 'rstFlatTable', 'kernel_include', 'cdomain', 'kfigure', 'sphinx.ext.ifconfig'] # The name of the math extension changed on Sphinx 1.4 if major == 1 and minor > 3: diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4/ext4.rst index 7f628b9f7c4b..9d4368d591fa 100644 --- a/Documentation/filesystems/ext4.txt +++ b/Documentation/filesystems/ext4/ext4.rst @@ -1,6 +1,8 @@ +.. SPDX-License-Identifier: GPL-2.0 -Ext4 Filesystem -=============== +======================== +General Information +======================== Ext4 is an advanced level of the ext3 filesystem which incorporates scalability and reliability enhancements for supporting large filesystems @@ -11,37 +13,30 @@ Mailing list: linux-ext4@vger.kernel.org Web site: http://ext4.wiki.kernel.org -1. Quick usage instructions: -=========================== +Quick usage instructions +======================== Note: More extensive information for getting started with ext4 can be - found at the ext4 wiki site at the URL: - http://ext4.wiki.kernel.org/index.php/Ext4_Howto +found at the ext4 wiki site at the URL: +http://ext4.wiki.kernel.org/index.php/Ext4_Howto - - Compile and install the latest version of e2fsprogs (as of this - writing version 1.41.3) from: + - The latest version of e2fsprogs can be found at: + + https://www.kernel.org/pub/linux/kernel/people/tytso/e2fsprogs/ - http://sourceforge.net/project/showfiles.php?group_id=2406 - or - https://www.kernel.org/pub/linux/kernel/people/tytso/e2fsprogs/ + http://sourceforge.net/project/showfiles.php?group_id=2406 or grab the latest git repository from: - git://git.kernel.org/pub/scm/fs/ext2/e2fsprogs.git - - - Note that it is highly important to install the mke2fs.conf file - that comes with the e2fsprogs 1.41.x sources in /etc/mke2fs.conf. If - you have edited the /etc/mke2fs.conf file installed on your system, - you will need to merge your changes with the version from e2fsprogs - 1.41.x. + https://git.kernel.org/pub/scm/fs/ext2/e2fsprogs.git - Create a new filesystem using the ext4 filesystem type: - # mke2fs -t ext4 /dev/hda1 + # mke2fs -t ext4 /dev/hda1 - Or to configure an existing ext3 filesystem to support extents: + Or to configure an existing ext3 filesystem to support extents: # tune2fs -O extents /dev/hda1 @@ -50,10 +45,6 @@ Note: More extensive information for getting started with ext4 can be # tune2fs -I 256 /dev/hda1 - (Note: we currently do not have tools to convert an ext4 - filesystem back to ext3; so please do not do try this on production - filesystems.) - - Mounting: # mount -t ext4 /dev/hda1 /wherever @@ -75,10 +66,11 @@ Note: More extensive information for getting started with ext4 can be the filesystem with a large journal can also be helpful for metadata-intensive workloads. -2. Features -=========== +Features +======== -2.1 Currently available +Currently Available +------------------- * ability to use filesystems > 16TB (e2fsprogs support not available yet) * extent format reduces metadata overhead (RAM, IO for access, transactions) @@ -103,31 +95,15 @@ Note: More extensive information for getting started with ext4 can be [1] Filesystems with a block size of 1k may see a limit imposed by the directory hash tree having a maximum depth of two. -2.2 Candidate features for future inclusion - -* online defrag (patches available but not well tested) -* reduced mke2fs time via lazy itable initialization in conjunction with - the uninit_bg feature (capability to do this is available in e2fsprogs - but a kernel thread to do lazy zeroing of unused inode table blocks - after filesystem is first mounted is required for safety) - -There are several others under discussion, whether they all make it in is -partly a function of how much time everyone has to work on them. Features like -metadata checksumming have been discussed and planned for a bit but no patches -exist yet so I'm not sure they're in the near-term roadmap. - -The big performance win will come with mballoc, delalloc and flex_bg -grouping of bitmaps and inode tables. Some test results available here: - - - http://www.bullopensource.org/ext4/20080818-ffsb/ffsb-write-2.6.27-rc1.html - - http://www.bullopensource.org/ext4/20080818-ffsb/ffsb-readwrite-2.6.27-rc1.html - -3. Options -========== +Options +======= When mounting an ext4 filesystem, the following option are accepted: (*) == default +======================= ======================================================= +Mount Option Description +======================= ======================================================= ro Mount filesystem read only. Note that ext4 will replay the journal (and thus write to the partition) even when mounted "read only". The @@ -387,33 +363,38 @@ i_version Enable 64-bit inode version support. This option is dax Use direct access (no page cache). See Documentation/filesystems/dax.txt. Note that this option is incompatible with data=journal. +======================= ======================================================= Data Mode ========= There are 3 different data modes: * writeback mode -In data=writeback mode, ext4 does not journal data at all. This mode provides -a similar level of journaling as that of XFS, JFS, and ReiserFS in its default -mode - metadata journaling. A crash+recovery can cause incorrect data to -appear in files which were written shortly before the crash. This mode will -typically provide the best ext4 performance. + + In data=writeback mode, ext4 does not journal data at all. This mode provides + a similar level of journaling as that of XFS, JFS, and ReiserFS in its default + mode - metadata journaling. A crash+recovery can cause incorrect data to + appear in files which were written shortly before the crash. This mode will + typically provide the best ext4 performance. * ordered mode -In data=ordered mode, ext4 only officially journals metadata, but it logically -groups metadata information related to data changes with the data blocks into a -single unit called a transaction. When it's time to write the new metadata -out to disk, the associated data blocks are written first. In general, -this mode performs slightly slower than writeback but significantly faster than journal mode. + + In data=ordered mode, ext4 only officially journals metadata, but it logically + groups metadata information related to data changes with the data blocks into + a single unit called a transaction. When it's time to write the new metadata + out to disk, the associated data blocks are written first. In general, this + mode performs slightly slower than writeback but significantly faster than + journal mode. * journal mode -data=journal mode provides full data and metadata journaling. All new data is -written to the journal first, and then to its final location. -In the event of a crash, the journal can be replayed, bringing both data and -metadata into a consistent state. This mode is the slowest except when data -needs to be read from and written to disk at the same time where it -outperforms all others modes. Enabling this mode will disable delayed -allocation and O_DIRECT support. + + data=journal mode provides full data and metadata journaling. All new data is + written to the journal first, and then to its final location. In the event of + a crash, the journal can be replayed, bringing both data and metadata into a + consistent state. This mode is the slowest except when data needs to be read + from and written to disk at the same time where it outperforms all others + modes. Enabling this mode will disable delayed allocation and O_DIRECT + support. /proc entries ============= @@ -425,10 +406,12 @@ Information about mounted ext4 file systems can be found in in table below. Files in /proc/fs/ext4/<devname> -.............................................................................. + +================ ======= File Content +================ ======= mb_groups details of multiblock allocator buddy cache of free blocks -.............................................................................. +================ ======= /sys entries ============ @@ -439,28 +422,30 @@ Information about mounted ext4 file systems can be found in /sys/fs/ext4/dm-0). The files in each per-device directory are shown in table below. -Files in /sys/fs/ext4/<devname> +Files in /sys/fs/ext4/<devname>: + (see also Documentation/ABI/testing/sysfs-fs-ext4) -.............................................................................. - File Content +============================= ================================================= +File Content +============================= ================================================= delayed_allocation_blocks This file is read-only and shows the number of blocks that are dirty in the page cache, but which do not have their location in the filesystem allocated yet. - inode_goal Tuning parameter which (if non-zero) controls +inode_goal Tuning parameter which (if non-zero) controls the goal inode used by the inode allocator in preference to all other allocation heuristics. This is intended for debugging use only, and should be 0 on production systems. - inode_readahead_blks Tuning parameter which controls the maximum +inode_readahead_blks Tuning parameter which controls the maximum number of inode table blocks that ext4's inode table readahead algorithm will pre-read into the buffer cache - lifetime_write_kbytes This file is read-only and shows the number of +lifetime_write_kbytes This file is read-only and shows the number of kilobytes of data that have been written to this filesystem since it was created. @@ -508,7 +493,7 @@ Files in /sys/fs/ext4/<devname> in the file system. If there is not enough space for the reserved space when mounting the file mount will _not_ fail. -.............................................................................. +============================= ================================================= Ioctls ====== @@ -518,8 +503,10 @@ through the system call interfaces. The list of all Ext4 specific ioctls are shown in the table below. Table of Ext4 specific ioctls -.............................................................................. - Ioctl Description + +============================= ================================================= +Ioctl Description +============================= ================================================= EXT4_IOC_GETFLAGS Get additional attributes associated with inode. The ioctl argument is an integer bitfield, with bit values described in ext4.h. This ioctl is an @@ -610,8 +597,7 @@ Table of Ext4 specific ioctls normal user by accident. The data blocks of the previous boot loader will be associated with the given inode. - -.............................................................................. +============================= ================================================= References ========== diff --git a/Documentation/filesystems/ext4/index.rst b/Documentation/filesystems/ext4/index.rst new file mode 100644 index 000000000000..71121605558c --- /dev/null +++ b/Documentation/filesystems/ext4/index.rst @@ -0,0 +1,17 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=============== +ext4 Filesystem +=============== + +General usage and on-disk artifacts writen by ext4. More documentation may +be ported from the wiki as time permits. This should be considered the +canonical source of information as the details here have been reviewed by +the ext4 community. + +.. toctree:: + :maxdepth: 5 + :numbered: + + ext4 + ondisk/index diff --git a/Documentation/filesystems/ext4/ondisk/about.rst b/Documentation/filesystems/ext4/ondisk/about.rst new file mode 100644 index 000000000000..0aadba052264 --- /dev/null +++ b/Documentation/filesystems/ext4/ondisk/about.rst @@ -0,0 +1,44 @@ +.. SPDX-License-Identifier: GPL-2.0 + +About this Book +=============== + +This document attempts to describe the on-disk format for ext4 +filesystems. The same general ideas should apply to ext2/3 filesystems +as well, though they do not support all the features that ext4 supports, +and the fields will be shorter. + +**NOTE**: This is a work in progress, based on notes that the author +(djwong) made while picking apart a filesystem by hand. The data +structure definitions should be current as of Linux 4.18 and +e2fsprogs-1.44. All comments and corrections are welcome, since there is +undoubtedly plenty of lore that might not be reflected in freshly +created demonstration filesystems. + +License +------- +This book is licensed under the terms of the GNU Public License, v2. + +Terminology +----------- + +ext4 divides a storage device into an array of logical blocks both to +reduce bookkeeping overhead and to increase throughput by forcing larger +transfer sizes. Generally, the block size will be 4KiB (the same size as +pages on x86 and the block layer's default block size), though the +actual size is calculated as 2 ^ (10 + ``sb.s_log_block_size``) bytes. +Throughout this document, disk locations are given in terms of these +logical blocks, not raw LBAs, and not 1024-byte blocks. For the sake of +convenience, the logical block size will be referred to as +``$block_size`` throughout the rest of the document. + +When referenced in ``preformatted text`` blocks, ``sb`` refers to fields +in the super block, and ``inode`` refers to fields in an inode table +entry. + +Other References +---------------- + +Also see http://www.nongnu.org/ext2-doc/ for quite a collection of +information about ext2/3. Here's another old reference: +http://wiki.osdev.org/Ext2 diff --git a/Documentation/filesystems/ext4/ondisk/allocators.rst b/Documentation/filesystems/ext4/ondisk/allocators.rst new file mode 100644 index 000000000000..7aa85152ace3 --- /dev/null +++ b/Documentation/filesystems/ext4/ondisk/allocators.rst @@ -0,0 +1,56 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Block and Inode Allocation Policy +--------------------------------- + +ext4 recognizes (better than ext3, anyway) that data locality is +generally a desirably quality of a filesystem. On a spinning disk, +keeping related blocks near each other reduces the amount of movement +that the head actuator and disk must perform to access a data block, +thus speeding up disk IO. On an SSD there of course are no moving parts, +but locality can increase the size of each transfer request while +reducing the total number of requests. This locality may also have the +effect of concentrating writes on a single erase block, which can speed +up file rewrites significantly. Therefore, it is useful to reduce +fragmentation whenever possible. + +The first tool that ext4 uses to combat fragmentation is the multi-block +allocator. When a file is first created, the block allocator +speculatively allocates 8KiB of disk space to the file on the assumption +that the space will get written soon. When the file is closed, the +unused speculative allocations are of course freed, but if the +speculation is correct (typically the case for full writes of small +files) then the file data gets written out in a single multi-block +extent. A second related trick that ext4 uses is delayed allocation. +Under this scheme, when a file needs more blocks to absorb file writes, +the filesystem defers deciding the exact placement on the disk until all +the dirty buffers are being written out to disk. By not committing to a +particular placement until it's absolutely necessary (the commit timeout +is hit, or sync() is called, or the kernel runs out of memory), the hope +is that the filesystem can make better location decisions. + +The third trick that ext4 (and ext3) uses is that it tries to keep a +file's data blocks in the same block group as its inode. This cuts down +on the seek penalty when the filesystem first has to read a file's inode +to learn where the file's data blocks live and then seek over to the +file's data blocks to begin I/O operations. + +The fourth trick is that all the inodes in a directory are placed in the +same block group as the directory, when feasible. The working assumption +here is that all the files in a directory might be related, therefore it +is useful to try to keep them all together. + +The fifth trick is that the disk volume is cut up into 128MB block +groups; these mini-containers are used as outlined above to try to +maintain data locality. However, there is a deliberate quirk -- when a +directory is created in the root directory, the inode allocator scans +the block groups and puts that directory into the least heavily loaded +block group that it can find. This encourages directories to spread out +over a disk; as the top-level directory/file blobs fill up one block +group, the allocators simply move on to the next block group. Allegedly +this scheme evens out the loading on the block groups, though the author +suspects that the directories which are so unlucky as to land towards +the end of a spinning drive get a raw deal performance-wise. + +Of course if all of these mechanisms fail, one can always use e4defrag +to defragment files. diff --git a/Documentation/filesystems/ext4/ondisk/attributes.rst b/Documentation/filesystems/ext4/ondisk/attributes.rst new file mode 100644 index 000000000000..0b01b67b81fe --- /dev/null +++ b/Documentation/filesystems/ext4/ondisk/attributes.rst @@ -0,0 +1,191 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Extended Attributes +------------------- + +Extended attributes (xattrs) are typically stored in a separate data +block on the disk and referenced from inodes via ``inode.i_file_acl*``. +The first use of extended attributes seems to have been for storing file +ACLs and other security data (selinux). With the ``user_xattr`` mount +option it is possible for users to store extended attributes so long as +all attribute names begin with “user”; this restriction seems to have +disappeared as of Linux 3.0. + +There are two places where extended attributes can be found. The first +place is between the end of each inode entry and the beginning of the +next inode entry. For example, if inode.i\_extra\_isize = 28 and +sb.inode\_size = 256, then there are 256 - (128 + 28) = 100 bytes +available for in-inode extended attribute storage. The second place +where extended attributes can be found is in the block pointed to by +``inode.i_file_acl``. As of Linux 3.11, it is not possible for this +block to contain a pointer to a second extended attribute block (or even +the remaining blocks of a cluster). In theory it is possible for each +attribute's value to be stored in a separate data block, though as of +Linux 3.11 the code does not permit this. + +Keys are generally assumed to be ASCIIZ strings, whereas values can be +strings or binary data. + +Extended attributes, when stored after the inode, have a header +``ext4_xattr_ibody_header`` that is 4 bytes long: + +.. list-table:: + :widths: 1 1 1 77 + :header-rows: 1 + + * - Offset + - Type + - Name + - Description + * - 0x0 + - \_\_le32 + - h\_magic + - Magic number for identification, 0xEA020000. This value is set by the + Linux driver, though e2fsprogs doesn't seem to check it(?) + +The beginning of an extended attribute block is in +``struct ext4_xattr_header``, which is 32 bytes long: + +.. list-table:: + :widths: 1 1 1 77 + :header-rows: 1 + + * - Offset + - Type + - Name + - Description + * - 0x0 + - \_\_le32 + - h\_magic + - Magic number for identification, 0xEA020000. + * - 0x4 + - \_\_le32 + - h\_refcount + - Reference count. + * - 0x8 + - \_\_le32 + - h\_blocks + - Number of disk blocks used. + * - 0xC + - \_\_le32 + - h\_hash + - Hash value of all attributes. + * - 0x10 + - \_\_le32 + - h\_checksum + - Checksum of the extended attribute block. + * - 0x14 + - \_\_u32 + - h\_reserved[2] + - Zero. + +The checksum is calculated against the FS UUID, the 64-bit block number +of the extended attribute block, and the entire block (header + +entries). + +Following the ``struct ext4_xattr_header`` or +``struct ext4_xattr_ibody_header`` is an array of +``struct ext4_xattr_entry``; each of these entries is at least 16 bytes +long. When stored in an external block, the ``struct ext4_xattr_entry`` +entries must be stored in sorted order. The sort order is +``e_name_index``, then ``e_name_len``, and finally ``e_name``. +Attributes stored inside an inode do not need be stored in sorted order. + +.. list-table:: + :widths: 1 1 1 77 + :header-rows: 1 + + * - Offset + - Type + - Name + - Description + * - 0x0 + - \_\_u8 + - e\_name\_len + - Length of name. + * - 0x1 + - \_\_u8 + - e\_name\_index + - Attribute name index. There is a discussion of this below. + * - 0x2 + - \_\_le16 + - e\_value\_offs + - Location of this attribute's value on the disk block where it is stored. + Multiple attributes can share the same value. For an inode attribute + this value is relative to the start of the first entry; for a block this + value is relative to the start of the block (i.e. the header). + * - 0x4 + - \_\_le32 + - e\_value\_inum + - The inode where the value is stored. Zero indicates the value is in the + same block as this entry. This field is only used if the + INCOMPAT\_EA\_INODE feature is enabled. + * - 0x8 + - \_\_le32 + - e\_value\_size + - Length of attribute value. + * - 0xC + - \_\_le32 + - e\_hash + - Hash value of attribute name and attribute value. The kernel doesn't + update the hash for in-inode attributes, so for that case this value + must be zero, because e2fsck validates any non-zero hash regardless of + where the xattr lives. + * - 0x10 + - char + - e\_name[e\_name\_len] + - Attribute name. Does not include trailing NULL. + +Attribute values can follow the end of the entry table. There appears to +be a requirement that they be aligned to 4-byte boundaries. The values +are stored starting at the end of the block and grow towards the +xattr\_header/xattr\_entry table. When the two collide, the overflow is +put into a separate disk block. If the disk block fills up, the +filesystem returns -ENOSPC. + +The first four fields of the ``ext4_xattr_entry`` are set to zero to +mark the end of the key list. + +Attribute Name Indices +~~~~~~~~~~~~~~~~~~~~~~ + +Logically speaking, extended attributes are a series of key=value pairs. +The keys are assumed to be NULL-terminated strings. To reduce the amount +of on-disk space that the keys consume, the beginning of the key string +is matched against the attribute name index. If a match is found, the +attribute name index field is set, and matching string is removed from +the key name. Here is a map of name index values to key prefixes: + +.. list-table:: + :widths: 1 79 + :header-rows: 1 + + * - Name Index + - Key Prefix + * - 0 + - (no prefix) + * - 1 + - “user.” + * - 2 + - “system.posix\_acl\_access” + * - 3 + - “system.posix\_acl\_default” + * - 4 + - “trusted.” + * - 6 + - “security.” + * - 7 + - “system.” (inline\_data only?) + * - 8 + - “system.richacl” (SuSE kernels only?) + +For example, if the attribute key is “user.fubar”, the attribute name +index is set to 1 and the “fubar” name is recorded on disk. + +POSIX ACLs +~~~~~~~~~~ + +POSIX ACLs are stored in a reduced version of the Linux kernel (and +libacl's) internal ACL format. The key difference is that the version +number is different (1) and the ``e_id`` field is only stored for named +user and group ACLs. diff --git a/Documentation/filesystems/ext4/ondisk/bigalloc.rst b/Documentation/filesystems/ext4/ondisk/bigalloc.rst new file mode 100644 index 000000000000..c6d88557553c --- /dev/null +++ b/Documentation/filesystems/ext4/ondisk/bigalloc.rst @@ -0,0 +1,22 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Bigalloc +-------- + +At the moment, the default size of a block is 4KiB, which is a commonly +supported page size on most MMU-capable hardware. This is fortunate, as +ext4 code is not prepared to handle the case where the block size +exceeds the page size. However, for a filesystem of mostly huge files, +it is desirable to be able to allocate disk blocks in units of multiple +blocks to reduce both fragmentation and metadata overhead. The +`bigalloc <Bigalloc>`__ feature provides exactly this ability. The +administrator can set a block cluster size at mkfs time (which is stored +in the s\_log\_cluster\_size field in the superblock); from then on, the +block bitmaps track clusters, not individual blocks. This means that +block groups can be several gigabytes in size (instead of just 128MiB); +however, the minimum allocation unit becomes a cluster, not a block, +even for directories. TaoBao had a patchset to extend the “use units of +clusters instead of blocks” to the extent tree, though it is not clear +where those patches went-- they eventually morphed into “extent tree v2” +but that code has not landed as of May 2015. + diff --git a/Documentation/filesystems/ext4/ondisk/bitmaps.rst b/Documentation/filesystems/ext4/ondisk/bitmaps.rst new file mode 100644 index 000000000000..c7546dbc197a --- /dev/null +++ b/Documentation/filesystems/ext4/ondisk/bitmaps.rst @@ -0,0 +1,28 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Block and inode Bitmaps +----------------------- + +The data block bitmap tracks the usage of data blocks within the block +group. + +The inode bitmap records which entries in the inode table are in use. + +As with most bitmaps, one bit represents the usage status of one data +block or inode table entry. This implies a block group size of 8 \* +number\_of\_bytes\_in\_a\_logical\_block. + +NOTE: If ``BLOCK_UNINIT`` is set for a given block group, various parts +of the kernel and e2fsprogs code pretends that the block bitmap contains +zeros (i.e. all blocks in the group are free). However, it is not +necessarily the case that no blocks are in use -- if ``meta_bg`` is set, +the bitmaps and group descriptor live inside the group. Unfortunately, +ext2fs\_test\_block\_bitmap2() will return '0' for those locations, +which produces confusing debugfs output. + +Inode Table +----------- +Inode tables are statically allocated at mkfs time. Each block group +descriptor points to the start of the table, and the superblock records +the number of inodes per group. See the section on inodes for more +information. diff --git a/Documentation/filesystems/ext4/ondisk/blockgroup.rst b/Documentation/filesystems/ext4/ondisk/blockgroup.rst new file mode 100644 index 000000000000..baf888e4c06a --- /dev/null +++ b/Documentation/filesystems/ext4/ondisk/blockgroup.rst @@ -0,0 +1,135 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Layout +------ + +The layout of a standard block group is approximately as follows (each +of these fields is discussed in a separate section below): + +.. list-table:: + :widths: 1 1 1 1 1 1 1 1 + :header-rows: 1 + + * - Group 0 Padding + - ext4 Super Block + - Group Descriptors + - Reserved GDT Blocks + - Data Block Bitmap + - inode Bitmap + - inode Table + - Data Blocks + * - 1024 bytes + - 1 block + - many blocks + - many blocks + - 1 block + - 1 block + - many blocks + - many more blocks + +For the special case of block group 0, the first 1024 bytes are unused, +to allow for the installation of x86 boot sectors and other oddities. +The superblock will start at offset 1024 bytes, whichever block that +happens to be (usually 0). However, if for some reason the block size = +1024, then block 0 is marked in use and the superblock goes in block 1. +For all other block groups, there is no padding. + +The ext4 driver primarily works with the superblock and the group +descriptors that are found in block group 0. Redundant copies of the +superblock and group descriptors are written to some of the block groups +across the disk in case the beginning of the disk gets trashed, though +not all block groups necessarily host a redundant copy (see following +paragraph for more details). If the group does not have a redundant +copy, the block group begins with the data block bitmap. Note also that +when the filesystem is freshly formatted, mkfs will allocate “reserve +GDT block” space after the block group descriptors and before the start +of the block bitmaps to allow for future expansion of the filesystem. By +default, a filesystem is allowed to increase in size by a factor of +1024x over the original filesystem size. + +The location of the inode table is given by ``grp.bg_inode_table_*``. It +is continuous range of blocks large enough to contain +``sb.s_inodes_per_group * sb.s_inode_size`` bytes. + +As for the ordering of items in a block group, it is generally +established that the super block and the group descriptor table, if +present, will be at the beginning of the block group. The bitmaps and +the inode table can be anywhere, and it is quite possible for the +bitmaps to come after the inode table, or for both to be in different +groups (flex\_bg). Leftover space is used for file data blocks, indirect +block maps, extent tree blocks, and extended attributes. + +Flexible Block Groups +--------------------- + +Starting in ext4, there is a new feature called flexible block groups +(flex\_bg). In a flex\_bg, several block groups are tied together as one +logical block group; the bitmap spaces and the inode table space in the +first block group of the flex\_bg are expanded to include the bitmaps +and inode tables of all other block groups in the flex\_bg. For example, +if the flex\_bg size is 4, then group 0 will contain (in order) the +superblock, group descriptors, data block bitmaps for groups 0-3, inode +bitmaps for groups 0-3, inode tables for groups 0-3, and the remaining +space in group 0 is for file data. The effect of this is to group the +block metadata close together for faster loading, and to enable large +files to be continuous on disk. Backup copies of the superblock and +group descriptors are always at the beginning of block groups, even if +flex\_bg is enabled. The number of block groups that make up a flex\_bg +is given by 2 ^ ``sb.s_log_groups_per_flex``. + +Meta Block Groups +----------------- + +Without the option META\_BG, for safety concerns, all block group +descriptors copies are kept in the first block group. Given the default +128MiB(2^27 bytes) block group size and 64-byte group descriptors, ext4 +can have at most 2^27/64 = 2^21 block groups. This limits the entire +filesystem size to 2^21 ∗ 2^27 = 2^48bytes or 256TiB. + +The solution to this problem is to use the metablock group feature +(META\_BG), which is already in ext3 for all 2.6 releases. With the +META\_BG feature, ext4 filesystems are partitioned into many metablock +groups. Each metablock group is a cluster of block groups whose group +descriptor structures can be stored in a single disk block. For ext4 +filesystems with 4 KB block size, a single metablock group partition +includes 64 block groups, or 8 GiB of disk space. The metablock group +feature moves the location of the group descriptors from the congested +first block group of the whole filesystem into the first group of each +metablock group itself. The backups are in the second and last group of +each metablock group. This increases the 2^21 maximum block groups limit +to the hard limit 2^32, allowing support for a 512PiB filesystem. + +The change in the filesystem format replaces the current scheme where +the superblock is followed by a variable-length set of block group +descriptors. Instead, the superblock and a single block group descriptor +block is placed at the beginning of the first, second, and last block +groups in a meta-block group. A meta-block group is a collection of +block groups which can be described by a single block group descriptor +block. Since the size of the block group descriptor structure is 32 +bytes, a meta-block group contains 32 block groups for filesystems with +a 1KB block size, and 128 block groups for filesystems with a 4KB +blocksize. Filesystems can either be created using this new block group +descriptor layout, or existing filesystems can be resized on-line, and +the field s\_first\_meta\_bg in the superblock will indicate the first +block group using this new layout. + +Please see an important note about ``BLOCK_UNINIT`` in the section about +block and inode bitmaps. + +Lazy Block Group Initialization +------------------------------- + +A new feature for ext4 are three block group descriptor flags that +enable mkfs to skip initializing other parts of the block group +metadata. Specifically, the INODE\_UNINIT and BLOCK\_UNINIT flags mean +that the inode and block bitmaps for that group can be calculated and +therefore the on-disk bitmap blocks are not initialized. This is +generally the case for an empty block group or a block group containing +only fixed-location block group metadata. The INODE\_ZEROED flag means +that the inode table has been initialized; mkfs will unset this flag and +rely on the kernel to initialize the inode tables in the background. + +By not writing zeroes to the bitmaps and inode table, mkfs time is +reduced considerably. Note the feature flag is RO\_COMPAT\_GDT\_CSUM, +but the dumpe2fs output prints this as “uninit\_bg”. They are the same +thing. diff --git a/Documentation/filesystems/ext4/ondisk/blockmap.rst b/Documentation/filesystems/ext4/ondisk/blockmap.rst new file mode 100644 index 000000000000..30e25750d88a --- /dev/null +++ b/Documentation/filesystems/ext4/ondisk/blockmap.rst @@ -0,0 +1,49 @@ +.. SPDX-License-Identifier: GPL-2.0 + ++---------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| i.i\_block Offset | Where It Points | ++=====================+==============================================================================================================================================================================================================================+ +| 0 to 11 | Direct map to file blocks 0 to 11. | ++---------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| 12 | Indirect block: (file blocks 12 to (``$block_size`` / 4) + 11, or 12 to 1035 if 4KiB blocks) | +| | | +| | +------------------------------+--------------------------------------------------------------------+ | +| | | Indirect Block Offset | Where It Points | | +| | +==============================+====================================================================+ | +| | | 0 to (``$block_size`` / 4) | Direct map to (``$block_size`` / 4) blocks (1024 if 4KiB blocks) | | +| | +------------------------------+--------------------------------------------------------------------+ | ++---------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| 13 | Double-indirect block: (file blocks ``$block_size``/4 + 12 to (``$block_size`` / 4) ^ 2 + (``$block_size`` / 4) + 11, or 1036 to 1049611 if 4KiB blocks) | +| | | +| | +--------------------------------+---------------------------------------------------------------------------------------------------------+ | +| | | Double Indirect Block Offset | Where It Points | | +| | +================================+=========================================================================================================+ | +| | | 0 to (``$block_size`` / 4) | Map to (``$block_size`` / 4) indirect blocks (1024 if 4KiB blocks) | | +| | | | | | +| | | | +------------------------------+--------------------------------------------------------------------+ | | +| | | | | Indirect Block Offset | Where It Points | | | +| | | | +==============================+====================================================================+ | | +| | | | | 0 to (``$block_size`` / 4) | Direct map to (``$block_size`` / 4) blocks (1024 if 4KiB blocks) | | | +| | | | +------------------------------+--------------------------------------------------------------------+ | | +| | +--------------------------------+---------------------------------------------------------------------------------------------------------+ | ++---------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| 14 | Triple-indirect block: (file blocks (``$block_size`` / 4) ^ 2 + (``$block_size`` / 4) + 12 to (``$block_size`` / 4) ^ 3 + (``$block_size`` / 4) ^ 2 + (``$block_size`` / 4) + 12, or 1049612 to 1074791436 if 4KiB blocks) | +| | | +| | +--------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------+ | +| | | Triple Indirect Block Offset | Where It Points | | +| | +================================+================================================================================================================================================+ | +| | | 0 to (``$block_size`` / 4) | Map to (``$block_size`` / 4) double indirect blocks (1024 if 4KiB blocks) | | +| | | | | | +| | | | +--------------------------------+---------------------------------------------------------------------------------------------------------+ | | +| | | | | Double Indirect Block Offset | Where It Points | | | +| | | | +================================+=========================================================================================================+ | | +| | | | | 0 to (``$block_size`` / 4) | Map to (``$block_size`` / 4) indirect blocks (1024 if 4KiB blocks) | | | +| | | | | | | | | +| | | | | | +------------------------------+--------------------------------------------------------------------+ | | | +| | | | | | | Indirect Block Offset | Where It Points | | | | +| | | | | | +==============================+====================================================================+ | | | +| | | | | | | 0 to (``$block_size`` / 4) | Direct map to (``$block_size`` / 4) blocks (1024 if 4KiB blocks) | | | | +| | | | | | +------------------------------+--------------------------------------------------------------------+ | | | +| | | | +--------------------------------+---------------------------------------------------------------------------------------------------------+ | | +| | +--------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------+ | ++---------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ diff --git a/Documentation/filesystems/ext4/ondisk/blocks.rst b/Documentation/filesystems/ext4/ondisk/blocks.rst new file mode 100644 index 000000000000..73d4dc0f7bda --- /dev/null +++ b/Documentation/filesystems/ext4/ondisk/blocks.rst @@ -0,0 +1,142 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Blocks +------ + +ext4 allocates storage space in units of “blocks”. A block is a group of +sectors between 1KiB and 64KiB, and the number of sectors must be an +integral power of 2. Blocks are in turn grouped into larger units called +block groups. Block size is specified at mkfs time and typically is +4KiB. You may experience mounting problems if block size is greater than +page size (i.e. 64KiB blocks on a i386 which only has 4KiB memory +pages). By default a filesystem can contain 2^32 blocks; if the '64bit' +feature is enabled, then a filesystem can have 2^64 blocks. + +For 32-bit filesystems, limits are as follows: + +.. list-table:: + :widths: 1 1 1 1 1 + :header-rows: 1 + + * - Item + - 1KiB + - 2KiB + - 4KiB + - 64KiB + * - Blocks + - 2^32 + - 2^32 + - 2^32 + - 2^32 + * - Inodes + - 2^32 + - 2^32 + - 2^32 + - 2^32 + * - File System Size + - 4TiB + - 8TiB + - 16TiB + - 256PiB + * - Blocks Per Block Group + - 8,192 + - 16,384 + - 32,768 + - 524,288 + * - Inodes Per Block Group + - 8,192 + - 16,384 + - 32,768 + - 524,288 + * - Block Group Size + - 8MiB + - 32MiB + - 128MiB + - 32GiB + * - Blocks Per File, Extents + - 2^32 + - 2^32 + - 2^32 + - 2^32 + * - Blocks Per File, Block Maps + - 16,843,020 + - 134,480,396 + - 1,074,791,436 + - 4,398,314,962,956 (really 2^32 due to field size limitations) + * - File Size, Extents + - 4TiB + - 8TiB + - 16TiB + - 256TiB + * - File Size, Block Maps + - 16GiB + - 256GiB + - 4TiB + - 256TiB + +For 64-bit filesystems, limits are as follows: + +.. list-table:: + :widths: 1 1 1 1 1 + :header-rows: 1 + + * - Item + - 1KiB + - 2KiB + - 4KiB + - 64KiB + * - Blocks + - 2^64 + - 2^64 + - 2^64 + - 2^64 + * - Inodes + - 2^32 + - 2^32 + - 2^32 + - 2^32 + * - File System Size + - 16ZiB + - 32ZiB + - 64ZiB + - 1YiB + * - Blocks Per Block Group + - 8,192 + - 16,384 + - 32,768 + - 524,288 + * - Inodes Per Block Group + - 8,192 + - 16,384 + - 32,768 + - 524,288 + * - Block Group Size + - 8MiB + - 32MiB + - 128MiB + - 32GiB + * - Blocks Per File, Extents + - 2^32 + - 2^32 + - 2^32 + - 2^32 + * - Blocks Per File, Block Maps + - 16,843,020 + - 134,480,396 + - 1,074,791,436 + - 4,398,314,962,956 (really 2^32 due to field size limitations) + * - File Size, Extents + - 4TiB + - 8TiB + - 16TiB + - 256TiB + * - File Size, Block Maps + - 16GiB + - 256GiB + - 4TiB + - 256TiB + +Note: Files not using extents (i.e. files using block maps) must be +placed within the first 2^32 blocks of a filesystem. Files with extents +must be placed within the first 2^48 blocks of a filesystem. It's not +clear what happens with larger filesystems. diff --git a/Documentation/filesystems/ext4/ondisk/checksums.rst b/Documentation/filesystems/ext4/ondisk/checksums.rst new file mode 100644 index 000000000000..9d6a793b2e03 --- /dev/null +++ b/Documentation/filesystems/ext4/ondisk/checksums.rst @@ -0,0 +1,73 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Checksums +--------- + +Starting in early 2012, metadata checksums were added to all major ext4 +and jbd2 data structures. The associated feature flag is metadata\_csum. +The desired checksum algorithm is indicated in the superblock, though as +of October 2012 the only supported algorithm is crc32c. Some data +structures did not have space to fit a full 32-bit checksum, so only the +lower 16 bits are stored. Enabling the 64bit feature increases the data +structure size so that full 32-bit checksums can be stored for many data +structures. However, existing 32-bit filesystems cannot be extended to +enable 64bit mode, at least not without the experimental resize2fs +patches to do so. + +Existing filesystems can have checksumming added by running +``tune2fs -O metadata_csum`` against the underlying device. If tune2fs +encounters directory blocks that lack sufficient empty space to add a +checksum, it will request that you run ``e2fsck -D`` to have the +directories rebuilt with checksums. This has the added benefit of +removing slack space from the directory files and rebalancing the htree +indexes. If you \_ignore\_ this step, your directories will not be +protected by a checksum! + +The following table describes the data elements that go into each type +of checksum. The checksum function is whatever the superblock describes +(crc32c as of October 2013) unless noted otherwise. + +.. list-table:: + :widths: 1 1 4 + :header-rows: 1 + + * - Metadata + - Length + - Ingredients + * - Superblock + - \_\_le32 + - The entire superblock up to the checksum field. The UUID lives inside + the superblock. + * - MMP + - \_\_le32 + - UUID + the entire MMP block up to the checksum field. + * - Extended Attributes + - \_\_le32 + - UUID + the entire extended attribute block. The checksum field is set to + zero. + * - Directory Entries + - \_\_le32 + - UUID + inode number + inode generation + the directory block up to the + fake entry enclosing the checksum field. + * - HTREE Nodes + - \_\_le32 + - UUID + inode number + inode generation + all valid extents + HTREE tail. + The checksum field is set to zero. + * - Extents + - \_\_le32 + - UUID + inode number + inode generation + the entire extent block up to + the checksum field. + * - Bitmaps + - \_\_le32 or \_\_le16 + - UUID + the entire bitmap. Checksums are stored in the group descriptor, + and truncated if the group descriptor size is 32 bytes (i.e. ^64bit) + * - Inodes + - \_\_le32 + - UUID + inode number + inode generation + the entire inode. The checksum + field is set to zero. Each inode has its own checksum. + * - Group Descriptors + - \_\_le16 + - If metadata\_csum, then UUID + group number + the entire descriptor; + else if gdt\_csum, then crc16(UUID + group number + the entire + descriptor). In all cases, only the lower 16 bits are stored. + diff --git a/Documentation/filesystems/ext4/ondisk/directory.rst b/Documentation/filesystems/ext4/ondisk/directory.rst new file mode 100644 index 000000000000..8fcba68c2884 --- /dev/null +++ b/Documentation/filesystems/ext4/ondisk/directory.rst @@ -0,0 +1,426 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Directory Entries +----------------- + +In an ext4 filesystem, a directory is more or less a flat file that maps +an arbitrary byte string (usually ASCII) to an inode number on the +filesystem. There can be many directory entries across the filesystem +that reference the same inode number--these are known as hard links, and +that is why hard links cannot reference files on other filesystems. As +such, directory entries are found by reading the data block(s) +associated with a directory file for the particular directory entry that +is desired. + +Linear (Classic) Directories +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +By default, each directory lists its entries in an “almost-linear” +array. I write “almost” because it's not a linear array in the memory +sense because directory entries are not split across filesystem blocks. +Therefore, it is more accurate to say that a directory is a series of +data blocks and that each block contains a linear array of directory +entries. The end of each per-block array is signified by reaching the +end of the block; the last entry in the block has a record length that +takes it all the way to the end of the block. The end of the entire +directory is of course signified by reaching the end of the file. Unused +directory entries are signified by inode = 0. By default the filesystem +uses ``struct ext4_dir_entry_2`` for directory entries unless the +“filetype” feature flag is not set, in which case it uses +``struct ext4_dir_entry``. + +The original directory entry format is ``struct ext4_dir_entry``, which +is at most 263 bytes long, though on disk you'll need to reference +``dirent.rec_len`` to know for sure. + +.. list-table:: + :widths: 1 1 1 77 + :header-rows: 1 + + * - Offset + - Size + - Name + - Description + * - 0x0 + - \_\_le32 + - inode + - Number of the inode that this directory entry points to. + * - 0x4 + - \_\_le16 + - rec\_len + - Length of this directory entry. Must be a multiple of 4. + * - 0x6 + - \_\_le16 + - name\_len + - Length of the file name. + * - 0x8 + - char + - name[EXT4\_NAME\_LEN] + - File name. + +Since file names cannot be longer than 255 bytes, the new directory +entry format shortens the rec\_len field and uses the space for a file +type flag, probably to avoid having to load every inode during directory +tree traversal. This format is ``ext4_dir_entry_2``, which is at most +263 bytes long, though on disk you'll need to reference +``dirent.rec_len`` to know for sure. + +.. list-table:: + :widths: 1 1 1 77 + :header-rows: 1 + + * - Offset + - Size + - Name + - Description + * - 0x0 + - \_\_le32 + - inode + - Number of the inode that this directory entry points to. + * - 0x4 + - \_\_le16 + - rec\_len + - Length of this directory entry. + * - 0x6 + - \_\_u8 + - name\_len + - Length of the file name. + * - 0x7 + - \_\_u8 + - file\_type + - File type code, see ftype_ table below. + * - 0x8 + - char + - name[EXT4\_NAME\_LEN] + - File name. + +.. _ftype: + +The directory file type is one of the following values: + +.. list-table:: + :widths: 1 79 + :header-rows: 1 + + * - Value + - Description + * - 0x0 + - Unknown. + * - 0x1 + - Regular file. + * - 0x2 + - Directory. + * - 0x3 + - Character device file. + * - 0x4 + - Block device file. + * - 0x5 + - FIFO. + * - 0x6 + - Socket. + * - 0x7 + - Symbolic link. + +In order to add checksums to these classic directory blocks, a phony +``struct ext4_dir_entry`` is placed at the end of each leaf block to +hold the checksum. The directory entry is 12 bytes long. The inode +number and name\_len fields are set to zero to fool old software into +ignoring an apparently empty directory entry, and the checksum is stored +in the place where the name normally goes. The structure is +``struct ext4_dir_entry_tail``: + +.. list-table:: + :widths: 1 1 1 77 + :header-rows: 1 + + * - Offset + - Size + - Name + - Description + * - 0x0 + - \_\_le32 + - det\_reserved\_zero1 + - Inode number, which must be zero. + * - 0x4 + - \_\_le16 + - det\_rec\_len + - Length of this directory entry, which must be 12. + * - 0x6 + - \_\_u8 + - det\_reserved\_zero2 + - Length of the file name, which must be zero. + * - 0x7 + - \_\_u8 + - det\_reserved\_ft + - File type, which must be 0xDE. + * - 0x8 + - \_\_le32 + - det\_checksum + - Directory leaf block checksum. + +The leaf directory block checksum is calculated against the FS UUID, the +directory's inode number, the directory's inode generation number, and +the entire directory entry block up to (but not including) the fake +directory entry. + +Hash Tree Directories +~~~~~~~~~~~~~~~~~~~~~ + +A linear array of directory entries isn't great for performance, so a +new feature was added to ext3 to provide a faster (but peculiar) +balanced tree keyed off a hash of the directory entry name. If the +EXT4\_INDEX\_FL (0x1000) flag is set in the inode, this directory uses a +hashed btree (htree) to organize and find directory entries. For +backwards read-only compatibility with ext2, this tree is actually +hidden inside the directory file, masquerading as “empty” directory data +blocks! It was stated previously that the end of the linear directory +entry table was signified with an entry pointing to inode 0; this is +(ab)used to fool the old linear-scan algorithm into thinking that the +rest of the directory block is empty so that it moves on. + +The root of the tree always lives in the first data block of the +directory. By ext2 custom, the '.' and '..' entries must appear at the +beginning of this first block, so they are put here as two +``struct ext4_dir_entry_2``\ s and not stored in the tree. The rest of +the root node contains metadata about the tree and finally a hash->block +map to find nodes that are lower in the htree. If +``dx_root.info.indirect_levels`` is non-zero then the htree has two +levels; the data block pointed to by the root node's map is an interior +node, which is indexed by a minor hash. Interior nodes in this tree +contains a zeroed out ``struct ext4_dir_entry_2`` followed by a +minor\_hash->block map to find leafe nodes. Leaf nodes contain a linear +array of all ``struct ext4_dir_entry_2``; all of these entries +(presumably) hash to the same value. If there is an overflow, the +entries simply overflow into the next leaf node, and the +least-significant bit of the hash (in the interior node map) that gets +us to this next leaf node is set. + +To traverse the directory as a htree, the code calculates the hash of +the desired file name and uses it to find the corresponding block +number. If the tree is flat, the block is a linear array of directory +entries that can be searched; otherwise, the minor hash of the file name +is computed and used against this second block to find the corresponding +third block number. That third block number will be a linear array of +directory entries. + +To traverse the directory as a linear array (such as the old code does), +the code simply reads every data block in the directory. The blocks used +for the htree will appear to have no entries (aside from '.' and '..') +and so only the leaf nodes will appear to have any interesting content. + +The root of the htree is in ``struct dx_root``, which is the full length +of a data block: + +.. list-table:: + :widths: 1 1 1 77 + :header-rows: 1 + + * - Offset + - Type + - Name + - Description + * - 0x0 + - \_\_le32 + - dot.inode + - inode number of this directory. + * - 0x4 + - \_\_le16 + - dot.rec\_len + - Length of this record, 12. + * - 0x6 + - u8 + - dot.name\_len + - Length of the name, 1. + * - 0x7 + - u8 + - dot.file\_type + - File type of this entry, 0x2 (directory) (if the feature flag is set). + * - 0x8 + - char + - dot.name[4] + - “.\\0\\0\\0” + * - 0xC + - \_\_le32 + - dotdot.inode + - inode number of parent directory. + * - 0x10 + - \_\_le16 + - dotdot.rec\_len + - block\_size - 12. The record length is long enough to cover all htree + data. + * - 0x12 + - u8 + - dotdot.name\_len + - Length of the name, 2. + * - 0x13 + - u8 + - dotdot.file\_type + - File type of this entry, 0x2 (directory) (if the feature flag is set). + * - 0x14 + - char + - dotdot\_name[4] + - “..\\0\\0” + * - 0x18 + - \_\_le32 + - struct dx\_root\_info.reserved\_zero + - Zero. + * - 0x1C + - u8 + - struct dx\_root\_info.hash\_version + - Hash type, see dirhash_ table below. + * - 0x1D + - u8 + - struct dx\_root\_info.info\_length + - Length of the tree information, 0x8. + * - 0x1E + - u8 + - struct dx\_root\_info.indirect\_levels + - Depth of the htree. Cannot be larger than 3 if the INCOMPAT\_LARGEDIR + feature is set; cannot be larger than 2 otherwise. + * - 0x1F + - u8 + - struct dx\_root\_info.unused\_flags + - + * - 0x20 + - \_\_le16 + - limit + - Maximum number of dx\_entries that can follow this header, plus 1 for + the header itself. + * - 0x22 + - \_\_le16 + - count + - Actual number of dx\_entries that follow this header, plus 1 for the + header itself. + * - 0x24 + - \_\_le32 + - block + - The block number (within the directory file) that goes with hash=0. + * - 0x28 + - struct dx\_entry + - entries[0] + - As many 8-byte ``struct dx_entry`` as fits in the rest of the data block. + +.. _dirhash: + +The directory hash is one of the following values: + +.. list-table:: + :widths: 1 79 + :header-rows: 1 + + * - Value + - Description + * - 0x0 + - Legacy. + * - 0x1 + - Half MD4. + * - 0x2 + - Tea. + * - 0x3 + - Legacy, unsigned. + * - 0x4 + - Half MD4, unsigned. + * - 0x5 + - Tea, unsigned. + +Interior nodes of an htree are recorded as ``struct dx_node``, which is +also the full length of a data block: + +.. list-table:: + :widths: 1 1 1 77 + :header-rows: 1 + + * - Offset + - Type + - Name + - Description + * - 0x0 + - \_\_le32 + - fake.inode + - Zero, to make it look like this entry is not in use. + * - 0x4 + - \_\_le16 + - fake.rec\_len + - The size of the block, in order to hide all of the dx\_node data. + * - 0x6 + - u8 + - name\_len + - Zero. There is no name for this “unused” directory entry. + * - 0x7 + - u8 + - file\_type + - Zero. There is no file type for this “unused” directory entry. + * - 0x8 + - \_\_le16 + - limit + - Maximum number of dx\_entries that can follow this header, plus 1 for + the header itself. + * - 0xA + - \_\_le16 + - count + - Actual number of dx\_entries that follow this header, plus 1 for the + header itself. + * - 0xE + - \_\_le32 + - block + - The block number (within the directory file) that goes with the lowest + hash value of this block. This value is stored in the parent block. + * - 0x12 + - struct dx\_entry + - entries[0] + - As many 8-byte ``struct dx_entry`` as fits in the rest of the data block. + +The hash maps that exist in both ``struct dx_root`` and +``struct dx_node`` are recorded as ``struct dx_entry``, which is 8 bytes +long: + +.. list-table:: + :widths: 1 1 1 77 + :header-rows: 1 + + * - Offset + - Type + - Name + - Description + * - 0x0 + - \_\_le32 + - hash + - Hash code. + * - 0x4 + - \_\_le32 + - block + - Block number (within the directory file, not filesystem blocks) of the + next node in the htree. + +(If you think this is all quite clever and peculiar, so does the +author.) + +If metadata checksums are enabled, the last 8 bytes of the directory +block (precisely the length of one dx\_entry) are used to store a +``struct dx_tail``, which contains the checksum. The ``limit`` and +``count`` entries in the dx\_root/dx\_node structures are adjusted as +necessary to fit the dx\_tail into the block. If there is no space for +the dx\_tail, the user is notified to run e2fsck -D to rebuild the +directory index (which will ensure that there's space for the checksum. +The dx\_tail structure is 8 bytes long and looks like this: + +.. list-table:: + :widths: 1 1 1 77 + :header-rows: 1 + + * - Offset + - Type + - Name + - Description + * - 0x0 + - u32 + - dt\_reserved + - Zero. + * - 0x4 + - \_\_le32 + - dt\_checksum + - Checksum of the htree directory block. + +The checksum is calculated against the FS UUID, the htree index header +(dx\_root or dx\_node), all of the htree indices (dx\_entry) that are in +use, and the tail block (dx\_tail). diff --git a/Documentation/filesystems/ext4/ondisk/dynamic.rst b/Documentation/filesystems/ext4/ondisk/dynamic.rst new file mode 100644 index 000000000000..bb0c84333341 --- /dev/null +++ b/Documentation/filesystems/ext4/ondisk/dynamic.rst @@ -0,0 +1,12 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Dynamic Structures +================== + +Dynamic metadata are created on the fly when files and blocks are +allocated to files. + +.. include:: inodes.rst +.. include:: ifork.rst +.. include:: directory.rst +.. include:: attributes.rst diff --git a/Documentation/filesystems/ext4/ondisk/eainode.rst b/Documentation/filesystems/ext4/ondisk/eainode.rst new file mode 100644 index 000000000000..ecc0d01a0a72 --- /dev/null +++ b/Documentation/filesystems/ext4/ondisk/eainode.rst @@ -0,0 +1,18 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Large Extended Attribute Values +------------------------------- + +To enable ext4 to store extended attribute values that do not fit in the +inode or in the single extended attribute block attached to an inode, +the EA\_INODE feature allows us to store the value in the data blocks of +a regular file inode. This “EA inode” is linked only from the extended +attribute name index and must not appear in a directory entry. The +inode's i\_atime field is used to store a checksum of the xattr value; +and i\_ctime/i\_version store a 64-bit reference count, which enables +sharing of large xattr values between multiple owning inodes. For +backward compatibility with older versions of this feature, the +i\_mtime/i\_generation *may* store a back-reference to the inode number +and i\_generation of the **one** owning inode (in cases where the EA +inode is not referenced by multiple inodes) to verify that the EA inode +is the correct one being accessed. diff --git a/Documentation/filesystems/ext4/ondisk/globals.rst b/Documentation/filesystems/ext4/ondisk/globals.rst new file mode 100644 index 000000000000..368bf7662b96 --- /dev/null +++ b/Documentation/filesystems/ext4/ondisk/globals.rst @@ -0,0 +1,13 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Global Structures +================= + +The filesystem is sharded into a number of block groups, each of which +have static metadata at fixed locations. + +.. include:: super.rst +.. include:: group_descr.rst +.. include:: bitmaps.rst +.. include:: mmp.rst +.. include:: journal.rst diff --git a/Documentation/filesystems/ext4/ondisk/group_descr.rst b/Documentation/filesystems/ext4/ondisk/group_descr.rst new file mode 100644 index 000000000000..759827e5d2cf --- /dev/null +++ b/Documentation/filesystems/ext4/ondisk/group_descr.rst @@ -0,0 +1,170 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Block Group Descriptors +----------------------- + +Each block group on the filesystem has one of these descriptors +associated with it. As noted in the Layout section above, the group +descriptors (if present) are the second item in the block group. The +standard configuration is for each block group to contain a full copy of +the block group descriptor table unless the sparse\_super feature flag +is set. + +Notice how the group descriptor records the location of both bitmaps and +the inode table (i.e. they can float). This means that within a block +group, the only data structures with fixed locations are the superblock +and the group descriptor table. The flex\_bg mechanism uses this +property to group several block groups into a flex group and lay out all +of the groups' bitmaps and inode tables into one long run in the first +group of the flex group. + +If the meta\_bg feature flag is set, then several block groups are +grouped together into a meta group. Note that in the meta\_bg case, +however, the first and last two block groups within the larger meta +group contain only group descriptors for the groups inside the meta +group. + +flex\_bg and meta\_bg do not appear to be mutually exclusive features. + +In ext2, ext3, and ext4 (when the 64bit feature is not enabled), the +block group descriptor was only 32 bytes long and therefore ends at +bg\_checksum. On an ext4 filesystem with the 64bit feature enabled, the +block group descriptor expands to at least the 64 bytes described below; +the size is stored in the superblock. + +If gdt\_csum is set and metadata\_csum is not set, the block group +checksum is the crc16 of the FS UUID, the group number, and the group +descriptor structure. If metadata\_csum is set, then the block group +checksum is the lower 16 bits of the checksum of the FS UUID, the group +number, and the group descriptor structure. Both block and inode bitmap +checksums are calculated against the FS UUID, the group number, and the +entire bitmap. + +The block group descriptor is laid out in ``struct ext4_group_desc``. + +.. list-table:: + :widths: 1 1 1 77 + :header-rows: 1 + + * - Offset + - Size + - Name + - Description + * - 0x0 + - \_\_le32 + - bg\_block\_bitmap\_lo + - Lower 32-bits of location of block bitmap. + * - 0x4 + - \_\_le32 + - bg\_inode\_bitmap\_lo + - Lower 32-bits of location of inode bitmap. + * - 0x8 + - \_\_le32 + - bg\_inode\_table\_lo + - Lower 32-bits of location of inode table. + * - 0xC + - \_\_le16 + - bg\_free\_blocks\_count\_lo + - Lower 16-bits of free block count. + * - 0xE + - \_\_le16 + - bg\_free\_inodes\_count\_lo + - Lower 16-bits of free inode count. + * - 0x10 + - \_\_le16 + - bg\_used\_dirs\_count\_lo + - Lower 16-bits of directory count. + * - 0x12 + - \_\_le16 + - bg\_flags + - Block group flags. See the bgflags_ table below. + * - 0x14 + - \_\_le32 + - bg\_exclude\_bitmap\_lo + - Lower 32-bits of location of snapshot exclusion bitmap. + * - 0x18 + - \_\_le16 + - bg\_block\_bitmap\_csum\_lo + - Lower 16-bits of the block bitmap checksum. + * - 0x1A + - \_\_le16 + - bg\_inode\_bitmap\_csum\_lo + - Lower 16-bits of the inode bitmap checksum. + * - 0x1C + - \_\_le16 + - bg\_itable\_unused\_lo + - Lower 16-bits of unused inode count. If set, we needn't scan past the + ``(sb.s_inodes_per_group - gdt.bg_itable_unused)``\ th entry in the + inode table for this group. + * - 0x1E + - \_\_le16 + - bg\_checksum + - Group descriptor checksum; crc16(sb\_uuid+group+desc) if the + RO\_COMPAT\_GDT\_CSUM feature is set, or crc32c(sb\_uuid+group\_desc) & + 0xFFFF if the RO\_COMPAT\_METADATA\_CSUM feature is set. + * - + - + - + - These fields only exist if the 64bit feature is enabled and s_desc_size + > 32. + * - 0x20 + - \_\_le32 + - bg\_block\_bitmap\_hi + - Upper 32-bits of location of block bitmap. + * - 0x24 + - \_\_le32 + - bg\_inode\_bitmap\_hi + - Upper 32-bits of location of inodes bitmap. + * - 0x28 + - \_\_le32 + - bg\_inode\_table\_hi + - Upper 32-bits of location of inodes table. + * - 0x2C + - \_\_le16 + - bg\_free\_blocks\_count\_hi + - Upper 16-bits of free block count. + * - 0x2E + - \_\_le16 + - bg\_free\_inodes\_count\_hi + - Upper 16-bits of free inode count. + * - 0x30 + - \_\_le16 + - bg\_used\_dirs\_count\_hi + - Upper 16-bits of directory count. + * - 0x32 + - \_\_le16 + - bg\_itable\_unused\_hi + - Upper 16-bits of unused inode count. + * - 0x34 + - \_\_le32 + - bg\_exclude\_bitmap\_hi + - Upper 32-bits of location of snapshot exclusion bitmap. + * - 0x38 + - \_\_le16 + - bg\_block\_bitmap\_csum\_hi + - Upper 16-bits of the block bitmap checksum. + * - 0x3A + - \_\_le16 + - bg\_inode\_bitmap\_csum\_hi + - Upper 16-bits of the inode bitmap checksum. + * - 0x3C + - \_\_u32 + - bg\_reserved + - Padding to 64 bytes. + +.. _bgflags: + +Block group flags can be any combination of the following: + +.. list-table:: + :widths: 1 79 + :header-rows: 1 + + * - Value + - Description + * - 0x1 + - inode table and bitmap are not initialized (EXT4\_BG\_INODE\_UNINIT). + * - 0x2 + - block bitmap is not initialized (EXT4\_BG\_BLOCK\_UNINIT). + * - 0x4 + - inode table is zeroed (EXT4\_BG\_INODE\_ZEROED). diff --git a/Documentation/filesystems/ext4/ondisk/ifork.rst b/Documentation/filesystems/ext4/ondisk/ifork.rst new file mode 100644 index 000000000000..5dbe3b2b121a --- /dev/null +++ b/Documentation/filesystems/ext4/ondisk/ifork.rst @@ -0,0 +1,194 @@ +.. SPDX-License-Identifier: GPL-2.0 + +The Contents of inode.i\_block +------------------------------ + +Depending on the type of file an inode describes, the 60 bytes of +storage in ``inode.i_block`` can be used in different ways. In general, +regular files and directories will use it for file block indexing +information, and special files will use it for special purposes. + +Symbolic Links +~~~~~~~~~~~~~~ + +The target of a symbolic link will be stored in this field if the target +string is less than 60 bytes long. Otherwise, either extents or block +maps will be used to allocate data blocks to store the link target. + +Direct/Indirect Block Addressing +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In ext2/3, file block numbers were mapped to logical block numbers by +means of an (up to) three level 1-1 block map. To find the logical block +that stores a particular file block, the code would navigate through +this increasingly complicated structure. Notice that there is neither a +magic number nor a checksum to provide any level of confidence that the +block isn't full of garbage. + +.. ifconfig:: builder != 'latex' + + .. include:: blockmap.rst + +.. ifconfig:: builder == 'latex' + + [Table omitted because LaTeX doesn't support nested tables.] + +Note that with this block mapping scheme, it is necessary to fill out a +lot of mapping data even for a large contiguous file! This inefficiency +led to the creation of the extent mapping scheme, discussed below. + +Notice also that a file using this mapping scheme cannot be placed +higher than 2^32 blocks. + +Extent Tree +~~~~~~~~~~~ + +In ext4, the file to logical block map has been replaced with an extent +tree. Under the old scheme, allocating a contiguous run of 1,000 blocks +requires an indirect block to map all 1,000 entries; with extents, the +mapping is reduced to a single ``struct ext4_extent`` with +``ee_len = 1000``. If flex\_bg is enabled, it is possible to allocate +very large files with a single extent, at a considerable reduction in +metadata block use, and some improvement in disk efficiency. The inode +must have the extents flag (0x80000) flag set for this feature to be in +use. + +Extents are arranged as a tree. Each node of the tree begins with a +``struct ext4_extent_header``. If the node is an interior node +(``eh.eh_depth`` > 0), the header is followed by ``eh.eh_entries`` +instances of ``struct ext4_extent_idx``; each of these index entries +points to a block containing more nodes in the extent tree. If the node +is a leaf node (``eh.eh_depth == 0``), then the header is followed by +``eh.eh_entries`` instances of ``struct ext4_extent``; these instances +point to the file's data blocks. The root node of the extent tree is +stored in ``inode.i_block``, which allows for the first four extents to +be recorded without the use of extra metadata blocks. + +The extent tree header is recorded in ``struct ext4_extent_header``, +which is 12 bytes long: + +.. list-table:: + :widths: 1 1 1 77 + :header-rows: 1 + + * - Offset + - Size + - Name + - Description + * - 0x0 + - \_\_le16 + - eh\_magic + - Magic number, 0xF30A. + * - 0x2 + - \_\_le16 + - eh\_entries + - Number of valid entries following the header. + * - 0x4 + - \_\_le16 + - eh\_max + - Maximum number of entries that could follow the header. + * - 0x6 + - \_\_le16 + - eh\_depth + - Depth of this extent node in the extent tree. 0 = this extent node + points to data blocks; otherwise, this extent node points to other + extent nodes. The extent tree can be at most 5 levels deep: a logical + block number can be at most ``2^32``, and the smallest ``n`` that + satisfies ``4*(((blocksize - 12)/12)^n) >= 2^32`` is 5. + * - 0x8 + - \_\_le32 + - eh\_generation + - Generation of the tree. (Used by Lustre, but not standard ext4). + +Internal nodes of the extent tree, also known as index nodes, are +recorded as ``struct ext4_extent_idx``, and are 12 bytes long: + +.. list-table:: + :widths: 1 1 1 77 + :header-rows: 1 + + * - Offset + - Size + - Name + - Description + * - 0x0 + - \_\_le32 + - ei\_block + - This index node covers file blocks from 'block' onward. + * - 0x4 + - \_\_le32 + - ei\_leaf\_lo + - Lower 32-bits of the block number of the extent node that is the next + level lower in the tree. The tree node pointed to can be either another + internal node or a leaf node, described below. + * - 0x8 + - \_\_le16 + - ei\_leaf\_hi + - Upper 16-bits of the previous field. + * - 0xA + - \_\_u16 + - ei\_unused + - + +Leaf nodes of the extent tree are recorded as ``struct ext4_extent``, +and are also 12 bytes long: + +.. list-table:: + :widths: 1 1 1 77 + :header-rows: 1 + + * - Offset + - Size + - Name + - Description + * - 0x0 + - \_\_le32 + - ee\_block + - First file block number that this extent covers. + * - 0x4 + - \_\_le16 + - ee\_len + - Number of blocks covered by extent. If the value of this field is <= + 32768, the extent is initialized. If the value of the field is > 32768, + the extent is uninitialized and the actual extent length is ``ee_len`` - + 32768. Therefore, the maximum length of a initialized extent is 32768 + blocks, and the maximum length of an uninitialized extent is 32767. + * - 0x6 + - \_\_le16 + - ee\_start\_hi + - Upper 16-bits of the block number to which this extent points. + * - 0x8 + - \_\_le32 + - ee\_start\_lo + - Lower 32-bits of the block number to which this extent points. + +Prior to the introduction of metadata checksums, the extent header + +extent entries always left at least 4 bytes of unallocated space at the +end of each extent tree data block (because (2^x % 12) >= 4). Therefore, +the 32-bit checksum is inserted into this space. The 4 extents in the +inode do not need checksumming, since the inode is already checksummed. +The checksum is calculated against the FS UUID, the inode number, the +inode generation, and the entire extent block leading up to (but not +including) the checksum itself. + +``struct ext4_extent_tail`` is 4 bytes long: + +.. list-table:: + :widths: 1 1 1 77 + :header-rows: 1 + + * - Offset + - Size + - Name + - Description + * - 0x0 + - \_\_le32 + - eb\_checksum + - Checksum of the extent block, crc32c(uuid+inum+igeneration+extentblock) + +Inline Data +~~~~~~~~~~~ + +If the inline data feature is enabled for the filesystem and the flag is +set for the inode, it is possible that the first 60 bytes of the file +data are stored here. diff --git a/Documentation/filesystems/ext4/ondisk/index.rst b/Documentation/filesystems/ext4/ondisk/index.rst new file mode 100644 index 000000000000..f7d082c3a435 --- /dev/null +++ b/Documentation/filesystems/ext4/ondisk/index.rst @@ -0,0 +1,9 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============================== +Data Structures and Algorithms +============================== +.. include:: about.rst +.. include:: overview.rst +.. include:: globals.rst +.. include:: dynamic.rst diff --git a/Documentation/filesystems/ext4/ondisk/inlinedata.rst b/Documentation/filesystems/ext4/ondisk/inlinedata.rst new file mode 100644 index 000000000000..d1075178ce0b --- /dev/null +++ b/Documentation/filesystems/ext4/ondisk/inlinedata.rst @@ -0,0 +1,37 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Inline Data +----------- + +The inline data feature was designed to handle the case that a file's +data is so tiny that it readily fits inside the inode, which +(theoretically) reduces disk block consumption and reduces seeks. If the +file is smaller than 60 bytes, then the data are stored inline in +``inode.i_block``. If the rest of the file would fit inside the extended +attribute space, then it might be found as an extended attribute +“system.data” within the inode body (“ibody EA”). This of course +constrains the amount of extended attributes one can attach to an inode. +If the data size increases beyond i\_block + ibody EA, a regular block +is allocated and the contents moved to that block. + +Pending a change to compact the extended attribute key used to store +inline data, one ought to be able to store 160 bytes of data in a +256-byte inode (as of June 2015, when i\_extra\_isize is 28). Prior to +that, the limit was 156 bytes due to inefficient use of inode space. + +The inline data feature requires the presence of an extended attribute +for “system.data”, even if the attribute value is zero length. + +Inline Directories +~~~~~~~~~~~~~~~~~~ + +The first four bytes of i\_block are the inode number of the parent +directory. Following that is a 56-byte space for an array of directory +entries; see ``struct ext4_dir_entry``. If there is a “system.data” +attribute in the inode body, the EA value is an array of +``struct ext4_dir_entry`` as well. Note that for inline directories, the +i\_block and EA space are treated as separate dirent blocks; directory +entries cannot span the two. + +Inline directory entries are not checksummed, as the inode checksum +should protect all inline data contents. diff --git a/Documentation/filesystems/ext4/ondisk/inodes.rst b/Documentation/filesystems/ext4/ondisk/inodes.rst new file mode 100644 index 000000000000..655ce898f3f5 --- /dev/null +++ b/Documentation/filesystems/ext4/ondisk/inodes.rst @@ -0,0 +1,575 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Index Nodes +----------- + +In a regular UNIX filesystem, the inode stores all the metadata +pertaining to the file (time stamps, block maps, extended attributes, +etc), not the directory entry. To find the information associated with a +file, one must traverse the directory files to find the directory entry +associated with a file, then load the inode to find the metadata for +that file. ext4 appears to cheat (for performance reasons) a little bit +by storing a copy of the file type (normally stored in the inode) in the +directory entry. (Compare all this to FAT, which stores all the file +information directly in the directory entry, but does not support hard +links and is in general more seek-happy than ext4 due to its simpler +block allocator and extensive use of linked lists.) + +The inode table is a linear array of ``struct ext4_inode``. The table is +sized to have enough blocks to store at least +``sb.s_inode_size * sb.s_inodes_per_group`` bytes. The number of the +block group containing an inode can be calculated as +``(inode_number - 1) / sb.s_inodes_per_group``, and the offset into the +group's table is ``(inode_number - 1) % sb.s_inodes_per_group``. There +is no inode 0. + +The inode checksum is calculated against the FS UUID, the inode number, +and the inode structure itself. + +The inode table entry is laid out in ``struct ext4_inode``. + +.. list-table:: + :widths: 1 1 1 77 + :header-rows: 1 + + * - Offset + - Size + - Name + - Description + * - 0x0 + - \_\_le16 + - i\_mode + - File mode. See the table i_mode_ below. + * - 0x2 + - \_\_le16 + - i\_uid + - Lower 16-bits of Owner UID. + * - 0x4 + - \_\_le32 + - i\_size\_lo + - Lower 32-bits of size in bytes. + * - 0x8 + - \_\_le32 + - i\_atime + - Last access time, in seconds since the epoch. However, if the EA\_INODE + inode flag is set, this inode stores an extended attribute value and + this field contains the checksum of the value. + * - 0xC + - \_\_le32 + - i\_ctime + - Last inode change time, in seconds since the epoch. However, if the + EA\_INODE inode flag is set, this inode stores an extended attribute + value and this field contains the lower 32 bits of the attribute value's + reference count. + * - 0x10 + - \_\_le32 + - i\_mtime + - Last data modification time, in seconds since the epoch. However, if the + EA\_INODE inode flag is set, this inode stores an extended attribute + value and this field contains the number of the inode that owns the + extended attribute. + * - 0x14 + - \_\_le32 + - i\_dtime + - Deletion Time, in seconds since the epoch. + * - 0x18 + - \_\_le16 + - i\_gid + - Lower 16-bits of GID. + * - 0x1A + - \_\_le16 + - i\_links\_count + - Hard link count. Normally, ext4 does not permit an inode to have more + than 65,000 hard links. This applies to files as well as directories, + which means that there cannot be more than 64,998 subdirectories in a + directory (each subdirectory's '..' entry counts as a hard link, as does + the '.' entry in the directory itself). With the DIR\_NLINK feature + enabled, ext4 supports more than 64,998 subdirectories by setting this + field to 1 to indicate that the number of hard links is not known. + * - 0x1C + - \_\_le32 + - i\_blocks\_lo + - Lower 32-bits of “block” count. If the huge\_file feature flag is not + set on the filesystem, the file consumes ``i_blocks_lo`` 512-byte blocks + on disk. If huge\_file is set and EXT4\_HUGE\_FILE\_FL is NOT set in + ``inode.i_flags``, then the file consumes ``i_blocks_lo + (i_blocks_hi + << 32)`` 512-byte blocks on disk. If huge\_file is set and + EXT4\_HUGE\_FILE\_FL IS set in ``inode.i_flags``, then this file + consumes (``i_blocks_lo + i_blocks_hi`` << 32) filesystem blocks on + disk. + * - 0x20 + - \_\_le32 + - i\_flags + - Inode flags. See the table i_flags_ below. + * - 0x24 + - 4 bytes + - i\_osd1 + - See the table i_osd1_ for more details. + * - 0x28 + - 60 bytes + - i\_block[EXT4\_N\_BLOCKS=15] + - Block map or extent tree. See the section “The Contents of inode.i\_block”. + * - 0x64 + - \_\_le32 + - i\_generation + - File version (for NFS). + * - 0x68 + - \_\_le32 + - i\_file\_acl\_lo + - Lower 32-bits of extended attribute block. ACLs are of course one of + many possible extended attributes; I think the name of this field is a + result of the first use of extended attributes being for ACLs. + * - 0x6C + - \_\_le32 + - i\_size\_high / i\_dir\_acl + - Upper 32-bits of file/directory size. In ext2/3 this field was named + i\_dir\_acl, though it was usually set to zero and never used. + * - 0x70 + - \_\_le32 + - i\_obso\_faddr + - (Obsolete) fragment address. + * - 0x74 + - 12 bytes + - i\_osd2 + - See the table i_osd2_ for more details. + * - 0x80 + - \_\_le16 + - i\_extra\_isize + - Size of this inode - 128. Alternately, the size of the extended inode + fields beyond the original ext2 inode, including this field. + * - 0x82 + - \_\_le16 + - i\_checksum\_hi + - Upper 16-bits of the inode checksum. + * - 0x84 + - \_\_le32 + - i\_ctime\_extra + - Extra change time bits. This provides sub-second precision. See Inode + Timestamps section. + * - 0x88 + - \_\_le32 + - i\_mtime\_extra + - Extra modification time bits. This provides sub-second precision. + * - 0x8C + - \_\_le32 + - i\_atime\_extra + - Extra access time bits. This provides sub-second precision. + * - 0x90 + - \_\_le32 + - i\_crtime + - File creation time, in seconds since the epoch. + * - 0x94 + - \_\_le32 + - i\_crtime\_extra + - Extra file creation time bits. This provides sub-second precision. + * - 0x98 + - \_\_le32 + - i\_version\_hi + - Upper 32-bits for version number. + * - 0x9C + - \_\_le32 + - i\_projid + - Project ID. + +.. _i_mode: + +The ``i_mode`` value is a combination of the following flags: + +.. list-table:: + :widths: 1 79 + :header-rows: 1 + + * - Value + - Description + * - 0x1 + - S\_IXOTH (Others may execute) + * - 0x2 + - S\_IWOTH (Others may write) + * - 0x4 + - S\_IROTH (Others may read) + * - 0x8 + - S\_IXGRP (Group members may execute) + * - 0x10 + - S\_IWGRP (Group members may write) + * - 0x20 + - S\_IRGRP (Group members may read) + * - 0x40 + - S\_IXUSR (Owner may execute) + * - 0x80 + - S\_IWUSR (Owner may write) + * - 0x100 + - S\_IRUSR (Owner may read) + * - 0x200 + - S\_ISVTX (Sticky bit) + * - 0x400 + - S\_ISGID (Set GID) + * - 0x800 + - S\_ISUID (Set UID) + * - + - These are mutually-exclusive file types: + * - 0x1000 + - S\_IFIFO (FIFO) + * - 0x2000 + - S\_IFCHR (Character device) + * - 0x4000 + - S\_IFDIR (Directory) + * - 0x6000 + - S\_IFBLK (Block device) + * - 0x8000 + - S\_IFREG (Regular file) + * - 0xA000 + - S\_IFLNK (Symbolic link) + * - 0xC000 + - S\_IFSOCK (Socket) + +.. _i_flags: + +The ``i_flags`` field is a combination of these values: + +.. list-table:: + :widths: 1 79 + :header-rows: 1 + + * - Value + - Description + * - 0x1 + - This file requires secure deletion (EXT4\_SECRM\_FL). (not implemented) + * - 0x2 + - This file should be preserved, should undeletion be desired + (EXT4\_UNRM\_FL). (not implemented) + * - 0x4 + - File is compressed (EXT4\_COMPR\_FL). (not really implemented) + * - 0x8 + - All writes to the file must be synchronous (EXT4\_SYNC\_FL). + * - 0x10 + - File is immutable (EXT4\_IMMUTABLE\_FL). + * - 0x20 + - File can only be appended (EXT4\_APPEND\_FL). + * - 0x40 + - The dump(1) utility should not dump this file (EXT4\_NODUMP\_FL). + * - 0x80 + - Do not update access time (EXT4\_NOATIME\_FL). + * - 0x100 + - Dirty compressed file (EXT4\_DIRTY\_FL). (not used) + * - 0x200 + - File has one or more compressed clusters (EXT4\_COMPRBLK\_FL). (not used) + * - 0x400 + - Do not compress file (EXT4\_NOCOMPR\_FL). (not used) + * - 0x800 + - Encrypted inode (EXT4\_ENCRYPT\_FL). This bit value previously was + EXT4\_ECOMPR\_FL (compression error), which was never used. + * - 0x1000 + - Directory has hashed indexes (EXT4\_INDEX\_FL). + * - 0x2000 + - AFS magic directory (EXT4\_IMAGIC\_FL). + * - 0x4000 + - File data must always be written through the journal + (EXT4\_JOURNAL\_DATA\_FL). + * - 0x8000 + - File tail should not be merged (EXT4\_NOTAIL\_FL). (not used by ext4) + * - 0x10000 + - All directory entry data should be written synchronously (see + ``dirsync``) (EXT4\_DIRSYNC\_FL). + * - 0x20000 + - Top of directory hierarchy (EXT4\_TOPDIR\_FL). + * - 0x40000 + - This is a huge file (EXT4\_HUGE\_FILE\_FL). + * - 0x80000 + - Inode uses extents (EXT4\_EXTENTS\_FL). + * - 0x200000 + - Inode stores a large extended attribute value in its data blocks + (EXT4\_EA\_INODE\_FL). + * - 0x400000 + - This file has blocks allocated past EOF (EXT4\_EOFBLOCKS\_FL). + (deprecated) + * - 0x01000000 + - Inode is a snapshot (``EXT4_SNAPFILE_FL``). (not in mainline) + * - 0x04000000 + - Snapshot is being deleted (``EXT4_SNAPFILE_DELETED_FL``). (not in + mainline) + * - 0x08000000 + - Snapshot shrink has completed (``EXT4_SNAPFILE_SHRUNK_FL``). (not in + mainline) + * - 0x10000000 + - Inode has inline data (EXT4\_INLINE\_DATA\_FL). + * - 0x20000000 + - Create children with the same project ID (EXT4\_PROJINHERIT\_FL). + * - 0x80000000 + - Reserved for ext4 library (EXT4\_RESERVED\_FL). + * - + - Aggregate flags: + * - 0x4BDFFF + - User-visible flags. + * - 0x4B80FF + - User-modifiable flags. Note that while EXT4\_JOURNAL\_DATA\_FL and + EXT4\_EXTENTS\_FL can be set with setattr, they are not in the kernel's + EXT4\_FL\_USER\_MODIFIABLE mask, since it needs to handle the setting of + these flags in a special manner and they are masked out of the set of + flags that are saved directly to i\_flags. + +.. _i_osd1: + +The ``osd1`` field has multiple meanings depending on the creator: + +Linux: + +.. list-table:: + :widths: 1 1 1 77 + :header-rows: 1 + + * - Offset + - Size + - Name + - Description + * - 0x0 + - \_\_le32 + - l\_i\_version + - Inode version. However, if the EA\_INODE inode flag is set, this inode + stores an extended attribute value and this field contains the upper 32 + bits of the attribute value's reference count. + +Hurd: + +.. list-table:: + :widths: 1 1 1 77 + :header-rows: 1 + + * - Offset + - Size + - Name + - Description + * - 0x0 + - \_\_le32 + - h\_i\_translator + - ?? + +Masix: + +.. list-table:: + :widths: 1 1 1 77 + :header-rows: 1 + + * - Offset + - Size + - Name + - Description + * - 0x0 + - \_\_le32 + - m\_i\_reserved + - ?? + +.. _i_osd2: + +The ``osd2`` field has multiple meanings depending on the filesystem creator: + +Linux: + +.. list-table:: + :widths: 1 1 1 77 + :header-rows: 1 + + * - Offset + - Size + - Name + - Description + * - 0x0 + - \_\_le16 + - l\_i\_blocks\_high + - Upper 16-bits of the block count. Please see the note attached to + i\_blocks\_lo. + * - 0x2 + - \_\_le16 + - l\_i\_file\_acl\_high + - Upper 16-bits of the extended attribute block (historically, the file + ACL location). See the Extended Attributes section below. + * - 0x4 + - \_\_le16 + - l\_i\_uid\_high + - Upper 16-bits of the Owner UID. + * - 0x6 + - \_\_le16 + - l\_i\_gid\_high + - Upper 16-bits of the GID. + * - 0x8 + - \_\_le16 + - l\_i\_checksum\_lo + - Lower 16-bits of the inode checksum. + * - 0xA + - \_\_le16 + - l\_i\_reserved + - Unused. + +Hurd: + +.. list-table:: + :widths: 1 1 1 77 + :header-rows: 1 + + * - Offset + - Size + - Name + - Description + * - 0x0 + - \_\_le16 + - h\_i\_reserved1 + - ?? + * - 0x2 + - \_\_u16 + - h\_i\_mode\_high + - Upper 16-bits of the file mode. + * - 0x4 + - \_\_le16 + - h\_i\_uid\_high + - Upper 16-bits of the Owner UID. + * - 0x6 + - \_\_le16 + - h\_i\_gid\_high + - Upper 16-bits of the GID. + * - 0x8 + - \_\_u32 + - h\_i\_author + - Author code? + +Masix: + +.. list-table:: + :widths: 1 1 1 77 + :header-rows: 1 + + * - Offset + - Size + - Name + - Description + * - 0x0 + - \_\_le16 + - h\_i\_reserved1 + - ?? + * - 0x2 + - \_\_u16 + - m\_i\_file\_acl\_high + - Upper 16-bits of the extended attribute block (historically, the file + ACL location). + * - 0x4 + - \_\_u32 + - m\_i\_reserved2[2] + - ?? + +Inode Size +~~~~~~~~~~ + +In ext2 and ext3, the inode structure size was fixed at 128 bytes +(``EXT2_GOOD_OLD_INODE_SIZE``) and each inode had a disk record size of +128 bytes. Starting with ext4, it is possible to allocate a larger +on-disk inode at format time for all inodes in the filesystem to provide +space beyond the end of the original ext2 inode. The on-disk inode +record size is recorded in the superblock as ``s_inode_size``. The +number of bytes actually used by struct ext4\_inode beyond the original +128-byte ext2 inode is recorded in the ``i_extra_isize`` field for each +inode, which allows struct ext4\_inode to grow for a new kernel without +having to upgrade all of the on-disk inodes. Access to fields beyond +EXT2\_GOOD\_OLD\_INODE\_SIZE should be verified to be within +``i_extra_isize``. By default, ext4 inode records are 256 bytes, and (as +of October 2013) the inode structure is 156 bytes +(``i_extra_isize = 28``). The extra space between the end of the inode +structure and the end of the inode record can be used to store extended +attributes. Each inode record can be as large as the filesystem block +size, though this is not terribly efficient. + +Finding an Inode +~~~~~~~~~~~~~~~~ + +Each block group contains ``sb->s_inodes_per_group`` inodes. Because +inode 0 is defined not to exist, this formula can be used to find the +block group that an inode lives in: +``bg = (inode_num - 1) / sb->s_inodes_per_group``. The particular inode +can be found within the block group's inode table at +``index = (inode_num - 1) % sb->s_inodes_per_group``. To get the byte +address within the inode table, use +``offset = index * sb->s_inode_size``. + +Inode Timestamps +~~~~~~~~~~~~~~~~ + +Four timestamps are recorded in the lower 128 bytes of the inode +structure -- inode change time (ctime), access time (atime), data +modification time (mtime), and deletion time (dtime). The four fields +are 32-bit signed integers that represent seconds since the Unix epoch +(1970-01-01 00:00:00 GMT), which means that the fields will overflow in +January 2038. For inodes that are not linked from any directory but are +still open (orphan inodes), the dtime field is overloaded for use with +the orphan list. The superblock field ``s_last_orphan`` points to the +first inode in the orphan list; dtime is then the number of the next +orphaned inode, or zero if there are no more orphans. + +If the inode structure size ``sb->s_inode_size`` is larger than 128 +bytes and the ``i_inode_extra`` field is large enough to encompass the +respective ``i_[cma]time_extra`` field, the ctime, atime, and mtime +inode fields are widened to 64 bits. Within this “extra” 32-bit field, +the lower two bits are used to extend the 32-bit seconds field to be 34 +bit wide; the upper 30 bits are used to provide nanosecond timestamp +accuracy. Therefore, timestamps should not overflow until May 2446. +dtime was not widened. There is also a fifth timestamp to record inode +creation time (crtime); this field is 64-bits wide and decoded in the +same manner as 64-bit [cma]time. Neither crtime nor dtime are accessible +through the regular stat() interface, though debugfs will report them. + +We use the 32-bit signed time value plus (2^32 \* (extra epoch bits)). +In other words: + +.. list-table:: + :widths: 20 20 20 20 20 + :header-rows: 1 + + * - Extra epoch bits + - MSB of 32-bit time + - Adjustment for signed 32-bit to 64-bit tv\_sec + - Decoded 64-bit tv\_sec + - valid time range + * - 0 0 + - 1 + - 0 + - ``-0x80000000 - -0x00000001`` + - 1901-12-13 to 1969-12-31 + * - 0 0 + - 0 + - 0 + - ``0x000000000 - 0x07fffffff`` + - 1970-01-01 to 2038-01-19 + * - 0 1 + - 1 + - 0x100000000 + - ``0x080000000 - 0x0ffffffff`` + - 2038-01-19 to 2106-02-07 + * - 0 1 + - 0 + - 0x100000000 + - ``0x100000000 - 0x17fffffff`` + - 2106-02-07 to 2174-02-25 + * - 1 0 + - 1 + - 0x200000000 + - ``0x180000000 - 0x1ffffffff`` + - 2174-02-25 to 2242-03-16 + * - 1 0 + - 0 + - 0x200000000 + - ``0x200000000 - 0x27fffffff`` + - 2242-03-16 to 2310-04-04 + * - 1 1 + - 1 + - 0x300000000 + - ``0x280000000 - 0x2ffffffff`` + - 2310-04-04 to 2378-04-22 + * - 1 1 + - 0 + - 0x300000000 + - ``0x300000000 - 0x37fffffff`` + - 2378-04-22 to 2446-05-10 + +This is a somewhat odd encoding since there are effectively seven times +as many positive values as negative values. There have also been +long-standing bugs decoding and encoding dates beyond 2038, which don't +seem to be fixed as of kernel 3.12 and e2fsprogs 1.42.8. 64-bit kernels +incorrectly use the extra epoch bits 1,1 for dates between 1901 and +1970. At some point the kernel will be fixed and e2fsck will fix this +situation, assuming that it is run before 2310. diff --git a/Documentation/filesystems/ext4/ondisk/journal.rst b/Documentation/filesystems/ext4/ondisk/journal.rst new file mode 100644 index 000000000000..e7031af86876 --- /dev/null +++ b/Documentation/filesystems/ext4/ondisk/journal.rst @@ -0,0 +1,611 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Journal (jbd2) +-------------- + +Introduced in ext3, the ext4 filesystem employs a journal to protect the +filesystem against corruption in the case of a system crash. A small +continuous region of disk (default 128MiB) is reserved inside the +filesystem as a place to land “important” data writes on-disk as quickly +as possible. Once the important data transaction is fully written to the +disk and flushed from the disk write cache, a record of the data being +committed is also written to the journal. At some later point in time, +the journal code writes the transactions to their final locations on +disk (this could involve a lot of seeking or a lot of small +read-write-erases) before erasing the commit record. Should the system +crash during the second slow write, the journal can be replayed all the +way to the latest commit record, guaranteeing the atomicity of whatever +gets written through the journal to the disk. The effect of this is to +guarantee that the filesystem does not become stuck midway through a +metadata update. + +For performance reasons, ext4 by default only writes filesystem metadata +through the journal. This means that file data blocks are /not/ +guaranteed to be in any consistent state after a crash. If this default +guarantee level (``data=ordered``) is not satisfactory, there is a mount +option to control journal behavior. If ``data=journal``, all data and +metadata are written to disk through the journal. This is slower but +safest. If ``data=writeback``, dirty data blocks are not flushed to the +disk before the metadata are written to disk through the journal. + +The journal inode is typically inode 8. The first 68 bytes of the +journal inode are replicated in the ext4 superblock. The journal itself +is normal (but hidden) file within the filesystem. The file usually +consumes an entire block group, though mke2fs tries to put it in the +middle of the disk. + +All fields in jbd2 are written to disk in big-endian order. This is the +opposite of ext4. + +NOTE: Both ext4 and ocfs2 use jbd2. + +The maximum size of a journal embedded in an ext4 filesystem is 2^32 +blocks. jbd2 itself does not seem to care. + +Layout +~~~~~~ + +Generally speaking, the journal has this format: + +.. list-table:: + :widths: 1 1 78 + :header-rows: 1 + + * - Superblock + - descriptor\_block (data\_blocks or revocation\_block) [more data or + revocations] commmit\_block + - [more transactions...] + * - + - One transaction + - + +Notice that a transaction begins with either a descriptor and some data, +or a block revocation list. A finished transaction always ends with a +commit. If there is no commit record (or the checksums don't match), the +transaction will be discarded during replay. + +External Journal +~~~~~~~~~~~~~~~~ + +Optionally, an ext4 filesystem can be created with an external journal +device (as opposed to an internal journal, which uses a reserved inode). +In this case, on the filesystem device, ``s_journal_inum`` should be +zero and ``s_journal_uuid`` should be set. On the journal device there +will be an ext4 super block in the usual place, with a matching UUID. +The journal superblock will be in the next full block after the +superblock. + +.. list-table:: + :widths: 1 1 1 1 76 + :header-rows: 1 + + * - 1024 bytes of padding + - ext4 Superblock + - Journal Superblock + - descriptor\_block (data\_blocks or revocation\_block) [more data or + revocations] commmit\_block + - [more transactions...] + * - + - + - + - One transaction + - + +Block Header +~~~~~~~~~~~~ + +Every block in the journal starts with a common 12-byte header +``struct journal_header_s``: + +.. list-table:: + :widths: 1 1 1 77 + :header-rows: 1 + + * - Offset + - Type + - Name + - Description + * - 0x0 + - \_\_be32 + - h\_magic + - jbd2 magic number, 0xC03B3998. + * - 0x4 + - \_\_be32 + - h\_blocktype + - Description of what this block contains. See the jbd2_blocktype_ table + below. + * - 0x8 + - \_\_be32 + - h\_sequence + - The transaction ID that goes with this block. + +.. _jbd2_blocktype: + +The journal block type can be any one of: + +.. list-table:: + :widths: 1 79 + :header-rows: 1 + + * - Value + - Description + * - 1 + - Descriptor. This block precedes a series of data blocks that were + written through the journal during a transaction. + * - 2 + - Block commit record. This block signifies the completion of a + transaction. + * - 3 + - Journal superblock, v1. + * - 4 + - Journal superblock, v2. + * - 5 + - Block revocation records. This speeds up recovery by enabling the + journal to skip writing blocks that were subsequently rewritten. + +Super Block +~~~~~~~~~~~ + +The super block for the journal is much simpler as compared to ext4's. +The key data kept within are size of the journal, and where to find the +start of the log of transactions. + +The journal superblock is recorded as ``struct journal_superblock_s``, +which is 1024 bytes long: + +.. list-table:: + :widths: 1 1 1 77 + :header-rows: 1 + + * - Offset + - Type + - Name + - Description + * - + - + - + - Static information describing the journal. + * - 0x0 + - journal\_header\_t (12 bytes) + - s\_header + - Common header identifying this as a superblock. + * - 0xC + - \_\_be32 + - s\_blocksize + - Journal device block size. + * - 0x10 + - \_\_be32 + - s\_maxlen + - Total number of blocks in this journal. + * - 0x14 + - \_\_be32 + - s\_first + - First block of log information. + * - + - + - + - Dynamic information describing the current state of the log. + * - 0x18 + - \_\_be32 + - s\_sequence + - First commit ID expected in log. + * - 0x1C + - \_\_be32 + - s\_start + - Block number of the start of log. Contrary to the comments, this field + being zero does not imply that the journal is clean! + * - 0x20 + - \_\_be32 + - s\_errno + - Error value, as set by jbd2\_journal\_abort(). + * - + - + - + - The remaining fields are only valid in a v2 superblock. + * - 0x24 + - \_\_be32 + - s\_feature\_compat; + - Compatible feature set. See the table jbd2_compat_ below. + * - 0x28 + - \_\_be32 + - s\_feature\_incompat + - Incompatible feature set. See the table jbd2_incompat_ below. + * - 0x2C + - \_\_be32 + - s\_feature\_ro\_compat + - Read-only compatible feature set. There aren't any of these currently. + * - 0x30 + - \_\_u8 + - s\_uuid[16] + - 128-bit uuid for journal. This is compared against the copy in the ext4 + super block at mount time. + * - 0x40 + - \_\_be32 + - s\_nr\_users + - Number of file systems sharing this journal. + * - 0x44 + - \_\_be32 + - s\_dynsuper + - Location of dynamic super block copy. (Not used?) + * - 0x48 + - \_\_be32 + - s\_max\_transaction + - Limit of journal blocks per transaction. (Not used?) + * - 0x4C + - \_\_be32 + - s\_max\_trans\_data + - Limit of data blocks per transaction. (Not used?) + * - 0x50 + - \_\_u8 + - s\_checksum\_type + - Checksum algorithm used for the journal. See jbd2_checksum_type_ for + more info. + * - 0x51 + - \_\_u8[3] + - s\_padding2 + - + * - 0x54 + - \_\_u32 + - s\_padding[42] + - + * - 0xFC + - \_\_be32 + - s\_checksum + - Checksum of the entire superblock, with this field set to zero. + * - 0x100 + - \_\_u8 + - s\_users[16\*48] + - ids of all file systems sharing the log. e2fsprogs/Linux don't allow + shared external journals, but I imagine Lustre (or ocfs2?), which use + the jbd2 code, might. + +.. _jbd2_compat: + +The journal compat features are any combination of the following: + +.. list-table:: + :widths: 1 79 + :header-rows: 1 + + * - Value + - Description + * - 0x1 + - Journal maintains checksums on the data blocks. + (JBD2\_FEATURE\_COMPAT\_CHECKSUM) + +.. _jbd2_incompat: + +The journal incompat features are any combination of the following: + +.. list-table:: + :widths: 1 79 + :header-rows: 1 + + * - Value + - Description + * - 0x1 + - Journal has block revocation records. (JBD2\_FEATURE\_INCOMPAT\_REVOKE) + * - 0x2 + - Journal can deal with 64-bit block numbers. + (JBD2\_FEATURE\_INCOMPAT\_64BIT) + * - 0x4 + - Journal commits asynchronously. (JBD2\_FEATURE\_INCOMPAT\_ASYNC\_COMMIT) + * - 0x8 + - This journal uses v2 of the checksum on-disk format. Each journal + metadata block gets its own checksum, and the block tags in the + descriptor table contain checksums for each of the data blocks in the + journal. (JBD2\_FEATURE\_INCOMPAT\_CSUM\_V2) + * - 0x10 + - This journal uses v3 of the checksum on-disk format. This is the same as + v2, but the journal block tag size is fixed regardless of the size of + block numbers. (JBD2\_FEATURE\_INCOMPAT\_CSUM\_V3) + +.. _jbd2_checksum_type: + +Journal checksum type codes are one of the following. crc32 or crc32c are the +most likely choices. + +.. list-table:: + :widths: 1 79 + :header-rows: 1 + + * - Value + - Description + * - 1 + - CRC32 + * - 2 + - MD5 + * - 3 + - SHA1 + * - 4 + - CRC32C + +Descriptor Block +~~~~~~~~~~~~~~~~ + +The descriptor block contains an array of journal block tags that +describe the final locations of the data blocks that follow in the +journal. Descriptor blocks are open-coded instead of being completely +described by a data structure, but here is the block structure anyway. +Descriptor blocks consume at least 36 bytes, but use a full block: + +.. list-table:: + :widths: 1 1 1 77 + :header-rows: 1 + + * - Offset + - Type + - Name + - Descriptor + * - 0x0 + - journal\_header\_t + - (open coded) + - Common block header. + * - 0xC + - struct journal\_block\_tag\_s + - open coded array[] + - Enough tags either to fill up the block or to describe all the data + blocks that follow this descriptor block. + +Journal block tags have any of the following formats, depending on which +journal feature and block tag flags are set. + +If JBD2\_FEATURE\_INCOMPAT\_CSUM\_V3 is set, the journal block tag is +defined as ``struct journal_block_tag3_s``, which looks like the +following. The size is 16 or 32 bytes. + +.. list-table:: + :widths: 1 1 1 77 + :header-rows: 1 + + * - Offset + - Type + - Name + - Descriptor + * - 0x0 + - \_\_be32 + - t\_blocknr + - Lower 32-bits of the location of where the corresponding data block + should end up on disk. + * - 0x4 + - \_\_be32 + - t\_flags + - Flags that go with the descriptor. See the table jbd2_tag_flags_ for + more info. + * - 0x8 + - \_\_be32 + - t\_blocknr\_high + - Upper 32-bits of the location of where the corresponding data block + should end up on disk. This is zero if JBD2\_FEATURE\_INCOMPAT\_64BIT is + not enabled. + * - 0xC + - \_\_be32 + - t\_checksum + - Checksum of the journal UUID, the sequence number, and the data block. + * - + - + - + - This field appears to be open coded. It always comes at the end of the + tag, after t_checksum. This field is not present if the "same UUID" flag + is set. + * - 0x8 or 0xC + - char + - uuid[16] + - A UUID to go with this tag. This field appears to be copied from the + ``j_uuid`` field in ``struct journal_s``, but only tune2fs touches that + field. + +.. _jbd2_tag_flags: + +The journal tag flags are any combination of the following: + +.. list-table:: + :widths: 1 79 + :header-rows: 1 + + * - Value + - Description + * - 0x1 + - On-disk block is escaped. The first four bytes of the data block just + happened to match the jbd2 magic number. + * - 0x2 + - This block has the same UUID as previous, therefore the UUID field is + omitted. + * - 0x4 + - The data block was deleted by the transaction. (Not used?) + * - 0x8 + - This is the last tag in this descriptor block. + +If JBD2\_FEATURE\_INCOMPAT\_CSUM\_V3 is NOT set, the journal block tag +is defined as ``struct journal_block_tag_s``, which looks like the +following. The size is 8, 12, 24, or 28 bytes: + +.. list-table:: + :widths: 1 1 1 77 + :header-rows: 1 + + * - Offset + - Type + - Name + - Descriptor + * - 0x0 + - \_\_be32 + - t\_blocknr + - Lower 32-bits of the location of where the corresponding data block + should end up on disk. + * - 0x4 + - \_\_be16 + - t\_checksum + - Checksum of the journal UUID, the sequence number, and the data block. + Note that only the lower 16 bits are stored. + * - 0x6 + - \_\_be16 + - t\_flags + - Flags that go with the descriptor. See the table jbd2_tag_flags_ for + more info. + * - + - + - + - This next field is only present if the super block indicates support for + 64-bit block numbers. + * - 0x8 + - \_\_be32 + - t\_blocknr\_high + - Upper 32-bits of the location of where the corresponding data block + should end up on disk. + * - + - + - + - This field appears to be open coded. It always comes at the end of the + tag, after t_flags or t_blocknr_high. This field is not present if the + "same UUID" flag is set. + * - 0x8 or 0xC + - char + - uuid[16] + - A UUID to go with this tag. This field appears to be copied from the + ``j_uuid`` field in ``struct journal_s``, but only tune2fs touches that + field. + +If JBD2\_FEATURE\_INCOMPAT\_CSUM\_V2 or +JBD2\_FEATURE\_INCOMPAT\_CSUM\_V3 are set, the end of the block is a +``struct jbd2_journal_block_tail``, which looks like this: + +.. list-table:: + :widths: 1 1 1 77 + :header-rows: 1 + + * - Offset + - Type + - Name + - Descriptor + * - 0x0 + - \_\_be32 + - t\_checksum + - Checksum of the journal UUID + the descriptor block, with this field set + to zero. + +Data Block +~~~~~~~~~~ + +In general, the data blocks being written to disk through the journal +are written verbatim into the journal file after the descriptor block. +However, if the first four bytes of the block match the jbd2 magic +number then those four bytes are replaced with zeroes and the “escaped” +flag is set in the descriptor block tag. + +Revocation Block +~~~~~~~~~~~~~~~~ + +A revocation block is used to prevent replay of a block in an earlier +transaction. This is used to mark blocks that were journalled at one +time but are no longer journalled. Typically this happens if a metadata +block is freed and re-allocated as a file data block; in this case, a +journal replay after the file block was written to disk will cause +corruption. + +**NOTE**: This mechanism is NOT used to express “this journal block is +superseded by this other journal block”, as the author (djwong) +mistakenly thought. Any block being added to a transaction will cause +the removal of all existing revocation records for that block. + +Revocation blocks are described in +``struct jbd2_journal_revoke_header_s``, are at least 16 bytes in +length, but use a full block: + +.. list-table:: + :widths: 1 1 1 77 + :header-rows: 1 + + * - Offset + - Type + - Name + - Description + * - 0x0 + - journal\_header\_t + - r\_header + - Common block header. + * - 0xC + - \_\_be32 + - r\_count + - Number of bytes used in this block. + * - 0x10 + - \_\_be32 or \_\_be64 + - blocks[0] + - Blocks to revoke. + +After r\_count is a linear array of block numbers that are effectively +revoked by this transaction. The size of each block number is 8 bytes if +the superblock advertises 64-bit block number support, or 4 bytes +otherwise. + +If JBD2\_FEATURE\_INCOMPAT\_CSUM\_V2 or +JBD2\_FEATURE\_INCOMPAT\_CSUM\_V3 are set, the end of the revocation +block is a ``struct jbd2_journal_revoke_tail``, which has this format: + +.. list-table:: + :widths: 1 1 1 77 + :header-rows: 1 + + * - Offset + - Type + - Name + - Description + * - 0x0 + - \_\_be32 + - r\_checksum + - Checksum of the journal UUID + revocation block + +Commit Block +~~~~~~~~~~~~ + +The commit block is a sentry that indicates that a transaction has been +completely written to the journal. Once this commit block reaches the +journal, the data stored with this transaction can be written to their +final locations on disk. + +The commit block is described by ``struct commit_header``, which is 32 +bytes long (but uses a full block): + +.. list-table:: + :widths: 1 1 1 77 + :header-rows: 1 + + * - Offset + - Type + - Name + - Descriptor + * - 0x0 + - journal\_header\_s + - (open coded) + - Common block header. + * - 0xC + - unsigned char + - h\_chksum\_type + - The type of checksum to use to verify the integrity of the data blocks + in the transaction. See jbd2_checksum_type_ for more info. + * - 0xD + - unsigned char + - h\_chksum\_size + - The number of bytes used by the checksum. Most likely 4. + * - 0xE + - unsigned char + - h\_padding[2] + - + * - 0x10 + - \_\_be32 + - h\_chksum[JBD2\_CHECKSUM\_BYTES] + - 32 bytes of space to store checksums. If + JBD2\_FEATURE\_INCOMPAT\_CSUM\_V2 or JBD2\_FEATURE\_INCOMPAT\_CSUM\_V3 + are set, the first ``__be32`` is the checksum of the journal UUID and + the entire commit block, with this field zeroed. If + JBD2\_FEATURE\_COMPAT\_CHECKSUM is set, the first ``__be32`` is the + crc32 of all the blocks already written to the transaction. + * - 0x30 + - \_\_be64 + - h\_commit\_sec + - The time that the transaction was committed, in seconds since the epoch. + * - 0x38 + - \_\_be32 + - h\_commit\_nsec + - Nanoseconds component of the above timestamp. + diff --git a/Documentation/filesystems/ext4/ondisk/mmp.rst b/Documentation/filesystems/ext4/ondisk/mmp.rst new file mode 100644 index 000000000000..b7d7a3137f80 --- /dev/null +++ b/Documentation/filesystems/ext4/ondisk/mmp.rst @@ -0,0 +1,77 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Multiple Mount Protection +------------------------- + +Multiple mount protection (MMP) is a feature that protects the +filesystem against multiple hosts trying to use the filesystem +simultaneously. When a filesystem is opened (for mounting, or fsck, +etc.), the MMP code running on the node (call it node A) checks a +sequence number. If the sequence number is EXT4\_MMP\_SEQ\_CLEAN, the +open continues. If the sequence number is EXT4\_MMP\_SEQ\_FSCK, then +fsck is (hopefully) running, and open fails immediately. Otherwise, the +open code will wait for twice the specified MMP check interval and check +the sequence number again. If the sequence number has changed, then the +filesystem is active on another machine and the open fails. If the MMP +code passes all of those checks, a new MMP sequence number is generated +and written to the MMP block, and the mount proceeds. + +While the filesystem is live, the kernel sets up a timer to re-check the +MMP block at the specified MMP check interval. To perform the re-check, +the MMP sequence number is re-read; if it does not match the in-memory +MMP sequence number, then another node (node B) has mounted the +filesystem, and node A remounts the filesystem read-only. If the +sequence numbers match, the sequence number is incremented both in +memory and on disk, and the re-check is complete. + +The hostname and device filename are written into the MMP block whenever +an open operation succeeds. The MMP code does not use these values; they +are provided purely for informational purposes. + +The checksum is calculated against the FS UUID and the MMP structure. +The MMP structure (``struct mmp_struct``) is as follows: + +.. list-table:: + :widths: 1 1 1 77 + :header-rows: 1 + + * - Offset + - Type + - Name + - Description + * - 0x0 + - \_\_le32 + - mmp\_magic + - Magic number for MMP, 0x004D4D50 (“MMP”). + * - 0x4 + - \_\_le32 + - mmp\_seq + - Sequence number, updated periodically. + * - 0x8 + - \_\_le64 + - mmp\_time + - Time that the MMP block was last updated. + * - 0x10 + - char[64] + - mmp\_nodename + - Hostname of the node that opened the filesystem. + * - 0x50 + - char[32] + - mmp\_bdevname + - Block device name of the filesystem. + * - 0x70 + - \_\_le16 + - mmp\_check\_interval + - The MMP re-check interval, in seconds. + * - 0x72 + - \_\_le16 + - mmp\_pad1 + - Zero. + * - 0x74 + - \_\_le32[226] + - mmp\_pad2 + - Zero. + * - 0x3FC + - \_\_le32 + - mmp\_checksum + - Checksum of the MMP block. diff --git a/Documentation/filesystems/ext4/ondisk/overview.rst b/Documentation/filesystems/ext4/ondisk/overview.rst new file mode 100644 index 000000000000..cbab18baba12 --- /dev/null +++ b/Documentation/filesystems/ext4/ondisk/overview.rst @@ -0,0 +1,26 @@ +.. SPDX-License-Identifier: GPL-2.0 + +High Level Design +================= + +An ext4 file system is split into a series of block groups. To reduce +performance difficulties due to fragmentation, the block allocator tries +very hard to keep each file's blocks within the same group, thereby +reducing seek times. The size of a block group is specified in +``sb.s_blocks_per_group`` blocks, though it can also calculated as 8 \* +``block_size_in_bytes``. With the default block size of 4KiB, each group +will contain 32,768 blocks, for a length of 128MiB. The number of block +groups is the size of the device divided by the size of a block group. + +All fields in ext4 are written to disk in little-endian order. HOWEVER, +all fields in jbd2 (the journal) are written to disk in big-endian +order. + +.. include:: blocks.rst +.. include:: blockgroup.rst +.. include:: special_inodes.rst +.. include:: allocators.rst +.. include:: checksums.rst +.. include:: bigalloc.rst +.. include:: inlinedata.rst +.. include:: eainode.rst diff --git a/Documentation/filesystems/ext4/ondisk/special_inodes.rst b/Documentation/filesystems/ext4/ondisk/special_inodes.rst new file mode 100644 index 000000000000..a82f70c9baeb --- /dev/null +++ b/Documentation/filesystems/ext4/ondisk/special_inodes.rst @@ -0,0 +1,38 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Special inodes +-------------- + +ext4 reserves some inode for special features, as follows: + +.. list-table:: + :widths: 1 79 + :header-rows: 1 + + * - inode Number + - Purpose + * - 0 + - Doesn't exist; there is no inode 0. + * - 1 + - List of defective blocks. + * - 2 + - Root directory. + * - 3 + - User quota. + * - 4 + - Group quota. + * - 5 + - Boot loader. + * - 6 + - Undelete directory. + * - 7 + - Reserved group descriptors inode. (“resize inode”) + * - 8 + - Journal inode. + * - 9 + - The “exclude” inode, for snapshots(?) + * - 10 + - Replica inode, used for some non-upstream feature? + * - 11 + - Traditional first non-reserved inode. Usually this is the lost+found directory. See s\_first\_ino in the superblock. + diff --git a/Documentation/filesystems/ext4/ondisk/super.rst b/Documentation/filesystems/ext4/ondisk/super.rst new file mode 100644 index 000000000000..5f81dd87e0b9 --- /dev/null +++ b/Documentation/filesystems/ext4/ondisk/super.rst @@ -0,0 +1,801 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Super Block +----------- + +The superblock records various information about the enclosing +filesystem, such as block counts, inode counts, supported features, +maintenance information, and more. + +If the sparse\_super feature flag is set, redundant copies of the +superblock and group descriptors are kept only in the groups whose group +number is either 0 or a power of 3, 5, or 7. If the flag is not set, +redundant copies are kept in all groups. + +The superblock checksum is calculated against the superblock structure, +which includes the FS UUID. + +The ext4 superblock is laid out as follows in +``struct ext4_super_block``: + +.. list-table:: + :widths: 1 1 1 77 + :header-rows: 1 + + * - Offset + - Size + - Name + - Description + * - 0x0 + - \_\_le32 + - s\_inodes\_count + - Total inode count. + * - 0x4 + - \_\_le32 + - s\_blocks\_count\_lo + - Total block count. + * - 0x8 + - \_\_le32 + - s\_r\_blocks\_count\_lo + - This number of blocks can only be allocated by the super-user. + * - 0xC + - \_\_le32 + - s\_free\_blocks\_count\_lo + - Free block count. + * - 0x10 + - \_\_le32 + - s\_free\_inodes\_count + - Free inode count. + * - 0x14 + - \_\_le32 + - s\_first\_data\_block + - First data block. This must be at least 1 for 1k-block filesystems and + is typically 0 for all other block sizes. + * - 0x18 + - \_\_le32 + - s\_log\_block\_size + - Block size is 2 ^ (10 + s\_log\_block\_size). + * - 0x1C + - \_\_le32 + - s\_log\_cluster\_size + - Cluster size is (2 ^ s\_log\_cluster\_size) blocks if bigalloc is + enabled. Otherwise s\_log\_cluster\_size must equal s\_log\_block\_size. + * - 0x20 + - \_\_le32 + - s\_blocks\_per\_group + - Blocks per group. + * - 0x24 + - \_\_le32 + - s\_clusters\_per\_group + - Clusters per group, if bigalloc is enabled. Otherwise + s\_clusters\_per\_group must equal s\_blocks\_per\_group. + * - 0x28 + - \_\_le32 + - s\_inodes\_per\_group + - Inodes per group. + * - 0x2C + - \_\_le32 + - s\_mtime + - Mount time, in seconds since the epoch. + * - 0x30 + - \_\_le32 + - s\_wtime + - Write time, in seconds since the epoch. + * - 0x34 + - \_\_le16 + - s\_mnt\_count + - Number of mounts since the last fsck. + * - 0x36 + - \_\_le16 + - s\_max\_mnt\_count + - Number of mounts beyond which a fsck is needed. + * - 0x38 + - \_\_le16 + - s\_magic + - Magic signature, 0xEF53 + * - 0x3A + - \_\_le16 + - s\_state + - File system state. See super_state_ for more info. + * - 0x3C + - \_\_le16 + - s\_errors + - Behaviour when detecting errors. See super_errors_ for more info. + * - 0x3E + - \_\_le16 + - s\_minor\_rev\_level + - Minor revision level. + * - 0x40 + - \_\_le32 + - s\_lastcheck + - Time of last check, in seconds since the epoch. + * - 0x44 + - \_\_le32 + - s\_checkinterval + - Maximum time between checks, in seconds. + * - 0x48 + - \_\_le32 + - s\_creator\_os + - Creator OS. See the table super_creator_ for more info. + * - 0x4C + - \_\_le32 + - s\_rev\_level + - Revision level. See the table super_revision_ for more info. + * - 0x50 + - \_\_le16 + - s\_def\_resuid + - Default uid for reserved blocks. + * - 0x52 + - \_\_le16 + - s\_def\_resgid + - Default gid for reserved blocks. + * - + - + - + - These fields are for EXT4_DYNAMIC_REV superblocks only. + + Note: the difference between the compatible feature set and the + incompatible feature set is that if there is a bit set in the + incompatible feature set that the kernel doesn't know about, it should + refuse to mount the filesystem. + + e2fsck's requirements are more strict; if it doesn't know + about a feature in either the compatible or incompatible feature set, it + must abort and not try to meddle with things it doesn't understand... + * - 0x54 + - \_\_le32 + - s\_first\_ino + - First non-reserved inode. + * - 0x58 + - \_\_le16 + - s\_inode\_size + - Size of inode structure, in bytes. + * - 0x5A + - \_\_le16 + - s\_block\_group\_nr + - Block group # of this superblock. + * - 0x5C + - \_\_le32 + - s\_feature\_compat + - Compatible feature set flags. Kernel can still read/write this fs even + if it doesn't understand a flag; fsck should not do that. See the + super_compat_ table for more info. + * - 0x60 + - \_\_le32 + - s\_feature\_incompat + - Incompatible feature set. If the kernel or fsck doesn't understand one + of these bits, it should stop. See the super_incompat_ table for more + info. + * - 0x64 + - \_\_le32 + - s\_feature\_ro\_compat + - Readonly-compatible feature set. If the kernel doesn't understand one of + these bits, it can still mount read-only. See the super_rocompat_ table + for more info. + * - 0x68 + - \_\_u8 + - s\_uuid[16] + - 128-bit UUID for volume. + * - 0x78 + - char + - s\_volume\_name[16] + - Volume label. + * - 0x88 + - char + - s\_last\_mounted[64] + - Directory where filesystem was last mounted. + * - 0xC8 + - \_\_le32 + - s\_algorithm\_usage\_bitmap + - For compression (Not used in e2fsprogs/Linux) + * - + - + - + - Performance hints. Directory preallocation should only happen if the + EXT4_FEATURE_COMPAT_DIR_PREALLOC flag is on. + * - 0xCC + - \_\_u8 + - s\_prealloc\_blocks + - #. of blocks to try to preallocate for ... files? (Not used in + e2fsprogs/Linux) + * - 0xCD + - \_\_u8 + - s\_prealloc\_dir\_blocks + - #. of blocks to preallocate for directories. (Not used in + e2fsprogs/Linux) + * - 0xCE + - \_\_le16 + - s\_reserved\_gdt\_blocks + - Number of reserved GDT entries for future filesystem expansion. + * - + - + - + - Journalling support is valid only if EXT4_FEATURE_COMPAT_HAS_JOURNAL is + set. + * - 0xD0 + - \_\_u8 + - s\_journal\_uuid[16] + - UUID of journal superblock + * - 0xE0 + - \_\_le32 + - s\_journal\_inum + - inode number of journal file. + * - 0xE4 + - \_\_le32 + - s\_journal\_dev + - Device number of journal file, if the external journal feature flag is + set. + * - 0xE8 + - \_\_le32 + - s\_last\_orphan + - Start of list of orphaned inodes to delete. + * - 0xEC + - \_\_le32 + - s\_hash\_seed[4] + - HTREE hash seed. + * - 0xFC + - \_\_u8 + - s\_def\_hash\_version + - Default hash algorithm to use for directory hashes. See super_def_hash_ + for more info. + * - 0xFD + - \_\_u8 + - s\_jnl\_backup\_type + - If this value is 0 or EXT3\_JNL\_BACKUP\_BLOCKS (1), then the + ``s_jnl_blocks`` field contains a duplicate copy of the inode's + ``i_block[]`` array and ``i_size``. + * - 0xFE + - \_\_le16 + - s\_desc\_size + - Size of group descriptors, in bytes, if the 64bit incompat feature flag + is set. + * - 0x100 + - \_\_le32 + - s\_default\_mount\_opts + - Default mount options. See the super_mountopts_ table for more info. + * - 0x104 + - \_\_le32 + - s\_first\_meta\_bg + - First metablock block group, if the meta\_bg feature is enabled. + * - 0x108 + - \_\_le32 + - s\_mkfs\_time + - When the filesystem was created, in seconds since the epoch. + * - 0x10C + - \_\_le32 + - s\_jnl\_blocks[17] + - Backup copy of the journal inode's ``i_block[]`` array in the first 15 + elements and i\_size\_high and i\_size in the 16th and 17th elements, + respectively. + * - + - + - + - 64bit support is valid only if EXT4_FEATURE_COMPAT_64BIT is set. + * - 0x150 + - \_\_le32 + - s\_blocks\_count\_hi + - High 32-bits of the block count. + * - 0x154 + - \_\_le32 + - s\_r\_blocks\_count\_hi + - High 32-bits of the reserved block count. + * - 0x158 + - \_\_le32 + - s\_free\_blocks\_count\_hi + - High 32-bits of the free block count. + * - 0x15C + - \_\_le16 + - s\_min\_extra\_isize + - All inodes have at least # bytes. + * - 0x15E + - \_\_le16 + - s\_want\_extra\_isize + - New inodes should reserve # bytes. + * - 0x160 + - \_\_le32 + - s\_flags + - Miscellaneous flags. See the super_flags_ table for more info. + * - 0x164 + - \_\_le16 + - s\_raid\_stride + - RAID stride. This is the number of logical blocks read from or written + to the disk before moving to the next disk. This affects the placement + of filesystem metadata, which will hopefully make RAID storage faster. + * - 0x166 + - \_\_le16 + - s\_mmp\_interval + - #. seconds to wait in multi-mount prevention (MMP) checking. In theory, + MMP is a mechanism to record in the superblock which host and device + have mounted the filesystem, in order to prevent multiple mounts. This + feature does not seem to be implemented... + * - 0x168 + - \_\_le64 + - s\_mmp\_block + - Block # for multi-mount protection data. + * - 0x170 + - \_\_le32 + - s\_raid\_stripe\_width + - RAID stripe width. This is the number of logical blocks read from or + written to the disk before coming back to the current disk. This is used + by the block allocator to try to reduce the number of read-modify-write + operations in a RAID5/6. + * - 0x174 + - \_\_u8 + - s\_log\_groups\_per\_flex + - Size of a flexible block group is 2 ^ ``s_log_groups_per_flex``. + * - 0x175 + - \_\_u8 + - s\_checksum\_type + - Metadata checksum algorithm type. The only valid value is 1 (crc32c). + * - 0x176 + - \_\_le16 + - s\_reserved\_pad + - + * - 0x178 + - \_\_le64 + - s\_kbytes\_written + - Number of KiB written to this filesystem over its lifetime. + * - 0x180 + - \_\_le32 + - s\_snapshot\_inum + - inode number of active snapshot. (Not used in e2fsprogs/Linux.) + * - 0x184 + - \_\_le32 + - s\_snapshot\_id + - Sequential ID of active snapshot. (Not used in e2fsprogs/Linux.) + * - 0x188 + - \_\_le64 + - s\_snapshot\_r\_blocks\_count + - Number of blocks reserved for active snapshot's future use. (Not used in + e2fsprogs/Linux.) + * - 0x190 + - \_\_le32 + - s\_snapshot\_list + - inode number of the head of the on-disk snapshot list. (Not used in + e2fsprogs/Linux.) + * - 0x194 + - \_\_le32 + - s\_error\_count + - Number of errors seen. + * - 0x198 + - \_\_le32 + - s\_first\_error\_time + - First time an error happened, in seconds since the epoch. + * - 0x19C + - \_\_le32 + - s\_first\_error\_ino + - inode involved in first error. + * - 0x1A0 + - \_\_le64 + - s\_first\_error\_block + - Number of block involved of first error. + * - 0x1A8 + - \_\_u8 + - s\_first\_error\_func[32] + - Name of function where the error happened. + * - 0x1C8 + - \_\_le32 + - s\_first\_error\_line + - Line number where error happened. + * - 0x1CC + - \_\_le32 + - s\_last\_error\_time + - Time of most recent error, in seconds since the epoch. + * - 0x1D0 + - \_\_le32 + - s\_last\_error\_ino + - inode involved in most recent error. + * - 0x1D4 + - \_\_le32 + - s\_last\_error\_line + - Line number where most recent error happened. + * - 0x1D8 + - \_\_le64 + - s\_last\_error\_block + - Number of block involved in most recent error. + * - 0x1E0 + - \_\_u8 + - s\_last\_error\_func[32] + - Name of function where the most recent error happened. + * - 0x200 + - \_\_u8 + - s\_mount\_opts[64] + - ASCIIZ string of mount options. + * - 0x240 + - \_\_le32 + - s\_usr\_quota\_inum + - Inode number of user `quota <quota>`__ file. + * - 0x244 + - \_\_le32 + - s\_grp\_quota\_inum + - Inode number of group `quota <quota>`__ file. + * - 0x248 + - \_\_le32 + - s\_overhead\_blocks + - Overhead blocks/clusters in fs. (Huh? This field is always zero, which + means that the kernel calculates it dynamically.) + * - 0x24C + - \_\_le32 + - s\_backup\_bgs[2] + - Block groups containing superblock backups (if sparse\_super2) + * - 0x254 + - \_\_u8 + - s\_encrypt\_algos[4] + - Encryption algorithms in use. There can be up to four algorithms in use + at any time; valid algorithm codes are given in the super_encrypt_ table + below. + * - 0x258 + - \_\_u8 + - s\_encrypt\_pw\_salt[16] + - Salt for the string2key algorithm for encryption. + * - 0x268 + - \_\_le32 + - s\_lpf\_ino + - Inode number of lost+found + * - 0x26C + - \_\_le32 + - s\_prj\_quota\_inum + - Inode that tracks project quotas. + * - 0x270 + - \_\_le32 + - s\_checksum\_seed + - Checksum seed used for metadata\_csum calculations. This value is + crc32c(~0, $orig\_fs\_uuid). + * - 0x274 + - \_\_u8 + - s\_wtime_hi + - Upper 8 bits of the s_wtime field. + * - 0x275 + - \_\_u8 + - s\_wtime_hi + - Upper 8 bits of the s_mtime field. + * - 0x276 + - \_\_u8 + - s\_mkfs_time_hi + - Upper 8 bits of the s_mkfs_time field. + * - 0x277 + - \_\_u8 + - s\_lastcheck_hi + - Upper 8 bits of the s_lastcheck_hi field. + * - 0x278 + - \_\_u8 + - s\_first_error_time_hi + - Upper 8 bits of the s_first_error_time_hi field. + * - 0x279 + - \_\_u8 + - s\_last_error_time_hi + - Upper 8 bits of the s_last_error_time_hi field. + * - 0x27A + - \_\_u8[2] + - s\_pad + - Zero padding. + * - 0x27C + - \_\_le32 + - s\_reserved[96] + - Padding to the end of the block. + * - 0x3FC + - \_\_le32 + - s\_checksum + - Superblock checksum. + +.. _super_state: + +The superblock state is some combination of the following: + +.. list-table:: + :widths: 1 79 + :header-rows: 1 + + * - Value + - Description + * - 0x0001 + - Cleanly umounted + * - 0x0002 + - Errors detected + * - 0x0004 + - Orphans being recovered + +.. _super_errors: + +The superblock error policy is one of the following: + +.. list-table:: + :widths: 1 79 + :header-rows: 1 + + * - Value + - Description + * - 1 + - Continue + * - 2 + - Remount read-only + * - 3 + - Panic + +.. _super_creator: + +The filesystem creator is one of the following: + +.. list-table:: + :widths: 1 79 + :header-rows: 1 + + * - Value + - Description + * - 0 + - Linux + * - 1 + - Hurd + * - 2 + - Masix + * - 3 + - FreeBSD + * - 4 + - Lites + +.. _super_revision: + +The superblock revision is one of the following: + +.. list-table:: + :widths: 1 79 + :header-rows: 1 + + * - Value + - Description + * - 0 + - Original format + * - 1 + - v2 format w/ dynamic inode sizes + +Note that ``EXT4_DYNAMIC_REV`` refers to a revision 1 or newer filesystem. + +.. _super_compat: + +The superblock compatible features field is a combination of any of the +following: + +.. list-table:: + :widths: 1 79 + :header-rows: 1 + + * - Value + - Description + * - 0x1 + - Directory preallocation (COMPAT\_DIR\_PREALLOC). + * - 0x2 + - “imagic inodes”. Not clear from the code what this does + (COMPAT\_IMAGIC\_INODES). + * - 0x4 + - Has a journal (COMPAT\_HAS\_JOURNAL). + * - 0x8 + - Supports extended attributes (COMPAT\_EXT\_ATTR). + * - 0x10 + - Has reserved GDT blocks for filesystem expansion + (COMPAT\_RESIZE\_INODE). Requires RO\_COMPAT\_SPARSE\_SUPER. + * - 0x20 + - Has directory indices (COMPAT\_DIR\_INDEX). + * - 0x40 + - “Lazy BG”. Not in Linux kernel, seems to have been for uninitialized + block groups? (COMPAT\_LAZY\_BG) + * - 0x80 + - “Exclude inode”. Not used. (COMPAT\_EXCLUDE\_INODE). + * - 0x100 + - “Exclude bitmap”. Seems to be used to indicate the presence of + snapshot-related exclude bitmaps? Not defined in kernel or used in + e2fsprogs (COMPAT\_EXCLUDE\_BITMAP). + * - 0x200 + - Sparse Super Block, v2. If this flag is set, the SB field s\_backup\_bgs + points to the two block groups that contain backup superblocks + (COMPAT\_SPARSE\_SUPER2). + +.. _super_incompat: + +The superblock incompatible features field is a combination of any of the +following: + +.. list-table:: + :widths: 1 79 + :header-rows: 1 + + * - Value + - Description + * - 0x1 + - Compression (INCOMPAT\_COMPRESSION). + * - 0x2 + - Directory entries record the file type. See ext4\_dir\_entry\_2 below + (INCOMPAT\_FILETYPE). + * - 0x4 + - Filesystem needs recovery (INCOMPAT\_RECOVER). + * - 0x8 + - Filesystem has a separate journal device (INCOMPAT\_JOURNAL\_DEV). + * - 0x10 + - Meta block groups. See the earlier discussion of this feature + (INCOMPAT\_META\_BG). + * - 0x40 + - Files in this filesystem use extents (INCOMPAT\_EXTENTS). + * - 0x80 + - Enable a filesystem size of 2^64 blocks (INCOMPAT\_64BIT). + * - 0x100 + - Multiple mount protection. Not implemented (INCOMPAT\_MMP). + * - 0x200 + - Flexible block groups. See the earlier discussion of this feature + (INCOMPAT\_FLEX\_BG). + * - 0x400 + - Inodes can be used to store large extended attribute values + (INCOMPAT\_EA\_INODE). + * - 0x1000 + - Data in directory entry (INCOMPAT\_DIRDATA). (Not implemented?) + * - 0x2000 + - Metadata checksum seed is stored in the superblock. This feature enables + the administrator to change the UUID of a metadata\_csum filesystem + while the filesystem is mounted; without it, the checksum definition + requires all metadata blocks to be rewritten (INCOMPAT\_CSUM\_SEED). + * - 0x4000 + - Large directory >2GB or 3-level htree (INCOMPAT\_LARGEDIR). Prior to + this feature, directories could not be larger than 4GiB and could not + have an htree more than 2 levels deep. If this feature is enabled, + directories can be larger than 4GiB and have a maximum htree depth of 3. + * - 0x8000 + - Data in inode (INCOMPAT\_INLINE\_DATA). + * - 0x10000 + - Encrypted inodes are present on the filesystem. (INCOMPAT\_ENCRYPT). + +.. _super_rocompat: + +The superblock read-only compatible features field is a combination of any of +the following: + +.. list-table:: + :widths: 1 79 + :header-rows: 1 + + * - Value + - Description + * - 0x1 + - Sparse superblocks. See the earlier discussion of this feature + (RO\_COMPAT\_SPARSE\_SUPER). + * - 0x2 + - This filesystem has been used to store a file greater than 2GiB + (RO\_COMPAT\_LARGE\_FILE). + * - 0x4 + - Not used in kernel or e2fsprogs (RO\_COMPAT\_BTREE\_DIR). + * - 0x8 + - This filesystem has files whose sizes are represented in units of + logical blocks, not 512-byte sectors. This implies a very large file + indeed! (RO\_COMPAT\_HUGE\_FILE) + * - 0x10 + - Group descriptors have checksums. In addition to detecting corruption, + this is useful for lazy formatting with uninitialized groups + (RO\_COMPAT\_GDT\_CSUM). + * - 0x20 + - Indicates that the old ext3 32,000 subdirectory limit no longer applies + (RO\_COMPAT\_DIR\_NLINK). A directory's i\_links\_count will be set to 1 + if it is incremented past 64,999. + * - 0x40 + - Indicates that large inodes exist on this filesystem + (RO\_COMPAT\_EXTRA\_ISIZE). + * - 0x80 + - This filesystem has a snapshot (RO\_COMPAT\_HAS\_SNAPSHOT). + * - 0x100 + - `Quota <Quota>`__ (RO\_COMPAT\_QUOTA). + * - 0x200 + - This filesystem supports “bigalloc”, which means that file extents are + tracked in units of clusters (of blocks) instead of blocks + (RO\_COMPAT\_BIGALLOC). + * - 0x400 + - This filesystem supports metadata checksumming. + (RO\_COMPAT\_METADATA\_CSUM; implies RO\_COMPAT\_GDT\_CSUM, though + GDT\_CSUM must not be set) + * - 0x800 + - Filesystem supports replicas. This feature is neither in the kernel nor + e2fsprogs. (RO\_COMPAT\_REPLICA) + * - 0x1000 + - Read-only filesystem image; the kernel will not mount this image + read-write and most tools will refuse to write to the image. + (RO\_COMPAT\_READONLY) + * - 0x2000 + - Filesystem tracks project quotas. (RO\_COMPAT\_PROJECT) + +.. _super_def_hash: + +The ``s_def_hash_version`` field is one of the following: + +.. list-table:: + :widths: 1 79 + :header-rows: 1 + + * - Value + - Description + * - 0x0 + - Legacy. + * - 0x1 + - Half MD4. + * - 0x2 + - Tea. + * - 0x3 + - Legacy, unsigned. + * - 0x4 + - Half MD4, unsigned. + * - 0x5 + - Tea, unsigned. + +.. _super_mountopts: + +The ``s_default_mount_opts`` field is any combination of the following: + +.. list-table:: + :widths: 1 79 + :header-rows: 1 + + * - Value + - Description + * - 0x0001 + - Print debugging info upon (re)mount. (EXT4\_DEFM\_DEBUG) + * - 0x0002 + - New files take the gid of the containing directory (instead of the fsgid + of the current process). (EXT4\_DEFM\_BSDGROUPS) + * - 0x0004 + - Support userspace-provided extended attributes. (EXT4\_DEFM\_XATTR\_USER) + * - 0x0008 + - Support POSIX access control lists (ACLs). (EXT4\_DEFM\_ACL) + * - 0x0010 + - Do not support 32-bit UIDs. (EXT4\_DEFM\_UID16) + * - 0x0020 + - All data and metadata are commited to the journal. + (EXT4\_DEFM\_JMODE\_DATA) + * - 0x0040 + - All data are flushed to the disk before metadata are committed to the + journal. (EXT4\_DEFM\_JMODE\_ORDERED) + * - 0x0060 + - Data ordering is not preserved; data may be written after the metadata + has been written. (EXT4\_DEFM\_JMODE\_WBACK) + * - 0x0100 + - Disable write flushes. (EXT4\_DEFM\_NOBARRIER) + * - 0x0200 + - Track which blocks in a filesystem are metadata and therefore should not + be used as data blocks. This option will be enabled by default on 3.18, + hopefully. (EXT4\_DEFM\_BLOCK\_VALIDITY) + * - 0x0400 + - Enable DISCARD support, where the storage device is told about blocks + becoming unused. (EXT4\_DEFM\_DISCARD) + * - 0x0800 + - Disable delayed allocation. (EXT4\_DEFM\_NODELALLOC) + +.. _super_flags: + +The ``s_flags`` field is any combination of the following: + +.. list-table:: + :widths: 1 79 + :header-rows: 1 + + * - Value + - Description + * - 0x0001 + - Signed directory hash in use. + * - 0x0002 + - Unsigned directory hash in use. + * - 0x0004 + - To test development code. + +.. _super_encrypt: + +The ``s_encrypt_algos`` list can contain any of the following: + +.. list-table:: + :widths: 1 79 + :header-rows: 1 + + * - Value + - Description + * - 0 + - Invalid algorithm (ENCRYPTION\_MODE\_INVALID). + * - 1 + - 256-bit AES in XTS mode (ENCRYPTION\_MODE\_AES\_256\_XTS). + * - 2 + - 256-bit AES in GCM mode (ENCRYPTION\_MODE\_AES\_256\_GCM). + * - 3 + - 256-bit AES in CBC mode (ENCRYPTION\_MODE\_AES\_256\_CBC). + +Total size of the superblock is 1024 bytes. diff --git a/Documentation/index.rst b/Documentation/index.rst index fdc585703498..f95ba981f8cd 100644 --- a/Documentation/index.rst +++ b/Documentation/index.rst @@ -102,6 +102,17 @@ implementation. sh/index +Filesystem Documentation +------------------------ + +The documentation in this section are provided by specific filesystem +subprojects. + +.. toctree:: + :maxdepth: 2 + + filesystems/ext4/index + Korean translations ------------------- @@ -566,7 +566,8 @@ struct page *dax_layout_busy_page(struct address_space *mapping) if (index >= end) break; - if (!radix_tree_exceptional_entry(pvec_ent)) + if (WARN_ON_ONCE( + !radix_tree_exceptional_entry(pvec_ent))) continue; xa_lock_irq(&mapping->i_pages); @@ -578,6 +579,13 @@ struct page *dax_layout_busy_page(struct address_space *mapping) if (page) break; } + + /* + * We don't expect normal struct page entries to exist in our + * tree, but we keep these pagevec calls so that this code is + * consistent with the common pattern for handling pagevecs + * throughout the kernel. + */ pagevec_remove_exceptionals(&pvec); pagevec_release(&pvec); index++; diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index aa52d87985aa..e5d6ee61ff48 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -426,9 +426,9 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group) } bh = sb_getblk(sb, bitmap_blk); if (unlikely(!bh)) { - ext4_error(sb, "Cannot get buffer for block bitmap - " - "block_group = %u, block_bitmap = %llu", - block_group, bitmap_blk); + ext4_warning(sb, "Cannot get buffer for block bitmap - " + "block_group = %u, block_bitmap = %llu", + block_group, bitmap_blk); return ERR_PTR(-ENOMEM); } diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 7c7123f265c2..1fc013f3d944 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -789,17 +789,16 @@ struct move_extent { * affected filesystem before 2242. */ -static inline __le32 ext4_encode_extra_time(struct timespec *time) +static inline __le32 ext4_encode_extra_time(struct timespec64 *time) { - u32 extra = sizeof(time->tv_sec) > 4 ? - ((time->tv_sec - (s32)time->tv_sec) >> 32) & EXT4_EPOCH_MASK : 0; + u32 extra =((time->tv_sec - (s32)time->tv_sec) >> 32) & EXT4_EPOCH_MASK; return cpu_to_le32(extra | (time->tv_nsec << EXT4_EPOCH_BITS)); } -static inline void ext4_decode_extra_time(struct timespec *time, __le32 extra) +static inline void ext4_decode_extra_time(struct timespec64 *time, + __le32 extra) { - if (unlikely(sizeof(time->tv_sec) > 4 && - (extra & cpu_to_le32(EXT4_EPOCH_MASK)))) { + if (unlikely(extra & cpu_to_le32(EXT4_EPOCH_MASK))) { #if 1 /* Handle legacy encoding of pre-1970 dates with epoch @@ -821,9 +820,8 @@ static inline void ext4_decode_extra_time(struct timespec *time, __le32 extra) do { \ (raw_inode)->xtime = cpu_to_le32((inode)->xtime.tv_sec); \ if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) {\ - struct timespec ts = timespec64_to_timespec((inode)->xtime); \ (raw_inode)->xtime ## _extra = \ - ext4_encode_extra_time(&ts); \ + ext4_encode_extra_time(&(inode)->xtime); \ } \ } while (0) @@ -840,10 +838,8 @@ do { \ do { \ (inode)->xtime.tv_sec = (signed)le32_to_cpu((raw_inode)->xtime); \ if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) { \ - struct timespec ts = timespec64_to_timespec((inode)->xtime); \ - ext4_decode_extra_time(&ts, \ + ext4_decode_extra_time(&(inode)->xtime, \ raw_inode->xtime ## _extra); \ - (inode)->xtime = timespec_to_timespec64(ts); \ } \ else \ (inode)->xtime.tv_nsec = 0; \ @@ -993,9 +989,9 @@ struct ext4_inode_info { /* * File creation time. Its function is same as that of - * struct timespec i_{a,c,m}time in the generic inode. + * struct timespec64 i_{a,c,m}time in the generic inode. */ - struct timespec i_crtime; + struct timespec64 i_crtime; /* mballoc */ struct list_head i_prealloc_list; @@ -1299,7 +1295,14 @@ struct ext4_super_block { __le32 s_lpf_ino; /* Location of the lost+found inode */ __le32 s_prj_quota_inum; /* inode for tracking project quota */ __le32 s_checksum_seed; /* crc32c(uuid) if csum_seed set */ - __le32 s_reserved[98]; /* Padding to the end of the block */ + __u8 s_wtime_hi; + __u8 s_mtime_hi; + __u8 s_mkfs_time_hi; + __u8 s_lastcheck_hi; + __u8 s_first_error_time_hi; + __u8 s_last_error_time_hi; + __u8 s_pad[2]; + __le32 s_reserved[96]; /* Padding to the end of the block */ __le32 s_checksum; /* crc32c(superblock) */ }; @@ -2456,6 +2459,7 @@ extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *); extern int ext4_inode_attach_jinode(struct inode *inode); extern int ext4_can_truncate(struct inode *inode); extern int ext4_truncate(struct inode *); +extern int ext4_break_layouts(struct inode *); extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length); extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks); extern void ext4_set_inode_flags(struct inode *); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 8ce6fd5b10dd..72a361d5ef74 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4826,6 +4826,13 @@ static long ext4_zero_range(struct file *file, loff_t offset, * released from page cache. */ down_write(&EXT4_I(inode)->i_mmap_sem); + + ret = ext4_break_layouts(inode); + if (ret) { + up_write(&EXT4_I(inode)->i_mmap_sem); + goto out_mutex; + } + ret = ext4_update_disksize_before_punch(inode, offset, len); if (ret) { up_write(&EXT4_I(inode)->i_mmap_sem); @@ -5499,6 +5506,11 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) * page cache. */ down_write(&EXT4_I(inode)->i_mmap_sem); + + ret = ext4_break_layouts(inode); + if (ret) + goto out_mmap; + /* * Need to round down offset to be aligned with page size boundary * for page size > block size. @@ -5647,6 +5659,11 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) * page cache. */ down_write(&EXT4_I(inode)->i_mmap_sem); + + ret = ext4_break_layouts(inode); + if (ret) + goto out_mmap; + /* * Need to round down to align start offset to page size boundary * for page size > block size. diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index f336cbc6e932..2addcb8730e1 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -138,9 +138,9 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) } bh = sb_getblk(sb, bitmap_blk); if (unlikely(!bh)) { - ext4_error(sb, "Cannot read inode bitmap - " - "block_group = %u, inode_bitmap = %llu", - block_group, bitmap_blk); + ext4_warning(sb, "Cannot read inode bitmap - " + "block_group = %u, inode_bitmap = %llu", + block_group, bitmap_blk); return ERR_PTR(-ENOMEM); } if (bitmap_uptodate(bh)) @@ -1086,7 +1086,7 @@ got: /* This is the optimal IO size (for stat), not the fs block size */ inode->i_blocks = 0; inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); - ei->i_crtime = timespec64_to_timespec(inode->i_mtime); + ei->i_crtime = inode->i_mtime; memset(ei->i_data, 0, sizeof(ei->i_data)); ei->i_dir_start_lookup = 0; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 4efe77286ecd..8f6ad7667974 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -317,7 +317,7 @@ stop_handle: * (Well, we could do this if we need to, but heck - it works) */ ext4_orphan_del(handle, inode); - EXT4_I(inode)->i_dtime = get_seconds(); + EXT4_I(inode)->i_dtime = (__u32)ktime_get_real_seconds(); /* * One subtle ordering requirement: if anything has gone wrong @@ -4191,6 +4191,39 @@ int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset, return 0; } +static void ext4_wait_dax_page(struct ext4_inode_info *ei, bool *did_unlock) +{ + *did_unlock = true; + up_write(&ei->i_mmap_sem); + schedule(); + down_write(&ei->i_mmap_sem); +} + +int ext4_break_layouts(struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + struct page *page; + bool retry; + int error; + + if (WARN_ON_ONCE(!rwsem_is_locked(&ei->i_mmap_sem))) + return -EINVAL; + + do { + retry = false; + page = dax_layout_busy_page(inode->i_mapping); + if (!page) + return 0; + + error = ___wait_var_event(&page->_refcount, + atomic_read(&page->_refcount) == 1, + TASK_INTERRUPTIBLE, 0, 0, + ext4_wait_dax_page(ei, &retry)); + } while (error == 0 && retry); + + return error; +} + /* * ext4_punch_hole: punches a hole in a file by releasing the blocks * associated with the given offset and length @@ -4264,6 +4297,11 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) * page cache. */ down_write(&EXT4_I(inode)->i_mmap_sem); + + ret = ext4_break_layouts(inode); + if (ret) + goto out_dio; + first_block_offset = round_up(offset, sb->s_blocksize); last_block_offset = round_down((offset + length), sb->s_blocksize) - 1; @@ -4944,17 +4982,14 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) ret = -EFSCORRUPTED; goto bad_inode; } else if (!ext4_has_inline_data(inode)) { - if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { - if ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || - (S_ISLNK(inode->i_mode) && - !ext4_inode_is_fast_symlink(inode)))) - /* Validate extent which is part of inode */ + /* validate the block references in the inode */ + if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + (S_ISLNK(inode->i_mode) && + !ext4_inode_is_fast_symlink(inode))) { + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) ret = ext4_ext_check_inode(inode); - } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || - (S_ISLNK(inode->i_mode) && - !ext4_inode_is_fast_symlink(inode))) { - /* Validate block references which are part of inode */ - ret = ext4_ind_check_inode(inode); + else + ret = ext4_ind_check_inode(inode); } } if (ret) @@ -5553,6 +5588,14 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) ext4_wait_for_tail_page_commit(inode); } down_write(&EXT4_I(inode)->i_mmap_sem); + + rc = ext4_break_layouts(inode); + if (rc) { + up_write(&EXT4_I(inode)->i_mmap_sem); + error = rc; + goto err_out; + } + /* * Truncate pagecache after we've waited for commit * in data=journal mode to make pages freeable. diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index f7ab34088162..e29fce2fbf25 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -14,6 +14,7 @@ #include <linux/log2.h> #include <linux/module.h> #include <linux/slab.h> +#include <linux/nospec.h> #include <linux/backing-dev.h> #include <trace/events/ext4.h> @@ -2140,7 +2141,8 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) * This should tell if fe_len is exactly power of 2 */ if ((ac->ac_g_ex.fe_len & (~(1 << (i - 1)))) == 0) - ac->ac_2order = i - 1; + ac->ac_2order = array_index_nospec(i - 1, + sb->s_blocksize_bits + 2); } /* if stream allocation is enabled, use global goal */ @@ -3799,7 +3801,6 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, ext4_group_t group; ext4_grpblk_t bit; unsigned long long grp_blk_start; - int err = 0; int free = 0; BUG_ON(pa->pa_deleted == 0); @@ -3840,7 +3841,7 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, } atomic_add(free, &sbi->s_mb_discarded); - return err; + return 0; } static noinline_for_stack int diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index 638ad4743477..39b07c2d3384 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -147,7 +147,7 @@ static int kmmpd(void *data) mmp_block = le64_to_cpu(es->s_mmp_block); mmp = (struct mmp_struct *)(bh->b_data); - mmp->mmp_time = cpu_to_le64(get_seconds()); + mmp->mmp_time = cpu_to_le64(ktime_get_real_seconds()); /* * Start with the higher mmp_check_interval and reduce it if * the MMP block is being updated on time. @@ -165,7 +165,7 @@ static int kmmpd(void *data) seq = 1; mmp->mmp_seq = cpu_to_le32(seq); - mmp->mmp_time = cpu_to_le64(get_seconds()); + mmp->mmp_time = cpu_to_le64(ktime_get_real_seconds()); last_update_time = jiffies; retval = write_mmp_block(sb, bh); @@ -241,7 +241,7 @@ static int kmmpd(void *data) * Unmount seems to be clean. */ mmp->mmp_seq = cpu_to_le32(EXT4_MMP_SEQ_CLEAN); - mmp->mmp_time = cpu_to_le64(get_seconds()); + mmp->mmp_time = cpu_to_le64(ktime_get_real_seconds()); retval = write_mmp_block(sb, bh); diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 8e17efdcbf11..a409ff70d67b 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -134,9 +134,7 @@ mext_page_double_lock(struct inode *inode1, struct inode *inode2, mapping[0] = inode1->i_mapping; mapping[1] = inode2->i_mapping; } else { - pgoff_t tmp = index1; - index1 = index2; - index2 = tmp; + swap(index1, index2); mapping[0] = inode2->i_mapping; mapping[1] = inode1->i_mapping; } diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 2a4c25c4681d..116ff68c5bd4 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1398,6 +1398,7 @@ static struct buffer_head * ext4_find_entry (struct inode *dir, goto cleanup_and_exit; dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, " "falling back\n")); + ret = NULL; } nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb); if (!nblocks) { diff --git a/fs/ext4/super.c b/fs/ext4/super.c index b7f7922061be..f7750bc5b85a 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -312,6 +312,24 @@ void ext4_itable_unused_set(struct super_block *sb, bg->bg_itable_unused_hi = cpu_to_le16(count >> 16); } +static void __ext4_update_tstamp(__le32 *lo, __u8 *hi) +{ + time64_t now = ktime_get_real_seconds(); + + now = clamp_val(now, 0, (1ull << 40) - 1); + + *lo = cpu_to_le32(lower_32_bits(now)); + *hi = upper_32_bits(now); +} + +static time64_t __ext4_get_tstamp(__le32 *lo, __u8 *hi) +{ + return ((time64_t)(*hi) << 32) + le32_to_cpu(*lo); +} +#define ext4_update_tstamp(es, tstamp) \ + __ext4_update_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi) +#define ext4_get_tstamp(es, tstamp) \ + __ext4_get_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi) static void __save_error_info(struct super_block *sb, const char *func, unsigned int line) @@ -322,11 +340,12 @@ static void __save_error_info(struct super_block *sb, const char *func, if (bdev_read_only(sb->s_bdev)) return; es->s_state |= cpu_to_le16(EXT4_ERROR_FS); - es->s_last_error_time = cpu_to_le32(get_seconds()); + ext4_update_tstamp(es, s_last_error_time); strncpy(es->s_last_error_func, func, sizeof(es->s_last_error_func)); es->s_last_error_line = cpu_to_le32(line); if (!es->s_first_error_time) { es->s_first_error_time = es->s_last_error_time; + es->s_first_error_time_hi = es->s_last_error_time_hi; strncpy(es->s_first_error_func, func, sizeof(es->s_first_error_func)); es->s_first_error_line = cpu_to_le32(line); @@ -776,26 +795,26 @@ void ext4_mark_group_bitmap_corrupted(struct super_block *sb, struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_group_info *grp = ext4_get_group_info(sb, group); struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL); + int ret; - if ((flags & EXT4_GROUP_INFO_BBITMAP_CORRUPT) && - !EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) { - percpu_counter_sub(&sbi->s_freeclusters_counter, - grp->bb_free); - set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, - &grp->bb_state); + if (flags & EXT4_GROUP_INFO_BBITMAP_CORRUPT) { + ret = ext4_test_and_set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, + &grp->bb_state); + if (!ret) + percpu_counter_sub(&sbi->s_freeclusters_counter, + grp->bb_free); } - if ((flags & EXT4_GROUP_INFO_IBITMAP_CORRUPT) && - !EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { - if (gdp) { + if (flags & EXT4_GROUP_INFO_IBITMAP_CORRUPT) { + ret = ext4_test_and_set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, + &grp->bb_state); + if (!ret && gdp) { int count; count = ext4_free_inodes_count(sb, gdp); percpu_counter_sub(&sbi->s_freeinodes_counter, count); } - set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, - &grp->bb_state); } } @@ -2174,8 +2193,8 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, "warning: maximal mount count reached, " "running e2fsck is recommended"); else if (le32_to_cpu(es->s_checkinterval) && - (le32_to_cpu(es->s_lastcheck) + - le32_to_cpu(es->s_checkinterval) <= get_seconds())) + (ext4_get_tstamp(es, s_lastcheck) + + le32_to_cpu(es->s_checkinterval) <= ktime_get_real_seconds())) ext4_msg(sb, KERN_WARNING, "warning: checktime reached, " "running e2fsck is recommended"); @@ -2184,7 +2203,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, if (!(__s16) le16_to_cpu(es->s_max_mnt_count)) es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT); le16_add_cpu(&es->s_mnt_count, 1); - es->s_mtime = cpu_to_le32(get_seconds()); + ext4_update_tstamp(es, s_mtime); ext4_update_dynamic_rev(sb); if (sbi->s_journal) ext4_set_feature_journal_needs_recovery(sb); @@ -2875,8 +2894,9 @@ static void print_daily_error_info(struct timer_list *t) ext4_msg(sb, KERN_NOTICE, "error count since last fsck: %u", le32_to_cpu(es->s_error_count)); if (es->s_first_error_time) { - printk(KERN_NOTICE "EXT4-fs (%s): initial error at time %u: %.*s:%d", - sb->s_id, le32_to_cpu(es->s_first_error_time), + printk(KERN_NOTICE "EXT4-fs (%s): initial error at time %llu: %.*s:%d", + sb->s_id, + ext4_get_tstamp(es, s_first_error_time), (int) sizeof(es->s_first_error_func), es->s_first_error_func, le32_to_cpu(es->s_first_error_line)); @@ -2889,8 +2909,9 @@ static void print_daily_error_info(struct timer_list *t) printk(KERN_CONT "\n"); } if (es->s_last_error_time) { - printk(KERN_NOTICE "EXT4-fs (%s): last error at time %u: %.*s:%d", - sb->s_id, le32_to_cpu(es->s_last_error_time), + printk(KERN_NOTICE "EXT4-fs (%s): last error at time %llu: %.*s:%d", + sb->s_id, + ext4_get_tstamp(es, s_last_error_time), (int) sizeof(es->s_last_error_func), es->s_last_error_func, le32_to_cpu(es->s_last_error_line)); @@ -4813,7 +4834,7 @@ static int ext4_commit_super(struct super_block *sb, int sync) * to complain and force a full file system check. */ if (!(sb->s_flags & SB_RDONLY)) - es->s_wtime = cpu_to_le32(get_seconds()); + ext4_update_tstamp(es, s_wtime); if (sb->s_bdev->bd_part) es->s_kbytes_written = cpu_to_le64(EXT4_SB(sb)->s_kbytes_written + @@ -5080,6 +5101,9 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) #endif char *orig_data = kstrdup(data, GFP_KERNEL); + if (data && !orig_data) + return -ENOMEM; + /* Store the original options */ old_sb_flags = sb->s_flags; old_opts.s_mount_opt = sbi->s_mount_opt; @@ -5665,13 +5689,13 @@ static int ext4_enable_quotas(struct super_block *sb) DQUOT_USAGE_ENABLED | (quota_mopt[type] ? DQUOT_LIMITS_ENABLED : 0)); if (err) { - for (type--; type >= 0; type--) - dquot_quota_off(sb, type); - ext4_warning(sb, "Failed to enable quota tracking " "(type=%d, err=%d). Please run " "e2fsck to fix.", type, err); + for (type--; type >= 0; type--) + dquot_quota_off(sb, type); + return err; } } diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index f34da0bb8f17..e60cc5e89023 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -25,6 +25,8 @@ typedef enum { attr_reserved_clusters, attr_inode_readahead, attr_trigger_test_error, + attr_first_error_time, + attr_last_error_time, attr_feature, attr_pointer_ui, attr_pointer_atomic, @@ -182,8 +184,8 @@ EXT4_RW_ATTR_SBI_UI(warning_ratelimit_burst, s_warning_ratelimit_state.burst); EXT4_RW_ATTR_SBI_UI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval); EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst); EXT4_RO_ATTR_ES_UI(errors_count, s_error_count); -EXT4_RO_ATTR_ES_UI(first_error_time, s_first_error_time); -EXT4_RO_ATTR_ES_UI(last_error_time, s_last_error_time); +EXT4_ATTR(first_error_time, 0444, first_error_time); +EXT4_ATTR(last_error_time, 0444, last_error_time); static unsigned int old_bump_val = 128; EXT4_ATTR_PTR(max_writeback_mb_bump, 0444, pointer_ui, &old_bump_val); @@ -249,6 +251,15 @@ static void *calc_ptr(struct ext4_attr *a, struct ext4_sb_info *sbi) return NULL; } +static ssize_t __print_tstamp(char *buf, __le32 lo, __u8 hi) +{ + return snprintf(buf, PAGE_SIZE, "%lld", + ((time64_t)hi << 32) + le32_to_cpu(lo)); +} + +#define print_tstamp(buf, es, tstamp) \ + __print_tstamp(buf, (es)->tstamp, (es)->tstamp ## _hi) + static ssize_t ext4_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) { @@ -274,8 +285,12 @@ static ssize_t ext4_attr_show(struct kobject *kobj, case attr_pointer_ui: if (!ptr) return 0; - return snprintf(buf, PAGE_SIZE, "%u\n", - *((unsigned int *) ptr)); + if (a->attr_ptr == ptr_ext4_super_block_offset) + return snprintf(buf, PAGE_SIZE, "%u\n", + le32_to_cpup(ptr)); + else + return snprintf(buf, PAGE_SIZE, "%u\n", + *((unsigned int *) ptr)); case attr_pointer_atomic: if (!ptr) return 0; @@ -283,6 +298,10 @@ static ssize_t ext4_attr_show(struct kobject *kobj, atomic_read((atomic_t *) ptr)); case attr_feature: return snprintf(buf, PAGE_SIZE, "supported\n"); + case attr_first_error_time: + return print_tstamp(buf, sbi->s_es, s_first_error_time); + case attr_last_error_time: + return print_tstamp(buf, sbi->s_es, s_last_error_time); } return 0; @@ -308,7 +327,10 @@ static ssize_t ext4_attr_store(struct kobject *kobj, ret = kstrtoul(skip_spaces(buf), 0, &t); if (ret) return ret; - *((unsigned int *) ptr) = t; + if (a->attr_ptr == ptr_ext4_super_block_offset) + *((__le32 *) ptr) = cpu_to_le32(t); + else + *((unsigned int *) ptr) = t; return len; case attr_inode_readahead: return inode_readahead_blks_store(sbi, buf, len); diff --git a/fs/ext4/truncate.h b/fs/ext4/truncate.h index 0cb13badf473..bcbe3668c1d4 100644 --- a/fs/ext4/truncate.h +++ b/fs/ext4/truncate.h @@ -11,6 +11,10 @@ */ static inline void ext4_truncate_failed_write(struct inode *inode) { + /* + * We don't need to call ext4_break_layouts() because the blocks we + * are truncating were never visible to userspace. + */ down_write(&EXT4_I(inode)->i_mmap_sem); truncate_inode_pages(inode->i_mapping, inode->i_size); ext4_truncate(inode); diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 723df14f4084..f36fc5d5b257 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -190,6 +190,8 @@ ext4_xattr_check_entries(struct ext4_xattr_entry *entry, void *end, struct ext4_xattr_entry *next = EXT4_XATTR_NEXT(e); if ((void *)next >= end) return -EFSCORRUPTED; + if (strnlen(e->e_name, e->e_name_len) != e->e_name_len) + return -EFSCORRUPTED; e = next; } diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 8de0e7723316..150cc030b4d7 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -121,7 +121,7 @@ static int journal_submit_commit_record(journal_t *journal, struct commit_header *tmp; struct buffer_head *bh; int ret; - struct timespec64 now = current_kernel_time64(); + struct timespec64 now; *cbh = NULL; @@ -134,6 +134,7 @@ static int journal_submit_commit_record(journal_t *journal, return 1; tmp = (struct commit_header *)bh->b_data; + ktime_get_coarse_real_ts64(&now); tmp->h_commit_sec = cpu_to_be64(now.tv_sec); tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec); |