summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/admin-guide/device-mapper/index.rst2
-rw-r--r--Documentation/admin-guide/device-mapper/vdo-design.rst633
-rw-r--r--Documentation/admin-guide/device-mapper/vdo.rst406
-rw-r--r--MAINTAINERS8
-rw-r--r--drivers/md/Kconfig2
-rw-r--r--drivers/md/Makefile1
-rw-r--r--drivers/md/dm-vdo/Kconfig17
-rw-r--r--drivers/md/dm-vdo/Makefile57
-rw-r--r--drivers/md/dm-vdo/action-manager.c388
-rw-r--r--drivers/md/dm-vdo/action-manager.h110
-rw-r--r--drivers/md/dm-vdo/admin-state.c506
-rw-r--r--drivers/md/dm-vdo/admin-state.h178
-rw-r--r--drivers/md/dm-vdo/block-map.c3318
-rw-r--r--drivers/md/dm-vdo/block-map.h394
-rw-r--r--drivers/md/dm-vdo/completion.c140
-rw-r--r--drivers/md/dm-vdo/completion.h152
-rw-r--r--drivers/md/dm-vdo/constants.h96
-rw-r--r--drivers/md/dm-vdo/cpu.h59
-rw-r--r--drivers/md/dm-vdo/data-vio.c2063
-rw-r--r--drivers/md/dm-vdo/data-vio.h670
-rw-r--r--drivers/md/dm-vdo/dedupe.c3003
-rw-r--r--drivers/md/dm-vdo/dedupe.h120
-rw-r--r--drivers/md/dm-vdo/dm-vdo-target.c2910
-rw-r--r--drivers/md/dm-vdo/dump.c275
-rw-r--r--drivers/md/dm-vdo/dump.h17
-rw-r--r--drivers/md/dm-vdo/encodings.c1483
-rw-r--r--drivers/md/dm-vdo/encodings.h1298
-rw-r--r--drivers/md/dm-vdo/errors.c307
-rw-r--r--drivers/md/dm-vdo/errors.h73
-rw-r--r--drivers/md/dm-vdo/flush.c560
-rw-r--r--drivers/md/dm-vdo/flush.h44
-rw-r--r--drivers/md/dm-vdo/funnel-queue.c170
-rw-r--r--drivers/md/dm-vdo/funnel-queue.h110
-rw-r--r--drivers/md/dm-vdo/funnel-workqueue.c638
-rw-r--r--drivers/md/dm-vdo/funnel-workqueue.h51
-rw-r--r--drivers/md/dm-vdo/indexer/chapter-index.c293
-rw-r--r--drivers/md/dm-vdo/indexer/chapter-index.h61
-rw-r--r--drivers/md/dm-vdo/indexer/config.c376
-rw-r--r--drivers/md/dm-vdo/indexer/config.h124
-rw-r--r--drivers/md/dm-vdo/indexer/delta-index.c1970
-rw-r--r--drivers/md/dm-vdo/indexer/delta-index.h279
-rw-r--r--drivers/md/dm-vdo/indexer/funnel-requestqueue.c279
-rw-r--r--drivers/md/dm-vdo/indexer/funnel-requestqueue.h31
-rw-r--r--drivers/md/dm-vdo/indexer/geometry.c201
-rw-r--r--drivers/md/dm-vdo/indexer/geometry.h140
-rw-r--r--drivers/md/dm-vdo/indexer/hash-utils.h66
-rw-r--r--drivers/md/dm-vdo/indexer/index-layout.c1765
-rw-r--r--drivers/md/dm-vdo/indexer/index-layout.h43
-rw-r--r--drivers/md/dm-vdo/indexer/index-page-map.c173
-rw-r--r--drivers/md/dm-vdo/indexer/index-page-map.h50
-rw-r--r--drivers/md/dm-vdo/indexer/index-session.c739
-rw-r--r--drivers/md/dm-vdo/indexer/index-session.h85
-rw-r--r--drivers/md/dm-vdo/indexer/index.c1388
-rw-r--r--drivers/md/dm-vdo/indexer/index.h83
-rw-r--r--drivers/md/dm-vdo/indexer/indexer.h353
-rw-r--r--drivers/md/dm-vdo/indexer/io-factory.c415
-rw-r--r--drivers/md/dm-vdo/indexer/io-factory.h64
-rw-r--r--drivers/md/dm-vdo/indexer/open-chapter.c426
-rw-r--r--drivers/md/dm-vdo/indexer/open-chapter.h79
-rw-r--r--drivers/md/dm-vdo/indexer/radix-sort.c330
-rw-r--r--drivers/md/dm-vdo/indexer/radix-sort.h26
-rw-r--r--drivers/md/dm-vdo/indexer/sparse-cache.c624
-rw-r--r--drivers/md/dm-vdo/indexer/sparse-cache.h46
-rw-r--r--drivers/md/dm-vdo/indexer/volume-index.c1283
-rw-r--r--drivers/md/dm-vdo/indexer/volume-index.h193
-rw-r--r--drivers/md/dm-vdo/indexer/volume.c1693
-rw-r--r--drivers/md/dm-vdo/indexer/volume.h172
-rw-r--r--drivers/md/dm-vdo/int-map.c707
-rw-r--r--drivers/md/dm-vdo/int-map.h39
-rw-r--r--drivers/md/dm-vdo/io-submitter.c477
-rw-r--r--drivers/md/dm-vdo/io-submitter.h47
-rw-r--r--drivers/md/dm-vdo/logger.c239
-rw-r--r--drivers/md/dm-vdo/logger.h100
-rw-r--r--drivers/md/dm-vdo/logical-zone.c373
-rw-r--r--drivers/md/dm-vdo/logical-zone.h89
-rw-r--r--drivers/md/dm-vdo/memory-alloc.c438
-rw-r--r--drivers/md/dm-vdo/memory-alloc.h162
-rw-r--r--drivers/md/dm-vdo/message-stats.c432
-rw-r--r--drivers/md/dm-vdo/message-stats.h13
-rw-r--r--drivers/md/dm-vdo/murmurhash3.c175
-rw-r--r--drivers/md/dm-vdo/murmurhash3.h15
-rw-r--r--drivers/md/dm-vdo/numeric.h78
-rw-r--r--drivers/md/dm-vdo/packer.c780
-rw-r--r--drivers/md/dm-vdo/packer.h122
-rw-r--r--drivers/md/dm-vdo/permassert.c26
-rw-r--r--drivers/md/dm-vdo/permassert.h45
-rw-r--r--drivers/md/dm-vdo/physical-zone.c644
-rw-r--r--drivers/md/dm-vdo/physical-zone.h115
-rw-r--r--drivers/md/dm-vdo/priority-table.c224
-rw-r--r--drivers/md/dm-vdo/priority-table.h47
-rw-r--r--drivers/md/dm-vdo/recovery-journal.c1762
-rw-r--r--drivers/md/dm-vdo/recovery-journal.h316
-rw-r--r--drivers/md/dm-vdo/repair.c1756
-rw-r--r--drivers/md/dm-vdo/repair.h14
-rw-r--r--drivers/md/dm-vdo/slab-depot.c5101
-rw-r--r--drivers/md/dm-vdo/slab-depot.h601
-rw-r--r--drivers/md/dm-vdo/statistics.h278
-rw-r--r--drivers/md/dm-vdo/status-codes.c94
-rw-r--r--drivers/md/dm-vdo/status-codes.h86
-rw-r--r--drivers/md/dm-vdo/string-utils.c22
-rw-r--r--drivers/md/dm-vdo/string-utils.h23
-rw-r--r--drivers/md/dm-vdo/thread-device.c34
-rw-r--r--drivers/md/dm-vdo/thread-device.h20
-rw-r--r--drivers/md/dm-vdo/thread-registry.c93
-rw-r--r--drivers/md/dm-vdo/thread-registry.h32
-rw-r--r--drivers/md/dm-vdo/thread-utils.c108
-rw-r--r--drivers/md/dm-vdo/thread-utils.h20
-rw-r--r--drivers/md/dm-vdo/time-utils.h28
-rw-r--r--drivers/md/dm-vdo/types.h393
-rw-r--r--drivers/md/dm-vdo/vdo.c1730
-rw-r--r--drivers/md/dm-vdo/vdo.h362
-rw-r--r--drivers/md/dm-vdo/vio.c500
-rw-r--r--drivers/md/dm-vdo/vio.h199
-rw-r--r--drivers/md/dm-vdo/wait-queue.c205
-rw-r--r--drivers/md/dm-vdo/wait-queue.h138
115 files changed, 53411 insertions, 0 deletions
diff --git a/Documentation/admin-guide/device-mapper/index.rst b/Documentation/admin-guide/device-mapper/index.rst
index cde52cc09645..cc5aec861576 100644
--- a/Documentation/admin-guide/device-mapper/index.rst
+++ b/Documentation/admin-guide/device-mapper/index.rst
@@ -34,6 +34,8 @@ Device Mapper
switch
thin-provisioning
unstriped
+ vdo-design
+ vdo
verity
writecache
zero
diff --git a/Documentation/admin-guide/device-mapper/vdo-design.rst b/Documentation/admin-guide/device-mapper/vdo-design.rst
new file mode 100644
index 000000000000..3cd59decbec0
--- /dev/null
+++ b/Documentation/admin-guide/device-mapper/vdo-design.rst
@@ -0,0 +1,633 @@
+.. SPDX-License-Identifier: GPL-2.0-only
+
+================
+Design of dm-vdo
+================
+
+The dm-vdo (virtual data optimizer) target provides inline deduplication,
+compression, zero-block elimination, and thin provisioning. A dm-vdo target
+can be backed by up to 256TB of storage, and can present a logical size of
+up to 4PB. This target was originally developed at Permabit Technology
+Corp. starting in 2009. It was first released in 2013 and has been used in
+production environments ever since. It was made open-source in 2017 after
+Permabit was acquired by Red Hat. This document describes the design of
+dm-vdo. For usage, see vdo.rst in the same directory as this file.
+
+Because deduplication rates fall drastically as the block size increases, a
+vdo target has a maximum block size of 4K. However, it can achieve
+deduplication rates of 254:1, i.e. up to 254 copies of a given 4K block can
+reference a single 4K of actual storage. It can achieve compression rates
+of 14:1. All zero blocks consume no storage at all.
+
+Theory of Operation
+===================
+
+The design of dm-vdo is based on the idea that deduplication is a two-part
+problem. The first is to recognize duplicate data. The second is to avoid
+storing multiple copies of those duplicates. Therefore, dm-vdo has two main
+parts: a deduplication index (called UDS) that is used to discover
+duplicate data, and a data store with a reference counted block map that
+maps from logical block addresses to the actual storage location of the
+data.
+
+Zones and Threading
+-------------------
+
+Due to the complexity of data optimization, the number of metadata
+structures involved in a single write operation to a vdo target is larger
+than most other targets. Furthermore, because vdo must operate on small
+block sizes in order to achieve good deduplication rates, acceptable
+performance can only be achieved through parallelism. Therefore, vdo's
+design attempts to be lock-free.
+
+Most of a vdo's main data structures are designed to be easily divided into
+"zones" such that any given bio must only access a single zone of any zoned
+structure. Safety with minimal locking is achieved by ensuring that during
+normal operation, each zone is assigned to a specific thread, and only that
+thread will access the portion of the data structure in that zone.
+Associated with each thread is a work queue. Each bio is associated with a
+request object (the "data_vio") which will be added to a work queue when
+the next phase of its operation requires access to the structures in the
+zone associated with that queue.
+
+Another way of thinking about this arrangement is that the work queue for
+each zone has an implicit lock on the structures it manages for all its
+operations, because vdo guarantees that no other thread will alter those
+structures.
+
+Although each structure is divided into zones, this division is not
+reflected in the on-disk representation of each data structure. Therefore,
+the number of zones for each structure, and hence the number of threads,
+can be reconfigured each time a vdo target is started.
+
+The Deduplication Index
+-----------------------
+
+In order to identify duplicate data efficiently, vdo was designed to
+leverage some common characteristics of duplicate data. From empirical
+observations, we gathered two key insights. The first is that in most data
+sets with significant amounts of duplicate data, the duplicates tend to
+have temporal locality. When a duplicate appears, it is more likely that
+other duplicates will be detected, and that those duplicates will have been
+written at about the same time. This is why the index keeps records in
+temporal order. The second insight is that new data is more likely to
+duplicate recent data than it is to duplicate older data and in general,
+there are diminishing returns to looking further back in time. Therefore,
+when the index is full, it should cull its oldest records to make space for
+new ones. Another important idea behind the design of the index is that the
+ultimate goal of deduplication is to reduce storage costs. Since there is a
+trade-off between the storage saved and the resources expended to achieve
+those savings, vdo does not attempt to find every last duplicate block. It
+is sufficient to find and eliminate most of the redundancy.
+
+Each block of data is hashed to produce a 16-byte block name. An index
+record consists of this block name paired with the presumed location of
+that data on the underlying storage. However, it is not possible to
+guarantee that the index is accurate. In the most common case, this occurs
+because it is too costly to update the index when a block is over-written
+or discarded. Doing so would require either storing the block name along
+with the blocks, which is difficult to do efficiently in block-based
+storage, or reading and rehashing each block before overwriting it.
+Inaccuracy can also result from a hash collision where two different blocks
+have the same name. In practice, this is extremely unlikely, but because
+vdo does not use a cryptographic hash, a malicious workload could be
+constructed. Because of these inaccuracies, vdo treats the locations in the
+index as hints, and reads each indicated block to verify that it is indeed
+a duplicate before sharing the existing block with a new one.
+
+Records are collected into groups called chapters. New records are added to
+the newest chapter, called the open chapter. This chapter is stored in a
+format optimized for adding and modifying records, and the content of the
+open chapter is not finalized until it runs out of space for new records.
+When the open chapter fills up, it is closed and a new open chapter is
+created to collect new records.
+
+Closing a chapter converts it to a different format which is optimized for
+reading. The records are written to a series of record pages based on the
+order in which they were received. This means that records with temporal
+locality should be on a small number of pages, reducing the I/O required to
+retrieve them. The chapter also compiles an index that indicates which
+record page contains any given name. This index means that a request for a
+name can determine exactly which record page may contain that record,
+without having to load the entire chapter from storage. This index uses
+only a subset of the block name as its key, so it cannot guarantee that an
+index entry refers to the desired block name. It can only guarantee that if
+there is a record for this name, it will be on the indicated page. Closed
+chapters are read-only structures and their contents are never altered in
+any way.
+
+Once enough records have been written to fill up all the available index
+space, the oldest chapter is removed to make space for new chapters. Any
+time a request finds a matching record in the index, that record is copied
+into the open chapter. This ensures that useful block names remain available
+in the index, while unreferenced block names are forgotten over time.
+
+In order to find records in older chapters, the index also maintains a
+higher level structure called the volume index, which contains entries
+mapping each block name to the chapter containing its newest record. This
+mapping is updated as records for the block name are copied or updated,
+ensuring that only the newest record for a given block name can be found.
+An older record for a block name will no longer be found even though it has
+not been deleted from its chapter. Like the chapter index, the volume index
+uses only a subset of the block name as its key and cannot definitively
+say that a record exists for a name. It can only say which chapter would
+contain the record if a record exists. The volume index is stored entirely
+in memory and is saved to storage only when the vdo target is shut down.
+
+From the viewpoint of a request for a particular block name, it will first
+look up the name in the volume index. This search will either indicate that
+the name is new, or which chapter to search. If it returns a chapter, the
+request looks up its name in the chapter index. This will indicate either
+that the name is new, or which record page to search. Finally, if it is not
+new, the request will look for its name in the indicated record page.
+This process may require up to two page reads per request (one for the
+chapter index page and one for the record page). However, recently
+accessed pages are cached so that these page reads can be amortized across
+many block name requests.
+
+The volume index and the chapter indexes are implemented using a
+memory-efficient structure called a delta index. Instead of storing the
+entire block name (the key) for each entry, the entries are sorted by name
+and only the difference between adjacent keys (the delta) is stored.
+Because we expect the hashes to be randomly distributed, the size of the
+deltas follows an exponential distribution. Because of this distribution,
+the deltas are expressed using a Huffman code to take up even less space.
+The entire sorted list of keys is called a delta list. This structure
+allows the index to use many fewer bytes per entry than a traditional hash
+table, but it is slightly more expensive to look up entries, because a
+request must read every entry in a delta list to add up the deltas in order
+to find the record it needs. The delta index reduces this lookup cost by
+splitting its key space into many sub-lists, each starting at a fixed key
+value, so that each individual list is short.
+
+The default index size can hold 64 million records, corresponding to about
+256GB of data. This means that the index can identify duplicate data if the
+original data was written within the last 256GB of writes. This range is
+called the deduplication window. If new writes duplicate data that is older
+than that, the index will not be able to find it because the records of the
+older data have been removed. This means that if an application writes a
+200GB file to a vdo target and then immediately writes it again, the two
+copies will deduplicate perfectly. Doing the same with a 500GB file will
+result in no deduplication, because the beginning of the file will no
+longer be in the index by the time the second write begins (assuming there
+is no duplication within the file itself).
+
+If an application anticipates a data workload that will see useful
+deduplication beyond the 256GB threshold, vdo can be configured to use a
+larger index with a correspondingly larger deduplication window. (This
+configuration can only be set when the target is created, not altered
+later. It is important to consider the expected workload for a vdo target
+before configuring it.) There are two ways to do this.
+
+One way is to increase the memory size of the index, which also increases
+the amount of backing storage required. Doubling the size of the index will
+double the length of the deduplication window at the expense of doubling
+the storage size and the memory requirements.
+
+The other option is to enable sparse indexing. Sparse indexing increases
+the deduplication window by a factor of 10, at the expense of also
+increasing the storage size by a factor of 10. However, with sparse
+indexing, the memory requirements do not increase. The trade-off is
+slightly more computation per request and a slight decrease in the amount
+of deduplication detected. For most workloads with significant amounts of
+duplicate data, sparse indexing will detect 97-99% of the deduplication
+that a standard index will detect.
+
+The vio and data_vio Structures
+-------------------------------
+
+A vio (short for Vdo I/O) is conceptually similar to a bio, with additional
+fields and data to track vdo-specific information. A struct vio maintains a
+pointer to a bio but also tracks other fields specific to the operation of
+vdo. The vio is kept separate from its related bio because there are many
+circumstances where vdo completes the bio but must continue to do work
+related to deduplication or compression.
+
+Metadata reads and writes, and other writes that originate within vdo, use
+a struct vio directly. Application reads and writes use a larger structure
+called a data_vio to track information about their progress. A struct
+data_vio contains a struct vio and also includes several other fields
+related to deduplication and other vdo features. The data_vio is the
+primary unit of application work in vdo. Each data_vio proceeds through a
+set of steps to handle the application data, after which it is reset and
+returned to a pool of data_vios for reuse.
+
+There is a fixed pool of 2048 data_vios. This number was chosen to bound
+the amount of work that is required to recover from a crash. In addition,
+benchmarks have indicated that increasing the size of the pool does not
+significantly improve performance.
+
+The Data Store
+--------------
+
+The data store is implemented by three main data structures, all of which
+work in concert to reduce or amortize metadata updates across as many data
+writes as possible.
+
+*The Slab Depot*
+
+Most of the vdo volume belongs to the slab depot. The depot contains a
+collection of slabs. The slabs can be up to 32GB, and are divided into
+three sections. Most of a slab consists of a linear sequence of 4K blocks.
+These blocks are used either to store data, or to hold portions of the
+block map (see below). In addition to the data blocks, each slab has a set
+of reference counters, using 1 byte for each data block. Finally, each slab
+has a journal.
+
+Reference updates are written to the slab journal. Slab journal blocks are
+written out either when they are full, or when the recovery journal
+requests they do so in order to allow the main recovery journal (see below)
+to free up space. The slab journal is used both to ensure that the main
+recovery journal can regularly free up space, and also to amortize the cost
+of updating individual reference blocks. The reference counters are kept in
+memory and are written out, a block at a time in oldest-dirtied order, only
+when there is a need to reclaim slab journal space. The write operations
+are performed in the background as needed so they do not add latency to
+particular I/O operations.
+
+Each slab is independent of every other. They are assigned to "physical
+zones" in round-robin fashion. If there are P physical zones, then slab n
+is assigned to zone n mod P.
+
+The slab depot maintains an additional small data structure, the "slab
+summary," which is used to reduce the amount of work needed to come back
+online after a crash. The slab summary maintains an entry for each slab
+indicating whether or not the slab has ever been used, whether all of its
+reference count updates have been persisted to storage, and approximately
+how full it is. During recovery, each physical zone will attempt to recover
+at least one slab, stopping whenever it has recovered a slab which has some
+free blocks. Once each zone has some space, or has determined that none is
+available, the target can resume normal operation in a degraded mode. Read
+and write requests can be serviced, perhaps with degraded performance,
+while the remainder of the dirty slabs are recovered.
+
+*The Block Map*
+
+The block map contains the logical to physical mapping. It can be thought
+of as an array with one entry per logical address. Each entry is 5 bytes,
+36 bits of which contain the physical block number which holds the data for
+the given logical address. The other 4 bits are used to indicate the nature
+of the mapping. Of the 16 possible states, one represents a logical address
+which is unmapped (i.e. it has never been written, or has been discarded),
+one represents an uncompressed block, and the other 14 states are used to
+indicate that the mapped data is compressed, and which of the compression
+slots in the compressed block contains the data for this logical address.
+
+In practice, the array of mapping entries is divided into "block map
+pages," each of which fits in a single 4K block. Each block map page
+consists of a header and 812 mapping entries. Each mapping page is actually
+a leaf of a radix tree which consists of block map pages at each level.
+There are 60 radix trees which are assigned to "logical zones" in
+round-robin fashion. (If there are L logical zones, tree n will belong to
+zone n mod L.) At each level, the trees are interleaved, so logical addresses
+0-811 belong to tree 0, logical addresses 812-1623 belong to tree 1, and so
+on. The interleaving is maintained all the way up to the 60 root nodes.
+Choosing 60 trees results in an evenly distributed number of trees per zone
+for a large number of possible logical zone counts. The storage for the 60
+tree roots is allocated at format time. All other block map pages are
+allocated out of the slabs as needed. This flexible allocation avoids the
+need to pre-allocate space for the entire set of logical mappings and also
+makes growing the logical size of a vdo relatively easy.
+
+In operation, the block map maintains two caches. It is prohibitive to keep
+the entire leaf level of the trees in memory, so each logical zone
+maintains its own cache of leaf pages. The size of this cache is
+configurable at target start time. The second cache is allocated at start
+time, and is large enough to hold all the non-leaf pages of the entire
+block map. This cache is populated as pages are needed.
+
+*The Recovery Journal*
+
+The recovery journal is used to amortize updates across the block map and
+slab depot. Each write request causes an entry to be made in the journal.
+Entries are either "data remappings" or "block map remappings." For a data
+remapping, the journal records the logical address affected and its old and
+new physical mappings. For a block map remapping, the journal records the
+block map page number and the physical block allocated for it. Block map
+pages are never reclaimed or repurposed, so the old mapping is always 0.
+
+Each journal entry is an intent record summarizing the metadata updates
+that are required for a data_vio. The recovery journal issues a flush
+before each journal block write to ensure that the physical data for the
+new block mappings in that block are stable on storage, and journal block
+writes are all issued with the FUA bit set to ensure the recovery journal
+entries themselves are stable. The journal entry and the data write it
+represents must be stable on disk before the other metadata structures may
+be updated to reflect the operation. These entries allow the vdo device to
+reconstruct the logical to physical mappings after an unexpected
+interruption such as a loss of power.
+
+*Write Path*
+
+All write I/O to vdo is asynchronous. Each bio will be acknowledged as soon
+as vdo has done enough work to guarantee that it can complete the write
+eventually. Generally, the data for acknowledged but unflushed write I/O
+can be treated as though it is cached in memory. If an application
+requires data to be stable on storage, it must issue a flush or write the
+data with the FUA bit set like any other asynchronous I/O. Shutting down
+the vdo target will also flush any remaining I/O.
+
+Application write bios follow the steps outlined below.
+
+1. A data_vio is obtained from the data_vio pool and associated with the
+ application bio. If there are no data_vios available, the incoming bio
+ will block until a data_vio is available. This provides back pressure
+ to the application. The data_vio pool is protected by a spin lock.
+
+ The newly acquired data_vio is reset and the bio's data is copied into
+ the data_vio if it is a write and the data is not all zeroes. The data
+ must be copied because the application bio can be acknowledged before
+ the data_vio processing is complete, which means later processing steps
+ will no longer have access to the application bio. The application bio
+ may also be smaller than 4K, in which case the data_vio will have
+ already read the underlying block and the data is instead copied over
+ the relevant portion of the larger block.
+
+2. The data_vio places a claim (the "logical lock") on the logical address
+ of the bio. It is vital to prevent simultaneous modifications of the
+ same logical address, because deduplication involves sharing blocks.
+ This claim is implemented as an entry in a hashtable where the key is
+ the logical address and the value is a pointer to the data_vio
+ currently handling that address.
+
+ If a data_vio looks in the hashtable and finds that another data_vio is
+ already operating on that logical address, it waits until the previous
+ operation finishes. It also sends a message to inform the current
+ lock holder that it is waiting. Most notably, a new data_vio waiting
+ for a logical lock will flush the previous lock holder out of the
+ compression packer (step 8d) rather than allowing it to continue
+ waiting to be packed.
+
+ This stage requires the data_vio to get an implicit lock on the
+ appropriate logical zone to prevent concurrent modifications of the
+ hashtable. This implicit locking is handled by the zone divisions
+ described above.
+
+3. The data_vio traverses the block map tree to ensure that all the
+ necessary internal tree nodes have been allocated, by trying to find
+ the leaf page for its logical address. If any interior tree page is
+ missing, it is allocated at this time out of the same physical storage
+ pool used to store application data.
+
+ a. If any page-node in the tree has not yet been allocated, it must be
+ allocated before the write can continue. This step requires the
+ data_vio to lock the page-node that needs to be allocated. This
+ lock, like the logical block lock in step 2, is a hashtable entry
+ that causes other data_vios to wait for the allocation process to
+ complete.
+
+ The implicit logical zone lock is released while the allocation is
+ happening, in order to allow other operations in the same logical
+ zone to proceed. The details of allocation are the same as in
+ step 4. Once a new node has been allocated, that node is added to
+ the tree using a similar process to adding a new data block mapping.
+ The data_vio journals the intent to add the new node to the block
+ map tree (step 10), updates the reference count of the new block
+ (step 11), and reacquires the implicit logical zone lock to add the
+ new mapping to the parent tree node (step 12). Once the tree is
+ updated, the data_vio proceeds down the tree. Any other data_vios
+ waiting on this allocation also proceed.
+
+ b. In the steady-state case, the block map tree nodes will already be
+ allocated, so the data_vio just traverses the tree until it finds
+ the required leaf node. The location of the mapping (the "block map
+ slot") is recorded in the data_vio so that later steps do not need
+ to traverse the tree again. The data_vio then releases the implicit
+ logical zone lock.
+
+4. If the block is a zero block, skip to step 9. Otherwise, an attempt is
+ made to allocate a free data block. This allocation ensures that the
+ data_vio can write its data somewhere even if deduplication and
+ compression are not possible. This stage gets an implicit lock on a
+ physical zone to search for free space within that zone.
+
+ The data_vio will search each slab in a zone until it finds a free
+ block or decides there are none. If the first zone has no free space,
+ it will proceed to search the next physical zone by taking the implicit
+ lock for that zone and releasing the previous one until it finds a
+ free block or runs out of zones to search. The data_vio will acquire a
+ struct pbn_lock (the "physical block lock") on the free block. The
+ struct pbn_lock also has several fields to record the various kinds of
+ claims that data_vios can have on physical blocks. The pbn_lock is
+ added to a hashtable like the logical block locks in step 2. This
+ hashtable is also covered by the implicit physical zone lock. The
+ reference count of the free block is updated to prevent any other
+ data_vio from considering it free. The reference counters are a
+ sub-component of the slab and are thus also covered by the implicit
+ physical zone lock.
+
+5. If an allocation was obtained, the data_vio has all the resources it
+ needs to complete the write. The application bio can safely be
+ acknowledged at this point. The acknowledgment happens on a separate
+ thread to prevent the application callback from blocking other data_vio
+ operations.
+
+ If an allocation could not be obtained, the data_vio continues to
+ attempt to deduplicate or compress the data, but the bio is not
+ acknowledged because the vdo device may be out of space.
+
+6. At this point vdo must determine where to store the application data.
+ The data_vio's data is hashed and the hash (the "record name") is
+ recorded in the data_vio.
+
+7. The data_vio reserves or joins a struct hash_lock, which manages all of
+ the data_vios currently writing the same data. Active hash locks are
+ tracked in a hashtable similar to the way logical block locks are
+ tracked in step 2. This hashtable is covered by the implicit lock on
+ the hash zone.
+
+ If there is no existing hash lock for this data_vio's record_name, the
+ data_vio obtains a hash lock from the pool, adds it to the hashtable,
+ and sets itself as the new hash lock's "agent." The hash_lock pool is
+ also covered by the implicit hash zone lock. The hash lock agent will
+ do all the work to decide where the application data will be
+ written. If a hash lock for the data_vio's record_name already exists,
+ and the data_vio's data is the same as the agent's data, the new
+ data_vio will wait for the agent to complete its work and then share
+ its result.
+
+ In the rare case that a hash lock exists for the data_vio's hash but
+ the data does not match the hash lock's agent, the data_vio skips to
+ step 8h and attempts to write its data directly. This can happen if two
+ different data blocks produce the same hash, for example.
+
+8. The hash lock agent attempts to deduplicate or compress its data with
+ the following steps.
+
+ a. The agent initializes and sends its embedded deduplication request
+ (struct uds_request) to the deduplication index. This does not
+ require the data_vio to get any locks because the index components
+ manage their own locking. The data_vio waits until it either gets a
+ response from the index or times out.
+
+ b. If the deduplication index returns advice, the data_vio attempts to
+ obtain a physical block lock on the indicated physical address, in
+ order to read the data and verify that it is the same as the
+ data_vio's data, and that it can accept more references. If the
+ physical address is already locked by another data_vio, the data at
+ that address may soon be overwritten so it is not safe to use the
+ address for deduplication.
+
+ c. If the data matches and the physical block can add references, the
+ agent and any other data_vios waiting on it will record this
+ physical block as their new physical address and proceed to step 9
+ to record their new mapping. If there are more data_vios in the hash
+ lock than there are references available, one of the remaining
+ data_vios becomes the new agent and continues to step 8d as if no
+ valid advice was returned.
+
+ d. If no usable duplicate block was found, the agent first checks that
+ it has an allocated physical block (from step 3) that it can write
+ to. If the agent does not have an allocation, some other data_vio in
+ the hash lock that does have an allocation takes over as agent. If
+ none of the data_vios have an allocated physical block, these writes
+ are out of space, so they proceed to step 13 for cleanup.
+
+ e. The agent attempts to compress its data. If the data does not
+ compress, the data_vio will continue to step 8h to write its data
+ directly.
+
+ If the compressed size is small enough, the agent will release the
+ implicit hash zone lock and go to the packer (struct packer) where
+ it will be placed in a bin (struct packer_bin) along with other
+ data_vios. All compression operations require the implicit lock on
+ the packer zone.
+
+ The packer can combine up to 14 compressed blocks in a single 4k
+ data block. Compression is only helpful if vdo can pack at least 2
+ data_vios into a single data block. This means that a data_vio may
+ wait in the packer for an arbitrarily long time for other data_vios
+ to fill out the compressed block. There is a mechanism for vdo to
+ evict waiting data_vios when continuing to wait would cause
+ problems. Circumstances causing an eviction include an application
+ flush, device shutdown, or a subsequent data_vio trying to overwrite
+ the same logical block address. A data_vio may also be evicted from
+ the packer if it cannot be paired with any other compressed block
+ before more compressible blocks need to use its bin. An evicted
+ data_vio will proceed to step 8h to write its data directly.
+
+ f. If the agent fills a packer bin, either because all 14 of its slots
+ are used or because it has no remaining space, it is written out
+ using the allocated physical block from one of its data_vios. Step
+ 8d has already ensured that an allocation is available.
+
+ g. Each data_vio sets the compressed block as its new physical address.
+ The data_vio obtains an implicit lock on the physical zone and
+ acquires the struct pbn_lock for the compressed block, which is
+ modified to be a shared lock. Then it releases the implicit physical
+ zone lock and proceeds to step 8i.
+
+ h. Any data_vio evicted from the packer will have an allocation from
+ step 3. It will write its data to that allocated physical block.
+
+ i. After the data is written, if the data_vio is the agent of a hash
+ lock, it will reacquire the implicit hash zone lock and share its
+ physical address with as many other data_vios in the hash lock as
+ possible. Each data_vio will then proceed to step 9 to record its
+ new mapping.
+
+ j. If the agent actually wrote new data (whether compressed or not),
+ the deduplication index is updated to reflect the location of the
+ new data. The agent then releases the implicit hash zone lock.
+
+9. The data_vio determines the previous mapping of the logical address.
+ There is a cache for block map leaf pages (the "block map cache"),
+ because there are usually too many block map leaf nodes to store
+ entirely in memory. If the desired leaf page is not in the cache, the
+ data_vio will reserve a slot in the cache and load the desired page
+ into it, possibly evicting an older cached page. The data_vio then
+ finds the current physical address for this logical address (the "old
+ physical mapping"), if any, and records it. This step requires a lock
+ on the block map cache structures, covered by the implicit logical zone
+ lock.
+
+10. The data_vio makes an entry in the recovery journal containing the
+ logical block address, the old physical mapping, and the new physical
+ mapping. Making this journal entry requires holding the implicit
+ recovery journal lock. The data_vio will wait in the journal until all
+ recovery blocks up to the one containing its entry have been written
+ and flushed to ensure the transaction is stable on storage.
+
+11. Once the recovery journal entry is stable, the data_vio makes two slab
+ journal entries: an increment entry for the new mapping, and a
+ decrement entry for the old mapping. These two operations each require
+ holding a lock on the affected physical slab, covered by its implicit
+ physical zone lock. For correctness during recovery, the slab journal
+ entries in any given slab journal must be in the same order as the
+ corresponding recovery journal entries. Therefore, if the two entries
+ are in different zones, they are made concurrently, and if they are in
+ the same zone, the increment is always made before the decrement in
+ order to avoid underflow. After each slab journal entry is made in
+ memory, the associated reference count is also updated in memory.
+
+12. Once both of the reference count updates are done, the data_vio
+ acquires the implicit logical zone lock and updates the
+ logical-to-physical mapping in the block map to point to the new
+ physical block. At this point the write operation is complete.
+
+13. If the data_vio has a hash lock, it acquires the implicit hash zone
+ lock and releases its hash lock to the pool.
+
+ The data_vio then acquires the implicit physical zone lock and releases
+ the struct pbn_lock it holds for its allocated block. If it had an
+ allocation that it did not use, it also sets the reference count for
+ that block back to zero to free it for use by subsequent data_vios.
+
+ The data_vio then acquires the implicit logical zone lock and releases
+ the logical block lock acquired in step 2.
+
+ The application bio is then acknowledged if it has not previously been
+ acknowledged, and the data_vio is returned to the pool.
+
+*Read Path*
+
+An application read bio follows a much simpler set of steps. It does steps
+1 and 2 in the write path to obtain a data_vio and lock its logical
+address. If there is already a write data_vio in progress for that logical
+address that is guaranteed to complete, the read data_vio will copy the
+data from the write data_vio and return it. Otherwise, it will look up the
+logical-to-physical mapping by traversing the block map tree as in step 3,
+and then read and possibly decompress the indicated data at the indicated
+physical block address. A read data_vio will not allocate block map tree
+nodes if they are missing. If the interior block map nodes do not exist
+yet, the logical block map address must still be unmapped and the read
+data_vio will return all zeroes. A read data_vio handles cleanup and
+acknowledgment as in step 13, although it only needs to release the logical
+lock and return itself to the pool.
+
+*Small Writes*
+
+All storage within vdo is managed as 4KB blocks, but it can accept writes
+as small as 512 bytes. Processing a write that is smaller than 4K requires
+a read-modify-write operation that reads the relevant 4K block, copies the
+new data over the appropriate sectors of the block, and then launches a
+write operation for the modified data block. The read and write stages of
+this operation are nearly identical to the normal read and write
+operations, and a single data_vio is used throughout this operation.
+
+*Recovery*
+
+When a vdo is restarted after a crash, it will attempt to recover from the
+recovery journal. During the pre-resume phase of the next start, the
+recovery journal is read. The increment portions of valid entries are played
+into the block map. Next, valid entries are played, in order as required,
+into the slab journals. Finally, each physical zone attempts to replay at
+least one slab journal to reconstruct the reference counts of one slab.
+Once each zone has some free space (or has determined that it has none),
+the vdo comes back online, while the remainder of the slab journals are
+used to reconstruct the rest of the reference counts in the background.
+
+*Read-only Rebuild*
+
+If a vdo encounters an unrecoverable error, it will enter read-only mode.
+This mode indicates that some previously acknowledged data may have been
+lost. The vdo may be instructed to rebuild as best it can in order to
+return to a writable state. However, this is never done automatically due
+to the possibility that data has been lost. During a read-only rebuild, the
+block map is recovered from the recovery journal as before. However, the
+reference counts are not rebuilt from the slab journals. Instead, the
+reference counts are zeroed, the entire block map is traversed, and the
+reference counts are updated from the block mappings. While this may lose
+some data, it ensures that the block map and reference counts are
+consistent with each other. This allows vdo to resume normal operation and
+accept further writes.
diff --git a/Documentation/admin-guide/device-mapper/vdo.rst b/Documentation/admin-guide/device-mapper/vdo.rst
new file mode 100644
index 000000000000..7e1ecafdf91e
--- /dev/null
+++ b/Documentation/admin-guide/device-mapper/vdo.rst
@@ -0,0 +1,406 @@
+.. SPDX-License-Identifier: GPL-2.0-only
+
+dm-vdo
+======
+
+The dm-vdo (virtual data optimizer) device mapper target provides
+block-level deduplication, compression, and thin provisioning. As a device
+mapper target, it can add these features to the storage stack, compatible
+with any file system. The vdo target does not protect against data
+corruption, relying instead on integrity protection of the storage below
+it. It is strongly recommended that lvm be used to manage vdo volumes. See
+lvmvdo(7).
+
+Userspace component
+===================
+
+Formatting a vdo volume requires the use of the 'vdoformat' tool, available
+at:
+
+https://github.com/dm-vdo/vdo/
+
+In most cases, a vdo target will recover from a crash automatically the
+next time it is started. In cases where it encountered an unrecoverable
+error (either during normal operation or crash recovery) the target will
+enter or come up in read-only mode. Because read-only mode is indicative of
+data loss, a positive action must be taken to bring vdo out of read-only
+mode. The 'vdoforcerebuild' tool, available from the same repo, is used to
+prepare a read-only vdo to exit read-only mode. After running this tool,
+the vdo target will rebuild its metadata the next time it is
+started. Although some data may be lost, the rebuilt vdo's metadata will be
+internally consistent and the target will be writable again.
+
+The repo also contains additional userspace tools which can be used to
+inspect a vdo target's on-disk metadata. Fortunately, these tools are
+rarely needed except by dm-vdo developers.
+
+Metadata requirements
+=====================
+
+Each vdo volume reserves 3GB of space for metadata, or more depending on
+its configuration. It is helpful to check that the space saved by
+deduplication and compression is not cancelled out by the metadata
+requirements. An estimation of the space saved for a specific dataset can
+be computed with the vdo estimator tool, which is available at:
+
+https://github.com/dm-vdo/vdoestimator/
+
+Target interface
+================
+
+Table line
+----------
+
+::
+
+ <offset> <logical device size> vdo V4 <storage device>
+ <storage device size> <minimum I/O size> <block map cache size>
+ <block map era length> [optional arguments]
+
+
+Required parameters:
+
+ offset:
+ The offset, in sectors, at which the vdo volume's logical
+ space begins.
+
+ logical device size:
+ The size of the device which the vdo volume will service,
+ in sectors. Must match the current logical size of the vdo
+ volume.
+
+ storage device:
+ The device holding the vdo volume's data and metadata.
+
+ storage device size:
+ The size of the device holding the vdo volume, as a number
+ of 4096-byte blocks. Must match the current size of the vdo
+ volume.
+
+ minimum I/O size:
+ The minimum I/O size for this vdo volume to accept, in
+ bytes. Valid values are 512 or 4096. The recommended value
+ is 4096.
+
+ block map cache size:
+ The size of the block map cache, as a number of 4096-byte
+ blocks. The minimum and recommended value is 32768 blocks.
+ If the logical thread count is non-zero, the cache size
+ must be at least 4096 blocks per logical thread.
+
+ block map era length:
+ The speed with which the block map cache writes out
+ modified block map pages. A smaller era length is likely to
+ reduce the amount of time spent rebuilding, at the cost of
+ increased block map writes during normal operation. The
+ maximum and recommended value is 16380; the minimum value
+ is 1.
+
+Optional parameters:
+--------------------
+Some or all of these parameters may be specified as <key> <value> pairs.
+
+Thread related parameters:
+
+Different categories of work are assigned to separate thread groups, and
+the number of threads in each group can be configured separately.
+
+If <hash>, <logical>, and <physical> are all set to 0, the work handled by
+all three thread types will be handled by a single thread. If any of these
+values are non-zero, all of them must be non-zero.
+
+ ack:
+ The number of threads used to complete bios. Since
+ completing a bio calls an arbitrary completion function
+ outside the vdo volume, threads of this type allow the vdo
+ volume to continue processing requests even when bio
+ completion is slow. The default is 1.
+
+ bio:
+ The number of threads used to issue bios to the underlying
+ storage. Threads of this type allow the vdo volume to
+ continue processing requests even when bio submission is
+ slow. The default is 4.
+
+ bioRotationInterval:
+ The number of bios to enqueue on each bio thread before
+ switching to the next thread. The value must be greater
+ than 0 and not more than 1024; the default is 64.
+
+ cpu:
+ The number of threads used to do CPU-intensive work, such
+ as hashing and compression. The default is 1.
+
+ hash:
+ The number of threads used to manage data comparisons for
+ deduplication based on the hash value of data blocks. The
+ default is 0.
+
+ logical:
+ The number of threads used to manage caching and locking
+ based on the logical address of incoming bios. The default
+ is 0; the maximum is 60.
+
+ physical:
+ The number of threads used to manage administration of the
+ underlying storage device. At format time, a slab size for
+ the vdo is chosen; the vdo storage device must be large
+ enough to have at least 1 slab per physical thread. The
+ default is 0; the maximum is 16.
+
+Miscellaneous parameters:
+
+ maxDiscard:
+ The maximum size of discard bio accepted, in 4096-byte
+ blocks. I/O requests to a vdo volume are normally split
+ into 4096-byte blocks, and processed up to 2048 at a time.
+ However, discard requests to a vdo volume can be
+ automatically split to a larger size, up to <maxDiscard>
+ 4096-byte blocks in a single bio, and are limited to 1500
+ at a time. Increasing this value may provide better overall
+ performance, at the cost of increased latency for the
+ individual discard requests. The default and minimum is 1;
+ the maximum is UINT_MAX / 4096.
+
+ deduplication:
+ Whether deduplication is enabled. The default is 'on'; the
+ acceptable values are 'on' and 'off'.
+
+ compression:
+ Whether compression is enabled. The default is 'off'; the
+ acceptable values are 'on' and 'off'.
+
+Device modification
+-------------------
+
+A modified table may be loaded into a running, non-suspended vdo volume.
+The modifications will take effect when the device is next resumed. The
+modifiable parameters are <logical device size>, <storage device size>
+(the physical device size), <maxDiscard>, <compression>, and
+<deduplication>.
+
+If the logical device size or physical device size is changed, upon
+successful resume vdo will store the new values and require them on future
+startups. These two parameters may not be decreased. The logical device
+size may not exceed 4 PB. The physical device size must increase by at
+least 32832 4096-byte blocks if at all, and must not exceed the size of the
+underlying storage device. Additionally, when formatting the vdo device, a
+slab size is chosen: the physical device size may never increase above the
+size which provides 8192 slabs, and each increase must be large enough to
+add at least one new slab.
+
+Examples:
+
+Start a previously-formatted vdo volume with 1 GB logical space and 1 GB
+physical space, storing to /dev/dm-1 which has more than 1 GB of space.
+
+::
+
+ dmsetup create vdo0 --table \
+ "0 2097152 vdo V4 /dev/dm-1 262144 4096 32768 16380"
+
+Grow the logical size to 4 GB.
+
+::
+
+ dmsetup reload vdo0 --table \
+ "0 8388608 vdo V4 /dev/dm-1 262144 4096 32768 16380"
+ dmsetup resume vdo0
+
+Grow the physical size to 2 GB.
+
+::
+
+ dmsetup reload vdo0 --table \
+ "0 8388608 vdo V4 /dev/dm-1 524288 4096 32768 16380"
+ dmsetup resume vdo0
+
+Grow the physical size by 1 GB more, grow the logical size to 5 GB, and
+increase the maximum discard size to 8 blocks.
+
+::
+
+ dmsetup reload vdo0 --table \
+ "0 10485760 vdo V4 /dev/dm-1 786432 4096 32768 16380 maxDiscard 8"
+ dmsetup resume vdo0
+
+Stop the vdo volume.
+
+::
+
+ dmsetup remove vdo0
+
+Start the vdo volume again. Note that the logical and physical device sizes
+must still match, but other parameters can change.
+
+::
+
+ dmsetup create vdo1 --table \
+ "0 10485760 vdo V4 /dev/dm-1 786432 512 65550 5000 hash 1 logical 3 physical 2"
+
+Messages
+--------
+All vdo devices accept messages in the form:
+
+::
+
+ dmsetup message <target-name> 0 <message-name> <message-parameters>
+
+The messages are:
+
+ stats:
+ Outputs the current view of the vdo statistics. Mostly used
+ by the vdostats userspace program to interpret the output
+ buffer.
+
+ dump:
+ Dumps many internal structures to the system log. This is
+ not always safe to run, so it should only be used to debug
+ a hung vdo. Optional parameters to specify structures to
+ dump are:
+
+ viopool: The pool of I/O requests for incoming bios
+ pools: A synonym of 'viopool'
+ vdo: Most of the structures managing on-disk data
+ queues: Basic information about each vdo thread
+ threads: A synonym of 'queues'
+ default: Equivalent to 'queues vdo'
+ all: All of the above.
+
+ dump-on-shutdown:
+ Perform a default dump next time vdo shuts down.
+
+
+Status
+------
+
+::
+
+ <device> <operating mode> <in recovery> <index state>
+ <compression state> <used physical blocks> <total physical blocks>
+
+ device:
+ The name of the vdo volume.
+
+ operating mode:
+ The current operating mode of the vdo volume; values may be
+ 'normal', 'recovering' (the volume has detected an issue
+ with its metadata and is attempting to repair itself), and
+ 'read-only' (an error has occurred that forces the vdo
+ volume to only support read operations and not writes).
+
+ in recovery:
+ Whether the vdo volume is currently in recovery mode;
+ values may be 'recovering' or '-' which indicates not
+ recovering.
+
+ index state:
+ The current state of the deduplication index in the vdo
+ volume; values may be 'closed', 'closing', 'error',
+ 'offline', 'online', 'opening', and 'unknown'.
+
+ compression state:
+ The current state of compression in the vdo volume; values
+ may be 'offline' and 'online'.
+
+ used physical blocks:
+ The number of physical blocks in use by the vdo volume.
+
+ total physical blocks:
+ The total number of physical blocks the vdo volume may use;
+ the difference between this value and the
+ <used physical blocks> is the number of blocks the vdo
+ volume has left before being full.
+
+Memory Requirements
+===================
+
+A vdo target requires a fixed 38 MB of RAM along with the following amounts
+that scale with the target:
+
+- 1.15 MB of RAM for each 1 MB of configured block map cache size. The
+ block map cache requires a minimum of 150 MB.
+- 1.6 MB of RAM for each 1 TB of logical space.
+- 268 MB of RAM for each 1 TB of physical storage managed by the volume.
+
+The deduplication index requires additional memory which scales with the
+size of the deduplication window. For dense indexes, the index requires 1
+GB of RAM per 1 TB of window. For sparse indexes, the index requires 1 GB
+of RAM per 10 TB of window. The index configuration is set when the target
+is formatted and may not be modified.
+
+Module Parameters
+=================
+
+The vdo driver has a numeric parameter 'log_level' which controls the
+verbosity of logging from the driver. The default setting is 6
+(LOGLEVEL_INFO and more severe messages).
+
+Run-time Usage
+==============
+
+When using dm-vdo, it is important to be aware of the ways in which its
+behavior differs from other storage targets.
+
+- There is no guarantee that over-writes of existing blocks will succeed.
+ Because the underlying storage may be multiply referenced, over-writing
+ an existing block generally requires a vdo to have a free block
+ available.
+
+- When blocks are no longer in use, sending a discard request for those
+ blocks lets the vdo release references for those blocks. If the vdo is
+ thinly provisioned, discarding unused blocks is essential to prevent the
+ target from running out of space. However, due to the sharing of
+ duplicate blocks, no discard request for any given logical block is
+ guaranteed to reclaim space.
+
+- Assuming the underlying storage properly implements flush requests, vdo
+ is resilient against crashes; however, unflushed writes may or may not
+ persist after a crash.
+
+- Each write to a vdo target entails a significant amount of processing.
+ However, much of the work is parallelizable. Therefore, vdo targets
+ achieve better throughput at higher I/O depths, and can support up to
+ 2048 requests in parallel.
+
+Tuning
+======
+
+The vdo device has many options, and it can be difficult to make optimal
+choices without perfect knowledge of the workload. Additionally, most
+configuration options must be set when a vdo target is started, and cannot
+be changed without shutting it down completely; the configuration cannot be
+changed while the target is active. Ideally, tuning with simulated
+workloads should be performed before deploying vdo in production
+environments.
+
+The most important value to adjust is the block map cache size. In order to
+service a request for any logical address, a vdo must load the portion of
+the block map which holds the relevant mapping. These mappings are cached.
+Performance will suffer when the working set does not fit in the cache. By
+default, a vdo allocates 128 MB of metadata cache in RAM to support
+efficient access to 100 GB of logical space at a time. It should be scaled
+up proportionally for larger working sets.
+
+The logical and physical thread counts should also be adjusted. A logical
+thread controls a disjoint section of the block map, so additional logical
+threads increase parallelism and can increase throughput. Physical threads
+control a disjoint section of the data blocks, so additional physical
+threads can also increase throughput. However, excess threads can waste
+resources and increase contention.
+
+Bio submission threads control the parallelism involved in sending I/O to
+the underlying storage; fewer threads mean there is more opportunity to
+reorder I/O requests for performance benefit, but also that each I/O
+request has to wait longer before being submitted.
+
+Bio acknowledgment threads are used for finishing I/O requests. This is
+done on dedicated threads since the amount of work required to execute a
+bio's callback cannot be controlled by the vdo itself. Usually one thread
+is sufficient but additional threads may be beneficial, particularly when
+bios have CPU-heavy callbacks.
+
+CPU threads are used for hashing and for compression; in workloads with
+compression enabled, more threads may result in higher throughput.
+
+Hash threads are used to sort active requests by hash and determine whether
+they should deduplicate; the most CPU intensive actions done by these
+threads are comparisons of 4096-byte data blocks. In most cases, a single
+hash thread is sufficient.
diff --git a/MAINTAINERS b/MAINTAINERS
index e9c4517c2efa..8ea2cbfc52ea 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -6134,6 +6134,14 @@ F: include/linux/device-mapper.h
F: include/linux/dm-*.h
F: include/uapi/linux/dm-*.h
+DEVICE-MAPPER VDO TARGET
+M: Matthew Sakai <msakai@redhat.com>
+M: dm-devel@lists.linux.dev
+L: dm-devel@lists.linux.dev
+S: Maintained
+F: Documentation/admin-guide/device-mapper/vdo*.rst
+F: drivers/md/dm-vdo/
+
DEVLINK
M: Jiri Pirko <jiri@resnulli.us>
L: netdev@vger.kernel.org
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index a743e2c572fc..68ce56fc61d0 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -634,4 +634,6 @@ config DM_AUDIT
Enables audit logging of several security relevant events in the
particular device-mapper targets, especially the integrity target.
+source "drivers/md/dm-vdo/Kconfig"
+
endif # MD
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 027d7cfeca3f..476a214e4bdc 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -68,6 +68,7 @@ obj-$(CONFIG_DM_ZERO) += dm-zero.o
obj-$(CONFIG_DM_RAID) += dm-raid.o
obj-$(CONFIG_DM_THIN_PROVISIONING) += dm-thin-pool.o
obj-$(CONFIG_DM_VERITY) += dm-verity.o
+obj-$(CONFIG_DM_VDO) += dm-vdo/
obj-$(CONFIG_DM_CACHE) += dm-cache.o
obj-$(CONFIG_DM_CACHE_SMQ) += dm-cache-smq.o
obj-$(CONFIG_DM_EBS) += dm-ebs.o
diff --git a/drivers/md/dm-vdo/Kconfig b/drivers/md/dm-vdo/Kconfig
new file mode 100644
index 000000000000..111ecd2c2a24
--- /dev/null
+++ b/drivers/md/dm-vdo/Kconfig
@@ -0,0 +1,17 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+config DM_VDO
+ tristate "VDO: deduplication and compression target"
+ depends on 64BIT
+ depends on BLK_DEV_DM
+ select DM_BUFIO
+ select LZ4_COMPRESS
+ select LZ4_DECOMPRESS
+ help
+ This device mapper target presents a block device with
+ deduplication, compression and thin-provisioning.
+
+ To compile this code as a module, choose M here: the module will
+ be called dm-vdo.
+
+ If unsure, say N.
diff --git a/drivers/md/dm-vdo/Makefile b/drivers/md/dm-vdo/Makefile
new file mode 100644
index 000000000000..33e09abc6acd
--- /dev/null
+++ b/drivers/md/dm-vdo/Makefile
@@ -0,0 +1,57 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+ccflags-y := -I$(srctree)/$(src) -I$(srctree)/$(src)/indexer
+
+obj-$(CONFIG_DM_VDO) += dm-vdo.o
+
+dm-vdo-objs := \
+ action-manager.o \
+ admin-state.o \
+ block-map.o \
+ completion.o \
+ data-vio.o \
+ dedupe.o \
+ dm-vdo-target.o \
+ dump.o \
+ encodings.o \
+ errors.o \
+ flush.o \
+ funnel-queue.o \
+ funnel-workqueue.o \
+ int-map.o \
+ io-submitter.o \
+ logger.o \
+ logical-zone.o \
+ memory-alloc.o \
+ message-stats.o \
+ murmurhash3.o \
+ packer.o \
+ permassert.o \
+ physical-zone.o \
+ priority-table.o \
+ recovery-journal.o \
+ repair.o \
+ slab-depot.o \
+ status-codes.o \
+ string-utils.o \
+ thread-device.o \
+ thread-registry.o \
+ thread-utils.o \
+ vdo.o \
+ vio.o \
+ wait-queue.o \
+ indexer/chapter-index.o \
+ indexer/config.o \
+ indexer/delta-index.o \
+ indexer/funnel-requestqueue.o \
+ indexer/geometry.o \
+ indexer/index.o \
+ indexer/index-layout.o \
+ indexer/index-page-map.o \
+ indexer/index-session.o \
+ indexer/io-factory.o \
+ indexer/open-chapter.o \
+ indexer/radix-sort.o \
+ indexer/sparse-cache.o \
+ indexer/volume.o \
+ indexer/volume-index.o
diff --git a/drivers/md/dm-vdo/action-manager.c b/drivers/md/dm-vdo/action-manager.c
new file mode 100644
index 000000000000..a0e5e7077d13
--- /dev/null
+++ b/drivers/md/dm-vdo/action-manager.c
@@ -0,0 +1,388 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#include "action-manager.h"
+
+#include "memory-alloc.h"
+#include "permassert.h"
+
+#include "admin-state.h"
+#include "completion.h"
+#include "status-codes.h"
+#include "types.h"
+#include "vdo.h"
+
+/**
+ * struct action - An action to be performed in each of a set of zones.
+ * @in_use: Whether this structure is in use.
+ * @operation: The admin operation associated with this action.
+ * @preamble: The method to run on the initiator thread before the action is applied to each zone.
+ * @zone_action: The action to be performed in each zone.
+ * @conclusion: The method to run on the initiator thread after the action is applied to each zone.
+ * @parent: The object to notify when the action is complete.
+ * @context: The action specific context.
+ * @next: The action to perform after this one. vdo_make_action_manager() links the manager's two
+ *        action slots to each other through this field.
+ */
+struct action {
+ bool in_use;
+ const struct admin_state_code *operation;
+ vdo_action_preamble_fn preamble;
+ vdo_zone_action_fn zone_action;
+ vdo_action_conclusion_fn conclusion;
+ struct vdo_completion *parent;
+ void *context;
+ struct action *next;
+};
+
+/**
+ * struct action_manager - Definition of an action manager.
+ * @completion: The completion for performing actions.
+ * @state: The state of this action manager.
+ * @actions: The two action slots.
+ * @current_action: The current action slot.
+ * @zones: The number of zones in which an action is to be applied.
+ * @scheduler: A function to schedule a default next action.
+ * @get_zone_thread_id: A function to get the id of the thread on which to apply an action to a
+ * zone.
+ * @initiator_thread_id: The ID of the thread on which actions may be initiated.
+ * @context: Opaque data associated with this action manager.
+ * @acting_zone: The zone currently being acted upon.
+ */
+struct action_manager {
+ struct vdo_completion completion;
+ struct admin_state state;
+ struct action actions[2];
+ struct action *current_action;
+ zone_count_t zones;
+ vdo_action_scheduler_fn scheduler;
+ vdo_zone_thread_getter_fn get_zone_thread_id;
+ thread_id_t initiator_thread_id;
+ void *context;
+ zone_count_t acting_zone;
+};
+
+/* Check that a completion is an action completion, then return the manager which embeds it. */
+static inline struct action_manager *as_action_manager(struct vdo_completion *completion)
+{
+	struct action_manager *manager;
+
+	vdo_assert_completion_type(completion, VDO_ACTION_COMPLETION);
+	manager = container_of(completion, struct action_manager, completion);
+	return manager;
+}
+
+/*
+ * Implements vdo_action_scheduler_fn. This is the scheduler installed by
+ * vdo_make_action_manager() when its caller supplies none; it never schedules anything.
+ */
+static bool no_default_action(void *context __always_unused)
+{
+ return false;
+}
+
+/*
+ * Implements vdo_action_preamble_fn. This is the preamble substituted by
+ * vdo_schedule_operation_with_context() when an action is scheduled without one; it does
+ * nothing except finish the completion it is given.
+ */
+static void no_preamble(void *context __always_unused, struct vdo_completion *completion)
+{
+ vdo_finish_completion(completion);
+}
+
+/*
+ * Implements vdo_action_conclusion_fn. This is the conclusion substituted when an action is
+ * scheduled without one, and by launch_current_action() when the action's operation can not be
+ * started; it always reports success.
+ */
+static int no_conclusion(void *context __always_unused)
+{
+ return VDO_SUCCESS;
+}
+
+/**
+ * vdo_make_action_manager() - Allocate and initialize an action manager.
+ * @zones: The number of zones to which each action will be applied.
+ * @get_zone_thread_id: A function to get the id of the thread associated with a zone.
+ * @initiator_thread_id: The thread on which actions may be initiated.
+ * @context: The object which holds the per-zone context for the actions.
+ * @scheduler: A function to schedule a next action after an action concludes if there is no
+ *             pending action; may be NULL, in which case no default action is ever scheduled.
+ * @vdo: The vdo used to initialize the manager's completion.
+ * @manager_ptr: A pointer to hold the new action manager; only written on success.
+ *
+ * Return: VDO_SUCCESS or an error code.
+ */
+int vdo_make_action_manager(zone_count_t zones,
+			    vdo_zone_thread_getter_fn get_zone_thread_id,
+			    thread_id_t initiator_thread_id, void *context,
+			    vdo_action_scheduler_fn scheduler, struct vdo *vdo,
+			    struct action_manager **manager_ptr)
+{
+	struct action_manager *manager;
+	struct action *first;
+	struct action *second;
+	int result;
+
+	result = vdo_allocate(1, struct action_manager, __func__, &manager);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	/* Fall back to a scheduler which never schedules anything. */
+	if (scheduler == NULL)
+		scheduler = no_default_action;
+
+	*manager = (struct action_manager) {
+		.zones = zones,
+		.scheduler = scheduler,
+		.get_zone_thread_id = get_zone_thread_id,
+		.initiator_thread_id = initiator_thread_id,
+		.context = context,
+	};
+
+	/* Link the two action slots to each other and start out on the first one. */
+	first = &manager->actions[0];
+	second = &manager->actions[1];
+	first->next = second;
+	second->next = first;
+	manager->current_action = first;
+
+	vdo_set_admin_state_code(&manager->state, VDO_ADMIN_STATE_NORMAL_OPERATION);
+	vdo_initialize_completion(&manager->completion, vdo, VDO_ACTION_COMPLETION);
+	*manager_ptr = manager;
+	return VDO_SUCCESS;
+}
+
+/* Get the admin state code currently recorded in an action manager's state. */
+const struct admin_state_code *vdo_get_current_manager_operation(struct action_manager *manager)
+{
+	const struct admin_state *state = &manager->state;
+
+	return vdo_get_admin_state_code(state);
+}
+
+/* Get the context of the current action, or NULL if the current action slot is not in use. */
+void *vdo_get_current_action_context(struct action_manager *manager)
+{
+	struct action *current_slot = manager->current_action;
+
+	if (!current_slot->in_use)
+		return NULL;
+
+	return current_slot->context;
+}
+
+static void finish_action_callback(struct vdo_completion *completion);
+static void apply_to_zone(struct vdo_completion *completion);
+
+/* Ask the manager's thread getter for the thread of the zone currently being acted upon. */
+static thread_id_t get_acting_zone_thread_id(struct action_manager *manager)
+{
+	zone_count_t zone = manager->acting_zone;
+
+	return manager->get_zone_thread_id(manager->context, zone);
+}
+
+/*
+ * Record a completion's result on its parent, if it has one, and then reset the completion and
+ * run it again.
+ */
+static void preserve_error(struct vdo_completion *completion)
+{
+	struct vdo_completion *parent = completion->parent;
+
+	if (parent != NULL)
+		vdo_set_completion_result(parent, completion->result);
+
+	vdo_reset_completion(completion);
+	vdo_run_completion(completion);
+}
+
+/* Set up the manager's completion to run apply_to_zone() on the acting zone's thread. */
+static void prepare_for_next_zone(struct action_manager *manager)
+{
+	thread_id_t zone_thread = get_acting_zone_thread_id(manager);
+	struct vdo_completion *parent = manager->current_action->parent;
+
+	vdo_prepare_completion_for_requeue(&manager->completion, apply_to_zone,
+					   preserve_error, zone_thread, parent);
+}
+
+/* Set up the manager's completion to run finish_action_callback() on the initiator thread. */
+static void prepare_for_conclusion(struct action_manager *manager)
+{
+	thread_id_t initiator = manager->initiator_thread_id;
+	struct vdo_completion *parent = manager->current_action->parent;
+
+	vdo_prepare_completion_for_requeue(&manager->completion, finish_action_callback,
+					   preserve_error, initiator, parent);
+}
+
+/*
+ * Apply the current action to the acting zone. Before invoking the zone action, the manager's
+ * completion is prepared for whatever comes next: the following zone, or the conclusion if this
+ * was the last zone.
+ */
+static void apply_to_zone(struct vdo_completion *completion)
+{
+	struct action_manager *manager = as_action_manager(completion);
+	zone_count_t zone;
+
+	VDO_ASSERT_LOG_ONLY((vdo_get_callback_thread_id() == get_acting_zone_thread_id(manager)),
+			    "%s() called on acting zones's thread", __func__);
+
+	/* Remember which zone this call is for, then advance to the one after it. */
+	zone = manager->acting_zone;
+	manager->acting_zone++;
+
+	if (manager->acting_zone != manager->zones) {
+		/* There are more zones, so prepare to come back on the next one. */
+		prepare_for_next_zone(manager);
+	} else {
+		/*
+		 * This is the last zone. Once it is finished, we're done, so go back to the
+		 * initiator thread and finish up.
+		 */
+		prepare_for_conclusion(manager);
+	}
+
+	manager->current_action->zone_action(manager->context, zone, completion);
+}
+
+/*
+ * Handle an error from an action's preamble. launch_current_action() installs this in place of
+ * preserve_error() when the action has a zone action.
+ */
+static void handle_preamble_error(struct vdo_completion *completion)
+{
+ /* Skip the zone actions since the preamble failed. */
+ completion->callback = finish_action_callback;
+ preserve_error(completion);
+}
+
+/*
+ * Launch the manager's current action: start the action's operation on the manager's admin
+ * state, prepare the manager's completion for the step which follows the preamble, and then run
+ * the preamble.
+ */
+static void launch_current_action(struct action_manager *manager)
+{
+ struct action *action = manager->current_action;
+ int result = vdo_start_operation(&manager->state, action->operation);
+
+ if (result != VDO_SUCCESS) {
+ /* The operation could not be started, so pass the error on to the parent, if any. */
+ if (action->parent != NULL)
+ vdo_set_completion_result(action->parent, result);
+
+ /* We aren't going to run the preamble, so don't run the conclusion */
+ action->conclusion = no_conclusion;
+ finish_action_callback(&manager->completion);
+ return;
+ }
+
+ if (action->zone_action == NULL) {
+ /* There is nothing to do per zone, so the preamble leads straight to the conclusion. */
+ prepare_for_conclusion(manager);
+ } else {
+ /* Start with the first zone; a failed preamble skips the zones entirely. */
+ manager->acting_zone = 0;
+ vdo_prepare_completion_for_requeue(&manager->completion, apply_to_zone,
+ handle_preamble_error,
+ get_acting_zone_thread_id(manager),
+ manager->current_action->parent);
+ }
+
+ action->preamble(manager->context, &manager->completion);
+}
+
+/**
+ * vdo_schedule_default_action() - Attempt to schedule the default action.
+ * @manager: The action manager.
+ *
+ * The manager's scheduler is only consulted when the manager's state is exactly
+ * VDO_ADMIN_STATE_NORMAL_OPERATION; in any other state nothing is scheduled.
+ *
+ * Return: true if an action was scheduled.
+ */
+bool vdo_schedule_default_action(struct action_manager *manager)
+{
+	const struct admin_state_code *current = vdo_get_current_manager_operation(manager);
+
+	/* Don't schedule a default action if we are operating or not in normal operation. */
+	if (current != VDO_ADMIN_STATE_NORMAL_OPERATION)
+		return false;
+
+	return manager->scheduler(manager->context);
+}
+
+/*
+ * Conclude the current action: release its slot and advance to the other one, run the
+ * conclusion, finish the manager's admin operation, notify the action's parent, and launch the
+ * next action if there is one. This is called directly by launch_current_action() when an
+ * operation can't be started, and is otherwise installed as a completion callback by
+ * prepare_for_conclusion() and handle_preamble_error().
+ */
+static void finish_action_callback(struct vdo_completion *completion)
+{
+ bool has_next_action;
+ int result;
+ struct action_manager *manager = as_action_manager(completion);
+ /* Work from a copy, since the slot itself is marked free just below and may be reused. */
+ struct action action = *(manager->current_action);
+
+ manager->current_action->in_use = false;
+ manager->current_action = manager->current_action->next;
+
+ /*
+ * We need to check this now to avoid use-after-free issues if running the conclusion or
+ * notifying the parent results in the manager being freed.
+ */
+ has_next_action =
+ (manager->current_action->in_use || vdo_schedule_default_action(manager));
+ /*
+ * NOTE(review): the manager is still dereferenced below after the conclusion has run (by
+ * vdo_finish_operation()) and, when has_next_action is set, after the parent has been
+ * notified (by launch_current_action()) -- confirm the manager can't be freed in those cases.
+ */
+ result = action.conclusion(manager->context);
+ vdo_finish_operation(&manager->state, VDO_SUCCESS);
+ if (action.parent != NULL)
+ vdo_continue_completion(action.parent, result);
+
+ if (has_next_action)
+ launch_current_action(manager);
+}
+
+/**
+ * vdo_schedule_action() - Schedule an action to be applied to all zones.
+ * @manager: The action manager to schedule the action on.
+ * @preamble: A method to be invoked on the initiator thread once this action is started but before
+ * applying to each zone; may be NULL.
+ * @action: The action to apply to each zone; may be NULL.
+ * @conclusion: A method to be invoked back on the initiator thread once the action has been
+ * applied to all zones; may be NULL.
+ * @parent: The object to notify once the action is complete or if the action can not be scheduled;
+ * may be NULL.
+ *
+ * The action will be launched immediately if there is no current action, or as soon as the current
+ * action completes. If there is already a pending action, this action will not be scheduled, and,
+ * if it has a parent, that parent will be notified. At least one of the preamble, action, or
+ * conclusion must not be NULL.
+ *
+ * This is vdo_schedule_operation() with VDO_ADMIN_STATE_OPERATING as the operation.
+ *
+ * Return: true if the action was scheduled.
+ */
+bool vdo_schedule_action(struct action_manager *manager, vdo_action_preamble_fn preamble,
+ vdo_zone_action_fn action, vdo_action_conclusion_fn conclusion,
+ struct vdo_completion *parent)
+{
+ return vdo_schedule_operation(manager, VDO_ADMIN_STATE_OPERATING, preamble,
+ action, conclusion, parent);
+}
+
+/**
+ * vdo_schedule_operation() - Schedule an operation to be applied to all zones.
+ * @manager: The action manager to schedule the action on.
+ * @operation: The operation this action will perform.
+ * @preamble: A method to be invoked on the initiator thread once this action is started but before
+ * applying to each zone; may be NULL.
+ * @action: The action to apply to each zone; may be NULL.
+ * @conclusion: A method to be invoked back on the initiator thread once the action has been
+ * applied to all zones; may be NULL.
+ * @parent: The object to notify once the action is complete or if the action can not be scheduled;
+ * may be NULL.
+ *
+ * The operation's action will be launched immediately if there is no current action, or as soon as
+ * the current action completes. If there is already a pending action, this operation will not be
+ * scheduled, and, if it has a parent, that parent will be notified. At least one of the preamble,
+ * action, or conclusion must not be NULL.
+ *
+ * This is vdo_schedule_operation_with_context() with a NULL action-specific context.
+ *
+ * Return: true if the action was scheduled.
+ */
+bool vdo_schedule_operation(struct action_manager *manager,
+ const struct admin_state_code *operation,
+ vdo_action_preamble_fn preamble, vdo_zone_action_fn action,
+ vdo_action_conclusion_fn conclusion,
+ struct vdo_completion *parent)
+{
+ return vdo_schedule_operation_with_context(manager, operation, preamble, action,
+ conclusion, NULL, parent);
+}
+
+/**
+ * vdo_schedule_operation_with_context() - Schedule an operation on all zones.
+ * @manager: The action manager to schedule the action on.
+ * @operation: The operation this action will perform.
+ * @preamble: A method to be invoked on the initiator thread once this action is started but before
+ *            applying to each zone; may be NULL.
+ * @action: The action to apply to each zone; may be NULL.
+ * @conclusion: A method to be invoked back on the initiator thread once the action has been
+ *              applied to all zones; may be NULL.
+ * @context: An action-specific context which may be retrieved via
+ *           vdo_get_current_action_context(); may be NULL.
+ * @parent: The object to notify once the action is complete or if the action can not be scheduled;
+ *          may be NULL.
+ *
+ * The manager has two action slots. If the current slot is free, the operation is stored there
+ * and launched immediately. If only the other slot is free, the operation is stored there as the
+ * pending action and will be launched when the current action completes. If both slots are in
+ * use, the operation is not scheduled and the parent, if any, is continued with
+ * VDO_COMPONENT_BUSY. At least one of the preamble, action, or conclusion must not be NULL.
+ *
+ * Return: true if the action was scheduled.
+ */
+bool vdo_schedule_operation_with_context(struct action_manager *manager,
+					 const struct admin_state_code *operation,
+					 vdo_action_preamble_fn preamble,
+					 vdo_zone_action_fn action,
+					 vdo_action_conclusion_fn conclusion,
+					 void *context, struct vdo_completion *parent)
+{
+	struct action *slot;
+
+	VDO_ASSERT_LOG_ONLY((vdo_get_callback_thread_id() == manager->initiator_thread_id),
+			    "action initiated from correct thread");
+
+	/* Prefer the current slot; if it is busy, try the pending slot instead. */
+	slot = manager->current_action;
+	if (slot->in_use)
+		slot = slot->next;
+
+	if (slot->in_use) {
+		/* Both slots are taken, so refuse the request. */
+		if (parent != NULL)
+			vdo_continue_completion(parent, VDO_COMPONENT_BUSY);
+
+		return false;
+	}
+
+	/* Fill in every field except next, which keeps linking the two slots together. */
+	slot->in_use = true;
+	slot->operation = operation;
+	slot->preamble = (preamble != NULL) ? preamble : no_preamble;
+	slot->zone_action = action;
+	slot->conclusion = (conclusion != NULL) ? conclusion : no_conclusion;
+	slot->parent = parent;
+	slot->context = context;
+
+	/* Only launch now if nothing else is running; otherwise this action waits its turn. */
+	if (slot == manager->current_action)
+		launch_current_action(manager);
+
+	return true;
+}
diff --git a/drivers/md/dm-vdo/action-manager.h b/drivers/md/dm-vdo/action-manager.h
new file mode 100644
index 000000000000..b0a8d3ddf3db
--- /dev/null
+++ b/drivers/md/dm-vdo/action-manager.h
@@ -0,0 +1,110 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#ifndef VDO_ACTION_MANAGER_H
+#define VDO_ACTION_MANAGER_H
+
+#include "admin-state.h"
+#include "types.h"
+
+/*
+ * An action_manager provides a generic mechanism for applying actions to multi-zone entities (such
+ * as the block map or slab depot). Each action manager is tied to a specific context for which it
+ * manages actions. The manager ensures that only one action is active on that context at a time,
+ * and supports at most one pending action. Calls to schedule an action when there is already a
+ * pending action will result in VDO_COMPONENT_BUSY errors. Actions may only be submitted to the
+ * action manager from a single thread (which thread is determined when the action manager is
+ * constructed).
+ *
+ * A scheduled action consists of four components:
+ *
+ * preamble
+ * an optional method to be run on the initiator thread before applying the action to all zones
+ * zone_action
+ * an optional method to be applied to each of the zones
+ * conclusion
+ * an optional method to be run on the initiator thread once the per-zone method has been
+ * applied to all zones
+ * parent
+ * an optional completion to be finished once the conclusion is done
+ *
+ * At least one of the three methods must be provided.
+ */
+
+/*
+ * A function which is to be applied asynchronously to a set of zones.
+ * @context: The object which holds the per-zone context for the action
+ * @zone_number: The number of the zone to which the action is being applied
+ * @parent: The object to notify when the action is complete
+ */
+typedef void (*vdo_zone_action_fn)(void *context, zone_count_t zone_number,
+ struct vdo_completion *parent);
+
+/*
+ * A function which is to be applied asynchronously on an action manager's initiator thread as the
+ * preamble of an action.
+ * @context: The object which holds the per-zone context for the action
+ * @parent: The object to notify when the action is complete
+ */
+typedef void (*vdo_action_preamble_fn)(void *context, struct vdo_completion *parent);
+
+/*
+ * A function which will run on the action manager's initiator thread as the conclusion of an
+ * action.
+ * @context: The object which holds the per-zone context for the action
+ *
+ * Return: VDO_SUCCESS or an error
+ */
+typedef int (*vdo_action_conclusion_fn)(void *context);
+
+/*
+ * A function to schedule an action.
+ * @context: The object which holds the per-zone context for the action
+ *
+ * Return: true if an action was scheduled
+ */
+typedef bool (*vdo_action_scheduler_fn)(void *context);
+
+/*
+ * A function to get the id of the thread associated with a given zone.
+ * @context: The action context
+ * @zone_number: The number of the zone for which the thread ID is desired
+ */
+typedef thread_id_t (*vdo_zone_thread_getter_fn)(void *context, zone_count_t zone_number);
+
+struct action_manager;
+
+int __must_check vdo_make_action_manager(zone_count_t zones,
+ vdo_zone_thread_getter_fn get_zone_thread_id,
+ thread_id_t initiator_thread_id, void *context,
+ vdo_action_scheduler_fn scheduler,
+ struct vdo *vdo,
+ struct action_manager **manager_ptr);
+
+const struct admin_state_code *__must_check
+vdo_get_current_manager_operation(struct action_manager *manager);
+
+void * __must_check vdo_get_current_action_context(struct action_manager *manager);
+
+bool vdo_schedule_default_action(struct action_manager *manager);
+
+bool vdo_schedule_action(struct action_manager *manager, vdo_action_preamble_fn preamble,
+ vdo_zone_action_fn action, vdo_action_conclusion_fn conclusion,
+ struct vdo_completion *parent);
+
+bool vdo_schedule_operation(struct action_manager *manager,
+ const struct admin_state_code *operation,
+ vdo_action_preamble_fn preamble, vdo_zone_action_fn action,
+ vdo_action_conclusion_fn conclusion,
+ struct vdo_completion *parent);
+
+bool vdo_schedule_operation_with_context(struct action_manager *manager,
+ const struct admin_state_code *operation,
+ vdo_action_preamble_fn preamble,
+ vdo_zone_action_fn action,
+ vdo_action_conclusion_fn conclusion,
+ void *context, struct vdo_completion *parent);
+
+#endif /* VDO_ACTION_MANAGER_H */
diff --git a/drivers/md/dm-vdo/admin-state.c b/drivers/md/dm-vdo/admin-state.c
new file mode 100644
index 000000000000..3f9dba525154
--- /dev/null
+++ b/drivers/md/dm-vdo/admin-state.c
@@ -0,0 +1,506 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#include "admin-state.h"
+
+#include "logger.h"
+#include "memory-alloc.h"
+#include "permassert.h"
+
+#include "completion.h"
+#include "types.h"
+
+/*
+ * The admin state codes. Each code is a private constant descriptor whose flags classify the
+ * state, exported through a pointer with the corresponding VDO_ADMIN_STATE_* name. The functions
+ * in this file compare codes by pointer identity and test these flags; for example,
+ * get_next_state() refuses to start any operation while the current code has .operating set.
+ */
+static const struct admin_state_code VDO_CODE_NORMAL_OPERATION = {
+ .name = "VDO_ADMIN_STATE_NORMAL_OPERATION",
+ .normal = true,
+};
+const struct admin_state_code *VDO_ADMIN_STATE_NORMAL_OPERATION = &VDO_CODE_NORMAL_OPERATION;
+static const struct admin_state_code VDO_CODE_OPERATING = {
+ .name = "VDO_ADMIN_STATE_OPERATING",
+ .normal = true,
+ .operating = true,
+};
+const struct admin_state_code *VDO_ADMIN_STATE_OPERATING = &VDO_CODE_OPERATING;
+static const struct admin_state_code VDO_CODE_FORMATTING = {
+ .name = "VDO_ADMIN_STATE_FORMATTING",
+ .operating = true,
+ .loading = true,
+};
+const struct admin_state_code *VDO_ADMIN_STATE_FORMATTING = &VDO_CODE_FORMATTING;
+static const struct admin_state_code VDO_CODE_PRE_LOADING = {
+ .name = "VDO_ADMIN_STATE_PRE_LOADING",
+ .operating = true,
+ .loading = true,
+};
+const struct admin_state_code *VDO_ADMIN_STATE_PRE_LOADING = &VDO_CODE_PRE_LOADING;
+static const struct admin_state_code VDO_CODE_PRE_LOADED = {
+ .name = "VDO_ADMIN_STATE_PRE_LOADED",
+};
+const struct admin_state_code *VDO_ADMIN_STATE_PRE_LOADED = &VDO_CODE_PRE_LOADED;
+static const struct admin_state_code VDO_CODE_LOADING = {
+ .name = "VDO_ADMIN_STATE_LOADING",
+ .normal = true,
+ .operating = true,
+ .loading = true,
+};
+const struct admin_state_code *VDO_ADMIN_STATE_LOADING = &VDO_CODE_LOADING;
+static const struct admin_state_code VDO_CODE_LOADING_FOR_RECOVERY = {
+ .name = "VDO_ADMIN_STATE_LOADING_FOR_RECOVERY",
+ .operating = true,
+ .loading = true,
+};
+const struct admin_state_code *VDO_ADMIN_STATE_LOADING_FOR_RECOVERY =
+ &VDO_CODE_LOADING_FOR_RECOVERY;
+static const struct admin_state_code VDO_CODE_LOADING_FOR_REBUILD = {
+ .name = "VDO_ADMIN_STATE_LOADING_FOR_REBUILD",
+ .operating = true,
+ .loading = true,
+};
+const struct admin_state_code *VDO_ADMIN_STATE_LOADING_FOR_REBUILD = &VDO_CODE_LOADING_FOR_REBUILD;
+static const struct admin_state_code VDO_CODE_WAITING_FOR_RECOVERY = {
+ .name = "VDO_ADMIN_STATE_WAITING_FOR_RECOVERY",
+ .operating = true,
+};
+const struct admin_state_code *VDO_ADMIN_STATE_WAITING_FOR_RECOVERY =
+ &VDO_CODE_WAITING_FOR_RECOVERY;
+static const struct admin_state_code VDO_CODE_NEW = {
+ .name = "VDO_ADMIN_STATE_NEW",
+ .quiescent = true,
+};
+const struct admin_state_code *VDO_ADMIN_STATE_NEW = &VDO_CODE_NEW;
+static const struct admin_state_code VDO_CODE_INITIALIZED = {
+ .name = "VDO_ADMIN_STATE_INITIALIZED",
+};
+const struct admin_state_code *VDO_ADMIN_STATE_INITIALIZED = &VDO_CODE_INITIALIZED;
+static const struct admin_state_code VDO_CODE_RECOVERING = {
+ .name = "VDO_ADMIN_STATE_RECOVERING",
+ .draining = true,
+ .operating = true,
+};
+const struct admin_state_code *VDO_ADMIN_STATE_RECOVERING = &VDO_CODE_RECOVERING;
+static const struct admin_state_code VDO_CODE_REBUILDING = {
+ .name = "VDO_ADMIN_STATE_REBUILDING",
+ .draining = true,
+ .operating = true,
+};
+const struct admin_state_code *VDO_ADMIN_STATE_REBUILDING = &VDO_CODE_REBUILDING;
+static const struct admin_state_code VDO_CODE_SAVING = {
+ .name = "VDO_ADMIN_STATE_SAVING",
+ .draining = true,
+ .quiescing = true,
+ .operating = true,
+};
+const struct admin_state_code *VDO_ADMIN_STATE_SAVING = &VDO_CODE_SAVING;
+static const struct admin_state_code VDO_CODE_SAVED = {
+ .name = "VDO_ADMIN_STATE_SAVED",
+ .quiescent = true,
+};
+const struct admin_state_code *VDO_ADMIN_STATE_SAVED = &VDO_CODE_SAVED;
+static const struct admin_state_code VDO_CODE_SCRUBBING = {
+ .name = "VDO_ADMIN_STATE_SCRUBBING",
+ .draining = true,
+ .loading = true,
+ .operating = true,
+};
+const struct admin_state_code *VDO_ADMIN_STATE_SCRUBBING = &VDO_CODE_SCRUBBING;
+static const struct admin_state_code VDO_CODE_SAVE_FOR_SCRUBBING = {
+ .name = "VDO_ADMIN_STATE_SAVE_FOR_SCRUBBING",
+ .draining = true,
+ .operating = true,
+};
+const struct admin_state_code *VDO_ADMIN_STATE_SAVE_FOR_SCRUBBING = &VDO_CODE_SAVE_FOR_SCRUBBING;
+static const struct admin_state_code VDO_CODE_STOPPING = {
+ .name = "VDO_ADMIN_STATE_STOPPING",
+ .draining = true,
+ .quiescing = true,
+ .operating = true,
+};
+const struct admin_state_code *VDO_ADMIN_STATE_STOPPING = &VDO_CODE_STOPPING;
+static const struct admin_state_code VDO_CODE_STOPPED = {
+ .name = "VDO_ADMIN_STATE_STOPPED",
+ .quiescent = true,
+};
+const struct admin_state_code *VDO_ADMIN_STATE_STOPPED = &VDO_CODE_STOPPED;
+static const struct admin_state_code VDO_CODE_SUSPENDING = {
+ .name = "VDO_ADMIN_STATE_SUSPENDING",
+ .draining = true,
+ .quiescing = true,
+ .operating = true,
+};
+const struct admin_state_code *VDO_ADMIN_STATE_SUSPENDING = &VDO_CODE_SUSPENDING;
+static const struct admin_state_code VDO_CODE_SUSPENDED = {
+ .name = "VDO_ADMIN_STATE_SUSPENDED",
+ .quiescent = true,
+};
+const struct admin_state_code *VDO_ADMIN_STATE_SUSPENDED = &VDO_CODE_SUSPENDED;
+static const struct admin_state_code VDO_CODE_SUSPENDED_OPERATION = {
+ .name = "VDO_ADMIN_STATE_SUSPENDED_OPERATION",
+ .operating = true,
+};
+const struct admin_state_code *VDO_ADMIN_STATE_SUSPENDED_OPERATION = &VDO_CODE_SUSPENDED_OPERATION;
+static const struct admin_state_code VDO_CODE_RESUMING = {
+ .name = "VDO_ADMIN_STATE_RESUMING",
+ .operating = true,
+};
+const struct admin_state_code *VDO_ADMIN_STATE_RESUMING = &VDO_CODE_RESUMING;
+
+/**
+ * get_next_state() - Determine the state which should be set after a given operation completes
+ *                    based on the operation and the current state.
+ * @state: The admin state on which the operation would be started.
+ * @operation: The operation to be started.
+ *
+ * Nothing may be started while the current state is itself an operation. Saving, suspending, and
+ * stopping may only be started from normal operation; pre-loading only from the initialized
+ * state; and a suspended operation only from the suspended or saved states, to which it then
+ * returns. Every other operation ends in normal operation.
+ *
+ * Return: The state to set when the operation completes or NULL if the operation can not be
+ *         started in the current state.
+ */
+static const struct admin_state_code *get_next_state(const struct admin_state *state,
+						     const struct admin_state_code *operation)
+{
+	const struct admin_state_code *current = vdo_get_admin_state_code(state);
+	bool from_normal;
+
+	if (current->operating)
+		return NULL;
+
+	from_normal = (current == VDO_ADMIN_STATE_NORMAL_OPERATION);
+
+	if (operation == VDO_ADMIN_STATE_SAVING)
+		return (from_normal ? VDO_ADMIN_STATE_SAVED : NULL);
+
+	if (operation == VDO_ADMIN_STATE_SUSPENDING)
+		return (from_normal ? VDO_ADMIN_STATE_SUSPENDED : NULL);
+
+	if (operation == VDO_ADMIN_STATE_STOPPING)
+		return (from_normal ? VDO_ADMIN_STATE_STOPPED : NULL);
+
+	if (operation == VDO_ADMIN_STATE_PRE_LOADING) {
+		if (current == VDO_ADMIN_STATE_INITIALIZED)
+			return VDO_ADMIN_STATE_PRE_LOADED;
+
+		return NULL;
+	}
+
+	if (operation == VDO_ADMIN_STATE_SUSPENDED_OPERATION) {
+		/* A suspended operation leaves the state as it found it. */
+		if ((current == VDO_ADMIN_STATE_SUSPENDED) || (current == VDO_ADMIN_STATE_SAVED))
+			return current;
+
+		return NULL;
+	}
+
+	return VDO_ADMIN_STATE_NORMAL_OPERATION;
+}
+
+/**
+ * vdo_finish_operation() - Finish the current operation.
+ * @state: The admin state whose operation is to be finished.
+ * @result: The result to record on the operation's waiter, if there is one.
+ *
+ * Will notify the operation waiter if there is one. This method should be used for operations
+ * started with vdo_start_operation(). For operations which were started with vdo_start_draining(),
+ * use vdo_finish_draining() instead.
+ *
+ * If this is called while begin_operation() is still running the operation's initiator, the state
+ * change and the notification of the waiter are deferred: only the complete flag is set, and
+ * begin_operation() calls this function again once the initiator returns.
+ *
+ * Return: true if there was an operation to finish.
+ */
+bool vdo_finish_operation(struct admin_state *state, int result)
+{
+ if (!vdo_get_admin_state_code(state)->operating)
+ return false;
+
+ /* Record whether this call arrived while the initiator was still running. */
+ state->complete = state->starting;
+ if (state->waiter != NULL)
+ vdo_set_completion_result(state->waiter, result);
+
+ if (!state->starting) {
+ /* Move to the state chosen when the operation began, then release the waiter. */
+ vdo_set_admin_state_code(state, state->next_state);
+ if (state->waiter != NULL)
+ vdo_launch_completion(vdo_forget(state->waiter));
+ }
+
+ return true;
+}
+
+/**
+ * begin_operation() - Begin an operation if it may be started given the current state.
+ * @state: The admin state on which to begin the operation.
+ * @operation: The operation to begin.
+ * @waiter: A completion to notify when the operation is complete; may be NULL.
+ * @initiator: The vdo_admin_initiator_fn to call if the operation may begin; may be NULL.
+ *
+ * If the operation can not be begun, the error is logged and the waiter, if there is one, is
+ * continued with that error.
+ *
+ * Return: VDO_SUCCESS or an error.
+ */
+static int __must_check begin_operation(struct admin_state *state,
+ const struct admin_state_code *operation,
+ struct vdo_completion *waiter,
+ vdo_admin_initiator_fn initiator)
+{
+ int result;
+ const struct admin_state_code *next_state = get_next_state(state, operation);
+
+ if (next_state == NULL) {
+ result = vdo_log_error_strerror(VDO_INVALID_ADMIN_STATE,
+ "Can't start %s from %s",
+ operation->name,
+ vdo_get_admin_state_code(state)->name);
+ } else if (state->waiter != NULL) {
+ result = vdo_log_error_strerror(VDO_COMPONENT_BUSY,
+ "Can't start %s with extant waiter",
+ operation->name);
+ } else {
+ state->waiter = waiter;
+ state->next_state = next_state;
+ vdo_set_admin_state_code(state, operation);
+ if (initiator != NULL) {
+ /*
+ * While starting is set, a call to vdo_finish_operation() made from within the
+ * initiator only sets the complete flag; the operation is then actually finished
+ * here, after the initiator has returned.
+ */
+ state->starting = true;
+ initiator(state);
+ state->starting = false;
+ if (state->complete)
+ vdo_finish_operation(state, VDO_SUCCESS);
+ }
+
+ return VDO_SUCCESS;
+ }
+
+ /* Only the two failure branches reach this point. */
+ if (waiter != NULL)
+ vdo_continue_completion(waiter, result);
+
+ return result;
+}
+
+/**
+ * start_operation() - Start an operation if it may be started given the current state.
+ * @state: The admin state on which to start the operation.
+ * @operation: The operation to start.
+ * @waiter: A completion to notify when the operation is complete.
+ * @initiator: The vdo_admin_initiator_fn to call if the operation may begin; may be NULL.
+ *
+ * This is begin_operation() with its result reduced to a boolean.
+ *
+ * Return: true if the operation was started.
+ */
+static inline bool __must_check start_operation(struct admin_state *state,
+ const struct admin_state_code *operation,
+ struct vdo_completion *waiter,
+ vdo_admin_initiator_fn initiator)
+{
+ return (begin_operation(state, operation, waiter, initiator) == VDO_SUCCESS);
+}
+
+/**
+ * check_code() - Check the result of a state validation.
+ * @valid: true if the code is of an appropriate type.
+ * @code: The code which was validated; only used for logging when the validation failed.
+ * @what: What the code failed to be, for logging.
+ * @waiter: The completion to notify of the error; may be NULL.
+ *
+ * If the validation failed, log an invalid state error and, if there is a waiter, continue it
+ * with that error.
+ *
+ * Return: The result of the check, that is, @valid.
+ */
+static bool check_code(bool valid, const struct admin_state_code *code, const char *what,
+		       struct vdo_completion *waiter)
+{
+	if (!valid) {
+		int result = vdo_log_error_strerror(VDO_INVALID_ADMIN_STATE,
+						    "%s is not a %s", code->name, what);
+
+		if (waiter != NULL)
+			vdo_continue_completion(waiter, result);
+	}
+
+	return valid;
+}
+
+/**
+ * assert_vdo_drain_operation() - Check that an operation is a drain.
+ * @operation: The operation to check; it is a drain if its draining flag is set.
+ * @waiter: The completion to finish with an error if the operation is not a drain.
+ *
+ * Return: true if the specified operation is a drain.
+ */
+static bool __must_check assert_vdo_drain_operation(const struct admin_state_code *operation,
+ struct vdo_completion *waiter)
+{
+ return check_code(operation->draining, operation, "drain operation", waiter);
+}
+
+/**
+ * vdo_start_draining() - Initiate a drain operation if the current state permits it.
+ * @state: The admin state to drain.
+ * @operation: The type of drain to initiate.
+ * @waiter: The completion to notify when the drain is complete.
+ * @initiator: The vdo_admin_initiator_fn to call if the operation may begin; may be NULL.
+ *
+ * If the state is already quiescent, no drain is started and the waiter is launched right away
+ * without an error being set here.
+ *
+ * NOTE(review): unlike check_code(), the quiescent and not-normal paths below hand @waiter to the
+ * completion functions without a NULL check, so @waiter presumably must not be NULL -- confirm
+ * against the callers.
+ *
+ * Return: true if the drain was initiated, if not the waiter will be notified.
+ */
+bool vdo_start_draining(struct admin_state *state,
+ const struct admin_state_code *operation,
+ struct vdo_completion *waiter, vdo_admin_initiator_fn initiator)
+{
+ const struct admin_state_code *code = vdo_get_admin_state_code(state);
+
+ if (!assert_vdo_drain_operation(operation, waiter))
+ return false;
+
+ if (code->quiescent) {
+ /* There is nothing to drain. */
+ vdo_launch_completion(waiter);
+ return false;
+ }
+
+ if (!code->normal) {
+ vdo_log_error_strerror(VDO_INVALID_ADMIN_STATE, "can't start %s from %s",
+ operation->name, code->name);
+ vdo_continue_completion(waiter, VDO_INVALID_ADMIN_STATE);
+ return false;
+ }
+
+ return start_operation(state, operation, waiter, initiator);
+}
+
+/**
+ * vdo_finish_draining() - Finish a drain operation if one was in progress.
+ * @state: The admin state which may be draining.
+ *
+ * This is vdo_finish_draining_with_result() with a result of VDO_SUCCESS.
+ *
+ * Return: true if the state was draining; will notify the waiter if so.
+ */
+bool vdo_finish_draining(struct admin_state *state)
+{
+ return vdo_finish_draining_with_result(state, VDO_SUCCESS);
+}
+
+/**
+ * vdo_finish_draining_with_result() - Finish a drain operation with a status code.
+ * @state: The admin state which may be draining.
+ * @result: The result of the drain operation.
+ *
+ * Return: true if the state was draining; will notify the waiter if so.
+ */
+bool vdo_finish_draining_with_result(struct admin_state *state, int result)
+{
+	/* Leave the state alone unless a drain is actually in progress. */
+	if (!vdo_is_state_draining(state))
+		return false;
+
+	return vdo_finish_operation(state, result);
+}
+
+/**
+ * vdo_assert_load_operation() - Check that an operation is a load.
+ * @operation: The operation to check; it is a load if its loading flag is set.
+ * @waiter: The completion to finish with an error if the operation is not a load.
+ *
+ * Return: true if the specified operation is a load.
+ */
+bool vdo_assert_load_operation(const struct admin_state_code *operation,
+ struct vdo_completion *waiter)
+{
+ return check_code(operation->loading, operation, "load operation", waiter);
+}
+
+/**
+ * vdo_start_loading() - Initiate a load operation if the current state permits it.
+ * @state: The admin state to load.
+ * @operation: The type of load to initiate.
+ * @waiter: The completion to notify when the load is complete; may be NULL.
+ * @initiator: The vdo_admin_initiator_fn to call if the operation may begin; may be NULL.
+ *
+ * Return: true if the load was initiated, if not the waiter will be notified.
+ */
+bool vdo_start_loading(struct admin_state *state,
+		       const struct admin_state_code *operation,
+		       struct vdo_completion *waiter, vdo_admin_initiator_fn initiator)
+{
+	/* Refuse anything which is not a load before touching the state. */
+	if (!vdo_assert_load_operation(operation, waiter))
+		return false;
+
+	return start_operation(state, operation, waiter, initiator);
+}
+
+/**
+ * vdo_finish_loading() - Finish a load operation if one was in progress.
+ * @state: The admin_state which may be loading.
+ *
+ * This is vdo_finish_loading_with_result() with a result of VDO_SUCCESS.
+ *
+ * Return: true if the state was loading; will notify the waiter if so.
+ */
+bool vdo_finish_loading(struct admin_state *state)
+{
+	const int result = VDO_SUCCESS;
+
+	return vdo_finish_loading_with_result(state, result);
+}
+
+/**
+ * vdo_finish_loading_with_result() - Finish a load operation with a status code.
+ * @state: The admin_state which may be loading.
+ * @result: The result of the load operation.
+ *
+ * Return: false if the state was not loading; otherwise the value returned by
+ *         vdo_finish_operation().
+ */
+bool vdo_finish_loading_with_result(struct admin_state *state, int result)
+{
+	/* Nothing to finish unless the current state code is a loading one. */
+	if (!vdo_is_state_loading(state))
+		return false;
+
+	return vdo_finish_operation(state, result);
+}
+
+/**
+ * assert_vdo_resume_operation() - Check whether an admin_state_code is a resume operation.
+ * @operation: The admin_state_code to check.
+ * @waiter: The completion to notify if the operation is not a resume operation; may be NULL.
+ *
+ * Return: true if the code is a resume operation.
+ */
+static bool __must_check assert_vdo_resume_operation(const struct admin_state_code *operation,
+						     struct vdo_completion *waiter)
+{
+	/* VDO_ADMIN_STATE_RESUMING is the only code accepted as a resume. */
+	const bool is_resume = (operation == VDO_ADMIN_STATE_RESUMING);
+
+	return check_code(is_resume, operation, "resume operation", waiter);
+}
+
+/**
+ * vdo_start_resuming() - Initiate a resume operation if the current state permits it.
+ * @state: The admin_state on which to start the resume.
+ * @operation: The type of resume to start.
+ * @waiter: The completion to notify when the resume is complete (may be NULL).
+ * @initiator: The vdo_admin_initiator_fn to call if the operation may begin; may be NULL.
+ *
+ * Return: true if the resume was initiated, if not the waiter will be notified.
+ */
+bool vdo_start_resuming(struct admin_state *state,
+			const struct admin_state_code *operation,
+			struct vdo_completion *waiter, vdo_admin_initiator_fn initiator)
+{
+	/* Reject anything which is not a resume before touching the state. */
+	if (!assert_vdo_resume_operation(operation, waiter))
+		return false;
+
+	return start_operation(state, operation, waiter, initiator);
+}
+
+/**
+ * vdo_finish_resuming() - Finish a resume operation if one was in progress.
+ * @state: The admin_state which may be resuming.
+ *
+ * This is vdo_finish_resuming_with_result() with a result of VDO_SUCCESS.
+ *
+ * Return: true if the state was resuming; will notify the waiter if so.
+ */
+bool vdo_finish_resuming(struct admin_state *state)
+{
+	const int result = VDO_SUCCESS;
+
+	return vdo_finish_resuming_with_result(state, result);
+}
+
+/**
+ * vdo_finish_resuming_with_result() - Finish a resume operation with a status code.
+ * @state: The admin_state which may be resuming.
+ * @result: The result of the resume operation.
+ *
+ * Return: false if the state was not resuming; otherwise the value returned by
+ *         vdo_finish_operation().
+ */
+bool vdo_finish_resuming_with_result(struct admin_state *state, int result)
+{
+	/* Nothing to finish unless the current state code is VDO_ADMIN_STATE_RESUMING. */
+	if (!vdo_is_state_resuming(state))
+		return false;
+
+	return vdo_finish_operation(state, result);
+}
+
+/**
+ * vdo_resume_if_quiescent() - Change the state to normal operation if the current state is
+ *                             quiescent.
+ * @state: The admin_state to resume.
+ *
+ * Return: VDO_SUCCESS if the state resumed, VDO_INVALID_ADMIN_STATE otherwise.
+ */
+int vdo_resume_if_quiescent(struct admin_state *state)
+{
+	if (vdo_is_state_quiescent(state)) {
+		vdo_set_admin_state_code(state, VDO_ADMIN_STATE_NORMAL_OPERATION);
+		return VDO_SUCCESS;
+	}
+
+	/* Any non-quiescent state is left untouched. */
+	return VDO_INVALID_ADMIN_STATE;
+}
+
+/**
+ * vdo_start_operation() - Attempt to start an operation.
+ * @state: The admin_state on which to start the operation.
+ * @operation: The operation to start.
+ *
+ * This is vdo_start_operation_with_waiter() with neither a waiter nor an initiator.
+ *
+ * Return: VDO_SUCCESS if the operation was started, VDO_INVALID_ADMIN_STATE if not
+ */
+int vdo_start_operation(struct admin_state *state,
+			const struct admin_state_code *operation)
+{
+	struct vdo_completion *no_waiter = NULL;
+	vdo_admin_initiator_fn no_initiator = NULL;
+
+	return vdo_start_operation_with_waiter(state, operation, no_waiter, no_initiator);
+}
+
+/**
+ * vdo_start_operation_with_waiter() - Attempt to start an operation.
+ * @state: The admin_state on which to start the operation.
+ * @operation: The operation to start; its operating flag must be set.
+ * @waiter: The completion to notify when the operation completes or fails to start; may be NULL.
+ * @initiator: The vdo_admin_initiator_fn to call if the operation may begin; may be NULL.
+ *
+ * Return: VDO_SUCCESS if the operation was started, VDO_INVALID_ADMIN_STATE if not
+ */
+int vdo_start_operation_with_waiter(struct admin_state *state,
+				    const struct admin_state_code *operation,
+				    struct vdo_completion *waiter,
+				    vdo_admin_initiator_fn initiator)
+{
+	if (!check_code(operation->operating, operation, "operation", waiter))
+		return VDO_INVALID_ADMIN_STATE;
+
+	return begin_operation(state, operation, waiter, initiator);
+}
diff --git a/drivers/md/dm-vdo/admin-state.h b/drivers/md/dm-vdo/admin-state.h
new file mode 100644
index 000000000000..a7d6ac2c30a6
--- /dev/null
+++ b/drivers/md/dm-vdo/admin-state.h
@@ -0,0 +1,178 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#ifndef VDO_ADMIN_STATE_H
+#define VDO_ADMIN_STATE_H
+
+#include "completion.h"
+#include "types.h"
+
+/*
+ * A description of one administrative state or operation. The boolean fields classify the code;
+ * the vdo_is_state_*() predicates and the operation checks in admin-state.c test them.
+ */
+struct admin_state_code {
+ /* The name of the code, used in log messages */
+ const char *name;
+ /* Normal operation, data_vios may be active */
+ bool normal;
+ /* I/O is draining, new requests should not start */
+ bool draining;
+ /* This is a startup time operation */
+ bool loading;
+ /* The next state will be quiescent */
+ bool quiescing;
+ /* The VDO is quiescent, there should be no I/O */
+ bool quiescent;
+ /* Whether an operation is in progress and so no other operation may be started */
+ bool operating;
+};
+
+extern const struct admin_state_code *VDO_ADMIN_STATE_NORMAL_OPERATION;
+extern const struct admin_state_code *VDO_ADMIN_STATE_OPERATING;
+extern const struct admin_state_code *VDO_ADMIN_STATE_FORMATTING;
+extern const struct admin_state_code *VDO_ADMIN_STATE_PRE_LOADING;
+extern const struct admin_state_code *VDO_ADMIN_STATE_PRE_LOADED;
+extern const struct admin_state_code *VDO_ADMIN_STATE_LOADING;
+extern const struct admin_state_code *VDO_ADMIN_STATE_LOADING_FOR_RECOVERY;
+extern const struct admin_state_code *VDO_ADMIN_STATE_LOADING_FOR_REBUILD;
+extern const struct admin_state_code *VDO_ADMIN_STATE_WAITING_FOR_RECOVERY;
+extern const struct admin_state_code *VDO_ADMIN_STATE_NEW;
+extern const struct admin_state_code *VDO_ADMIN_STATE_INITIALIZED;
+extern const struct admin_state_code *VDO_ADMIN_STATE_RECOVERING;
+extern const struct admin_state_code *VDO_ADMIN_STATE_REBUILDING;
+extern const struct admin_state_code *VDO_ADMIN_STATE_SAVING;
+extern const struct admin_state_code *VDO_ADMIN_STATE_SAVED;
+extern const struct admin_state_code *VDO_ADMIN_STATE_SCRUBBING;
+extern const struct admin_state_code *VDO_ADMIN_STATE_SAVE_FOR_SCRUBBING;
+extern const struct admin_state_code *VDO_ADMIN_STATE_STOPPING;
+extern const struct admin_state_code *VDO_ADMIN_STATE_STOPPED;
+extern const struct admin_state_code *VDO_ADMIN_STATE_SUSPENDING;
+extern const struct admin_state_code *VDO_ADMIN_STATE_SUSPENDED;
+extern const struct admin_state_code *VDO_ADMIN_STATE_SUSPENDED_OPERATION;
+extern const struct admin_state_code *VDO_ADMIN_STATE_RESUMING;
+
+/* The administrative state of a component, together with the bookkeeping for one operation. */
+struct admin_state {
+ /* The current administrative state; read and written with READ_ONCE()/WRITE_ONCE() */
+ const struct admin_state_code *current_state;
+ /* The next administrative state (when the current operation finishes) */
+ const struct admin_state_code *next_state;
+ /* A completion waiting on a state change */
+ struct vdo_completion *waiter;
+ /* Whether an operation is being initiated */
+ bool starting;
+ /* Whether an operation has completed in the initiator */
+ bool complete;
+};
+
+/**
+ * typedef vdo_admin_initiator_fn - A method to be called once an admin operation may be initiated.
+ * @state: The admin_state handed to the initiator.
+ */
+typedef void (*vdo_admin_initiator_fn)(struct admin_state *state);
+
+/**
+ * vdo_get_admin_state_code() - Get the current admin state code.
+ * @state: The admin_state to query.
+ *
+ * The code is fetched with READ_ONCE(), pairing with the WRITE_ONCE() in
+ * vdo_set_admin_state_code().
+ *
+ * Return: The current admin_state_code.
+ */
+static inline const struct admin_state_code * __must_check
+vdo_get_admin_state_code(const struct admin_state *state)
+{
+ return READ_ONCE(state->current_state);
+}
+
+/**
+ * vdo_set_admin_state_code() - Set the current admin state code.
+ * @state: The admin_state to modify.
+ * @code: The admin_state_code to install as the current state.
+ *
+ * This function should be used primarily for initialization and by the admin state internals.
+ * Most uses should go through the operation interfaces.
+ */
+static inline void vdo_set_admin_state_code(struct admin_state *state,
+ const struct admin_state_code *code)
+{
+ WRITE_ONCE(state->current_state, code);
+}
+
+/* Check whether the current state code is flagged as normal operation. */
+static inline bool __must_check vdo_is_state_normal(const struct admin_state *state)
+{
+	const struct admin_state_code *code = vdo_get_admin_state_code(state);
+
+	return code->normal;
+}
+
+/* Check whether the current state code is VDO_ADMIN_STATE_SUSPENDING. */
+static inline bool __must_check vdo_is_state_suspending(const struct admin_state *state)
+{
+	const struct admin_state_code *code = vdo_get_admin_state_code(state);
+
+	return (code == VDO_ADMIN_STATE_SUSPENDING);
+}
+
+/* Check whether the current state code is VDO_ADMIN_STATE_SAVING. */
+static inline bool __must_check vdo_is_state_saving(const struct admin_state *state)
+{
+	const struct admin_state_code *code = vdo_get_admin_state_code(state);
+
+	return (code == VDO_ADMIN_STATE_SAVING);
+}
+
+/* Check whether the current state code is VDO_ADMIN_STATE_SAVED. */
+static inline bool __must_check vdo_is_state_saved(const struct admin_state *state)
+{
+	const struct admin_state_code *code = vdo_get_admin_state_code(state);
+
+	return (code == VDO_ADMIN_STATE_SAVED);
+}
+
+/* Check whether the current state code is flagged as draining. */
+static inline bool __must_check vdo_is_state_draining(const struct admin_state *state)
+{
+	const struct admin_state_code *code = vdo_get_admin_state_code(state);
+
+	return code->draining;
+}
+
+/* Check whether the current state code is flagged as loading. */
+static inline bool __must_check vdo_is_state_loading(const struct admin_state *state)
+{
+	const struct admin_state_code *code = vdo_get_admin_state_code(state);
+
+	return code->loading;
+}
+
+/* Check whether the current state code is VDO_ADMIN_STATE_RESUMING. */
+static inline bool __must_check vdo_is_state_resuming(const struct admin_state *state)
+{
+	const struct admin_state_code *code = vdo_get_admin_state_code(state);
+
+	return (code == VDO_ADMIN_STATE_RESUMING);
+}
+
+/* Check whether the current state code is either FORMATTING or LOADING. */
+static inline bool __must_check vdo_is_state_clean_load(const struct admin_state *state)
+{
+	const struct admin_state_code *code = vdo_get_admin_state_code(state);
+
+	if (code == VDO_ADMIN_STATE_FORMATTING)
+		return true;
+
+	return (code == VDO_ADMIN_STATE_LOADING);
+}
+
+/* Check whether the current state code is flagged as quiescing. */
+static inline bool __must_check vdo_is_state_quiescing(const struct admin_state *state)
+{
+	const struct admin_state_code *code = vdo_get_admin_state_code(state);
+
+	return code->quiescing;
+}
+
+/* Check whether the current state code is flagged as quiescent. */
+static inline bool __must_check vdo_is_state_quiescent(const struct admin_state *state)
+{
+	const struct admin_state_code *code = vdo_get_admin_state_code(state);
+
+	return code->quiescent;
+}
+
+bool __must_check vdo_assert_load_operation(const struct admin_state_code *operation,
+ struct vdo_completion *waiter);
+
+bool vdo_start_loading(struct admin_state *state,
+ const struct admin_state_code *operation,
+ struct vdo_completion *waiter, vdo_admin_initiator_fn initiator);
+
+bool vdo_finish_loading(struct admin_state *state);
+
+bool vdo_finish_loading_with_result(struct admin_state *state, int result);
+
+bool vdo_start_resuming(struct admin_state *state,
+ const struct admin_state_code *operation,
+ struct vdo_completion *waiter, vdo_admin_initiator_fn initiator);
+
+bool vdo_finish_resuming(struct admin_state *state);
+
+bool vdo_finish_resuming_with_result(struct admin_state *state, int result);
+
+int vdo_resume_if_quiescent(struct admin_state *state);
+
+bool vdo_start_draining(struct admin_state *state,
+ const struct admin_state_code *operation,
+ struct vdo_completion *waiter, vdo_admin_initiator_fn initiator);
+
+bool vdo_finish_draining(struct admin_state *state);
+
+bool vdo_finish_draining_with_result(struct admin_state *state, int result);
+
+int vdo_start_operation(struct admin_state *state,
+ const struct admin_state_code *operation);
+
+int vdo_start_operation_with_waiter(struct admin_state *state,
+ const struct admin_state_code *operation,
+ struct vdo_completion *waiter,
+ vdo_admin_initiator_fn initiator);
+
+bool vdo_finish_operation(struct admin_state *state, int result);
+
+#endif /* VDO_ADMIN_STATE_H */
diff --git a/drivers/md/dm-vdo/block-map.c b/drivers/md/dm-vdo/block-map.c
new file mode 100644
index 000000000000..a0a7c1bd634e
--- /dev/null
+++ b/drivers/md/dm-vdo/block-map.c
@@ -0,0 +1,3318 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#include "block-map.h"
+
+#include <linux/bio.h>
+#include <linux/ratelimit.h>
+
+#include "errors.h"
+#include "logger.h"
+#include "memory-alloc.h"
+#include "permassert.h"
+
+#include "action-manager.h"
+#include "admin-state.h"
+#include "completion.h"
+#include "constants.h"
+#include "data-vio.h"
+#include "encodings.h"
+#include "io-submitter.h"
+#include "physical-zone.h"
+#include "recovery-journal.h"
+#include "slab-depot.h"
+#include "status-codes.h"
+#include "types.h"
+#include "vdo.h"
+#include "vio.h"
+#include "wait-queue.h"
+
+/**
+ * DOC: Block map eras
+ *
+ * The block map era, or maximum age, is used as follows:
+ *
+ * Each block map page, when dirty, records the earliest recovery journal block sequence number of
+ * the changes reflected in that dirty block. Sequence numbers are classified into eras: every
+ * @maximum_age sequence numbers, we switch to a new era. Block map pages are assigned to eras
+ * according to the sequence number they record.
+ *
+ * In the current (newest) era, block map pages are not written unless there is cache pressure. In
+ * the next oldest era, each time a new journal block is written 1/@maximum_age of the pages in
+ * this era are issued for write. In all older eras, pages are issued for write immediately.
+ */
+
+/*
+ * The coordinates of a slot: a root index, a height, a page index, and a slot number. The struct
+ * is packed, and union page_key overlays it with a u64.
+ * NOTE(review): presumably these locate an entry within the block map forest; confirm at the
+ * users of union page_key.
+ */
+struct page_descriptor {
+ root_count_t root_index;
+ height_t height;
+ page_number_t page_index;
+ slot_number_t slot;
+} __packed;
+
+/* Overlays a page_descriptor with a u64 so the descriptor can be handled as one integer key. */
+union page_key {
+ struct page_descriptor descriptor;
+ u64 key;
+};
+
+/*
+ * A block map zone paired with a generation number.
+ * NOTE(review): the name suggests this is the context handed to a waiter callback which writes
+ * pages not re-dirtied since the given generation; the users are outside this section.
+ */
+struct write_if_not_dirtied_context {
+ struct block_map_zone *zone;
+ u8 generation;
+};
+
+/* One tree_page pointer per level of a tree, VDO_BLOCK_MAP_TREE_HEIGHT levels in all. */
+struct block_map_tree_segment {
+ struct tree_page *levels[VDO_BLOCK_MAP_TREE_HEIGHT];
+};
+
+/*
+ * A single tree, held as a pointer to its segments.
+ * NOTE(review): presumably an array with forest->segments entries; confirm at the allocation.
+ */
+struct block_map_tree {
+ struct block_map_tree_segment *segments;
+};
+
+/*
+ * A collection of block map trees belonging to one block_map. The trees are stored in a
+ * flexible array member, so a forest must be allocated with room for them.
+ * NOTE(review): the meaning of @segments, @boundaries and @pages is established by code outside
+ * this section.
+ */
+struct forest {
+ struct block_map *map;
+ size_t segments;
+ struct boundary *boundaries;
+ struct tree_page **pages;
+ struct block_map_tree trees[];
+};
+
+/* A position at one level of a tree: a page index and a slot within that page. */
+struct cursor_level {
+ page_number_t page_index;
+ slot_number_t slot;
+};
+
+struct cursors;
+
+/*
+ * The state of a traversal of one block_map_tree: the tree, the current height, a position for
+ * each of the VDO_BLOCK_MAP_TREE_HEIGHT levels, and a pooled vio. The embedded vdo_waiter allows
+ * the cursor to be placed on a wait queue.
+ * NOTE(review): presumably the cursor waits for a vio from the parent's vio_pool; confirm at the
+ * traversal code.
+ */
+struct cursor {
+ struct vdo_waiter waiter;
+ struct block_map_tree *tree;
+ height_t height;
+ struct cursors *parent;
+ struct boundary boundary;
+ struct cursor_level levels[VDO_BLOCK_MAP_TREE_HEIGHT];
+ struct pooled_vio *vio;
+};
+
+/*
+ * The shared state for a set of cursors: the zone, the vio pool, the per-entry callback, the
+ * completion associated with the traversal, and a count of active roots. The cursors themselves
+ * are stored in a flexible array member.
+ * NOTE(review): presumably there is one cursor per tree root and @completion is notified once
+ * @active_roots reaches zero; confirm at the traversal code.
+ */
+struct cursors {
+ struct block_map_zone *zone;
+ struct vio_pool *pool;
+ vdo_entry_callback_fn entry_callback;
+ struct vdo_completion *completion;
+ root_count_t active_roots;
+ struct cursor cursors[];
+};
+
+/* The pbn stored in a page_info which does not currently hold any page. */
+static const physical_block_number_t NO_PAGE = 0xFFFFFFFFFFFFFFFF;
+
+/*
+ * Used to indicate that the page holding the location of a tree root has been "loaded".
+ * Note that this has the same numeric value as NO_PAGE.
+ */
+static const physical_block_number_t VDO_INVALID_PBN = 0xFFFFFFFFFFFFFFFF;
+
+/* The block map entry for an unmapped block: unmapped state with the pbn VDO_ZERO_BLOCK. */
+const struct block_map_entry UNMAPPED_BLOCK_MAP_ENTRY = {
+ .mapping_state = VDO_MAPPING_STATE_UNMAPPED & 0x0F,
+ .pbn_high_nibble = 0,
+ .pbn_low_word = __cpu_to_le32(VDO_ZERO_BLOCK & UINT_MAX),
+};
+
+/*
+ * Cache pressure is logged once every LOG_INTERVAL reports, and the report counter wraps back
+ * to zero on reaching DISPLAY_INTERVAL (see report_cache_pressure()).
+ */
+#define LOG_INTERVAL 4000
+#define DISPLAY_INTERVAL 100000
+
+/*
+ * For adjusting VDO page cache statistic fields which are only mutated on the logical zone thread.
+ * Prevents any compiler shenanigans from affecting other threads reading those stats.
+ *
+ * The value argument is evaluated twice, so it must not have side effects.
+ */
+#define ADD_ONCE(value, delta) WRITE_ONCE(value, (value) + (delta))
+
+/* Check whether a page is in the PS_DIRTY state. */
+static inline bool is_dirty(const struct page_info *info)
+{
+	return (info->state == PS_DIRTY);
+}
+
+/* Check whether a page is in the cache and not in flight: PS_RESIDENT or PS_DIRTY. */
+static inline bool is_present(const struct page_info *info)
+{
+	switch (info->state) {
+	case PS_RESIDENT:
+	case PS_DIRTY:
+		return true;
+
+	default:
+		return false;
+	}
+}
+
+/* Check whether a page has I/O in progress: PS_INCOMING or PS_OUTGOING. */
+static inline bool is_in_flight(const struct page_info *info)
+{
+	switch (info->state) {
+	case PS_INCOMING:
+	case PS_OUTGOING:
+		return true;
+
+	default:
+		return false;
+	}
+}
+
+/* Check whether a page is in the PS_INCOMING state. */
+static inline bool is_incoming(const struct page_info *info)
+{
+	return (info->state == PS_INCOMING);
+}
+
+/* Check whether a page is in the PS_OUTGOING state. */
+static inline bool is_outgoing(const struct page_info *info)
+{
+	return (info->state == PS_OUTGOING);
+}
+
+/* Check whether a page is present or outgoing. */
+static inline bool is_valid(const struct page_info *info)
+{
+	if (is_present(info))
+		return true;
+
+	return is_outgoing(info);
+}
+
+/*
+ * Get the buffer for a page_info. The info's index within cache->infos selects the
+ * VDO_BLOCK_SIZE-sized slice of cache->pages.
+ */
+static char *get_page_buffer(struct page_info *info)
+{
+	struct vdo_page_cache *cache = info->cache;
+	ptrdiff_t index = info - cache->infos;
+
+	return &cache->pages[index * VDO_BLOCK_SIZE];
+}
+
+/*
+ * Convert a waiter to the vdo_page_completion which contains it, asserting that the result
+ * really is a page completion. A NULL waiter yields NULL.
+ */
+static inline struct vdo_page_completion *page_completion_from_waiter(struct vdo_waiter *waiter)
+{
+	struct vdo_page_completion *page_completion = NULL;
+
+	if (waiter != NULL) {
+		page_completion = container_of(waiter, struct vdo_page_completion, waiter);
+		vdo_assert_completion_type(&page_completion->completion,
+					   VDO_PAGE_COMPLETION);
+	}
+
+	return page_completion;
+}
+
+/**
+ * initialize_info() - Initialize all page info structures and put them on the free list.
+ * @cache: The page cache whose infos are to be initialized.
+ *
+ * Each info is given a metadata vio which uses the info's slice of the cache pages as its
+ * buffer. Initialization stops at the first vio which can not be created.
+ *
+ * Return: VDO_SUCCESS or an error.
+ */
+static int initialize_info(struct vdo_page_cache *cache)
+{
+	size_t i;
+
+	INIT_LIST_HEAD(&cache->free_list);
+	for (i = 0; i < cache->page_count; i++) {
+		struct page_info *info = &cache->infos[i];
+		int result;
+
+		/* The cache must be set before get_page_buffer() can be used on the info. */
+		info->cache = cache;
+		info->pbn = NO_PAGE;
+		info->state = PS_FREE;
+
+		result = create_metadata_vio(cache->vdo, VIO_TYPE_BLOCK_MAP,
+					     VIO_PRIORITY_METADATA, info,
+					     get_page_buffer(info), &info->vio);
+		if (result != VDO_SUCCESS)
+			return result;
+
+		/* The thread ID should never change. */
+		info->vio->completion.callback_thread_id = cache->zone->thread_id;
+
+		INIT_LIST_HEAD(&info->lru_entry);
+		INIT_LIST_HEAD(&info->state_entry);
+		list_add_tail(&info->state_entry, &cache->free_list);
+	}
+
+	return VDO_SUCCESS;
+}
+
+/**
+ * allocate_cache_components() - Allocate components of the cache which require their own
+ *                               allocation.
+ * @cache: The page cache; its page_count must already be set.
+ *
+ * Allocates the page infos, the page buffers, and the page map, then initializes the infos.
+ *
+ * The caller is responsible for all clean up on errors.
+ *
+ * Return: VDO_SUCCESS or an error code.
+ */
+static int __must_check allocate_cache_components(struct vdo_page_cache *cache)
+{
+	u64 cache_bytes;
+	int result;
+
+	result = vdo_allocate(cache->page_count, struct page_info, "page infos",
+			      &cache->infos);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	/* Widen to 64 bits before multiplying so that a large cache can not overflow. */
+	cache_bytes = cache->page_count * (u64) VDO_BLOCK_SIZE;
+	result = vdo_allocate_memory(cache_bytes, VDO_BLOCK_SIZE, "cache pages",
+				     &cache->pages);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	result = vdo_int_map_create(cache->page_count, &cache->page_map);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	return initialize_info(cache);
+}
+
+/**
+ * assert_on_cache_thread() - Assert that a function has been called on the VDO page cache's
+ * thread.
+ * @cache: The page cache.
+ * @function_name: The name of the calling function, for the assertion message.
+ *
+ * The current callback thread is compared with the thread id of the cache's zone. The assertion
+ * is log-only, so nothing is returned to the caller.
+ */
+static inline void assert_on_cache_thread(struct vdo_page_cache *cache,
+ const char *function_name)
+{
+ thread_id_t thread_id = vdo_get_callback_thread_id();
+
+ VDO_ASSERT_LOG_ONLY((thread_id == cache->zone->thread_id),
+ "%s() must only be called on cache thread %d, not thread %d",
+ function_name, cache->zone->thread_id, thread_id);
+}
+
+/**
+ * assert_io_allowed() - Assert that a page cache may issue I/O.
+ * @cache: The page cache.
+ *
+ * I/O is allowed unless the admin state of the cache's zone is quiescent. The assertion is
+ * log-only, so nothing is returned to the caller.
+ */
+static inline void assert_io_allowed(struct vdo_page_cache *cache)
+{
+ VDO_ASSERT_LOG_ONLY(!vdo_is_state_quiescent(&cache->zone->state),
+ "VDO page cache may issue I/O");
+}
+
+/**
+ * report_cache_pressure() - Log and, if enabled, report cache pressure.
+ * @cache: The page cache under pressure.
+ *
+ * The pressure statistic is always incremented. The log message is only considered while there
+ * are more waiters than pages, and is then emitted once every LOG_INTERVAL reports.
+ */
+static void report_cache_pressure(struct vdo_page_cache *cache)
+{
+	ADD_ONCE(cache->stats.cache_pressure, 1);
+	if (cache->waiter_count <= cache->page_count)
+		return;
+
+	if ((cache->pressure_report % LOG_INTERVAL) == 0)
+		vdo_log_info("page cache pressure %u", cache->stats.cache_pressure);
+
+	/* Wrap the report counter so that it stays below DISPLAY_INTERVAL. */
+	cache->pressure_report++;
+	if (cache->pressure_report >= DISPLAY_INTERVAL)
+		cache->pressure_report = 0;
+}
+
+/**
+ * get_page_state_name() - Return the name of a page state.
+ * @state: The page state to name.
+ *
+ * If the page state is invalid a static string is returned and the invalid state is logged.
+ *
+ * Return: A pointer to a static page state name.
+ */
+static const char * __must_check get_page_state_name(enum vdo_page_buffer_state state)
+{
+	/* Indexed by state, so the order must match enum vdo_page_buffer_state. */
+	static const char * const state_names[] = {
+		"FREE", "INCOMING", "FAILED", "RESIDENT", "DIRTY", "OUTGOING"
+	};
+
+	BUILD_BUG_ON(ARRAY_SIZE(state_names) != PAGE_STATE_COUNT);
+
+	if (VDO_ASSERT(state < ARRAY_SIZE(state_names),
+		       "Unknown page_state value %d", state) != VDO_SUCCESS)
+		return "[UNKNOWN PAGE STATE]";
+
+	return state_names[state];
+}
+
+/**
+ * update_counter() - Update the counter associated with a given state.
+ * @info: The page info to count.
+ * @delta: The delta to apply to the counter.
+ *
+ * The counter is selected by the info's current state; an unrecognized state changes nothing.
+ */
+static void update_counter(struct page_info *info, s32 delta)
+{
+	struct block_map_statistics *stats = &info->cache->stats;
+
+	switch (info->state) {
+	case PS_FREE:
+		ADD_ONCE(stats->free_pages, delta);
+		break;
+
+	case PS_INCOMING:
+		ADD_ONCE(stats->incoming_pages, delta);
+		break;
+
+	case PS_OUTGOING:
+		ADD_ONCE(stats->outgoing_pages, delta);
+		break;
+
+	case PS_FAILED:
+		ADD_ONCE(stats->failed_pages, delta);
+		break;
+
+	case PS_RESIDENT:
+		ADD_ONCE(stats->clean_pages, delta);
+		break;
+
+	case PS_DIRTY:
+		ADD_ONCE(stats->dirty_pages, delta);
+		break;
+
+	default:
+		break;
+	}
+}
+
+/**
+ * update_lru() - Update the lru information for an active page.
+ * @info: The page info which was just used.
+ *
+ * The info is moved to the tail of the cache's lru list unless it is already there.
+ */
+static void update_lru(struct page_info *info)
+{
+	struct list_head *lru_list = &info->cache->lru_list;
+
+	if (lru_list->prev == &info->lru_entry)
+		return;
+
+	list_move_tail(&info->lru_entry, lru_list);
+}
+
+/**
+ * set_info_state() - Set the state of a page_info and put it on the right list, adjusting
+ *                    counters.
+ * @info: The page info to update.
+ * @new_state: The state to put the info in.
+ *
+ * Nothing happens if the info is already in the requested state.
+ */
+static void set_info_state(struct page_info *info, enum vdo_page_buffer_state new_state)
+{
+	struct vdo_page_cache *cache = info->cache;
+
+	if (info->state == new_state)
+		return;
+
+	/* Uncount the info under its old state and count it under the new one. */
+	update_counter(info, -1);
+	info->state = new_state;
+	update_counter(info, 1);
+
+	switch (new_state) {
+	case PS_FREE:
+	case PS_FAILED:
+		list_move_tail(&info->state_entry, &cache->free_list);
+		break;
+
+	case PS_OUTGOING:
+		list_move_tail(&info->state_entry, &cache->outgoing_list);
+		break;
+
+	case PS_DIRTY:
+		/* The state entry of a dirty page is left where it is. */
+		break;
+
+	default:
+		list_del_init(&info->state_entry);
+		break;
+	}
+}
+
+/**
+ * set_info_pbn() - Set the pbn for an info, updating the map as needed.
+ * @info: The page info to update.
+ * @pbn: The new pbn of the info, or NO_PAGE to remove the info from the page map.
+ *
+ * Return: VDO_SUCCESS or an error from the assertion or from adding to the page map.
+ */
+static int __must_check set_info_pbn(struct page_info *info, physical_block_number_t pbn)
+{
+	struct vdo_page_cache *cache = info->cache;
+	int result;
+
+	/* Either the new or the old page number must be NO_PAGE. */
+	result = VDO_ASSERT((pbn == NO_PAGE) || (info->pbn == NO_PAGE),
+			    "Must free a page before reusing it.");
+	if (result != VDO_SUCCESS)
+		return result;
+
+	if (info->pbn != NO_PAGE)
+		vdo_int_map_remove(cache->page_map, info->pbn);
+
+	info->pbn = pbn;
+	if (pbn == NO_PAGE)
+		return VDO_SUCCESS;
+
+	return vdo_int_map_put(cache->page_map, pbn, info, true, NULL);
+}
+
+/**
+ * reset_page_info() - Reset page info to represent an unallocated page.
+ * @info: The page info to reset; it must not be busy and must not have waiters.
+ *
+ * Return: VDO_SUCCESS, or an error if an assertion failed or the pbn could not be cleared. The
+ *         info is still freed and removed from the lru list if only clearing the pbn failed.
+ */
+static int reset_page_info(struct page_info *info)
+{
+	int result = VDO_ASSERT(info->busy == 0, "VDO Page must not be busy");
+
+	if (result == VDO_SUCCESS) {
+		result = VDO_ASSERT(!vdo_waitq_has_waiters(&info->waiting),
+				    "VDO Page must not have waiters");
+	}
+
+	if (result != VDO_SUCCESS)
+		return result;
+
+	result = set_info_pbn(info, NO_PAGE);
+	set_info_state(info, PS_FREE);
+	list_del_init(&info->lru_entry);
+	return result;
+}
+
+/**
+ * find_free_page() - Find a free page.
+ * @cache: The page cache to search.
+ *
+ * The page returned has been removed from the free list.
+ *
+ * Return: A pointer to the page info structure (if found), NULL otherwise.
+ */
+static struct page_info * __must_check find_free_page(struct vdo_page_cache *cache)
+{
+	struct page_info *free_info;
+
+	if (list_empty(&cache->free_list))
+		return NULL;
+
+	free_info = list_first_entry(&cache->free_list, struct page_info, state_entry);
+	list_del_init(&free_info->state_entry);
+	return free_info;
+}
+
+/**
+ * find_page() - Find the page info (if any) associated with a given pbn.
+ * @cache: The page cache to search.
+ * @pbn: The absolute physical block number of the page.
+ *
+ * The result of the lookup, even if it is NULL, is remembered in cache->last_found so that a
+ * repeated request for the same page can skip the page map.
+ *
+ * Return: The page info for the page if available, or NULL if not.
+ */
+static struct page_info * __must_check find_page(struct vdo_page_cache *cache,
+						 physical_block_number_t pbn)
+{
+	struct page_info *info = cache->last_found;
+
+	if ((info == NULL) || (info->pbn != pbn)) {
+		info = vdo_int_map_get(cache->page_map, pbn);
+		cache->last_found = info;
+	}
+
+	return info;
+}
+
+/**
+ * select_lru_page() - Determine which page is least recently used.
+ * @cache: The page cache to search.
+ *
+ * Picks the first entry from the front of the lru list which is neither busy nor in flight.
+ * Since whenever we mark a page busy we also move it to the end of the list, it is unlikely
+ * that the entries at the front are busy unless the list is very short, but it is not
+ * impossible.
+ *
+ * Return: A pointer to the info structure for a relevant page, or NULL if no such page can be
+ *         found. The page can be dirty or resident.
+ */
+static struct page_info * __must_check select_lru_page(struct vdo_page_cache *cache)
+{
+	struct page_info *candidate;
+
+	list_for_each_entry(candidate, &cache->lru_list, lru_entry) {
+		if (candidate->busy != 0)
+			continue;
+
+		if (is_in_flight(candidate))
+			continue;
+
+		return candidate;
+	}
+
+	return NULL;
+}
+
+/* ASYNCHRONOUS INTERFACE BEYOND THIS POINT */
+
+/**
+ * complete_with_page() - Helper to complete the VDO Page Completion request successfully.
+ * @info: The page info representing the result page.
+ * @vdo_page_comp: The VDO page completion to complete.
+ *
+ * A writable request requires the page to be present; a read-only request only requires it to
+ * be valid. If the page is not in an acceptable state, the completion fails with VDO_BAD_PAGE.
+ */
+static void complete_with_page(struct page_info *info,
+			       struct vdo_page_completion *vdo_page_comp)
+{
+	const char *requirement;
+	bool available;
+
+	if (vdo_page_comp->writable) {
+		available = is_present(info);
+		requirement = "present";
+	} else {
+		available = is_valid(info);
+		requirement = "valid";
+	}
+
+	if (available) {
+		vdo_page_comp->info = info;
+		vdo_page_comp->ready = true;
+		vdo_finish_completion(&vdo_page_comp->completion);
+		return;
+	}
+
+	vdo_log_error_strerror(VDO_BAD_PAGE,
+			       "Requested cache page %llu in state %s is not %s",
+			       (unsigned long long) info->pbn,
+			       get_page_state_name(info->state), requirement);
+	vdo_fail_completion(&vdo_page_comp->completion, VDO_BAD_PAGE);
+}
+
+/**
+ * complete_waiter_with_error() - Complete a page completion with an error code.
+ * @waiter: The page completion, as a waiter.
+ * @result_ptr: A pointer to the error code.
+ *
+ * Implements waiter_callback_fn.
+ */
+static void complete_waiter_with_error(struct vdo_waiter *waiter, void *result_ptr)
+{
+	struct vdo_page_completion *page_completion = page_completion_from_waiter(waiter);
+	int error = *((int *) result_ptr);
+
+	vdo_fail_completion(&page_completion->completion, error);
+}
+
+/**
+ * complete_waiter_with_page() - Complete a page completion with a page.
+ * @waiter: The page completion, as a waiter.
+ * @page_info: The page info to complete with.
+ *
+ * Implements waiter_callback_fn.
+ */
+static void complete_waiter_with_page(struct vdo_waiter *waiter, void *page_info)
+{
+	struct vdo_page_completion *page_completion = page_completion_from_waiter(waiter);
+	struct page_info *info = page_info;
+
+	complete_with_page(info, page_completion);
+}
+
+/**
+ * distribute_page_over_waitq() - Complete a waitq of VDO page completions with a page result.
+ * @info: The page info to hand to every waiter.
+ * @waitq: The wait queue of page completions.
+ *
+ * Upon completion the waitq will be empty.
+ *
+ * Return: The number of pages distributed.
+ */
+static unsigned int distribute_page_over_waitq(struct page_info *info,
+					       struct vdo_wait_queue *waitq)
+{
+	size_t waiter_count;
+
+	update_lru(info);
+
+	/*
+	 * Increment the busy count once for each pending completion so that this page does not
+	 * stop being busy until all completions have been processed.
+	 */
+	waiter_count = vdo_waitq_num_waiters(waitq);
+	info->busy += waiter_count;
+
+	vdo_waitq_notify_all_waiters(waitq, complete_waiter_with_page, info);
+	return waiter_count;
+}
+
+/**
+ * set_persistent_error() - Set a persistent error which all requests will receive in the future.
+ * @cache: The page cache.
+ * @context: A string describing what triggered the error.
+ * @result: The error code.
+ *
+ * Once triggered, all enqueued completions will get this error. Any future requests will result in
+ * this error as well.
+ */
+static void set_persistent_error(struct vdo_page_cache *cache, const char *context,
+				 int result)
+{
+	struct vdo *vdo = cache->vdo;
+	size_t i;
+
+	/* If we're already read-only, there's no need to log. */
+	if ((result != VDO_READ_ONLY) && !vdo_is_read_only(vdo)) {
+		vdo_log_error_strerror(result, "VDO Page Cache persistent error: %s",
+				       context);
+		vdo_enter_read_only_mode(vdo, result);
+	}
+
+	assert_on_cache_thread(cache, __func__);
+
+	/* Fail everything waiting for a free page... */
+	vdo_waitq_notify_all_waiters(&cache->free_waiters,
+				     complete_waiter_with_error, &result);
+	cache->waiter_count = 0;
+
+	/* ...and everything waiting on a specific page. */
+	for (i = 0; i < cache->page_count; i++) {
+		vdo_waitq_notify_all_waiters(&cache->infos[i].waiting,
+					     complete_waiter_with_error, &result);
+	}
+}
+
+/**
+ * validate_completed_page() - Check that a page completion which is being freed to the cache
+ *                             referred to a valid page and is in a valid state.
+ * @completion: The page completion to check.
+ * @writable: Whether a writable page is required.
+ *
+ * Return: VDO_SUCCESS if the page was valid, otherwise an error.
+ */
+static int __must_check validate_completed_page(struct vdo_page_completion *completion,
+						bool writable)
+{
+	int result = VDO_ASSERT(completion->ready, "VDO Page completion not ready");
+
+	if (result != VDO_SUCCESS)
+		return result;
+
+	/* The info may only be examined once it is known to be set. */
+	result = VDO_ASSERT(completion->info != NULL,
+			    "VDO Page Completion must be complete");
+	if (result != VDO_SUCCESS)
+		return result;
+
+	result = VDO_ASSERT(completion->info->pbn == completion->pbn,
+			    "VDO Page Completion pbn must be consistent");
+	if (result != VDO_SUCCESS)
+		return result;
+
+	result = VDO_ASSERT(is_valid(completion->info),
+			    "VDO Page Completion page must be valid");
+	if (result != VDO_SUCCESS)
+		return result;
+
+	if (!writable)
+		return VDO_SUCCESS;
+
+	return VDO_ASSERT(completion->writable,
+			  "VDO Page Completion must be writable");
+}
+
+/*
+ * Finish draining a zone if it is draining and has no remaining work: no active lookups, no
+ * flush waiters, an idle vio pool, and no outstanding page cache reads or writes. The drain
+ * finishes with VDO_READ_ONLY if the vdo is read-only, and with VDO_SUCCESS otherwise.
+ */
+static void check_for_drain_complete(struct block_map_zone *zone)
+{
+	int result;
+
+	if (!vdo_is_state_draining(&zone->state))
+		return;
+
+	if (zone->active_lookups != 0)
+		return;
+
+	if (vdo_waitq_has_waiters(&zone->flush_waiters))
+		return;
+
+	if (is_vio_pool_busy(zone->vio_pool))
+		return;
+
+	if (zone->page_cache.outstanding_reads != 0)
+		return;
+
+	if (zone->page_cache.outstanding_writes != 0)
+		return;
+
+	result = (vdo_is_read_only(zone->block_map->vdo) ? VDO_READ_ONLY : VDO_SUCCESS);
+	vdo_finish_draining_with_result(&zone->state, result);
+}
+
+/*
+ * Put the vdo into read-only mode on behalf of a block map zone, then discard the zone's flush
+ * waiters and check whether that lets a pending drain finish.
+ */
+static void enter_zone_read_only_mode(struct block_map_zone *zone, int result)
+{
+	struct vdo *vdo = zone->block_map->vdo;
+
+	vdo_enter_read_only_mode(vdo, result);
+
+	/*
+	 * We are in read-only mode, so we won't ever write any page out.
+	 * Just take all waiters off the waitq so the zone can drain.
+	 */
+	vdo_waitq_init(&zone->flush_waiters);
+	check_for_drain_complete(zone);
+}
+
+/**
+ * validate_completed_page_or_enter_read_only_mode() - Validate a page completion, entering
+ *                                                     read-only mode if it is not valid.
+ * @completion: The page completion to check.
+ * @writable: Whether a writable page is required.
+ *
+ * Return: true if the page completion is valid; false if it is not, in which case the zone has
+ *         been put into read-only mode.
+ */
+static bool __must_check
+validate_completed_page_or_enter_read_only_mode(struct vdo_page_completion *completion,
+						bool writable)
+{
+	int result = validate_completed_page(completion, writable);
+
+	if (result == VDO_SUCCESS)
+		return true;
+
+	/*
+	 * Validation fails when completion->info is NULL, and also before the info has been
+	 * examined at all (completion not ready), so the zone must not be reached through the
+	 * info here. Use the cache recorded in the completion itself instead.
+	 */
+	enter_zone_read_only_mode(completion->cache->zone, result);
+	return false;
+}
+
+/**
+ * handle_load_error() - Handle page load errors.
+ * @completion: The page read vio. Its parent is the page_info being loaded.
+ *
+ * Records the I/O error, enters read-only mode, fails everything waiting on the page, and
+ * returns the page info to the unallocated state.
+ */
+static void handle_load_error(struct vdo_completion *completion)
+{
+ /* Capture the result now; it is handed to each waiter by pointer below. */
+ int result = completion->result;
+ struct page_info *info = completion->parent;
+ struct vdo_page_cache *cache = info->cache;
+
+ assert_on_cache_thread(cache, __func__);
+ vio_record_metadata_io_error(as_vio(completion));
+ vdo_enter_read_only_mode(cache->zone->block_map->vdo, result);
+ ADD_ONCE(cache->stats.failed_reads, 1);
+ set_info_state(info, PS_FAILED);
+ /* The waiters must be gone before the info is reset, since reset asserts there are none. */
+ vdo_waitq_notify_all_waiters(&info->waiting, complete_waiter_with_error, &result);
+ /* NOTE(review): the result of reset_page_info() is ignored here. */
+ reset_page_info(info);
+
+ /*
+ * Don't decrement until right before calling check_for_drain_complete() to
+ * ensure that the above work can't cause the page cache to be freed out from under us.
+ */
+ cache->outstanding_reads--;
+ check_for_drain_complete(cache->zone);
+}
+
+/**
+ * page_is_loaded() - Callback used when a page has been loaded.
+ * @completion: The vio which has loaded the page. Its parent is the page_info.
+ *
+ * A bad page is reported by continuing the completion with VDO_BAD_PAGE. An invalid page is
+ * formatted in place. Otherwise the page becomes resident and is distributed to its waiters.
+ */
+static void page_is_loaded(struct vdo_completion *completion)
+{
+	struct page_info *info = completion->parent;
+	struct vdo_page_cache *cache = info->cache;
+	nonce_t nonce = cache->zone->block_map->nonce;
+	struct block_map_page *page;
+
+	assert_on_cache_thread(cache, __func__);
+
+	page = (struct block_map_page *) get_page_buffer(info);
+	switch (vdo_validate_block_map_page(page, nonce, info->pbn)) {
+	case VDO_BLOCK_MAP_PAGE_BAD: {
+		physical_block_number_t pbn = vdo_get_block_map_page_pbn(page);
+		int result = vdo_log_error_strerror(VDO_BAD_PAGE,
+						    "Expected page %llu but got page %llu instead",
+						    (unsigned long long) info->pbn,
+						    (unsigned long long) pbn);
+
+		vdo_continue_completion(completion, result);
+		return;
+	}
+
+	case VDO_BLOCK_MAP_PAGE_INVALID:
+		vdo_format_block_map_page(page, nonce, info->pbn, false);
+		break;
+
+	default:
+		break;
+	}
+
+	info->recovery_lock = 0;
+	set_info_state(info, PS_RESIDENT);
+	distribute_page_over_waitq(info, &info->waiting);
+
+	/*
+	 * Don't decrement until right before calling check_for_drain_complete() to
+	 * ensure that the above work can't cause the page cache to be freed out from under us.
+	 */
+	cache->outstanding_reads--;
+	check_for_drain_complete(cache->zone);
+}
+
+/**
+ * handle_rebuild_read_error() - Error handler for page loads issued during a read-only rebuild.
+ * @completion: The page load completion; its parent is the page_info being loaded.
+ *
+ * The failure is recorded and counted, the page buffer is zeroed, and the load then proceeds
+ * through page_is_loaded() as though an uninitialized page had been read successfully.
+ */
+static void handle_rebuild_read_error(struct vdo_completion *completion)
+{
+	struct page_info *failed = completion->parent;
+	struct vdo_page_cache *page_cache = failed->cache;
+
+	assert_on_cache_thread(page_cache, __func__);
+
+	vio_record_metadata_io_error(as_vio(completion));
+	ADD_ONCE(page_cache->stats.failed_reads, 1);
+
+	/* During a read-only rebuild an unreadable page is treated as an uninitialized one. */
+	memset(get_page_buffer(failed), 0, VDO_BLOCK_SIZE);
+	vdo_reset_completion(completion);
+	page_is_loaded(completion);
+}
+
+/*
+ * Bio completion for a cache page read: continue the vio in page_is_loaded() on the thread of
+ * the zone which owns the page's cache.
+ */
+static void load_cache_page_endio(struct bio *bio)
+{
+	struct vio *load_vio = bio->bi_private;
+	struct page_info *loading = load_vio->completion.parent;
+
+	continue_vio_after_io(load_vio, page_is_loaded,
+			      loading->cache->zone->thread_id);
+}
+
+/**
+ * launch_page_load() - Begin the process of loading a page.
+ * @info: The page_info which will hold the loaded page; it must not be busy.
+ * @pbn: The physical block number of the page to read.
+ *
+ * On success the info is marked incoming, the cache's outstanding read count is incremented,
+ * and a metadata read is submitted. The error handler for that read depends on whether the
+ * cache is rebuilding.
+ *
+ * Return: VDO_SUCCESS or an error code.
+ */
+static int __must_check launch_page_load(struct page_info *info,
+					 physical_block_number_t pbn)
+{
+	struct vdo_page_cache *page_cache = info->cache;
+	vdo_action_fn on_error;
+	int status;
+
+	assert_io_allowed(page_cache);
+
+	status = set_info_pbn(info, pbn);
+	if (status != VDO_SUCCESS)
+		return status;
+
+	status = VDO_ASSERT((info->busy == 0), "Page is not busy before loading.");
+	if (status != VDO_SUCCESS)
+		return status;
+
+	set_info_state(info, PS_INCOMING);
+	page_cache->outstanding_reads++;
+	ADD_ONCE(page_cache->stats.pages_loaded, 1);
+
+	if (page_cache->rebuilding)
+		on_error = handle_rebuild_read_error;
+	else
+		on_error = handle_load_error;
+
+	vdo_submit_metadata_vio(info->vio, pbn, load_cache_page_endio,
+				on_error, REQ_OP_READ | REQ_PRIO);
+	return VDO_SUCCESS;
+}
+
+static void write_pages(struct vdo_completion *completion);
+
+/**
+ * handle_flush_error() - Handle errors flushing the layer.
+ * @completion: The flush vio's completion; its parent is a page_info.
+ *
+ * The error is recorded on the vio and set as a persistent error on the cache, after which
+ * write_pages() is still invoked with the same completion.
+ */
+static void handle_flush_error(struct vdo_completion *completion)
+{
+	struct page_info *flushing = completion->parent;
+
+	vio_record_metadata_io_error(as_vio(completion));
+	set_persistent_error(flushing->cache, "flush failed", completion->result);
+	write_pages(completion);
+}
+
+/*
+ * Bio completion for the pre-write flush: continue the vio in write_pages() on the thread of
+ * the zone which owns the cache.
+ */
+static void flush_endio(struct bio *bio)
+{
+	struct vio *flush_vio = bio->bi_private;
+	struct page_info *flushing = flush_vio->completion.parent;
+
+	continue_vio_after_io(flush_vio, write_pages,
+			      flushing->cache->zone->thread_id);
+}
+
+/**
+ * save_pages() - Attempt to save the outgoing pages by first flushing the layer.
+ * @cache: The page cache whose outgoing pages should be saved.
+ *
+ * Does nothing if a flush is already in progress or if no pages are waiting to be flushed.
+ * Otherwise every page currently waiting becomes part of the in-flight batch and a flush is
+ * issued using the vio of the first page on the outgoing list.
+ */
+static void save_pages(struct vdo_page_cache *cache)
+{
+	struct page_info *first_outgoing;
+
+	if (cache->pages_in_flush > 0)
+		return;
+
+	if (cache->pages_to_flush == 0)
+		return;
+
+	assert_io_allowed(cache);
+
+	first_outgoing = list_first_entry(&cache->outgoing_list, struct page_info,
+					  state_entry);
+
+	cache->pages_in_flush = cache->pages_to_flush;
+	cache->pages_to_flush = 0;
+	ADD_ONCE(cache->stats.flush_count, 1);
+
+	/*
+	 * The recovery journal entries which changed these pages must have been successfully
+	 * persisted before the pages are written, so each batch of page writes is preceded by
+	 * a flush.
+	 */
+	vdo_submit_flush_vio(first_outgoing->vio, flush_endio, handle_flush_error);
+}
+
+/**
+ * schedule_page_save() - Add a page to the outgoing list of pages waiting to be saved.
+ * @info: The page to save.
+ *
+ * A busy page is not scheduled; its write status is set to deferred instead. Once in the
+ * list, a page may not be used until it has been written out.
+ */
+static void schedule_page_save(struct page_info *info)
+{
+	struct vdo_page_cache *page_cache = info->cache;
+
+	if (info->busy > 0) {
+		info->write_status = WRITE_STATUS_DEFERRED;
+		return;
+	}
+
+	page_cache->pages_to_flush++;
+	page_cache->outstanding_writes++;
+	set_info_state(info, PS_OUTGOING);
+}
+
+/**
+ * launch_page_save() - Schedule a page to be saved and then start saving pages.
+ * @info: The page to save.
+ *
+ * save_pages() itself declines to start a new batch while another flush is in progress.
+ */
+static void launch_page_save(struct page_info *info)
+{
+	struct vdo_page_cache *page_cache = info->cache;
+
+	schedule_page_save(info);
+	save_pages(page_cache);
+}
+
+/**
+ * completion_needs_page() - Check whether a waiting vdo_page_completion wants a given page.
+ * @waiter: The waiter embedded in the page completion to examine.
+ * @context: A pointer to the pbn of the desired page.
+ *
+ * Implements waiter_match_fn.
+ *
+ * Return: true if the page completion is for the desired page number.
+ */
+static bool completion_needs_page(struct vdo_waiter *waiter, void *context)
+{
+	const physical_block_number_t *wanted = context;
+
+	return (*wanted == page_completion_from_waiter(waiter)->pbn);
+}
+
+/**
+ * allocate_free_page() - Give a free page to the oldest completion waiting for one.
+ * @info: The page_info to hand out.
+ *
+ * Every other waiter which wants the same page number as the oldest waiter is moved onto the
+ * page's wait queue as well, and a load of that page is launched. If nothing is waiting for a
+ * free page, any recorded cache pressure is cleared instead.
+ */
+static void allocate_free_page(struct page_info *info)
+{
+	struct vdo_page_cache *cache = info->cache;
+	struct vdo_waiter *first_waiter;
+	physical_block_number_t wanted_pbn;
+	int result;
+
+	assert_on_cache_thread(cache, __func__);
+
+	if (!vdo_waitq_has_waiters(&cache->free_waiters)) {
+		if (cache->stats.cache_pressure > 0) {
+			vdo_log_info("page cache pressure relieved");
+			WRITE_ONCE(cache->stats.cache_pressure, 0);
+		}
+
+		return;
+	}
+
+	result = reset_page_info(info);
+	if (result != VDO_SUCCESS) {
+		set_persistent_error(cache, "cannot reset page info", result);
+		return;
+	}
+
+	first_waiter = vdo_waitq_get_first_waiter(&cache->free_waiters);
+	wanted_pbn = page_completion_from_waiter(first_waiter)->pbn;
+
+	/* Move every waiter for this page number from the free queue to the page's own queue. */
+	vdo_waitq_dequeue_matching_waiters(&cache->free_waiters, completion_needs_page,
+					   &wanted_pbn, &info->waiting);
+	cache->waiter_count -= vdo_waitq_num_waiters(&info->waiting);
+
+	result = launch_page_load(info, wanted_pbn);
+	if (result == VDO_SUCCESS)
+		return;
+
+	vdo_waitq_notify_all_waiters(&info->waiting, complete_waiter_with_error,
+				     &result);
+}
+
+/**
+ * discard_a_page() - Begin the process of discarding a page.
+ * @cache: The page cache from which to discard.
+ *
+ * If select_lru_page() finds no page, cache pressure is reported and nothing else happens.
+ *
+ * If the selected page is not dirty, it is immediately allocated to the oldest completion
+ * waiting for a free page. A dirty page is counted as a discard in progress and a save of it
+ * is launched with its write status set to discard.
+ */
+static void discard_a_page(struct vdo_page_cache *cache)
+{
+	struct page_info *info = select_lru_page(cache);
+
+	if (info == NULL) {
+		report_cache_pressure(cache);
+	} else if (!is_dirty(info)) {
+		allocate_free_page(info);
+	} else {
+		VDO_ASSERT_LOG_ONLY(!is_in_flight(info),
+				    "page selected for discard is not in flight");
+
+		cache->discard_count++;
+		info->write_status = WRITE_STATUS_DISCARD;
+		launch_page_save(info);
+	}
+}
+
+/**
+ * discard_page_for_completion() - Queue a completion for a free page and trigger a discard.
+ * @vdo_page_comp: The page completion which needs a page.
+ *
+ * The completion is counted and enqueued on the cache's free-page wait queue before the
+ * discard is started.
+ */
+static void discard_page_for_completion(struct vdo_page_completion *vdo_page_comp)
+{
+	struct vdo_page_cache *page_cache = vdo_page_comp->cache;
+
+	page_cache->waiter_count++;
+	vdo_waitq_enqueue_waiter(&page_cache->free_waiters, &vdo_page_comp->waiter);
+	discard_a_page(page_cache);
+}
+
+/**
+ * discard_page_if_needed() - Trigger a discard if the cache needs another free page.
+ * @cache: The page cache.
+ *
+ * A discard is started only when more completions are waiting for free pages than there are
+ * discards already in progress.
+ */
+static void discard_page_if_needed(struct vdo_page_cache *cache)
+{
+	if (cache->waiter_count <= cache->discard_count)
+		return;
+
+	discard_a_page(cache);
+}
+
+/**
+ * write_has_finished() - Inform the cache that a write has finished (possibly with an error).
+ * @info: The info structure for the page whose write just completed.
+ *
+ * Drops the cache's outstanding write count and resets the page's write status to normal.
+ *
+ * Return: true if the page write was a discard.
+ */
+static bool write_has_finished(struct page_info *info)
+{
+	struct vdo_page_cache *page_cache = info->cache;
+	bool discarding;
+
+	assert_on_cache_thread(page_cache, __func__);
+
+	discarding = (info->write_status == WRITE_STATUS_DISCARD);
+	page_cache->outstanding_writes--;
+	info->write_status = WRITE_STATUS_NORMAL;
+
+	return discarding;
+}
+
+/**
+ * handle_page_write_error() - Handler for page write errors.
+ * @completion: The page write vio; its parent is the page_info being written.
+ *
+ * The page is returned to the dirty state, the failure is counted, and the error is made
+ * persistent on the cache. If the failed write was not a discard, another discard is started
+ * if one is needed.
+ */
+static void handle_page_write_error(struct vdo_completion *completion)
+{
+	struct page_info *info = completion->parent;
+	struct vdo_page_cache *cache = info->cache;
+	int result = completion->result;
+	bool was_discard;
+
+	vio_record_metadata_io_error(as_vio(completion));
+
+	/* Write failures are expected once read-only, so only log other errors. */
+	if (result != VDO_READ_ONLY) {
+		vdo_log_ratelimit(vdo_log_error,
+				  "failed to write block map page %llu",
+				  (unsigned long long) info->pbn);
+	}
+
+	set_info_state(info, PS_DIRTY);
+	ADD_ONCE(cache->stats.failed_writes, 1);
+	set_persistent_error(cache, "cannot write page", result);
+
+	was_discard = write_has_finished(info);
+	if (!was_discard)
+		discard_page_if_needed(cache);
+
+	check_for_drain_complete(cache->zone);
+}
+
+static void page_is_written_out(struct vdo_completion *completion);
+
+/*
+ * Bio completion for a cache page write: continue the vio in page_is_written_out() on the
+ * thread of the zone which owns the cache.
+ */
+static void write_cache_page_endio(struct bio *bio)
+{
+	struct vio *write_vio = bio->bi_private;
+	struct page_info *written = write_vio->completion.parent;
+
+	continue_vio_after_io(write_vio, page_is_written_out,
+			      written->cache->zone->thread_id);
+}
+
+/**
+ * page_is_written_out() - Callback used when a page has been written out.
+ * @completion: The vio which wrote the page. Its parent is a page_info.
+ *
+ * If the buffer was not yet marked initialized, it is marked so and written a second time
+ * with a preflush. Otherwise the page's recovery journal reference is released, the page
+ * becomes resident again and is distributed to its waiters. A discarded page which nobody
+ * has reclaimed is then handed to allocate_free_page().
+ */
+static void page_is_written_out(struct vdo_completion *completion)
+{
+	struct page_info *info = completion->parent;
+	struct vdo_page_cache *cache = info->cache;
+	struct block_map_page *page = (struct block_map_page *) get_page_buffer(info);
+	bool was_discard;
+	bool still_in_use;
+	u32 reclamations;
+
+	if (!page->header.initialized) {
+		page->header.initialized = true;
+		vdo_submit_metadata_vio(info->vio, info->pbn,
+					write_cache_page_endio,
+					handle_page_write_error,
+					REQ_OP_WRITE | REQ_PRIO | REQ_PREFLUSH);
+		return;
+	}
+
+	/* Handle journal updates and torn write protection. */
+	vdo_release_recovery_journal_block_reference(cache->zone->block_map->journal,
+						     info->recovery_lock,
+						     VDO_ZONE_TYPE_LOGICAL,
+						     cache->zone->zone_number);
+	info->recovery_lock = 0;
+
+	was_discard = write_has_finished(info);
+	still_in_use = (!was_discard || (info->busy > 0) ||
+			vdo_waitq_has_waiters(&info->waiting));
+
+	set_info_state(info, PS_RESIDENT);
+
+	reclamations = distribute_page_over_waitq(info, &info->waiting);
+	ADD_ONCE(cache->stats.reclaimed, reclamations);
+
+	if (was_discard)
+		cache->discard_count--;
+
+	if (!still_in_use)
+		allocate_free_page(info);
+	else
+		discard_page_if_needed(cache);
+
+	check_for_drain_complete(cache->zone);
+}
+
+/**
+ * write_pages() - Write the batch of pages covered by the layer flush which just completed.
+ * @flush_completion: The flush vio. Its parent is a page_info belonging to the cache.
+ *
+ * This callback is registered in save_pages(). If the vdo is read-only, each page's write is
+ * failed with VDO_READ_ONLY instead of being submitted.
+ */
+static void write_pages(struct vdo_completion *flush_completion)
+{
+	struct vdo_page_cache *cache = ((struct page_info *) flush_completion->parent)->cache;
+
+	/*
+	 * Both values are copied to the stack because the last page info may cause the page
+	 * cache to be freed; once the last page has been launched it may be unsafe to
+	 * dereference the cache.
+	 */
+	bool has_unflushed_pages = (cache->pages_to_flush > 0);
+	page_count_t pages_in_flush = cache->pages_in_flush;
+
+	cache->pages_in_flush = 0;
+	for (; pages_in_flush > 0; pages_in_flush--) {
+		struct page_info *info =
+			list_first_entry(&cache->outgoing_list, struct page_info,
+					 state_entry);
+
+		list_del_init(&info->state_entry);
+		if (vdo_is_read_only(info->cache->vdo)) {
+			struct vdo_completion *completion = &info->vio->completion;
+
+			vdo_reset_completion(completion);
+			completion->callback = page_is_written_out;
+			completion->error_handler = handle_page_write_error;
+			vdo_fail_completion(completion, VDO_READ_ONLY);
+		} else {
+			ADD_ONCE(info->cache->stats.pages_saved, 1);
+			vdo_submit_metadata_vio(info->vio, info->pbn,
+						write_cache_page_endio,
+						handle_page_write_error,
+						REQ_OP_WRITE | REQ_PRIO);
+		}
+	}
+
+	/*
+	 * If there are unflushed pages, the cache can't have been freed, so it is safe to
+	 * start the next batch.
+	 */
+	if (has_unflushed_pages)
+		save_pages(cache);
+}
+
+/**
+ * vdo_release_page_completion() - Release a VDO Page Completion.
+ * @completion: The completion embedded in the vdo_page_completion to release.
+ *
+ * The page referenced by this completion (if any) will no longer be held busy by this
+ * completion. If the page stops being busy, a write which was deferred on it is launched, and
+ * a new round of page discarding is started if completions are awaiting free pages.
+ */
+void vdo_release_page_completion(struct vdo_completion *completion)
+{
+	struct vdo_page_completion *page_completion = as_vdo_page_completion(completion);
+	struct page_info *discard_info = NULL;
+	struct vdo_page_cache *cache;
+
+	if (completion->result == VDO_SUCCESS) {
+		if (!validate_completed_page_or_enter_read_only_mode(page_completion, false))
+			return;
+
+		page_completion->info->busy--;
+		if (page_completion->info->busy == 0)
+			discard_info = page_completion->info;
+	}
+
+	VDO_ASSERT_LOG_ONLY((page_completion->waiter.next_waiter == NULL),
+			    "Page being released after leaving all queues");
+
+	page_completion->info = NULL;
+	cache = page_completion->cache;
+	assert_on_cache_thread(cache, __func__);
+
+	if (discard_info == NULL)
+		return;
+
+	if (discard_info->write_status == WRITE_STATUS_DEFERRED) {
+		discard_info->write_status = WRITE_STATUS_NORMAL;
+		launch_page_save(discard_info);
+	}
+
+	/*
+	 * If there are excess requests for pages (that have not already started discards),
+	 * some page must be discarded (which may be this one).
+	 */
+	discard_page_if_needed(cache);
+}
+
+/**
+ * load_page_for_completion() - Load the page described by a VDO Page Completion.
+ * @info: The page_info into which the page will be loaded.
+ * @vdo_page_comp: The page completion requesting the page.
+ *
+ * The completion is queued on the page's wait queue before the load is launched. If the
+ * launch fails, every waiter on the page is completed with the error.
+ */
+static void load_page_for_completion(struct page_info *info,
+				     struct vdo_page_completion *vdo_page_comp)
+{
+	int status;
+
+	vdo_waitq_enqueue_waiter(&info->waiting, &vdo_page_comp->waiter);
+
+	status = launch_page_load(info, vdo_page_comp->pbn);
+	if (status == VDO_SUCCESS)
+		return;
+
+	vdo_waitq_notify_all_waiters(&info->waiting, complete_waiter_with_error,
+				     &status);
+}
+
+/**
+ * vdo_get_page() - Initialize a page completion and get a block map page.
+ * @page_completion: The vdo_page_completion to initialize.
+ * @zone: The block map zone of the desired page.
+ * @pbn: The absolute physical block of the desired page.
+ * @writable: Whether the page can be modified.
+ * @parent: The object to notify when the fetch is complete.
+ * @callback: The notification callback.
+ * @error_handler: The handler for fetch errors.
+ * @requeue: Whether we must requeue when notifying the parent.
+ *
+ * May cause another page to be discarded (potentially writing a dirty page) and the one
+ * nominated by the completion to be loaded from disk. When the callback is invoked, the page
+ * will be resident in the cache and marked busy. All callers must call
+ * vdo_release_page_completion() when they are done with the page to clear the busy mark.
+ *
+ * A request for a writable page while the vdo is read-only fails with VDO_READ_ONLY.
+ */
+void vdo_get_page(struct vdo_page_completion *page_completion,
+		  struct block_map_zone *zone, physical_block_number_t pbn,
+		  bool writable, void *parent, vdo_action_fn callback,
+		  vdo_action_fn error_handler, bool requeue)
+{
+	struct vdo_page_cache *cache = &zone->page_cache;
+	struct vdo_completion *completion = &page_completion->completion;
+	struct page_info *info;
+
+	assert_on_cache_thread(cache, __func__);
+	VDO_ASSERT_LOG_ONLY((page_completion->waiter.next_waiter == NULL),
+			    "New page completion was not already on a wait queue");
+
+	*page_completion = (struct vdo_page_completion) {
+		.cache = cache,
+		.pbn = pbn,
+		.writable = writable,
+	};
+
+	vdo_initialize_completion(completion, cache->vdo, VDO_PAGE_COMPLETION);
+	vdo_prepare_completion(completion, callback, error_handler,
+			       cache->zone->thread_id, parent);
+	completion->requeue = requeue;
+
+	if (page_completion->writable) {
+		if (vdo_is_read_only(cache->vdo)) {
+			vdo_fail_completion(completion, VDO_READ_ONLY);
+			return;
+		}
+
+		ADD_ONCE(cache->stats.write_count, 1);
+	} else {
+		ADD_ONCE(cache->stats.read_count, 1);
+	}
+
+	info = find_page(cache, page_completion->pbn);
+	if (info != NULL) {
+		/* The page is in the cache already. */
+		bool io_pending = ((info->write_status == WRITE_STATUS_DEFERRED) ||
+				   is_incoming(info) ||
+				   (is_outgoing(info) && page_completion->writable));
+
+		if (io_pending) {
+			/* The page is unusable until it has finished I/O. */
+			ADD_ONCE(cache->stats.wait_for_page, 1);
+			vdo_waitq_enqueue_waiter(&info->waiting,
+						 &page_completion->waiter);
+			return;
+		}
+
+		if (is_valid(info)) {
+			/* The page is usable. */
+			ADD_ONCE(cache->stats.found_in_cache, 1);
+			if (!is_present(info))
+				ADD_ONCE(cache->stats.read_outgoing, 1);
+			update_lru(info);
+			info->busy++;
+			complete_with_page(info, page_completion);
+			return;
+		}
+
+		/* Something horrible has gone wrong. */
+		VDO_ASSERT_LOG_ONLY(false, "Info found in a usable state.");
+	}
+
+	/* The page must be fetched. */
+	info = find_free_page(cache);
+	if (info == NULL) {
+		/* The page must wait for a page to be discarded. */
+		ADD_ONCE(cache->stats.discard_required, 1);
+		discard_page_for_completion(page_completion);
+		return;
+	}
+
+	ADD_ONCE(cache->stats.fetch_required, 1);
+	load_page_for_completion(info, page_completion);
+}
+
+/**
+ * vdo_request_page_write() - Request that a VDO page be written out as soon as it is not busy.
+ * @completion: The vdo_page_completion containing the page.
+ *
+ * Nothing is done if the completed page fails validation as a writable page. Otherwise the
+ * page is marked dirty and a save is launched.
+ */
+void vdo_request_page_write(struct vdo_completion *completion)
+{
+	struct vdo_page_completion *page_completion = as_vdo_page_completion(completion);
+	struct page_info *dirtied;
+
+	if (!validate_completed_page_or_enter_read_only_mode(page_completion, true))
+		return;
+
+	dirtied = page_completion->info;
+	set_info_state(dirtied, PS_DIRTY);
+	launch_page_save(dirtied);
+}
+
+/**
+ * vdo_get_cached_page() - Get the block map page from a page completion.
+ * @completion: A vdo page completion whose callback has been called.
+ * @page_ptr: A pointer to hold the page; only written on success.
+ *
+ * Return: VDO_SUCCESS or an error
+ */
+int vdo_get_cached_page(struct vdo_completion *completion,
+			struct block_map_page **page_ptr)
+{
+	struct vdo_page_completion *page_completion = as_vdo_page_completion(completion);
+	int status = validate_completed_page(page_completion, true);
+
+	if (status != VDO_SUCCESS)
+		return status;
+
+	*page_ptr = (struct block_map_page *) get_page_buffer(page_completion->info);
+	return status;
+}
+
+/**
+ * vdo_invalidate_page_cache() - Invalidate all entries in the VDO page cache.
+ * @cache: The page cache to invalidate.
+ *
+ * There must not be any dirty pages in the cache; if one is found, the assertion result is
+ * returned and the cache is left untouched.
+ *
+ * Return: A success or error code.
+ */
+int vdo_invalidate_page_cache(struct vdo_page_cache *cache)
+{
+	struct page_info *info;
+	struct page_info *end = cache->infos + cache->page_count;
+
+	assert_on_cache_thread(cache, __func__);
+
+	/* Make sure we don't throw away any dirty pages. */
+	for (info = cache->infos; info < end; info++) {
+		int result = VDO_ASSERT(!is_dirty(info), "cache must have no dirty pages");
+
+		if (result != VDO_SUCCESS)
+			return result;
+	}
+
+	/* The page map is reset by freeing it and creating a fresh one. */
+	vdo_int_map_free(vdo_forget(cache->page_map));
+	return vdo_int_map_create(cache->page_count, &cache->page_map);
+}
+
+/**
+ * get_tree_page_by_index() - Get the tree page for a given height and page index.
+ * @forest: The forest containing the trees.
+ * @root_index: The index of the tree to search.
+ * @height: The height of the desired page; level (@height - 1) of each segment is consulted.
+ * @page_index: The index of the desired page across all segments at that level.
+ *
+ * Return: The requested page, or NULL if @page_index is beyond the last segment boundary.
+ */
+static struct tree_page * __must_check get_tree_page_by_index(struct forest *forest,
+							      root_count_t root_index,
+							      height_t height,
+							      page_number_t page_index)
+{
+	page_number_t segment_start = 0;
+	size_t seg;
+
+	for (seg = 0; seg < forest->segments; seg++) {
+		page_number_t border = forest->boundaries[seg].levels[height - 1];
+
+		if (page_index >= border) {
+			/* Not in this segment; the next one starts at this boundary. */
+			segment_start = border;
+			continue;
+		}
+
+		return &(forest->trees[root_index]
+				.segments[seg]
+				.levels[height - 1][page_index - segment_start]);
+	}
+
+	return NULL;
+}
+
+/*
+ * Look up the tree page named by the lock: the page index comes from the lock's tree slot at
+ * its current height, within the tree selected by the lock's root index.
+ */
+static inline struct tree_page *get_tree_page(const struct block_map_zone *zone,
+					      const struct tree_lock *lock)
+{
+	struct forest *forest = zone->block_map->forest;
+
+	return get_tree_page_by_index(forest, lock->root_index, lock->height,
+				      lock->tree_slots[lock->height].page_index);
+}
+
+/**
+ * vdo_copy_valid_page() - Validate and copy a buffer to a page.
+ * @buffer: The raw buffer holding the candidate block map page.
+ * @nonce: The nonce to validate the page against.
+ * @pbn: The physical block number the page is expected to have.
+ * @page: The destination; VDO_BLOCK_SIZE bytes are copied into it when the buffer is valid.
+ *
+ * A bad page (as opposed to a merely invalid one) is also logged.
+ *
+ * Return: true if the buffer was valid and has been copied.
+ */
+bool vdo_copy_valid_page(char *buffer, nonce_t nonce,
+			 physical_block_number_t pbn,
+			 struct block_map_page *page)
+{
+	struct block_map_page *loaded = (struct block_map_page *) buffer;
+
+	switch (vdo_validate_block_map_page(loaded, nonce, pbn)) {
+	case VDO_BLOCK_MAP_PAGE_VALID:
+		memcpy(page, loaded, VDO_BLOCK_SIZE);
+		return true;
+
+	case VDO_BLOCK_MAP_PAGE_BAD:
+		vdo_log_error_strerror(VDO_BAD_PAGE,
+				       "Expected page %llu but got page %llu instead",
+				       (unsigned long long) pbn,
+				       (unsigned long long) vdo_get_block_map_page_pbn(loaded));
+		return false;
+
+	default:
+		return false;
+	}
+}
+
+/**
+ * in_cyclic_range() - Check whether a value lies between two bounds in a cyclic space.
+ * @lower: The lowest value to accept.
+ * @value: The value to check.
+ * @upper: The highest value to accept.
+ * @modulus: The size of the cyclic space, no more than 2^15.
+ *
+ * The space runs from 0 to (modulus - 1). The value and both bounds must be smaller than the
+ * modulus. Anything below @lower is treated as having wrapped, so the modulus is added to it
+ * before comparing.
+ *
+ * Return: true if the value is in range.
+ */
+static bool in_cyclic_range(u16 lower, u16 value, u16 upper, u16 modulus)
+{
+	u16 unwrapped_value = value;
+	u16 unwrapped_upper = upper;
+
+	if (lower > value)
+		unwrapped_value += modulus;
+
+	if (lower > upper)
+		unwrapped_upper += modulus;
+
+	return (unwrapped_upper >= unwrapped_value);
+}
+
+/**
+ * is_not_older() - Check whether a generation is not strictly older than another generation
+ *                  in the context of a zone's current generation range.
+ * @zone: The zone in which to do the comparison.
+ * @a: The generation in question.
+ * @b: The generation to compare to.
+ *
+ * Both generations are expected to lie within the zone's range from oldest_generation to
+ * generation. If either does not, the zone enters read-only mode and true is returned.
+ *
+ * Return: true if generation @a is not strictly older than generation @b in the context of
+ *         @zone
+ */
+static bool __must_check is_not_older(struct block_map_zone *zone, u8 a, u8 b)
+{
+	int result = VDO_ASSERT((in_cyclic_range(zone->oldest_generation, a, zone->generation, 1 << 8) &&
+				 in_cyclic_range(zone->oldest_generation, b, zone->generation, 1 << 8)),
+				"generation(s) %u, %u are out of range [%u, %u]",
+				a, b, zone->oldest_generation, zone->generation);
+
+	if (result == VDO_SUCCESS)
+		return in_cyclic_range(b, a, zone->generation, 1 << 8);
+
+	enter_zone_read_only_mode(zone, result);
+	return true;
+}
+
+/*
+ * Drop one dirty page from the count for a generation, then advance the zone's oldest
+ * generation past every generation whose count is zero, stopping at the current generation.
+ * An underflow of the count puts the zone into read-only mode instead.
+ */
+static void release_generation(struct block_map_zone *zone, u8 generation)
+{
+	int result = VDO_ASSERT((zone->dirty_page_counts[generation] > 0),
+				"dirty page count underflow for generation %u", generation);
+
+	if (result != VDO_SUCCESS) {
+		enter_zone_read_only_mode(zone, result);
+		return;
+	}
+
+	zone->dirty_page_counts[generation]--;
+
+	while (zone->oldest_generation != zone->generation) {
+		if (zone->dirty_page_counts[zone->oldest_generation] != 0)
+			break;
+
+		zone->oldest_generation++;
+	}
+}
+
+/*
+ * Move a tree page to a new generation and count it there. If the page's waiter was already
+ * waiting, the page's previous generation is released afterwards; in that case nothing is
+ * done when the generation is unchanged. An overflow of the new generation's count puts the
+ * zone into read-only mode.
+ */
+static void set_generation(struct block_map_zone *zone, struct tree_page *page,
+			   u8 new_generation)
+{
+	u8 previous_generation = page->generation;
+	bool is_waiting = vdo_waiter_is_waiting(&page->waiter);
+	u32 new_count;
+	int result;
+
+	if (is_waiting && (previous_generation == new_generation))
+		return;
+
+	page->generation = new_generation;
+	zone->dirty_page_counts[new_generation]++;
+	new_count = zone->dirty_page_counts[new_generation];
+	result = VDO_ASSERT((new_count != 0), "dirty page count overflow for generation %u",
+			    new_generation);
+	if (result != VDO_SUCCESS) {
+		enter_zone_read_only_mode(zone, result);
+		return;
+	}
+
+	if (is_waiting)
+		release_generation(zone, previous_generation);
+}
+
+static void write_page(struct tree_page *tree_page, struct pooled_vio *vio);
+
+/*
+ * Implements waiter_callback_fn. The waiter is the one embedded in a tree_page, and the
+ * context is passed to write_page() as the pooled vio to write with.
+ */
+static void write_page_callback(struct vdo_waiter *waiter, void *context)
+{
+	struct tree_page *tree_page = container_of(waiter, struct tree_page, waiter);
+
+	write_page(tree_page, context);
+}
+
+/*
+ * Request a vio from the zone's pool on behalf of a waiter. The waiter's callback is set to
+ * write_page_callback() before the request is made.
+ */
+static void acquire_vio(struct vdo_waiter *waiter, struct block_map_zone *zone)
+{
+	waiter->callback = write_page_callback;
+
+	acquire_vio_from_pool(zone->vio_pool, waiter);
+}
+
+/*
+ * Try to advance the zone's current generation by one. The advance is refused if the next
+ * generation is the zone's oldest generation.
+ *
+ * Return: true if all possible generations were not already active
+ */
+static bool attempt_increment(struct block_map_zone *zone)
+{
+	u8 next_generation = zone->generation + 1;
+
+	if (next_generation == zone->oldest_generation)
+		return false;
+
+	zone->generation = next_generation;
+	return true;
+}
+
+/*
+ * Launches a flush if one is not already in progress: when the zone has no flusher and the
+ * generation can be advanced, the page becomes the flusher and acquires a vio. In every other
+ * case the page is queued on the zone's flush waiters.
+ */
+static void enqueue_page(struct tree_page *page, struct block_map_zone *zone)
+{
+	if ((zone->flusher != NULL) || !attempt_increment(zone)) {
+		vdo_waitq_enqueue_waiter(&zone->flush_waiters, &page->waiter);
+		return;
+	}
+
+	zone->flusher = page;
+	acquire_vio(&page->waiter, zone);
+}
+
+/*
+ * Waiter callback taking a write_if_not_dirtied_context. A page whose generation still equals
+ * the context's generation acquires a vio; any other page is enqueued again.
+ */
+static void write_page_if_not_dirtied(struct vdo_waiter *waiter, void *context)
+{
+	struct write_if_not_dirtied_context *write_context = context;
+	struct tree_page *tree_page = container_of(waiter, struct tree_page, waiter);
+
+	if (tree_page->generation != write_context->generation) {
+		enqueue_page(tree_page, write_context->zone);
+		return;
+	}
+
+	acquire_vio(waiter, write_context->zone);
+}
+
+/* Give a pooled vio back to the zone's vio pool, then check whether the zone has drained. */
+static void return_to_pool(struct block_map_zone *zone, struct pooled_vio *vio)
+{
+	return_vio_to_pool(zone->vio_pool, vio);
+
+	check_for_drain_complete(zone);
+}
+
+/*
+ * Completion callback for a tree page write. write_page_endio() selects this once the copy
+ * which was written is marked initialized.
+ *
+ * The page's recovery journal reference and writing generation are released. If the page was
+ * the zone's flusher, the flush waiters are notified and the page is written again at once if
+ * it was re-dirtied and the generation can advance. Otherwise a re-dirtied page is enqueued,
+ * or a waiting page is made the new flusher; failing all of that, the vio goes back to the
+ * pool.
+ */
+static void finish_page_write(struct vdo_completion *completion)
+{
+	struct vio *vio = as_vio(completion);
+	struct pooled_vio *pooled = container_of(vio, struct pooled_vio, vio);
+	struct tree_page *page = completion->parent;
+	struct block_map_zone *zone = pooled->context;
+	bool redirtied;
+
+	vdo_release_recovery_journal_block_reference(zone->block_map->journal,
+						     page->writing_recovery_lock,
+						     VDO_ZONE_TYPE_LOGICAL,
+						     zone->zone_number);
+
+	redirtied = (page->writing_generation != page->generation);
+	release_generation(zone, page->writing_generation);
+	page->writing = false;
+
+	if (zone->flusher == page) {
+		struct write_if_not_dirtied_context context = {
+			.zone = zone,
+			.generation = page->writing_generation,
+		};
+
+		vdo_waitq_notify_all_waiters(&zone->flush_waiters,
+					     write_page_if_not_dirtied, &context);
+		if (redirtied && attempt_increment(zone)) {
+			write_page(page, pooled);
+			return;
+		}
+
+		zone->flusher = NULL;
+	}
+
+	if (redirtied) {
+		enqueue_page(page, zone);
+		return_to_pool(zone, pooled);
+		return;
+	}
+
+	if ((zone->flusher == NULL) && vdo_waitq_has_waiters(&zone->flush_waiters) &&
+	    attempt_increment(zone)) {
+		zone->flusher = container_of(vdo_waitq_dequeue_waiter(&zone->flush_waiters),
+					     struct tree_page, waiter);
+		write_page(zone->flusher, pooled);
+		return;
+	}
+
+	return_to_pool(zone, pooled);
+}
+
+/*
+ * Error handler for tree page writes: record the I/O error, put the zone into read-only mode
+ * with the completion's result, and return the vio to the pool.
+ */
+static void handle_write_error(struct vdo_completion *completion)
+{
+	struct vio *failed_vio = as_vio(completion);
+	struct pooled_vio *pooled = container_of(failed_vio, struct pooled_vio, vio);
+	struct block_map_zone *zone = pooled->context;
+	int status = completion->result;
+
+	vio_record_metadata_io_error(failed_vio);
+	enter_zone_read_only_mode(zone, status);
+	return_to_pool(zone, pooled);
+}
+
+static void write_page_endio(struct bio *bio);
+
+/*
+ * Write the copy of a tree page held in the vio's data buffer, marked as initialized. Called
+ * directly by write_page() for a page which is already initialized, and chosen by
+ * write_page_endio() as the continuation after the first write of one which was not. A
+ * preflush is added when the page is the zone's flusher.
+ */
+static void write_initialized_page(struct vdo_completion *completion)
+{
+	struct vio *vio = as_vio(completion);
+	struct pooled_vio *pooled = container_of(vio, struct pooled_vio, vio);
+	struct block_map_zone *zone = pooled->context;
+	struct tree_page *tree_page = completion->parent;
+	struct block_map_page *copy = (struct block_map_page *) vio->data;
+	blk_opf_t flags = REQ_OP_WRITE | REQ_PRIO;
+
+	if (zone->flusher == tree_page)
+		flags |= REQ_PREFLUSH;
+
+	/*
+	 * Now that we know the page has been written at least once, mark the copy we are
+	 * writing as initialized.
+	 */
+	copy->header.initialized = true;
+
+	vdo_submit_metadata_vio(vio, vdo_get_block_map_page_pbn(copy),
+				write_page_endio, handle_write_error, flags);
+}
+
+/*
+ * Bio completion for a tree page write. If the copy which was written is marked initialized
+ * the write is finished; otherwise the page is written again via write_initialized_page().
+ * Either way the continuation runs on the zone's thread.
+ */
+static void write_page_endio(struct bio *bio)
+{
+	struct pooled_vio *pooled = bio->bi_private;
+	struct block_map_zone *zone = pooled->context;
+	struct block_map_page *written = (struct block_map_page *) pooled->vio.data;
+
+	if (written->header.initialized)
+		continue_vio_after_io(&pooled->vio, finish_page_write, zone->thread_id);
+	else
+		continue_vio_after_io(&pooled->vio, write_initialized_page, zone->thread_id);
+}
+
+/*
+ * Write a tree page using a pooled vio. If the page is not the zone's flusher and its
+ * generation is not older than the zone's current generation, the page is enqueued again and
+ * the vio is returned to the pool. Otherwise the page is copied into the vio's buffer, its
+ * writing state is recorded, and the write is issued.
+ */
+static void write_page(struct tree_page *tree_page, struct pooled_vio *vio)
+{
+	struct block_map_zone *zone = vio->context;
+	struct vdo_completion *write_completion = &vio->vio.completion;
+	struct block_map_page *page = vdo_as_block_map_page(tree_page);
+
+	if ((zone->flusher != tree_page) &&
+	    is_not_older(zone, tree_page->generation, zone->generation)) {
+		/*
+		 * This page was re-dirtied after the last flush was issued, hence we need to
+		 * do another flush.
+		 */
+		enqueue_page(tree_page, zone);
+		return_to_pool(zone, vio);
+		return;
+	}
+
+	write_completion->parent = tree_page;
+	memcpy(vio->vio.data, tree_page->page_buffer, VDO_BLOCK_SIZE);
+	write_completion->callback_thread_id = zone->thread_id;
+
+	tree_page->writing = true;
+	tree_page->writing_generation = tree_page->generation;
+	tree_page->writing_recovery_lock = tree_page->recovery_lock;
+
+	/* Clear this now so that we know this page is not on any dirty list. */
+	tree_page->recovery_lock = 0;
+
+	/*
+	 * The page has already been copied into the vio which will write it, so if it was not
+	 * yet initialized, the first write will indicate that (for torn write protection). It
+	 * is now safe to mark it as initialized in memory since if the write fails, the in
+	 * memory state will become irrelevant.
+	 */
+	if (!page->header.initialized) {
+		page->header.initialized = true;
+		vdo_submit_metadata_vio(&vio->vio, vdo_get_block_map_page_pbn(page),
+					write_page_endio, handle_write_error,
+					REQ_OP_WRITE | REQ_PRIO);
+		return;
+	}
+
+	write_initialized_page(write_completion);
+}
+
+/*
+ * Release a lock on a page which was being loaded or allocated. The lock's key is removed
+ * from the zone's loading_pages map and the lock is marked unlocked. "what" only labels the
+ * operation in assertion messages.
+ */
+static void release_page_lock(struct data_vio *data_vio, char *what)
+{
+	struct tree_lock *lock = &data_vio->tree_lock;
+	struct block_map_zone *zone;
+	struct tree_lock *lock_holder;
+
+	VDO_ASSERT_LOG_ONLY(lock->locked,
+			    "release of unlocked block map page %s for key %llu in tree %u",
+			    what, (unsigned long long) lock->key, lock->root_index);
+
+	zone = data_vio->logical.zone->block_map_zone;
+	lock_holder = vdo_int_map_remove(zone->loading_pages, lock->key);
+
+	/* The entry removed from the map should be this data_vio's own lock. */
+	VDO_ASSERT_LOG_ONLY((lock_holder == lock),
+			    "block map page %s mismatch for key %llu in tree %u",
+			    what, (unsigned long long) lock->key, lock->root_index);
+	lock->locked = false;
+}
+
+/*
+ * Finish a block map tree lookup with the given result: reset the lock's height, drop the
+ * zone's active lookup count, and continue the data_vio with
+ * continue_data_vio_with_block_map_slot() as its callback.
+ */
+static void finish_lookup(struct data_vio *data_vio, int result)
+{
+	struct block_map_zone *zone = data_vio->logical.zone->block_map_zone;
+
+	data_vio->tree_lock.height = 0;
+	zone->active_lookups--;
+
+	set_data_vio_logical_callback(data_vio, continue_data_vio_with_block_map_slot);
+	data_vio->vio.completion.error_handler = handle_data_vio_error;
+	continue_data_vio_with_error(data_vio, result);
+}
+
+/*
+ * Waiter callback which finishes a waiting data_vio's lookup after an abort. The context
+ * points to the int result of the abort. For a non-write, VDO_NO_SPACE becomes VDO_SUCCESS
+ * and other results pass through. For a write, everything except VDO_NO_SPACE becomes
+ * VDO_READ_ONLY.
+ */
+static void abort_lookup_for_waiter(struct vdo_waiter *waiter, void *context)
+{
+	struct data_vio *data_vio = vdo_waiter_as_data_vio(waiter);
+	int status = *((int *) context);
+
+	if (data_vio->write) {
+		if (status != VDO_NO_SPACE)
+			status = VDO_READ_ONLY;
+	} else if (status == VDO_NO_SPACE) {
+		status = VDO_SUCCESS;
+	}
+
+	finish_lookup(data_vio, status);
+}
+
+/*
+ * Abort a tree lookup. Any result other than VDO_NO_SPACE puts the zone into read-only mode.
+ * If the data_vio holds a page lock, it is released and every waiter on the lock has its own
+ * lookup aborted too. "what" labels the operation in assertion messages from
+ * release_page_lock().
+ */
+static void abort_lookup(struct data_vio *data_vio, int result, char *what)
+{
+	struct tree_lock *tree_lock = &data_vio->tree_lock;
+
+	if (result != VDO_NO_SPACE)
+		enter_zone_read_only_mode(data_vio->logical.zone->block_map_zone, result);
+
+	if (tree_lock->locked) {
+		release_page_lock(data_vio, what);
+		vdo_waitq_notify_all_waiters(&tree_lock->waiters,
+					     abort_lookup_for_waiter, &result);
+	}
+
+	finish_lookup(data_vio, result);
+}
+
+/* Abort a lookup which failed while loading a page; a thin wrapper around abort_lookup(). */
+static void abort_load(struct data_vio *data_vio, int result)
+{
+	char *what = "load";
+
+	abort_lookup(data_vio, result, what);
+}
+
+/*
+ * Check whether a mapping read from a tree page is unacceptable as a tree entry. An entry is
+ * invalid if its location is not valid, if its state is compressed, or if it is mapped to the
+ * zero block. Below the root height, its pbn must also be a physical data block in the depot.
+ *
+ * Return: true if the entry is invalid.
+ */
+static bool __must_check is_invalid_tree_entry(const struct vdo *vdo,
+					       const struct data_location *mapping,
+					       height_t height)
+{
+	if (!vdo_is_valid_location(mapping))
+		return true;
+
+	if (vdo_is_state_compressed(mapping->state))
+		return true;
+
+	if (vdo_is_mapped_location(mapping) && (mapping->pbn == VDO_ZERO_BLOCK))
+		return true;
+
+	/* Roots aren't physical data blocks, so we can't check their PBNs. */
+	if (height == VDO_BLOCK_MAP_TREE_HEIGHT)
+		return false;
+
+	return !vdo_is_physical_data_block(vdo->depot, mapping->pbn);
+}
+
+ /* Forward declarations: continue_with_loaded_page() calls both before they are defined. */
+ static void load_block_map_page(struct block_map_zone *zone, struct data_vio *data_vio);
+ static void allocate_block_map_page(struct block_map_zone *zone,
+ struct data_vio *data_vio);
+
+ /*
+  * Continue a lookup now that the tree page at the lock's current height is in memory. The entry
+  * for the next level down is unpacked from @page; depending on that entry the lookup is aborted,
+  * an allocation is started, the lookup finishes (height 1), or the next page load is started.
+  */
+ static void continue_with_loaded_page(struct data_vio *data_vio,
+ 				      struct block_map_page *page)
+ {
+ 	struct tree_lock *lock = &data_vio->tree_lock;
+ 	struct block_map_tree_slot current_slot = lock->tree_slots[lock->height];
+ 	struct data_location child =
+ 		vdo_unpack_block_map_entry(&page->entries[current_slot.block_map_slot.slot]);
+
+ 	if (is_invalid_tree_entry(vdo_from_data_vio(data_vio), &child, lock->height)) {
+ 		vdo_log_error_strerror(VDO_BAD_MAPPING,
+ 				       "Invalid block map tree PBN: %llu with state %u for page index %u at height %u",
+ 				       (unsigned long long) child.pbn, child.state,
+ 				       lock->tree_slots[lock->height - 1].page_index,
+ 				       lock->height - 1);
+ 		abort_load(data_vio, VDO_BAD_MAPPING);
+ 		return;
+ 	}
+
+ 	if (!vdo_is_mapped_location(&child)) {
+ 		/* The page we need is unallocated */
+ 		allocate_block_map_page(data_vio->logical.zone->block_map_zone, data_vio);
+ 		return;
+ 	}
+
+ 	/* Record where the child page lives. */
+ 	lock->tree_slots[lock->height - 1].block_map_slot.pbn = child.pbn;
+ 	if (lock->height == 1) {
+ 		finish_lookup(data_vio, VDO_SUCCESS);
+ 		return;
+ 	}
+
+ 	/* We know what page we need to load next */
+ 	load_block_map_page(data_vio->logical.zone->block_map_zone, data_vio);
+ }
+
+ /*
+  * Waiter callback run when the page a data_vio was waiting on has been loaded. @context is the
+  * loaded block_map_page. The waiter's height is decremented before it continues.
+  */
+ static void continue_load_for_waiter(struct vdo_waiter *waiter, void *context)
+ {
+ 	struct data_vio *waiting_vio = vdo_waiter_as_data_vio(waiter);
+ 	struct block_map_page *loaded_page = context;
+
+ 	waiting_vio->tree_lock.height--;
+ 	continue_with_loaded_page(waiting_vio, loaded_page);
+ }
+
+ /*
+  * Completion callback for a tree page read. Copies the page read from disk into the in-memory
+  * tree page (formatting a fresh page if the copy is rejected), returns the pooled vio, releases
+  * the page lock, and continues both the waiters and this data_vio.
+  */
+ static void finish_block_map_page_load(struct vdo_completion *completion)
+ {
+ 	struct vio *vio = as_vio(completion);
+ 	struct pooled_vio *pooled = vio_as_pooled_vio(vio);
+ 	struct data_vio *data_vio = completion->parent;
+ 	struct block_map_zone *zone = pooled->context;
+ 	struct tree_lock *lock = &data_vio->tree_lock;
+ 	nonce_t nonce = zone->block_map->nonce;
+ 	physical_block_number_t pbn;
+ 	struct block_map_page *page;
+
+ 	/* Step down to the level of the page which was just read. */
+ 	--lock->height;
+ 	pbn = lock->tree_slots[lock->height].block_map_slot.pbn;
+ 	page = (struct block_map_page *) get_tree_page(zone, lock)->page_buffer;
+
+ 	if (!vdo_copy_valid_page(vio->data, nonce, pbn, page))
+ 		vdo_format_block_map_page(page, nonce, pbn, false);
+
+ 	return_vio_to_pool(zone->vio_pool, pooled);
+
+ 	/* Release our claim to the load and wake any waiters */
+ 	release_page_lock(data_vio, "load");
+ 	vdo_waitq_notify_all_waiters(&lock->waiters, continue_load_for_waiter, page);
+ 	continue_with_loaded_page(data_vio, page);
+ }
+
+ /*
+  * Error handler for a failed tree page read: record the metadata I/O error, give the pooled vio
+  * back to its pool, and abort the load with the completion's result.
+  */
+ static void handle_io_error(struct vdo_completion *completion)
+ {
+ 	struct vio *vio = as_vio(completion);
+ 	struct pooled_vio *pooled = container_of(vio, struct pooled_vio, vio);
+ 	struct block_map_zone *zone = pooled->context;
+ 	struct data_vio *data_vio = completion->parent;
+ 	/* Capture the result before the vio goes back to the pool. */
+ 	int result = completion->result;
+
+ 	vio_record_metadata_io_error(vio);
+ 	return_vio_to_pool(zone->vio_pool, pooled);
+ 	abort_load(data_vio, result);
+ }
+
+ /*
+  * bio end_io for a tree page read: continue the vio in finish_block_map_page_load() on the
+  * thread of the owning data_vio's logical zone.
+  */
+ static void load_page_endio(struct bio *bio)
+ {
+ 	struct vio *vio = bio->bi_private;
+ 	struct data_vio *owner = vio->completion.parent;
+ 	thread_id_t thread_id = owner->logical.zone->thread_id;
+
+ 	continue_vio_after_io(vio, finish_block_map_page_load, thread_id);
+ }
+
+ /*
+  * Waiter callback invoked with a pooled vio (@context): issue the read of the tree page one
+  * level below the data_vio's current lock height.
+  */
+ static void load_page(struct vdo_waiter *waiter, void *context)
+ {
+ 	struct data_vio *data_vio = vdo_waiter_as_data_vio(waiter);
+ 	struct pooled_vio *pooled = context;
+ 	struct tree_lock *lock = &data_vio->tree_lock;
+ 	height_t child_height = lock->height - 1;
+ 	physical_block_number_t pbn = lock->tree_slots[child_height].block_map_slot.pbn;
+
+ 	pooled->vio.completion.parent = data_vio;
+ 	vdo_submit_metadata_vio(&pooled->vio, pbn, load_page_endio,
+ 				handle_io_error, REQ_OP_READ | REQ_PRIO);
+ }
+
+/*
+ * If the page is already locked, queue up to wait for the lock to be released. If the lock is
+ * acquired, @data_vio->tree_lock.locked will be true.
+ */
+static int attempt_page_lock(struct block_map_zone *zone, struct data_vio *data_vio)
+{
+ int result;
+ struct tree_lock *lock_holder;
+ struct tree_lock *lock = &data_vio->tree_lock;
+ height_t height = lock->height;
+ struct block_map_tree_slot tree_slot = lock->tree_slots[height];
+ union page_key key;
+
+ key.descriptor = (struct page_descriptor) {
+ .root_index = lock->root_index,
+ .height = height,
+ .page_index = tree_slot.page_index,
+ .slot = tree_slot.block_map_slot.slot,
+ };
+ lock->key = key.key;
+
+ result = vdo_int_map_put(zone->loading_pages, lock->key,
+ lock, false, (void **) &lock_holder);
+ if (result != VDO_SUCCESS)
+ return result;
+
+ if (lock_holder == NULL) {
+ /* We got the lock */
+ data_vio->tree_lock.locked = true;
+ return VDO_SUCCESS;
+ }
+
+ /* Someone else is loading or allocating the page we need */
+ vdo_waitq_enqueue_waiter(&lock_holder->waiters, &data_vio->waiter);
+ return VDO_SUCCESS;
+}
+
+ /*
+  * Load a block map tree page from disk, for the next level in the data vio tree lock. If the
+  * page lock could not be taken because another data_vio holds it, this data_vio has already been
+  * queued as a waiter and nothing more is done here.
+  */
+ static void load_block_map_page(struct block_map_zone *zone, struct data_vio *data_vio)
+ {
+ 	int result = attempt_page_lock(zone, data_vio);
+
+ 	if (result != VDO_SUCCESS) {
+ 		abort_load(data_vio, result);
+ 		return;
+ 	}
+
+ 	if (!data_vio->tree_lock.locked)
+ 		return;
+
+ 	/* We hold the lock, so get a vio with which to do the read. */
+ 	data_vio->waiter.callback = load_page;
+ 	acquire_vio_from_pool(zone->vio_pool, &data_vio->waiter);
+ }
+
+ /*
+  * Error handler for a failed tree page allocation. The completion is requeued to the logical
+  * zone thread if needed; otherwise the lookup is aborted with the completion's result.
+  */
+ static void allocation_failure(struct vdo_completion *completion)
+ {
+ 	struct data_vio *data_vio = as_data_vio(completion);
+ 	thread_id_t logical_thread = data_vio->logical.zone->thread_id;
+
+ 	if (!vdo_requeue_completion_if_needed(completion, logical_thread))
+ 		abort_lookup(data_vio, completion->result, "allocation");
+ }
+
+ /*
+  * Waiter callback run when the page a data_vio was waiting on has been allocated. @context
+  * points at the PBN of the new page. The waiter steps down one level, records the PBN there, and
+  * either finishes (height 0) or goes on to allocate the next level.
+  */
+ static void continue_allocation_for_waiter(struct vdo_waiter *waiter, void *context)
+ {
+ 	struct data_vio *data_vio = vdo_waiter_as_data_vio(waiter);
+ 	struct tree_lock *lock = &data_vio->tree_lock;
+ 	physical_block_number_t new_pbn = *((physical_block_number_t *) context);
+
+ 	--lock->height;
+ 	lock->tree_slots[lock->height].block_map_slot.pbn = new_pbn;
+
+ 	if (lock->height != 0) {
+ 		allocate_block_map_page(data_vio->logical.zone->block_map_zone, data_vio);
+ 		return;
+ 	}
+
+ 	finish_lookup(data_vio, VDO_SUCCESS);
+ }
+
+ /**
+  * expire_oldest_list() - Expire the oldest list.
+  *
+  * Moves the tree-page and cache-page entries of the era at the current offset onto the expired
+  * lists, advances the oldest period, and advances the offset, wrapping at maximum_age.
+  */
+ static void expire_oldest_list(struct dirty_lists *dirty_lists)
+ {
+ 	block_count_t era = dirty_lists->offset++;
+ 	struct list_head *tree_era = &dirty_lists->eras[era][VDO_TREE_PAGE];
+ 	struct list_head *cache_era = &dirty_lists->eras[era][VDO_CACHE_PAGE];
+
+ 	dirty_lists->oldest_period++;
+
+ 	if (!list_empty(tree_era))
+ 		list_splice_tail_init(tree_era, &dirty_lists->expired[VDO_TREE_PAGE]);
+
+ 	if (!list_empty(cache_era))
+ 		list_splice_tail_init(cache_era, &dirty_lists->expired[VDO_CACHE_PAGE]);
+
+ 	if (dirty_lists->offset == dirty_lists->maximum_age)
+ 		dirty_lists->offset = 0;
+ }
+
+
+ /**
+  * update_period() - Update the dirty_lists period if necessary.
+  *
+  * Advances next_period until it passes @period, expiring the oldest list each time the window
+  * between oldest_period and next_period has reached maximum_age.
+  */
+ static void update_period(struct dirty_lists *dirty, sequence_number_t period)
+ {
+ 	for (; dirty->next_period <= period; dirty->next_period++) {
+ 		sequence_number_t window = dirty->next_period - dirty->oldest_period;
+
+ 		if (window == dirty->maximum_age)
+ 			expire_oldest_list(dirty);
+ 	}
+ }
+
+ /**
+ * write_expired_elements() - Write out the expired list.
+ * @zone: The zone whose expired dirty lists are drained.
+ *
+ * Expired tree pages are stamped with the zone's generation and enqueued unless already being
+ * written; expired cache pages are scheduled for saving, and then the page cache is saved.
+ */
+ static void write_expired_elements(struct block_map_zone *zone)
+ {
+ struct tree_page *page, *ttmp;
+ struct page_info *info, *ptmp;
+ struct list_head *expired;
+ /* Sample the generation once so every page expired in this pass gets the same value. */
+ u8 generation = zone->generation;
+
+ expired = &zone->dirty_lists->expired[VDO_TREE_PAGE];
+ list_for_each_entry_safe(page, ttmp, expired, entry) {
+ int result;
+
+ list_del_init(&page->entry);
+
+ result = VDO_ASSERT(!vdo_waiter_is_waiting(&page->waiter),
+ "Newly expired page not already waiting to write");
+ if (result != VDO_SUCCESS) {
+ /* The assertion failed: go read-only and skip this page. */
+ enter_zone_read_only_mode(zone, result);
+ continue;
+ }
+
+ set_generation(zone, page, generation);
+ if (!page->writing)
+ enqueue_page(page, zone);
+ }
+
+ expired = &zone->dirty_lists->expired[VDO_CACHE_PAGE];
+ list_for_each_entry_safe(info, ptmp, expired, state_entry) {
+ list_del_init(&info->state_entry);
+ schedule_page_save(info);
+ }
+
+ save_pages(&zone->page_cache);
+ }
+
+/**
+ * add_to_dirty_lists() - Add an element to the dirty lists.
+ * @zone: The zone in which we are operating.
+ * @entry: The list entry of the element to add.
+ * @type: The type of page.
+ * @old_period: The period in which the element was previously dirtied, or 0 if it was not dirty.
+ * @new_period: The period in which the element has now been dirtied, or 0 if it does not hold a
+ * lock.
+ */
+static void add_to_dirty_lists(struct block_map_zone *zone,
+ struct list_head *entry,
+ enum block_map_page_type type,
+ sequence_number_t old_period,
+ sequence_number_t new_period)
+{
+ struct dirty_lists *dirty_lists = zone->dirty_lists;
+
+ if ((old_period == new_period) || ((old_period != 0) && (old_period < new_period)))
+ return;
+
+ if (new_period < dirty_lists->oldest_period) {
+ list_move_tail(entry, &dirty_lists->expired[type]);
+ } else {
+ update_period(dirty_lists, new_period);
+ list_move_tail(entry,
+ &dirty_lists->eras[new_period % dirty_lists->maximum_age][type]);
+ }
+
+ write_expired_elements(zone);
+}
+
+/*
+ * Record the allocation in the tree and wake any waiters now that the write lock has been
+ * released.
+ */
+static void finish_block_map_allocation(struct vdo_completion *completion)
+{
+ physical_block_number_t pbn;
+ struct tree_page *tree_page;
+ struct block_map_page *page;
+ sequence_number_t old_lock;
+ struct data_vio *data_vio = as_data_vio(completion);
+ struct block_map_zone *zone = data_vio->logical.zone->block_map_zone;
+ struct tree_lock *tree_lock = &data_vio->tree_lock;
+ height_t height = tree_lock->height;
+
+ assert_data_vio_in_logical_zone(data_vio);
+
+ tree_page = get_tree_page(zone, tree_lock);
+ pbn = tree_lock->tree_slots[height - 1].block_map_slot.pbn;
+
+ /* Record the allocation. */
+ page = (struct block_map_page *) tree_page->page_buffer;
+ old_lock = tree_page->recovery_lock;
+ vdo_update_block_map_page(page, data_vio, pbn,
+ VDO_MAPPING_STATE_UNCOMPRESSED,
+ &tree_page->recovery_lock);
+
+ if (vdo_waiter_is_waiting(&tree_page->waiter)) {
+ /* This page is waiting to be written out. */
+ if (zone->flusher != tree_page) {
+ /*
+ * The outstanding flush won't cover the update we just made,
+ * so mark the page as needing another flush.
+ */
+ set_generation(zone, tree_page, zone->generation);
+ }
+ } else {
+ /* Put the page on a dirty list */
+ if (old_lock == 0)
+ INIT_LIST_HEAD(&tree_page->entry);
+ add_to_dirty_lists(zone, &tree_page->entry, VDO_TREE_PAGE,
+ old_lock, tree_page->recovery_lock);
+ }
+
+ tree_lock->height--;
+ if (height > 1) {
+ /* Format the interior node we just allocated (in memory). */
+ tree_page = get_tree_page(zone, tree_lock);
+ vdo_format_block_map_page(tree_page->page_buffer,
+ zone->block_map->nonce,
+ pbn, false);
+ }
+
+ /* Release our claim to the allocation and wake any waiters */
+ release_page_lock(data_vio, "allocation");
+ vdo_waitq_notify_all_waiters(&tree_lock->waiters,
+ continue_allocation_for_waiter, &pbn);
+ if (tree_lock->height == 0) {
+ finish_lookup(data_vio, VDO_SUCCESS);
+ return;
+ }
+
+ allocate_block_map_page(zone, data_vio);
+}
+
+ /*
+  * Release the allocation lock held for the new tree page, then hand the data_vio to its logical
+  * zone to finish the allocation. Runs in the allocated zone (asserted).
+  */
+ static void release_block_map_write_lock(struct vdo_completion *completion)
+ {
+ 	struct data_vio *allocating_vio = as_data_vio(completion);
+
+ 	assert_data_vio_in_allocated_zone(allocating_vio);
+
+ 	release_data_vio_allocation_lock(allocating_vio, true);
+ 	launch_data_vio_logical_callback(allocating_vio, finish_block_map_allocation);
+ }
+
+/*
+ * Newly allocated block map pages are set to have to MAXIMUM_REFERENCES after they are journaled,
+ * to prevent deduplication against the block after we release the write lock on it, but before we
+ * write out the page.
+ */
+static void set_block_map_page_reference_count(struct vdo_completion *completion)
+{
+ struct data_vio *data_vio = as_data_vio(completion);
+
+ assert_data_vio_in_allocated_zone(data_vio);
+
+ completion->callback = release_block_map_write_lock;
+ vdo_modify_reference_count(completion, &data_vio->increment_updater);
+}
+
+ /*
+  * Add a recovery journal entry for the tree page allocation. The data_vio's next callback is set
+  * to update the reference count in the allocated zone. Runs in the journal zone (asserted).
+  */
+ static void journal_block_map_allocation(struct vdo_completion *completion)
+ {
+ 	struct data_vio *allocating_vio = as_data_vio(completion);
+ 	struct recovery_journal *journal;
+
+ 	assert_data_vio_in_journal_zone(allocating_vio);
+
+ 	set_data_vio_allocated_zone_callback(allocating_vio,
+ 					     set_block_map_page_reference_count);
+ 	journal = completion->vdo->recovery_journal;
+ 	vdo_add_recovery_journal_entry(journal, allocating_vio);
+ }
+
+ /*
+  * Allocate a physical block for a new tree page. On success the PBN is recorded in the slot one
+  * level below the current height, the reference updater for the block map remapping is prepared,
+  * and the data_vio moves on to journaling. Runs in the allocated zone (asserted).
+  */
+ static void allocate_block(struct vdo_completion *completion)
+ {
+ 	struct data_vio *data_vio = as_data_vio(completion);
+ 	struct tree_lock *lock = &data_vio->tree_lock;
+ 	physical_block_number_t new_pbn;
+
+ 	assert_data_vio_in_allocated_zone(data_vio);
+
+ 	/* Nothing more to do here if no block was allocated by this call. */
+ 	if (!vdo_allocate_block_in_zone(data_vio))
+ 		return;
+
+ 	new_pbn = data_vio->allocation.pbn;
+ 	lock->tree_slots[lock->height - 1].block_map_slot.pbn = new_pbn;
+
+ 	data_vio->increment_updater = (struct reference_updater) {
+ 		.operation = VDO_JOURNAL_BLOCK_MAP_REMAPPING,
+ 		.increment = true,
+ 		.zpbn = {
+ 			.pbn = new_pbn,
+ 			.state = VDO_MAPPING_STATE_UNCOMPRESSED,
+ 		},
+ 		.lock = data_vio->allocation.lock,
+ 	};
+
+ 	launch_data_vio_journal_callback(data_vio, journal_block_map_allocation);
+ }
+
+ /*
+  * Start allocating the missing tree page below the data_vio's current height. Reads and discards
+  * finish immediately with success. Otherwise the page lock is attempted; if another data_vio
+  * holds it, this one has been queued as a waiter and nothing more is done here.
+  */
+ static void allocate_block_map_page(struct block_map_zone *zone,
+ 				    struct data_vio *data_vio)
+ {
+ 	int result;
+ 	bool needs_allocation = (data_vio->write && !data_vio->is_discard);
+
+ 	if (!needs_allocation) {
+ 		/* This is a pure read or a discard, so there's nothing left to do here. */
+ 		finish_lookup(data_vio, VDO_SUCCESS);
+ 		return;
+ 	}
+
+ 	result = attempt_page_lock(zone, data_vio);
+ 	if (result != VDO_SUCCESS) {
+ 		abort_lookup(data_vio, result, "allocation");
+ 		return;
+ 	}
+
+ 	if (data_vio->tree_lock.locked) {
+ 		data_vio_allocate_data_block(data_vio, VIO_BLOCK_MAP_WRITE_LOCK,
+ 					     allocate_block, allocation_failure);
+ 	}
+ }
+
+/**
+ * vdo_find_block_map_slot() - Find the block map slot in which the block map entry for a data_vio
+ * resides and cache that result in the data_vio.
+ *
+ * All ancestors in the tree will be allocated or loaded, as needed.
+ */
+void vdo_find_block_map_slot(struct data_vio *data_vio)
+{
+ page_number_t page_index;
+ struct block_map_tree_slot tree_slot;
+ struct data_location mapping;
+ struct block_map_page *page = NULL;
+ struct tree_lock *lock = &data_vio->tree_lock;
+ struct block_map_zone *zone = data_vio->logical.zone->block_map_zone;
+
+ zone->active_lookups++;
+ if (vdo_is_state_draining(&zone->state)) {
+ finish_lookup(data_vio, VDO_SHUTTING_DOWN);
+ return;
+ }
+
+ lock->tree_slots[0].block_map_slot.slot =
+ data_vio->logical.lbn % VDO_BLOCK_MAP_ENTRIES_PER_PAGE;
+ page_index = (lock->tree_slots[0].page_index / zone->block_map->root_count);
+ tree_slot = (struct block_map_tree_slot) {
+ .page_index = page_index / VDO_BLOCK_MAP_ENTRIES_PER_PAGE,
+ .block_map_slot = {
+ .pbn = 0,
+ .slot = page_index % VDO_BLOCK_MAP_ENTRIES_PER_PAGE,
+ },
+ };
+
+ for (lock->height = 1; lock->height <= VDO_BLOCK_MAP_TREE_HEIGHT; lock->height++) {
+ physical_block_number_t pbn;
+
+ lock->tree_slots[lock->height] = tree_slot;
+ page = (struct block_map_page *) (get_tree_page(zone, lock)->page_buffer);
+ pbn = vdo_get_block_map_page_pbn(page);
+ if (pbn != VDO_ZERO_BLOCK) {
+ lock->tree_slots[lock->height].block_map_slot.pbn = pbn;
+ break;
+ }
+
+ /* Calculate the index and slot for the next level. */
+ tree_slot.block_map_slot.slot =
+ tree_slot.page_index % VDO_BLOCK_MAP_ENTRIES_PER_PAGE;
+ tree_slot.page_index = tree_slot.page_index / VDO_BLOCK_MAP_ENTRIES_PER_PAGE;
+ }
+
+ /* The page at this height has been allocated and loaded. */
+ mapping = vdo_unpack_block_map_entry(&page->entries[tree_slot.block_map_slot.slot]);
+ if (is_invalid_tree_entry(vdo_from_data_vio(data_vio), &mapping, lock->height)) {
+ vdo_log_error_strerror(VDO_BAD_MAPPING,
+ "Invalid block map tree PBN: %llu with state %u for page index %u at height %u",
+ (unsigned long long) mapping.pbn, mapping.state,
+ lock->tree_slots[lock->height - 1].page_index,
+ lock->height - 1);
+ abort_load(data_vio, VDO_BAD_MAPPING);
+ return;
+ }
+
+ if (!vdo_is_mapped_location(&mapping)) {
+ /* The page we want one level down has not been allocated, so allocate it. */
+ allocate_block_map_page(zone, data_vio);
+ return;
+ }
+
+ lock->tree_slots[lock->height - 1].block_map_slot.pbn = mapping.pbn;
+ if (lock->height == 1) {
+ /* This is the ultimate block map page, so we're done */
+ finish_lookup(data_vio, VDO_SUCCESS);
+ return;
+ }
+
+ /* We know what page we need to load. */
+ load_block_map_page(zone, data_vio);
+}
+
+/*
+ * Find the PBN of a leaf block map page. This method may only be used after all allocated tree
+ * pages have been loaded, otherwise, it may give the wrong answer (0).
+ */
+physical_block_number_t vdo_find_block_map_page_pbn(struct block_map *map,
+ page_number_t page_number)
+{
+ struct data_location mapping;
+ struct tree_page *tree_page;
+ struct block_map_page *page;
+ root_count_t root_index = page_number % map->root_count;
+ page_number_t page_index = page_number / map->root_count;
+ slot_number_t slot = page_index % VDO_BLOCK_MAP_ENTRIES_PER_PAGE;
+
+ page_index /= VDO_BLOCK_MAP_ENTRIES_PER_PAGE;
+
+ tree_page = get_tree_page_by_index(map->forest, root_index, 1, page_index);
+ page = (struct block_map_page *) tree_page->page_buffer;
+ if (!page->header.initialized)
+ return VDO_ZERO_BLOCK;
+
+ mapping = vdo_unpack_block_map_entry(&page->entries[slot]);
+ if (!vdo_is_valid_location(&mapping) || vdo_is_state_compressed(mapping.state))
+ return VDO_ZERO_BLOCK;
+ return mapping.pbn;
+}
+
+/*
+ * Write a tree page or indicate that it has been re-dirtied if it is already being written. This
+ * method is used when correcting errors in the tree during read-only rebuild.
+ */
+void vdo_write_tree_page(struct tree_page *page, struct block_map_zone *zone)
+{
+ bool waiting = vdo_waiter_is_waiting(&page->waiter);
+
+ if (waiting && (zone->flusher == page))
+ return;
+
+ set_generation(zone, page, zone->generation);
+ if (waiting || page->writing)
+ return;
+
+ enqueue_page(page, zone);
+}
+
+ /*
+  * Build the newest segment of @forest.
+  *
+  * @old_forest: The forest being expanded, or NULL if this is the first segment.
+  * @new_pages: The number of tree pages to allocate for the new segment.
+  * @new_boundary: The per-level page counts of the forest after the new segment is added.
+  * @forest: The forest to populate; forest->map must already be set.
+  *
+  * The boundary and page-pointer arrays of @old_forest (if any) are copied into @forest, the new
+  * segment's pages are allocated, and each root's tree gets per-level pointers into those pages.
+  * The root page of each tree in the new segment is formatted with entry 0 pointing at that root.
+  *
+  * On failure, partially built state is left in @forest for the caller to release.
+  *
+  * Return: VDO_SUCCESS or an allocation error.
+  */
+ static int make_segment(struct forest *old_forest, block_count_t new_pages,
+ 			struct boundary *new_boundary, struct forest *forest)
+ {
+ 	size_t index = (old_forest == NULL) ? 0 : old_forest->segments;
+ 	struct tree_page *page_ptr;
+ 	page_count_t segment_sizes[VDO_BLOCK_MAP_TREE_HEIGHT];
+ 	height_t height;
+ 	root_count_t root;
+ 	int result;
+
+ 	forest->segments = index + 1;
+
+ 	result = vdo_allocate(forest->segments, struct boundary,
+ 			      "forest boundary array", &forest->boundaries);
+ 	if (result != VDO_SUCCESS)
+ 		return result;
+
+ 	result = vdo_allocate(forest->segments, struct tree_page *,
+ 			      "forest page pointers", &forest->pages);
+ 	if (result != VDO_SUCCESS)
+ 		return result;
+
+ 	result = vdo_allocate(new_pages, struct tree_page,
+ 			      "new forest pages", &forest->pages[index]);
+ 	if (result != VDO_SUCCESS)
+ 		return result;
+
+ 	/* Carry over the descriptions of the existing segments. */
+ 	if (index > 0) {
+ 		memcpy(forest->boundaries, old_forest->boundaries,
+ 		       index * sizeof(struct boundary));
+ 		memcpy(forest->pages, old_forest->pages,
+ 		       index * sizeof(struct tree_page *));
+ 	}
+
+ 	memcpy(&(forest->boundaries[index]), new_boundary, sizeof(struct boundary));
+
+ 	/* The new segment holds only the pages beyond the previous boundary at each level. */
+ 	for (height = 0; height < VDO_BLOCK_MAP_TREE_HEIGHT; height++) {
+ 		segment_sizes[height] = new_boundary->levels[height];
+ 		if (index > 0)
+ 			segment_sizes[height] -= old_forest->boundaries[index - 1].levels[height];
+ 	}
+
+ 	page_ptr = forest->pages[index];
+ 	for (root = 0; root < forest->map->root_count; root++) {
+ 		struct block_map_tree_segment *segment;
+ 		struct block_map_tree *tree = &(forest->trees[root]);
+
+ 		/* Reuse the function-scope result and height rather than shadowing them. */
+ 		result = vdo_allocate(forest->segments,
+ 				      struct block_map_tree_segment,
+ 				      "tree root segments", &tree->segments);
+ 		if (result != VDO_SUCCESS)
+ 			return result;
+
+ 		if (index > 0) {
+ 			memcpy(tree->segments, old_forest->trees[root].segments,
+ 			       index * sizeof(struct block_map_tree_segment));
+ 		}
+
+ 		segment = &(tree->segments[index]);
+ 		for (height = 0; height < VDO_BLOCK_MAP_TREE_HEIGHT; height++) {
+ 			if (segment_sizes[height] == 0)
+ 				continue;
+
+ 			segment->levels[height] = page_ptr;
+ 			if (height == (VDO_BLOCK_MAP_TREE_HEIGHT - 1)) {
+ 				/* Record the root. */
+ 				struct block_map_page *page =
+ 					vdo_format_block_map_page(page_ptr->page_buffer,
+ 								  forest->map->nonce,
+ 								  VDO_INVALID_PBN, true);
+ 				page->entries[0] =
+ 					vdo_pack_block_map_entry(forest->map->root_origin + root,
+ 								 VDO_MAPPING_STATE_UNCOMPRESSED);
+ 			}
+ 			page_ptr += segment_sizes[height];
+ 		}
+ 	}
+
+ 	return VDO_SUCCESS;
+ }
+
+ /*
+  * Free a forest. Page arrays are freed only for segments from @first_page_segment onward; the
+  * page-pointer array, every tree's segment array, the boundaries, and the forest itself are
+  * always freed.
+  */
+ static void deforest(struct forest *forest, size_t first_page_segment)
+ {
+ 	root_count_t tree_index;
+ 	size_t segment_index;
+
+ 	if (forest->pages != NULL) {
+ 		for (segment_index = first_page_segment;
+ 		     segment_index < forest->segments;
+ 		     segment_index++)
+ 			vdo_free(forest->pages[segment_index]);
+
+ 		vdo_free(forest->pages);
+ 	}
+
+ 	for (tree_index = 0; tree_index < forest->map->root_count; tree_index++)
+ 		vdo_free(forest->trees[tree_index].segments);
+
+ 	vdo_free(forest->boundaries);
+ 	vdo_free(forest);
+ }
+
+/**
+ * make_forest() - Make a collection of trees for a block_map, expanding the existing forest if
+ * there is one.
+ * @entries: The number of entries the block map will hold.
+ *
+ * Return: VDO_SUCCESS or an error.
+ */
+static int make_forest(struct block_map *map, block_count_t entries)
+{
+ struct forest *forest, *old_forest = map->forest;
+ struct boundary new_boundary, *old_boundary = NULL;
+ block_count_t new_pages;
+ int result;
+
+ if (old_forest != NULL)
+ old_boundary = &(old_forest->boundaries[old_forest->segments - 1]);
+
+ new_pages = vdo_compute_new_forest_pages(map->root_count, old_boundary,
+ entries, &new_boundary);
+ if (new_pages == 0) {
+ map->next_entry_count = entries;
+ return VDO_SUCCESS;
+ }
+
+ result = vdo_allocate_extended(struct forest, map->root_count,
+ struct block_map_tree, __func__,
+ &forest);
+ if (result != VDO_SUCCESS)
+ return result;
+
+ forest->map = map;
+ result = make_segment(old_forest, new_pages, &new_boundary, forest);
+ if (result != VDO_SUCCESS) {
+ deforest(forest, forest->segments - 1);
+ return result;
+ }
+
+ map->next_forest = forest;
+ map->next_entry_count = entries;
+ return VDO_SUCCESS;
+}
+
+/**
+ * replace_forest() - Replace a block_map's forest with the already-prepared larger forest.
+ */
+static void replace_forest(struct block_map *map)
+{
+ if (map->next_forest != NULL) {
+ if (map->forest != NULL)
+ deforest(map->forest, map->forest->segments);
+ map->forest = vdo_forget(map->next_forest);
+ }
+
+ map->entry_count = map->next_entry_count;
+ map->next_entry_count = 0;
+}
+
+/**
+ * finish_cursor() - Finish the traversal of a single tree. If it was the last cursor, finish the
+ * traversal.
+ */
+static void finish_cursor(struct cursor *cursor)
+{
+ struct cursors *cursors = cursor->parent;
+ struct vdo_completion *completion = cursors->completion;
+
+ return_vio_to_pool(cursors->pool, vdo_forget(cursor->vio));
+ if (--cursors->active_roots > 0)
+ return;
+
+ vdo_free(cursors);
+
+ vdo_finish_completion(completion);
+}
+
+ /* Forward declaration: the completion callbacks below call traverse() before its definition. */
+ static void traverse(struct cursor *cursor);
+
+/**
+ * continue_traversal() - Continue traversing a block map tree.
+ * @completion: The VIO doing a read or write.
+ */
+static void continue_traversal(struct vdo_completion *completion)
+{
+ vio_record_metadata_io_error(as_vio(completion));
+ traverse(completion->parent);
+}
+
+/**
+ * finish_traversal_load() - Continue traversing a block map tree now that a page has been loaded.
+ * @completion: The VIO doing the read.
+ */
+static void finish_traversal_load(struct vdo_completion *completion)
+{
+ struct cursor *cursor = completion->parent;
+ height_t height = cursor->height;
+ struct cursor_level *level = &cursor->levels[height];
+ struct tree_page *tree_page =
+ &(cursor->tree->segments[0].levels[height][level->page_index]);
+ struct block_map_page *page = (struct block_map_page *) tree_page->page_buffer;
+
+ vdo_copy_valid_page(cursor->vio->vio.data,
+ cursor->parent->zone->block_map->nonce,
+ pbn_from_vio_bio(cursor->vio->vio.bio), page);
+ traverse(cursor);
+}
+
+ /*
+  * bio end_io for a traversal read: continue the vio in finish_traversal_load() on the thread of
+  * the zone doing the traversal.
+  */
+ static void traversal_endio(struct bio *bio)
+ {
+ 	struct vio *vio = bio->bi_private;
+ 	struct cursor *cursor = vio->completion.parent;
+ 	thread_id_t thread_id = cursor->parent->zone->thread_id;
+
+ 	continue_vio_after_io(vio, finish_traversal_load, thread_id);
+ }
+
+/**
+ * traverse() - Traverse a single block map tree.
+ *
+ * This is the recursive heart of the traversal process.
+ */
+static void traverse(struct cursor *cursor)
+{
+ for (; cursor->height < VDO_BLOCK_MAP_TREE_HEIGHT; cursor->height++) {
+ height_t height = cursor->height;
+ struct cursor_level *level = &cursor->levels[height];
+ struct tree_page *tree_page =
+ &(cursor->tree->segments[0].levels[height][level->page_index]);
+ struct block_map_page *page = (struct block_map_page *) tree_page->page_buffer;
+
+ if (!page->header.initialized)
+ continue;
+
+ for (; level->slot < VDO_BLOCK_MAP_ENTRIES_PER_PAGE; level->slot++) {
+ struct cursor_level *next_level;
+ page_number_t entry_index =
+ (VDO_BLOCK_MAP_ENTRIES_PER_PAGE * level->page_index) + level->slot;
+ struct data_location location =
+ vdo_unpack_block_map_entry(&page->entries[level->slot]);
+
+ if (!vdo_is_valid_location(&location)) {
+ /* This entry is invalid, so remove it from the page. */
+ page->entries[level->slot] = UNMAPPED_BLOCK_MAP_ENTRY;
+ vdo_write_tree_page(tree_page, cursor->parent->zone);
+ continue;
+ }
+
+ if (!vdo_is_mapped_location(&location))
+ continue;
+
+ /* Erase mapped entries past the end of the logical space. */
+ if (entry_index >= cursor->boundary.levels[height]) {
+ page->entries[level->slot] = UNMAPPED_BLOCK_MAP_ENTRY;
+ vdo_write_tree_page(tree_page, cursor->parent->zone);
+ continue;
+ }
+
+ if (cursor->height < VDO_BLOCK_MAP_TREE_HEIGHT - 1) {
+ int result = cursor->parent->entry_callback(location.pbn,
+ cursor->parent->completion);
+ if (result != VDO_SUCCESS) {
+ page->entries[level->slot] = UNMAPPED_BLOCK_MAP_ENTRY;
+ vdo_write_tree_page(tree_page, cursor->parent->zone);
+ continue;
+ }
+ }
+
+ if (cursor->height == 0)
+ continue;
+
+ cursor->height--;
+ next_level = &cursor->levels[cursor->height];
+ next_level->page_index = entry_index;
+ next_level->slot = 0;
+ level->slot++;
+ vdo_submit_metadata_vio(&cursor->vio->vio, location.pbn,
+ traversal_endio, continue_traversal,
+ REQ_OP_READ | REQ_PRIO);
+ return;
+ }
+ }
+
+ finish_cursor(cursor);
+}
+
+/**
+ * launch_cursor() - Start traversing a single block map tree now that the cursor has a VIO with
+ * which to load pages.
+ * @context: The pooled_vio just acquired.
+ *
+ * Implements waiter_callback_fn.
+ */
+static void launch_cursor(struct vdo_waiter *waiter, void *context)
+{
+ struct cursor *cursor = container_of(waiter, struct cursor, waiter);
+ struct pooled_vio *pooled = context;
+
+ cursor->vio = pooled;
+ pooled->vio.completion.parent = cursor;
+ pooled->vio.completion.callback_thread_id = cursor->parent->zone->thread_id;
+ traverse(cursor);
+}
+
+/**
+ * compute_boundary() - Compute the number of pages used at each level of the given root's tree.
+ *
+ * Return: The list of page counts as a boundary structure.
+ */
+static struct boundary compute_boundary(struct block_map *map, root_count_t root_index)
+{
+ struct boundary boundary;
+ height_t height;
+ page_count_t leaf_pages = vdo_compute_block_map_page_count(map->entry_count);
+ /*
+ * Compute the leaf pages for this root. If the number of leaf pages does not distribute
+ * evenly, we must determine if this root gets an extra page. Extra pages are assigned to
+ * roots starting from tree 0.
+ */
+ page_count_t last_tree_root = (leaf_pages - 1) % map->root_count;
+ page_count_t level_pages = leaf_pages / map->root_count;
+
+ if (root_index <= last_tree_root)
+ level_pages++;
+
+ for (height = 0; height < VDO_BLOCK_MAP_TREE_HEIGHT - 1; height++) {
+ boundary.levels[height] = level_pages;
+ level_pages = DIV_ROUND_UP(level_pages, VDO_BLOCK_MAP_ENTRIES_PER_PAGE);
+ }
+
+ /* The root node always exists, even if the root is otherwise unused. */
+ boundary.levels[VDO_BLOCK_MAP_TREE_HEIGHT - 1] = 1;
+
+ return boundary;
+}
+
+/**
+ * vdo_traverse_forest() - Walk the entire forest of a block map.
+ * @callback: A function to call with the pbn of each allocated node in the forest.
+ * @completion: The completion to notify on each traversed PBN, and when traversal completes.
+ */
+void vdo_traverse_forest(struct block_map *map, vdo_entry_callback_fn callback,
+ struct vdo_completion *completion)
+{
+ root_count_t root;
+ struct cursors *cursors;
+ int result;
+
+ result = vdo_allocate_extended(struct cursors, map->root_count,
+ struct cursor, __func__, &cursors);
+ if (result != VDO_SUCCESS) {
+ vdo_fail_completion(completion, result);
+ return;
+ }
+
+ cursors->zone = &map->zones[0];
+ cursors->pool = cursors->zone->vio_pool;
+ cursors->entry_callback = callback;
+ cursors->completion = completion;
+ cursors->active_roots = map->root_count;
+ for (root = 0; root < map->root_count; root++) {
+ struct cursor *cursor = &cursors->cursors[root];
+
+ *cursor = (struct cursor) {
+ .tree = &map->forest->trees[root],
+ .height = VDO_BLOCK_MAP_TREE_HEIGHT - 1,
+ .parent = cursors,
+ .boundary = compute_boundary(map, root),
+ };
+
+ cursor->waiter.callback = launch_cursor;
+ acquire_vio_from_pool(cursors->pool, &cursor->waiter);
+ }
+}
+
+/**
+ * initialize_block_map_zone() - Initialize the per-zone portions of the block map.
+ * @maximum_age: The number of journal blocks before a dirtied page is considered old and must be
+ * written out.
+ */
+static int __must_check initialize_block_map_zone(struct block_map *map,
+ zone_count_t zone_number,
+ page_count_t cache_size,
+ block_count_t maximum_age)
+{
+ int result;
+ block_count_t i;
+ struct vdo *vdo = map->vdo;
+ struct block_map_zone *zone = &map->zones[zone_number];
+
+ BUILD_BUG_ON(sizeof(struct page_descriptor) != sizeof(u64));
+
+ zone->zone_number = zone_number;
+ zone->thread_id = vdo->thread_config.logical_threads[zone_number];
+ zone->block_map = map;
+
+ result = vdo_allocate_extended(struct dirty_lists, maximum_age,
+ dirty_era_t, __func__,
+ &zone->dirty_lists);
+ if (result != VDO_SUCCESS)
+ return result;
+
+ zone->dirty_lists->maximum_age = maximum_age;
+ INIT_LIST_HEAD(&zone->dirty_lists->expired[VDO_TREE_PAGE]);
+ INIT_LIST_HEAD(&zone->dirty_lists->expired[VDO_CACHE_PAGE]);
+
+ for (i = 0; i < maximum_age; i++) {
+ INIT_LIST_HEAD(&zone->dirty_lists->eras[i][VDO_TREE_PAGE]);
+ INIT_LIST_HEAD(&zone->dirty_lists->eras[i][VDO_CACHE_PAGE]);
+ }
+
+ result = vdo_int_map_create(VDO_LOCK_MAP_CAPACITY, &zone->loading_pages);
+ if (result != VDO_SUCCESS)
+ return result;
+
+ result = make_vio_pool(vdo, BLOCK_MAP_VIO_POOL_SIZE,
+ zone->thread_id, VIO_TYPE_BLOCK_MAP_INTERIOR,
+ VIO_PRIORITY_METADATA, zone, &zone->vio_pool);
+ if (result != VDO_SUCCESS)
+ return result;
+
+ vdo_set_admin_state_code(&zone->state, VDO_ADMIN_STATE_NORMAL_OPERATION);
+
+ zone->page_cache.zone = zone;
+ zone->page_cache.vdo = vdo;
+ zone->page_cache.page_count = cache_size / map->zone_count;
+ zone->page_cache.stats.free_pages = zone->page_cache.page_count;
+
+ result = allocate_cache_components(&zone->page_cache);
+ if (result != VDO_SUCCESS)
+ return result;
+
+ /* initialize empty circular queues */
+ INIT_LIST_HEAD(&zone->page_cache.lru_list);
+ INIT_LIST_HEAD(&zone->page_cache.outgoing_list);
+
+ return VDO_SUCCESS;
+}
+
+ /*
+  * Implements vdo_zone_thread_getter_fn.
+  *
+  * @context is the block_map; returns the thread id recorded for the numbered zone.
+  */
+ static thread_id_t get_block_map_zone_thread_id(void *context, zone_count_t zone_number)
+ {
+ 	struct block_map *map = context;
+ 	struct block_map_zone *zone = &map->zones[zone_number];
+
+ 	return zone->thread_id;
+ }
+
+ /*
+  * Implements vdo_action_preamble_fn.
+  *
+  * Makes the pending era point the current one, then finishes @parent.
+  */
+ static void prepare_for_era_advance(void *context, struct vdo_completion *parent)
+ {
+ 	struct block_map *block_map = context;
+
+ 	block_map->current_era_point = block_map->pending_era_point;
+ 	vdo_finish_completion(parent);
+ }
+
+ /*
+  * Implements vdo_zone_action_fn.
+  *
+  * Brings the zone's dirty lists up to the map's current era point, writes out whatever expired,
+  * and finishes @parent.
+  */
+ static void advance_block_map_zone_era(void *context, zone_count_t zone_number,
+ 				       struct vdo_completion *parent)
+ {
+ 	struct block_map *block_map = context;
+ 	struct block_map_zone *zone = &block_map->zones[zone_number];
+ 	sequence_number_t era_point = block_map->current_era_point;
+
+ 	update_period(zone->dirty_lists, era_point);
+ 	write_expired_elements(zone);
+ 	vdo_finish_completion(parent);
+ }
+
+/*
+ * Schedule an era advance when the pending era point differs from the current one.
+ *
+ * Do not call this directly; call vdo_schedule_default_action() on the block map's action
+ * manager instead.
+ *
+ * Implements vdo_action_scheduler_fn.
+ *
+ * Return: false when the era points already match, otherwise the result of
+ *         vdo_schedule_action().
+ */
+static bool schedule_era_advance(void *context)
+{
+	struct block_map *block_map = context;
+	bool advance_needed = (block_map->current_era_point != block_map->pending_era_point);
+
+	if (!advance_needed)
+		return false;
+
+	return vdo_schedule_action(block_map->action_manager, prepare_for_era_advance,
+				   advance_block_map_zone_era, NULL, NULL);
+}
+
+/*
+ * Release the resources held by a block map zone: its dirty lists, vio pool, loading-page map,
+ * and the components of its page cache. Each pointer is cleared as it is released.
+ */
+static void uninitialize_block_map_zone(struct block_map_zone *zone)
+{
+	struct vdo_page_cache *page_cache = &zone->page_cache;
+	page_count_t i;
+
+	vdo_free(vdo_forget(zone->dirty_lists));
+	free_vio_pool(vdo_forget(zone->vio_pool));
+	vdo_int_map_free(vdo_forget(zone->loading_pages));
+
+	/* The per-page vios hang off the infos array, so release them before the array itself. */
+	if (page_cache->infos != NULL) {
+		for (i = 0; i < page_cache->page_count; i++)
+			free_vio(vdo_forget(page_cache->infos[i].vio));
+	}
+
+	vdo_int_map_free(vdo_forget(page_cache->page_map));
+	vdo_free(vdo_forget(page_cache->infos));
+	vdo_free(vdo_forget(page_cache->pages));
+}
+
+/*
+ * Tear down a block map: uninitialize every zone, abandon any prepared growth, release the
+ * forest and the action manager, then free the map itself. A NULL @map is ignored.
+ */
+void vdo_free_block_map(struct block_map *map)
+{
+	struct block_map_zone *zone;
+
+	if (map == NULL)
+		return;
+
+	for (zone = map->zones; zone < map->zones + map->zone_count; zone++)
+		uninitialize_block_map_zone(zone);
+
+	vdo_abandon_block_map_growth(map);
+
+	if (map->forest != NULL)
+		deforest(vdo_forget(map->forest), 0);
+
+	vdo_free(vdo_forget(map->action_manager));
+	vdo_free(map);
+}
+
+/*
+ * Allocate a block map with one zone per logical zone, build its forest, initialize every
+ * zone, and create its action manager. On success the new map is stored in @map_ptr. If any
+ * step after the allocation fails, the partially built map is freed and the error is returned.
+ *
+ * The note previously on this function says @journal may be NULL.
+ * NOTE(review): @journal is passed to vdo_get_recovery_journal_thread_id() unconditionally, so
+ * confirm that helper accepts NULL before relying on that.
+ *
+ * Return: VDO_SUCCESS or an error code.
+ */
+int vdo_decode_block_map(struct block_map_state_2_0 state, block_count_t logical_blocks,
+			 struct vdo *vdo, struct recovery_journal *journal,
+			 nonce_t nonce, page_count_t cache_size, block_count_t maximum_age,
+			 struct block_map **map_ptr)
+{
+	struct block_map *map;
+	zone_count_t z;
+	int result;
+
+	BUILD_BUG_ON(VDO_BLOCK_MAP_ENTRIES_PER_PAGE !=
+		     ((VDO_BLOCK_SIZE - sizeof(struct block_map_page)) /
+		      sizeof(struct block_map_entry)));
+
+	result = VDO_ASSERT(cache_size > 0, "block map cache size is specified");
+	if (result != VDO_SUCCESS)
+		return result;
+
+	result = vdo_allocate_extended(struct block_map,
+				       vdo->thread_config.logical_zone_count,
+				       struct block_map_zone, __func__, &map);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	map->vdo = vdo;
+	map->root_origin = state.root_origin;
+	map->root_count = state.root_count;
+	map->entry_count = logical_blocks;
+	map->journal = journal;
+	map->nonce = nonce;
+
+	result = make_forest(map, map->entry_count);
+	if (result != VDO_SUCCESS)
+		goto free_map;
+
+	replace_forest(map);
+
+	/* zone_count is only set here, so an earlier failure frees a map with no zones. */
+	map->zone_count = vdo->thread_config.logical_zone_count;
+	for (z = 0; z < map->zone_count; z++) {
+		result = initialize_block_map_zone(map, z, cache_size, maximum_age);
+		if (result != VDO_SUCCESS)
+			goto free_map;
+	}
+
+	result = vdo_make_action_manager(map->zone_count, get_block_map_zone_thread_id,
+					 vdo_get_recovery_journal_thread_id(journal),
+					 map, schedule_era_advance, vdo,
+					 &map->action_manager);
+	if (result != VDO_SUCCESS)
+		goto free_map;
+
+	*map_ptr = map;
+	return VDO_SUCCESS;
+
+free_map:
+	vdo_free_block_map(map);
+	return result;
+}
+
+/*
+ * Build a block_map_state_2_0 describing @map: its root origin and root count, together with
+ * the fixed flat page origin and a flat page count of zero.
+ */
+struct block_map_state_2_0 vdo_record_block_map(const struct block_map *map)
+{
+	struct block_map_state_2_0 state = {
+		.flat_page_origin = VDO_BLOCK_MAP_FLAT_PAGE_ORIGIN,
+		/* This is the flat page count, which has turned out to always be 0. */
+		.flat_page_count = 0,
+		.root_origin = map->root_origin,
+		.root_count = map->root_count,
+	};
+
+	return state;
+}
+
+/*
+ * Seed the era points and every zone's dirty lists from the journal's current sequence number.
+ * The block map needs to know the journal's sequence number to initialize the eras.
+ */
+void vdo_initialize_block_map_from_journal(struct block_map *map,
+					   struct recovery_journal *journal)
+{
+	sequence_number_t start = vdo_get_recovery_journal_current_sequence_number(journal);
+	zone_count_t zone;
+
+	map->current_era_point = start;
+	map->pending_era_point = start;
+
+	for (zone = 0; zone < map->zone_count; zone++) {
+		struct dirty_lists *lists = map->zones[zone].dirty_lists;
+
+		VDO_ASSERT_LOG_ONLY(lists->next_period == 0, "current period not set");
+		lists->oldest_period = start;
+		lists->next_period = start + 1;
+		lists->offset = start % lists->maximum_age;
+	}
+}
+
+/*
+ * Compute the logical zone for the LBN of a data vio.
+ *
+ * As a side effect this records the leaf page index and the root index in the data vio's tree
+ * lock. The zone is the stored root index modulo the number of zones.
+ */
+zone_count_t vdo_compute_logical_zone(struct data_vio *data_vio)
+{
+	const struct block_map *block_map = vdo_from_data_vio(data_vio)->block_map;
+	struct tree_lock *lock = &data_vio->tree_lock;
+	page_number_t leaf_page = data_vio->logical.lbn / VDO_BLOCK_MAP_ENTRIES_PER_PAGE;
+
+	lock->tree_slots[0].page_index = leaf_page;
+	lock->root_index = leaf_page % block_map->root_count;
+
+	/* Read the root index back from the lock so the stored value is what picks the zone. */
+	return (lock->root_index % block_map->zone_count);
+}
+
+/*
+ * Record @recovery_block_number as the pending era point and ask the action manager to run its
+ * default action. A NULL @map is ignored.
+ */
+void vdo_advance_block_map_era(struct block_map *map,
+			       sequence_number_t recovery_block_number)
+{
+	if (map != NULL) {
+		map->pending_era_point = recovery_block_number;
+		vdo_schedule_default_action(map->action_manager);
+	}
+}
+
+/*
+ * Begin draining a block map zone. Unless the zone is suspending, every remaining dirty-list
+ * period is expired and write_expired_elements() is run before checking for drain completion.
+ *
+ * Implements vdo_admin_initiator_fn.
+ */
+static void initiate_drain(struct admin_state *state)
+{
+	struct block_map_zone *zone = container_of(state, struct block_map_zone, state);
+
+	VDO_ASSERT_LOG_ONLY((zone->active_lookups == 0),
+			    "%s() called with no active lookups", __func__);
+
+	if (!vdo_is_state_suspending(state)) {
+		struct dirty_lists *lists = zone->dirty_lists;
+
+		while (lists->oldest_period < lists->next_period)
+			expire_oldest_list(lists);
+
+		write_expired_elements(zone);
+	}
+
+	check_for_drain_complete(zone);
+}
+
+/*
+ * Start draining one zone's admin state using the action manager's current operation, with
+ * initiate_drain() as the initiator.
+ *
+ * Implements vdo_zone_action_fn; @context is the block map.
+ */
+static void drain_zone(void *context, zone_count_t zone_number,
+		       struct vdo_completion *parent)
+{
+	struct block_map *block_map = context;
+	struct admin_state *zone_state = &block_map->zones[zone_number].state;
+
+	vdo_start_draining(zone_state,
+			   vdo_get_current_manager_operation(block_map->action_manager),
+			   parent, initiate_drain);
+}
+
+/*
+ * Schedule @operation on the block map's action manager, with drain_zone() as the per-zone
+ * action and @parent as the parent completion.
+ */
+void vdo_drain_block_map(struct block_map *map, const struct admin_state_code *operation,
+			 struct vdo_completion *parent)
+{
+	struct action_manager *manager = map->action_manager;
+
+	vdo_schedule_operation(manager, operation, NULL, drain_zone, NULL, parent);
+}
+
+/*
+ * Resume one zone's admin state if it is quiescent, and hand the outcome to @parent.
+ *
+ * Implements vdo_zone_action_fn; @context is the block map.
+ */
+static void resume_block_map_zone(void *context, zone_count_t zone_number,
+				  struct vdo_completion *parent)
+{
+	struct block_map *block_map = context;
+	int result = vdo_resume_if_quiescent(&block_map->zones[zone_number].state);
+
+	vdo_fail_completion(parent, result);
+}
+
+/*
+ * Schedule a VDO_ADMIN_STATE_RESUMING operation on the block map's action manager, with
+ * resume_block_map_zone() as the per-zone action and @parent as the parent completion.
+ */
+void vdo_resume_block_map(struct block_map *map, struct vdo_completion *parent)
+{
+	struct action_manager *manager = map->action_manager;
+
+	vdo_schedule_operation(manager, VDO_ADMIN_STATE_RESUMING, NULL,
+			       resume_block_map_zone, NULL, parent);
+}
+
+/*
+ * Allocate an expanded collection of trees, for a future growth.
+ *
+ * Nothing is done if growth to @new_logical_blocks is already prepared. Any other prepared
+ * growth is abandoned first. A request smaller than the current entry count only records the
+ * current entry count as the next entry count.
+ *
+ * Return: VDO_SUCCESS or the error from make_forest().
+ */
+int vdo_prepare_to_grow_block_map(struct block_map *map,
+				  block_count_t new_logical_blocks)
+{
+	bool already_prepared = (map->next_entry_count == new_logical_blocks);
+
+	if (already_prepared)
+		return VDO_SUCCESS;
+
+	if (map->next_entry_count > 0)
+		vdo_abandon_block_map_growth(map);
+
+	if (!(new_logical_blocks < map->entry_count))
+		return make_forest(map, new_logical_blocks);
+
+	map->next_entry_count = map->entry_count;
+	return VDO_SUCCESS;
+}
+
+/*
+ * Call replace_forest() on the block map, then finish @completion.
+ *
+ * Implements vdo_action_preamble_fn; @context is the block map.
+ */
+static void grow_forest(void *context, struct vdo_completion *completion)
+{
+	struct block_map *block_map = context;
+
+	replace_forest(block_map);
+	vdo_finish_completion(completion);
+}
+
+/*
+ * Schedule a VDO_ADMIN_STATE_SUSPENDED_OPERATION whose preamble is grow_forest().
+ *
+ * Requires vdo_prepare_to_grow_block_map() to have been previously called.
+ */
+void vdo_grow_block_map(struct block_map *map, struct vdo_completion *parent)
+{
+	struct action_manager *manager = map->action_manager;
+
+	vdo_schedule_operation(manager, VDO_ADMIN_STATE_SUSPENDED_OPERATION,
+			       grow_forest, NULL, NULL, parent);
+}
+
+/*
+ * Discard any prepared growth: detach the next forest from the map, deforest it starting at its
+ * last segment if there was one, and reset the next entry count to zero.
+ */
+void vdo_abandon_block_map_growth(struct block_map *map)
+{
+	struct forest *abandoned = vdo_forget(map->next_forest);
+
+	if (abandoned != NULL)
+		deforest(abandoned, abandoned->segments - 1);
+
+	map->next_entry_count = 0;
+}
+
+/*
+ * Release the page completion and then continue the requester with @result.
+ *
+ * The requester is read from the completion before the completion is released.
+ */
+static inline void finish_processing_page(struct vdo_completion *completion, int result)
+{
+	struct vdo_completion *requester = completion->parent;
+
+	vdo_release_page_completion(completion);
+	vdo_continue_completion(requester, result);
+}
+
+static void handle_page_error(struct vdo_completion *completion)
+{
+ finish_processing_page(completion, completion->result);
+}
+
+/*
+ * Fetch the mapping page for a block map update, and call the provided handler when fetched.
+ *
+ * If the data vio's block map zone is draining, the data vio is continued with
+ * VDO_SHUTTING_DOWN instead and no page is requested.
+ */
+static void fetch_mapping_page(struct data_vio *data_vio, bool modifiable,
+			       vdo_action_fn action)
+{
+	struct block_map_zone *zone = data_vio->logical.zone->block_map_zone;
+	physical_block_number_t page_pbn;
+
+	if (vdo_is_state_draining(&zone->state)) {
+		continue_data_vio_with_error(data_vio, VDO_SHUTTING_DOWN);
+		return;
+	}
+
+	page_pbn = data_vio->tree_lock.tree_slots[0].block_map_slot.pbn;
+	vdo_get_page(&data_vio->page_completion, zone, page_pbn, modifiable,
+		     &data_vio->vio.completion, action, handle_page_error, false);
+}
+
+/**
+ * clear_mapped_location() - Reset a data_vio's mapped location to the unmapped state.
+ *
+ * The whole mapped location is overwritten, so every field other than the state is zeroed.
+ * This indicates the block map entry for the logical block is either unmapped or corrupted.
+ */
+static void clear_mapped_location(struct data_vio *data_vio)
+{
+	struct zoned_pbn unmapped = {
+		.state = VDO_MAPPING_STATE_UNMAPPED,
+	};
+
+	data_vio->mapped = unmapped;
+}
+
+/**
+ * set_mapped_location() - Decode and validate a block map entry, and set the mapped location of
+ *                         a data_vio.
+ *
+ * An invalid entry is always logged. A read then fails with VDO_BAD_MAPPING, while a write
+ * treats the entry as unmapped and succeeds.
+ *
+ * Return: VDO_SUCCESS or VDO_BAD_MAPPING if the map entry is invalid or an error code for any
+ *         other failure
+ */
+static int __must_check set_mapped_location(struct data_vio *data_vio,
+					    const struct block_map_entry *entry)
+{
+	/* Unpack the PBN for logging purposes even if the entry is invalid. */
+	struct data_location mapped = vdo_unpack_block_map_entry(entry);
+
+	if (vdo_is_valid_location(&mapped)) {
+		int result = vdo_get_physical_zone(vdo_from_data_vio(data_vio),
+						   mapped.pbn, &data_vio->mapped.zone);
+
+		switch (result) {
+		case VDO_SUCCESS:
+			data_vio->mapped.pbn = mapped.pbn;
+			data_vio->mapped.state = mapped.state;
+			return VDO_SUCCESS;
+
+		case VDO_OUT_OF_RANGE:
+		case VDO_BAD_MAPPING:
+			/* These come from validating the location; handle them as corruption. */
+			break;
+
+		default:
+			/* Anything else is not known to be a validation error; pass it on. */
+			return result;
+		}
+	}
+
+	/*
+	 * Log the corruption even if we wind up ignoring it for write VIOs, converting all cases
+	 * to VDO_BAD_MAPPING.
+	 */
+	vdo_log_error_strerror(VDO_BAD_MAPPING,
+			       "PBN %llu with state %u read from the block map was invalid",
+			       (unsigned long long) mapped.pbn, mapped.state);
+
+	if (data_vio->write) {
+		/*
+		 * A write VIO only reads this mapping to decref the old block. Treat this as
+		 * an unmapped entry rather than fail the write.
+		 */
+		clear_mapped_location(data_vio);
+		return VDO_SUCCESS;
+	}
+
+	/*
+	 * A read VIO has no option but to report the bad mapping--reading zeros would be hiding
+	 * known data loss.
+	 */
+	return VDO_BAD_MAPPING;
+}
+
+/*
+ * Page-fetch callback for reads of a mapping: validate the fetched page, decode the entry in
+ * the data vio's leaf slot, and finish processing the page with the outcome.
+ *
+ * This callback is registered in vdo_get_mapped_block().
+ */
+static void get_mapping_from_fetched_page(struct vdo_completion *completion)
+{
+	struct vdo_page_completion *vpc = as_vdo_page_completion(completion);
+	struct data_vio *data_vio = as_data_vio(completion->parent);
+	int result = completion->result;
+
+	if (result == VDO_SUCCESS)
+		result = validate_completed_page(vpc, false);
+
+	if (result == VDO_SUCCESS) {
+		const struct block_map_page *page =
+			(const struct block_map_page *) get_page_buffer(vpc->info);
+		const struct block_map_tree_slot *leaf = &data_vio->tree_lock.tree_slots[0];
+
+		result = set_mapped_location(data_vio,
+					     &page->entries[leaf->block_map_slot.slot]);
+	}
+
+	finish_processing_page(completion, result);
+}
+
+/*
+ * Store a new mapping in a block map page and adjust recovery journal locks to match.
+ *
+ * The entry written is the one in the data vio's tree slot at its current tree lock height.
+ * If the page holds no lock (*@recovery_lock is 0) or holds a later one than the data vio's
+ * recovery sequence number, the page takes a reference on the data vio's journal block, drops
+ * its old reference if it had one, and *@recovery_lock is updated. In every case the data
+ * vio's own journal entry lock is released and its recovery sequence number is zeroed.
+ */
+void vdo_update_block_map_page(struct block_map_page *page, struct data_vio *data_vio,
+			       physical_block_number_t pbn,
+			       enum block_mapping_state mapping_state,
+			       sequence_number_t *recovery_lock)
+{
+	struct block_map_zone *zone = data_vio->logical.zone->block_map_zone;
+	struct recovery_journal *journal = zone->block_map->journal;
+	struct tree_lock *lock = &data_vio->tree_lock;
+	struct block_map_tree_slot *tree_slot = &lock->tree_slots[lock->height];
+	sequence_number_t held, incoming;
+
+	/* Encode the new mapping. */
+	page->entries[tree_slot->block_map_slot.slot] =
+		vdo_pack_block_map_entry(pbn, mapping_state);
+
+	/* Adjust references on the recovery journal blocks. */
+	held = *recovery_lock;
+	incoming = data_vio->recovery_sequence_number;
+
+	if ((held == 0) || (held > incoming)) {
+		/* Acquire the new reference before dropping the old one. */
+		vdo_acquire_recovery_journal_block_reference(journal, incoming,
+							     VDO_ZONE_TYPE_LOGICAL,
+							     zone->zone_number);
+
+		if (held > 0) {
+			vdo_release_recovery_journal_block_reference(journal, held,
+								     VDO_ZONE_TYPE_LOGICAL,
+								     zone->zone_number);
+		}
+
+		*recovery_lock = incoming;
+	}
+
+	/*
+	 * FIXME: explain this more
+	 * Release the transferred lock from the data_vio.
+	 */
+	vdo_release_journal_entry_lock(journal, incoming);
+	data_vio->recovery_sequence_number = 0;
+}
+
+/*
+ * Page-fetch callback for mapping updates: validate the fetched page as writable, store the
+ * data vio's new mapping in it, mark the page dirty, and add it to the zone's dirty lists.
+ */
+static void put_mapping_in_fetched_page(struct vdo_completion *completion)
+{
+	struct data_vio *data_vio = as_data_vio(completion->parent);
+	struct vdo_page_completion *vpc;
+	struct page_info *info;
+	struct block_map_page *page;
+	sequence_number_t previous_lock;
+	int result = completion->result;
+
+	if (result != VDO_SUCCESS) {
+		finish_processing_page(completion, result);
+		return;
+	}
+
+	vpc = as_vdo_page_completion(completion);
+	result = validate_completed_page(vpc, true);
+	if (result != VDO_SUCCESS) {
+		finish_processing_page(completion, result);
+		return;
+	}
+
+	info = vpc->info;
+	/* Remember the lock held before the update; the dirty lists need both old and new. */
+	previous_lock = info->recovery_lock;
+	page = (struct block_map_page *) get_page_buffer(info);
+
+	vdo_update_block_map_page(page, data_vio, data_vio->new_mapped.pbn,
+				  data_vio->new_mapped.state, &info->recovery_lock);
+	set_info_state(info, PS_DIRTY);
+	add_to_dirty_lists(info->cache->zone, &info->state_entry,
+			   VDO_CACHE_PAGE, previous_lock, info->recovery_lock);
+
+	finish_processing_page(completion, VDO_SUCCESS);
+}
+
+/*
+ * Read a stored block mapping into a data_vio.
+ *
+ * If the leaf page's PBN in the tree lock is VDO_ZERO_BLOCK, no page is fetched: the mapping
+ * is cleared and the data vio continues immediately.
+ */
+void vdo_get_mapped_block(struct data_vio *data_vio)
+{
+	bool page_allocated =
+		(data_vio->tree_lock.tree_slots[0].block_map_slot.pbn != VDO_ZERO_BLOCK);
+
+	if (page_allocated) {
+		fetch_mapping_page(data_vio, false, get_mapping_from_fetched_page);
+		return;
+	}
+
+	/*
+	 * We know that the block map page for this LBN has not been allocated, so the block
+	 * must be unmapped.
+	 */
+	clear_mapped_location(data_vio);
+	continue_data_vio(data_vio);
+}
+
+/*
+ * Update a stored block mapping to reflect a data_vio's new mapping, by fetching the mapping
+ * page as modifiable with put_mapping_in_fetched_page() as the handler.
+ */
+void vdo_put_mapped_block(struct data_vio *data_vio)
+{
+	const bool modifiable = true;
+
+	fetch_mapping_page(data_vio, modifiable, put_mapping_in_fetched_page);
+}
+
+/*
+ * Sum the page cache statistics of every block map zone into a single set of totals. Each
+ * per-zone counter is sampled with READ_ONCE().
+ */
+struct block_map_statistics vdo_get_block_map_statistics(struct block_map *map)
+{
+	const struct block_map_zone *zone;
+	const struct block_map_zone *end = map->zones + map->zone_count;
+	struct block_map_statistics totals;
+
+	memset(&totals, 0, sizeof(totals));
+
+	for (zone = map->zones; zone < end; zone++) {
+		const struct block_map_statistics *stats = &zone->page_cache.stats;
+
+		totals.dirty_pages += READ_ONCE(stats->dirty_pages);
+		totals.clean_pages += READ_ONCE(stats->clean_pages);
+		totals.free_pages += READ_ONCE(stats->free_pages);
+		totals.failed_pages += READ_ONCE(stats->failed_pages);
+		totals.incoming_pages += READ_ONCE(stats->incoming_pages);
+		totals.outgoing_pages += READ_ONCE(stats->outgoing_pages);
+		totals.cache_pressure += READ_ONCE(stats->cache_pressure);
+		totals.read_count += READ_ONCE(stats->read_count);
+		totals.write_count += READ_ONCE(stats->write_count);
+		totals.failed_reads += READ_ONCE(stats->failed_reads);
+		totals.failed_writes += READ_ONCE(stats->failed_writes);
+		totals.reclaimed += READ_ONCE(stats->reclaimed);
+		totals.read_outgoing += READ_ONCE(stats->read_outgoing);
+		totals.found_in_cache += READ_ONCE(stats->found_in_cache);
+		totals.discard_required += READ_ONCE(stats->discard_required);
+		totals.wait_for_page += READ_ONCE(stats->wait_for_page);
+		totals.fetch_required += READ_ONCE(stats->fetch_required);
+		totals.pages_loaded += READ_ONCE(stats->pages_loaded);
+		totals.pages_saved += READ_ONCE(stats->pages_saved);
+		totals.flush_count += READ_ONCE(stats->flush_count);
+	}
+
+	return totals;
+}
diff --git a/drivers/md/dm-vdo/block-map.h b/drivers/md/dm-vdo/block-map.h
new file mode 100644
index 000000000000..39a13039e4a3
--- /dev/null
+++ b/drivers/md/dm-vdo/block-map.h
@@ -0,0 +1,394 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#ifndef VDO_BLOCK_MAP_H
+#define VDO_BLOCK_MAP_H
+
+#include <linux/list.h>
+
+#include "numeric.h"
+
+#include "admin-state.h"
+#include "completion.h"
+#include "encodings.h"
+#include "int-map.h"
+#include "statistics.h"
+#include "types.h"
+#include "vio.h"
+#include "wait-queue.h"
+
+/*
+ * The block map is responsible for tracking all the logical to physical mappings of a VDO. It
+ * consists of a collection of 60 radix trees gradually allocated as logical addresses are used.
+ * Each tree is assigned to a logical zone such that it is easy to compute which zone must handle
+ * each logical address. Each logical zone also has a dedicated portion of the leaf page cache.
+ *
+ * Each logical zone has a single dedicated queue and thread for performing all updates to the
+ * radix trees assigned to that zone. The concurrency guarantees of this single-threaded model
+ * allow the code to omit more fine-grained locking for the block map structures.
+ *
+ * Load operations must be performed on the admin thread. Normal operations, such as reading and
+ * updating mappings, must be performed on the appropriate logical zone thread. Save operations
+ * must be launched from the same admin thread as the original load operation.
+ */
+
+enum {
+ BLOCK_MAP_VIO_POOL_SIZE = 64, /* pool size passed to make_vio_pool() for each block map zone */
+};
+
+/*
+ * Generation counter for page references.
+ */
+typedef u32 vdo_page_generation;
+
+extern const struct block_map_entry UNMAPPED_BLOCK_MAP_ENTRY;
+
+/* The VDO Page Cache abstraction. Each block map zone embeds one of these. */
+struct vdo_page_cache {
+ /* the VDO which owns this cache */
+ struct vdo *vdo;
+ /* number of pages in cache */
+ page_count_t page_count;
+ /* number of pages to write in the current batch */
+ page_count_t pages_in_batch;
+ /* Whether the VDO is doing a read-only rebuild */
+ bool rebuilding;
+
+ /* array of page information entries */
+ struct page_info *infos;
+ /* raw memory for pages */
+ char *pages;
+ /* cache last found page info */
+ struct page_info *last_found;
+ /* map of page number to info */
+ struct int_map *page_map;
+ /* main LRU list (all infos) */
+ struct list_head lru_list;
+ /* free page list (oldest first) */
+ struct list_head free_list;
+ /* outgoing page list */
+ struct list_head outgoing_list;
+ /* number of read I/O operations pending */
+ page_count_t outstanding_reads;
+ /* number of write I/O operations pending */
+ page_count_t outstanding_writes;
+ /* number of pages covered by the current flush */
+ page_count_t pages_in_flush;
+ /* number of pages waiting to be included in the next flush */
+ page_count_t pages_to_flush;
+ /* number of discards in progress */
+ unsigned int discard_count;
+ /* number of vdo_page_completions (VPCs) waiting for a free page */
+ unsigned int waiter_count;
+ /* queue of waiters who want a free page */
+ struct vdo_wait_queue free_waiters;
+ /*
+ * Statistics are only updated on the logical zone thread, but are accessed from other
+ * threads.
+ */
+ struct block_map_statistics stats;
+ /* counter for pressure reports */
+ u32 pressure_report;
+ /* the block map zone to which this cache belongs */
+ struct block_map_zone *zone;
+};
+
+/*
+ * The state of a page buffer. If the page buffer is free, no particular page is bound to it;
+ * otherwise the page buffer is bound to a particular page whose absolute pbn is in the pbn field.
+ * If the page is resident or dirty, the page data is stable and may be accessed. Otherwise the
+ * page is in flight (incoming or outgoing) and its data should not be accessed.
+ *
+ * @note Update the static data in get_page_state_name() if you change this enumeration.
+ */
+enum vdo_page_buffer_state {
+ /* this page buffer is not being used */
+ PS_FREE,
+ /* this page is being read from store */
+ PS_INCOMING,
+ /* attempt to load this page failed */
+ PS_FAILED,
+ /* this page is valid and un-modified */
+ PS_RESIDENT,
+ /* this page is valid and modified */
+ PS_DIRTY,
+ /* this page is being written and should not be used */
+ PS_OUTGOING,
+ /* not a state */
+ PAGE_STATE_COUNT,
+} __packed;
+
+/*
+ * The write status of a page.
+ */
+enum vdo_page_write_status {
+ WRITE_STATUS_NORMAL,
+ WRITE_STATUS_DISCARD,
+ WRITE_STATUS_DEFERRED,
+} __packed;
+
+/* Per-page-slot information. */
+struct page_info {
+ /* Preallocated page struct vio */
+ struct vio *vio;
+ /* back-link for references */
+ struct vdo_page_cache *cache;
+ /* the pbn of the page */
+ physical_block_number_t pbn;
+ /* page is busy (temporarily locked) */
+ u16 busy;
+ /* the write status of the page */
+ enum vdo_page_write_status write_status;
+ /* page state */
+ enum vdo_page_buffer_state state;
+ /* queue of completions awaiting this item */
+ struct vdo_wait_queue waiting;
+ /* state linked list entry */
+ struct list_head state_entry;
+ /* LRU entry */
+ struct list_head lru_entry;
+ /*
+ * The earliest recovery journal block containing uncommitted updates to the block map page
+ * associated with this page_info. A reference (lock) is held on that block to prevent it
+ * from being reaped. When this value changes, the reference on the old value must be
+ * released and a reference on the new value must be acquired.
+ */
+ sequence_number_t recovery_lock;
+};
+
+/*
+ * A completion awaiting a specific page. Also a live reference into the page once completed, until
+ * freed. One of these is handed to vdo_get_page() to request a page, and the reference is dropped
+ * with vdo_release_page_completion().
+ */
+struct vdo_page_completion {
+ /* The generic completion */
+ struct vdo_completion completion;
+ /* The cache involved */
+ struct vdo_page_cache *cache;
+ /* The waiter for the pending list */
+ struct vdo_waiter waiter;
+ /* The absolute physical block number of the page on disk */
+ physical_block_number_t pbn;
+ /* Whether the page may be modified */
+ bool writable;
+ /* Whether the page is available */
+ bool ready;
+ /* The info structure for the page, only valid when ready */
+ struct page_info *info;
+};
+
+struct forest;
+
+struct tree_page {
+ struct vdo_waiter waiter;
+
+ /* Dirty list entry */
+ struct list_head entry;
+
+ /* If dirty, the tree zone flush generation in which it was last dirtied. */
+ u8 generation;
+
+ /* Whether this page is an interior tree page being written out. */
+ bool writing;
+
+ /* If writing, the tree zone flush generation of the copy being written. */
+ u8 writing_generation;
+
+ /*
+ * Sequence number of the earliest recovery journal block containing uncommitted updates to
+ * this page
+ */
+ sequence_number_t recovery_lock;
+
+ /* The value of recovery_lock when this page last started writing */
+ sequence_number_t writing_recovery_lock;
+
+ char page_buffer[VDO_BLOCK_SIZE];
+};
+
+enum block_map_page_type {
+ VDO_TREE_PAGE, /* selects the tree-page list within a dirty_era_t */
+ VDO_CACHE_PAGE, /* selects the cache-page list within a dirty_era_t */
+};
+
+typedef struct list_head dirty_era_t[2];
+
+struct dirty_lists {
+ /* The number of periods after which an element will be expired */
+ block_count_t maximum_age;
+ /* The oldest period which has unexpired elements */
+ sequence_number_t oldest_period;
+ /* One more than the current period */
+ sequence_number_t next_period;
+ /* The offset in the array of lists of the oldest period */
+ block_count_t offset;
+ /* Expired pages */
+ dirty_era_t expired;
+ /* The lists of dirty pages; zone initialization sets up maximum_age entries */
+ dirty_era_t eras[];
+};
+
+struct block_map_zone {
+ zone_count_t zone_number;
+ thread_id_t thread_id;
+ struct admin_state state;
+ struct block_map *block_map;
+ /* Dirty pages, by era */
+ struct dirty_lists *dirty_lists;
+ struct vdo_page_cache page_cache;
+ data_vio_count_t active_lookups;
+ struct int_map *loading_pages;
+ struct vio_pool *vio_pool;
+ /* The tree page which has issued or will be issuing a flush */
+ struct tree_page *flusher;
+ struct vdo_wait_queue flush_waiters;
+ /* The generation after the most recent flush */
+ u8 generation;
+ u8 oldest_generation;
+ /* The counts of dirty pages in each generation */
+ u32 dirty_page_counts[256];
+};
+
+struct block_map {
+ struct vdo *vdo;
+ struct action_manager *action_manager;
+ /* The absolute PBN of the first root of the tree part of the block map */
+ physical_block_number_t root_origin;
+ block_count_t root_count;
+
+ /* The era point we are currently distributing to the zones */
+ sequence_number_t current_era_point;
+ /* The next era point */
+ sequence_number_t pending_era_point;
+
+ /* The number of entries in the block map */
+ block_count_t entry_count;
+ nonce_t nonce;
+ struct recovery_journal *journal;
+
+ /* The trees for finding block map pages */
+ struct forest *forest;
+ /* The expanded trees awaiting growth */
+ struct forest *next_forest;
+ /* The number of entries after growth */
+ block_count_t next_entry_count;
+
+ zone_count_t zone_count;
+ struct block_map_zone zones[];
+};
+
+/**
+ * typedef vdo_entry_callback_fn - A function to be called for each allocated PBN when traversing
+ * the forest.
+ * @pbn: A PBN of a tree node.
+ * @completion: The parent completion of the traversal.
+ *
+ * Return: VDO_SUCCESS or an error.
+ */
+typedef int (*vdo_entry_callback_fn)(physical_block_number_t pbn,
+ struct vdo_completion *completion);
+
+/*
+ * Convert a generic completion into the vdo_page_completion that contains it, after running
+ * vdo_assert_completion_type() against VDO_PAGE_COMPLETION.
+ */
+static inline struct vdo_page_completion *as_vdo_page_completion(struct vdo_completion *completion)
+{
+	struct vdo_page_completion *page_completion;
+
+	vdo_assert_completion_type(completion, VDO_PAGE_COMPLETION);
+	page_completion = container_of(completion, struct vdo_page_completion, completion);
+	return page_completion;
+}
+
+void vdo_release_page_completion(struct vdo_completion *completion);
+
+void vdo_get_page(struct vdo_page_completion *page_completion,
+ struct block_map_zone *zone, physical_block_number_t pbn,
+ bool writable, void *parent, vdo_action_fn callback,
+ vdo_action_fn error_handler, bool requeue);
+
+void vdo_request_page_write(struct vdo_completion *completion);
+
+int __must_check vdo_get_cached_page(struct vdo_completion *completion,
+ struct block_map_page **page_ptr);
+
+int __must_check vdo_invalidate_page_cache(struct vdo_page_cache *cache);
+
+/* View a tree page's embedded page buffer as a block_map_page. */
+static inline struct block_map_page * __must_check
+vdo_as_block_map_page(struct tree_page *tree_page)
+{
+	char *buffer = tree_page->page_buffer;
+
+	return (struct block_map_page *) buffer;
+}
+
+bool vdo_copy_valid_page(char *buffer, nonce_t nonce,
+ physical_block_number_t pbn,
+ struct block_map_page *page);
+
+void vdo_find_block_map_slot(struct data_vio *data_vio);
+
+physical_block_number_t vdo_find_block_map_page_pbn(struct block_map *map,
+ page_number_t page_number);
+
+void vdo_write_tree_page(struct tree_page *page, struct block_map_zone *zone);
+
+void vdo_traverse_forest(struct block_map *map, vdo_entry_callback_fn callback,
+ struct vdo_completion *completion);
+
+int __must_check vdo_decode_block_map(struct block_map_state_2_0 state,
+ block_count_t logical_blocks, struct vdo *vdo,
+ struct recovery_journal *journal, nonce_t nonce,
+ page_count_t cache_size, block_count_t maximum_age,
+ struct block_map **map_ptr);
+
+void vdo_drain_block_map(struct block_map *map, const struct admin_state_code *operation,
+ struct vdo_completion *parent);
+
+void vdo_resume_block_map(struct block_map *map, struct vdo_completion *parent);
+
+int __must_check vdo_prepare_to_grow_block_map(struct block_map *map,
+ block_count_t new_logical_blocks);
+
+void vdo_grow_block_map(struct block_map *map, struct vdo_completion *parent);
+
+void vdo_abandon_block_map_growth(struct block_map *map);
+
+void vdo_free_block_map(struct block_map *map);
+
+struct block_map_state_2_0 __must_check vdo_record_block_map(const struct block_map *map);
+
+void vdo_initialize_block_map_from_journal(struct block_map *map,
+ struct recovery_journal *journal);
+
+zone_count_t vdo_compute_logical_zone(struct data_vio *data_vio);
+
+void vdo_advance_block_map_era(struct block_map *map,
+ sequence_number_t recovery_block_number);
+
+void vdo_update_block_map_page(struct block_map_page *page, struct data_vio *data_vio,
+ physical_block_number_t pbn,
+ enum block_mapping_state mapping_state,
+ sequence_number_t *recovery_lock);
+
+void vdo_get_mapped_block(struct data_vio *data_vio);
+
+void vdo_put_mapped_block(struct data_vio *data_vio);
+
+struct block_map_statistics __must_check vdo_get_block_map_statistics(struct block_map *map);
+
+/**
+ * vdo_convert_maximum_age() - Convert the maximum age to reflect the new recovery journal format
+ * @age: The configured maximum age
+ *
+ * In the old recovery journal format, each journal block held 311 entries, and every write bio
+ * made two entries. The old maximum age was half the usable journal length. In the new format,
+ * each block holds only 217 entries, but each bio only makes one entry. We convert the configured
+ * age so that the number of writes in a block map era is the same in the old and new formats. This
+ * keeps the bound on the amount of work required to recover the block map from the recovery
+ * journal the same across the format change. It also keeps the amortization of block map page
+ * writes to write bios the same.
+ *
+ * Return: The converted age, rounded up
+ */
+static inline block_count_t vdo_convert_maximum_age(block_count_t age)
+{
+	block_count_t old_format_entries = age * RECOVERY_JOURNAL_1_ENTRIES_PER_BLOCK;
+	block_count_t entries_per_pair = 2 * RECOVERY_JOURNAL_ENTRIES_PER_BLOCK;
+
+	return DIV_ROUND_UP(old_format_entries, entries_per_pair);
+}
+
+#endif /* VDO_BLOCK_MAP_H */
diff --git a/drivers/md/dm-vdo/completion.c b/drivers/md/dm-vdo/completion.c
new file mode 100644
index 000000000000..5ad85334632d
--- /dev/null
+++ b/drivers/md/dm-vdo/completion.c
@@ -0,0 +1,140 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#include "completion.h"
+
+#include <linux/kernel.h>
+
+#include "logger.h"
+#include "permassert.h"
+
+#include "status-codes.h"
+#include "types.h"
+#include "vio.h"
+#include "vdo.h"
+
+/**
+ * DOC: vdo completions.
+ *
+ * Most of vdo's data structures are lock free, each either belonging to a single "zone," or
+ * divided into a number of zones whose accesses to the structure do not overlap. During normal
+ * operation, at most one thread will be operating in any given zone. Each zone has a
+ * vdo_work_queue which holds vdo_completions that are to be run in that zone. A completion may
+ * only be enqueued on one queue or operating in a single zone at a time.
+ *
+ * At each step of a multi-threaded operation, the completion performing the operation is given a
+ * callback, error handler, and thread id for the next step. A completion is "run" when it is
+ * operating on the correct thread (as specified by its callback_thread_id). If the value of its
+ * "result" field is an error (i.e. not VDO_SUCCESS), the function in its "error_handler" will be
+ * invoked. If the error_handler is NULL, or there is no error, the function set as its "callback"
+ * will be invoked. Generally, a completion will not be run directly, but rather will be
+ * "launched." In this case, it will check whether it is operating on the correct thread. If it is,
+ * it will run immediately. Otherwise, it will be enqueued on the vdo_work_queue associated with the
+ * completion's "callback_thread_id". When it is dequeued, it will be on the correct thread, and
+ * will get run. In some cases, the completion should get queued instead of running immediately,
+ * even if it is being launched from the correct thread. This is usually in cases where there is a
+ * long chain of callbacks, all on the same thread, which could overflow the stack. In such cases,
+ * the completion's "requeue" field should be set to true. Doing so will skip the current thread
+ * check and simply enqueue the completion.
+ *
+ * A completion may be "finished," in which case its "complete" field will be set to true before it
+ * is next run. It is a bug to attempt to set the result or re-finish a finished completion.
+ * Because a completion's fields are not safe to examine from any thread other than the one on
+ * which the completion is currently operating, this field is used only to aid in detecting
+ * programming errors. It can not be used for cross-thread checking on the status of an operation.
+ * A completion must be "reset" before it can be reused after it has been finished. Resetting will
+ * also clear any error from the result field.
+ **/
+
+/**
+ * vdo_initialize_completion() - Initialize a completion to a clean state.
+ * @completion: The completion to initialize.
+ * @vdo: The vdo to record in the completion.
+ * @type: The type to record in the completion.
+ *
+ * Every field of the completion is zeroed before the vdo and type are set, so any callback, error
+ * handler, or parent previously stored in it is lost. The completion is then reset, leaving its
+ * result as VDO_SUCCESS and its complete flag false.
+ */
+void vdo_initialize_completion(struct vdo_completion *completion,
+ struct vdo *vdo,
+ enum vdo_completion_type type)
+{
+ memset(completion, 0, sizeof(*completion));
+ completion->vdo = vdo;
+ completion->type = type;
+ vdo_reset_completion(completion);
+}
+
+/* Check that a completion has not already been finished (its "complete" flag is still false). */
+static inline void assert_incomplete(struct vdo_completion *completion)
+{
+ VDO_ASSERT_LOG_ONLY(!completion->complete, "completion is not complete");
+}
+
+/**
+ * vdo_set_completion_result() - Record a result in a completion.
+ * @completion: The completion to update; it is expected not to have been finished yet.
+ * @result: The result to record.
+ *
+ * The first error recorded wins: if the completion already holds a result other than
+ * VDO_SUCCESS, the new result is discarded.
+ */
+void vdo_set_completion_result(struct vdo_completion *completion, int result)
+{
+	assert_incomplete(completion);
+
+	/* Never overwrite an earlier error. */
+	if (completion->result != VDO_SUCCESS)
+		return;
+
+	completion->result = result;
+}
+
+/**
+ * vdo_launch_completion_with_priority() - Run a completion now or hand it to its callback thread.
+ * @completion: The completion to launch.
+ * @priority: The priority to use if the completion has to be enqueued.
+ *
+ * The completion runs immediately only when it is not marked for requeue and the caller is
+ * already on the thread named by the completion's callback_thread_id field. In every other case
+ * it is enqueued for that thread.
+ */
+void vdo_launch_completion_with_priority(struct vdo_completion *completion,
+					 enum vdo_completion_priority priority)
+{
+	if (!completion->requeue &&
+	    (completion->callback_thread_id == vdo_get_callback_thread_id())) {
+		vdo_run_completion(completion);
+		return;
+	}
+
+	vdo_enqueue_completion(completion, priority);
+}
+
+/**
+ * vdo_finish_completion() - Mark a completion as complete and then launch it.
+ * @completion: The completion to finish; it is expected not to have been finished already.
+ *
+ * A completion with no callback is only marked complete; nothing is launched for it.
+ */
+void vdo_finish_completion(struct vdo_completion *completion)
+{
+	assert_incomplete(completion);
+	completion->complete = true;
+
+	if (completion->callback == NULL)
+		return;
+
+	vdo_launch_completion(completion);
+}
+
+/**
+ * vdo_enqueue_completion() - Put a completion on the work queue of its callback thread.
+ * @completion: The completion to enqueue.
+ * @priority: The priority at which to enqueue the completion.
+ *
+ * The completion's requeue flag is cleared and its my_queue field is set to NULL before it is
+ * handed to the queue. A callback_thread_id outside the vdo's thread configuration is fatal
+ * (BUG()).
+ */
+void vdo_enqueue_completion(struct vdo_completion *completion,
+			    enum vdo_completion_priority priority)
+{
+	struct vdo *vdo = completion->vdo;
+	thread_id_t thread_id = completion->callback_thread_id;
+	int result;
+
+	result = VDO_ASSERT(thread_id < vdo->thread_config.thread_count,
+			    "thread_id %u (completion type %d) is less than thread count %u",
+			    thread_id, completion->type,
+			    vdo->thread_config.thread_count);
+	if (result != VDO_SUCCESS)
+		BUG();
+
+	completion->my_queue = NULL;
+	completion->priority = priority;
+	completion->requeue = false;
+	vdo_enqueue_work_queue(vdo->threads[thread_id].queue, completion);
+}
+
+/**
+ * vdo_requeue_completion_if_needed() - Move a completion to another thread unless already there.
+ * @completion: The completion to requeue.
+ * @callback_thread_id: The thread on which the completion should be operating.
+ *
+ * When the caller is not on the requested thread, the completion's callback_thread_id is updated
+ * and the completion is enqueued at the default priority.
+ *
+ * Return: True if the completion was requeued; callers may not access the completion in this case.
+ */
+bool vdo_requeue_completion_if_needed(struct vdo_completion *completion,
+				      thread_id_t callback_thread_id)
+{
+	bool must_requeue = (vdo_get_callback_thread_id() != callback_thread_id);
+
+	if (must_requeue) {
+		completion->callback_thread_id = callback_thread_id;
+		vdo_enqueue_completion(completion, VDO_WORK_Q_DEFAULT_PRIORITY);
+	}
+
+	return must_requeue;
+}
diff --git a/drivers/md/dm-vdo/completion.h b/drivers/md/dm-vdo/completion.h
new file mode 100644
index 000000000000..3407f34ce58c
--- /dev/null
+++ b/drivers/md/dm-vdo/completion.h
@@ -0,0 +1,152 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#ifndef VDO_COMPLETION_H
+#define VDO_COMPLETION_H
+
+#include "permassert.h"
+
+#include "status-codes.h"
+#include "types.h"
+
+/**
+ * vdo_run_completion() - Invoke a completion's error handler or callback on the current thread.
+ * @completion: The completion to run.
+ *
+ * The error handler is chosen only when the completion's result is not VDO_SUCCESS and an error
+ * handler has been set; otherwise the callback is invoked.
+ *
+ * Context: This function must be called from the correct callback thread.
+ */
+static inline void vdo_run_completion(struct vdo_completion *completion)
+{
+	bool has_error = (completion->result != VDO_SUCCESS);
+
+	if (has_error && (completion->error_handler != NULL))
+		completion->error_handler(completion);
+	else
+		completion->callback(completion);
+}
+
+void vdo_set_completion_result(struct vdo_completion *completion, int result);
+
+void vdo_initialize_completion(struct vdo_completion *completion, struct vdo *vdo,
+ enum vdo_completion_type type);
+
+/**
+ * vdo_reset_completion() - Reset a completion to a clean state, while keeping the type, vdo and
+ * parent information.
+ * @completion: The completion to reset.
+ *
+ * Only the result (set to VDO_SUCCESS) and the complete flag (set to false) are modified.
+ */
+static inline void vdo_reset_completion(struct vdo_completion *completion)
+{
+ completion->result = VDO_SUCCESS;
+ completion->complete = false;
+}
+
+void vdo_launch_completion_with_priority(struct vdo_completion *completion,
+ enum vdo_completion_priority priority);
+
+/**
+ * vdo_launch_completion() - Launch a completion with default priority.
+ * @completion: The completion to launch.
+ *
+ * Equivalent to vdo_launch_completion_with_priority() with VDO_WORK_Q_DEFAULT_PRIORITY.
+ */
+static inline void vdo_launch_completion(struct vdo_completion *completion)
+{
+ vdo_launch_completion_with_priority(completion, VDO_WORK_Q_DEFAULT_PRIORITY);
+}
+
+/**
+ * vdo_continue_completion() - Continue processing a completion.
+ * @completion: The completion to continue.
+ * @result: The current result (will not mask older errors).
+ *
+ * Continue processing a completion by setting the current result and calling
+ * vdo_launch_completion(). Unlike vdo_finish_completion(), this does not mark the completion as
+ * complete.
+ */
+static inline void vdo_continue_completion(struct vdo_completion *completion, int result)
+{
+ vdo_set_completion_result(completion, result);
+ vdo_launch_completion(completion);
+}
+
+void vdo_finish_completion(struct vdo_completion *completion);
+
+/**
+ * vdo_fail_completion() - Set the result of a completion if it does not already have an error,
+ * then finish it.
+ * @completion: The completion to fail.
+ * @result: The result to record, unless an earlier error is already stored.
+ */
+static inline void vdo_fail_completion(struct vdo_completion *completion, int result)
+{
+ vdo_set_completion_result(completion, result);
+ vdo_finish_completion(completion);
+}
+
+/**
+ * vdo_assert_completion_type() - Assert that a completion is of the correct type.
+ * @completion: The completion to check.
+ * @expected: The type the completion should have.
+ *
+ * Return: VDO_SUCCESS or an error
+ */
+static inline int vdo_assert_completion_type(struct vdo_completion *completion,
+ enum vdo_completion_type expected)
+{
+ return VDO_ASSERT(expected == completion->type,
+ "completion type should be %u, not %u", expected,
+ completion->type);
+}
+
+/**
+ * vdo_set_completion_callback() - Set the callback for a completion and the thread to run it on.
+ * @completion: The completion to update.
+ * @callback: The function to store as the completion's callback.
+ * @callback_thread_id: The id of the thread on which the callback should run.
+ *
+ * The completion is not launched, and its error handler is left untouched.
+ */
+static inline void vdo_set_completion_callback(struct vdo_completion *completion,
+ vdo_action_fn callback,
+ thread_id_t callback_thread_id)
+{
+ completion->callback = callback;
+ completion->callback_thread_id = callback_thread_id;
+}
+
+/**
+ * vdo_launch_completion_callback() - Set the callback for a completion and launch it immediately.
+ * @completion: The completion to launch.
+ * @callback: The function to store as the completion's callback.
+ * @callback_thread_id: The id of the thread on which the callback should run.
+ */
+static inline void vdo_launch_completion_callback(struct vdo_completion *completion,
+ vdo_action_fn callback,
+ thread_id_t callback_thread_id)
+{
+ vdo_set_completion_callback(completion, callback, callback_thread_id);
+ vdo_launch_completion(completion);
+}
+
+/**
+ * vdo_prepare_completion() - Prepare a completion for launch.
+ * @completion: The completion to prepare.
+ * @callback: The function to store as the completion's callback.
+ * @error_handler: The function to store as the completion's error handler.
+ * @callback_thread_id: The id of the thread on which the callback should run.
+ * @parent: The value to store in the completion's parent field.
+ *
+ * Resets the completion, and then sets its callback, error handler, callback thread, and parent.
+ * The completion is not launched.
+ */
+static inline void vdo_prepare_completion(struct vdo_completion *completion,
+ vdo_action_fn callback,
+ vdo_action_fn error_handler,
+ thread_id_t callback_thread_id, void *parent)
+{
+ vdo_reset_completion(completion);
+ vdo_set_completion_callback(completion, callback, callback_thread_id);
+ completion->error_handler = error_handler;
+ completion->parent = parent;
+}
+
+/**
+ * vdo_prepare_completion_for_requeue() - Prepare a completion for launch ensuring that it will
+ * always be requeued.
+ * @completion: The completion to prepare.
+ * @callback: The function to store as the completion's callback.
+ * @error_handler: The function to store as the completion's error handler.
+ * @callback_thread_id: The id of the thread on which the callback should run.
+ * @parent: The value to store in the completion's parent field.
+ *
+ * Resets the completion, and then sets its callback, error handler, callback thread, and parent.
+ * Finally sets the completion's requeue flag.
+ */
+static inline void vdo_prepare_completion_for_requeue(struct vdo_completion *completion,
+ vdo_action_fn callback,
+ vdo_action_fn error_handler,
+ thread_id_t callback_thread_id,
+ void *parent)
+{
+ vdo_prepare_completion(completion, callback, error_handler,
+ callback_thread_id, parent);
+ completion->requeue = true;
+}
+
+void vdo_enqueue_completion(struct vdo_completion *completion,
+ enum vdo_completion_priority priority);
+
+
+bool vdo_requeue_completion_if_needed(struct vdo_completion *completion,
+ thread_id_t callback_thread_id);
+
+#endif /* VDO_COMPLETION_H */
diff --git a/drivers/md/dm-vdo/constants.h b/drivers/md/dm-vdo/constants.h
new file mode 100644
index 000000000000..a8c4d6e24b38
--- /dev/null
+++ b/drivers/md/dm-vdo/constants.h
@@ -0,0 +1,96 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#ifndef VDO_CONSTANTS_H
+#define VDO_CONSTANTS_H
+
+#include <linux/blkdev.h>
+
+#include "types.h"
+
+/* Compile-time constants for vdo, declared as members of an anonymous enum. */
+enum {
+ /*
+ * The maximum number of contiguous PBNs which will go to a single bio submission queue,
+ * assuming there is more than one queue.
+ */
+ VDO_BIO_ROTATION_INTERVAL_LIMIT = 1024,
+
+ /* The number of entries on a block map page */
+ VDO_BLOCK_MAP_ENTRIES_PER_PAGE = 812,
+
+ /* The origin of the flat portion of the block map */
+ VDO_BLOCK_MAP_FLAT_PAGE_ORIGIN = 1,
+
+ /*
+ * The height of a block map tree. Assuming a root count of 60 and 812 entries per page,
+ * this is big enough to represent almost 95 PB of logical space.
+ */
+ VDO_BLOCK_MAP_TREE_HEIGHT = 5,
+
+ /* The default number of bio submission queues. */
+ DEFAULT_VDO_BIO_SUBMIT_QUEUE_COUNT = 4,
+
+ /* The number of contiguous PBNs to be submitted to a single bio queue. */
+ DEFAULT_VDO_BIO_SUBMIT_QUEUE_ROTATE_INTERVAL = 64,
+
+ /* The number of trees in the arboreal block map */
+ DEFAULT_VDO_BLOCK_MAP_TREE_ROOT_COUNT = 60,
+
+ /* The default size of the recovery journal, in blocks */
+ DEFAULT_VDO_RECOVERY_JOURNAL_SIZE = 32 * 1024,
+
+ /* The default size of each slab journal, in blocks */
+ DEFAULT_VDO_SLAB_JOURNAL_SIZE = 224,
+
+ /* Unit test minimum */
+ MINIMUM_VDO_SLAB_JOURNAL_BLOCKS = 2,
+
+ /*
+ * The initial size of lbn_operations and pbn_operations, which is based upon the expected
+ * maximum number of outstanding VIOs. This value was chosen to make it highly unlikely
+ * that the maps would need to be resized.
+ */
+ VDO_LOCK_MAP_CAPACITY = 10000,
+
+ /* The maximum number of logical zones */
+ MAX_VDO_LOGICAL_ZONES = 60,
+
+ /* The maximum number of physical zones */
+ MAX_VDO_PHYSICAL_ZONES = 16,
+
+ /* The base-2 logarithm of the maximum blocks in one slab */
+ MAX_VDO_SLAB_BITS = 23,
+
+ /* The maximum number of slabs the slab depot supports */
+ MAX_VDO_SLABS = 8192,
+
+ /*
+ * The maximum number of block map pages to load simultaneously during recovery or rebuild.
+ */
+ MAXIMUM_SIMULTANEOUS_VDO_BLOCK_MAP_RESTORATION_READS = 1024,
+
+ /* The maximum number of entries in the slab summary (8192 * 16 = 131072) */
+ MAXIMUM_VDO_SLAB_SUMMARY_ENTRIES = MAX_VDO_SLABS * MAX_VDO_PHYSICAL_ZONES,
+
+ /* The maximum number of total threads in a VDO thread configuration. */
+ MAXIMUM_VDO_THREADS = 100,
+
+ /* The maximum number of VIOs in the system at once */
+ MAXIMUM_VDO_USER_VIOS = 2048,
+
+ /* The only physical block size supported by VDO */
+ VDO_BLOCK_SIZE = 4096,
+
+ /*
+ * The number of sectors per block. NOTE(review): SECTOR_SHIFT is not defined in this file;
+ * presumably it comes from <linux/blkdev.h> -- confirm.
+ */
+ VDO_SECTORS_PER_BLOCK = (VDO_BLOCK_SIZE >> SECTOR_SHIFT),
+
+ /* The size of a sector that will not be torn */
+ VDO_SECTOR_SIZE = 512,
+
+ /* The physical block number reserved for storing the zero block */
+ VDO_ZERO_BLOCK = 0,
+};
+
+#endif /* VDO_CONSTANTS_H */
diff --git a/drivers/md/dm-vdo/cpu.h b/drivers/md/dm-vdo/cpu.h
new file mode 100644
index 000000000000..d6a2615ba657
--- /dev/null
+++ b/drivers/md/dm-vdo/cpu.h
@@ -0,0 +1,59 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#ifndef UDS_CPU_H
+#define UDS_CPU_H
+
+#include <linux/cache.h>
+
+/**
+ * uds_prefetch_address() - Minimize cache-miss latency by attempting to move data into a CPU cache
+ * before it is accessed.
+ *
+ * @address: the address to fetch (may be invalid)
+ * @for_write: must be constant at compile time--false if for reading, true if for writing
+ *
+ * If the compiler cannot see that @for_write is a constant, no prefetch is issued at all.
+ */
+static inline void uds_prefetch_address(const void *address, bool for_write)
+{
+ /*
+ * for_write won't be a constant if we are compiled with optimization turned off, in which
+ * case prefetching really doesn't matter. clang can't figure out that if for_write is a
+ * constant, it can be passed as the second, mandatorily constant argument to prefetch(),
+ * at least currently on llvm 12.
+ */
+ if (__builtin_constant_p(for_write)) {
+ /* Spell out both cases so the builtin always receives a literal second argument. */
+ if (for_write)
+ __builtin_prefetch(address, true);
+ else
+ __builtin_prefetch(address, false);
+ }
+}
+
+/**
+ * uds_prefetch_range() - Minimize cache-miss latency by attempting to move a range of addresses
+ * into a CPU cache before they are accessed.
+ *
+ * @start: the starting address to fetch (may be invalid)
+ * @size: the number of bytes in the address range
+ * @for_write: must be constant at compile time--false if for reading, true if for writing
+ *
+ * One prefetch is issued per L1_CACHE_BYTES step; at least one prefetch is always issued.
+ */
+static inline void uds_prefetch_range(const void *start, unsigned int size,
+				      bool for_write)
+{
+	const char *address = (const char *) start;
+	/* How far into its cache line the range begins. */
+	unsigned int offset = ((uintptr_t) address % L1_CACHE_BYTES);
+	unsigned int remaining;
+
+	/*
+	 * The line count allows for the address range to span an extra cache line boundary due
+	 * to address alignment.
+	 */
+	for (remaining = (1 + ((size + offset) / L1_CACHE_BYTES)); remaining > 0; remaining--) {
+		uds_prefetch_address(address, for_write);
+		address += L1_CACHE_BYTES;
+	}
+}
+
+#endif /* UDS_CPU_H */
diff --git a/drivers/md/dm-vdo/data-vio.c b/drivers/md/dm-vdo/data-vio.c
new file mode 100644
index 000000000000..94f6f1ccfb7d
--- /dev/null
+++ b/drivers/md/dm-vdo/data-vio.c
@@ -0,0 +1,2063 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#include "data-vio.h"
+
+#include <linux/atomic.h>
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/delay.h>
+#include <linux/device-mapper.h>
+#include <linux/jiffies.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/lz4.h>
+#include <linux/minmax.h>
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+#include <linux/wait.h>
+
+#include "logger.h"
+#include "memory-alloc.h"
+#include "murmurhash3.h"
+#include "permassert.h"
+
+#include "block-map.h"
+#include "dump.h"
+#include "encodings.h"
+#include "int-map.h"
+#include "io-submitter.h"
+#include "logical-zone.h"
+#include "packer.h"
+#include "recovery-journal.h"
+#include "slab-depot.h"
+#include "status-codes.h"
+#include "types.h"
+#include "vdo.h"
+#include "vio.h"
+#include "wait-queue.h"
+
+/**
+ * DOC: Bio flags.
+ *
+ * For certain flags set on user bios, if the user bio has not yet been acknowledged, setting those
+ * flags on our own bio(s) for that request may help underlying layers better fulfill the user
+ * bio's needs. This constant contains the aggregate of those flags; VDO strips all the other
+ * flags, as they convey incorrect information.
+ *
+ * These flags are always irrelevant if we have already finished the user bio as they are only
+ * hints on IO importance. If VDO has finished the user bio, any remaining IO done doesn't care how
+ * important finishing the finished bio was.
+ *
+ * Note that bio.c contains the complete list of flags we believe may be set; the following list
+ * explains the action taken with each of those flags VDO could receive:
+ *
+ * * REQ_SYNC: Passed down if the user bio is not yet completed, since it indicates the user bio
+ * completion is required for further work to be done by the issuer.
+ * * REQ_META: Passed down if the user bio is not yet completed, since it may mean the lower layer
+ * treats it as more urgent, similar to REQ_SYNC.
+ * * REQ_PRIO: Passed down if the user bio is not yet completed, since it indicates the user bio is
+ * important.
+ * * REQ_NOMERGE: Set only if the incoming bio was split; irrelevant to VDO IO.
+ * * REQ_IDLE: Set if the incoming bio had more IO quickly following; VDO's IO pattern doesn't
+ * match incoming IO, so this flag is incorrect for it.
+ * * REQ_FUA: Handled separately, and irrelevant to VDO IO otherwise.
+ * * REQ_RAHEAD: Passed down, as, for reads, it indicates trivial importance.
+ * * REQ_BACKGROUND: Not passed down, as VIOs are a limited resource and VDO needs them recycled
+ * ASAP to service heavy load, which is the only place where REQ_BACKGROUND might aid in load
+ * prioritization.
+ */
+static const blk_opf_t PASSTHROUGH_FLAGS = (REQ_PRIO | REQ_META | REQ_SYNC | REQ_RAHEAD);
+
+/**
+ * DOC:
+ *
+ * The data_vio_pool maintains the pool of data_vios which a vdo uses to service incoming bios. For
+ * correctness, and in order to avoid potentially expensive or blocking memory allocations during
+ * normal operation, the number of concurrently active data_vios is capped. Furthermore, in order
+ * to avoid starvation of reads and writes, at most 75% of the data_vios may be used for
+ * discards. The data_vio_pool is responsible for enforcing these limits. Threads submitting bios
+ * for which a data_vio or discard permit are not available will block until the necessary
+ * resources are available. The pool is also responsible for distributing resources to blocked
+ * threads and waking them. Finally, the pool attempts to batch the work of recycling data_vios by
+ * performing the work of actually assigning resources to blocked threads or placing data_vios back
+ * into the pool on a single cpu at a time.
+ *
+ * The pool contains two "limiters", one for tracking data_vios and one for tracking discard
+ * permits. The limiters also provide safe cross-thread access to pool statistics without the need
+ * to take the pool's lock. When a thread submits a bio to a vdo device, it will first attempt to
+ * get a discard permit if it is a discard, and then to get a data_vio. If the necessary resources
+ * are available, the incoming bio will be assigned to the acquired data_vio, and it will be
+ * launched. However, if either of these are unavailable, the arrival time of the bio is recorded
+ * in the bio's bi_private field, the bio and its submitter are both queued on the appropriate
+ * limiter and the submitting thread will then put itself to sleep. (note that this mechanism will
+ * break if jiffies are only 32 bits.)
+ *
+ * Whenever a data_vio has completed processing for the bio it was servicing, release_data_vio()
+ * will be called on it. This function will add the data_vio to a funnel queue, and then check the
+ * state of the pool. If the pool is not currently processing released data_vios, the pool's
+ * completion will be enqueued on a cpu queue. This obviates the need for the releasing threads to
+ * hold the pool's lock, and also batches release work while avoiding starvation of the cpu
+ * threads.
+ *
+ * Whenever the pool's completion is run on a cpu thread, it calls process_release_callback() which
+ * processes a batch of returned data_vios (currently at most 128) from the pool's funnel queue. For
+ * each data_vio, it first checks whether that data_vio was processing a discard. If so, and there
+ * is a blocked bio waiting for a discard permit, that permit is notionally transferred to the
+ * eldest discard waiter, and that waiter is moved to the end of the list of discard bios waiting
+ * for a data_vio. If there are no discard waiters, the discard permit is returned to the pool.
+ * Next, the data_vio is assigned to the oldest blocked bio which either has a discard permit, or
+ * doesn't need one, and is relaunched. If neither exists, the data_vio is returned to the
+ * pool. Finally, if any waiting bios were launched, the threads which blocked trying to submit
+ * them are awakened.
+ */
+
+/*
+ * The size of a batch of released data_vios. NOTE(review): the consumer of this value is outside
+ * this view; presumably it bounds one run of process_release_callback() -- confirm.
+ */
+#define DATA_VIO_RELEASE_BATCH_SIZE 128
+
+/* One less than the number of sectors per block */
+static const unsigned int VDO_SECTORS_PER_BLOCK_MASK = VDO_SECTORS_PER_BLOCK - 1;
+/* The bits of a packed compression status which hold the compression stage */
+static const u32 COMPRESSION_STATUS_MASK = 0xff;
+/* The bit of a packed compression status which holds the may_not_compress flag */
+static const u32 MAY_NOT_COMPRESS_MASK = 0x80000000;
+
+struct limiter;
+/* The type of the function a limiter uses to assign a resource to a waiter (limiter.assigner) */
+typedef void (*assigner_fn)(struct limiter *limiter);
+
+/*
+ * Bookkeeping structure for a single type of resource. A data_vio_pool holds two of these: one
+ * for data_vios and one for discard permits.
+ */
+struct limiter {
+ /* The data_vio_pool to which this limiter belongs */
+ struct data_vio_pool *pool;
+ /* The maximum number of data_vios available */
+ data_vio_count_t limit;
+ /* The number of resources in use */
+ data_vio_count_t busy;
+ /* The maximum number of resources ever simultaneously in use */
+ data_vio_count_t max_busy;
+ /* The number of resources to release */
+ data_vio_count_t release_count;
+ /* The number of waiters to wake */
+ data_vio_count_t wake_count;
+ /* The list of waiting bios which are known to process_release_callback() */
+ struct bio_list waiters;
+ /* The list of waiting bios which are not yet known to process_release_callback() */
+ struct bio_list new_waiters;
+ /* The list of waiters which have their permits */
+ struct bio_list *permitted_waiters;
+ /* The function for assigning a resource to a waiter */
+ assigner_fn assigner;
+ /* The queue of blocked threads */
+ wait_queue_head_t blocked_threads;
+ /* The arrival time of the eldest waiter */
+ u64 arrival;
+};
+
+/*
+ * A data_vio_pool is a collection of preallocated data_vios which may be acquired from any thread,
+ * and are released in batches.
+ */
+struct data_vio_pool {
+ /* Completion for scheduling releases */
+ struct vdo_completion completion;
+ /* The administrative state of the pool */
+ struct admin_state state;
+ /* Lock protecting the pool */
+ spinlock_t lock;
+ /* The main limiter controlling the total data_vios in the pool. */
+ struct limiter limiter;
+ /* The limiter controlling data_vios for discard */
+ struct limiter discard_limiter;
+ /* The list of bios which have discard permits but still need a data_vio */
+ struct bio_list permitted_discards;
+ /* The list of available data_vios */
+ struct list_head available;
+ /* The queue of data_vios waiting to be returned to the pool */
+ struct funnel_queue *queue;
+ /* Whether the pool is processing, or scheduled to process releases */
+ atomic_t processing;
+ /* The data vios in the pool (a flexible array member; it must remain the last field) */
+ struct data_vio data_vios[];
+};
+
+/*
+ * Printable names for the asynchronous operations of a data_vio. NOTE(review): presumably indexed
+ * by the VIO_ASYNC_OP_* value stored in data_vio->last_async_operation, in which case the order
+ * here must match that enumeration -- confirm against its declaration.
+ */
+static const char * const ASYNC_OPERATION_NAMES[] = {
+ "launch",
+ "acknowledge_write",
+ "acquire_hash_lock",
+ "attempt_logical_block_lock",
+ "lock_duplicate_pbn",
+ "check_for_duplication",
+ "cleanup",
+ "compress_data_vio",
+ "find_block_map_slot",
+ "get_mapped_block_for_read",
+ "get_mapped_block_for_write",
+ "hash_data_vio",
+ "journal_remapping",
+ "vdo_attempt_packing",
+ "put_mapped_block",
+ "read_data_vio",
+ "update_dedupe_index",
+ "update_reference_counts",
+ "verify_duplication",
+ "write_data_vio",
+};
+
+/*
+ * The steps taken cleaning up a VIO, in the order they are performed. VIO_CLEANUP_START is an
+ * alias for the first step, VIO_RELEASE_HASH_LOCK.
+ */
+enum data_vio_cleanup_stage {
+ VIO_CLEANUP_START,
+ VIO_RELEASE_HASH_LOCK = VIO_CLEANUP_START,
+ VIO_RELEASE_ALLOCATED,
+ VIO_RELEASE_RECOVERY_LOCKS,
+ VIO_RELEASE_LOGICAL,
+ VIO_CLEANUP_DONE
+};
+
+/**
+ * as_data_vio_pool() - Convert a completion to the data_vio_pool which contains it.
+ * @completion: The completion embedded in a data_vio_pool.
+ *
+ * The result of the type assertion is not examined, so the conversion is performed even if the
+ * completion is not of type VDO_DATA_VIO_POOL_COMPLETION.
+ *
+ * Return: The enclosing data_vio_pool.
+ */
+static inline struct data_vio_pool * __must_check
+as_data_vio_pool(struct vdo_completion *completion)
+{
+ vdo_assert_completion_type(completion, VDO_DATA_VIO_POOL_COMPLETION);
+ return container_of(completion, struct data_vio_pool, completion);
+}
+
+/* Get the value stored in a bio's bi_private field, interpreted as an arrival time. */
+static inline u64 get_arrival_time(struct bio *bio)
+{
+ return (u64) bio->bi_private;
+}
+
+/**
+ * check_for_drain_complete_locked() - Check whether a data_vio_pool has no outstanding data_vios
+ * or waiters while holding the pool's lock.
+ * @pool: The pool to check.
+ *
+ * Return: true if the main limiter has nothing busy and the new_waiters lists of both limiters
+ * are empty.
+ */
+static bool check_for_drain_complete_locked(struct data_vio_pool *pool)
+{
+	if (pool->limiter.busy > 0)
+		return false;
+
+	VDO_ASSERT_LOG_ONLY((pool->discard_limiter.busy == 0),
+			    "no outstanding discard permits");
+
+	if (!bio_list_empty(&pool->limiter.new_waiters))
+		return false;
+
+	return bio_list_empty(&pool->discard_limiter.new_waiters);
+}
+
+/**
+ * initialize_lbn_lock() - Set up a data_vio's logical block lock for a logical block number.
+ * @data_vio: The data_vio whose lock is being initialized.
+ * @lbn: The logical block number the data_vio will operate on.
+ *
+ * The lock starts out unlocked with a freshly initialized waiter queue, and is bound to the
+ * logical zone selected by vdo_compute_logical_zone().
+ */
+static void initialize_lbn_lock(struct data_vio *data_vio, logical_block_number_t lbn)
+{
+ struct vdo *vdo = vdo_from_data_vio(data_vio);
+ zone_count_t zone_number;
+ struct lbn_lock *lock = &data_vio->logical;
+
+ lock->lbn = lbn;
+ lock->locked = false;
+ vdo_waitq_init(&lock->waiters);
+ /*
+ * NOTE(review): the lbn is stored before this call, presumably because
+ * vdo_compute_logical_zone() reads it from the data_vio -- confirm before reordering.
+ */
+ zone_number = vdo_compute_logical_zone(data_vio);
+ lock->zone = &vdo->logical_zones->zones[zone_number];
+}
+
+/**
+ * launch_locked_request() - Start processing a data_vio which now holds its logical block lock.
+ * @data_vio: The data_vio which has acquired the lock.
+ *
+ * A write against a read-only vdo is continued with the error VDO_READ_ONLY; any other request
+ * goes on to find its block map slot.
+ */
+static void launch_locked_request(struct data_vio *data_vio)
+{
+	data_vio->logical.locked = true;
+
+	if (data_vio->write && vdo_is_read_only(vdo_from_data_vio(data_vio))) {
+		continue_data_vio_with_error(data_vio, VDO_READ_ONLY);
+		return;
+	}
+
+	data_vio->last_async_operation = VIO_ASYNC_OP_FIND_BLOCK_MAP_SLOT;
+	vdo_find_block_map_slot(data_vio);
+}
+
+/**
+ * acknowledge_data_vio() - End the user bio attached to a data_vio, if it has one.
+ * @data_vio: The data_vio whose user bio should be acknowledged.
+ *
+ * The bio's status is derived from the result of the data_vio's completion. The data_vio's
+ * user_bio pointer is cleared before the bio is ended, so calling this again for the same
+ * data_vio does nothing.
+ */
+static void acknowledge_data_vio(struct data_vio *data_vio)
+{
+ struct vdo *vdo = vdo_from_data_vio(data_vio);
+ struct bio *bio = data_vio->user_bio;
+ int error = vdo_status_to_errno(data_vio->vio.completion.result);
+
+ /* There is no user bio to acknowledge. */
+ if (bio == NULL)
+ return;
+
+ VDO_ASSERT_LOG_ONLY((data_vio->remaining_discard <=
+ (u32) (VDO_BLOCK_SIZE - data_vio->offset)),
+ "data_vio to acknowledge is not an incomplete discard");
+
+ data_vio->user_bio = NULL;
+ vdo_count_bios(&vdo->stats.bios_acknowledged, bio);
+ if (data_vio->is_partial)
+ vdo_count_bios(&vdo->stats.bios_acknowledged_partial, bio);
+
+ bio->bi_status = errno_to_blk_status(error);
+ bio_endio(bio);
+}
+
+/**
+ * copy_to_bio() - Copy data from a buffer into each segment of a bio, in order.
+ * @bio: The bio to fill.
+ * @data_ptr: The start of the source data.
+ */
+static void copy_to_bio(struct bio *bio, char *data_ptr)
+{
+ struct bio_vec biovec;
+ struct bvec_iter iter;
+
+ bio_for_each_segment(biovec, bio, iter) {
+ memcpy_to_bvec(&biovec, data_ptr);
+ /* Advance the source past the bytes consumed by this segment. */
+ data_ptr += biovec.bv_len;
+ }
+}
+
+/**
+ * get_data_vio_compression_status() - Read and unpack the compression status of a data_vio.
+ * @data_vio: The data_vio to query.
+ *
+ * Return: The status decoded from the atomically stored word: the stage comes from the bits
+ * selected by COMPRESSION_STATUS_MASK, and may_not_compress from the MAY_NOT_COMPRESS_MASK bit.
+ */
+struct data_vio_compression_status get_data_vio_compression_status(struct data_vio *data_vio)
+{
+ u32 packed = atomic_read(&data_vio->compression.status);
+
+ /* pairs with cmpxchg in set_data_vio_compression_status */
+ smp_rmb();
+ return (struct data_vio_compression_status) {
+ .stage = packed & COMPRESSION_STATUS_MASK,
+ .may_not_compress = ((packed & MAY_NOT_COMPRESS_MASK) != 0),
+ };
+}
+
+/**
+ * pack_status() - Encode a data_vio_compression_status as a u32 suitable for atomic storage.
+ * @status: The state to convert.
+ *
+ * Return: The stage, with the MAY_NOT_COMPRESS_MASK bit also set when may_not_compress is true.
+ */
+static u32 __must_check pack_status(struct data_vio_compression_status status)
+{
+	u32 packed = status.stage;
+
+	if (status.may_not_compress)
+		packed |= MAY_NOT_COMPRESS_MASK;
+
+	return packed;
+}
+
+/**
+ * set_data_vio_compression_status() - Set the compression status of a data_vio.
+ * @data_vio: The data_vio whose compression status is to be changed.
+ * @status: The expected current status of the data_vio.
+ * @new_status: The status to set.
+ *
+ * Return: true if the new status was set, false if the data_vio's compression status did not
+ * match the expected status, and so was left unchanged.
+ */
+static bool __must_check
+set_data_vio_compression_status(struct data_vio *data_vio,
+ struct data_vio_compression_status status,
+ struct data_vio_compression_status new_status)
+{
+ u32 actual;
+ u32 expected = pack_status(status);
+ u32 replacement = pack_status(new_status);
+
+ /*
+ * Extra barriers because this was originally developed using a CAS operation that
+ * implicitly had them.
+ */
+ smp_mb__before_atomic();
+ actual = atomic_cmpxchg(&data_vio->compression.status, expected, replacement);
+ /* same as before_atomic */
+ smp_mb__after_atomic();
+ return (expected == actual);
+}
+
+/**
+ * advance_data_vio_compression_stage() - Move a data_vio to its next compression stage.
+ * @data_vio: The data_vio to advance.
+ *
+ * The update is retried until the compare-and-swap in set_data_vio_compression_status() succeeds.
+ * A data_vio already at DATA_VIO_POST_PACKER is left unchanged; one marked may_not_compress goes
+ * directly to DATA_VIO_POST_PACKER.
+ *
+ * Return: The resulting compression status.
+ */
+struct data_vio_compression_status advance_data_vio_compression_stage(struct data_vio *data_vio)
+{
+ for (;;) {
+ struct data_vio_compression_status status =
+ get_data_vio_compression_status(data_vio);
+ struct data_vio_compression_status new_status = status;
+
+ if (status.stage == DATA_VIO_POST_PACKER) {
+ /* We're already in the last stage. */
+ return status;
+ }
+
+ if (status.may_not_compress) {
+ /*
+ * Compression has been dis-allowed for this VIO, so skip the rest of the
+ * path and go to the end.
+ */
+ new_status.stage = DATA_VIO_POST_PACKER;
+ } else {
+ /* Go to the next state. */
+ new_status.stage++;
+ }
+
+ if (set_data_vio_compression_status(data_vio, status, new_status))
+ return new_status;
+
+ /* Another thread changed the status out from under us so try again. */
+ }
+}
+
+/**
+ * cancel_data_vio_compression() - Prevent this data_vio from being compressed or packed.
+ * @data_vio: The data_vio whose compression should be cancelled.
+ *
+ * Sets the may_not_compress flag unless it is already set or the data_vio has already reached
+ * DATA_VIO_POST_PACKER.
+ *
+ * Return: true if the data_vio is in the packer and the caller was the first caller to cancel it.
+ */
+bool cancel_data_vio_compression(struct data_vio *data_vio)
+{
+ struct data_vio_compression_status status, new_status;
+
+ for (;;) {
+ status = get_data_vio_compression_status(data_vio);
+ if (status.may_not_compress || (status.stage == DATA_VIO_POST_PACKER)) {
+ /* This data_vio is already set up to not block in the packer. */
+ break;
+ }
+
+ new_status.stage = status.stage;
+ new_status.may_not_compress = true;
+
+ if (set_data_vio_compression_status(data_vio, status, new_status))
+ break;
+ }
+
+ /* status is the value last read from the data_vio, from before any change made here. */
+ return ((status.stage == DATA_VIO_PACKING) && !status.may_not_compress);
+}
+
+/**
+ * attempt_logical_block_lock() - Attempt to acquire the lock on a logical block.
+ * @completion: The data_vio for an external data request as a completion.
+ *
+ * This is the start of the path for all external requests. It is registered in launch_data_vio().
+ *
+ * A request for a logical block beyond the configured logical size is failed with
+ * VDO_OUT_OF_RANGE. If another data_vio already holds the lock, this data_vio is either serviced
+ * directly from the holder's data (for a read, once the holder has an allocation) or queued as a
+ * waiter on the holder's lock.
+ */
+static void attempt_logical_block_lock(struct vdo_completion *completion)
+{
+ struct data_vio *data_vio = as_data_vio(completion);
+ struct lbn_lock *lock = &data_vio->logical;
+ struct vdo *vdo = vdo_from_data_vio(data_vio);
+ struct data_vio *lock_holder;
+ int result;
+
+ assert_data_vio_in_logical_zone(data_vio);
+
+ if (data_vio->logical.lbn >= vdo->states.vdo.config.logical_blocks) {
+ continue_data_vio_with_error(data_vio, VDO_OUT_OF_RANGE);
+ return;
+ }
+
+ /*
+ * NOTE(review): presumably the "false" argument asks the map not to replace an existing
+ * entry, and lock_holder receives that existing entry (NULL if there was none) -- confirm
+ * against the declaration of vdo_int_map_put().
+ */
+ result = vdo_int_map_put(lock->zone->lbn_operations, lock->lbn,
+ data_vio, false, (void **) &lock_holder);
+ if (result != VDO_SUCCESS) {
+ continue_data_vio_with_error(data_vio, result);
+ return;
+ }
+
+ if (lock_holder == NULL) {
+ /* We got the lock */
+ launch_locked_request(data_vio);
+ return;
+ }
+
+ result = VDO_ASSERT(lock_holder->logical.locked, "logical block lock held");
+ if (result != VDO_SUCCESS) {
+ continue_data_vio_with_error(data_vio, result);
+ return;
+ }
+
+ /*
+ * If the new request is a pure read request (not read-modify-write) and the lock_holder is
+ * writing and has received an allocation, service the read request immediately by copying
+ * data from the lock_holder to avoid having to flush the write out of the packer just to
+ * prevent the read from waiting indefinitely. If the lock_holder does not yet have an
+ * allocation, prevent it from blocking in the packer and wait on it. This is necessary in
+ * order to prevent returning data that may not have actually been written.
+ */
+ if (!data_vio->write && READ_ONCE(lock_holder->allocation_succeeded)) {
+ copy_to_bio(data_vio->user_bio, lock_holder->vio.data + data_vio->offset);
+ acknowledge_data_vio(data_vio);
+ complete_data_vio(completion);
+ return;
+ }
+
+ /* Otherwise, wait for the current holder to release the lock. */
+ data_vio->last_async_operation = VIO_ASYNC_OP_ATTEMPT_LOGICAL_BLOCK_LOCK;
+ vdo_waitq_enqueue_waiter(&lock_holder->logical.waiters, &data_vio->waiter);
+
+ /*
+ * Prevent writes and read-modify-writes from blocking indefinitely on lock holders in the
+ * packer.
+ */
+ if (lock_holder->write && cancel_data_vio_compression(lock_holder)) {
+ data_vio->compression.lock_holder = lock_holder;
+ launch_data_vio_packer_callback(data_vio,
+ vdo_remove_lock_holder_from_packer);
+ }
+}
+
+/**
+ * launch_data_vio() - (Re)initialize a data_vio to have a new logical block number, keeping the
+ * same parent and other state and send it on its way.
+ * @data_vio: The data_vio to launch.
+ * @lbn: The logical block number the data_vio will operate on.
+ *
+ * The tree lock, allocation, record name, and duplicate fields are cleared. The completion is
+ * reset, given handle_data_vio_error() as its error handler, set up via
+ * set_data_vio_logical_callback() to run attempt_logical_block_lock(), and then enqueued.
+ */
+static void launch_data_vio(struct data_vio *data_vio, logical_block_number_t lbn)
+{
+ struct vdo_completion *completion = &data_vio->vio.completion;
+
+ /*
+ * Clearing the tree lock must happen before initializing the LBN lock, which also adds
+ * information to the tree lock.
+ */
+ memset(&data_vio->tree_lock, 0, sizeof(data_vio->tree_lock));
+ initialize_lbn_lock(data_vio, lbn);
+ INIT_LIST_HEAD(&data_vio->hash_lock_entry);
+ INIT_LIST_HEAD(&data_vio->write_entry);
+
+ memset(&data_vio->allocation, 0, sizeof(data_vio->allocation));
+
+ data_vio->is_duplicate = false;
+
+ memset(&data_vio->record_name, 0, sizeof(data_vio->record_name));
+ memset(&data_vio->duplicate, 0, sizeof(data_vio->duplicate));
+ vdo_reset_completion(completion);
+ completion->error_handler = handle_data_vio_error;
+ set_data_vio_logical_callback(data_vio, attempt_logical_block_lock);
+ vdo_enqueue_completion(completion, VDO_DEFAULT_Q_MAP_BIO_PRIORITY);
+}
+
+/**
+ * is_zero_block() - Check whether a block contains nothing but zeros.
+ * @block: The block to examine; VDO_BLOCK_SIZE bytes are read, one u64 at a time.
+ *
+ * Return: true if every u64 word of the block is zero.
+ */
+static bool is_zero_block(char *block)
+{
+	const u64 *word = (const u64 *) block;
+	const u64 *end = (const u64 *) (block + VDO_BLOCK_SIZE);
+
+	while (word < end) {
+		if (*word++ != 0)
+			return false;
+	}
+
+	return true;
+}
+
+/**
+ * copy_from_bio() - Copy the contents of a bio into a contiguous buffer.
+ * @bio: The bio whose segments are to be copied.
+ * @data_ptr: The destination buffer; the segments are written to it back to back.
+ */
+static void copy_from_bio(struct bio *bio, char *data_ptr)
+{
+	struct bvec_iter iter;
+	struct bio_vec segment;
+	char *destination = data_ptr;
+
+	bio_for_each_segment(segment, bio, iter) {
+		memcpy_from_bvec(destination, &segment);
+		destination += segment.bv_len;
+	}
+}
+
+static void launch_bio(struct vdo *vdo, struct data_vio *data_vio, struct bio *bio)
+{
+ logical_block_number_t lbn;
+ /*
+ * Prepare the data_vio to service this bio: reset its per-request state, classify the
+ * request (discard, partial, read, or full-block write), and launch it at the logical block
+ * derived from the bio's starting sector.
+ *
+ * Zero out the fields which don't need to be preserved (i.e. which are not pointers to
+ * separately allocated objects).
+ */
+ memset(data_vio, 0, offsetof(struct data_vio, vio));
+ memset(&data_vio->compression, 0, offsetof(struct compression_state, block));
+
+ data_vio->user_bio = bio;
+ data_vio->offset = to_bytes(bio->bi_iter.bi_sector & VDO_SECTORS_PER_BLOCK_MASK);
+ data_vio->is_partial = (bio->bi_iter.bi_size < VDO_BLOCK_SIZE) || (data_vio->offset != 0);
+
+ /*
+ * Discards behave very differently than other requests when coming in from device-mapper.
+ * We have to be able to handle any size discards and various sector offsets within a
+ * block.
+ *
+ * A discard is always marked as a write; a partial discard is marked as a read as well. Any
+ * other partial request is marked as a read, and additionally as a write if the bio is a
+ * write.
+ */
+ if (bio_op(bio) == REQ_OP_DISCARD) {
+ data_vio->remaining_discard = bio->bi_iter.bi_size;
+ data_vio->write = true;
+ data_vio->is_discard = true;
+ if (data_vio->is_partial) {
+ vdo_count_bios(&vdo->stats.bios_in_partial, bio);
+ data_vio->read = true;
+ }
+ } else if (data_vio->is_partial) {
+ vdo_count_bios(&vdo->stats.bios_in_partial, bio);
+ data_vio->read = true;
+ if (bio_data_dir(bio) == WRITE)
+ data_vio->write = true;
+ } else if (bio_data_dir(bio) == READ) {
+ data_vio->read = true;
+ } else {
+ /*
+ * Copy the bio data to a char array so that we can continue to use the data after
+ * we acknowledge the bio.
+ */
+ copy_from_bio(bio, data_vio->vio.data);
+ data_vio->is_zero = is_zero_block(data_vio->vio.data);
+ data_vio->write = true;
+ }
+
+ if (data_vio->user_bio->bi_opf & REQ_FUA)
+ data_vio->fua = true;
+
+ lbn = (bio->bi_iter.bi_sector - vdo->starting_sector_offset) / VDO_SECTORS_PER_BLOCK;
+ launch_data_vio(data_vio, lbn);
+}
+
+/**
+ * assign_data_vio() - Launch the next permitted waiting bio on a data_vio.
+ * @limiter: The limiter whose permitted_waiters list supplies the bio.
+ * @data_vio: The data_vio which will service the bio.
+ *
+ * Also counts a wakeup for the limiter and sets the limiter's arrival time to that of the bio now
+ * at the head of the permitted list, or to U64_MAX if that list is empty.
+ */
+static void assign_data_vio(struct limiter *limiter, struct data_vio *data_vio)
+{
+	struct bio_list *permitted = limiter->permitted_waiters;
+	struct bio *next_bio = bio_list_pop(permitted);
+
+	launch_bio(limiter->pool->completion.vdo, data_vio, next_bio);
+	limiter->wake_count++;
+
+	next_bio = bio_list_peek(permitted);
+	if (next_bio == NULL)
+		limiter->arrival = U64_MAX;
+	else
+		limiter->arrival = get_arrival_time(next_bio);
+}
+
+/**
+ * assign_discard_permit() - Move the next waiting bio onto the limiter's permitted list.
+ * @limiter: The limiter whose waiter list supplies the bio.
+ *
+ * If the limiter has no arrival time recorded (U64_MAX), the bio's arrival time is recorded.
+ */
+static void assign_discard_permit(struct limiter *limiter)
+{
+	struct bio *permitted = bio_list_pop(&limiter->waiters);
+
+	if (limiter->arrival == U64_MAX)
+		limiter->arrival = get_arrival_time(permitted);
+
+	bio_list_add(limiter->permitted_waiters, permitted);
+}
+
+/**
+ * get_waiters() - Merge a limiter's new_waiters list into its waiters list.
+ * @limiter: The limiter whose lists are to be merged; new_waiters is left empty.
+ */
+static void get_waiters(struct limiter *limiter)
+{
+	struct bio_list *incoming = &limiter->new_waiters;
+
+	bio_list_merge(&limiter->waiters, incoming);
+	bio_list_init(incoming);
+}
+
+/**
+ * get_available_data_vio() - Take the first data_vio off a pool's available list.
+ * @pool: The pool to take from; the list is not checked for emptiness here.
+ *
+ * Return: The data_vio, with its pool_entry reinitialized.
+ */
+static inline struct data_vio *get_available_data_vio(struct data_vio_pool *pool)
+{
+	struct data_vio *taken;
+
+	taken = list_first_entry(&pool->available, struct data_vio, pool_entry);
+	list_del_init(&taken->pool_entry);
+
+	return taken;
+}
+
+/**
+ * assign_data_vio_to_waiter() - Give an available data_vio to the limiter's next permitted bio.
+ * @limiter: The limiter with a waiting bio; the data_vio comes from the limiter's pool.
+ */
+static void assign_data_vio_to_waiter(struct limiter *limiter)
+{
+	struct data_vio *data_vio = get_available_data_vio(limiter->pool);
+
+	assign_data_vio(limiter, data_vio);
+}
+
+/**
+ * update_limiter() - Hand a limiter's released and unused permits to waiting bios.
+ * @limiter: The limiter to update.
+ *
+ * New waiters are first merged into the waiter list. Released permits are then reassigned to
+ * waiters one at a time. If released permits remain once the waiters are exhausted, the busy
+ * count is reduced by the remainder and nothing else is done. Otherwise, any permits which were
+ * free on entry are assigned to the remaining waiters and the busy and max_busy counts are
+ * brought up to date.
+ */
+static void update_limiter(struct limiter *limiter)
+{
+	data_vio_count_t available = limiter->limit - limiter->busy;
+	struct bio_list *waiters = &limiter->waiters;
+
+	VDO_ASSERT_LOG_ONLY((limiter->release_count <= limiter->busy),
+			    "Release count %u is not more than busy count %u",
+			    limiter->release_count, limiter->busy);
+
+	get_waiters(limiter);
+	while ((limiter->release_count > 0) && !bio_list_empty(waiters)) {
+		limiter->assigner(limiter);
+		limiter->release_count--;
+	}
+
+	if (limiter->release_count > 0) {
+		/* No bio is left waiting, so the leftover permits simply become free. */
+		WRITE_ONCE(limiter->busy, limiter->busy - limiter->release_count);
+		limiter->release_count = 0;
+		return;
+	}
+
+	while ((available > 0) && !bio_list_empty(waiters)) {
+		limiter->assigner(limiter);
+		available--;
+	}
+
+	WRITE_ONCE(limiter->busy, limiter->limit - available);
+	if (limiter->busy > limiter->max_busy)
+		WRITE_ONCE(limiter->max_busy, limiter->busy);
+}
+
+/**
+ * schedule_releases() - Ensure that release processing is scheduled.
+ * @pool: The pool whose release processing is to be scheduled.
+ *
+ * If this call switches the state to processing, enqueue. Otherwise, some other thread has already
+ * done so.
+ *
+ * When enqueuing, the pool's completion is marked for requeue and launched with
+ * CPU_Q_COMPLETE_VIO_PRIORITY.
+ */
+static void schedule_releases(struct data_vio_pool *pool)
+{
+ /* Pairs with the barrier in process_release_callback(). */
+ smp_mb__before_atomic();
+ if (atomic_cmpxchg(&pool->processing, false, true))
+ return;
+
+ pool->completion.requeue = true;
+ vdo_launch_completion_with_priority(&pool->completion,
+ CPU_Q_COMPLETE_VIO_PRIORITY);
+}
+
+/**
+ * reuse_or_release_resources() - Dispose of the resources held by a finished data_vio.
+ * @pool: The pool the data_vio belongs to.
+ * @data_vio: The data_vio being released.
+ * @returned: The list on which to put the data_vio if no bio is waiting for it.
+ *
+ * If the data_vio has a non-zero remaining_discard, its discard permit is either passed to a
+ * waiting discard bio or counted as released. The data_vio itself is then given to the request
+ * limiter if its arrival time is strictly earlier than the discard limiter's, else to the discard
+ * limiter if that has an arrival time recorded. If neither applies, the data_vio is added to the
+ * returned list and counted as a released data_vio permit.
+ */
+static void reuse_or_release_resources(struct data_vio_pool *pool,
+				       struct data_vio *data_vio,
+				       struct list_head *returned)
+{
+	struct limiter *discards = &pool->discard_limiter;
+	struct limiter *requests = &pool->limiter;
+
+	if (data_vio->remaining_discard > 0) {
+		if (!bio_list_empty(&discards->waiters)) {
+			assign_discard_permit(discards);
+		} else {
+			/* Return the data_vio's discard permit. */
+			discards->release_count++;
+		}
+	}
+
+	if (requests->arrival < discards->arrival) {
+		assign_data_vio(requests, data_vio);
+		return;
+	}
+
+	if (discards->arrival < U64_MAX) {
+		assign_data_vio(discards, data_vio);
+		return;
+	}
+
+	list_add(&data_vio->pool_entry, returned);
+	requests->release_count++;
+}
+
+/**
+ * process_release_callback() - Process a batch of data_vio releases.
+ * @completion: The pool with data_vios to release.
+ *
+ * At most DATA_VIO_RELEASE_BATCH_SIZE data_vios are taken from the pool's funnel queue per
+ * invocation. Each one is acknowledged and then either reassigned directly to a waiting bio or
+ * collected on a local list which is spliced onto the pool's available list under the pool lock.
+ * If the queue is not empty afterwards, release processing is rescheduled; otherwise, if the pool
+ * is draining and the drain check passes, draining is finished.
+ */
+static void process_release_callback(struct vdo_completion *completion)
+{
+ struct data_vio_pool *pool = as_data_vio_pool(completion);
+ bool reschedule;
+ bool drained;
+ data_vio_count_t processed;
+ data_vio_count_t to_wake;
+ data_vio_count_t discards_to_wake;
+ LIST_HEAD(returned);
+
+ spin_lock(&pool->lock);
+ get_waiters(&pool->discard_limiter);
+ get_waiters(&pool->limiter);
+ spin_unlock(&pool->lock);
+
+ if (pool->limiter.arrival == U64_MAX) {
+ struct bio *bio = bio_list_peek(&pool->limiter.waiters);
+
+ if (bio != NULL)
+ pool->limiter.arrival = get_arrival_time(bio);
+ }
+
+ for (processed = 0; processed < DATA_VIO_RELEASE_BATCH_SIZE; processed++) {
+ struct data_vio *data_vio;
+ struct funnel_queue_entry *entry = vdo_funnel_queue_poll(pool->queue);
+
+ if (entry == NULL)
+ break;
+
+ data_vio = as_data_vio(container_of(entry, struct vdo_completion,
+ work_queue_entry_link));
+ acknowledge_data_vio(data_vio);
+ reuse_or_release_resources(pool, data_vio, &returned);
+ }
+
+ spin_lock(&pool->lock);
+ /*
+ * There is a race where waiters could be added while we are in the unlocked section above.
+ * Those waiters could not see the resources we are now about to release, so we assign
+ * those resources now as we have no guarantee of being rescheduled. This is handled in
+ * update_limiter().
+ *
+ * The returned data_vios are spliced onto the available list before the data_vio limiter
+ * is updated, and the wake counts are captured and reset while the lock is still held.
+ */
+ update_limiter(&pool->discard_limiter);
+ list_splice(&returned, &pool->available);
+ update_limiter(&pool->limiter);
+ to_wake = pool->limiter.wake_count;
+ pool->limiter.wake_count = 0;
+ discards_to_wake = pool->discard_limiter.wake_count;
+ pool->discard_limiter.wake_count = 0;
+
+ atomic_set(&pool->processing, false);
+ /* Pairs with the barrier in schedule_releases(). */
+ smp_mb();
+
+ reschedule = !vdo_is_funnel_queue_empty(pool->queue);
+ drained = (!reschedule &&
+ vdo_is_state_draining(&pool->state) &&
+ check_for_drain_complete_locked(pool));
+ spin_unlock(&pool->lock);
+
+ if (to_wake > 0)
+ wake_up_nr(&pool->limiter.blocked_threads, to_wake);
+
+ if (discards_to_wake > 0)
+ wake_up_nr(&pool->discard_limiter.blocked_threads, discards_to_wake);
+
+ if (reschedule)
+ schedule_releases(pool);
+ else if (drained)
+ vdo_finish_draining(&pool->state);
+}
+
+/**
+ * initialize_limiter() - Set up a limiter for a data_vio pool.
+ * @limiter: The limiter to initialize.
+ * @pool: The pool the limiter belongs to.
+ * @assigner: The function the limiter calls to hand a permit to a waiter.
+ * @limit: The maximum number of permits the limiter may have in use.
+ *
+ * The arrival time starts out as U64_MAX.
+ */
+static void initialize_limiter(struct limiter *limiter, struct data_vio_pool *pool,
+			       assigner_fn assigner, data_vio_count_t limit)
+{
+	init_waitqueue_head(&limiter->blocked_threads);
+	limiter->arrival = U64_MAX;
+	limiter->limit = limit;
+	limiter->assigner = assigner;
+	limiter->pool = pool;
+}
+
+/**
+ * initialize_data_vio() - Allocate the components of a data_vio.
+ * @data_vio: The data_vio to initialize.
+ * @vdo: The vdo to which the data_vio will belong.
+ *
+ * Allocates the data, compressed block, and scratch buffers (VDO_BLOCK_SIZE each) and a bio,
+ * then initializes the decrement completion and the embedded vio. Nothing is freed here when a
+ * step fails; the caller is responsible for cleaning up the data_vio on error.
+ *
+ * Return: VDO_SUCCESS or an error.
+ */
+static int initialize_data_vio(struct data_vio *data_vio, struct vdo *vdo)
+{
+	struct bio *data_bio;
+	int result;
+
+	BUILD_BUG_ON(VDO_BLOCK_SIZE > PAGE_SIZE);
+
+	result = vdo_allocate_memory(VDO_BLOCK_SIZE, 0, "data_vio data",
+				     &data_vio->vio.data);
+	if (result != VDO_SUCCESS) {
+		return vdo_log_error_strerror(result,
+					      "data_vio data allocation failure");
+	}
+
+	result = vdo_allocate_memory(VDO_BLOCK_SIZE, 0, "compressed block",
+				     &data_vio->compression.block);
+	if (result != VDO_SUCCESS) {
+		return vdo_log_error_strerror(result,
+					      "data_vio compressed block allocation failure");
+	}
+
+	result = vdo_allocate_memory(VDO_BLOCK_SIZE, 0, "vio scratch",
+				     &data_vio->scratch_block);
+	if (result != VDO_SUCCESS) {
+		return vdo_log_error_strerror(result,
+					      "data_vio scratch allocation failure");
+	}
+
+	result = vdo_create_bio(&data_bio);
+	if (result != VDO_SUCCESS) {
+		return vdo_log_error_strerror(result,
+					      "data_vio data bio allocation failure");
+	}
+
+	vdo_initialize_completion(&data_vio->decrement_completion, vdo,
+				  VDO_DECREMENT_COMPLETION);
+	initialize_vio(&data_vio->vio, data_bio, 1, VIO_TYPE_DATA, VIO_PRIORITY_DATA, vdo);
+
+	return VDO_SUCCESS;
+}
+
+/**
+ * destroy_data_vio() - Free the separately allocated components of a data_vio.
+ * @data_vio: The data_vio to tear down; may be NULL, in which case nothing is done.
+ *
+ * The data_vio structure itself is not freed here.
+ */
+static void destroy_data_vio(struct data_vio *data_vio)
+{
+	struct bio *data_bio;
+
+	if (!data_vio)
+		return;
+
+	data_bio = vdo_forget(data_vio->vio.bio);
+	vdo_free_bio(data_bio);
+
+	vdo_free(vdo_forget(data_vio->vio.data));
+	vdo_free(vdo_forget(data_vio->compression.block));
+	vdo_free(vdo_forget(data_vio->scratch_block));
+}
+
+/**
+ * make_data_vio_pool() - Initialize a data_vio pool.
+ * @vdo: The vdo to which the pool will belong.
+ * @pool_size: The number of data_vios in the pool.
+ * @discard_limit: The maximum number of data_vios which may be used for discards.
+ * @pool_ptr: A pointer to hold the newly allocated pool.
+ *
+ * The pool and its data_vios are allocated as a single extended allocation. If creating the
+ * funnel queue or initializing any data_vio fails, the partially built pool is freed and
+ * *pool_ptr is not written.
+ *
+ * Return: VDO_SUCCESS or an error.
+ */
+int make_data_vio_pool(struct vdo *vdo, data_vio_count_t pool_size,
+ data_vio_count_t discard_limit, struct data_vio_pool **pool_ptr)
+{
+ int result;
+ struct data_vio_pool *pool;
+ data_vio_count_t i;
+
+ result = vdo_allocate_extended(struct data_vio_pool, pool_size, struct data_vio,
+ __func__, &pool);
+ if (result != VDO_SUCCESS)
+ return result;
+
+ VDO_ASSERT_LOG_ONLY((discard_limit <= pool_size),
+ "discard limit does not exceed pool size");
+ initialize_limiter(&pool->discard_limiter, pool, assign_discard_permit,
+ discard_limit);
+ pool->discard_limiter.permitted_waiters = &pool->permitted_discards;
+ initialize_limiter(&pool->limiter, pool, assign_data_vio_to_waiter, pool_size);
+ pool->limiter.permitted_waiters = &pool->limiter.waiters;
+ INIT_LIST_HEAD(&pool->available);
+ spin_lock_init(&pool->lock);
+ vdo_set_admin_state_code(&pool->state, VDO_ADMIN_STATE_NORMAL_OPERATION);
+ vdo_initialize_completion(&pool->completion, vdo, VDO_DATA_VIO_POOL_COMPLETION);
+ vdo_prepare_completion(&pool->completion, process_release_callback,
+ process_release_callback, vdo->thread_config.cpu_thread,
+ NULL);
+
+ result = vdo_make_funnel_queue(&pool->queue);
+ if (result != VDO_SUCCESS) {
+ free_data_vio_pool(vdo_forget(pool));
+ return result;
+ }
+
+ for (i = 0; i < pool_size; i++) {
+ struct data_vio *data_vio = &pool->data_vios[i];
+
+ result = initialize_data_vio(data_vio, vdo);
+ if (result != VDO_SUCCESS) {
+ destroy_data_vio(data_vio); /* not yet on the available list */
+ free_data_vio_pool(pool);
+ return result;
+ }
+
+ list_add(&data_vio->pool_entry, &pool->available);
+ }
+
+ *pool_ptr = pool;
+ return VDO_SUCCESS;
+}
+
+/**
+ * free_data_vio_pool() - Free a data_vio_pool and the data_vios in it.
+ * @pool: The pool to free; may be NULL, in which case nothing is done.
+ *
+ * All data_vios must be returned to the pool before calling this function. Only the data_vios on
+ * the pool's available list are destroyed here. The pool must not be marked as processing
+ * releases (this is enforced with BUG_ON()).
+ */
+void free_data_vio_pool(struct data_vio_pool *pool)
+{
+ struct data_vio *data_vio, *tmp;
+
+ if (pool == NULL)
+ return;
+
+ /*
+ * Pairs with the barrier in process_release_callback(). Possibly not needed since it
+ * caters to an enqueue vs. free race.
+ */
+ smp_mb();
+ BUG_ON(atomic_read(&pool->processing));
+
+ spin_lock(&pool->lock);
+ VDO_ASSERT_LOG_ONLY((pool->limiter.busy == 0),
+ "data_vio pool must not have %u busy entries when being freed",
+ pool->limiter.busy);
+ VDO_ASSERT_LOG_ONLY((bio_list_empty(&pool->limiter.waiters) &&
+ bio_list_empty(&pool->limiter.new_waiters)),
+ "data_vio pool must not have threads waiting to read or write when being freed");
+ VDO_ASSERT_LOG_ONLY((bio_list_empty(&pool->discard_limiter.waiters) &&
+ bio_list_empty(&pool->discard_limiter.new_waiters)),
+ "data_vio pool must not have threads waiting to discard when being freed");
+ spin_unlock(&pool->lock);
+
+ list_for_each_entry_safe(data_vio, tmp, &pool->available, pool_entry) {
+ list_del_init(&data_vio->pool_entry);
+ destroy_data_vio(data_vio);
+ }
+
+ vdo_free_funnel_queue(vdo_forget(pool->queue));
+ vdo_free(pool);
+}
+
+/**
+ * acquire_permit() - Try to take a permit from a limiter.
+ * @limiter: The limiter to take the permit from.
+ *
+ * On success the busy count is incremented and max_busy is raised if busy now exceeds it.
+ *
+ * Return: true if a permit was taken, false if the limiter is already at its limit.
+ */
+static bool acquire_permit(struct limiter *limiter)
+{
+	bool granted = (limiter->busy < limiter->limit);
+
+	if (granted) {
+		WRITE_ONCE(limiter->busy, limiter->busy + 1);
+		if (limiter->busy > limiter->max_busy)
+			WRITE_ONCE(limiter->max_busy, limiter->busy);
+	}
+
+	return granted;
+}
+
+/**
+ * wait_permit() - Queue a bio to wait for a permit and sleep until woken.
+ * @limiter: The limiter which has no permit to give right now.
+ * @bio: The bio which needs the permit.
+ *
+ * Must be entered with the pool lock held; the lock is dropped before sleeping and is not
+ * retaken. The bio is added to the limiter's new_waiters list and is not launched here.
+ */
+static void wait_permit(struct limiter *limiter, struct bio *bio)
+	__releases(&limiter->pool->lock)
+{
+	DEFINE_WAIT(blocked);
+
+	bio_list_add(&limiter->new_waiters, bio);
+	prepare_to_wait_exclusive(&limiter->blocked_threads, &blocked,
+				  TASK_UNINTERRUPTIBLE);
+	spin_unlock(&limiter->pool->lock);
+	io_schedule();
+	finish_wait(&limiter->blocked_threads, &blocked);
+}
+
+/**
+ * vdo_launch_bio() - Acquire a data_vio from the pool, assign the bio to it, and launch it.
+ * @pool: The pool from which to acquire the data_vio.
+ * @bio: The bio to launch.
+ *
+ * This will block if data_vios or discard permits are not available. In that case the bio is left
+ * on the relevant limiter's waiter list by wait_permit() and is not launched by this function.
+ *
+ * A discard must first acquire a discard permit and then, like every other bio, a data_vio
+ * permit. The current jiffies value is stored in bio->bi_private before the lock is taken.
+ * NOTE(review): presumably this is what get_arrival_time() reads back; confirm.
+ */
+void vdo_launch_bio(struct data_vio_pool *pool, struct bio *bio)
+{
+ struct data_vio *data_vio;
+
+ VDO_ASSERT_LOG_ONLY(!vdo_is_state_quiescent(&pool->state),
+ "data_vio_pool not quiescent on acquire");
+
+ bio->bi_private = (void *) jiffies;
+ spin_lock(&pool->lock);
+ if ((bio_op(bio) == REQ_OP_DISCARD) &&
+ !acquire_permit(&pool->discard_limiter)) {
+ wait_permit(&pool->discard_limiter, bio); /* drops pool->lock */
+ return;
+ }
+
+ if (!acquire_permit(&pool->limiter)) {
+ wait_permit(&pool->limiter, bio); /* drops pool->lock */
+ return;
+ }
+
+ data_vio = get_available_data_vio(pool);
+ spin_unlock(&pool->lock);
+ launch_bio(pool->completion.vdo, data_vio, bio);
+}
+
+/*
+ * Implements vdo_admin_initiator_fn.
+ *
+ * Checks, under the pool lock, whether the drain is already complete and, if it is, finishes
+ * draining after the lock has been dropped.
+ */
+static void initiate_drain(struct admin_state *state)
+{
+	struct data_vio_pool *pool = container_of(state, struct data_vio_pool, state);
+	bool already_drained;
+
+	spin_lock(&pool->lock);
+	already_drained = check_for_drain_complete_locked(pool);
+	spin_unlock(&pool->lock);
+
+	if (!already_drained)
+		return;
+
+	vdo_finish_draining(state);
+}
+
+/**
+ * assert_on_vdo_cpu_thread() - Log an assertion failure unless running on the vdo's cpu thread.
+ * @vdo: The vdo whose thread configuration names the cpu thread.
+ * @name: The name of the calling function, used in the assertion message.
+ */
+static void assert_on_vdo_cpu_thread(const struct vdo *vdo, const char *name)
+{
+	VDO_ASSERT_LOG_ONLY((vdo_get_callback_thread_id() == vdo->thread_config.cpu_thread),
+			    "%s called on cpu thread",
+			    name);
+}
+
+/**
+ * drain_data_vio_pool() - Wait asynchronously for all data_vios to be returned to the pool.
+ * @pool: The pool to drain.
+ * @completion: The completion to notify when the pool has drained.
+ *
+ * Asserts that it is called on the cpu thread of the completion's vdo, then starts draining the
+ * pool's admin state with the VDO_ADMIN_STATE_SUSPENDING operation.
+ */
+void drain_data_vio_pool(struct data_vio_pool *pool, struct vdo_completion *completion)
+{
+ assert_on_vdo_cpu_thread(completion->vdo, __func__);
+ vdo_start_draining(&pool->state, VDO_ADMIN_STATE_SUSPENDING, completion,
+ initiate_drain);
+}
+
+/**
+ * resume_data_vio_pool() - Resume a data_vio pool.
+ * @pool: The pool to resume.
+ * @completion: The completion to notify when the pool has resumed.
+ *
+ * Asserts that it is called on the cpu thread of the completion's vdo. The completion is
+ * continued with the result of vdo_resume_if_quiescent() on the pool's admin state.
+ */
+void resume_data_vio_pool(struct data_vio_pool *pool, struct vdo_completion *completion)
+{
+ assert_on_vdo_cpu_thread(completion->vdo, __func__);
+ vdo_continue_completion(completion, vdo_resume_if_quiescent(&pool->state));
+}
+
+/**
+ * dump_limiter() - Log a one-line summary of a limiter.
+ * @name: The label to put at the start of the line.
+ * @limiter: The limiter to describe.
+ *
+ * The line reports the busy, limit, and max_busy counts and whether either waiter list is
+ * non-empty.
+ */
+static void dump_limiter(const char *name, struct limiter *limiter)
+{
+	bool nobody_waiting = (bio_list_empty(&limiter->waiters) &&
+			       bio_list_empty(&limiter->new_waiters));
+	const char *waiter_summary = nobody_waiting ? "no waiters" : "has waiters";
+
+	vdo_log_info("%s: %u of %u busy (max %u), %s", name, limiter->busy,
+		     limiter->limit, limiter->max_busy, waiter_summary);
+}
+
+/**
+ * dump_data_vio_pool() - Dump a data_vio pool to the log.
+ * @pool: The pool to dump; may be NULL, in which case nothing is done.
+ * @dump_vios: Whether to dump the details of each busy data_vio as well.
+ *
+ * Both limiters are summarized under the pool lock. When dumping data_vios, any data_vio whose
+ * pool_entry is on a list is skipped, and the lock is dropped for the duration of each pause
+ * between batches.
+ */
+void dump_data_vio_pool(struct data_vio_pool *pool, bool dump_vios)
+{
+ /*
+ * In order that syslog can empty its buffer, sleep after 35 elements for 4ms (till the
+ * second clock tick). These numbers were picked based on experiments with lab machines.
+ */
+ static const int ELEMENTS_PER_BATCH = 35;
+ static const int SLEEP_FOR_SYSLOG = 4000;
+
+ if (pool == NULL)
+ return;
+
+ spin_lock(&pool->lock);
+ dump_limiter("data_vios", &pool->limiter);
+ dump_limiter("discard permits", &pool->discard_limiter);
+ if (dump_vios) {
+ int i;
+ int dumped = 0;
+
+ for (i = 0; i < pool->limiter.limit; i++) {
+ struct data_vio *data_vio = &pool->data_vios[i];
+
+ if (!list_empty(&data_vio->pool_entry))
+ continue;
+
+ dump_data_vio(data_vio);
+ if (++dumped >= ELEMENTS_PER_BATCH) {
+ spin_unlock(&pool->lock);
+ dumped = 0;
+ fsleep(SLEEP_FOR_SYSLOG);
+ spin_lock(&pool->lock);
+ }
+ }
+ }
+
+ spin_unlock(&pool->lock);
+}
+
+/**
+ * get_data_vio_pool_active_discards() - Get the number of discard permits currently in use.
+ * @pool: The pool to query.
+ *
+ * Return: The busy count of the pool's discard limiter.
+ */
+data_vio_count_t get_data_vio_pool_active_discards(struct data_vio_pool *pool)
+{
+	struct limiter *discards = &pool->discard_limiter;
+
+	return READ_ONCE(discards->busy);
+}
+
+/**
+ * get_data_vio_pool_discard_limit() - Get the maximum number of concurrent discards.
+ * @pool: The pool to query.
+ *
+ * Return: The limit of the pool's discard limiter.
+ */
+data_vio_count_t get_data_vio_pool_discard_limit(struct data_vio_pool *pool)
+{
+	struct limiter *discards = &pool->discard_limiter;
+
+	return READ_ONCE(discards->limit);
+}
+
+/**
+ * get_data_vio_pool_maximum_discards() - Get the high-water mark of concurrent discards.
+ * @pool: The pool to query.
+ *
+ * Return: The max_busy count of the pool's discard limiter.
+ */
+data_vio_count_t get_data_vio_pool_maximum_discards(struct data_vio_pool *pool)
+{
+	struct limiter *discards = &pool->discard_limiter;
+
+	return READ_ONCE(discards->max_busy);
+}
+
+/**
+ * set_data_vio_pool_discard_limit() - Change the maximum number of concurrent discards.
+ * @pool: The pool to modify.
+ * @limit: The new discard limit.
+ *
+ * The request limit is checked before the pool lock is taken; the new limit is stored under the
+ * lock.
+ *
+ * Return: VDO_SUCCESS, or -EINVAL if the limit is greater than the pool's request limit.
+ */
+int set_data_vio_pool_discard_limit(struct data_vio_pool *pool, data_vio_count_t limit)
+{
+	/* The discard limit may not be higher than the data_vio limit. */
+	if (limit > get_data_vio_pool_request_limit(pool))
+		return -EINVAL;
+
+	spin_lock(&pool->lock);
+	pool->discard_limiter.limit = limit;
+	spin_unlock(&pool->lock);
+
+	return VDO_SUCCESS;
+}
+
+/**
+ * get_data_vio_pool_active_requests() - Get the number of data_vios currently in use.
+ * @pool: The pool to query.
+ *
+ * Return: The busy count of the pool's data_vio limiter.
+ */
+data_vio_count_t get_data_vio_pool_active_requests(struct data_vio_pool *pool)
+{
+	struct limiter *requests = &pool->limiter;
+
+	return READ_ONCE(requests->busy);
+}
+
+/**
+ * get_data_vio_pool_request_limit() - Get the maximum number of concurrent requests.
+ * @pool: The pool to query.
+ *
+ * Return: The limit of the pool's data_vio limiter.
+ */
+data_vio_count_t get_data_vio_pool_request_limit(struct data_vio_pool *pool)
+{
+	struct limiter *requests = &pool->limiter;
+
+	return READ_ONCE(requests->limit);
+}
+
+/**
+ * get_data_vio_pool_maximum_requests() - Get the high-water mark of concurrent requests.
+ * @pool: The pool to query.
+ *
+ * Return: The max_busy count of the pool's data_vio limiter.
+ */
+data_vio_count_t get_data_vio_pool_maximum_requests(struct data_vio_pool *pool)
+{
+	struct limiter *requests = &pool->limiter;
+
+	return READ_ONCE(requests->max_busy);
+}
+
+/**
+ * update_data_vio_error_stats() - Report a data_vio's error via update_vio_error_stats().
+ * @data_vio: The data_vio which is completing with an error.
+ *
+ * The message names the kind of operation (derived from the data_vio's read, write, and fua
+ * flags), the LBN, and the last asynchronous operation performed.
+ */
+static void update_data_vio_error_stats(struct data_vio *data_vio)
+{
+	u8 index = 0;
+	/*
+	 * Indexed by a bit mask: 1 for read, 2 for write, 4 for fua. Every combination has an
+	 * entry so that the "%s" below is never handed a NULL pointer.
+	 */
+	static const char * const operations[] = {
+		[0] = "empty",
+		[1] = "read",
+		[2] = "write",
+		[3] = "read-modify-write",
+		[4] = "empty+fua",
+		[5] = "read+fua",
+		[6] = "write+fua",
+		[7] = "read-modify-write+fua",
+	};
+
+	if (data_vio->read)
+		index = 1;
+
+	if (data_vio->write)
+		index += 2;
+
+	if (data_vio->fua)
+		index += 4;
+
+	update_vio_error_stats(&data_vio->vio,
+			       "Completing %s vio for LBN %llu with error after %s",
+			       operations[index],
+			       (unsigned long long) data_vio->logical.lbn,
+			       get_data_vio_operation_name(data_vio));
+}
+
+static void perform_cleanup_stage(struct data_vio *data_vio,
+ enum data_vio_cleanup_stage stage);
+
+/**
+ * release_allocated_lock() - Release the PBN lock and/or the reference on the allocated block at
+ * the end of processing a data_vio.
+ * @completion: The data_vio as a completion.
+ *
+ * The allocation lock is released without resetting the allocation, and cleanup then continues
+ * from the VIO_RELEASE_RECOVERY_LOCKS stage.
+ */
+static void release_allocated_lock(struct vdo_completion *completion)
+{
+ struct data_vio *data_vio = as_data_vio(completion);
+
+ assert_data_vio_in_allocated_zone(data_vio);
+ release_data_vio_allocation_lock(data_vio, false);
+ perform_cleanup_stage(data_vio, VIO_RELEASE_RECOVERY_LOCKS);
+}
+
+/**
+ * release_lock() - Release an uncontended LBN lock.
+ * @data_vio: The data_vio which is expected to hold the lock.
+ * @lock: The logical block lock to release.
+ *
+ * If the lock is not marked as locked, this only checks that the lock map does not hold this
+ * data_vio for the lock's LBN. Otherwise the map entry for the LBN is removed, checked to have
+ * been this data_vio, and the lock is marked as unlocked.
+ */
+static void release_lock(struct data_vio *data_vio, struct lbn_lock *lock)
+{
+	struct int_map *lock_map = lock->zone->lbn_operations;
+	struct data_vio *lock_holder;
+
+	if (!lock->locked) {
+		/* The lock is not locked, so it had better not be registered in the lock map. */
+		lock_holder = vdo_int_map_get(lock_map, lock->lbn);
+		VDO_ASSERT_LOG_ONLY((data_vio != lock_holder),
+				    "no logical block lock held for block %llu",
+				    (unsigned long long) lock->lbn);
+		return;
+	}
+
+	/* Release the lock by removing the lock from the map. */
+	lock_holder = vdo_int_map_remove(lock_map, lock->lbn);
+	VDO_ASSERT_LOG_ONLY((data_vio == lock_holder),
+			    "logical block lock mismatch for block %llu",
+			    (unsigned long long) lock->lbn);
+	lock->locked = false;
+}
+
+/**
+ * transfer_lock() - Transfer a contended LBN lock to the eldest waiter.
+ * @data_vio: The data_vio which currently holds the lock.
+ * @lock: The logical block lock being given up.
+ *
+ * The first waiter is dequeued and becomes the next lock holder; the remaining waiters are moved
+ * to its waiter list. The lock map entry for the LBN is then replaced with the next holder. If
+ * that map operation fails, the next holder is continued with the error and this function
+ * returns without clearing lock->locked.
+ */
+static void transfer_lock(struct data_vio *data_vio, struct lbn_lock *lock)
+{
+ struct data_vio *lock_holder, *next_lock_holder;
+ int result;
+
+ VDO_ASSERT_LOG_ONLY(lock->locked, "lbn_lock with waiters is not locked");
+
+ /* Another data_vio is waiting for the lock, transfer it in a single lock map operation. */
+ next_lock_holder =
+ vdo_waiter_as_data_vio(vdo_waitq_dequeue_waiter(&lock->waiters));
+
+ /* Transfer the remaining lock waiters to the next lock holder. */
+ vdo_waitq_transfer_all_waiters(&lock->waiters,
+ &next_lock_holder->logical.waiters);
+
+ result = vdo_int_map_put(lock->zone->lbn_operations, lock->lbn,
+ next_lock_holder, true, (void **) &lock_holder);
+ if (result != VDO_SUCCESS) {
+ continue_data_vio_with_error(next_lock_holder, result);
+ return;
+ }
+
+ VDO_ASSERT_LOG_ONLY((lock_holder == data_vio),
+ "logical block lock mismatch for block %llu",
+ (unsigned long long) lock->lbn);
+ lock->locked = false;
+
+ /*
+ * If there are still waiters, other data_vios must be trying to get the lock we just
+ * transferred. We must ensure that the new lock holder doesn't block in the packer.
+ */
+ if (vdo_waitq_has_waiters(&next_lock_holder->logical.waiters))
+ cancel_data_vio_compression(next_lock_holder);
+
+ /*
+ * Avoid stack overflow on lock transfer.
+ * FIXME: this is only an issue in the 1 thread config.
+ */
+ next_lock_holder->vio.completion.requeue = true;
+ launch_locked_request(next_lock_holder);
+}
+
+/**
+ * release_logical_lock() - Release the logical block lock and flush generation lock at the end of
+ * processing a data_vio.
+ * @completion: The data_vio as a completion.
+ *
+ * If other data_vios are waiting on the logical lock it is transferred to the first of them;
+ * otherwise it is simply released. Cleanup then continues with the VIO_CLEANUP_DONE stage.
+ */
+static void release_logical_lock(struct vdo_completion *completion)
+{
+ struct data_vio *data_vio = as_data_vio(completion);
+ struct lbn_lock *lock = &data_vio->logical;
+
+ assert_data_vio_in_logical_zone(data_vio);
+
+ if (vdo_waitq_has_waiters(&lock->waiters))
+ transfer_lock(data_vio, lock);
+ else
+ release_lock(data_vio, lock);
+
+ vdo_release_flush_generation_lock(data_vio);
+ perform_cleanup_stage(data_vio, VIO_CLEANUP_DONE);
+}
+
+/**
+ * clean_hash_lock() - Release the hash lock at the end of processing a data_vio.
+ * @completion: The data_vio as a completion.
+ *
+ * If the completion carries an error, the data_vio is passed to vdo_clean_failed_hash_lock() and
+ * this function does not itself move on to the next cleanup stage. Otherwise the hash lock is
+ * released and cleanup continues from the VIO_RELEASE_LOGICAL stage.
+ */
+static void clean_hash_lock(struct vdo_completion *completion)
+{
+ struct data_vio *data_vio = as_data_vio(completion);
+
+ assert_data_vio_in_hash_zone(data_vio);
+ if (completion->result != VDO_SUCCESS) {
+ vdo_clean_failed_hash_lock(data_vio);
+ return;
+ }
+
+ vdo_release_hash_lock(data_vio);
+ perform_cleanup_stage(data_vio, VIO_RELEASE_LOGICAL);
+}
+
+/**
+ * finish_cleanup() - Make some assertions about a data_vio which has finished cleaning up.
+ * @data_vio: The data_vio which has finished cleaning up.
+ *
+ * If it is part of a multi-block discard, starts on the next block, otherwise, returns it to the
+ * pool.
+ *
+ * The data_vio is returned to the pool when no more than VDO_BLOCK_SIZE bytes of discard remain
+ * or when its completion carries an error; returning means putting it on the pool's funnel queue
+ * and scheduling release processing. Otherwise remaining_discard is reduced by the part of the
+ * current block which was covered, and the data_vio is relaunched at the next LBN with a zero
+ * offset.
+ */
+static void finish_cleanup(struct data_vio *data_vio)
+{
+ struct vdo_completion *completion = &data_vio->vio.completion;
+
+ VDO_ASSERT_LOG_ONLY(data_vio->allocation.lock == NULL,
+ "complete data_vio has no allocation lock");
+ VDO_ASSERT_LOG_ONLY(data_vio->hash_lock == NULL,
+ "complete data_vio has no hash lock");
+ if ((data_vio->remaining_discard <= VDO_BLOCK_SIZE) ||
+ (completion->result != VDO_SUCCESS)) {
+ struct data_vio_pool *pool = completion->vdo->data_vio_pool;
+
+ vdo_funnel_queue_put(pool->queue, &completion->work_queue_entry_link);
+ schedule_releases(pool);
+ return;
+ }
+
+ data_vio->remaining_discard -= min_t(u32, data_vio->remaining_discard,
+ VDO_BLOCK_SIZE - data_vio->offset);
+ data_vio->is_partial = (data_vio->remaining_discard < VDO_BLOCK_SIZE);
+ data_vio->read = data_vio->is_partial;
+ data_vio->offset = 0;
+ completion->requeue = true;
+ launch_data_vio(data_vio, data_vio->logical.lbn + 1);
+}
+
+/**
+ * perform_cleanup_stage() - Perform the next step in the process of cleaning up a data_vio.
+ * @data_vio: The data_vio being cleaned up.
+ * @stage: The cleanup stage to start from.
+ *
+ * The stages fall through in order. A stage which has something to release launches the
+ * corresponding callback and returns; that callback is expected to call back in here with a later
+ * stage. VIO_RELEASE_LOGICAL always launches release_logical_lock(). Any stage value not listed
+ * in the switch ends in finish_cleanup().
+ */
+static void perform_cleanup_stage(struct data_vio *data_vio,
+ enum data_vio_cleanup_stage stage)
+{
+ struct vdo *vdo = vdo_from_data_vio(data_vio);
+
+ switch (stage) {
+ case VIO_RELEASE_HASH_LOCK:
+ if (data_vio->hash_lock != NULL) {
+ launch_data_vio_hash_zone_callback(data_vio, clean_hash_lock);
+ return;
+ }
+ fallthrough;
+
+ case VIO_RELEASE_ALLOCATED:
+ if (data_vio_has_allocation(data_vio)) {
+ launch_data_vio_allocated_zone_callback(data_vio,
+ release_allocated_lock);
+ return;
+ }
+ fallthrough;
+
+ case VIO_RELEASE_RECOVERY_LOCKS:
+ if ((data_vio->recovery_sequence_number > 0) &&
+ (READ_ONCE(vdo->read_only_notifier.read_only_error) == VDO_SUCCESS) &&
+ (data_vio->vio.completion.result != VDO_READ_ONLY))
+ vdo_log_warning("VDO not read-only when cleaning data_vio with RJ lock");
+ fallthrough;
+
+ case VIO_RELEASE_LOGICAL:
+ launch_data_vio_logical_callback(data_vio, release_logical_lock);
+ return;
+
+ default:
+ finish_cleanup(data_vio);
+ }
+}
+
+/**
+ * complete_data_vio() - Begin cleaning up a data_vio which has finished its operation.
+ * @completion: The data_vio as a completion.
+ *
+ * The error handler is cleared and the last asynchronous operation is recorded as
+ * VIO_ASYNC_OP_CLEANUP. A data_vio which is a write starts cleanup at VIO_CLEANUP_START; any
+ * other data_vio starts at VIO_RELEASE_LOGICAL.
+ */
+void complete_data_vio(struct vdo_completion *completion)
+{
+	struct data_vio *data_vio = as_data_vio(completion);
+	enum data_vio_cleanup_stage first_stage = VIO_RELEASE_LOGICAL;
+
+	completion->error_handler = NULL;
+	data_vio->last_async_operation = VIO_ASYNC_OP_CLEANUP;
+
+	if (data_vio->write)
+		first_stage = VIO_CLEANUP_START;
+
+	perform_cleanup_stage(data_vio, first_stage);
+}
+
+/**
+ * enter_read_only_mode() - Put the vdo into read-only mode because of a data_vio's error.
+ * @completion: The data_vio, as a completion, which is completing with the error.
+ *
+ * Does nothing if the vdo is already read-only. Unless the error is VDO_READ_ONLY itself, the
+ * details of the data_vio are logged before read-only mode is entered.
+ */
+static void enter_read_only_mode(struct vdo_completion *completion)
+{
+	struct vdo *vdo = completion->vdo;
+
+	if (vdo_is_read_only(vdo))
+		return;
+
+	if (completion->result != VDO_READ_ONLY) {
+		struct data_vio *failed = as_data_vio(completion);
+
+		vdo_log_error_strerror(completion->result,
+				       "Preparing to enter read-only mode: data_vio for LBN %llu (becoming mapped to %llu, previously mapped to %llu, allocated %llu) is completing with a fatal error after operation %s",
+				       (unsigned long long) failed->logical.lbn,
+				       (unsigned long long) failed->new_mapped.pbn,
+				       (unsigned long long) failed->mapped.pbn,
+				       (unsigned long long) failed->allocation.pbn,
+				       get_data_vio_operation_name(failed));
+	}
+
+	vdo_enter_read_only_mode(vdo, completion->result);
+}
+
+/**
+ * handle_data_vio_error() - Handle an error on a data_vio.
+ * @completion: The data_vio, as a completion, which has the error.
+ *
+ * This is the error handler which launch_data_vio() installs. Read-only mode is entered if the
+ * error is VDO_READ_ONLY or the data_vio has no user bio. In every case the error statistics are
+ * updated and the data_vio is completed.
+ */
+void handle_data_vio_error(struct vdo_completion *completion)
+{
+	struct data_vio *data_vio = as_data_vio(completion);
+	bool go_read_only = ((completion->result == VDO_READ_ONLY) ||
+			     (data_vio->user_bio == NULL));
+
+	if (go_read_only)
+		enter_read_only_mode(completion);
+
+	update_data_vio_error_stats(data_vio);
+	complete_data_vio(completion);
+}
+
+/**
+ * get_data_vio_operation_name() - Look up the name of a data_vio's most recent asynchronous
+ * operation.
+ * @data_vio: The data_vio to describe.
+ *
+ * Return: The entry of ASYNC_OPERATION_NAMES for the data_vio's last_async_operation, or
+ * "unknown async operation" if that value is not below MAX_VIO_ASYNC_OPERATION_NUMBER.
+ */
+const char *get_data_vio_operation_name(struct data_vio *data_vio)
+{
+	BUILD_BUG_ON((MAX_VIO_ASYNC_OPERATION_NUMBER - MIN_VIO_ASYNC_OPERATION_NUMBER) !=
+		     ARRAY_SIZE(ASYNC_OPERATION_NAMES));
+
+	if (data_vio->last_async_operation >= MAX_VIO_ASYNC_OPERATION_NUMBER)
+		return "unknown async operation";
+
+	return ASYNC_OPERATION_NAMES[data_vio->last_async_operation];
+}
+
+/**
+ * data_vio_allocate_data_block() - Allocate a data block.
+ *
+ * @data_vio: The data_vio which needs an allocation; it must not already have one.
+ * @write_lock_type: The type of write lock to obtain on the block.
+ * @callback: The callback which will attempt an allocation in the current zone and continue if it
+ * succeeds.
+ * @error_handler: The handler for errors while allocating.
+ *
+ * The starting allocation zone comes from vdo_get_next_allocation_zone() for the data_vio's
+ * logical zone and is also recorded as first_allocation_zone. The error handler is installed on
+ * the data_vio's completion before the callback is launched.
+ */
+void data_vio_allocate_data_block(struct data_vio *data_vio,
+ enum pbn_lock_type write_lock_type,
+ vdo_action_fn callback, vdo_action_fn error_handler)
+{
+ struct allocation *allocation = &data_vio->allocation;
+
+ VDO_ASSERT_LOG_ONLY((allocation->pbn == VDO_ZERO_BLOCK),
+ "data_vio does not have an allocation");
+ allocation->write_lock_type = write_lock_type;
+ allocation->zone = vdo_get_next_allocation_zone(data_vio->logical.zone);
+ allocation->first_allocation_zone = allocation->zone->zone_number;
+
+ data_vio->vio.completion.error_handler = error_handler;
+ launch_data_vio_allocated_zone_callback(data_vio, callback);
+}
+
+/**
+ * release_data_vio_allocation_lock() - Release the PBN lock on a data_vio's allocated block.
+ * @data_vio: The data_vio whose allocation lock is to be released.
+ * @reset: If true, the allocation will be reset (i.e. any allocated pbn will be forgotten).
+ *
+ * If the reference to the locked block is still provisional, it will be released as well. In that
+ * case, or if @reset is true, the allocation's pbn is set back to VDO_ZERO_BLOCK. The pbn which
+ * was locked is saved first so that it can still be passed to the physical zone.
+ */
+void release_data_vio_allocation_lock(struct data_vio *data_vio, bool reset)
+{
+ struct allocation *allocation = &data_vio->allocation;
+ physical_block_number_t locked_pbn = allocation->pbn;
+
+ assert_data_vio_in_allocated_zone(data_vio);
+
+ if (reset || vdo_pbn_lock_has_provisional_reference(allocation->lock))
+ allocation->pbn = VDO_ZERO_BLOCK;
+
+ vdo_release_physical_zone_pbn_lock(allocation->zone, locked_pbn,
+ vdo_forget(allocation->lock));
+}
+
+/**
+ * uncompress_data_vio() - Uncompress the data a data_vio has just read.
+ * @data_vio: The data_vio whose compressed block holds the fragment.
+ * @mapping_state: The mapping state indicating which fragment to decompress.
+ * @buffer: The buffer to receive the uncompressed data.
+ *
+ * The fragment is located within the data_vio's compression.block and decompressed with
+ * LZ4_decompress_safe(), allowing at most VDO_BLOCK_SIZE bytes of output.
+ *
+ * Return: VDO_SUCCESS, the error from vdo_get_compressed_block_fragment(), or
+ * VDO_INVALID_FRAGMENT if the decompressed size is not exactly VDO_BLOCK_SIZE.
+ */
+int uncompress_data_vio(struct data_vio *data_vio,
+ enum block_mapping_state mapping_state, char *buffer)
+{
+ int size;
+ u16 fragment_offset, fragment_size;
+ struct compressed_block *block = data_vio->compression.block;
+ int result = vdo_get_compressed_block_fragment(mapping_state, block,
+ &fragment_offset, &fragment_size);
+
+ if (result != VDO_SUCCESS) {
+ vdo_log_debug("%s: compressed fragment error %d", __func__, result);
+ return result;
+ }
+
+ size = LZ4_decompress_safe((block->data + fragment_offset), buffer,
+ fragment_size, VDO_BLOCK_SIZE);
+ if (size != VDO_BLOCK_SIZE) {
+ vdo_log_debug("%s: lz4 error", __func__);
+ return VDO_INVALID_FRAGMENT;
+ }
+
+ return VDO_SUCCESS;
+}
+
+/**
+ * modify_for_partial_write() - Do the modify-write part of a read-modify-write cycle.
+ * @completion: The data_vio which has just finished its read.
+ *
+ * Called directly by complete_read() and complete_zero_read() once the old contents of the block
+ * are in the data_vio's data buffer.
+ */
+static void modify_for_partial_write(struct vdo_completion *completion)
+{
+	struct data_vio *data_vio = as_data_vio(completion);
+	char *block = data_vio->vio.data;
+	struct bio *user_bio = data_vio->user_bio;
+
+	assert_data_vio_on_cpu_thread(data_vio);
+
+	if (bio_op(user_bio) != REQ_OP_DISCARD) {
+		/* Overlay the new data onto the old block contents. */
+		copy_from_bio(user_bio, block + data_vio->offset);
+	} else {
+		/* Zero the discarded range, without running past the end of this block. */
+		u32 length = min_t(u32, data_vio->remaining_discard,
+				   VDO_BLOCK_SIZE - data_vio->offset);
+
+		memset(block + data_vio->offset, '\0', length);
+	}
+
+	data_vio->is_zero = is_zero_block(block);
+	/* With the read flag cleared, the block map slot callback takes its write path. */
+	data_vio->read = false;
+	launch_data_vio_logical_callback(data_vio, continue_data_vio_with_block_map_slot);
+}
+
+/*
+ * Finish a read once the block's data is available: decompress it if necessary, then either
+ * continue a read-modify-write or deliver the data to the user bio and complete the data_vio.
+ */
+static void complete_read(struct vdo_completion *completion)
+{
+	struct data_vio *data_vio = as_data_vio(completion);
+	char *block = data_vio->vio.data;
+	bool was_compressed = vdo_is_state_compressed(data_vio->mapped.state);
+
+	assert_data_vio_on_cpu_thread(data_vio);
+
+	if (was_compressed) {
+		int result = uncompress_data_vio(data_vio, data_vio->mapped.state, block);
+
+		if (result != VDO_SUCCESS) {
+			continue_data_vio_with_error(data_vio, result);
+			return;
+		}
+	}
+
+	if (data_vio->write) {
+		/* This read was the first half of a read-modify-write. */
+		modify_for_partial_write(completion);
+		return;
+	}
+
+	/* Partial and compressed reads land in the data_vio's buffer and must be copied out. */
+	if (data_vio->is_partial || was_compressed)
+		copy_to_bio(data_vio->user_bio, block + data_vio->offset);
+
+	acknowledge_data_vio(data_vio);
+	complete_data_vio(completion);
+}
+
+/*
+ * The bio_end_io function for data reads (installed by read_block()). Failed reads continue the
+ * data_vio with the error; successful ones move on to complete_read() on a CPU thread.
+ */
+static void read_endio(struct bio *bio)
+{
+	struct data_vio *data_vio = vio_as_data_vio(bio->bi_private);
+	int error = blk_status_to_errno(bio->bi_status);
+
+	vdo_count_completed_bios(bio);
+	if (error == VDO_SUCCESS) {
+		launch_data_vio_cpu_callback(data_vio, complete_read,
+					     CPU_Q_COMPLETE_READ_PRIORITY);
+		return;
+	}
+
+	continue_data_vio_with_error(data_vio, error);
+}
+
+/*
+ * Complete a read of an unmapped (zero) block without doing any I/O. This is launched from
+ * read_block() when the mapped pbn is VDO_ZERO_BLOCK.
+ */
+static void complete_zero_read(struct vdo_completion *completion)
+{
+	struct data_vio *data_vio = as_data_vio(completion);
+
+	assert_data_vio_on_cpu_thread(data_vio);
+
+	if (!data_vio->is_partial) {
+		/* A full block read of zeros can be satisfied directly in the user's bio. */
+		zero_fill_bio(data_vio->user_bio);
+		complete_read(completion);
+		return;
+	}
+
+	/* Partial requests work from the data_vio's own buffer, so zero that instead. */
+	memset(data_vio->vio.data, 0, VDO_BLOCK_SIZE);
+	if (data_vio->write)
+		modify_for_partial_write(completion);
+	else
+		complete_read(completion);
+}
+
+/**
+ * read_block() - Read a block asynchronously.
+ * @completion: The data_vio whose mapped block is to be read.
+ *
+ * This is the callback registered in continue_data_vio_with_block_map_slot() for data_vios on the
+ * read path. Unmapped blocks are completed without I/O; compressed blocks are read into the
+ * data_vio's compressed block buffer; partial reads go into the data_vio's data buffer; and full
+ * block reads are issued directly against the pages of the user's bio.
+ */
+static void read_block(struct vdo_completion *completion)
+{
+	struct data_vio *data_vio = as_data_vio(completion);
+	struct vio *vio = as_vio(completion);
+	int result = VDO_SUCCESS;
+
+	if (data_vio->mapped.pbn == VDO_ZERO_BLOCK) {
+		launch_data_vio_cpu_callback(data_vio, complete_zero_read,
+					     CPU_Q_COMPLETE_VIO_PRIORITY);
+		return;
+	}
+
+	data_vio->last_async_operation = VIO_ASYNC_OP_READ_DATA_VIO;
+	if (vdo_is_state_compressed(data_vio->mapped.state)) {
+		result = vio_reset_bio(vio, (char *) data_vio->compression.block,
+				       read_endio, REQ_OP_READ, data_vio->mapped.pbn);
+	} else {
+		blk_opf_t opf = ((data_vio->user_bio->bi_opf & PASSTHROUGH_FLAGS) | REQ_OP_READ);
+
+		if (data_vio->is_partial) {
+			result = vio_reset_bio(vio, vio->data, read_endio, opf,
+					       data_vio->mapped.pbn);
+		} else {
+			/* A full 4k read. Use the incoming bio to avoid having to copy the data */
+			bio_reset(vio->bio, vio->bio->bi_bdev, opf);
+
+			/*
+			 * bio_init_clone() can fail, and a bio whose clone failed must not be
+			 * submitted, so let the common error check below handle its result.
+			 */
+			result = bio_init_clone(data_vio->user_bio->bi_bdev, vio->bio,
+						data_vio->user_bio, GFP_KERNEL);
+			if (result == VDO_SUCCESS) {
+				/* Copy over the original bio iovec and opflags. */
+				vdo_set_bio_properties(vio->bio, vio, read_endio, opf,
+						       data_vio->mapped.pbn);
+			}
+		}
+	}
+
+	if (result != VDO_SUCCESS) {
+		continue_data_vio_with_error(data_vio, result);
+		return;
+	}
+
+	vdo_submit_data_vio(data_vio);
+}
+
+/*
+ * Convert a completion used for a reference count update back to its data_vio. The completion is
+ * either the data_vio's own vio completion or its embedded decrement_completion.
+ */
+static inline struct data_vio *
+reference_count_update_completion_as_data_vio(struct vdo_completion *completion)
+{
+	if (completion->type != VIO_COMPLETION)
+		return container_of(completion, struct data_vio, decrement_completion);
+
+	return as_data_vio(completion);
+}
+
+/**
+ * update_block_map() - Rendezvous of the data_vio and decrement completions after each has
+ * made its reference updates. Handle any error from either, or proceed
+ * to updating the block map.
+ * @completion: The completion of the write in progress.
+ *
+ * increment_reference_count() and decrement_reference_count() install this function as both the
+ * callback and the error handler, so it may be entered with either the data_vio's own completion
+ * or its decrement_completion, and with or without an error already set.
+ */
+static void update_block_map(struct vdo_completion *completion)
+{
+ struct data_vio *data_vio = reference_count_update_completion_as_data_vio(completion);
+
+ assert_data_vio_in_logical_zone(data_vio);
+
+ if (!data_vio->first_reference_operation_complete) {
+ /* Rendezvous, we're first */
+ data_vio->first_reference_operation_complete = true;
+ return;
+ }
+
+ /* Both updates are done; from here on work with the data_vio's own completion. */
+ completion = &data_vio->vio.completion;
+ /* Bring the result of the decrement over to the data_vio's completion. */
+ vdo_set_completion_result(completion, data_vio->decrement_completion.result);
+ if (completion->result != VDO_SUCCESS) {
+ handle_data_vio_error(completion);
+ return;
+ }
+
+ completion->error_handler = handle_data_vio_error;
+ if (data_vio->hash_lock != NULL)
+ set_data_vio_hash_zone_callback(data_vio, vdo_continue_hash_lock);
+ else
+ completion->callback = complete_data_vio;
+
+ data_vio->last_async_operation = VIO_ASYNC_OP_PUT_MAPPED_BLOCK;
+ vdo_put_mapped_block(data_vio);
+}
+
+/*
+ * Make the reference count decrement for the old mapping using the data_vio's
+ * decrement_completion. Success and failure both continue to update_block_map() on the logical
+ * zone thread.
+ */
+static void decrement_reference_count(struct vdo_completion *completion)
+{
+	struct data_vio *data_vio =
+		container_of(completion, struct data_vio, decrement_completion);
+	thread_id_t logical_thread;
+
+	assert_data_vio_in_mapped_zone(data_vio);
+
+	logical_thread = data_vio->logical.zone->thread_id;
+	vdo_set_completion_callback(completion, update_block_map, logical_thread);
+	completion->error_handler = update_block_map;
+	vdo_modify_reference_count(completion, &data_vio->decrement_updater);
+}
+
+/*
+ * Make the reference count increment for the new mapping using the data_vio's own completion.
+ * Success and failure both continue to update_block_map() in the logical zone.
+ */
+static void increment_reference_count(struct vdo_completion *completion)
+{
+	struct data_vio *data_vio = as_data_vio(completion);
+
+	assert_data_vio_in_new_mapped_zone(data_vio);
+
+	/*
+	 * Once the data has been written, it is safe to deduplicate against the block, so the
+	 * allocation lock is downgraded to a read lock which the hash lock can use later. It has
+	 * to happen at some point before returning to the hash zone, and this is the correct
+	 * thread for it. For compressed blocks, the downgrade will have already been done.
+	 */
+	if (data_vio->downgrade_allocation_lock)
+		vdo_downgrade_pbn_write_lock(data_vio->allocation.lock, false);
+
+	set_data_vio_logical_callback(data_vio, update_block_map);
+	completion->error_handler = update_block_map;
+	vdo_modify_reference_count(completion, &data_vio->increment_updater);
+}
+
+/**
+ * journal_remapping() - Add a recovery journal entry for a data remapping.
+ * @completion: The data_vio making the journal entry.
+ *
+ * Before making the entry, this sets up the completions for the reference count updates which
+ * follow it. An increment is only prepared when the new mapping is not the zero block, and a
+ * decrement only when the old mapping is not the zero block. Each update which is skipped is
+ * recorded as already complete for the rendezvous in update_block_map().
+ */
+static void journal_remapping(struct vdo_completion *completion)
+{
+ struct data_vio *data_vio = as_data_vio(completion);
+
+ assert_data_vio_in_journal_zone(data_vio);
+
+ data_vio->decrement_updater.operation = VDO_JOURNAL_DATA_REMAPPING;
+ data_vio->decrement_updater.zpbn = data_vio->mapped;
+ if (data_vio->new_mapped.pbn == VDO_ZERO_BLOCK) {
+ /* No increment will be made, so count it as done. */
+ data_vio->first_reference_operation_complete = true;
+ /* With no decrement either, go straight to the block map update. */
+ if (data_vio->mapped.pbn == VDO_ZERO_BLOCK)
+ set_data_vio_logical_callback(data_vio, update_block_map);
+ } else {
+ set_data_vio_new_mapped_zone_callback(data_vio,
+ increment_reference_count);
+ }
+
+ if (data_vio->mapped.pbn == VDO_ZERO_BLOCK) {
+ /* No decrement will be made, so count it as done. */
+ data_vio->first_reference_operation_complete = true;
+ } else {
+ vdo_set_completion_callback(&data_vio->decrement_completion,
+ decrement_reference_count,
+ data_vio->mapped.zone->thread_id);
+ }
+
+ data_vio->last_async_operation = VIO_ASYNC_OP_JOURNAL_REMAPPING;
+ vdo_add_recovery_journal_entry(completion->vdo->recovery_journal, data_vio);
+}
+
+/**
+ * read_old_block_mapping() - Get the previous PBN/LBN mapping of an in-progress write.
+ * @completion: The data_vio doing the write.
+ *
+ * Gets the previous PBN mapped to this LBN from the block map, so as to make an appropriate
+ * journal entry referencing the removal of this LBN->PBN mapping.
+ *
+ * This callback is launched by update_metadata_for_data_vio_write(), and arranges for
+ * journal_remapping() to run next.
+ */
+static void read_old_block_mapping(struct vdo_completion *completion)
+{
+ struct data_vio *data_vio = as_data_vio(completion);
+
+ assert_data_vio_in_logical_zone(data_vio);
+
+ data_vio->last_async_operation = VIO_ASYNC_OP_GET_MAPPED_BLOCK_FOR_WRITE;
+ set_data_vio_journal_callback(data_vio, journal_remapping);
+ vdo_get_mapped_block(data_vio);
+}
+
+/*
+ * Begin the metadata updates for a write: prepare the reference increment for the data_vio's new
+ * mapping, then launch read_old_block_mapping() in the logical zone. The supplied lock (which
+ * may be NULL) is recorded in the increment updater.
+ */
+void update_metadata_for_data_vio_write(struct data_vio *data_vio, struct pbn_lock *lock)
+{
+	struct reference_updater increment = {
+		.operation = VDO_JOURNAL_DATA_REMAPPING,
+		.increment = true,
+		.zpbn = data_vio->new_mapped,
+		.lock = lock,
+	};
+
+	data_vio->increment_updater = increment;
+	launch_data_vio_logical_callback(data_vio, read_old_block_mapping);
+}
+
+/**
+ * pack_compressed_data() - Attempt to pack the compressed data_vio into a block.
+ * @completion: The data_vio holding the compressed data.
+ *
+ * This is the callback registered in compress_data_vio(). If compression is off for the vdo, or
+ * this data_vio has been marked as not to be compressed, the data is written uncompressed.
+ */
+static void pack_compressed_data(struct vdo_completion *completion)
+{
+	struct data_vio *data_vio = as_data_vio(completion);
+
+	assert_data_vio_in_packer_zone(data_vio);
+
+	if (vdo_get_compressing(vdo_from_data_vio(data_vio)) &&
+	    !get_data_vio_compression_status(data_vio).may_not_compress) {
+		data_vio->last_async_operation = VIO_ASYNC_OP_ATTEMPT_PACKING;
+		vdo_attempt_packing(data_vio);
+		return;
+	}
+
+	write_data_vio(data_vio);
+}
+
+/**
+ * compress_data_vio() - Do the actual work of compressing the data on a CPU queue.
+ * @completion: The data_vio whose data is to be compressed.
+ *
+ * This callback is registered in launch_compress_data_vio(). Data which fails to compress, or
+ * does not compress to a usable size, is written uncompressed instead.
+ */
+static void compress_data_vio(struct vdo_completion *completion)
+{
+	struct data_vio *data_vio = as_data_vio(completion);
+	int compressed_size;
+
+	assert_data_vio_on_cpu_thread(data_vio);
+
+	/*
+	 * The output goes at the start of the compressed block's data field so that it need not
+	 * be copied should this data_vio become a compressed write agent.
+	 */
+	compressed_size = LZ4_compress_default(data_vio->vio.data,
+					       data_vio->compression.block->data,
+					       VDO_BLOCK_SIZE,
+					       VDO_MAX_COMPRESSED_FRAGMENT_SIZE,
+					       (char *) vdo_get_work_queue_private_data());
+	if ((compressed_size <= 0) || (compressed_size >= VDO_COMPRESSED_BLOCK_DATA_SIZE)) {
+		write_data_vio(data_vio);
+		return;
+	}
+
+	data_vio->compression.size = compressed_size;
+	launch_data_vio_packer_callback(data_vio, pack_compressed_data);
+}
+
+/**
+ * launch_compress_data_vio() - Continue a write by attempting to compress the data.
+ * @data_vio: The data_vio to compress; it is expected to have a hash lock and an allocation, and
+ *            not to be a duplicate.
+ *
+ * This is a re-entry point to vio_write used by hash locks. A data_vio which is not eligible for
+ * compression is written uncompressed via write_data_vio().
+ */
+void launch_compress_data_vio(struct data_vio *data_vio)
+{
+ VDO_ASSERT_LOG_ONLY(!data_vio->is_duplicate, "compressing a non-duplicate block");
+ VDO_ASSERT_LOG_ONLY(data_vio->hash_lock != NULL,
+ "data_vio to compress has a hash_lock");
+ VDO_ASSERT_LOG_ONLY(data_vio_has_allocation(data_vio),
+ "data_vio to compress has an allocation");
+
+ /*
+ * There are 4 reasons why a data_vio which has reached this point will not be eligible for
+ * compression:
+ *
+ * 1) Since data_vios can block indefinitely in the packer, it would be bad to do so if the
+ * write request also requests FUA.
+ *
+ * 2) A data_vio should not be compressed when compression is disabled for the vdo.
+ *
+ * 3) A data_vio could be doing a partial write on behalf of a larger discard which has not
+ * yet been acknowledged and hence blocking in the packer would be bad.
+ *
+ * 4) Some other data_vio may be waiting on this data_vio in which case blocking in the
+ * packer would also be bad.
+ *
+ * The compression stage is only advanced (the last test below) when none of the first three
+ * reasons applies.
+ */
+ if (data_vio->fua ||
+ !vdo_get_compressing(vdo_from_data_vio(data_vio)) ||
+ ((data_vio->user_bio != NULL) && (bio_op(data_vio->user_bio) == REQ_OP_DISCARD)) ||
+ (advance_data_vio_compression_stage(data_vio).stage != DATA_VIO_COMPRESSING)) {
+ write_data_vio(data_vio);
+ return;
+ }
+
+ data_vio->last_async_operation = VIO_ASYNC_OP_COMPRESS_DATA_VIO;
+ launch_data_vio_cpu_callback(data_vio, compress_data_vio,
+ CPU_Q_COMPRESS_BLOCK_PRIORITY);
+}
+
+/**
+ * hash_data_vio() - Hash the data in a data_vio and set the hash zone (which also flags the record
+ *                   name as set).
+ * @completion: The data_vio to hash.
+ *
+ * This callback is registered in prepare_for_dedupe(). Once the record name has been computed,
+ * the data_vio moves to its hash zone to acquire a hash lock.
+ */
+static void hash_data_vio(struct vdo_completion *completion)
+{
+	struct data_vio *data_vio = as_data_vio(completion);
+	struct uds_record_name *name = &data_vio->record_name;
+
+	assert_data_vio_on_cpu_thread(data_vio);
+	VDO_ASSERT_LOG_ONLY(!data_vio->is_zero, "zero blocks should not be hashed");
+
+	murmurhash3_128(data_vio->vio.data, VDO_BLOCK_SIZE, 0x62ea60be, name);
+
+	data_vio->hash_zone =
+		vdo_select_hash_zone(vdo_from_data_vio(data_vio)->hash_zones, name);
+	data_vio->last_async_operation = VIO_ASYNC_OP_ACQUIRE_VDO_HASH_LOCK;
+	launch_data_vio_hash_zone_callback(data_vio, vdo_acquire_hash_lock);
+}
+
+/**
+ * prepare_for_dedupe() - Prepare for the dedupe path after attempting to get an allocation.
+ * @data_vio: The data_vio to deduplicate; it must not be a zero block.
+ *
+ * This is used whether or not the allocation attempt succeeded (see allocate_block() and
+ * handle_allocation_error()).
+ */
+static void prepare_for_dedupe(struct data_vio *data_vio)
+{
+ /* We don't care what thread we are on. */
+ VDO_ASSERT_LOG_ONLY(!data_vio->is_zero, "must not prepare to dedupe zero blocks");
+
+ /*
+ * Before we can dedupe, we need to know the record name, so the first
+ * step is to hash the block data.
+ */
+ data_vio->last_async_operation = VIO_ASYNC_OP_HASH_DATA_VIO;
+ launch_data_vio_cpu_callback(data_vio, hash_data_vio, CPU_Q_HASH_BLOCK_PRIORITY);
+}
+
+/**
+ * write_bio_finished() - This is the bio_end_io function registered in write_data_vio() to be
+ * called when a data_vio's write to the underlying storage has completed.
+ * @bio: The bio which has completed; its bi_private is the data_vio's vio.
+ *
+ * The status of the bio is recorded as the result of the data_vio's completion, the allocation
+ * lock is flagged for downgrade, and the metadata updates for the write are started.
+ */
+static void write_bio_finished(struct bio *bio)
+{
+ struct data_vio *data_vio = vio_as_data_vio((struct vio *) bio->bi_private);
+
+ vdo_count_completed_bios(bio);
+ vdo_set_completion_result(&data_vio->vio.completion,
+ blk_status_to_errno(bio->bi_status));
+ /* The downgrade itself is done later, in increment_reference_count(). */
+ data_vio->downgrade_allocation_lock = true;
+ update_metadata_for_data_vio_write(data_vio, data_vio->allocation.lock);
+}
+
+/**
+ * write_data_vio() - Write a data block to storage without compression.
+ * @data_vio: The data_vio to write.
+ *
+ * A data_vio which has no allocation is failed with VDO_NO_SPACE. Otherwise, the data_vio is
+ * taken off the compression path for good and its data buffer is written to its allocated block,
+ * with write_bio_finished() as the bio's end_io function.
+ */
+void write_data_vio(struct data_vio *data_vio)
+{
+ struct data_vio_compression_status status, new_status;
+ int result;
+
+ if (!data_vio_has_allocation(data_vio)) {
+ /*
+ * There was no space to write this block and we failed to deduplicate or compress
+ * it.
+ */
+ continue_data_vio_with_error(data_vio, VDO_NO_SPACE);
+ return;
+ }
+
+ new_status = (struct data_vio_compression_status) {
+ .stage = DATA_VIO_POST_PACKER,
+ .may_not_compress = true,
+ };
+
+ /*
+ * Retry until either the status is already at the post-packer stage, or the attempt to
+ * replace the status just read with new_status succeeds.
+ */
+ do {
+ status = get_data_vio_compression_status(data_vio);
+ } while ((status.stage != DATA_VIO_POST_PACKER) &&
+ !set_data_vio_compression_status(data_vio, status, new_status));
+
+ /* Write the data from the data block buffer. */
+ result = vio_reset_bio(&data_vio->vio, data_vio->vio.data,
+ write_bio_finished, REQ_OP_WRITE,
+ data_vio->allocation.pbn);
+ if (result != VDO_SUCCESS) {
+ continue_data_vio_with_error(data_vio, result);
+ return;
+ }
+
+ data_vio->last_async_operation = VIO_ASYNC_OP_WRITE_DATA_VIO;
+ vdo_submit_data_vio(data_vio);
+}
+
+/**
+ * acknowledge_write_callback() - Acknowledge a write to the requestor.
+ * @completion: The data_vio whose write is to be acknowledged.
+ *
+ * This callback is registered in allocate_block() and continue_data_vio_with_block_map_slot().
+ * After acknowledging, writes of real data head for deduplication, while zero writes and
+ * discards go directly to their metadata updates.
+ */
+static void acknowledge_write_callback(struct vdo_completion *completion)
+{
+	struct data_vio *data_vio = as_data_vio(completion);
+	struct vdo *vdo = completion->vdo;
+
+	VDO_ASSERT_LOG_ONLY((!vdo_uses_bio_ack_queue(vdo) ||
+			     (vdo_get_callback_thread_id() == vdo->thread_config.bio_ack_thread)),
+			    "%s() called on bio ack queue", __func__);
+	VDO_ASSERT_LOG_ONLY(data_vio_has_flush_generation_lock(data_vio),
+			    "write VIO to be acknowledged has a flush generation lock");
+	acknowledge_data_vio(data_vio);
+
+	if (data_vio->new_mapped.pbn != VDO_ZERO_BLOCK) {
+		prepare_for_dedupe(data_vio);
+		return;
+	}
+
+	/* This is a zero write or discard */
+	update_metadata_for_data_vio_write(data_vio, NULL);
+}
+
+/**
+ * allocate_block() - Attempt to allocate a block in the current allocation zone.
+ * @completion: The data_vio needing an allocation.
+ *
+ * This callback is passed to data_vio_allocate_data_block() by
+ * continue_data_vio_with_block_map_slot(). Once a block has been allocated, the write is
+ * acknowledged before going on to deduplication, unless it is a FUA write, in which case the
+ * acknowledgment is not made here.
+ */
+static void allocate_block(struct vdo_completion *completion)
+{
+ struct data_vio *data_vio = as_data_vio(completion);
+
+ assert_data_vio_in_allocated_zone(data_vio);
+
+ /*
+ * NOTE(review): on a false return, the allocator has presumably taken over responsibility
+ * for continuing the data_vio -- confirm against vdo_allocate_block_in_zone().
+ */
+ if (!vdo_allocate_block_in_zone(data_vio))
+ return;
+
+ completion->error_handler = handle_data_vio_error;
+ /* This field is examined from threads not in the allocation zone, hence WRITE_ONCE(). */
+ WRITE_ONCE(data_vio->allocation_succeeded, true);
+ data_vio->new_mapped = (struct zoned_pbn) {
+ .zone = data_vio->allocation.zone,
+ .pbn = data_vio->allocation.pbn,
+ .state = VDO_MAPPING_STATE_UNCOMPRESSED,
+ };
+
+ if (data_vio->fua) {
+ prepare_for_dedupe(data_vio);
+ return;
+ }
+
+ data_vio->last_async_operation = VIO_ASYNC_OP_ACKNOWLEDGE_WRITE;
+ launch_data_vio_on_bio_ack_queue(data_vio, acknowledge_write_callback);
+}
+
+/**
+ * handle_allocation_error() - Handle an error attempting to allocate a block.
+ * @completion: The data_vio whose allocation attempt failed.
+ *
+ * This error handler is passed to data_vio_allocate_data_block() by
+ * continue_data_vio_with_block_map_slot().
+ */
+static void handle_allocation_error(struct vdo_completion *completion)
+{
+	struct data_vio *data_vio = as_data_vio(completion);
+
+	if (completion->result != VDO_NO_SPACE) {
+		/* We got a "real" error, not just a failure to allocate, so fail the request. */
+		handle_data_vio_error(completion);
+		return;
+	}
+
+	/* We failed to get an allocation, but we can try to dedupe. */
+	vdo_reset_completion(completion);
+	completion->error_handler = handle_data_vio_error;
+	prepare_for_dedupe(data_vio);
+}
+
+/*
+ * Check that a data_vio which has no block map page is a discard.
+ *
+ * Return: VDO_SUCCESS if the data_vio is a discard, VDO_READ_ONLY if the assertion fails.
+ */
+static int assert_is_discard(struct data_vio *data_vio)
+{
+	int result = VDO_ASSERT(data_vio->is_discard,
+				"data_vio with no block map page is a discard");
+
+	if (result != VDO_SUCCESS)
+		return VDO_READ_ONLY;
+
+	return VDO_SUCCESS;
+}
+
+/**
+ * continue_data_vio_with_block_map_slot() - Read the data_vio's mapping from the block map.
+ * @completion: The data_vio to continue.
+ *
+ * This callback is registered in launch_read_data_vio(), and is also launched by
+ * modify_for_partial_write() once the read half of a read-modify-write is done.
+ *
+ * Reads go on to fetch their mapping and then to read_block(). Writes take a flush generation
+ * lock and then either start an allocation, or, when there is no data to write, skip directly to
+ * acknowledgment and/or the metadata updates.
+ */
+void continue_data_vio_with_block_map_slot(struct vdo_completion *completion)
+{
+ struct data_vio *data_vio = as_data_vio(completion);
+
+ assert_data_vio_in_logical_zone(data_vio);
+ if (data_vio->read) {
+ set_data_vio_logical_callback(data_vio, read_block);
+ data_vio->last_async_operation = VIO_ASYNC_OP_GET_MAPPED_BLOCK_FOR_READ;
+ vdo_get_mapped_block(data_vio);
+ return;
+ }
+
+ vdo_acquire_flush_generation_lock(data_vio);
+
+ if (data_vio->tree_lock.tree_slots[0].block_map_slot.pbn == VDO_ZERO_BLOCK) {
+ /*
+ * This is a discard for a block on a block map page which has not been allocated, so
+ * there's nothing more we need to do.
+ */
+ completion->callback = complete_data_vio;
+ continue_data_vio_with_error(data_vio, assert_is_discard(data_vio));
+ return;
+ }
+
+ /*
+ * We need an allocation if this is neither a full-block discard nor a
+ * full-block zero write.
+ */
+ if (!data_vio->is_zero && (!data_vio->is_discard || data_vio->is_partial)) {
+ data_vio_allocate_data_block(data_vio, VIO_WRITE_LOCK, allocate_block,
+ handle_allocation_error);
+ return;
+ }
+
+
+ /*
+ * We don't need to write any data, so skip allocation and just update the block map and
+ * reference counts (via the journal).
+ */
+ data_vio->new_mapped.pbn = VDO_ZERO_BLOCK;
+ if (data_vio->is_zero)
+ data_vio->new_mapped.state = VDO_MAPPING_STATE_UNCOMPRESSED;
+
+ if (data_vio->remaining_discard > VDO_BLOCK_SIZE) {
+ /* This is not the final block of a discard so we can't acknowledge it yet. */
+ update_metadata_for_data_vio_write(data_vio, NULL);
+ return;
+ }
+
+ data_vio->last_async_operation = VIO_ASYNC_OP_ACKNOWLEDGE_WRITE;
+ launch_data_vio_on_bio_ack_queue(data_vio, acknowledge_write_callback);
+}
diff --git a/drivers/md/dm-vdo/data-vio.h b/drivers/md/dm-vdo/data-vio.h
new file mode 100644
index 000000000000..25926b6cd98b
--- /dev/null
+++ b/drivers/md/dm-vdo/data-vio.h
@@ -0,0 +1,670 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#ifndef DATA_VIO_H
+#define DATA_VIO_H
+
+#include <linux/atomic.h>
+#include <linux/bio.h>
+#include <linux/list.h>
+
+#include "permassert.h"
+
+#include "indexer.h"
+
+#include "block-map.h"
+#include "completion.h"
+#include "constants.h"
+#include "dedupe.h"
+#include "encodings.h"
+#include "logical-zone.h"
+#include "physical-zone.h"
+#include "types.h"
+#include "vdo.h"
+#include "vio.h"
+#include "wait-queue.h"
+
+/*
+ * Codes for describing the last asynchronous operation performed on a vio. A data_vio records
+ * one of these in its last_async_operation field (used for logging and debugging) just before
+ * starting the corresponding operation.
+ */
+enum async_operation_number {
+ MIN_VIO_ASYNC_OPERATION_NUMBER,
+ VIO_ASYNC_OP_LAUNCH = MIN_VIO_ASYNC_OPERATION_NUMBER,
+ VIO_ASYNC_OP_ACKNOWLEDGE_WRITE,
+ VIO_ASYNC_OP_ACQUIRE_VDO_HASH_LOCK,
+ VIO_ASYNC_OP_ATTEMPT_LOGICAL_BLOCK_LOCK,
+ VIO_ASYNC_OP_LOCK_DUPLICATE_PBN,
+ VIO_ASYNC_OP_CHECK_FOR_DUPLICATION,
+ VIO_ASYNC_OP_CLEANUP,
+ VIO_ASYNC_OP_COMPRESS_DATA_VIO,
+ VIO_ASYNC_OP_FIND_BLOCK_MAP_SLOT,
+ VIO_ASYNC_OP_GET_MAPPED_BLOCK_FOR_READ,
+ VIO_ASYNC_OP_GET_MAPPED_BLOCK_FOR_WRITE,
+ VIO_ASYNC_OP_HASH_DATA_VIO,
+ VIO_ASYNC_OP_JOURNAL_REMAPPING,
+ VIO_ASYNC_OP_ATTEMPT_PACKING,
+ VIO_ASYNC_OP_PUT_MAPPED_BLOCK,
+ VIO_ASYNC_OP_READ_DATA_VIO,
+ VIO_ASYNC_OP_UPDATE_DEDUPE_INDEX,
+ VIO_ASYNC_OP_UPDATE_REFERENCE_COUNTS,
+ VIO_ASYNC_OP_VERIFY_DUPLICATION,
+ VIO_ASYNC_OP_WRITE_DATA_VIO,
+ MAX_VIO_ASYNC_OPERATION_NUMBER,
+} __packed;
+
+/*
+ * The logical block of a request and the state of its lock. A data_vio embeds one of these as
+ * its "logical" field.
+ */
+struct lbn_lock {
+ /* The logical block number of the request */
+ logical_block_number_t lbn;
+ /* NOTE(review): presumably whether the lock on the lbn is currently held -- confirm */
+ bool locked;
+ /* NOTE(review): presumably the requests waiting for this lbn -- confirm */
+ struct vdo_wait_queue waiters;
+ /* The logical zone of the lbn; its thread_id is where logical zone callbacks run */
+ struct logical_zone *zone;
+};
+
+/*
+ * A position in the arboreal block map at a specific level. A tree_lock holds one of these for
+ * each level of the tree.
+ */
+struct block_map_tree_slot {
+ page_number_t page_index;
+ struct block_map_slot block_map_slot;
+};
+
+/* Fields for using the arboreal block map. A data_vio embeds these as its "tree_lock" field. */
+struct tree_lock {
+ /* The current height at which this data_vio is operating */
+ height_t height;
+ /* The block map tree for this LBN */
+ root_count_t root_index;
+ /* Whether we hold a page lock */
+ bool locked;
+ /* The key for the lock map */
+ u64 key;
+ /* The queue of waiters for the page this vio is allocating or loading */
+ struct vdo_wait_queue waiters;
+ /*
+ * The block map tree slots for this LBN. The write path examines the pbn of entry 0 to
+ * tell whether the block map page for the LBN has been allocated.
+ */
+ struct block_map_tree_slot tree_slots[VDO_BLOCK_MAP_TREE_HEIGHT + 1];
+};
+
+/* A physical block number, together with its mapping state and its physical zone. */
+struct zoned_pbn {
+ /* The physical block number; VDO_ZERO_BLOCK is used when no block is mapped */
+ physical_block_number_t pbn;
+ /* The mapping state, which among other things indicates whether the block is compressed */
+ enum block_mapping_state state;
+ /* The physical zone of the pbn */
+ struct physical_zone *zone;
+};
+
+/*
+ * Where a data_vio is on the compression path; advance_data_vio_compression_stage() depends on
+ * the order of this enum.
+ */
+enum data_vio_compression_stage {
+ /* A data_vio which has not yet entered the compression path */
+ DATA_VIO_PRE_COMPRESSOR,
+ /* A data_vio which is in the compressor */
+ DATA_VIO_COMPRESSING,
+ /* A data_vio which is blocked in the packer */
+ DATA_VIO_PACKING,
+ /* A data_vio which is no longer on the compression path (and never will be) */
+ DATA_VIO_POST_PACKER,
+};
+
+/*
+ * The unpacked form of a data_vio's compression status, as returned by
+ * get_data_vio_compression_status() and advance_data_vio_compression_stage().
+ */
+struct data_vio_compression_status {
+ /* Where the data_vio is on the compression path */
+ enum data_vio_compression_stage stage;
+ /* Whether a request has been made to cancel (or prevent) compression of the data_vio */
+ bool may_not_compress;
+};
+
+/* All of the fields necessary for the compression path; embedded in struct data_vio. */
+struct compression_state {
+ /*
+ * The current compression status of this data_vio. This field contains a value which
+ * consists of a data_vio_compression_stage and a flag indicating whether a request has
+ * been made to cancel (or prevent) compression for this data_vio.
+ *
+ * This field should be accessed through the get_data_vio_compression_status() and
+ * set_data_vio_compression_status() methods. It should not be accessed directly.
+ */
+ atomic_t status;
+
+ /* The compressed size of this block */
+ u16 size;
+
+ /* The packer input or output bin slot which holds the enclosing data_vio */
+ slot_number_t slot;
+
+ /* The packer bin to which the enclosing data_vio has been assigned */
+ struct packer_bin *bin;
+
+ /* A link in the chain of data_vios which have been packed together */
+ struct data_vio *next_in_batch;
+
+ /* A vio which is blocked in the packer while holding a lock this vio needs. */
+ struct data_vio *lock_holder;
+
+ /*
+ * The compressed block used to hold the compressed form of this block and that of any
+ * other blocks for which this data_vio is the compressed write agent.
+ */
+ struct compressed_block *block;
+};
+
+/*
+ * Fields supporting allocation of data blocks. A data_vio is considered to have an allocation
+ * when pbn is not VDO_ZERO_BLOCK (see data_vio_has_allocation()).
+ */
+struct allocation {
+ /* The physical zone in which to allocate a physical block */
+ struct physical_zone *zone;
+
+ /* The block allocated to this vio */
+ physical_block_number_t pbn;
+
+ /*
+ * If non-NULL, the pooled PBN lock held on the allocated block. Must be a write lock until
+ * the block has been written, after which it will become a read lock.
+ */
+ struct pbn_lock *lock;
+
+ /* The type of write lock to obtain on the allocated block */
+ enum pbn_lock_type write_lock_type;
+
+ /* The zone which was the start of the current allocation cycle */
+ zone_count_t first_allocation_zone;
+
+ /* Whether this vio should wait for a clean slab */
+ bool wait_for_clean_slab;
+};
+
+/*
+ * The description of one reference count update to be recorded in the recovery and slab
+ * journals. Each data_vio embeds two of these: an increment_updater and a decrement_updater.
+ */
+struct reference_updater {
+ /* The kind of journal operation this update belongs to */
+ enum journal_operation operation;
+ /*
+ * Whether this is the increment. data_vio_from_reference_updater() relies on this flag to
+ * tell which of the data_vio's two updaters it has been given.
+ */
+ bool increment;
+ /* The block whose reference count is to be updated */
+ struct zoned_pbn zpbn;
+ /* The PBN lock associated with the update, if any (may be NULL) */
+ struct pbn_lock *lock;
+ /* NOTE(review): presumably used to wait for journal space -- confirm against the journals */
+ struct vdo_waiter waiter;
+};
+
+/*
+ * A vio for processing user data requests.
+ *
+ * The order of the fields matters: everything before the embedded vio is reset when a pooled
+ * data_vio is reused, while the vio and the fields following it are not.
+ */
+struct data_vio {
+ /* The vdo_wait_queue entry structure */
+ struct vdo_waiter waiter;
+
+ /* The logical block of this request */
+ struct lbn_lock logical;
+
+ /* The state for traversing the block map tree */
+ struct tree_lock tree_lock;
+
+ /* The current partition address of this block */
+ struct zoned_pbn mapped;
+
+ /* The hash of this vio (if not zero) */
+ struct uds_record_name record_name;
+
+ /* Used for logging and debugging */
+ enum async_operation_number last_async_operation;
+
+ /* The operations to record in the recovery and slab journals */
+ struct reference_updater increment_updater;
+ struct reference_updater decrement_updater;
+
+ /*
+ * Flags describing the request and its progress. A data_vio doing a read-modify-write has
+ * both read and write set until its read completes, at which point read is cleared.
+ * first_reference_operation_complete is the rendezvous flag for the two reference count
+ * updates of a write. downgrade_allocation_lock is set once the data has been written, and
+ * requests that the allocation lock be downgraded when the reference increment is made.
+ */
+ u16 read : 1;
+ u16 write : 1;
+ u16 fua : 1;
+ u16 is_zero : 1;
+ u16 is_discard : 1;
+ u16 is_partial : 1;
+ u16 is_duplicate : 1;
+ u16 first_reference_operation_complete : 1;
+ u16 downgrade_allocation_lock : 1;
+
+ struct allocation allocation;
+
+ /*
+ * Whether this vio has received an allocation. This field is examined from threads not in
+ * the allocation zone.
+ */
+ bool allocation_succeeded;
+
+ /* The new partition address of this block after the vio write completes */
+ struct zoned_pbn new_mapped;
+
+ /* The hash zone responsible for the name (NULL if is_zero_block) */
+ struct hash_zone *hash_zone;
+
+ /* The lock this vio holds or shares with other vios with the same data */
+ struct hash_lock *hash_lock;
+
+ /* All data_vios sharing a hash lock are kept in a list linking these list entries */
+ struct list_head hash_lock_entry;
+
+ /* The block number in the partition of the UDS deduplication advice */
+ struct zoned_pbn duplicate;
+
+ /*
+ * The sequence number of the recovery journal block containing the increment entry for
+ * this vio.
+ */
+ sequence_number_t recovery_sequence_number;
+
+ /* The point in the recovery journal where this write last made an entry */
+ struct journal_point recovery_journal_point;
+
+ /* The list of vios in user initiated write requests */
+ struct list_head write_entry;
+
+ /* The generation number of the VDO that this vio belongs to */
+ sequence_number_t flush_generation;
+
+ /* The completion to use for fetching block map pages for this vio */
+ struct vdo_page_completion page_completion;
+
+ /* The user bio that initiated this VIO */
+ struct bio *user_bio;
+
+ /* partial block support */
+ block_size_t offset;
+
+ /*
+ * The number of bytes to be discarded. For discards, this field will always be positive,
+ * whereas for non-discards it will always be 0. Hence it can be used to determine whether
+ * a data_vio is processing a discard, even after the user_bio has been acknowledged.
+ */
+ u32 remaining_discard;
+
+ struct dedupe_context *dedupe_context;
+
+ /* Fields beyond this point will not be reset when a pooled data_vio is reused. */
+
+ struct vio vio;
+
+ /* The completion for making reference count decrements */
+ struct vdo_completion decrement_completion;
+
+ /* All of the fields necessary for the compression path */
+ struct compression_state compression;
+
+ /* A block used as output during compression or uncompression */
+ char *scratch_block;
+
+ struct list_head pool_entry;
+};
+
+/*
+ * Convert a vio to the data_vio which contains it. The vio is expected to be of type
+ * VIO_TYPE_DATA; a mismatch is logged but the conversion is made regardless.
+ */
+static inline struct data_vio *vio_as_data_vio(struct vio *vio)
+{
+ VDO_ASSERT_LOG_ONLY((vio->type == VIO_TYPE_DATA), "vio is a data_vio");
+ return container_of(vio, struct data_vio, vio);
+}
+
+/* Convert the completion of a data_vio's vio to the data_vio itself. */
+static inline struct data_vio *as_data_vio(struct vdo_completion *completion)
+{
+ return vio_as_data_vio(as_vio(completion));
+}
+
+/* Convert a data_vio's waiter field back to the data_vio; a NULL waiter yields NULL. */
+static inline struct data_vio *vdo_waiter_as_data_vio(struct vdo_waiter *waiter)
+{
+	return (waiter == NULL) ? NULL : container_of(waiter, struct data_vio, waiter);
+}
+
+/*
+ * Convert one of a data_vio's two reference updaters back to the data_vio. The updater's
+ * increment flag determines which of the two embedded updaters it is taken to be.
+ */
+static inline struct data_vio *data_vio_from_reference_updater(struct reference_updater *updater)
+{
+	if (!updater->increment)
+		return container_of(updater, struct data_vio, decrement_updater);
+
+	return container_of(updater, struct data_vio, increment_updater);
+}
+
+/* Check whether a data_vio is on a list via its write_entry (i.e. write_entry is not empty). */
+static inline bool data_vio_has_flush_generation_lock(struct data_vio *data_vio)
+{
+ return !list_empty(&data_vio->write_entry);
+}
+
+/* Get the vdo to which a data_vio belongs, as recorded in its vio's completion. */
+static inline struct vdo *vdo_from_data_vio(struct data_vio *data_vio)
+{
+ return data_vio->vio.completion.vdo;
+}
+
+/* Check whether a data_vio has an allocated block (its allocation pbn is not the zero block). */
+static inline bool data_vio_has_allocation(struct data_vio *data_vio)
+{
+ return (data_vio->allocation.pbn != VDO_ZERO_BLOCK);
+}
+
+/* Accessors for the compression status of a data_vio. */
+struct data_vio_compression_status __must_check
+advance_data_vio_compression_stage(struct data_vio *data_vio);
+struct data_vio_compression_status __must_check
+get_data_vio_compression_status(struct data_vio *data_vio);
+bool cancel_data_vio_compression(struct data_vio *data_vio);
+
+/* The pool of data_vios; the structure itself is opaque to users of this header. */
+struct data_vio_pool;
+
+int make_data_vio_pool(struct vdo *vdo, data_vio_count_t pool_size,
+ data_vio_count_t discard_limit, struct data_vio_pool **pool_ptr);
+void free_data_vio_pool(struct data_vio_pool *pool);
+void vdo_launch_bio(struct data_vio_pool *pool, struct bio *bio);
+void drain_data_vio_pool(struct data_vio_pool *pool, struct vdo_completion *completion);
+void resume_data_vio_pool(struct data_vio_pool *pool, struct vdo_completion *completion);
+
+/* Pool debugging and statistics. */
+void dump_data_vio_pool(struct data_vio_pool *pool, bool dump_vios);
+data_vio_count_t get_data_vio_pool_active_discards(struct data_vio_pool *pool);
+data_vio_count_t get_data_vio_pool_discard_limit(struct data_vio_pool *pool);
+data_vio_count_t get_data_vio_pool_maximum_discards(struct data_vio_pool *pool);
+int __must_check set_data_vio_pool_discard_limit(struct data_vio_pool *pool,
+ data_vio_count_t limit);
+data_vio_count_t get_data_vio_pool_active_requests(struct data_vio_pool *pool);
+data_vio_count_t get_data_vio_pool_request_limit(struct data_vio_pool *pool);
+data_vio_count_t get_data_vio_pool_maximum_requests(struct data_vio_pool *pool);
+
+/* The normal completion callback and the default error handler for data_vios. */
+void complete_data_vio(struct vdo_completion *completion);
+void handle_data_vio_error(struct vdo_completion *completion);
+
+/* Continue processing a data_vio by launching its vio's completion. */
+static inline void continue_data_vio(struct data_vio *data_vio)
+{
+ vdo_launch_completion(&data_vio->vio.completion);
+}
+
+/**
+ * continue_data_vio_with_error() - Set an error code and then continue processing a data_vio.
+ * @data_vio: The data_vio to continue.
+ * @result: The result code to set on the data_vio's completion.
+ *
+ * This will not mask older errors. This function can be called with a success code, but it is more
+ * efficient to call continue_data_vio() if the caller knows the result was a success.
+ */
+static inline void continue_data_vio_with_error(struct data_vio *data_vio, int result)
+{
+ vdo_continue_completion(&data_vio->vio.completion, result);
+}
+
+/* NOTE(review): presumably names the data_vio's last_async_operation -- confirm in data-vio.c */
+const char * __must_check get_data_vio_operation_name(struct data_vio *data_vio);
+
+/*
+ * Log an assertion failure if the current callback thread is not the thread of the data_vio's
+ * hash zone. The data_vio must already have a hash zone.
+ */
+static inline void assert_data_vio_in_hash_zone(struct data_vio *data_vio)
+{
+ thread_id_t expected = data_vio->hash_zone->thread_id;
+ thread_id_t thread_id = vdo_get_callback_thread_id();
+ /*
+ * It's odd to use the LBN, but converting the record name to hex is a bit clunky for an
+ * inline, and the LBN better than nothing as an identifier.
+ */
+ VDO_ASSERT_LOG_ONLY((expected == thread_id),
+ "data_vio for logical block %llu on thread %u, should be on hash zone thread %u",
+ (unsigned long long) data_vio->logical.lbn, thread_id, expected);
+}
+
+/*
+ * Set the callback of a data_vio's completion, to be run on the thread of the data_vio's hash
+ * zone. The completion is not launched.
+ */
+static inline void set_data_vio_hash_zone_callback(struct data_vio *data_vio,
+ vdo_action_fn callback)
+{
+ vdo_set_completion_callback(&data_vio->vio.completion, callback,
+ data_vio->hash_zone->thread_id);
+}
+
+/**
+ * launch_data_vio_hash_zone_callback() - Register a hash zone callback on a data_vio and launch
+ *                                        its completion immediately.
+ * @data_vio: The data_vio to launch.
+ * @callback: The function to register for the hash zone thread.
+ */
+static inline void launch_data_vio_hash_zone_callback(struct data_vio *data_vio,
+						      vdo_action_fn callback)
+{
+	set_data_vio_hash_zone_callback(data_vio, callback);
+	continue_data_vio(data_vio);
+}
+
+/**
+ * assert_data_vio_in_logical_zone() - Check that the current callback thread is the thread of
+ *                                     the data_vio's logical zone.
+ * @data_vio: The data_vio to check.
+ *
+ * A mismatch is reported through VDO_ASSERT_LOG_ONLY(), identifying the data_vio by its LBN.
+ */
+static inline void assert_data_vio_in_logical_zone(struct data_vio *data_vio)
+{
+ thread_id_t expected = data_vio->logical.zone->thread_id;
+ thread_id_t thread_id = vdo_get_callback_thread_id();
+
+ VDO_ASSERT_LOG_ONLY((expected == thread_id),
+ "data_vio for logical block %llu on thread %u, should be on thread %u",
+ (unsigned long long) data_vio->logical.lbn, thread_id, expected);
+}
+
+/**
+ * set_data_vio_logical_callback() - Register a callback on a data_vio's completion, targeted at
+ *                                   the thread of the data_vio's logical zone.
+ * @data_vio: The data_vio whose completion is updated.
+ * @callback: The function to register.
+ */
+static inline void set_data_vio_logical_callback(struct data_vio *data_vio,
+						 vdo_action_fn callback)
+{
+	thread_id_t zone_thread = data_vio->logical.zone->thread_id;
+
+	vdo_set_completion_callback(&data_vio->vio.completion, callback, zone_thread);
+}
+
+/**
+ * launch_data_vio_logical_callback() - Register a logical zone callback on a data_vio and launch
+ *                                      its completion immediately.
+ * @data_vio: The data_vio to launch.
+ * @callback: The function to register for the logical zone thread.
+ */
+static inline void launch_data_vio_logical_callback(struct data_vio *data_vio,
+						    vdo_action_fn callback)
+{
+	set_data_vio_logical_callback(data_vio, callback);
+	continue_data_vio(data_vio);
+}
+
+/**
+ * assert_data_vio_in_allocated_zone() - Check that the current callback thread is the thread of
+ *                                       the zone of the data_vio's allocation.
+ * @data_vio: The data_vio to check.
+ *
+ * A mismatch is reported through VDO_ASSERT_LOG_ONLY(), identifying the data_vio by its allocated
+ * PBN.
+ */
+static inline void assert_data_vio_in_allocated_zone(struct data_vio *data_vio)
+{
+ thread_id_t expected = data_vio->allocation.zone->thread_id;
+ thread_id_t thread_id = vdo_get_callback_thread_id();
+
+ VDO_ASSERT_LOG_ONLY((expected == thread_id),
+ "struct data_vio for allocated physical block %llu on thread %u, should be on thread %u",
+ (unsigned long long) data_vio->allocation.pbn, thread_id,
+ expected);
+}
+
+/**
+ * set_data_vio_allocated_zone_callback() - Register a callback on a data_vio's completion,
+ *                                          targeted at the thread of its allocation's zone.
+ * @data_vio: The data_vio whose completion is updated.
+ * @callback: The function to register.
+ */
+static inline void set_data_vio_allocated_zone_callback(struct data_vio *data_vio,
+							vdo_action_fn callback)
+{
+	thread_id_t zone_thread = data_vio->allocation.zone->thread_id;
+
+	vdo_set_completion_callback(&data_vio->vio.completion, callback, zone_thread);
+}
+
+/**
+ * launch_data_vio_allocated_zone_callback() - Register a callback for the data_vio's allocated
+ *                                             zone and launch its completion immediately.
+ * @data_vio: The data_vio to launch.
+ * @callback: The function to register for the allocated zone's thread.
+ */
+static inline void launch_data_vio_allocated_zone_callback(struct data_vio *data_vio,
+							   vdo_action_fn callback)
+{
+	set_data_vio_allocated_zone_callback(data_vio, callback);
+	continue_data_vio(data_vio);
+}
+
+/**
+ * assert_data_vio_in_duplicate_zone() - Check that the current callback thread is the thread of
+ *                                       the zone of the data_vio's duplicate location.
+ * @data_vio: The data_vio to check.
+ *
+ * A mismatch is reported through VDO_ASSERT_LOG_ONLY(), identifying the data_vio by its duplicate
+ * PBN.
+ */
+static inline void assert_data_vio_in_duplicate_zone(struct data_vio *data_vio)
+{
+ thread_id_t expected = data_vio->duplicate.zone->thread_id;
+ thread_id_t thread_id = vdo_get_callback_thread_id();
+
+ VDO_ASSERT_LOG_ONLY((expected == thread_id),
+ "data_vio for duplicate physical block %llu on thread %u, should be on thread %u",
+ (unsigned long long) data_vio->duplicate.pbn, thread_id,
+ expected);
+}
+
+/**
+ * set_data_vio_duplicate_zone_callback() - Register a callback on a data_vio's completion,
+ *                                          targeted at the thread of its duplicate's zone.
+ * @data_vio: The data_vio whose completion is updated.
+ * @callback: The function to register.
+ */
+static inline void set_data_vio_duplicate_zone_callback(struct data_vio *data_vio,
+							vdo_action_fn callback)
+{
+	thread_id_t zone_thread = data_vio->duplicate.zone->thread_id;
+
+	vdo_set_completion_callback(&data_vio->vio.completion, callback, zone_thread);
+}
+
+/**
+ * launch_data_vio_duplicate_zone_callback() - Register a callback for the data_vio's duplicate
+ *                                             zone and launch its completion immediately.
+ * @data_vio: The data_vio to launch.
+ * @callback: The function to register for the duplicate zone's thread.
+ */
+static inline void launch_data_vio_duplicate_zone_callback(struct data_vio *data_vio,
+							   vdo_action_fn callback)
+{
+	set_data_vio_duplicate_zone_callback(data_vio, callback);
+	continue_data_vio(data_vio);
+}
+
+/**
+ * assert_data_vio_in_mapped_zone() - Check that the current callback thread is the thread of the
+ *                                    zone of the data_vio's mapped location.
+ * @data_vio: The data_vio to check.
+ *
+ * A mismatch is reported through VDO_ASSERT_LOG_ONLY(), identifying the data_vio by its mapped
+ * PBN.
+ */
+static inline void assert_data_vio_in_mapped_zone(struct data_vio *data_vio)
+{
+ thread_id_t expected = data_vio->mapped.zone->thread_id;
+ thread_id_t thread_id = vdo_get_callback_thread_id();
+
+ VDO_ASSERT_LOG_ONLY((expected == thread_id),
+ "data_vio for mapped physical block %llu on thread %u, should be on thread %u",
+ (unsigned long long) data_vio->mapped.pbn, thread_id, expected);
+}
+
+/**
+ * set_data_vio_mapped_zone_callback() - Register a callback on a data_vio's completion, targeted
+ *                                       at the thread of its mapped location's zone.
+ * @data_vio: The data_vio whose completion is updated.
+ * @callback: The function to register.
+ */
+static inline void set_data_vio_mapped_zone_callback(struct data_vio *data_vio,
+						     vdo_action_fn callback)
+{
+	thread_id_t zone_thread = data_vio->mapped.zone->thread_id;
+
+	vdo_set_completion_callback(&data_vio->vio.completion, callback, zone_thread);
+}
+
+/**
+ * assert_data_vio_in_new_mapped_zone() - Check that the current callback thread is the thread of
+ *                                        the zone of the data_vio's new_mapped location.
+ * @data_vio: The data_vio to check.
+ *
+ * A mismatch is reported through VDO_ASSERT_LOG_ONLY(), identifying the data_vio by its
+ * new_mapped PBN.
+ */
+static inline void assert_data_vio_in_new_mapped_zone(struct data_vio *data_vio)
+{
+ thread_id_t expected = data_vio->new_mapped.zone->thread_id;
+ thread_id_t thread_id = vdo_get_callback_thread_id();
+
+ VDO_ASSERT_LOG_ONLY((expected == thread_id),
+ "data_vio for new_mapped physical block %llu on thread %u, should be on thread %u",
+ (unsigned long long) data_vio->new_mapped.pbn, thread_id,
+ expected);
+}
+
+/**
+ * set_data_vio_new_mapped_zone_callback() - Register a callback on a data_vio's completion,
+ *                                           targeted at the thread of its new_mapped zone.
+ * @data_vio: The data_vio whose completion is updated.
+ * @callback: The function to register.
+ */
+static inline void set_data_vio_new_mapped_zone_callback(struct data_vio *data_vio,
+							 vdo_action_fn callback)
+{
+	thread_id_t zone_thread = data_vio->new_mapped.zone->thread_id;
+
+	vdo_set_completion_callback(&data_vio->vio.completion, callback, zone_thread);
+}
+
+/**
+ * assert_data_vio_in_journal_zone() - Check that the current callback thread is the journal
+ *                                     thread from the thread_config of the data_vio's vdo.
+ * @data_vio: The data_vio to check.
+ *
+ * A mismatch is reported through VDO_ASSERT_LOG_ONLY(), identifying the data_vio by its LBN.
+ */
+static inline void assert_data_vio_in_journal_zone(struct data_vio *data_vio)
+{
+ thread_id_t journal_thread = vdo_from_data_vio(data_vio)->thread_config.journal_thread;
+ thread_id_t thread_id = vdo_get_callback_thread_id();
+
+ VDO_ASSERT_LOG_ONLY((journal_thread == thread_id),
+ "data_vio for logical block %llu on thread %u, should be on journal thread %u",
+ (unsigned long long) data_vio->logical.lbn, thread_id,
+ journal_thread);
+}
+
+/**
+ * set_data_vio_journal_callback() - Register a callback on a data_vio's completion, targeted at
+ *                                   the journal thread of the data_vio's vdo.
+ * @data_vio: The data_vio whose completion is updated.
+ * @callback: The function to register.
+ */
+static inline void set_data_vio_journal_callback(struct data_vio *data_vio,
+						 vdo_action_fn callback)
+{
+	struct vdo_completion *completion = &data_vio->vio.completion;
+
+	vdo_set_completion_callback(completion, callback,
+				    vdo_from_data_vio(data_vio)->thread_config.journal_thread);
+}
+
+/**
+ * launch_data_vio_journal_callback() - Register a journal thread callback on a data_vio and
+ *                                      launch its completion immediately.
+ * @data_vio: The data_vio to launch.
+ * @callback: The function to register for the journal thread.
+ */
+static inline void launch_data_vio_journal_callback(struct data_vio *data_vio,
+						    vdo_action_fn callback)
+{
+	set_data_vio_journal_callback(data_vio, callback);
+	continue_data_vio(data_vio);
+}
+
+/**
+ * assert_data_vio_in_packer_zone() - Check that the current callback thread is the packer thread
+ *                                    from the thread_config of the data_vio's vdo.
+ * @data_vio: The data_vio to check.
+ *
+ * A mismatch is reported through VDO_ASSERT_LOG_ONLY(), identifying the data_vio by its LBN.
+ */
+static inline void assert_data_vio_in_packer_zone(struct data_vio *data_vio)
+{
+ thread_id_t packer_thread = vdo_from_data_vio(data_vio)->thread_config.packer_thread;
+ thread_id_t thread_id = vdo_get_callback_thread_id();
+
+ VDO_ASSERT_LOG_ONLY((packer_thread == thread_id),
+ "data_vio for logical block %llu on thread %u, should be on packer thread %u",
+ (unsigned long long) data_vio->logical.lbn, thread_id,
+ packer_thread);
+}
+
+/**
+ * set_data_vio_packer_callback() - Register a callback on a data_vio's completion, targeted at
+ *                                  the packer thread of the data_vio's vdo.
+ * @data_vio: The data_vio whose completion is updated.
+ * @callback: The function to register.
+ */
+static inline void set_data_vio_packer_callback(struct data_vio *data_vio,
+						vdo_action_fn callback)
+{
+	struct vdo_completion *completion = &data_vio->vio.completion;
+
+	vdo_set_completion_callback(completion, callback,
+				    vdo_from_data_vio(data_vio)->thread_config.packer_thread);
+}
+
+/**
+ * launch_data_vio_packer_callback() - Register a packer thread callback on a data_vio and launch
+ *                                     its completion immediately.
+ * @data_vio: The data_vio to launch.
+ * @callback: The function to register for the packer thread.
+ */
+static inline void launch_data_vio_packer_callback(struct data_vio *data_vio,
+						   vdo_action_fn callback)
+{
+	set_data_vio_packer_callback(data_vio, callback);
+	continue_data_vio(data_vio);
+}
+
+/**
+ * assert_data_vio_on_cpu_thread() - Check that the current callback thread is the cpu thread from
+ *                                   the thread_config of the data_vio's vdo.
+ * @data_vio: The data_vio to check.
+ *
+ * A mismatch is reported through VDO_ASSERT_LOG_ONLY(), identifying the data_vio by its LBN.
+ */
+static inline void assert_data_vio_on_cpu_thread(struct data_vio *data_vio)
+{
+ thread_id_t cpu_thread = vdo_from_data_vio(data_vio)->thread_config.cpu_thread;
+ thread_id_t thread_id = vdo_get_callback_thread_id();
+
+ VDO_ASSERT_LOG_ONLY((cpu_thread == thread_id),
+ "data_vio for logical block %llu on thread %u, should be on cpu thread %u",
+ (unsigned long long) data_vio->logical.lbn, thread_id,
+ cpu_thread);
+}
+
+/**
+ * set_data_vio_cpu_callback() - Register a callback on a data_vio's completion, targeted at the
+ *                               cpu thread of the data_vio's vdo.
+ * @data_vio: The data_vio whose completion is updated.
+ * @callback: The function to register.
+ */
+static inline void set_data_vio_cpu_callback(struct data_vio *data_vio,
+					     vdo_action_fn callback)
+{
+	struct vdo_completion *completion = &data_vio->vio.completion;
+
+	vdo_set_completion_callback(completion, callback,
+				    vdo_from_data_vio(data_vio)->thread_config.cpu_thread);
+}
+
+/**
+ * launch_data_vio_cpu_callback() - Register a cpu thread callback on a data_vio and launch its
+ *                                  completion immediately at the given priority.
+ * @data_vio: The data_vio to launch.
+ * @callback: The function to register for the cpu thread.
+ * @priority: The priority passed to vdo_launch_completion_with_priority().
+ */
+static inline void launch_data_vio_cpu_callback(struct data_vio *data_vio,
+						vdo_action_fn callback,
+						enum vdo_completion_priority priority)
+{
+	struct vdo_completion *completion = &data_vio->vio.completion;
+
+	set_data_vio_cpu_callback(data_vio, callback);
+	vdo_launch_completion_with_priority(completion, priority);
+}
+
+/**
+ * set_data_vio_bio_zone_callback() - Register a callback on a data_vio's completion, targeted at
+ *                                    the thread reported by get_vio_bio_zone_thread_id() for its
+ *                                    vio.
+ * @data_vio: The data_vio whose completion is updated.
+ * @callback: The function to register.
+ */
+static inline void set_data_vio_bio_zone_callback(struct data_vio *data_vio,
+						  vdo_action_fn callback)
+{
+	thread_id_t bio_thread = get_vio_bio_zone_thread_id(&data_vio->vio);
+
+	vdo_set_completion_callback(&data_vio->vio.completion, callback, bio_thread);
+}
+
+/**
+ * launch_data_vio_bio_zone_callback() - Register a bio zone callback on a data_vio and launch its
+ *                                       completion immediately at BIO_Q_DATA_PRIORITY.
+ * @data_vio: The data_vio to launch.
+ * @callback: The function to register for the bio zone thread.
+ */
+static inline void launch_data_vio_bio_zone_callback(struct data_vio *data_vio,
+						     vdo_action_fn callback)
+{
+	struct vdo_completion *completion = &data_vio->vio.completion;
+
+	set_data_vio_bio_zone_callback(data_vio, callback);
+	vdo_launch_completion_with_priority(completion, BIO_Q_DATA_PRIORITY);
+}
+
+/**
+ * launch_data_vio_on_bio_ack_queue() - Run a callback for a data_vio on the bio_ack thread when
+ *                                      the vdo uses a bio_ack queue, or directly on the current
+ *                                      thread when it does not.
+ * @data_vio: The data_vio whose completion is passed to the callback.
+ * @callback: The function to run.
+ */
+static inline void launch_data_vio_on_bio_ack_queue(struct data_vio *data_vio,
+						    vdo_action_fn callback)
+{
+	struct vdo_completion *completion = &data_vio->vio.completion;
+	struct vdo *vdo = completion->vdo;
+
+	if (vdo_uses_bio_ack_queue(vdo)) {
+		vdo_set_completion_callback(completion, callback,
+					    vdo->thread_config.bio_ack_thread);
+		vdo_launch_completion_with_priority(completion, BIO_ACK_Q_ACK_PRIORITY);
+	} else {
+		/* No bio_ack queue: invoke the callback synchronously, right here. */
+		callback(completion);
+	}
+}
+
+void data_vio_allocate_data_block(struct data_vio *data_vio,
+ enum pbn_lock_type write_lock_type,
+ vdo_action_fn callback, vdo_action_fn error_handler);
+
+void release_data_vio_allocation_lock(struct data_vio *data_vio, bool reset);
+
+int __must_check uncompress_data_vio(struct data_vio *data_vio,
+ enum block_mapping_state mapping_state,
+ char *buffer);
+
+void update_metadata_for_data_vio_write(struct data_vio *data_vio,
+ struct pbn_lock *lock);
+void write_data_vio(struct data_vio *data_vio);
+void launch_compress_data_vio(struct data_vio *data_vio);
+void continue_data_vio_with_block_map_slot(struct vdo_completion *completion);
+
+#endif /* DATA_VIO_H */
diff --git a/drivers/md/dm-vdo/dedupe.c b/drivers/md/dm-vdo/dedupe.c
new file mode 100644
index 000000000000..117266e1b3ae
--- /dev/null
+++ b/drivers/md/dm-vdo/dedupe.c
@@ -0,0 +1,3003 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2023 Red Hat
+ */
+
+/**
+ * DOC:
+ *
+ * Hash Locks:
+ *
+ * A hash_lock controls and coordinates writing, index access, and dedupe among groups of data_vios
+ * concurrently writing identical blocks, allowing them to deduplicate not only against advice but
+ * also against each other. This saves on index queries and allows those data_vios to concurrently
+ * deduplicate against a single block instead of being serialized through a PBN read lock. Only one
+ * index query is needed for each hash_lock, instead of one for every data_vio.
+ *
+ * Hash_locks are assigned to hash_zones by computing a modulus on the hash itself. Each hash_zone
+ * has a single dedicated queue and thread for performing all operations on the hash_locks assigned
+ * to that zone. The concurrency guarantees of this single-threaded model allow the code to omit
+ * more fine-grained locking for the hash_lock structures.
+ *
+ * A hash_lock acts like a state machine perhaps more than as a lock. Other than the starting and
+ * ending states INITIALIZING and BYPASSING, every state represents and is held for the duration of
+ * an asynchronous operation. All state transitions are performed on the thread of the hash_zone
+ * containing the lock. An asynchronous operation is almost always performed upon entering a state,
+ * and the callback from that operation triggers exiting the state and entering a new state.
+ *
+ * In all states except DEDUPING, there is a single data_vio, called the lock agent, performing the
+ * asynchronous operations on behalf of the lock. The agent will change during the lifetime of the
+ * lock if the lock is shared by more than one data_vio. data_vios waiting to deduplicate are kept
+ * on a wait queue. Viewed a different way, the agent holds the lock exclusively until the lock
+ * enters the DEDUPING state, at which point it becomes a shared lock that all the waiters (and any
+ * new data_vios that arrive) use to share a PBN lock. In state DEDUPING, there is no agent. When
+ * the last data_vio in the lock calls back in DEDUPING, it becomes the agent and the lock becomes
+ * exclusive again. New data_vios that arrive in the lock will also go on the wait queue.
+ *
+ * The existence of lock waiters is a key factor controlling which state the lock transitions to
+ * next. When the lock is new or has waiters, it will always try to reach DEDUPING, and when it
+ * doesn't, it will try to clean up and exit.
+ *
+ * Deduping requires holding a PBN lock on a block that is known to contain data identical to the
+ * data_vios in the lock, so the lock will send the agent to the duplicate zone to acquire the PBN
+ * lock (LOCKING), to the kernel I/O threads to read and verify the data (VERIFYING), or to write a
+ * new copy of the data to a full data block or a slot in a compressed block (WRITING).
+ *
+ * Cleaning up consists of updating the index when the data location is different from the initial
+ * index query (UPDATING, triggered by stale advice, compression, and rollover), releasing the PBN
+ * lock on the duplicate block (UNLOCKING), and if the agent is the last data_vio referencing the
+ * lock, releasing the hash_lock itself back to the hash zone (BYPASSING).
+ *
+ * The shortest sequence of states is for non-concurrent writes of new data:
+ * INITIALIZING -> QUERYING -> WRITING -> BYPASSING
+ * This sequence is short because no PBN read lock or index update is needed.
+ *
+ * Non-concurrent, finding valid advice looks like this (endpoints elided):
+ * -> QUERYING -> LOCKING -> VERIFYING -> DEDUPING -> UNLOCKING ->
+ * Or with stale advice (endpoints elided):
+ * -> QUERYING -> LOCKING -> VERIFYING -> UNLOCKING -> WRITING -> UPDATING ->
+ *
+ * When there are not enough reference count increments available on a PBN for a data_vio
+ * to deduplicate, a new lock is forked and the excess waiters roll over to the new lock (which
+ * goes directly to WRITING). The new lock takes the place of the old lock in the lock map so new
+ * data_vios will be directed to it. The two locks will proceed independently, but only the new
+ * lock will have the right to update the index (unless it also forks).
+ *
+ * Since rollover happens in a lock instance, once a valid data location has been selected, it will
+ * not change. QUERYING and WRITING are only performed once per lock lifetime. All other
+ * non-endpoint states can be re-entered.
+ *
+ * The function names in this module follow a convention referencing the states and transitions in
+ * the state machine. For example, for the LOCKING state, there are start_locking() and
+ * finish_locking() functions. start_locking() is invoked by the finish function of the state (or
+ * states) that transition to LOCKING. It performs the actual lock state change and must be invoked
+ * on the hash zone thread. finish_locking() is called by (or continued via callback from) the
+ * code actually obtaining the lock. It does any bookkeeping or decision-making required and
+ * invokes the appropriate start function of the state being transitioned to after LOCKING.
+ *
+ * ----------------------------------------------------------------------
+ *
+ * Index Queries:
+ *
+ * A query to the UDS index is handled asynchronously by the index's threads. When the query is
+ * complete, a callback supplied with the query will be called from one of those threads. Under
+ * heavy system load, the index may be slower to respond than is desirable for reasonable I/O
+ * throughput. Since deduplication of writes is not necessary for correct operation of a VDO
+ * device, it is acceptable to time out slow index queries and proceed to fulfill a write
+ * request without deduplicating. However, because the uds_request struct itself is supplied by the
+ * caller, we cannot simply reuse a uds_request object which we have chosen to time out. Hence,
+ * each hash_zone maintains a pool of dedupe_contexts which each contain a uds_request along with a
+ * reference to the data_vio on behalf of which they are performing a query.
+ *
+ * When a hash_lock needs to query the index, it attempts to acquire an unused dedupe_context from
+ * its hash_zone's pool. If one is available, that context is prepared, associated with the
+ * hash_lock's agent, added to the list of pending contexts, and then sent to the index. The
+ * context's state will be transitioned from DEDUPE_CONTEXT_IDLE to DEDUPE_CONTEXT_PENDING. If all
+ * goes well, the dedupe callback will be called by the index which will change the context's state
+ * to DEDUPE_CONTEXT_COMPLETE, and the associated data_vio will be enqueued to run back in the hash
+ * zone where the query results will be processed and the context will be put back in the idle
+ * state and returned to the hash_zone's available list.
+ *
+ * The first time an index query is launched from a given hash_zone, a timer is started. When the
+ * timer fires, the hash_zone's completion is enqueued to run in the hash_zone where the zone's
+ * pending list will be searched for any contexts in the pending state which have been running for
+ * too long. Those contexts are transitioned to the DEDUPE_CONTEXT_TIMED_OUT state and moved to the
+ * zone's timed_out list (where they won't be examined again if there is a subsequent time out). The
+ * data_vios associated with timed out contexts are sent to continue processing their write
+ * operation without deduplicating. The timer is also restarted.
+ *
+ * When the dedupe callback is run for a context which is in the timed out state, that context is
+ * moved to the DEDUPE_CONTEXT_TIMED_OUT_COMPLETE state. No other action need be taken as the
+ * associated data_vios have already been dispatched.
+ *
+ * If a hash_lock needs a dedupe context, and the available list is empty, the timed_out list will
+ * be searched for any contexts which are timed out and complete. One of these will be used
+ * immediately, and the rest will be returned to the available list and marked idle.
+ */
+
+#include "dedupe.h"
+
+#include <linux/atomic.h>
+#include <linux/jiffies.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/ratelimit.h>
+#include <linux/spinlock.h>
+#include <linux/timer.h>
+
+#include "logger.h"
+#include "memory-alloc.h"
+#include "numeric.h"
+#include "permassert.h"
+#include "string-utils.h"
+
+#include "indexer.h"
+
+#include "action-manager.h"
+#include "admin-state.h"
+#include "completion.h"
+#include "constants.h"
+#include "data-vio.h"
+#include "int-map.h"
+#include "io-submitter.h"
+#include "packer.h"
+#include "physical-zone.h"
+#include "slab-depot.h"
+#include "statistics.h"
+#include "types.h"
+#include "vdo.h"
+#include "wait-queue.h"
+
+/*
+ * Pairs an attribute with a function that produces a string for a hash_zones.
+ * NOTE(review): presumably used for sysfs-style reporting -- confirm against the users of this
+ * struct.
+ */
+struct uds_attribute {
+ /* The embedded attribute */
+ struct attribute attr;
+ /* Returns a string describing the given hash_zones */
+ const char *(*show_string)(struct hash_zones *hash_zones);
+};
+
+/*
+ * States of a dedupe query timer.
+ * NOTE(review): presumably the values held in a hash_zone's timer_state and swapped by
+ * change_timer_state() -- confirm.
+ */
+#define DEDUPE_QUERY_TIMER_IDLE 0
+#define DEDUPE_QUERY_TIMER_RUNNING 1
+#define DEDUPE_QUERY_TIMER_FIRED 2
+
+/* The states of a dedupe_context; see "Index Queries" in the DOC comment at the top of this file. */
+enum dedupe_context_state {
+ /* Not in use; available for a new index query */
+ DEDUPE_CONTEXT_IDLE,
+ /* Sent to the index; the query has neither completed nor timed out */
+ DEDUPE_CONTEXT_PENDING,
+ /* Pending for too long; its data_vio has moved on, but the index has not yet called back */
+ DEDUPE_CONTEXT_TIMED_OUT,
+ /* The index called back before the context timed out */
+ DEDUPE_CONTEXT_COMPLETE,
+ /* The index called back after the context timed out; the context may now be reused */
+ DEDUPE_CONTEXT_TIMED_OUT_COMPLETE,
+};
+
+/* Possible index states: closed, opened, or transitioning between those two. */
+enum index_state {
+ /* The index is closed */
+ IS_CLOSED,
+ /* The index is transitioning between closed and opened */
+ IS_CHANGING,
+ /* The index is opened */
+ IS_OPENED,
+};
+
+static const char *CLOSED = "closed";
+static const char *CLOSING = "closing";
+static const char *ERROR = "error";
+static const char *OFFLINE = "offline";
+static const char *ONLINE = "online";
+static const char *OPENING = "opening";
+static const char *SUSPENDED = "suspended";
+static const char *UNKNOWN = "unknown";
+
+/* Version 2 uses the kernel space UDS index and is limited to 16 bytes */
+#define UDS_ADVICE_VERSION 2
+/* version byte + state byte + 64-bit little-endian PBN, i.e. 1 + 1 + 8 = 10 bytes */
+#define UDS_ADVICE_SIZE (1 + 1 + sizeof(u64))
+
+/*
+ * The states of the hash_lock state machine described under "Hash Locks" in the DOC comment at
+ * the top of this file. LOCK_STATE_NAMES must have one entry per state.
+ */
+enum hash_lock_state {
+ /* State for locks that are not in use or are being initialized. */
+ VDO_HASH_LOCK_INITIALIZING,
+
+ /* This is the sequence of states typically used on the non-dedupe path. */
+ VDO_HASH_LOCK_QUERYING,
+ VDO_HASH_LOCK_WRITING,
+ VDO_HASH_LOCK_UPDATING,
+
+ /* The remaining states are typically used on the dedupe path in this order. */
+ VDO_HASH_LOCK_LOCKING,
+ VDO_HASH_LOCK_VERIFYING,
+ VDO_HASH_LOCK_DEDUPING,
+ VDO_HASH_LOCK_UNLOCKING,
+
+ /*
+ * Terminal state for locks returning to the pool. Must be last both because it's the final
+ * state, and also because it's used to count the states.
+ */
+ VDO_HASH_LOCK_BYPASSING,
+};
+
+/* Printable names for each hash_lock_state, listed in the order the enum declares the states. */
+static const char * const LOCK_STATE_NAMES[] = {
+	[VDO_HASH_LOCK_INITIALIZING] = "INITIALIZING",
+	[VDO_HASH_LOCK_QUERYING] = "QUERYING",
+	[VDO_HASH_LOCK_WRITING] = "WRITING",
+	[VDO_HASH_LOCK_UPDATING] = "UPDATING",
+	[VDO_HASH_LOCK_LOCKING] = "LOCKING",
+	[VDO_HASH_LOCK_VERIFYING] = "VERIFYING",
+	[VDO_HASH_LOCK_DEDUPING] = "DEDUPING",
+	[VDO_HASH_LOCK_UNLOCKING] = "UNLOCKING",
+	[VDO_HASH_LOCK_BYPASSING] = "BYPASSING",
+};
+
+/*
+ * A lock shared by the data_vios concurrently writing blocks with the same record name; see
+ * "Hash Locks" in the DOC comment at the top of this file.
+ */
+struct hash_lock {
+ /* The block hash covered by this lock */
+ struct uds_record_name hash;
+
+ /* When the lock is unused, this list entry allows the lock to be pooled */
+ struct list_head pool_node;
+
+ /*
+ * A list containing the data VIOs sharing this lock, all having the same record name and
+ * data block contents, linked by their hash_lock_entry fields.
+ */
+ struct list_head duplicate_ring;
+
+ /* The number of data_vios sharing this lock instance */
+ data_vio_count_t reference_count;
+
+ /* The maximum value of reference_count in the lifetime of this lock */
+ data_vio_count_t max_references;
+
+ /* The current state of this lock */
+ enum hash_lock_state state;
+
+ /* True if the UDS index should be updated with new advice */
+ bool update_advice;
+
+ /* True if the advice has been verified to be a true duplicate */
+ bool verified;
+
+ /* True if the lock has already accounted for an initial verification */
+ bool verify_counted;
+
+ /* True if this lock is registered in the lock map (cleared on rollover) */
+ bool registered;
+
+ /*
+ * If verified is false, this is the location of a possible duplicate. If verified is true,
+ * it is the verified location of a true duplicate.
+ */
+ struct zoned_pbn duplicate;
+
+ /* The PBN lock on the block containing the duplicate data */
+ struct pbn_lock *duplicate_lock;
+
+ /* The data_vio designated to act on behalf of the lock */
+ struct data_vio *agent;
+
+ /*
+ * Other data_vios with data identical to the agent who are currently waiting for the agent
+ * to get the information they all need to deduplicate--either against each other, or
+ * against an existing duplicate on disk.
+ */
+ struct vdo_wait_queue waiters;
+};
+
+/* NOTE(review): presumably the number of hash_locks kept in each hash zone's pool -- confirm. */
+#define LOCK_POOL_CAPACITY MAXIMUM_VDO_USER_VIOS
+
+/* The state shared by all hash zones, followed by the hash zones themselves. */
+struct hash_zones {
+ struct action_manager *manager;
+ struct uds_parameters parameters;
+ struct uds_index_session *index_session;
+ struct ratelimit_state ratelimiter;
+ atomic64_t timeouts;
+ atomic64_t dedupe_context_busy;
+
+ /* This spinlock protects the state fields and the starting of dedupe requests. */
+ spinlock_t lock;
+
+ /* The fields in the next block are all protected by the lock */
+ struct vdo_completion completion;
+ enum index_state index_state;
+ enum index_state index_target;
+ struct admin_state state;
+ bool changing;
+ bool create_flag;
+ bool dedupe_flag;
+ bool error_flag;
+ u64 reported_timeouts;
+
+ /* The number of zones */
+ zone_count_t zone_count;
+ /* The hash zones themselves (flexible array; presumably zone_count entries -- confirm) */
+ struct hash_zone zones[];
+};
+
+/* These are in milliseconds. They are not static, so other files can reference them. */
+unsigned int vdo_dedupe_index_timeout_interval = 5000;
+unsigned int vdo_dedupe_index_min_timer_interval = 100;
+/* Same two variables, in jiffies for easier consumption. */
+static u64 vdo_dedupe_index_timeout_jiffies;
+static u64 vdo_dedupe_index_min_timer_jiffies;
+
+/**
+ * as_hash_zone() - Convert a completion to the hash_zone which contains it.
+ * @completion: A completion expected to be of type VDO_HASH_ZONE_COMPLETION.
+ *
+ * Return: The hash_zone in which the completion is embedded.
+ */
+static inline struct hash_zone *as_hash_zone(struct vdo_completion *completion)
+{
+	struct hash_zone *zone;
+
+	vdo_assert_completion_type(completion, VDO_HASH_ZONE_COMPLETION);
+	zone = container_of(completion, struct hash_zone, completion);
+	return zone;
+}
+
+/**
+ * as_hash_zones() - Convert a completion to the hash_zones which contains it.
+ * @completion: A completion expected to be of type VDO_HASH_ZONES_COMPLETION.
+ *
+ * Return: The hash_zones in which the completion is embedded.
+ */
+static inline struct hash_zones *as_hash_zones(struct vdo_completion *completion)
+{
+	struct hash_zones *zones;
+
+	vdo_assert_completion_type(completion, VDO_HASH_ZONES_COMPLETION);
+	zones = container_of(completion, struct hash_zones, completion);
+	return zones;
+}
+
+/**
+ * assert_in_hash_zone() - Check that the current callback thread is the thread of a hash zone.
+ * @zone: The hash zone whose thread is expected.
+ * @name: The name of the calling function, used in the assertion message.
+ *
+ * A mismatch is reported through VDO_ASSERT_LOG_ONLY().
+ */
+static inline void assert_in_hash_zone(struct hash_zone *zone, const char *name)
+{
+ VDO_ASSERT_LOG_ONLY((vdo_get_callback_thread_id() == zone->thread_id),
+ "%s called on hash zone thread", name);
+}
+
+/**
+ * change_context_state() - Atomically move a dedupe_context from one state to another.
+ * @context: The context to update.
+ * @old: The state the context must currently be in.
+ * @new: The state to install.
+ *
+ * Return: true if the context was in the old state and has been switched to the new one.
+ */
+static inline bool change_context_state(struct dedupe_context *context, int old, int new)
+{
+	int previous = atomic_cmpxchg(&context->state, old, new);
+
+	return (previous == old);
+}
+
+/**
+ * change_timer_state() - Atomically move a hash zone's timer_state from one value to another.
+ * @zone: The hash zone whose timer_state is updated.
+ * @old: The value timer_state must currently hold.
+ * @new: The value to install.
+ *
+ * Return: true if timer_state held the old value and has been switched to the new one.
+ */
+static inline bool change_timer_state(struct hash_zone *zone, int old, int new)
+{
+	int previous = atomic_cmpxchg(&zone->timer_state, old, new);
+
+	return (previous == old);
+}
+
+/**
+ * return_hash_lock_to_pool() - Reset a hash lock to its pristine state and put it back at the
+ *                              tail of its zone's lock pool.
+ * @zone: The zone from which the lock was borrowed.
+ * @lock: The lock that is no longer in use.
+ */
+static void return_hash_lock_to_pool(struct hash_zone *zone, struct hash_lock *lock)
+{
+	/* Wipe everything first; the list heads and wait queue are then re-initialized. */
+	memset(lock, 0, sizeof(*lock));
+	vdo_waitq_init(&lock->waiters);
+	INIT_LIST_HEAD(&lock->duplicate_ring);
+	INIT_LIST_HEAD(&lock->pool_node);
+
+	list_add_tail(&lock->pool_node, &zone->lock_pool);
+}
+
+/**
+ * vdo_get_duplicate_lock() - Look up the PBN lock on the duplicate data location through the
+ *                            hash_lock a data_vio holds.
+ * @data_vio: The data_vio to query.
+ *
+ * Return: The duplicate_lock of the data_vio's hash_lock, or NULL if the data_vio holds no
+ *         hash_lock.
+ */
+struct pbn_lock *vdo_get_duplicate_lock(struct data_vio *data_vio)
+{
+	struct hash_lock *lock = data_vio->hash_lock;
+
+	return (lock != NULL) ? lock->duplicate_lock : NULL;
+}
+
+/**
+ * hash_lock_key() - Derive an integer key from a hash_lock's record name.
+ * @lock: The hash lock.
+ *
+ * The key is the 64-bit little-endian value read from the start of the record name.
+ *
+ * Return: The key to use for the int map.
+ */
+static inline u64 hash_lock_key(struct hash_lock *lock)
+{
+	u64 key = get_unaligned_le64(&lock->hash.name);
+
+	return key;
+}
+
+/**
+ * get_hash_lock_state_name() - Translate a hash lock state into its printable name.
+ * @state: The hash lock state.
+ *
+ * Return: The short string naming the state, or "INVALID" if the state is out of range.
+ */
+static const char *get_hash_lock_state_name(enum hash_lock_state state)
+{
+	/* Fail the build if a state is added without extending the name array. */
+	BUILD_BUG_ON((VDO_HASH_LOCK_BYPASSING + 1) != ARRAY_SIZE(LOCK_STATE_NAMES));
+
+	if (state >= ARRAY_SIZE(LOCK_STATE_NAMES))
+		return "INVALID";
+
+	return LOCK_STATE_NAMES[state];
+}
+
+/**
+ * assert_hash_lock_agent() - Assert that a data_vio is the agent of its hash lock, and that this
+ * is being called in the hash zone.
+ * @data_vio: The data_vio expected to be the lock agent; its hash_lock is dereferenced, so it
+ *            must hold one.
+ * @where: A string describing the function making the assertion.
+ */
+static void assert_hash_lock_agent(struct data_vio *data_vio, const char *where)
+{
+ /* Not safe to access the agent field except from the hash zone. */
+ assert_data_vio_in_hash_zone(data_vio);
+ VDO_ASSERT_LOG_ONLY(data_vio == data_vio->hash_lock->agent,
+ "%s must be for the hash lock agent", where);
+}
+
+/**
+ * set_duplicate_lock() - Set the duplicate lock held by a hash lock. May only be called in the
+ * physical zone of the PBN lock.
+ * @hash_lock: The hash lock to update; it is expected not to hold a duplicate lock already.
+ * @pbn_lock: The PBN read lock to use as the duplicate lock.
+ *
+ * The hash lock is counted as one more holder of the PBN lock.
+ */
+static void set_duplicate_lock(struct hash_lock *hash_lock, struct pbn_lock *pbn_lock)
+{
+ VDO_ASSERT_LOG_ONLY((hash_lock->duplicate_lock == NULL),
+ "hash lock must not already hold a duplicate lock");
+ /* Record the hash lock's share of the PBN lock before publishing the pointer. */
+ pbn_lock->holder_count += 1;
+ hash_lock->duplicate_lock = pbn_lock;
+}
+
+/**
+ * dequeue_lock_waiter() - Take the data_vio at the head of a hash lock's wait queue.
+ * @lock: The lock containing the wait queue.
+ *
+ * Return: The first (oldest) waiter in the queue, or NULL if the queue is empty.
+ */
+static inline struct data_vio *dequeue_lock_waiter(struct hash_lock *lock)
+{
+	struct vdo_waiter *waiter = vdo_waitq_dequeue_waiter(&lock->waiters);
+
+	return vdo_waiter_as_data_vio(waiter);
+}
+
+/**
+ * set_hash_lock() - Set, change, or clear the hash lock a data_vio is using.
+ * @data_vio: The data_vio to update.
+ * @new_lock: The hash lock the data_vio is joining, or NULL if it is only leaving its current
+ *            lock.
+ *
+ * Updates the hash lock (or locks) to reflect the change in membership: the data_vio is unlinked
+ * from and uncounted on its old lock (if any), then linked to and counted on the new lock (if
+ * any).
+ */
+static void set_hash_lock(struct data_vio *data_vio, struct hash_lock *new_lock)
+{
+ struct hash_lock *old_lock = data_vio->hash_lock;
+
+ /* First, leave the lock currently held, if there is one. */
+ if (old_lock != NULL) {
+ VDO_ASSERT_LOG_ONLY(data_vio->hash_zone != NULL,
+ "must have a hash zone when holding a hash lock");
+ VDO_ASSERT_LOG_ONLY(!list_empty(&data_vio->hash_lock_entry),
+ "must be on a hash lock ring when holding a hash lock");
+ VDO_ASSERT_LOG_ONLY(old_lock->reference_count > 0,
+ "hash lock reference must be counted");
+
+ if ((old_lock->state != VDO_HASH_LOCK_BYPASSING) &&
+ (old_lock->state != VDO_HASH_LOCK_UNLOCKING)) {
+ /*
+ * If the reference count goes to zero in a non-terminal state, we're most
+ * likely leaking this lock.
+ */
+ VDO_ASSERT_LOG_ONLY(old_lock->reference_count > 1,
+ "hash locks should only become unreferenced in a terminal state, not state %s",
+ get_hash_lock_state_name(old_lock->state));
+ }
+
+ list_del_init(&data_vio->hash_lock_entry);
+ old_lock->reference_count -= 1;
+
+ data_vio->hash_lock = NULL;
+ }
+
+ /* Then join the new lock, if one was supplied. */
+ if (new_lock != NULL) {
+ /*
+ * Keep all data_vios sharing the lock on a ring since they can complete in any
+ * order and we'll always need a pointer to one to compare data.
+ */
+ list_move_tail(&data_vio->hash_lock_entry, &new_lock->duplicate_ring);
+ new_lock->reference_count += 1;
+ /* Track the high-water mark of sharers over the lock's lifetime. */
+ if (new_lock->max_references < new_lock->reference_count)
+ new_lock->max_references = new_lock->reference_count;
+
+ data_vio->hash_lock = new_lock;
+ }
+}
+
+/* There are loops in the state diagram, so some forward decl's are needed. */
+static void start_deduping(struct hash_lock *lock, struct data_vio *agent,
+ bool agent_is_done);
+static void start_locking(struct hash_lock *lock, struct data_vio *agent);
+static void start_writing(struct hash_lock *lock, struct data_vio *agent);
+static void unlock_duplicate_pbn(struct vdo_completion *completion);
+static void transfer_allocation_lock(struct data_vio *data_vio);
+
+/**
+ * exit_hash_lock() - Common exit path for data_vios that have written or deduplicated and are no
+ *                    longer needed as an agent for the hash lock.
+ * @data_vio: The data_vio to complete and send to be cleaned up.
+ */
+static void exit_hash_lock(struct data_vio *data_vio)
+{
+	struct vdo_completion *completion = &data_vio->vio.completion;
+
+	/* Dropping the hash lock here saves a thread transition during cleanup. */
+	vdo_release_hash_lock(data_vio);
+
+	/* Send the data_vio down the clean-up path to release any locks it still holds. */
+	completion->callback = complete_data_vio;
+	vdo_launch_completion(completion);
+}
+
+/**
+ * set_duplicate_location() - Record the location of the duplicate block in a data_vio.
+ * @data_vio: The data_vio to modify.
+ * @source: The location of the duplicate.
+ *
+ * Copies the zoned_pbn into the data_vio's duplicate field and sets is_duplicate to whether the
+ * location's PBN differs from VDO_ZERO_BLOCK.
+ */
+static void set_duplicate_location(struct data_vio *data_vio,
+				   const struct zoned_pbn source)
+{
+	data_vio->duplicate = source;
+	data_vio->is_duplicate = (source.pbn != VDO_ZERO_BLOCK);
+}
+
+/**
+ * retire_lock_agent() - Replace the active lock agent with the first lock waiter and make the
+ *                       retired agent exit the hash lock.
+ * @lock: The hash lock to update.
+ *
+ * A newly promoted agent is given the lock's duplicate location.
+ *
+ * Return: The new lock agent (which will be NULL if there was no waiter)
+ */
+static struct data_vio *retire_lock_agent(struct hash_lock *lock)
+{
+	struct data_vio *retiring = lock->agent;
+	struct data_vio *successor = dequeue_lock_waiter(lock);
+
+	/* Install the successor before the retiring agent leaves the lock. */
+	lock->agent = successor;
+	exit_hash_lock(retiring);
+
+	if (successor == NULL)
+		return NULL;
+
+	set_duplicate_location(successor, lock->duplicate);
+	return successor;
+}
+
+/**
+ * wait_on_hash_lock() - Put a data_vio on a hash lock's queue of waiters.
+ * @lock: The hash lock on which to wait.
+ * @data_vio: The data_vio to add to the queue.
+ *
+ * If the lock is in the WRITING state and the agent's compression is successfully cancelled, the
+ * waiter is also launched to the packer to remove the agent from it.
+ */
+static void wait_on_hash_lock(struct hash_lock *lock, struct data_vio *data_vio)
+{
+	vdo_waitq_enqueue_waiter(&lock->waiters, &data_vio->waiter);
+
+	/*
+	 * The agent now has at least one other data_vio waiting on it, so it must not be left to
+	 * block indefinitely in the packer.
+	 */
+	if ((lock->state == VDO_HASH_LOCK_WRITING) &&
+	    cancel_data_vio_compression(lock->agent)) {
+		/*
+		 * Even though we're waiting, we also have to send ourselves as a one-way message
+		 * to the packer to ensure the agent continues executing. This is safe because
+		 * cancel_data_vio_compression() guarantees the agent won't continue executing
+		 * until this message arrives in the packer, and because the wait queue link isn't
+		 * used for sending the message.
+		 */
+		data_vio->compression.lock_holder = lock->agent;
+		launch_data_vio_packer_callback(data_vio, vdo_remove_lock_holder_from_packer);
+	}
+}
+
+/**
+ * abort_waiter() - waiter_callback_fn that sends a waiter off to write its own block, without
+ * any deduplication optimization.
+ * @waiter: The data_vio's waiter link.
+ * @context: Not used.
+ */
+static void abort_waiter(struct vdo_waiter *waiter, void *context __always_unused)
+{
+	struct data_vio *data_vio = vdo_waiter_as_data_vio(waiter);
+
+	write_data_vio(data_vio);
+}
+
+/**
+ * start_bypassing() - Stop using the hash lock.
+ * @lock: The hash lock.
+ * @agent: The data_vio acting as the agent for the lock.
+ *
+ * Stops using the hash lock. This is the final transition for hash locks which did not get an
+ * error. The lock is marked BYPASSING and the agent then exits the hash lock.
+ */
+static void start_bypassing(struct hash_lock *lock, struct data_vio *agent)
+{
+ /* Record the terminal state before the agent releases its hold on the lock. */
+ lock->state = VDO_HASH_LOCK_BYPASSING;
+ exit_hash_lock(agent);
+}
+
+/**
+ * vdo_clean_failed_hash_lock() - Force a hash lock into the BYPASSING state and release the
+ * data_vio from it.
+ * @data_vio: The data_vio holding the hash lock to clean up.
+ *
+ * NOTE(review): presumably called when data_vio has encountered an error while holding the
+ * lock; confirm against the callers.
+ *
+ * If the lock is already BYPASSING, or if some other data_vio is the lock's agent, the data_vio
+ * simply exits the lock. Otherwise the data_vio acts as the agent: the lock is marked BYPASSING,
+ * advice updates are disabled, and every waiter is sent to write its own block via
+ * abort_waiter(). If the lock holds a duplicate PBN lock, the data_vio is launched in the
+ * duplicate zone to release it with unlock_duplicate_pbn(); if not, the data_vio exits the lock
+ * directly.
+ */
+void vdo_clean_failed_hash_lock(struct data_vio *data_vio)
+{
+ struct hash_lock *lock = data_vio->hash_lock;
+
+ /* The lock has already been cleaned or finished; only this data_vio needs to leave. */
+ if (lock->state == VDO_HASH_LOCK_BYPASSING) {
+ exit_hash_lock(data_vio);
+ return;
+ }
+
+ /* Only the agent (or a data_vio adopted as agent when there is none) cleans the lock. */
+ if (lock->agent == NULL) {
+ lock->agent = data_vio;
+ } else if (data_vio != lock->agent) {
+ exit_hash_lock(data_vio);
+ return;
+ }
+
+ lock->state = VDO_HASH_LOCK_BYPASSING;
+
+ /* Ensure we don't attempt to update advice when cleaning up. */
+ lock->update_advice = false;
+
+ vdo_waitq_notify_all_waiters(&lock->waiters, abort_waiter, NULL);
+
+ if (lock->duplicate_lock != NULL) {
+ /* The agent must reference the duplicate zone to launch it. */
+ data_vio->duplicate = lock->duplicate;
+ launch_data_vio_duplicate_zone_callback(data_vio, unlock_duplicate_pbn);
+ return;
+ }
+
+ /* No PBN lock to release, so the lock no longer needs an agent. */
+ lock->agent = NULL;
+ data_vio->is_duplicate = false;
+ exit_hash_lock(data_vio);
+}
+
+/**
+ * finish_unlocking() - Handle the result of the agent for the lock releasing a read lock on the
+ * duplicate candidate.
+ * @completion: The completion of the data_vio acting as the lock's agent.
+ *
+ * This continuation is registered in unlock_duplicate_pbn(). Depending on whether the released
+ * block had been verified and whether any waiters have arrived, the lock moves on to WRITING,
+ * LOCKING, or BYPASSING.
+ */
+static void finish_unlocking(struct vdo_completion *completion)
+{
+ struct data_vio *agent = as_data_vio(completion);
+ struct hash_lock *lock = agent->hash_lock;
+
+ assert_hash_lock_agent(agent, __func__);
+
+ VDO_ASSERT_LOG_ONLY(lock->duplicate_lock == NULL,
+ "must have released the duplicate lock for the hash lock");
+
+ if (!lock->verified) {
+ /*
+ * UNLOCKING -> WRITING transition: The lock we released was on an unverified
+ * block, so it must have been a lock on advice we were verifying, not on a
+ * location that was used for deduplication. Go write (or compress) the block to
+ * get a location to dedupe against.
+ */
+ start_writing(lock, agent);
+ return;
+ }
+
+ /*
+ * With the lock released, the verified duplicate block may already have changed and will
+ * need to be re-verified if a waiter arrived.
+ */
+ lock->verified = false;
+
+ if (vdo_waitq_has_waiters(&lock->waiters)) {
+ /*
+ * UNLOCKING -> LOCKING transition: A new data_vio entered the hash lock while the
+ * agent was releasing the PBN lock. The current agent exits and the waiter has to
+ * re-lock and re-verify the duplicate location.
+ *
+ * TODO: If we used the current agent to re-acquire the PBN lock we wouldn't need
+ * to re-verify.
+ */
+ /* retire_lock_agent() returns the dequeued waiter, which is now the lock's agent. */
+ agent = retire_lock_agent(lock);
+ start_locking(lock, agent);
+ return;
+ }
+
+ /*
+ * UNLOCKING -> BYPASSING transition: The agent is done with the lock and no other
+ * data_vios reference it, so remove it from the lock map and return it to the pool.
+ */
+ start_bypassing(lock, agent);
+}
+
+/**
+ * unlock_duplicate_pbn() - Release a read lock on the PBN of the block that may or may not have
+ * contained duplicate data.
+ * @completion: The completion of the data_vio acting as the lock's agent.
+ *
+ * This continuation is launched by start_unlocking() and by vdo_clean_failed_hash_lock(). It
+ * calls back to finish_unlocking() on the hash zone thread, unless the hash lock is already in
+ * the BYPASSING state, in which case the data_vio is completed directly from here.
+ */
+static void unlock_duplicate_pbn(struct vdo_completion *completion)
+{
+ struct data_vio *agent = as_data_vio(completion);
+ struct hash_lock *lock = agent->hash_lock;
+
+ assert_data_vio_in_duplicate_zone(agent);
+ VDO_ASSERT_LOG_ONLY(lock->duplicate_lock != NULL,
+ "must have a duplicate lock to release");
+
+ /*
+ * NOTE(review): vdo_forget() presumably clears lock->duplicate_lock while passing its old
+ * value along; finish_unlocking() asserts the field is NULL afterwards.
+ */
+ vdo_release_physical_zone_pbn_lock(agent->duplicate.zone, agent->duplicate.pbn,
+ vdo_forget(lock->duplicate_lock));
+ /* A BYPASSING lock has no further transitions to make, so finish the data_vio here. */
+ if (lock->state == VDO_HASH_LOCK_BYPASSING) {
+ complete_data_vio(completion);
+ return;
+ }
+
+ launch_data_vio_hash_zone_callback(agent, finish_unlocking);
+}
+
+/**
+ * start_unlocking() - Begin releasing the read lock on the PBN of the block that may or may not
+ * have contained duplicate data.
+ * @lock: The hash lock.
+ * @agent: The data_vio currently acting as the agent for the lock.
+ *
+ * Marks the hash lock as UNLOCKING and launches the agent in its duplicate zone to run
+ * unlock_duplicate_pbn().
+ */
+static void start_unlocking(struct hash_lock *lock, struct data_vio *agent)
+{
+ lock->state = VDO_HASH_LOCK_UNLOCKING;
+ launch_data_vio_duplicate_zone_callback(agent, unlock_duplicate_pbn);
+}
+
+/**
+ * release_context() - Give a dedupe context back to its hash zone.
+ * @context: The dedupe context to release.
+ *
+ * Decrements the zone's active count and moves the context onto the zone's available list.
+ */
+static void release_context(struct dedupe_context *context)
+{
+	struct hash_zone *owner = context->zone;
+
+	WRITE_ONCE(owner->active, owner->active - 1);
+	list_move(&context->list_entry, &owner->available);
+}
+
+/**
+ * process_update_result() - Release the agent's dedupe context once its request has completed.
+ * @agent: The data_vio which performed the update.
+ *
+ * Does nothing if the agent has no dedupe context, or if the context could not be moved from
+ * DEDUPE_CONTEXT_COMPLETE to DEDUPE_CONTEXT_IDLE.
+ */
+static void process_update_result(struct data_vio *agent)
+{
+	struct dedupe_context *context = agent->dedupe_context;
+
+	if (context == NULL)
+		return;
+
+	if (!change_context_state(context, DEDUPE_CONTEXT_COMPLETE, DEDUPE_CONTEXT_IDLE))
+		return;
+
+	release_context(context);
+}
+
+/**
+ * finish_updating() - Process the result of a UDS update performed by the agent for the lock.
+ * @completion: The completion of the data_vio that performed the update
+ *
+ * This continuation is registered in start_updating().
+ */
+static void finish_updating(struct vdo_completion *completion)
+{
+ struct data_vio *agent = as_data_vio(completion);
+ struct hash_lock *lock = agent->hash_lock;
+
+ assert_hash_lock_agent(agent, __func__);
+
+ process_update_result(agent);
+
+ /*
+ * UDS was updated successfully, so don't update again unless the duplicate location
+ * changes due to rollover.
+ */
+ lock->update_advice = false;
+
+ if (vdo_waitq_has_waiters(&lock->waiters)) {
+ /*
+ * UPDATING -> DEDUPING transition: A new data_vio arrived during the UDS update.
+ * Send it on the verified dedupe path. The agent is done with the lock, but the
+ * lock may still need to use it to clean up after rollover.
+ */
+ start_deduping(lock, agent, true);
+ return;
+ }
+
+ if (lock->duplicate_lock != NULL) {
+ /*
+ * UPDATING -> UNLOCKING transition: No one is waiting to dedupe, but we hold a
+ * duplicate PBN lock, so go release it.
+ */
+ start_unlocking(lock, agent);
+ return;
+ }
+
+ /*
+ * UPDATING -> BYPASSING transition: No one is waiting to dedupe and there's no lock to
+ * release.
+ */
+ start_bypassing(lock, agent);
+}
+
+/* Forward declaration: start_updating() calls query_index() ahead of its definition. */
+static void query_index(struct data_vio *data_vio, enum uds_request_type operation);
+
+/**
+ * start_updating() - Continue deduplication with the last step, updating UDS with the location of
+ * the duplicate that should be returned as advice in the future.
+ * @lock: The hash lock.
+ * @agent: The data_vio currently acting as the agent for the lock.
+ *
+ * Marks the lock as UPDATING, registers finish_updating() as the agent's hash zone callback, and
+ * issues a UDS_UPDATE request through query_index().
+ */
+static void start_updating(struct hash_lock *lock, struct data_vio *agent)
+{
+ lock->state = VDO_HASH_LOCK_UPDATING;
+
+ VDO_ASSERT_LOG_ONLY(lock->verified, "new advice should have been verified");
+ VDO_ASSERT_LOG_ONLY(lock->update_advice, "should only update advice if needed");
+
+ agent->last_async_operation = VIO_ASYNC_OP_UPDATE_DEDUPE_INDEX;
+ /* The callback is set before the request is issued. */
+ set_data_vio_hash_zone_callback(agent, finish_updating);
+ query_index(agent, UDS_UPDATE);
+}
+
+/**
+ * finish_deduping() - Handle a data_vio that has finished deduplicating against the block locked
+ * by the hash lock.
+ * @lock: The hash lock.
+ * @data_vio: The lock holder that has finished deduplicating.
+ *
+ * While other data_vios still share the lock, the data_vio merely gives up its share and
+ * finishes. The last data_vio holding the lock instead becomes the lock agent and is used to
+ * advance the lock's state so that it can eventually be released.
+ */
+static void finish_deduping(struct hash_lock *lock, struct data_vio *data_vio)
+{
+	VDO_ASSERT_LOG_ONLY(lock->agent == NULL, "shouldn't have an agent in DEDUPING");
+	VDO_ASSERT_LOG_ONLY(!vdo_waitq_has_waiters(&lock->waiters),
+			    "shouldn't have any lock waiters in DEDUPING");
+
+	if (lock->reference_count > 1) {
+		/* Other data_vios are still deduping, so just drop this lock reference. */
+		exit_hash_lock(data_vio);
+		return;
+	}
+
+	/* Every lock state other than DEDUPING requires an agent. */
+	lock->agent = data_vio;
+
+	if (!lock->update_advice) {
+		/*
+		 * DEDUPING -> UNLOCKING transition: Release the PBN read lock on the duplicate
+		 * location so the hash lock itself can be released (provided no new data_vios
+		 * arrive in the lock before the agent returns).
+		 */
+		start_unlocking(lock, data_vio);
+		return;
+	}
+
+	/*
+	 * DEDUPING -> UPDATING transition: The duplicate location has changed since the initial
+	 * UDS query, because of compression, rollover, or because the query agent didn't have an
+	 * allocation. The UDS update was deferred in case the location changed again, but now
+	 * that this data_vio is the only user of the hash lock, it's time to update the advice.
+	 */
+	start_updating(lock, data_vio);
+}
+
+/**
+ * acquire_lock() - Get the lock for a record name.
+ * @zone: The zone responsible for the hash.
+ * @hash: The hash to lock.
+ * @replace_lock: If non-NULL, the lock already registered for the hash which should be replaced by
+ * the new lock.
+ * @lock_ptr: A pointer to receive the hash lock.
+ *
+ * Gets the lock for the hash (record name) of the data in a data_vio, or if one does not exist (or
+ * if we are explicitly rolling over), initialize a new lock for the hash and register it in the
+ * zone. This must only be called in the correct thread for the zone.
+ *
+ * Return: VDO_SUCCESS or an error code.
+ */
+static int __must_check acquire_lock(struct hash_zone *zone,
+ const struct uds_record_name *hash,
+ struct hash_lock *replace_lock,
+ struct hash_lock **lock_ptr)
+{
+ struct hash_lock *lock, *new_lock;
+ int result;
+
+ /*
+ * Borrow and prepare a lock from the pool so we don't have to do two int_map accesses
+ * in the common case of no lock contention.
+ */
+ result = VDO_ASSERT(!list_empty(&zone->lock_pool),
+ "never need to wait for a free hash lock");
+ if (result != VDO_SUCCESS)
+ return result;
+
+ /* Take the entry at the tail of the pool list. */
+ new_lock = list_entry(zone->lock_pool.prev, struct hash_lock, pool_node);
+ list_del_init(&new_lock->pool_node);
+
+ /*
+ * Fill in the hash of the new lock so we can map it, since we have to use the hash as the
+ * map key.
+ */
+ new_lock->hash = *hash;
+
+ /*
+ * NOTE(review): vdo_int_map_put() presumably stores the previously mapped lock (or NULL) in
+ * 'lock', and only overwrites an existing mapping when replace_lock is non-NULL; confirm
+ * against the int_map documentation.
+ */
+ result = vdo_int_map_put(zone->hash_lock_map, hash_lock_key(new_lock),
+ new_lock, (replace_lock != NULL), (void **) &lock);
+ if (result != VDO_SUCCESS) {
+ return_hash_lock_to_pool(zone, vdo_forget(new_lock));
+ return result;
+ }
+
+ if (replace_lock != NULL) {
+ /*
+ * NOTE(review): a mismatch here is only logged by the assertion below; nothing puts
+ * the old lock back or returns an error, and the borrowed lock is then returned to
+ * the pool by the else branch further down.
+ */
+ VDO_ASSERT_LOG_ONLY(lock == replace_lock,
+ "old lock must have been in the lock map");
+ /* TODO: Check earlier and bail out? */
+ VDO_ASSERT_LOG_ONLY(replace_lock->registered,
+ "old lock must have been marked registered");
+ replace_lock->registered = false;
+ }
+
+ /*
+ * This comparison covers two cases: the requested replacement happened, or there was no
+ * replacement requested and no lock was previously mapped (both pointers are NULL).
+ */
+ if (lock == replace_lock) {
+ lock = new_lock;
+ lock->registered = true;
+ } else {
+ /* There's already a lock for the hash, so we don't need the borrowed lock. */
+ return_hash_lock_to_pool(zone, vdo_forget(new_lock));
+ }
+
+ *lock_ptr = lock;
+ return VDO_SUCCESS;
+}
+
+/**
+ * enter_forked_lock() - Bind the data_vio to a new hash lock.
+ * @waiter: The waiter link of the data_vio that was waiting on the old lock.
+ * @context: The new hash lock (a struct hash_lock pointer) for the data_vio to wait on.
+ *
+ * Implements waiter_callback_fn. The data_vio which had been waiting is attached to the new hash
+ * lock and then queued as a waiter on that lock.
+ */
+static void enter_forked_lock(struct vdo_waiter *waiter, void *context)
+{
+	struct hash_lock *forked_lock = context;
+	struct data_vio *waiting_vio = vdo_waiter_as_data_vio(waiter);
+
+	set_hash_lock(waiting_vio, forked_lock);
+	wait_on_hash_lock(forked_lock, waiting_vio);
+}
+
+/**
+ * fork_hash_lock() - Fork a hash lock because it has run out of increments on the duplicate PBN.
+ * @old_lock: The hash lock to fork.
+ * @new_agent: The data_vio that will be the agent for the new lock.
+ *
+ * Transfers the new agent and any lock waiters to a new hash lock instance which takes the place
+ * of the old lock in the lock map. The old lock remains active, but will not update advice.
+ *
+ * If a new lock cannot be acquired, the new agent is continued with the error and nothing else
+ * is changed.
+ */
+static void fork_hash_lock(struct hash_lock *old_lock, struct data_vio *new_agent)
+{
+ struct hash_lock *new_lock;
+ int result;
+
+ /* Passing old_lock as the lock to replace registers the new lock in its place. */
+ result = acquire_lock(new_agent->hash_zone, &new_agent->record_name, old_lock,
+ &new_lock);
+ if (result != VDO_SUCCESS) {
+ continue_data_vio_with_error(new_agent, result);
+ return;
+ }
+
+ /*
+ * Only one of the two locks should update UDS. The old lock is out of references, so it
+ * would be poor dedupe advice in the short term.
+ */
+ old_lock->update_advice = false;
+ new_lock->update_advice = true;
+
+ set_hash_lock(new_agent, new_lock);
+ new_lock->agent = new_agent;
+
+ /* Move every remaining waiter from the old lock onto the new lock. */
+ vdo_waitq_notify_all_waiters(&old_lock->waiters, enter_forked_lock, new_lock);
+
+ /* The new agent writes its own copy rather than deduplicating. */
+ new_agent->is_duplicate = false;
+ start_writing(new_lock, new_agent);
+}
+
+/**
+ * launch_dedupe() - Reserve a reference count increment for a data_vio and launch it on the dedupe
+ * path.
+ * @lock: The hash lock.
+ * @data_vio: The data_vio to deduplicate using the hash lock.
+ * @has_claim: true if the data_vio already has claimed an increment from the duplicate lock.
+ *
+ * When no increment can be claimed, the hash lock is forked and the data_vio becomes the writing
+ * agent of the new lock.
+ */
+static void launch_dedupe(struct hash_lock *lock, struct data_vio *data_vio,
+			  bool has_claim)
+{
+	bool claimed = has_claim;
+
+	/* Only try to claim an increment if the caller did not bring one. */
+	if (!claimed)
+		claimed = vdo_claim_pbn_lock_increment(lock->duplicate_lock);
+
+	if (!claimed) {
+		/* Out of increments, so must roll over to a new lock. */
+		fork_hash_lock(lock, data_vio);
+		return;
+	}
+
+	/* Deduplicate against the lock's verified location. */
+	set_duplicate_location(data_vio, lock->duplicate);
+	data_vio->new_mapped = data_vio->duplicate;
+	update_metadata_for_data_vio_write(data_vio, lock->duplicate_lock);
+}
+
+/**
+ * start_deduping() - Enter the hash lock state where data_vios deduplicate in parallel against a
+ * true copy of their data on disk.
+ * @lock: The hash lock.
+ * @agent: The data_vio acting as the agent for the lock.
+ * @agent_is_done: true only if the agent has already written or deduplicated against its data.
+ *
+ * If the agent itself needs to deduplicate, an increment for it must already have been claimed
+ * from the duplicate lock, ensuring the hash lock will still have a data_vio holding it.
+ */
+static void start_deduping(struct hash_lock *lock, struct data_vio *agent,
+ bool agent_is_done)
+{
+ lock->state = VDO_HASH_LOCK_DEDUPING;
+
+ /*
+ * We don't take the downgraded allocation lock from the agent unless we actually need to
+ * deduplicate against it.
+ */
+ if (lock->duplicate_lock == NULL) {
+ VDO_ASSERT_LOG_ONLY(!vdo_is_state_compressed(agent->new_mapped.state),
+ "compression must have shared a lock");
+ VDO_ASSERT_LOG_ONLY(agent_is_done,
+ "agent must have written the new duplicate");
+ transfer_allocation_lock(agent);
+ }
+
+ VDO_ASSERT_LOG_ONLY(vdo_is_pbn_read_lock(lock->duplicate_lock),
+ "duplicate_lock must be a PBN read lock");
+
+ /*
+ * This state is not like any of the other states. There is no designated agent--the agent
+ * transitioning to this state and all the waiters will be launched to deduplicate in
+ * parallel.
+ */
+ lock->agent = NULL;
+
+ /*
+ * Launch the agent (if not already deduplicated) and as many lock waiters as we have
+ * available increments for on the dedupe path. If we run out of increments, rollover will
+ * be triggered and the remaining waiters will be transferred to the new lock.
+ */
+ if (!agent_is_done) {
+ /* The agent's increment was claimed by the caller, hence has_claim is true. */
+ launch_dedupe(lock, agent, true);
+ /* The launched agent is not referenced again below in this case. */
+ agent = NULL;
+ }
+ while (vdo_waitq_has_waiters(&lock->waiters))
+ launch_dedupe(lock, dequeue_lock_waiter(lock), false);
+
+ if (agent_is_done) {
+ /*
+ * In the degenerate case where all the waiters rolled over to a new lock, this
+ * will continue to use the old agent to clean up this lock, and otherwise it just
+ * lets the agent exit the lock.
+ */
+ finish_deduping(lock, agent);
+ }
+}
+
+/**
+ * increment_stat() - Add one to a statistic counter in a non-atomic yet thread-safe manner.
+ * @stat: The statistic field to increment.
+ */
+static inline void increment_stat(u64 *stat)
+{
+	/*
+	 * The counter must only be mutated on the hash zone thread. Storing through WRITE_ONCE()
+	 * prevents any compiler shenanigans from affecting other threads reading stats.
+	 */
+	u64 bumped = *stat + 1;
+
+	WRITE_ONCE(*stat, bumped);
+}
+
+/**
+ * finish_verifying() - Handle the result of the agent for the lock comparing its data to the
+ * duplicate candidate.
+ * @completion: The completion of the data_vio used to verify dedupe
+ *
+ * This continuation is registered in start_verifying().
+ */
+static void finish_verifying(struct vdo_completion *completion)
+{
+	struct data_vio *agent = as_data_vio(completion);
+	struct hash_lock *lock = agent->hash_lock;
+
+	assert_hash_lock_agent(agent, __func__);
+
+	lock->verified = agent->is_duplicate;
+
+	/*
+	 * Only the first verification of the advice is counted as valid or stale; later
+	 * re-verifications caused by PBN lock releases are not counted.
+	 */
+	if (!lock->verify_counted) {
+		u64 *counter = (lock->verified ?
+				&agent->hash_zone->statistics.dedupe_advice_valid :
+				&agent->hash_zone->statistics.dedupe_advice_stale);
+
+		lock->verify_counted = true;
+		increment_stat(counter);
+	}
+
+	/*
+	 * A verified duplicate is still unusable unless a reference count increment can be
+	 * claimed for the agent.
+	 */
+	if (lock->verified && !vdo_claim_pbn_lock_increment(lock->duplicate_lock)) {
+		agent->is_duplicate = false;
+		lock->verified = false;
+	}
+
+	if (!lock->verified) {
+		/*
+		 * VERIFYING -> UNLOCKING transition: Either the verify failed or we'd try to
+		 * dedupe and roll over immediately, which would fail because it would leave the
+		 * lock without an agent to release the PBN lock. In both cases, the data will
+		 * have to be written or compressed, but first the advice PBN must be unlocked by
+		 * the VERIFYING agent.
+		 */
+		lock->update_advice = true;
+		start_unlocking(lock, agent);
+		return;
+	}
+
+	/*
+	 * VERIFYING -> DEDUPING transition: The advice is for a true duplicate and an increment
+	 * has been claimed for the agent, so start deduplicating against it.
+	 */
+	start_deduping(lock, agent, false);
+}
+
+/**
+ * blocks_equal() - Compare two data blocks, one u64 at a time.
+ * @block1: The first block; VDO_BLOCK_SIZE bytes are examined.
+ * @block2: The second block; VDO_BLOCK_SIZE bytes are examined.
+ *
+ * Return: true if every u64 of the two blocks matches.
+ */
+static bool blocks_equal(char *block1, char *block2)
+{
+	int offset = 0;
+
+	while (offset < VDO_BLOCK_SIZE) {
+		u64 *left = (u64 *) &block1[offset];
+		u64 *right = (u64 *) &block2[offset];
+
+		if (*left != *right)
+			return false;
+
+		offset += sizeof(u64);
+	}
+
+	return true;
+}
+
+/**
+ * verify_callback() - Compare the agent's data to the candidate data in its scratch block.
+ * @completion: The completion of the data_vio acting as the lock's agent.
+ *
+ * Records the comparison result in is_duplicate and then launches finish_verifying() as the
+ * agent's hash zone callback.
+ */
+static void verify_callback(struct vdo_completion *completion)
+{
+	struct data_vio *agent = as_data_vio(completion);
+	bool matches = blocks_equal(agent->vio.data, agent->scratch_block);
+
+	agent->is_duplicate = matches;
+	launch_data_vio_hash_zone_callback(agent, finish_verifying);
+}
+
+/**
+ * uncompress_and_verify() - Uncompress the candidate data into the agent's scratch block, then
+ * verify it.
+ * @completion: The completion of the data_vio acting as the lock's agent.
+ *
+ * If the data cannot be uncompressed, the candidate is treated as not being a duplicate and
+ * finish_verifying() is launched directly.
+ */
+static void uncompress_and_verify(struct vdo_completion *completion)
+{
+	struct data_vio *agent = as_data_vio(completion);
+	int result = uncompress_data_vio(agent, agent->duplicate.state,
+					 agent->scratch_block);
+
+	if (result != VDO_SUCCESS) {
+		agent->is_duplicate = false;
+		launch_data_vio_hash_zone_callback(agent, finish_verifying);
+		return;
+	}
+
+	verify_callback(completion);
+}
+
+/**
+ * verify_endio() - Bio completion handler for the read of the candidate duplicate block.
+ * @bio: The completed read bio; its bi_private refers to the agent's vio.
+ *
+ * A failed read marks the agent as not a duplicate and goes straight to finish_verifying().
+ * Otherwise the agent is launched on a CPU callback, either to uncompress and verify or to
+ * verify directly, depending on whether the duplicate's mapping state is compressed.
+ */
+static void verify_endio(struct bio *bio)
+{
+	struct data_vio *agent = vio_as_data_vio(bio->bi_private);
+	int result = blk_status_to_errno(bio->bi_status);
+
+	vdo_count_completed_bios(bio);
+
+	if (result != VDO_SUCCESS) {
+		agent->is_duplicate = false;
+		launch_data_vio_hash_zone_callback(agent, finish_verifying);
+		return;
+	}
+
+	if (!vdo_is_state_compressed(agent->duplicate.state)) {
+		launch_data_vio_cpu_callback(agent, verify_callback,
+					     CPU_Q_COMPLETE_READ_PRIORITY);
+		return;
+	}
+
+	launch_data_vio_cpu_callback(agent, uncompress_and_verify,
+				     CPU_Q_COMPRESS_BLOCK_PRIORITY);
+}
+
+/**
+ * start_verifying() - Begin the data verification phase.
+ * @lock: The hash lock (must be LOCKING).
+ * @agent: The data_vio to use to read and compare candidate data.
+ *
+ * Continue the deduplication path for a hash lock by using the agent to read (and possibly
+ * decompress) the data at the candidate duplicate location, comparing it to the data in the agent
+ * to verify that the candidate is identical to all the data_vios sharing the hash. If so, it can
+ * be deduplicated against, otherwise a data_vio allocation will have to be written to and used for
+ * dedupe.
+ */
+static void start_verifying(struct hash_lock *lock, struct data_vio *agent)
+{
+ int result;
+ struct vio *vio = &agent->vio;
+ /*
+ * A compressed candidate is read into the compression block and later uncompressed into
+ * the scratch block by uncompress_and_verify(); an uncompressed candidate is read into
+ * the scratch block directly.
+ */
+ char *buffer = (vdo_is_state_compressed(agent->duplicate.state) ?
+ (char *) agent->compression.block :
+ agent->scratch_block);
+
+ lock->state = VDO_HASH_LOCK_VERIFYING;
+ VDO_ASSERT_LOG_ONLY(!lock->verified, "hash lock only verifies advice once");
+
+ agent->last_async_operation = VIO_ASYNC_OP_VERIFY_DUPLICATION;
+ result = vio_reset_bio(vio, buffer, verify_endio, REQ_OP_READ,
+ agent->duplicate.pbn);
+ if (result != VDO_SUCCESS) {
+ /*
+ * NOTE(review): unlike the read-error path in verify_endio(), is_duplicate is not
+ * cleared here; confirm that the error continuation does not depend on it.
+ */
+ set_data_vio_hash_zone_callback(agent, finish_verifying);
+ continue_data_vio_with_error(agent, result);
+ return;
+ }
+
+ /* Submit the read from the bio zone; verify_endio() runs when it completes. */
+ set_data_vio_bio_zone_callback(agent, vdo_submit_vio);
+ vdo_launch_completion_with_priority(&vio->completion, BIO_Q_VERIFY_PRIORITY);
+}
+
+/**
+ * finish_locking() - Handle the result of the agent for the lock attempting to obtain a PBN read
+ * lock on the candidate duplicate block.
+ * @completion: The completion of the data_vio that attempted to get the read lock.
+ *
+ * This continuation is registered in lock_duplicate_pbn(). Depending on whether the agent is
+ * still flagged as a duplicate, whether the lock's location was already verified, and whether an
+ * increment can be claimed, the lock moves on to WRITING, VERIFYING, UNLOCKING, or DEDUPING.
+ */
+static void finish_locking(struct vdo_completion *completion)
+{
+ struct data_vio *agent = as_data_vio(completion);
+ struct hash_lock *lock = agent->hash_lock;
+
+ assert_hash_lock_agent(agent, __func__);
+
+ if (!agent->is_duplicate) {
+ VDO_ASSERT_LOG_ONLY(lock->duplicate_lock == NULL,
+ "must not hold duplicate_lock if not flagged as a duplicate");
+ /*
+ * LOCKING -> WRITING transition: The advice block is being modified or has no
+ * available references, so try to write or compress the data, remembering to
+ * update UDS later with the new advice.
+ */
+ increment_stat(&agent->hash_zone->statistics.dedupe_advice_stale);
+ lock->update_advice = true;
+ start_writing(lock, agent);
+ return;
+ }
+
+ VDO_ASSERT_LOG_ONLY(lock->duplicate_lock != NULL,
+ "must hold duplicate_lock if flagged as a duplicate");
+
+ if (!lock->verified) {
+ /*
+ * LOCKING -> VERIFYING transition: Continue on the unverified dedupe path, reading
+ * the candidate duplicate and comparing it to the agent's data to decide whether
+ * it is a true duplicate or stale advice.
+ */
+ start_verifying(lock, agent);
+ return;
+ }
+
+ /* The agent needs its own increment before it can deduplicate. */
+ if (!vdo_claim_pbn_lock_increment(lock->duplicate_lock)) {
+ /*
+ * LOCKING -> UNLOCKING transition: The verified block was re-locked, but has no
+ * available increments left. Must first release the useless PBN read lock before
+ * rolling over to a new copy of the block.
+ */
+ agent->is_duplicate = false;
+ lock->verified = false;
+ lock->update_advice = true;
+ start_unlocking(lock, agent);
+ return;
+ }
+
+ /*
+ * LOCKING -> DEDUPING transition: Continue on the verified dedupe path, deduplicating
+ * against a location that was previously verified or written to.
+ */
+ start_deduping(lock, agent, false);
+}
+
+/**
+ * acquire_provisional_reference() - Ensure that a newly-locked duplicate candidate block is
+ * referenced.
+ * @agent: The data_vio acting as the hash lock's agent.
+ * @lock: The PBN lock just obtained on the agent's duplicate location.
+ * @depot: The slab depot used to look up the slab containing the block.
+ *
+ * On failure, a warning is logged, the agent is marked as not a duplicate, the PBN lock is
+ * released, and the agent is continued with the error.
+ *
+ * Return: true if the provisional reference was acquired, false if dedupe was aborted.
+ */
+static bool acquire_provisional_reference(struct data_vio *agent, struct pbn_lock *lock,
+					  struct slab_depot *depot)
+{
+	struct vdo_slab *slab = vdo_get_slab(depot, agent->duplicate.pbn);
+	int result = vdo_acquire_provisional_reference(slab, agent->duplicate.pbn, lock);
+
+	if (result != VDO_SUCCESS) {
+		vdo_log_warning_strerror(result,
+					 "Error acquiring provisional reference for dedupe candidate; aborting dedupe");
+		agent->is_duplicate = false;
+		vdo_release_physical_zone_pbn_lock(agent->duplicate.zone,
+						   agent->duplicate.pbn, lock);
+		continue_data_vio_with_error(agent, result);
+		return false;
+	}
+
+	return true;
+}
+
+/**
+ * lock_duplicate_pbn() - Acquire a read lock on the PBN of the block containing candidate
+ * duplicate data (compressed or uncompressed).
+ * @completion: The completion of the data_vio attempting to acquire the physical block lock on
+ * behalf of its hash lock.
+ *
+ * If the PBN is already locked for writing, or has no reference increments available, the lock
+ * attempt is abandoned and is_duplicate will be cleared before calling back. This continuation is
+ * launched from start_locking(), and calls back to finish_locking() on the hash zone thread.
+ */
+static void lock_duplicate_pbn(struct vdo_completion *completion)
+{
+ unsigned int increment_limit;
+ struct pbn_lock *lock;
+ int result;
+
+ struct data_vio *agent = as_data_vio(completion);
+ struct slab_depot *depot = vdo_from_data_vio(agent)->depot;
+ struct physical_zone *zone = agent->duplicate.zone;
+
+ assert_data_vio_in_duplicate_zone(agent);
+
+ /* Every exit below that continues the agent calls back to finish_locking(). */
+ set_data_vio_hash_zone_callback(agent, finish_locking);
+
+ /*
+ * While in the zone that owns it, find out how many additional references can be made to
+ * the block if it turns out to truly be a duplicate.
+ */
+ increment_limit = vdo_get_increment_limit(depot, agent->duplicate.pbn);
+ if (increment_limit == 0) {
+ /*
+ * We could deduplicate against it later if a reference happened to be released
+ * during verification, but it's probably better to bail out now.
+ */
+ agent->is_duplicate = false;
+ continue_data_vio(agent);
+ return;
+ }
+
+ result = vdo_attempt_physical_zone_pbn_lock(zone, agent->duplicate.pbn,
+ VIO_READ_LOCK, &lock);
+ if (result != VDO_SUCCESS) {
+ continue_data_vio_with_error(agent, result);
+ return;
+ }
+
+ if (!vdo_is_pbn_read_lock(lock)) {
+ /*
+ * There are three cases of write locks: uncompressed data block writes, compressed
+ * (packed) block writes, and block map page writes. In all three cases, we give up
+ * on trying to verify the advice and don't bother trying to deduplicate against the
+ * data in the write lock holder.
+ *
+ * 1) We don't ever want to try to deduplicate against a block map page.
+ *
+ * 2a) It's very unlikely we'd deduplicate against an entire packed block, both
+ * because of the chance of matching it, and because we don't record advice for it,
+ * but for the uncompressed representation of all the fragments it contains. The
+ * only way we'd be getting lock contention is if we've written the same
+ * representation coincidentally before, had it become unreferenced, and it just
+ * happened to be packed together from compressed writes when we go to verify the
+ * lucky advice. Giving up is a minuscule loss of potential dedupe.
+ *
+ * 2b) If the advice is for a slot of a compressed block, it's about to get
+ * smashed, and the write smashing it cannot contain our data--it would have to be
+ * writing on behalf of our hash lock, but that's impossible since we're the lock
+ * agent.
+ *
+ * 3a) If the lock is held by a data_vio with different data, the advice is already
+ * stale or is about to become stale.
+ *
+ * 3b) If the lock is held by a data_vio that matches us, we may as well either
+ * write it ourselves (or reference the copy we already wrote) instead of
+ * potentially having many duplicates wait for the lock holder to write, journal,
+ * hash, and finally arrive in the hash lock. We lose a chance to avoid a UDS
+ * update in the very rare case of advice for a free block that just happened to be
+ * allocated to a data_vio with the same hash. There's also a chance to save on a
+ * block write, at the cost of a block verify. Saving on a full block compare in
+ * all stale advice cases almost certainly outweighs saving a UDS update and
+ * trading a write for a read in a lucky case where advice would have been saved
+ * from becoming stale.
+ */
+ agent->is_duplicate = false;
+ continue_data_vio(agent);
+ return;
+ }
+
+ /* A holder count of zero means this read lock was just created by the attempt above. */
+ if (lock->holder_count == 0) {
+ /* On failure the helper has already released the lock and continued the agent. */
+ if (!acquire_provisional_reference(agent, lock, depot))
+ return;
+
+ /*
+ * The increment limit we grabbed earlier is still valid. The lock now holds the
+ * rights to acquire all those references. Those rights will be claimed by hash
+ * locks sharing this read lock.
+ */
+ lock->increment_limit = increment_limit;
+ }
+
+ /*
+ * We've successfully acquired a read lock on behalf of the hash lock, so mark it as such.
+ */
+ set_duplicate_lock(agent->hash_lock, lock);
+
+ /*
+ * TODO: Optimization: We could directly launch the block verify, then switch to a hash
+ * thread.
+ */
+ continue_data_vio(agent);
+}
+
+/**
+ * start_locking() - Continue deduplication for a hash lock that has obtained valid advice of a
+ * potential duplicate through its agent.
+ * @lock: The hash lock. It must not currently hold a duplicate lock.
+ * NOTE(review): this was documented as "currently must be QUERYING", but
+ * finish_unlocking() also calls this for its UNLOCKING -> LOCKING transition.
+ * @agent: The data_vio bearing the dedupe advice.
+ *
+ * Marks the lock as LOCKING and launches the agent in its duplicate zone to run
+ * lock_duplicate_pbn().
+ */
+static void start_locking(struct hash_lock *lock, struct data_vio *agent)
+{
+ VDO_ASSERT_LOG_ONLY(lock->duplicate_lock == NULL,
+ "must not acquire a duplicate lock when already holding it");
+
+ lock->state = VDO_HASH_LOCK_LOCKING;
+
+ /*
+ * TODO: Optimization: If we arrange to continue on the duplicate zone thread when
+ * accepting the advice, and don't explicitly change lock states (or use an agent-local
+ * state, or an atomic), we can avoid a thread transition here.
+ */
+ agent->last_async_operation = VIO_ASYNC_OP_LOCK_DUPLICATE_PBN;
+ launch_data_vio_duplicate_zone_callback(agent, lock_duplicate_pbn);
+}
+
+/**
+ * finish_writing() - Re-entry point for the lock agent after it has finished writing or
+ * compressing its copy of the data block.
+ * @lock: The hash lock, which must be in state WRITING.
+ * @agent: The data_vio that wrote its data for the lock.
+ *
+ * The agent will never need to dedupe against anything, so it's done with the lock, but the lock
+ * may not be finished with it, as a UDS update might still be needed.
+ *
+ * If there are waiters, the lock moves to DEDUPING so that they can deduplicate against the
+ * just-written block. With no waiters, the agent is used to update UDS if that is needed, or
+ * else to release a shared duplicate lock if one is held, or else it simply exits the lock.
+ */
+static void finish_writing(struct hash_lock *lock, struct data_vio *agent)
+{
+	/*
+	 * The data block or compressed block slot the agent wrote becomes the dedupe target.
+	 * The write is known to have succeeded, so no verification is needed.
+	 */
+	lock->duplicate = agent->new_mapped;
+	lock->verified = true;
+
+	if (vdo_is_state_compressed(lock->duplicate.state) && lock->registered) {
+		/*
+		 * Compression means the location we gave in the UDS query is not the location
+		 * we're using to deduplicate.
+		 */
+		lock->update_advice = true;
+	}
+
+	if (vdo_waitq_has_waiters(&lock->waiters)) {
+		/*
+		 * WRITING -> DEDUPING transition: There are waiters, so start deduping them.
+		 * The agent is done with the lock, but the lock may still need to use it to
+		 * clean up after rollover.
+		 */
+		start_deduping(lock, agent, true);
+		return;
+	}
+
+	/*
+	 * No waiters and a successful write: take a step towards being able to release the hash
+	 * lock (or just release it).
+	 */
+	if (lock->update_advice) {
+		/*
+		 * WRITING -> UPDATING transition: A UDS update is needed, so retain the WRITING
+		 * agent and use it to launch the update. This happens on compression, rollover,
+		 * or the QUERYING agent not having an allocation.
+		 */
+		start_updating(lock, agent);
+		return;
+	}
+
+	if (lock->duplicate_lock == NULL) {
+		/*
+		 * WRITING -> BYPASSING transition: No update is needed and no duplicate lock is
+		 * held, so both the agent and lock have no more work to do. The agent will
+		 * release its allocation lock in cleanup.
+		 */
+		start_bypassing(lock, agent);
+		return;
+	}
+
+	/*
+	 * WRITING -> UNLOCKING transition: No update is needed, but the compressed write gave
+	 * us a shared duplicate lock that we must release.
+	 */
+	set_duplicate_location(agent, lock->duplicate);
+	start_unlocking(lock, agent);
+}
+
+/**
+ * select_writing_agent() - Search through the lock waiters for a data_vio that has an allocation.
+ * @lock: The hash lock to modify.
+ *
+ * If an allocation is found, swap agents, put the old agent at the head of the wait queue, then
+ * return the new agent. Otherwise, just return the current agent.
+ *
+ * Return: The data_vio which is the lock's agent once the search is finished.
+ */
+static struct data_vio *select_writing_agent(struct hash_lock *lock)
+{
+ struct vdo_wait_queue temp_queue;
+ struct data_vio *data_vio;
+
+ vdo_waitq_init(&temp_queue);
+
+ /*
+ * Move waiters to the temp queue one-by-one until we find an allocation. Not ideal to
+ * search, but it only happens when nearly out of space.
+ */
+ while (((data_vio = dequeue_lock_waiter(lock)) != NULL) &&
+ !data_vio_has_allocation(data_vio)) {
+ /* Use the lower-level enqueue since we're just moving waiters around. */
+ vdo_waitq_enqueue_waiter(&temp_queue, &data_vio->waiter);
+ }
+
+ /* A non-NULL data_vio here is a dequeued waiter which has an allocation. */
+ if (data_vio != NULL) {
+ /*
+ * Move the rest of the waiters over to the temp queue, preserving the order they
+ * arrived at the lock.
+ */
+ vdo_waitq_transfer_all_waiters(&lock->waiters, &temp_queue);
+
+ /*
+ * The current agent is being replaced and will have to wait to dedupe; make it the
+ * first waiter since it was the first to reach the lock.
+ */
+ vdo_waitq_enqueue_waiter(&lock->waiters, &lock->agent->waiter);
+ lock->agent = data_vio;
+ } else {
+ /* No one has an allocation, so keep the current agent. */
+ data_vio = lock->agent;
+ }
+
+ /* Swap all the waiters back onto the lock's queue. */
+ vdo_waitq_transfer_all_waiters(&temp_queue, &lock->waiters);
+ return data_vio;
+}
+
+/**
+ * start_writing() - Begin the non-duplicate write path.
+ * @lock: The hash lock (currently must be QUERYING).
+ * @agent: The data_vio currently acting as the agent for the lock.
+ *
+ * Moves a hash lock that had no advice into the WRITING state, replacing the agent with a waiter
+ * that has an allocation if the agent has none, and then sends the agent down the data_vio write
+ * path. If no data_vio in the lock has an allocation, the agent is continued with VDO_NO_SPACE.
+ */
+static void start_writing(struct hash_lock *lock, struct data_vio *agent)
+{
+	bool can_write;
+
+	lock->state = VDO_HASH_LOCK_WRITING;
+
+	/*
+	 * An agent without an allocation can't be used for writing, but one of the waiters may
+	 * well have received one.
+	 */
+	can_write = data_vio_has_allocation(agent);
+	if (!can_write) {
+		agent = select_writing_agent(lock);
+		can_write = data_vio_has_allocation(agent);
+	}
+
+	if (!can_write) {
+		/*
+		 * None of the waiters had an allocation either, so the writes all have to fail.
+		 *
+		 * TODO: Should we keep a variant of BYPASSING that causes new arrivals to fail
+		 * immediately if they don't have an allocation? It might be possible that on some
+		 * path there would be non-waiters still referencing the lock, so it would remain
+		 * in the map as everything is currently spelled, even if the agent and all
+		 * waiters release.
+		 */
+		continue_data_vio_with_error(agent, VDO_NO_SPACE);
+		return;
+	}
+
+	/*
+	 * A compressing agent might wait indefinitely in the packer, which would be bad if any
+	 * other data_vios are waiting on it.
+	 */
+	if (vdo_waitq_has_waiters(&lock->waiters))
+		cancel_data_vio_compression(agent);
+
+	/*
+	 * Send the agent to the compress/pack/write path. If it succeeds, it will return to the
+	 * hash lock via vdo_continue_hash_lock() and call finish_writing().
+	 */
+	launch_compress_data_vio(agent);
+}
+
+/*
+ * Decode VDO duplicate advice from the old_metadata field of a UDS request.
+ *
+ * The decoded mapping state, PBN, and physical zone are stored in the duplicate field of the
+ * data_vio which made the request. The state and PBN are written before they are validated, so
+ * that field may have been modified even when this returns false.
+ *
+ * Returns true if valid advice was found and decoded
+ */
+static bool decode_uds_advice(struct dedupe_context *context)
+{
+ const struct uds_request *request = &context->request;
+ struct data_vio *data_vio = context->requestor;
+ size_t offset = 0;
+ const struct uds_record_data *encoding = &request->old_metadata;
+ struct vdo *vdo = vdo_from_data_vio(data_vio);
+ struct zoned_pbn *advice = &data_vio->duplicate;
+ u8 version;
+ int result;
+
+ /* A failed request, or one which found no record, carries no advice. */
+ if ((request->status != UDS_SUCCESS) || !request->found)
+ return false;
+
+ /* The encoding is a version byte, a mapping state byte, then a little-endian 64-bit PBN. */
+ version = encoding->data[offset++];
+ if (version != UDS_ADVICE_VERSION) {
+ vdo_log_error("invalid UDS advice version code %u", version);
+ return false;
+ }
+
+ advice->state = encoding->data[offset++];
+ advice->pbn = get_unaligned_le64(&encoding->data[offset]);
+ offset += sizeof(u64);
+ BUG_ON(offset != UDS_ADVICE_SIZE);
+
+ /* Don't use advice that's clearly meaningless. */
+ if ((advice->state == VDO_MAPPING_STATE_UNMAPPED) || (advice->pbn == VDO_ZERO_BLOCK)) {
+ vdo_log_debug("Invalid advice from deduplication server: pbn %llu, state %u. Giving up on deduplication of logical block %llu",
+ (unsigned long long) advice->pbn, advice->state,
+ (unsigned long long) data_vio->logical.lbn);
+ atomic64_inc(&vdo->stats.invalid_advice_pbn_count);
+ return false;
+ }
+
+ /* The advice is only usable if its PBN maps to a physical zone. */
+ result = vdo_get_physical_zone(vdo, advice->pbn, &advice->zone);
+ if ((result != VDO_SUCCESS) || (advice->zone == NULL)) {
+ vdo_log_debug("Invalid physical block number from deduplication server: %llu, giving up on deduplication of logical block %llu",
+ (unsigned long long) advice->pbn,
+ (unsigned long long) data_vio->logical.lbn);
+ atomic64_inc(&vdo->stats.invalid_advice_pbn_count);
+ return false;
+ }
+
+ return true;
+}
+
+/**
+ * process_query_result() - Consume the result of an agent's UDS query, if there is one.
+ * @agent: The data_vio which may have a dedupe context.
+ *
+ * If the agent has a dedupe context which can be moved from COMPLETE to IDLE, decode any advice
+ * into the agent and release the context. Otherwise the agent is left untouched.
+ */
+static void process_query_result(struct data_vio *agent)
+{
+	struct dedupe_context *query = agent->dedupe_context;
+
+	if ((query == NULL) ||
+	    !change_context_state(query, DEDUPE_CONTEXT_COMPLETE, DEDUPE_CONTEXT_IDLE))
+		return;
+
+	agent->is_duplicate = decode_uds_advice(query);
+	release_context(query);
+}
+
+/**
+ * finish_querying() - Process the result of a UDS query performed by the agent for the lock.
+ * @completion: The completion of the data_vio that performed the query.
+ *
+ * This continuation is registered in start_querying().
+ */
+static void finish_querying(struct vdo_completion *completion)
+{
+	struct data_vio *agent = as_data_vio(completion);
+	struct hash_lock *hash_lock = agent->hash_lock;
+
+	assert_hash_lock_agent(agent, __func__);
+	process_query_result(agent);
+
+	if (!agent->is_duplicate) {
+		/*
+		 * The agent will be used as the duplicate if it has an allocation; if it does,
+		 * that location was posted to UDS, so no update will be needed.
+		 */
+		hash_lock->update_advice = !data_vio_has_allocation(agent);
+
+		/*
+		 * QUERYING -> WRITING transition: There was no advice or the advice wasn't valid,
+		 * so try to write or compress the data.
+		 */
+		start_writing(hash_lock, agent);
+		return;
+	}
+
+	/*
+	 * QUERYING -> LOCKING transition: Valid advice was obtained from UDS. Use the QUERYING
+	 * agent to start the hash lock on the unverified dedupe path, verifying that the advice
+	 * can be used.
+	 */
+	hash_lock->duplicate = agent->duplicate;
+	start_locking(hash_lock, agent);
+}
+
+/**
+ * start_querying() - Start deduplication for a hash lock.
+ * @lock: The initialized hash lock.
+ * @data_vio: The data_vio that has just obtained the new lock.
+ *
+ * Makes the data_vio which requested the newly initialized lock its agent, puts the lock in the
+ * QUERYING state, and uses the agent to perform the UDS query on behalf of the lock. The agent's
+ * location is posted to UDS if it has an allocation; otherwise the index is only queried.
+ */
+static void start_querying(struct hash_lock *lock, struct data_vio *data_vio)
+{
+	data_vio->last_async_operation = VIO_ASYNC_OP_CHECK_FOR_DUPLICATION;
+	lock->state = VDO_HASH_LOCK_QUERYING;
+	lock->agent = data_vio;
+
+	/* The query result will be handled by finish_querying(). */
+	set_data_vio_hash_zone_callback(data_vio, finish_querying);
+
+	if (data_vio_has_allocation(data_vio))
+		query_index(data_vio, UDS_POST);
+	else
+		query_index(data_vio, UDS_QUERY);
+}
+
+/**
+ * report_bogus_lock_state() - Log that a data_vio has entered a hash_lock which is in an
+ *                             unimplemented or unusable state, and continue the data_vio with
+ *                             VDO_LOCK_ERROR.
+ * @lock: The hash lock.
+ * @data_vio: The data_vio attempting to enter the lock.
+ */
+static void report_bogus_lock_state(struct hash_lock *lock, struct data_vio *data_vio)
+{
+	const char *state_name = get_hash_lock_state_name(lock->state);
+
+	VDO_ASSERT_LOG_ONLY(false, "hash lock must not be in unimplemented state %s",
+			    state_name);
+	continue_data_vio_with_error(data_vio, VDO_LOCK_ERROR);
+}
+
+/**
+ * vdo_continue_hash_lock() - Continue the processing state after writing, compressing, or
+ *                            deduplicating.
+ * @completion: The completion of the data_vio to continue processing in its hash lock.
+ *
+ * Asynchronously continue processing a data_vio in its hash lock after it has finished writing,
+ * compressing, or deduplicating, so it can share the result with any data_vios waiting in the hash
+ * lock, or update the UDS index, or simply release its share of the lock.
+ *
+ * Context: This must only be called in the correct thread for the hash zone.
+ */
+void vdo_continue_hash_lock(struct vdo_completion *completion)
+{
+	struct data_vio *data_vio = as_data_vio(completion);
+	struct hash_lock *lock = data_vio->hash_lock;
+
+	switch (lock->state) {
+	case VDO_HASH_LOCK_BYPASSING:
+		/* This data_vio has finished the write path and the lock doesn't need it. */
+		exit_hash_lock(data_vio);
+		break;
+
+	case VDO_HASH_LOCK_DEDUPING:
+		finish_deduping(lock, data_vio);
+		break;
+
+	case VDO_HASH_LOCK_WRITING:
+		VDO_ASSERT_LOG_ONLY(data_vio == lock->agent,
+				    "only the lock agent may continue the lock");
+		finish_writing(lock, data_vio);
+		break;
+
+	case VDO_HASH_LOCK_INITIALIZING:
+	case VDO_HASH_LOCK_QUERYING:
+	case VDO_HASH_LOCK_UPDATING:
+	case VDO_HASH_LOCK_LOCKING:
+	case VDO_HASH_LOCK_VERIFYING:
+	case VDO_HASH_LOCK_UNLOCKING:
+		/* A lock in any of these states should never be re-entered. */
+	default:
+		report_bogus_lock_state(lock, data_vio);
+		break;
+	}
+}
+
+/**
+ * is_hash_collision() - Check to see if a hash collision has occurred.
+ * @lock: The lock to check.
+ * @candidate: The data_vio seeking to share the lock.
+ *
+ * Compare the candidate's data with the data of the first data_vio on the lock's duplicate ring.
+ * A mismatch should only be possible in the extremely unlikely case of a hash collision. The
+ * outcome of each comparison is counted in the statistics of the candidate's hash zone. A lock
+ * with an empty duplicate ring can never collide, and no statistic is recorded for it.
+ *
+ * Return: true if the given data_vio must not share the lock because it doesn't have the same data
+ *         as the lock holders.
+ */
+static bool is_hash_collision(struct hash_lock *lock, struct data_vio *candidate)
+{
+	struct data_vio *holder;
+
+	if (list_empty(&lock->duplicate_ring))
+		return false;
+
+	holder = list_first_entry(&lock->duplicate_ring, struct data_vio, hash_lock_entry);
+	if (blocks_equal(holder->vio.data, candidate->vio.data)) {
+		increment_stat(&candidate->hash_zone->statistics.concurrent_data_matches);
+		return false;
+	}
+
+	increment_stat(&candidate->hash_zone->statistics.concurrent_hash_collisions);
+	return true;
+}
+
+/**
+ * assert_hash_lock_preconditions() - Check that a data_vio is in a fit state to acquire a hash
+ *                                    lock.
+ * @data_vio: The data_vio about to acquire a hash lock.
+ *
+ * The checks are made in order and stop at the first failure.
+ *
+ * Return: VDO_SUCCESS if every precondition holds, or the result of the first failed assertion.
+ */
+static inline int assert_hash_lock_preconditions(const struct data_vio *data_vio)
+{
+	/* FIXME: BUG_ON() and/or enter read-only mode? */
+	int result = VDO_ASSERT(data_vio->hash_lock == NULL,
+				"must not already hold a hash lock");
+
+	if (result == VDO_SUCCESS) {
+		result = VDO_ASSERT(list_empty(&data_vio->hash_lock_entry),
+				    "must not already be a member of a hash lock ring");
+	}
+
+	if (result == VDO_SUCCESS) {
+		result = VDO_ASSERT(data_vio->recovery_sequence_number == 0,
+				    "must not hold a recovery lock when getting a hash lock");
+	}
+
+	return result;
+}
+
+/**
+ * vdo_acquire_hash_lock() - Acquire or share a lock on a record name.
+ * @completion: The completion of the data_vio acquiring a lock on its record name.
+ *
+ * Acquire or share a lock on the hash (record name) of the data in a data_vio, updating the
+ * data_vio to reference the lock. This must only be called in the correct thread for the zone. In
+ * the unlikely case of a hash collision, this function will succeed, but the data_vio will not get
+ * a lock reference.
+ */
+void vdo_acquire_hash_lock(struct vdo_completion *completion)
+{
+	struct data_vio *data_vio = as_data_vio(completion);
+	struct hash_lock *lock;
+	int result;
+
+	assert_data_vio_in_hash_zone(data_vio);
+
+	/* Only try to acquire the lock if the preconditions hold; either failure is fatal. */
+	result = assert_hash_lock_preconditions(data_vio);
+	if (result == VDO_SUCCESS) {
+		result = acquire_lock(data_vio->hash_zone, &data_vio->record_name, NULL,
+				      &lock);
+	}
+
+	if (result != VDO_SUCCESS) {
+		continue_data_vio_with_error(data_vio, result);
+		return;
+	}
+
+	if (is_hash_collision(lock, data_vio)) {
+		/*
+		 * Hash collisions are extremely unlikely, but the bogus dedupe would be a data
+		 * corruption. Bypass optimization entirely. We can't compress a data_vio without
+		 * a hash_lock as the compressed write depends on the hash_lock to manage the
+		 * references for the compressed block.
+		 */
+		write_data_vio(data_vio);
+		return;
+	}
+
+	set_hash_lock(data_vio, lock);
+
+	switch (lock->state) {
+	case VDO_HASH_LOCK_INITIALIZING:
+		/* This data_vio is the first to reach the lock, so it does the query. */
+		start_querying(lock, data_vio);
+		break;
+
+	case VDO_HASH_LOCK_DEDUPING:
+		launch_dedupe(lock, data_vio, false);
+		break;
+
+	case VDO_HASH_LOCK_BYPASSING:
+		/* We can't use this lock, so bypass optimization entirely. */
+		vdo_release_hash_lock(data_vio);
+		write_data_vio(data_vio);
+		break;
+
+	case VDO_HASH_LOCK_QUERYING:
+	case VDO_HASH_LOCK_WRITING:
+	case VDO_HASH_LOCK_UPDATING:
+	case VDO_HASH_LOCK_LOCKING:
+	case VDO_HASH_LOCK_VERIFYING:
+	case VDO_HASH_LOCK_UNLOCKING:
+		/* The lock is busy, and can't be shared yet. */
+		wait_on_hash_lock(lock, data_vio);
+		break;
+
+	default:
+		/* A lock in this state should not be acquired by new VIOs. */
+		report_bogus_lock_state(lock, data_vio);
+		break;
+	}
+}
+
+/**
+ * vdo_release_hash_lock() - Release a data_vio's share of a hash lock, if held, and null out the
+ * data_vio's reference to it.
+ * @data_vio: The data_vio releasing its hash lock.
+ *
+ * If the data_vio is the only one holding the lock, this also releases any resources or locks used
+ * by the hash lock (such as a PBN read lock on a block containing data with the same hash) and
+ * returns the lock to the hash zone's lock pool.
+ *
+ * Context: This must only be called in the correct thread for the hash zone.
+ */
+void vdo_release_hash_lock(struct data_vio *data_vio)
+{
+ u64 lock_key;
+ struct hash_lock *lock = data_vio->hash_lock;
+ struct hash_zone *zone = data_vio->hash_zone;
+
+ /* Nothing to do if the data_vio holds no hash lock. */
+ if (lock == NULL)
+ return;
+
+ /* Clear the data_vio's reference before examining the lock's remaining reference count. */
+ set_hash_lock(data_vio, NULL);
+
+ if (lock->reference_count > 0) {
+ /* The lock is still in use by other data_vios. */
+ return;
+ }
+
+ /* No references remain, so take the lock out of the zone's map if it was registered. */
+ lock_key = hash_lock_key(lock);
+ if (lock->registered) {
+ struct hash_lock *removed;
+
+ removed = vdo_int_map_remove(zone->hash_lock_map, lock_key);
+ VDO_ASSERT_LOG_ONLY(lock == removed,
+ "hash lock being released must have been mapped");
+ } else {
+ VDO_ASSERT_LOG_ONLY(lock != vdo_int_map_get(zone->hash_lock_map, lock_key),
+ "unregistered hash lock must not be in the lock map");
+ }
+
+ /* Check that the lock is completely idle before it goes back to the pool. */
+ VDO_ASSERT_LOG_ONLY(!vdo_waitq_has_waiters(&lock->waiters),
+ "hash lock returned to zone must have no waiters");
+ VDO_ASSERT_LOG_ONLY((lock->duplicate_lock == NULL),
+ "hash lock returned to zone must not reference a PBN lock");
+ VDO_ASSERT_LOG_ONLY((lock->state == VDO_HASH_LOCK_BYPASSING),
+ "returned hash lock must not be in use with state %s",
+ get_hash_lock_state_name(lock->state));
+ VDO_ASSERT_LOG_ONLY(list_empty(&lock->pool_node),
+ "hash lock returned to zone must not be in a pool ring");
+ VDO_ASSERT_LOG_ONLY(list_empty(&lock->duplicate_ring),
+ "hash lock returned to zone must not reference DataVIOs");
+
+ return_hash_lock_to_pool(zone, lock);
+}
+
+/**
+ * transfer_allocation_lock() - Transfer a data_vio's downgraded allocation PBN lock to the
+ * data_vio's hash lock, converting it to a duplicate PBN lock.
+ * @data_vio: The data_vio holding the allocation lock to transfer.
+ *
+ * On return, the data_vio's allocation PBN has been reset to VDO_ZERO_BLOCK, and both the
+ * data_vio and its hash lock record the data_vio's new mapping as their duplicate location.
+ */
+static void transfer_allocation_lock(struct data_vio *data_vio)
+{
+ struct allocation *allocation = &data_vio->allocation;
+ struct hash_lock *hash_lock = data_vio->hash_lock;
+
+ VDO_ASSERT_LOG_ONLY(data_vio->new_mapped.pbn == allocation->pbn,
+ "transferred lock must be for the block written");
+
+ /* The data_vio no longer owns the allocated block. */
+ allocation->pbn = VDO_ZERO_BLOCK;
+
+ VDO_ASSERT_LOG_ONLY(vdo_is_pbn_read_lock(allocation->lock),
+ "must have downgraded the allocation lock before transfer");
+
+ hash_lock->duplicate = data_vio->new_mapped;
+ data_vio->duplicate = data_vio->new_mapped;
+
+ /*
+ * Since the lock is being transferred, the holder count doesn't change (and isn't even
+ * safe to examine on this thread).
+ */
+ hash_lock->duplicate_lock = vdo_forget(allocation->lock);
+}
+
+/**
+ * vdo_share_compressed_write_lock() - Make a data_vio's hash lock a shared holder of the PBN lock
+ * on the compressed block to which its data was just written.
+ * @data_vio: The data_vio which was just compressed.
+ * @pbn_lock: The PBN lock on the compressed block.
+ *
+ * If the lock is still a write lock (as it will be for the first share), it will be converted to a
+ * read lock. This also reserves a reference count increment for the data_vio.
+ *
+ * The data_vio and its hash lock both record the data_vio's new mapping as their duplicate
+ * location.
+ */
+void vdo_share_compressed_write_lock(struct data_vio *data_vio,
+ struct pbn_lock *pbn_lock)
+{
+ bool claimed;
+
+ VDO_ASSERT_LOG_ONLY(vdo_get_duplicate_lock(data_vio) == NULL,
+ "a duplicate PBN lock should not exist when writing");
+ VDO_ASSERT_LOG_ONLY(vdo_is_state_compressed(data_vio->new_mapped.state),
+ "lock transfer must be for a compressed write");
+ assert_data_vio_in_new_mapped_zone(data_vio);
+
+ /* First sharer downgrades the lock. */
+ if (!vdo_is_pbn_read_lock(pbn_lock))
+ vdo_downgrade_pbn_write_lock(pbn_lock, true);
+
+ /*
+ * Get a share of the PBN lock, ensuring it cannot be released until after this data_vio
+ * has had a chance to journal a reference.
+ */
+ data_vio->duplicate = data_vio->new_mapped;
+ data_vio->hash_lock->duplicate = data_vio->new_mapped;
+ set_duplicate_lock(data_vio->hash_lock, pbn_lock);
+
+ /*
+ * Claim a reference for this data_vio. Necessary since another hash_lock might start
+ * deduplicating against it before our incRef.
+ */
+ claimed = vdo_claim_pbn_lock_increment(pbn_lock);
+ VDO_ASSERT_LOG_ONLY(claimed, "impossible to fail to claim an initial increment");
+}
+
+/**
+ * start_uds_queue() - Start hook for the UDS dedupe work queue thread.
+ * @ptr: Unused.
+ *
+ * Registers the thread which owns the current work queue as an allocating thread. This is
+ * installed as the .start function of the UDS queue type in initialize_index(), and is undone by
+ * finish_uds_queue().
+ */
+static void start_uds_queue(void *ptr __always_unused)
+{
+	/*
+	 * Allow the UDS dedupe worker thread to do memory allocations. It will only do allocations
+	 * during the UDS calls that open or close an index, but those allocations can safely sleep
+	 * while reserving a large amount of memory. We could use an allocations_allowed boolean
+	 * (like the base threads do), but it would be an unnecessary embellishment.
+	 */
+	struct vdo_thread *thread = vdo_get_work_queue_owner(vdo_get_current_work_queue());
+
+	vdo_register_allocating_thread(&thread->allocating_thread, NULL);
+}
+
+/**
+ * finish_uds_queue() - Finish hook for the UDS dedupe work queue thread.
+ * @ptr: Unused.
+ *
+ * Unregisters the allocating thread registered by start_uds_queue(). This is installed as the
+ * .finish function of the UDS queue type in initialize_index().
+ */
+static void finish_uds_queue(void *ptr __always_unused)
+{
+ vdo_unregister_allocating_thread();
+}
+
+/**
+ * close_index() - Close the UDS index session.
+ * @zones: The hash zones whose index session is to be closed.
+ *
+ * Must be called with zones->lock held. The lock is dropped while uds_close_index() runs and is
+ * held again on return. A failure to close is logged and recorded in zones->error_flag; the index
+ * state is IS_CLOSED on return either way.
+ */
+static void close_index(struct hash_zones *zones)
+ __must_hold(&zones->lock)
+{
+ int result;
+
+ /*
+ * Change the index state so that get_index_statistics() will not try to use the index
+ * session we are closing.
+ */
+ zones->index_state = IS_CHANGING;
+ /* Close the index session, while not holding the lock. */
+ spin_unlock(&zones->lock);
+ result = uds_close_index(zones->index_session);
+
+ if (result != UDS_SUCCESS)
+ vdo_log_error_strerror(result, "Error closing index");
+ spin_lock(&zones->lock);
+ zones->index_state = IS_CLOSED;
+ /* The error flag is only ever set here, never cleared. */
+ zones->error_flag |= result != UDS_SUCCESS;
+ /* ASSERTION: We leave in IS_CLOSED state. */
+}
+
+/**
+ * open_index() - Open (load or create) the UDS index.
+ * @zones: The hash zones whose index is to be opened.
+ *
+ * Must be called with zones->lock held. The lock is dropped while uds_open_index() runs and is
+ * held again on return. The index is created if zones->create_flag was set on entry, and loaded
+ * otherwise. If a load fails with -ENOENT, create_flag is set so that the next call will create
+ * a new index. Any other failure sets error_flag and changes the index target to IS_CLOSED.
+ */
+static void open_index(struct hash_zones *zones)
+ __must_hold(&zones->lock)
+{
+ /* ASSERTION: We enter in IS_CLOSED state. */
+ int result;
+ bool create_flag = zones->create_flag;
+
+ /* The create request has been captured locally, so clear the shared flag. */
+ zones->create_flag = false;
+ /*
+ * Change the index state so that it will be reported to the outside world as
+ * "opening".
+ */
+ zones->index_state = IS_CHANGING;
+ zones->error_flag = false;
+
+ /* Open the index session, while not holding the lock */
+ spin_unlock(&zones->lock);
+ result = uds_open_index(create_flag ? UDS_CREATE : UDS_LOAD,
+ &zones->parameters, zones->index_session);
+ if (result != UDS_SUCCESS)
+ vdo_log_error_strerror(result, "Error opening index");
+
+ spin_lock(&zones->lock);
+ if (!create_flag) {
+ switch (result) {
+ case -ENOENT:
+ /*
+ * Either there is no index, or there is no way we can recover the index.
+ * We will be called again and try to create a new index.
+ */
+ zones->index_state = IS_CLOSED;
+ zones->create_flag = true;
+ return;
+ default:
+ break;
+ }
+ }
+ if (result == UDS_SUCCESS) {
+ zones->index_state = IS_OPENED;
+ } else {
+ zones->index_state = IS_CLOSED;
+ zones->index_target = IS_CLOSED;
+ zones->error_flag = true;
+ /* The lock is not held while logging. */
+ spin_unlock(&zones->lock);
+ vdo_log_info("Setting UDS index target state to error");
+ spin_lock(&zones->lock);
+ }
+ /*
+ * ASSERTION: On success, we leave in IS_OPENED state.
+ * ASSERTION: On failure, we leave in IS_CLOSED state.
+ */
+}
+
+/**
+ * change_dedupe_state() - Drive the UDS index towards its target state.
+ * @completion: The hash zones completion.
+ *
+ * While the hash zones are in a normal admin state, keep closing or opening the index until its
+ * state matches the target and no create request is outstanding, then clear the changing flag.
+ * This is the callback registered for the hash zones completion in initialize_index().
+ */
+static void change_dedupe_state(struct vdo_completion *completion)
+{
+	struct hash_zones *zones = as_hash_zones(completion);
+
+	spin_lock(&zones->lock);
+
+	for (;;) {
+		if (!vdo_is_state_normal(&zones->state))
+			break;
+
+		/* Done once the index is in the target state and the create flag is clear. */
+		if ((zones->index_state == zones->index_target) && !zones->create_flag)
+			break;
+
+		if (zones->index_state == IS_OPENED)
+			close_index(zones);
+		else
+			open_index(zones);
+	}
+
+	zones->changing = false;
+	spin_unlock(&zones->lock);
+}
+
+/**
+ * start_expiration_timer() - Arm the zone's query timer for a context, unless it is already
+ *                            armed.
+ * @context: The dedupe context whose submission time determines the expiration.
+ *
+ * Nothing happens unless the timer state can be moved from IDLE to RUNNING. The timer expires at
+ * the later of the context's submission time plus the index timeout, and the current time plus
+ * the minimum timer interval.
+ */
+static void start_expiration_timer(struct dedupe_context *context)
+{
+	struct hash_zone *zone = context->zone;
+	u64 submitted = context->submission_jiffies;
+
+	if (change_timer_state(zone, DEDUPE_QUERY_TIMER_IDLE, DEDUPE_QUERY_TIMER_RUNNING)) {
+		u64 end_time = max(submitted + vdo_dedupe_index_timeout_jiffies,
+				   jiffies + vdo_dedupe_index_min_timer_jiffies);
+
+		mod_timer(&zone->timer, end_time);
+	}
+}
+
+/**
+ * report_dedupe_timeouts() - Record and eventually report that some dedupe requests reached their
+ *                            expiration time without getting answers, so we timed them out.
+ * @zones: the hash zones.
+ * @timeouts: the number of newly timed out requests.
+ *
+ * The new timeouts are always added to the running total. A log message covering every timeout
+ * not yet reported is emitted only when the rate limiter allows it.
+ */
+static void report_dedupe_timeouts(struct hash_zones *zones, unsigned int timeouts)
+{
+	u64 pending_report;
+
+	atomic64_add(timeouts, &zones->timeouts);
+
+	spin_lock(&zones->lock);
+	if (!__ratelimit(&zones->ratelimiter)) {
+		spin_unlock(&zones->lock);
+		return;
+	}
+
+	/* Report everything accumulated since the last message which was actually logged. */
+	pending_report = atomic64_read(&zones->timeouts);
+	pending_report -= zones->reported_timeouts;
+	vdo_log_debug("UDS index timeout on %llu requests",
+		      (unsigned long long) pending_report);
+	zones->reported_timeouts += pending_report;
+	spin_unlock(&zones->lock);
+}
+
+/**
+ * initialize_index() - Set up the UDS index session, parameters, and dedupe thread.
+ * @vdo: The vdo which owns the hash zones.
+ * @zones: The hash zones to initialize.
+ *
+ * Fills in the UDS parameters from the vdo's geometry, creates the index session, makes the
+ * dedupe thread, and registers change_dedupe_state() as the hash zones completion callback on
+ * that thread. If the thread cannot be made, the index session is destroyed again.
+ *
+ * Return: VDO_SUCCESS or an error code.
+ */
+static int initialize_index(struct vdo *vdo, struct hash_zones *zones)
+{
+ int result;
+ off_t uds_offset;
+ struct volume_geometry geometry = vdo->geometry;
+ static const struct vdo_work_queue_type uds_queue_type = {
+ .start = start_uds_queue,
+ .finish = finish_uds_queue,
+ .max_priority = UDS_Q_MAX_PRIORITY,
+ .default_priority = UDS_Q_PRIORITY,
+ };
+
+ vdo_set_dedupe_index_timeout_interval(vdo_dedupe_index_timeout_interval);
+ vdo_set_dedupe_index_min_timer_interval(vdo_dedupe_index_min_timer_interval);
+
+ /*
+ * Since we will save up the timeouts that would have been reported but were ratelimited,
+ * we don't need to report ratelimiting.
+ */
+ ratelimit_default_init(&zones->ratelimiter);
+ ratelimit_set_flags(&zones->ratelimiter, RATELIMIT_MSG_ON_RELEASE);
+ /* The index offset is in bytes, counted from the geometry's bio_offset. */
+ uds_offset = ((vdo_get_index_region_start(geometry) -
+ geometry.bio_offset) * VDO_BLOCK_SIZE);
+ zones->parameters = (struct uds_parameters) {
+ .bdev = vdo->device_config->owned_device->bdev,
+ .offset = uds_offset,
+ .size = (vdo_get_index_region_size(geometry) * VDO_BLOCK_SIZE),
+ .memory_size = geometry.index_config.mem,
+ .sparse = geometry.index_config.sparse,
+ .nonce = (u64) geometry.nonce,
+ };
+
+ result = uds_create_index_session(&zones->index_session);
+ if (result != UDS_SUCCESS)
+ return result;
+
+ result = vdo_make_thread(vdo, vdo->thread_config.dedupe_thread, &uds_queue_type,
+ 1, NULL);
+ if (result != VDO_SUCCESS) {
+ /* Don't leave the session behind if its thread could not be made. */
+ uds_destroy_index_session(vdo_forget(zones->index_session));
+ vdo_log_error("UDS index queue initialization failed (%d)", result);
+ return result;
+ }
+
+ vdo_initialize_completion(&zones->completion, vdo, VDO_HASH_ZONES_COMPLETION);
+ vdo_set_completion_callback(&zones->completion, change_dedupe_state,
+ vdo->thread_config.dedupe_thread);
+ return VDO_SUCCESS;
+}
+
+/**
+ * finish_index_operation() - This is the UDS callback for index queries.
+ * @request: The uds request which has just completed.
+ *
+ * If the request's context is still pending, it is marked complete and its data_vio is continued.
+ * Otherwise the query is expected to have timed out, so the context is marked as complete after
+ * timing out and queued on its zone's timed_out_complete queue.
+ */
+static void finish_index_operation(struct uds_request *request)
+{
+	struct dedupe_context *context = container_of(request, struct dedupe_context,
+						      request);
+
+	if (!change_context_state(context, DEDUPE_CONTEXT_PENDING,
+				  DEDUPE_CONTEXT_COMPLETE)) {
+		/*
+		 * This query has timed out, so try to mark it complete and hence eligible for
+		 * reuse. Its data_vio has already moved on.
+		 */
+		if (!change_context_state(context, DEDUPE_CONTEXT_TIMED_OUT,
+					  DEDUPE_CONTEXT_TIMED_OUT_COMPLETE)) {
+			VDO_ASSERT_LOG_ONLY(false, "uds request was timed out (state %d)",
+					    atomic_read(&context->state));
+		}
+
+		vdo_funnel_queue_put(context->zone->timed_out_complete,
+				     &context->queue_entry);
+		return;
+	}
+
+	/*
+	 * This query has not timed out, so send its data_vio back to its hash zone to process
+	 * the results.
+	 */
+	continue_data_vio(context->requestor);
+}
+
+/**
+ * check_for_drain_complete() - Check whether this zone has drained.
+ * @zone: The zone to check.
+ *
+ * Does nothing unless the zone is draining. If a timeout is in flight, the drain is left for a
+ * later call to finish. Otherwise the timer is deleted, contexts on the timed_out_complete queue
+ * are returned to the available list, and the drain is finished.
+ */
+static void check_for_drain_complete(struct hash_zone *zone)
+{
+ data_vio_count_t recycled = 0;
+
+ if (!vdo_is_state_draining(&zone->state))
+ return;
+
+ /* Proceed if the timer is idle, or was running and has now been marked idle. */
+ if ((atomic_read(&zone->timer_state) == DEDUPE_QUERY_TIMER_IDLE) ||
+ change_timer_state(zone, DEDUPE_QUERY_TIMER_RUNNING,
+ DEDUPE_QUERY_TIMER_IDLE)) {
+ del_timer_sync(&zone->timer);
+ } else {
+ /*
+ * There is an in flight time-out, which must get processed before we can continue.
+ */
+ return;
+ }
+
+ /* Recycle every context whose timed-out request has since been completed by the index. */
+ for (;;) {
+ struct dedupe_context *context;
+ struct funnel_queue_entry *entry;
+
+ entry = vdo_funnel_queue_poll(zone->timed_out_complete);
+ if (entry == NULL)
+ break;
+
+ context = container_of(entry, struct dedupe_context, queue_entry);
+ atomic_set(&context->state, DEDUPE_CONTEXT_IDLE);
+ list_add(&context->list_entry, &zone->available);
+ recycled++;
+ }
+
+ if (recycled > 0)
+ WRITE_ONCE(zone->active, zone->active - recycled);
+ VDO_ASSERT_LOG_ONLY(READ_ONCE(zone->active) == 0, "all contexts inactive");
+ vdo_finish_draining(&zone->state);
+}
+
+/**
+ * timeout_index_operations_callback() - Time out pending index queries which have expired.
+ * @completion: The completion of the hash zone whose timer fired.
+ *
+ * Marks the timer idle, then walks the zone's pending list. Each context submitted before the
+ * cutoff which is still pending is marked timed out, removed from the list, and its requestor
+ * is continued. The walk stops at the first context which has not yet expired, for which the
+ * timer is restarted. This is the callback registered for the zone completion in
+ * initialize_zone().
+ */
+static void timeout_index_operations_callback(struct vdo_completion *completion)
+{
+ struct dedupe_context *context, *tmp;
+ struct hash_zone *zone = as_hash_zone(completion);
+ u64 timeout_jiffies = msecs_to_jiffies(vdo_dedupe_index_timeout_interval);
+ /* Queries submitted before this time have expired. */
+ unsigned long cutoff = jiffies - timeout_jiffies;
+ unsigned int timed_out = 0;
+
+ atomic_set(&zone->timer_state, DEDUPE_QUERY_TIMER_IDLE);
+ list_for_each_entry_safe(context, tmp, &zone->pending, list_entry) {
+ if (cutoff <= context->submission_jiffies) {
+ /*
+ * We have reached the oldest query which has not timed out yet, so restart
+ * the timer.
+ */
+ start_expiration_timer(context);
+ break;
+ }
+
+ if (!change_context_state(context, DEDUPE_CONTEXT_PENDING,
+ DEDUPE_CONTEXT_TIMED_OUT)) {
+ /*
+ * This context completed between the time the timeout fired, and now. We
+ * can treat it as a successful query, its requestor is already enqueued
+ * to process it.
+ */
+ continue;
+ }
+
+ /*
+ * Remove this context from the pending list so we won't look at it again on a
+ * subsequent timeout. Once the index completes it, it will be reused. Meanwhile,
+ * send its requestor on its way.
+ */
+ list_del_init(&context->list_entry);
+ continue_data_vio(context->requestor);
+ timed_out++;
+ }
+
+ if (timed_out > 0)
+ report_dedupe_timeouts(completion->vdo->hash_zones, timed_out);
+
+ /* The zone may have been waiting on this timeout in order to finish draining. */
+ check_for_drain_complete(zone);
+}
+
+/**
+ * timeout_index_operations() - Timer function for a hash zone's query timer.
+ * @t: The timer which fired.
+ *
+ * If the timer state can be moved from RUNNING to FIRED, launch the zone's completion to process
+ * the timeouts. Otherwise do nothing.
+ */
+static void timeout_index_operations(struct timer_list *t)
+{
+	struct hash_zone *zone = from_timer(zone, t, timer);
+
+	if (!change_timer_state(zone, DEDUPE_QUERY_TIMER_RUNNING, DEDUPE_QUERY_TIMER_FIRED))
+		return;
+
+	vdo_launch_completion(&zone->completion);
+}
+
+/**
+ * initialize_zone() - Initialize one hash zone.
+ * @vdo: The vdo which owns the hash zones.
+ * @zones: The hash zones containing the zone to initialize.
+ * @zone_number: The index of the zone to initialize.
+ *
+ * Creates the zone's hash lock map, lock pool, timed-out queue, timer, and dedupe contexts, and
+ * makes the zone's thread. Resources allocated before a failure are not released here.
+ *
+ * Return: VDO_SUCCESS or an error code.
+ */
+static int __must_check initialize_zone(struct vdo *vdo, struct hash_zones *zones,
+ zone_count_t zone_number)
+{
+ int result;
+ data_vio_count_t i;
+ struct hash_zone *zone = &zones->zones[zone_number];
+
+ result = vdo_int_map_create(VDO_LOCK_MAP_CAPACITY, &zone->hash_lock_map);
+ if (result != VDO_SUCCESS)
+ return result;
+
+ vdo_set_admin_state_code(&zone->state, VDO_ADMIN_STATE_NORMAL_OPERATION);
+ zone->zone_number = zone_number;
+ zone->thread_id = vdo->thread_config.hash_zone_threads[zone_number];
+ /* The zone's completion is used to process query timeouts on the zone's thread. */
+ vdo_initialize_completion(&zone->completion, vdo, VDO_HASH_ZONE_COMPLETION);
+ vdo_set_completion_callback(&zone->completion, timeout_index_operations_callback,
+ zone->thread_id);
+ INIT_LIST_HEAD(&zone->lock_pool);
+ result = vdo_allocate(LOCK_POOL_CAPACITY, struct hash_lock, "hash_lock array",
+ &zone->lock_array);
+ if (result != VDO_SUCCESS)
+ return result;
+
+ /* Put every lock in the array into the pool. */
+ for (i = 0; i < LOCK_POOL_CAPACITY; i++)
+ return_hash_lock_to_pool(zone, &zone->lock_array[i]);
+
+ INIT_LIST_HEAD(&zone->available);
+ INIT_LIST_HEAD(&zone->pending);
+ result = vdo_make_funnel_queue(&zone->timed_out_complete);
+ if (result != VDO_SUCCESS)
+ return result;
+
+ timer_setup(&zone->timer, timeout_index_operations, 0);
+
+ /* Every dedupe context starts out on the available list. */
+ for (i = 0; i < MAXIMUM_VDO_USER_VIOS; i++) {
+ struct dedupe_context *context = &zone->contexts[i];
+
+ context->zone = zone;
+ context->request.callback = finish_index_operation;
+ context->request.session = zones->index_session;
+ list_add(&context->list_entry, &zone->available);
+ }
+
+ return vdo_make_default_thread(vdo, zone->thread_id);
+}
+
+/**
+ * get_thread_id_for_zone() - Implements vdo_zone_thread_getter_fn.
+ * @context: The hash zones.
+ * @zone_number: The number of the hash zone whose thread id is wanted.
+ *
+ * Return: The thread id recorded for the given hash zone.
+ */
+static thread_id_t get_thread_id_for_zone(void *context, zone_count_t zone_number)
+{
+	const struct hash_zones *zones = context;
+	const struct hash_zone *zone = &zones->zones[zone_number];
+
+	return zone->thread_id;
+}
+
+/**
+ * vdo_make_hash_zones() - Create the hash zones.
+ *
+ * @vdo: The vdo to which the zone will belong.
+ * @zones_ptr: A pointer to hold the zones.
+ *
+ * If the vdo's thread configuration has no hash zones, this returns VDO_SUCCESS without storing
+ * anything through @zones_ptr. On any error, everything allocated here is freed and @zones_ptr is
+ * left untouched.
+ *
+ * Return: VDO_SUCCESS or an error code.
+ */
+int vdo_make_hash_zones(struct vdo *vdo, struct hash_zones **zones_ptr)
+{
+ int result;
+ struct hash_zones *zones;
+ zone_count_t z;
+ zone_count_t zone_count = vdo->thread_config.hash_zone_count;
+
+ if (zone_count == 0)
+ return VDO_SUCCESS;
+
+ result = vdo_allocate_extended(struct hash_zones, zone_count, struct hash_zone,
+ __func__, &zones);
+ if (result != VDO_SUCCESS)
+ return result;
+
+ result = initialize_index(vdo, zones);
+ if (result != VDO_SUCCESS) {
+ /* No zone has been initialized yet, so only the zones structure needs freeing. */
+ vdo_free(zones);
+ return result;
+ }
+
+ vdo_set_admin_state_code(&zones->state, VDO_ADMIN_STATE_NEW);
+
+ /* The zone count is set before the zones are initialized; vdo_free_hash_zones() uses it. */
+ zones->zone_count = zone_count;
+ for (z = 0; z < zone_count; z++) {
+ result = initialize_zone(vdo, zones, z);
+ if (result != VDO_SUCCESS) {
+ vdo_free_hash_zones(zones);
+ return result;
+ }
+ }
+
+ result = vdo_make_action_manager(zones->zone_count, get_thread_id_for_zone,
+ vdo->thread_config.admin_thread, zones, NULL,
+ vdo, &zones->manager);
+ if (result != VDO_SUCCESS) {
+ vdo_free_hash_zones(zones);
+ return result;
+ }
+
+ *zones_ptr = zones;
+ return VDO_SUCCESS;
+}
+
+/* Destroy the UDS index session of the given hash zones, if there are any zones. */
+void vdo_finish_dedupe_index(struct hash_zones *zones)
+{
+	if (zones != NULL)
+		uds_destroy_index_session(vdo_forget(zones->index_session));
+}
+
+/**
+ * vdo_free_hash_zones() - Free the hash zones.
+ * @zones: The zones to free; NULL is accepted and ignored.
+ */
+void vdo_free_hash_zones(struct hash_zones *zones)
+{
+	zone_count_t z;
+
+	if (zones == NULL)
+		return;
+
+	vdo_free(vdo_forget(zones->manager));
+
+	/* Release the per-zone resources: funnel queue, lock map, and lock array. */
+	for (z = 0; z < zones->zone_count; z++) {
+		vdo_free_funnel_queue(vdo_forget(zones->zones[z].timed_out_complete));
+		vdo_int_map_free(vdo_forget(zones->zones[z].hash_lock_map));
+		vdo_free(vdo_forget(zones->zones[z].lock_array));
+	}
+
+	/* Tear down the index session only if one is still attached. */
+	if (zones->index_session != NULL)
+		vdo_finish_dedupe_index(zones);
+
+	ratelimit_state_exit(&zones->ratelimiter);
+	vdo_free(zones);
+}
+
+/*
+ * Suspend the index session unless the index is closed, then finish draining. A suspend failure
+ * is logged but does not prevent the drain from finishing.
+ */
+static void initiate_suspend_index(struct admin_state *state)
+{
+	struct hash_zones *zones = container_of(state, struct hash_zones, state);
+	enum index_state current_state;
+	int result;
+
+	/* Sample the index state under the lock. */
+	spin_lock(&zones->lock);
+	current_state = zones->index_state;
+	spin_unlock(&zones->lock);
+
+	if (current_state == IS_CLOSED) {
+		vdo_finish_draining(state);
+		return;
+	}
+
+	result = uds_suspend_index_session(zones->index_session,
+					   vdo_is_state_saving(&zones->state));
+	if (result != UDS_SUCCESS)
+		vdo_log_error_strerror(result, "Error suspending dedupe index");
+
+	vdo_finish_draining(state);
+}
+
+/**
+ * suspend_index() - Suspend the UDS index prior to draining hash zones.
+ *
+ * Starts draining the hash_zones admin state using the action manager's current operation, with
+ * initiate_suspend_index() as the initiator.
+ *
+ * Implements vdo_action_preamble_fn
+ */
+static void suspend_index(void *context, struct vdo_completion *completion)
+{
+	struct hash_zones *hash_zones = context;
+
+	vdo_start_draining(&hash_zones->state,
+			   vdo_get_current_manager_operation(hash_zones->manager),
+			   completion, initiate_suspend_index);
+}
+
+/**
+ * initiate_drain() - Initiate a drain.
+ *
+ * Recovers the hash_zone which embeds the admin state and checks whether its drain is complete.
+ *
+ * Implements vdo_admin_initiator_fn.
+ */
+static void initiate_drain(struct admin_state *state)
+{
+	struct hash_zone *zone = container_of(state, struct hash_zone, state);
+
+	check_for_drain_complete(zone);
+}
+
+/**
+ * drain_hash_zone() - Drain a hash zone.
+ *
+ * Starts draining the selected zone's admin state using the action manager's current operation.
+ *
+ * Implements vdo_zone_action_fn.
+ */
+static void drain_hash_zone(void *context, zone_count_t zone_number,
+			    struct vdo_completion *parent)
+{
+	struct hash_zones *hash_zones = context;
+	struct hash_zone *zone = &hash_zones->zones[zone_number];
+
+	vdo_start_draining(&zone->state,
+			   vdo_get_current_manager_operation(hash_zones->manager),
+			   parent, initiate_drain);
+}
+
+/**
+ * vdo_drain_hash_zones() - Drain all hash zones.
+ * @zones: The hash zones to drain.
+ * @parent: The completion passed to the scheduled operation; its vdo supplies the suspend type.
+ *
+ * Schedules an operation on the zones' action manager with suspend_index() as the preamble and
+ * drain_hash_zone() as the per-zone action.
+ */
+void vdo_drain_hash_zones(struct hash_zones *zones, struct vdo_completion *parent)
+{
+ vdo_schedule_operation(zones->manager, parent->vdo->suspend_type, suspend_index,
+ drain_hash_zone, NULL, parent);
+}
+
+/*
+ * Launch the zones' completion to change the index state, if a change is both wanted and
+ * currently permitted. The caller holds zones->lock on entry, and it is still held on return.
+ */
+static void launch_dedupe_state_change(struct hash_zones *zones)
+	__must_hold(&zones->lock)
+{
+	bool change_wanted;
+
+	/* Either a change is already in progress, or changes are not allowed. */
+	if (zones->changing || !vdo_is_state_normal(&zones->state))
+		return;
+
+	change_wanted = zones->create_flag || (zones->index_state != zones->index_target);
+	if (!change_wanted)
+		return;
+
+	zones->changing = true;
+	vdo_launch_completion(&zones->completion);
+}
+
+/**
+ * resume_index() - Resume the UDS index prior to resuming hash zones.
+ *
+ * Records the current backing block device in the index parameters, resumes the index session on
+ * it (a failure is only logged), then, under zones->lock, resumes the hash_zones admin state and
+ * selects the index target from the device config's deduplication setting before requesting a
+ * state change. The parent completion is always finished.
+ *
+ * Implements vdo_action_preamble_fn
+ */
+static void resume_index(void *context, struct vdo_completion *parent)
+{
+ struct hash_zones *zones = context;
+ struct device_config *config = parent->vdo->device_config;
+ int result;
+
+ zones->parameters.bdev = config->owned_device->bdev;
+ result = uds_resume_index_session(zones->index_session, zones->parameters.bdev);
+ if (result != UDS_SUCCESS)
+ vdo_log_error_strerror(result, "Error resuming dedupe index");
+
+ spin_lock(&zones->lock);
+ vdo_resume_if_quiescent(&zones->state);
+
+ if (config->deduplication) {
+ zones->index_target = IS_OPENED;
+ WRITE_ONCE(zones->dedupe_flag, true);
+ } else {
+ zones->index_target = IS_CLOSED;
+ }
+
+ launch_dedupe_state_change(zones);
+ spin_unlock(&zones->lock);
+
+ vdo_finish_completion(parent);
+}
+
+/**
+ * resume_hash_zone() - Resume a hash zone.
+ *
+ * The result of resuming the zone's admin state is handed to the parent completion.
+ *
+ * Implements vdo_zone_action_fn.
+ */
+static void resume_hash_zone(void *context, zone_count_t zone_number,
+			     struct vdo_completion *parent)
+{
+	struct hash_zones *hash_zones = context;
+	struct hash_zone *zone = &hash_zones->zones[zone_number];
+	int result = vdo_resume_if_quiescent(&zone->state);
+
+	vdo_fail_completion(parent, result);
+}
+
+/**
+ * vdo_resume_hash_zones() - Resume a set of hash zones.
+ * @zones: The hash zones to resume.
+ * @parent: The object to notify when the zones have resumed.
+ *
+ * When the vdo is read-only no resume is scheduled and the parent is launched directly.
+ */
+void vdo_resume_hash_zones(struct hash_zones *zones, struct vdo_completion *parent)
+{
+	if (!vdo_is_read_only(parent->vdo)) {
+		vdo_schedule_operation(zones->manager, VDO_ADMIN_STATE_RESUMING,
+				       resume_index, resume_hash_zone, NULL, parent);
+		return;
+	}
+
+	vdo_launch_completion(parent);
+}
+
+/**
+ * get_hash_zone_statistics() - Add the statistics for this hash zone to the tally for all zones.
+ * @zone: The hash zone to query.
+ * @tally: The running totals; each counter is incremented, not overwritten.
+ */
+static void get_hash_zone_statistics(const struct hash_zone *zone,
+				     struct hash_lock_statistics *tally)
+{
+	tally->dedupe_advice_valid += READ_ONCE(zone->statistics.dedupe_advice_valid);
+	tally->dedupe_advice_stale += READ_ONCE(zone->statistics.dedupe_advice_stale);
+	tally->concurrent_data_matches +=
+		READ_ONCE(zone->statistics.concurrent_data_matches);
+	tally->concurrent_hash_collisions +=
+		READ_ONCE(zone->statistics.concurrent_hash_collisions);
+	/* The count of active contexts is reported as the current dedupe queries. */
+	tally->curr_dedupe_queries += READ_ONCE(zone->active);
+}
+
+/*
+ * Copy the UDS index session statistics into stats. Nothing is written when the index is not
+ * open or when the statistics cannot be read (the latter is logged).
+ */
+static void get_index_statistics(struct hash_zones *zones,
+				 struct index_statistics *stats)
+{
+	struct uds_index_stats uds_stats;
+	enum index_state current_state;
+	int result;
+
+	/* Sample the index state under the lock. */
+	spin_lock(&zones->lock);
+	current_state = zones->index_state;
+	spin_unlock(&zones->lock);
+
+	if (current_state != IS_OPENED)
+		return;
+
+	result = uds_get_index_session_stats(zones->index_session, &uds_stats);
+	if (result != UDS_SUCCESS) {
+		vdo_log_error_strerror(result, "Error reading index stats");
+		return;
+	}
+
+	stats->entries_indexed = uds_stats.entries_indexed;
+	stats->posts_found = uds_stats.posts_found;
+	stats->posts_not_found = uds_stats.posts_not_found;
+	stats->queries_found = uds_stats.queries_found;
+	stats->queries_not_found = uds_stats.queries_not_found;
+	stats->updates_found = uds_stats.updates_found;
+	stats->updates_not_found = uds_stats.updates_not_found;
+	stats->entries_discarded = uds_stats.entries_discarded;
+}
+
+/**
+ * vdo_get_dedupe_statistics() - Tally the statistics from all the hash zones and the UDS index.
+ * @zones: The hash zones to query.
+ * @stats: The statistics structure to fill in.
+ *
+ * The hash lock statistics of every zone are added into stats->hash_lock, the index statistics
+ * are copied into stats->index when they can be read, and stats->dedupe_advice_timeouts is set.
+ */
+void vdo_get_dedupe_statistics(struct hash_zones *zones, struct vdo_statistics *stats)
+{
+	zone_count_t zone;
+
+	for (zone = 0; zone < zones->zone_count; zone++)
+		get_hash_zone_statistics(&zones->zones[zone], &stats->hash_lock);
+
+	get_index_statistics(zones, &stats->index);
+
+	/*
+	 * zones->timeouts gives the number of timeouts, and dedupe_context_busy gives the number
+	 * of queries not made because of earlier timeouts.
+	 */
+	stats->dedupe_advice_timeouts =
+		(atomic64_read(&zones->timeouts) + atomic64_read(&zones->dedupe_context_busy));
+}
+
+/**
+ * vdo_select_hash_zone() - Select the hash zone responsible for locking a given record name.
+ * @zones: The hash_zones from which to select.
+ * @name: The record name.
+ *
+ * Return: The hash zone responsible for the record name.
+ */
+struct hash_zone *vdo_select_hash_zone(struct hash_zones *zones,
+				       const struct uds_record_name *name)
+{
+	/*
+	 * The first byte of the record name serves as the hash code. Eight bits of hash should
+	 * suffice since the number of hash zones is small.
+	 * TODO: Verify that the first byte is independent enough.
+	 */
+	u32 fragment = name->name[0];
+
+	/*
+	 * Treat the 8-bit fragment as a binary fraction and multiply it by the zone count. If the
+	 * hash is uniformly distributed over [0 .. 2^8-1], then (hash * count / 2^8) should be
+	 * uniformly distributed over [0 .. count-1]. The multiply and shift is much faster than a
+	 * divide (modulus) on X86 CPUs.
+	 */
+	u32 zone_index = (fragment * zones->zone_count) >> 8;
+
+	return &zones->zones[zone_index];
+}
+
+/**
+ * dump_hash_lock() - Dump a compact description of hash_lock to the log if the lock is not on the
+ *                    free list.
+ * @lock: The hash lock to dump.
+ */
+static void dump_hash_lock(const struct hash_lock *lock)
+{
+	const char *state_name;
+	char registration;
+
+	/* A lock whose pool_node is linked is on the free list; skip it. */
+	if (!list_empty(&lock->pool_node))
+		return;
+
+	/*
+	 * Necessarily cryptic since we can log a lot of these. First three chars of state is
+	 * unambiguous. 'U' indicates a lock not registered in the map.
+	 */
+	state_name = get_hash_lock_state_name(lock->state);
+	registration = lock->registered ? 'D' : 'U';
+	vdo_log_info(" hl %px: %3.3s %c%llu/%u rc=%u wc=%zu agt=%px",
+		     lock, state_name, registration,
+		     (unsigned long long) lock->duplicate.pbn,
+		     lock->duplicate.state, lock->reference_count,
+		     vdo_waitq_num_waiters(&lock->waiters), lock->agent);
+}
+
+/*
+ * Convert an index state to its display string. While the hash_zones admin state is not normal
+ * the suspended string is reported regardless of the index state.
+ */
+static const char *index_state_to_string(struct hash_zones *zones,
+					 enum index_state state)
+{
+	if (!vdo_is_state_normal(&zones->state))
+		return SUSPENDED;
+
+	if (state == IS_CLOSED)
+		return zones->error_flag ? ERROR : CLOSED;
+
+	if (state == IS_CHANGING)
+		return zones->index_target == IS_OPENED ? OPENING : CLOSING;
+
+	if (state == IS_OPENED)
+		return READ_ONCE(zones->dedupe_flag) ? ONLINE : OFFLINE;
+
+	return UNKNOWN;
+}
+
+/**
+ * dump_hash_zone() - Dump information about a hash zone to the log for debugging.
+ * @zone: The zone to dump.
+ */
+static void dump_hash_zone(const struct hash_zone *zone)
+{
+	data_vio_count_t lock_index;
+
+	/* Without a lock map only the absence of the map is reported. */
+	if (zone->hash_lock_map == NULL) {
+		vdo_log_info("struct hash_zone %u: NULL map", zone->zone_number);
+		return;
+	}
+
+	vdo_log_info("struct hash_zone %u: mapSize=%zu",
+		     zone->zone_number, vdo_int_map_size(zone->hash_lock_map));
+
+	/* Every lock in the array is offered; dump_hash_lock() skips the free ones. */
+	for (lock_index = 0; lock_index < LOCK_POOL_CAPACITY; lock_index++)
+		dump_hash_lock(&zone->lock_array[lock_index]);
+}
+
+/**
+ * vdo_dump_hash_zones() - Dump information about the hash zones to the log for debugging.
+ * @zones: The zones to dump.
+ */
+void vdo_dump_hash_zones(struct hash_zones *zones)
+{
+	const char *current_name;
+	const char *target_name = NULL;
+	zone_count_t z;
+
+	/* Capture both names under the lock; log after releasing it. */
+	spin_lock(&zones->lock);
+	current_name = index_state_to_string(zones, zones->index_state);
+	if (zones->changing)
+		target_name = index_state_to_string(zones, zones->index_target);
+	spin_unlock(&zones->lock);
+
+	vdo_log_info("UDS index: state: %s", current_name);
+	if (target_name != NULL)
+		vdo_log_info("UDS index: changing to state: %s", target_name);
+
+	for (z = 0; z < zones->zone_count; z++)
+		dump_hash_zone(&zones->zones[z]);
+}
+
+/*
+ * Set the dedupe index timeout interval from a value in milliseconds, clamping it to at most two
+ * minutes and to at least two jiffies. Both the millisecond and the jiffies forms are stored.
+ */
+void vdo_set_dedupe_index_timeout_interval(unsigned int value)
+{
+	/* Arbitrary maximum value is two minutes */
+	unsigned int clamped = (value > 120000) ? 120000 : value;
+	u64 timeout_jiffies = msecs_to_jiffies(clamped);
+
+	/* Arbitrary minimum value is 2 jiffies */
+	if (timeout_jiffies < 2) {
+		timeout_jiffies = 2;
+		clamped = jiffies_to_msecs(timeout_jiffies);
+	}
+
+	vdo_dedupe_index_timeout_interval = clamped;
+	vdo_dedupe_index_timeout_jiffies = timeout_jiffies;
+}
+
+/*
+ * Set the minimum timer interval from a value in milliseconds, clamping it to at most one second
+ * and to at least two jiffies. Both the millisecond and the jiffies forms are stored.
+ */
+void vdo_set_dedupe_index_min_timer_interval(unsigned int value)
+{
+	/* Arbitrary maximum value is one second */
+	unsigned int clamped = (value > 1000) ? 1000 : value;
+	u64 interval_jiffies = msecs_to_jiffies(clamped);
+
+	/* Arbitrary minimum value is 2 jiffies */
+	if (interval_jiffies < 2) {
+		interval_jiffies = 2;
+		clamped = jiffies_to_msecs(interval_jiffies);
+	}
+
+	vdo_dedupe_index_min_timer_interval = clamped;
+	vdo_dedupe_index_min_timer_jiffies = interval_jiffies;
+}
+
+/**
+ * acquire_context() - Acquire a dedupe context from a hash_zone if any are available.
+ * @zone: the hash zone
+ *
+ * The available list is tried first; when it is empty the timed_out_complete funnel queue is
+ * polled instead. The active count is only bumped for contexts taken from the available list.
+ *
+ * Return: A dedupe_context or NULL if none are available
+ */
+static struct dedupe_context * __must_check acquire_context(struct hash_zone *zone)
+{
+	struct dedupe_context *context;
+	struct funnel_queue_entry *entry;
+
+	assert_in_hash_zone(zone, __func__);
+
+	if (list_empty(&zone->available)) {
+		entry = vdo_funnel_queue_poll(zone->timed_out_complete);
+		if (entry == NULL)
+			return NULL;
+
+		return container_of(entry, struct dedupe_context, queue_entry);
+	}
+
+	WRITE_ONCE(zone->active, zone->active + 1);
+	context = list_first_entry(&zone->available, struct dedupe_context, list_entry);
+	list_del_init(&context->list_entry);
+	return context;
+}
+
+/*
+ * Fill in a UDS request for a data_vio. For UDS_POST and UDS_UPDATE the new metadata is encoded
+ * as: advice version byte, new_mapped state byte, then new_mapped pbn as a little-endian u64.
+ */
+static void prepare_uds_request(struct uds_request *request, struct data_vio *data_vio,
+				enum uds_request_type operation)
+{
+	struct uds_record_data *encoding;
+	size_t offset = 0;
+
+	request->record_name = data_vio->record_name;
+	request->type = operation;
+
+	/* Other operations carry no advice. */
+	if ((operation != UDS_POST) && (operation != UDS_UPDATE))
+		return;
+
+	encoding = &request->new_metadata;
+	encoding->data[offset++] = UDS_ADVICE_VERSION;
+	encoding->data[offset++] = data_vio->new_mapped.state;
+	put_unaligned_le64(data_vio->new_mapped.pbn, &encoding->data[offset]);
+	offset += sizeof(u64);
+	BUG_ON(offset != UDS_ADVICE_SIZE);
+}
+
+/*
+ * The index operation will inquire about data_vio.record_name, providing (if the operation is
+ * appropriate) advice from the data_vio's new_mapped fields. The advice found in the index (or
+ * NULL if none) will be returned via receive_data_vio_dedupe_advice(). dedupe_context.status is
+ * set to the return status code of any asynchronous index processing.
+ *
+ * If deduplication is switched off, or no dedupe context can be acquired, the data_vio is simply
+ * continued without consulting the index; the latter case bumps dedupe_context_busy.
+ */
+static void query_index(struct data_vio *data_vio, enum uds_request_type operation)
+{
+ int result;
+ struct dedupe_context *context;
+ struct vdo *vdo = vdo_from_data_vio(data_vio);
+ struct hash_zone *zone = data_vio->hash_zone;
+
+ assert_data_vio_in_hash_zone(data_vio);
+
+ if (!READ_ONCE(vdo->hash_zones->dedupe_flag)) {
+ continue_data_vio(data_vio);
+ return;
+ }
+
+ context = acquire_context(zone);
+ if (context == NULL) {
+ atomic64_inc(&vdo->hash_zones->dedupe_context_busy);
+ continue_data_vio(data_vio);
+ return;
+ }
+
+ data_vio->dedupe_context = context;
+ context->requestor = data_vio;
+ context->submission_jiffies = jiffies;
+ prepare_uds_request(&context->request, data_vio, operation);
+ atomic_set(&context->state, DEDUPE_CONTEXT_PENDING);
+ list_add_tail(&context->list_entry, &zone->pending);
+ start_expiration_timer(context);
+ result = uds_launch_request(&context->request);
+ if (result != UDS_SUCCESS) {
+ /* The launch failed synchronously: record the status and finish the operation here. */
+ context->request.status = result;
+ finish_index_operation(&context->request);
+ }
+}
+
+/*
+ * Set the target state of the index and request a state change, all under zones->lock.
+ *
+ * @zones: The hash zones whose index target is being set.
+ * @target: The new index target state.
+ * @change_dedupe: Whether dedupe_flag should be overwritten with @dedupe.
+ * @dedupe: The new dedupe_flag value; only used when @change_dedupe is set.
+ * @set_create: Whether to set create_flag (it is never cleared here).
+ *
+ * A message is logged when the target's display string changed. The comparison is on the
+ * pointers returned by index_state_to_string(), not on string contents.
+ */
+static void set_target_state(struct hash_zones *zones, enum index_state target,
+ bool change_dedupe, bool dedupe, bool set_create)
+{
+ const char *old_state, *new_state;
+
+ spin_lock(&zones->lock);
+ old_state = index_state_to_string(zones, zones->index_target);
+ if (change_dedupe)
+ WRITE_ONCE(zones->dedupe_flag, dedupe);
+
+ if (set_create)
+ zones->create_flag = true;
+
+ zones->index_target = target;
+ launch_dedupe_state_change(zones);
+ new_state = index_state_to_string(zones, zones->index_target);
+ spin_unlock(&zones->lock);
+
+ if (old_state != new_state)
+ vdo_log_info("Setting UDS index target state to %s", new_state);
+}
+
+/* Return the display string for the current index state, sampled under zones->lock. */
+const char *vdo_get_dedupe_index_state_name(struct hash_zones *zones)
+{
+	const char *name;
+
+	spin_lock(&zones->lock);
+	name = index_state_to_string(zones, zones->index_state);
+	spin_unlock(&zones->lock);
+
+	return name;
+}
+
+/*
+ * Handle a dmsetup message relevant to the index. The message name is matched without regard to
+ * case. Returns 0 when the message was recognized and -EINVAL otherwise.
+ */
+int vdo_message_dedupe_index(struct hash_zones *zones, const char *name)
+{
+	if (strcasecmp(name, "index-close") == 0) {
+		set_target_state(zones, IS_CLOSED, false, false, false);
+		return 0;
+	}
+
+	if (strcasecmp(name, "index-create") == 0) {
+		set_target_state(zones, IS_OPENED, false, false, true);
+		return 0;
+	}
+
+	if (strcasecmp(name, "index-disable") == 0) {
+		set_target_state(zones, IS_OPENED, true, false, false);
+		return 0;
+	}
+
+	if (strcasecmp(name, "index-enable") == 0) {
+		set_target_state(zones, IS_OPENED, true, true, false);
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+/* Put the hash_zones admin state into VDO_ADMIN_STATE_NORMAL_OPERATION. */
+void vdo_set_dedupe_state_normal(struct hash_zones *zones)
+{
+ vdo_set_admin_state_code(&zones->state, VDO_ADMIN_STATE_NORMAL_OPERATION);
+}
+
+/*
+ * If create_flag, create a new index without first attempting to load an existing index.
+ *
+ * Sets the index target to IS_OPENED and turns the dedupe flag on.
+ */
+void vdo_start_dedupe_index(struct hash_zones *zones, bool create_flag)
+{
+ set_target_state(zones, IS_OPENED, true, true, create_flag);
+}
diff --git a/drivers/md/dm-vdo/dedupe.h b/drivers/md/dm-vdo/dedupe.h
new file mode 100644
index 000000000000..9000d6f3eece
--- /dev/null
+++ b/drivers/md/dm-vdo/dedupe.h
@@ -0,0 +1,120 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#ifndef VDO_DEDUPE_H
+#define VDO_DEDUPE_H
+
+#include <linux/list.h>
+#include <linux/timer.h>
+
+#include "indexer.h"
+
+#include "admin-state.h"
+#include "constants.h"
+#include "statistics.h"
+#include "types.h"
+#include "wait-queue.h"
+
+/* The state needed for one query of the UDS index on behalf of a data_vio. */
+struct dedupe_context {
+ /* The hash zone which owns this context */
+ struct hash_zone *zone;
+ /* The UDS request launched for this query */
+ struct uds_request request;
+ /* Links the context on its zone's available or pending list */
+ struct list_head list_entry;
+ /* Entry used when the context is on the zone's timed_out_complete funnel queue */
+ struct funnel_queue_entry queue_entry;
+ /* The value of jiffies when the query was submitted */
+ u64 submission_jiffies;
+ /* The data_vio on whose behalf the query is made */
+ struct data_vio *requestor;
+ /* The context state, e.g. DEDUPE_CONTEXT_PENDING once a query has been prepared */
+ atomic_t state;
+};
+
+struct hash_lock;
+
+/* The per-zone state for hash locks and for the dedupe contexts used to query the index. */
+struct hash_zone {
+ /* Which hash zone this is */
+ zone_count_t zone_number;
+
+ /* The administrative state of the zone */
+ struct admin_state state;
+
+ /* The thread ID for this zone */
+ thread_id_t thread_id;
+
+ /* Mapping from record name fields to hash_locks */
+ struct int_map *hash_lock_map;
+
+ /* List containing all unused hash_locks */
+ struct list_head lock_pool;
+
+ /*
+ * Statistics shared by all hash locks in this zone. Only modified on the hash zone thread,
+ * but queried by other threads.
+ */
+ struct hash_lock_statistics statistics;
+
+ /* Array of all hash_locks */
+ struct hash_lock *lock_array;
+
+ /* These fields are used to manage the dedupe contexts */
+ /* Contexts ready to be acquired */
+ struct list_head available;
+ /* Contexts whose query has been submitted */
+ struct list_head pending;
+ /* Polled for a context when the available list is empty */
+ struct funnel_queue *timed_out_complete;
+ /* Timer whose callback is timeout_index_operations() */
+ struct timer_list timer;
+ struct vdo_completion completion;
+ /* Incremented when a context is taken from the available list; reported in statistics */
+ unsigned int active;
+ atomic_t timer_state;
+
+ /* The dedupe contexts for querying the index from this zone */
+ struct dedupe_context contexts[MAXIMUM_VDO_USER_VIOS];
+};
+
+struct hash_zones;
+
+/* Hash lock operations on data_vios. */
+struct pbn_lock * __must_check vdo_get_duplicate_lock(struct data_vio *data_vio);
+
+void vdo_acquire_hash_lock(struct vdo_completion *completion);
+void vdo_continue_hash_lock(struct vdo_completion *completion);
+void vdo_release_hash_lock(struct data_vio *data_vio);
+void vdo_clean_failed_hash_lock(struct data_vio *data_vio);
+void vdo_share_compressed_write_lock(struct data_vio *data_vio,
+ struct pbn_lock *pbn_lock);
+
+/* Create the hash zones; returns VDO_SUCCESS or an error code. */
+int __must_check vdo_make_hash_zones(struct vdo *vdo, struct hash_zones **zones_ptr);
+
+/* Free the hash zones; a NULL argument is ignored. */
+void vdo_free_hash_zones(struct hash_zones *zones);
+
+/* Drain all hash zones, suspending the index first. */
+void vdo_drain_hash_zones(struct hash_zones *zones, struct vdo_completion *parent);
+
+/* Tally the hash zone and index statistics into stats. */
+void vdo_get_dedupe_statistics(struct hash_zones *zones, struct vdo_statistics *stats);
+
+/* Select the hash zone responsible for a record name. */
+struct hash_zone * __must_check vdo_select_hash_zone(struct hash_zones *zones,
+ const struct uds_record_name *name);
+
+/* Log the index state and every hash zone for debugging. */
+void vdo_dump_hash_zones(struct hash_zones *zones);
+
+/* Get the display string for the current index state. */
+const char *vdo_get_dedupe_index_state_name(struct hash_zones *zones);
+
+/* NOTE(review): the definition of this function was not seen alongside the others; confirm. */
+u64 vdo_get_dedupe_index_timeout_count(struct hash_zones *zones);
+
+/* Handle an index-related message; returns 0 or -EINVAL for an unknown message. */
+int vdo_message_dedupe_index(struct hash_zones *zones, const char *name);
+
+void vdo_set_dedupe_state_normal(struct hash_zones *zones);
+
+void vdo_start_dedupe_index(struct hash_zones *zones, bool create_flag);
+
+void vdo_resume_hash_zones(struct hash_zones *zones, struct vdo_completion *parent);
+
+/* Destroy the index session; a NULL argument is ignored. */
+void vdo_finish_dedupe_index(struct hash_zones *zones);
+
+/* Interval (in milliseconds) from submission until switching to fast path and skipping UDS. */
+extern unsigned int vdo_dedupe_index_timeout_interval;
+
+/*
+ * Minimum time interval (in milliseconds) between timer invocations to check for requests waiting
+ * for UDS that should now time out.
+ */
+extern unsigned int vdo_dedupe_index_min_timer_interval;
+
+/* Setters for the two intervals above; values are in milliseconds and are clamped. */
+void vdo_set_dedupe_index_timeout_interval(unsigned int value);
+void vdo_set_dedupe_index_min_timer_interval(unsigned int value);
+
+#endif /* VDO_DEDUPE_H */
diff --git a/drivers/md/dm-vdo/dm-vdo-target.c b/drivers/md/dm-vdo/dm-vdo-target.c
new file mode 100644
index 000000000000..5a4b0a927f56
--- /dev/null
+++ b/drivers/md/dm-vdo/dm-vdo-target.c
@@ -0,0 +1,2910 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#include <linux/atomic.h>
+#include <linux/bitops.h>
+#include <linux/completion.h>
+#include <linux/delay.h>
+#include <linux/device-mapper.h>
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/spinlock.h>
+
+#include "admin-state.h"
+#include "block-map.h"
+#include "completion.h"
+#include "constants.h"
+#include "data-vio.h"
+#include "dedupe.h"
+#include "dump.h"
+#include "encodings.h"
+#include "errors.h"
+#include "flush.h"
+#include "io-submitter.h"
+#include "logger.h"
+#include "memory-alloc.h"
+#include "message-stats.h"
+#include "recovery-journal.h"
+#include "repair.h"
+#include "slab-depot.h"
+#include "status-codes.h"
+#include "string-utils.h"
+#include "thread-device.h"
+#include "thread-registry.h"
+#include "thread-utils.h"
+#include "types.h"
+#include "vdo.h"
+#include "vio.h"
+
+/*
+ * The phases of the administrative operations, grouped by operation: grow logical, grow physical,
+ * load, pre-load, prepare grow physical, resume, and suspend.
+ *
+ * ADMIN_PHASE_NAMES lists the same identifiers as strings in the same order; any change here must
+ * be mirrored there.
+ */
+enum admin_phases {
+ GROW_LOGICAL_PHASE_START,
+ GROW_LOGICAL_PHASE_GROW_BLOCK_MAP,
+ GROW_LOGICAL_PHASE_END,
+ GROW_LOGICAL_PHASE_ERROR,
+ GROW_PHYSICAL_PHASE_START,
+ GROW_PHYSICAL_PHASE_COPY_SUMMARY,
+ GROW_PHYSICAL_PHASE_UPDATE_COMPONENTS,
+ GROW_PHYSICAL_PHASE_USE_NEW_SLABS,
+ GROW_PHYSICAL_PHASE_END,
+ GROW_PHYSICAL_PHASE_ERROR,
+ LOAD_PHASE_START,
+ LOAD_PHASE_LOAD_DEPOT,
+ LOAD_PHASE_MAKE_DIRTY,
+ LOAD_PHASE_PREPARE_TO_ALLOCATE,
+ LOAD_PHASE_SCRUB_SLABS,
+ LOAD_PHASE_DATA_REDUCTION,
+ LOAD_PHASE_FINISHED,
+ LOAD_PHASE_DRAIN_JOURNAL,
+ LOAD_PHASE_WAIT_FOR_READ_ONLY,
+ PRE_LOAD_PHASE_START,
+ PRE_LOAD_PHASE_LOAD_COMPONENTS,
+ PRE_LOAD_PHASE_END,
+ PREPARE_GROW_PHYSICAL_PHASE_START,
+ RESUME_PHASE_START,
+ RESUME_PHASE_ALLOW_READ_ONLY_MODE,
+ RESUME_PHASE_DEDUPE,
+ RESUME_PHASE_DEPOT,
+ RESUME_PHASE_JOURNAL,
+ RESUME_PHASE_BLOCK_MAP,
+ RESUME_PHASE_LOGICAL_ZONES,
+ RESUME_PHASE_PACKER,
+ RESUME_PHASE_FLUSHER,
+ RESUME_PHASE_DATA_VIOS,
+ RESUME_PHASE_END,
+ SUSPEND_PHASE_START,
+ SUSPEND_PHASE_PACKER,
+ SUSPEND_PHASE_DATA_VIOS,
+ SUSPEND_PHASE_DEDUPE,
+ SUSPEND_PHASE_FLUSHES,
+ SUSPEND_PHASE_LOGICAL_ZONES,
+ SUSPEND_PHASE_BLOCK_MAP,
+ SUSPEND_PHASE_JOURNAL,
+ SUSPEND_PHASE_DEPOT,
+ SUSPEND_PHASE_READ_ONLY_WAIT,
+ SUSPEND_PHASE_WRITE_SUPER_BLOCK,
+ SUSPEND_PHASE_END,
+};
+
+/*
+ * The names of the admin phases, in the same order as enum admin_phases. Presumably indexed by an
+ * admin_phases value when reporting a phase; keep the two lists in step.
+ */
+static const char * const ADMIN_PHASE_NAMES[] = {
+ "GROW_LOGICAL_PHASE_START",
+ "GROW_LOGICAL_PHASE_GROW_BLOCK_MAP",
+ "GROW_LOGICAL_PHASE_END",
+ "GROW_LOGICAL_PHASE_ERROR",
+ "GROW_PHYSICAL_PHASE_START",
+ "GROW_PHYSICAL_PHASE_COPY_SUMMARY",
+ "GROW_PHYSICAL_PHASE_UPDATE_COMPONENTS",
+ "GROW_PHYSICAL_PHASE_USE_NEW_SLABS",
+ "GROW_PHYSICAL_PHASE_END",
+ "GROW_PHYSICAL_PHASE_ERROR",
+ "LOAD_PHASE_START",
+ "LOAD_PHASE_LOAD_DEPOT",
+ "LOAD_PHASE_MAKE_DIRTY",
+ "LOAD_PHASE_PREPARE_TO_ALLOCATE",
+ "LOAD_PHASE_SCRUB_SLABS",
+ "LOAD_PHASE_DATA_REDUCTION",
+ "LOAD_PHASE_FINISHED",
+ "LOAD_PHASE_DRAIN_JOURNAL",
+ "LOAD_PHASE_WAIT_FOR_READ_ONLY",
+ "PRE_LOAD_PHASE_START",
+ "PRE_LOAD_PHASE_LOAD_COMPONENTS",
+ "PRE_LOAD_PHASE_END",
+ "PREPARE_GROW_PHYSICAL_PHASE_START",
+ "RESUME_PHASE_START",
+ "RESUME_PHASE_ALLOW_READ_ONLY_MODE",
+ "RESUME_PHASE_DEDUPE",
+ "RESUME_PHASE_DEPOT",
+ "RESUME_PHASE_JOURNAL",
+ "RESUME_PHASE_BLOCK_MAP",
+ "RESUME_PHASE_LOGICAL_ZONES",
+ "RESUME_PHASE_PACKER",
+ "RESUME_PHASE_FLUSHER",
+ "RESUME_PHASE_DATA_VIOS",
+ "RESUME_PHASE_END",
+ "SUSPEND_PHASE_START",
+ "SUSPEND_PHASE_PACKER",
+ "SUSPEND_PHASE_DATA_VIOS",
+ "SUSPEND_PHASE_DEDUPE",
+ "SUSPEND_PHASE_FLUSHES",
+ "SUSPEND_PHASE_LOGICAL_ZONES",
+ "SUSPEND_PHASE_BLOCK_MAP",
+ "SUSPEND_PHASE_JOURNAL",
+ "SUSPEND_PHASE_DEPOT",
+ "SUSPEND_PHASE_READ_ONLY_WAIT",
+ "SUSPEND_PHASE_WRITE_SUPER_BLOCK",
+ "SUSPEND_PHASE_END",
+};
+
+/* If we bump this, update the arrays below */
+#define TABLE_VERSION 4
+
+/*
+ * arrays for handling different table versions
+ *
+ * REQUIRED_ARGC is indexed by table version (0 through TABLE_VERSION): an exact argument count
+ * for versions 0 and 1, and a minimum count for later versions.
+ */
+static const u8 REQUIRED_ARGC[] = { 10, 12, 9, 7, 6 };
+/*
+ * pool name no longer used. only here for verification of older versions
+ * NOTE(review): only three entries, presumably for versions 0 through 2; confirm at the use site.
+ */
+static const u8 POOL_NAME_ARG_INDEX[] = { 8, 10, 8 };
+
+/*
+ * Track in-use instance numbers using a flat bit array.
+ *
+ * O(n) run time isn't ideal, but if we have 1000 VDO devices in use simultaneously we still only
+ * need to scan 16 words, so it's not likely to be a big deal compared to other resource usage.
+ */
+
+/*
+ * This minimum size for the bit array creates a numbering space of 0-999, which allows
+ * successive starts of the same volume to have different instance numbers in any
+ * reasonably-sized test. Changing instances on restart allows vdoMonReport to detect that
+ * the ephemeral stats have reset to zero.
+ */
+#define BIT_COUNT_MINIMUM 1000
+/* Grow the bit array by this many bits when needed */
+#define BIT_COUNT_INCREMENT 100
+
+/*
+ * NOTE(review): the field meanings below are inferred from their names and the comments above;
+ * confirm against the code that uses the tracker.
+ */
+struct instance_tracker {
+ /* presumably the number of bits available in words */
+ unsigned int bit_count;
+ /* presumably the bit array itself */
+ unsigned long *words;
+ /* presumably the number of instance numbers in use */
+ unsigned int count;
+ /* presumably the next instance number to try */
+ unsigned int next;
+};
+
+static DEFINE_MUTEX(instances_lock);
+static struct instance_tracker instances;
+
+/**
+ * free_device_config() - Free a device config created by parse_device_config().
+ * @config: The config to free; NULL is accepted and ignored.
+ *
+ * Releases the owned device (if any), frees the parent device name and the original table
+ * string, and zeroes the structure before freeing it.
+ */
+static void free_device_config(struct device_config *config)
+{
+ if (config == NULL)
+ return;
+
+ if (config->owned_device != NULL)
+ dm_put_device(config->owning_target, config->owned_device);
+
+ vdo_free(config->parent_device_name);
+ vdo_free(config->original_string);
+
+ /* Reduce the chance a use-after-free (as in BZ 1669960) happens to work. */
+ memset(config, 0, sizeof(*config));
+ vdo_free(config);
+}
+
+/**
+ * get_version_number() - Decide the version number from argv.
+ *
+ * @argc: The number of table values.
+ * @argv: The array of table values.
+ * @error_ptr: A pointer to return an error string in.
+ * @version_ptr: A pointer to return the version.
+ *
+ * A warning is logged when the table version differs from TABLE_VERSION; that is not an error.
+ *
+ * Return: VDO_SUCCESS or an error code.
+ */
+static int get_version_number(int argc, char **argv, char **error_ptr,
+			      unsigned int *version_ptr)
+{
+	/* version, if it exists, is in a form of V<n> */
+	if (sscanf(argv[0], "V%u", version_ptr) == 1) {
+		if (*version_ptr < 1 || *version_ptr > TABLE_VERSION) {
+			*error_ptr = "Unknown version number detected";
+			return VDO_BAD_CONFIGURATION;
+		}
+	} else {
+		/* V0 actually has no version number in the table string */
+		*version_ptr = 0;
+	}
+
+	/*
+	 * V0 and V1 have no optional parameters. There will always be a parameter for thread
+	 * config, even if it's a "." to show it's an empty list.
+	 */
+	if (*version_ptr <= 1) {
+		if (argc != REQUIRED_ARGC[*version_ptr]) {
+			*error_ptr = "Incorrect number of arguments for version";
+			return VDO_BAD_CONFIGURATION;
+		}
+	} else if (argc < REQUIRED_ARGC[*version_ptr]) {
+		*error_ptr = "Incorrect number of arguments for version";
+		return VDO_BAD_CONFIGURATION;
+	}
+
+	if (*version_ptr != TABLE_VERSION) {
+		/* The tool version is an unsigned int, so it is printed with %u. */
+		vdo_log_warning("Detected version mismatch between kernel module and tools kernel: %d, tool: %u",
+				TABLE_VERSION, *version_ptr);
+		vdo_log_warning("Please consider upgrading management tools to match kernel.");
+	}
+	return VDO_SUCCESS;
+}
+
+/*
+ * Free every string in a NULL-terminated array of string pointers, and then the array itself.
+ * The array pointer itself must not be NULL.
+ */
+static void free_string_array(char **string_array)
+{
+	char **cursor;
+
+	for (cursor = string_array; *cursor != NULL; cursor++)
+		vdo_free(*cursor);
+
+	vdo_free(string_array);
+}
+
+/*
+ * Split the input string into substrings, separated at occurrences of the indicated character,
+ * returning a null-terminated list of string pointers.
+ *
+ * The string pointers and the pointer array itself should both be freed with vdo_free() when no
+ * longer needed. This can be done with free_string_array() (above) if the pointers in the array
+ * are not changed. Since the array and copied strings are allocated by this function, it may only
+ * be used in contexts where allocation is permitted.
+ *
+ * Empty substrings are not ignored; that is, returned substrings may be empty strings if the
+ * separator occurs twice in a row.
+ *
+ * Returns VDO_SUCCESS, or the allocation error; on error everything allocated so far is freed
+ * and *substring_array_ptr is not written.
+ */
+static int split_string(const char *string, char separator, char ***substring_array_ptr)
+{
+	unsigned int current_substring = 0, substring_count = 1;
+	const char *s;
+	char **substrings;
+	int result;
+	ptrdiff_t length;
+
+	/* One substring, plus one more for every separator found. */
+	for (s = string; *s != 0; s++) {
+		if (*s == separator)
+			substring_count++;
+	}
+
+	/* The extra slot holds the terminating NULL pointer. */
+	result = vdo_allocate(substring_count + 1, char *, "string-splitting array",
+			      &substrings);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	for (s = string; *s != 0; s++) {
+		if (*s == separator) {
+			/* Reuse the function-scope length rather than shadowing it. */
+			length = s - string;
+
+			result = vdo_allocate(length + 1, char, "split string",
+					      &substrings[current_substring]);
+			if (result != VDO_SUCCESS) {
+				free_string_array(substrings);
+				return result;
+			}
+			/*
+			 * Trailing NUL is already in place after allocation; deal with the zero or
+			 * more non-NUL bytes in the string.
+			 */
+			if (length > 0)
+				memcpy(substrings[current_substring], string, length);
+			/* The next substring starts just past the separator. */
+			string = s + 1;
+			current_substring++;
+			BUG_ON(current_substring >= substring_count);
+		}
+	}
+	/* Process final string, with no trailing separator. */
+	BUG_ON(current_substring != (substring_count - 1));
+	length = strlen(string);
+
+	result = vdo_allocate(length + 1, char, "split string",
+			      &substrings[current_substring]);
+	if (result != VDO_SUCCESS) {
+		free_string_array(substrings);
+		return result;
+	}
+	memcpy(substrings[current_substring], string, length);
+	current_substring++;
+	/* substrings[current_substring] is NULL already */
+	*substring_array_ptr = substrings;
+	return VDO_SUCCESS;
+}
+
+/*
+ * Join the input substrings into one string, joined with the indicated character, returning a
+ * string. array_length is a bound on the number of valid elements in substring_array, in case it
+ * is not NULL-terminated.
+ */
+static int join_strings(char **substring_array, size_t array_length, char separator,
+			char **string_ptr)
+{
+	size_t total_length = 0;
+	size_t i;
+	int result;
+	char *output, *cursor;
+
+	/* Each substring needs room for its bytes plus one separator (or the final NUL). */
+	for (i = 0; i < array_length; i++) {
+		if (substring_array[i] == NULL)
+			break;
+		total_length += strlen(substring_array[i]) + 1;
+	}
+
+	result = vdo_allocate(total_length, char, __func__, &output);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	cursor = output;
+	for (i = 0; i < array_length; i++) {
+		if (substring_array[i] == NULL)
+			break;
+		cursor = vdo_append_to_buffer(cursor, output + total_length, "%s",
+					      substring_array[i]);
+		*cursor = separator;
+		cursor++;
+	}
+
+	/* We output one too many separators; replace the last with a zero byte. */
+	if (cursor != output)
+		*(cursor - 1) = '\0';
+
+	*string_ptr = output;
+	return VDO_SUCCESS;
+}
+
+/**
+ * parse_bool() - Parse a two-valued option into a bool.
+ * @bool_str: The string value to convert to a bool.
+ * @true_str: The string value which should be converted to true.
+ * @false_str: The string value which should be converted to false.
+ * @bool_ptr: A pointer to return the bool value in; not written on error.
+ *
+ * Return: VDO_SUCCESS or an error if bool_str is neither true_str nor false_str.
+ */
+static inline int __must_check parse_bool(const char *bool_str, const char *true_str,
+					  const char *false_str, bool *bool_ptr)
+{
+	/* true_str is tried first, so it wins if both strings are equal. */
+	if (strcmp(bool_str, true_str) == 0) {
+		*bool_ptr = true;
+		return VDO_SUCCESS;
+	}
+
+	if (strcmp(bool_str, false_str) == 0) {
+		*bool_ptr = false;
+		return VDO_SUCCESS;
+	}
+
+	return VDO_BAD_CONFIGURATION;
+}
+
+/**
+ * process_one_thread_config_spec() - Apply one thread parameter to the thread count
+ *                                    configuration.
+ * @thread_param_type: The name of the thread parameter.
+ * @count: The value requested for that parameter.
+ * @config: The thread count configuration to update.
+ *
+ * A value outside the range allowed for a recognized parameter is logged and rejected with
+ * -EINVAL. An unrecognized parameter name is logged and ignored whatever its value is, so that a
+ * table line written for a different version of the tools is not rejected because of it.
+ *
+ * Return: VDO_SUCCESS or -EINVAL
+ */
+static int process_one_thread_config_spec(const char *thread_param_type,
+					  unsigned int count,
+					  struct thread_count_config *config)
+{
+	/* Handle limited thread parameters */
+	if (strcmp(thread_param_type, "bioRotationInterval") == 0) {
+		if (count == 0) {
+			vdo_log_error("thread config string error: 'bioRotationInterval' of at least 1 is required");
+			return -EINVAL;
+		} else if (count > VDO_BIO_ROTATION_INTERVAL_LIMIT) {
+			vdo_log_error("thread config string error: 'bioRotationInterval' cannot be higher than %d",
+				      VDO_BIO_ROTATION_INTERVAL_LIMIT);
+			return -EINVAL;
+		}
+		config->bio_rotation_interval = count;
+		return VDO_SUCCESS;
+	}
+	if (strcmp(thread_param_type, "logical") == 0) {
+		if (count > MAX_VDO_LOGICAL_ZONES) {
+			vdo_log_error("thread config string error: at most %d 'logical' threads are allowed",
+				      MAX_VDO_LOGICAL_ZONES);
+			return -EINVAL;
+		}
+		config->logical_zones = count;
+		return VDO_SUCCESS;
+	}
+	if (strcmp(thread_param_type, "physical") == 0) {
+		if (count > MAX_VDO_PHYSICAL_ZONES) {
+			vdo_log_error("thread config string error: at most %d 'physical' threads are allowed",
+				      MAX_VDO_PHYSICAL_ZONES);
+			return -EINVAL;
+		}
+		config->physical_zones = count;
+		return VDO_SUCCESS;
+	}
+
+	/*
+	 * Anything that is not one of the remaining known names is not ours to validate. Don't
+	 * fail, just log. This will handle version mismatches between user mode tools and kernel.
+	 * This must be decided before the generic thread limit is applied, or an unknown
+	 * parameter with a large value would be rejected as if it were a thread count.
+	 */
+	if ((strcmp(thread_param_type, "hash") != 0) &&
+	    (strcmp(thread_param_type, "cpu") != 0) &&
+	    (strcmp(thread_param_type, "ack") != 0) &&
+	    (strcmp(thread_param_type, "bio") != 0)) {
+		vdo_log_info("unknown thread parameter type \"%s\"", thread_param_type);
+		return VDO_SUCCESS;
+	}
+
+	/* Handle other thread count parameters */
+	if (count > MAXIMUM_VDO_THREADS) {
+		vdo_log_error("thread config string error: at most %d '%s' threads are allowed",
+			      MAXIMUM_VDO_THREADS, thread_param_type);
+		return -EINVAL;
+	}
+	if (strcmp(thread_param_type, "hash") == 0) {
+		config->hash_zones = count;
+		return VDO_SUCCESS;
+	}
+	if (strcmp(thread_param_type, "cpu") == 0) {
+		if (count == 0) {
+			vdo_log_error("thread config string error: at least one 'cpu' thread required");
+			return -EINVAL;
+		}
+		config->cpu_threads = count;
+		return VDO_SUCCESS;
+	}
+	if (strcmp(thread_param_type, "ack") == 0) {
+		config->bio_ack_threads = count;
+		return VDO_SUCCESS;
+	}
+
+	/* Only "bio" is left. */
+	if (count == 0) {
+		vdo_log_error("thread config string error: at least one 'bio' thread required");
+		return -EINVAL;
+	}
+	config->bio_threads = count;
+	return VDO_SUCCESS;
+}
+
+/**
+ * parse_one_thread_config_spec() - Parse a single "name=number" thread parameter and apply it to
+ *                                  the configuration.
+ * @spec: The thread parameter specification string.
+ * @config: The configuration data to be updated.
+ *
+ * The spec must split on '=' into exactly two fields, and the second must be an unsigned decimal
+ * integer. Problems are logged.
+ *
+ * Return: VDO_SUCCESS, -EINVAL for a malformed assignment, or the error from split_string(),
+ *         kstrtouint(), or process_one_thread_config_spec().
+ */
+static int parse_one_thread_config_spec(const char *spec,
+					struct thread_count_config *config)
+{
+	char **fields;
+	unsigned int count;
+	int result;
+
+	result = split_string(spec, '=', &fields);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	/* Exactly two fields are required: the name and the value. */
+	if (!fields[0] || !fields[1] || fields[2]) {
+		vdo_log_error("thread config string error: expected thread parameter assignment, saw \"%s\"",
+			      spec);
+		result = -EINVAL;
+		goto out;
+	}
+
+	result = kstrtouint(fields[1], 10, &count);
+	if (result != 0) {
+		vdo_log_error("thread config string error: integer value needed, found \"%s\"",
+			      fields[1]);
+		goto out;
+	}
+
+	result = process_one_thread_config_spec(fields[0], count, config);
+
+out:
+	free_string_array(fields);
+	return result;
+}
+
+/**
+ * parse_thread_config_string() - Parse a thread configuration string and update the counts and
+ *                                other parameters of the various types of threads to be created.
+ * @string: Thread parameter configuration string.
+ * @config: The thread configuration data to update.
+ *
+ * The string "." means there is nothing to parse. Otherwise the string holds comma-separated specs
+ * of the form "typename=number"; the supported type names are "cpu", "ack", "bio",
+ * "bioRotationInterval", "logical", "physical", and "hash".
+ *
+ * Parsing stops at the first spec which produces an error.
+ *
+ * This function can't set the "reason" value the caller wants to pass back, because we'd want to
+ * format it to say which field was invalid, and we can't allocate the "reason" strings
+ * dynamically. So if an error occurs, the details are logged and an error is passed back.
+ *
+ * Return: VDO_SUCCESS or -EINVAL or -ENOMEM
+ */
+static int parse_thread_config_string(const char *string,
+				      struct thread_count_config *config)
+{
+	char **specs;
+	char **spec;
+	int result;
+
+	if (strcmp(".", string) == 0)
+		return VDO_SUCCESS;
+
+	result = split_string(string, ',', &specs);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	for (spec = specs; (*spec != NULL) && (result == VDO_SUCCESS); spec++)
+		result = parse_one_thread_config_spec(*spec, config);
+
+	free_string_array(specs);
+	return result;
+}
+
+/**
+ * process_one_key_value_pair() - Process one component of an optional parameter string and update
+ *                                the configuration data structure.
+ * @key: The optional parameter key name.
+ * @value: The optional parameter value.
+ * @config: The configuration data structure to update.
+ *
+ * "maxDiscard" is handled here: it must be at least 1 and at most UINT_MAX / VDO_BLOCK_SIZE, and
+ * a value outside that range is logged and rejected with -EINVAL. Every other key is passed on to
+ * process_one_thread_config_spec(), which also deals with unknown key names.
+ *
+ * Return: VDO_SUCCESS or -EINVAL
+ */
+static int process_one_key_value_pair(const char *key, unsigned int value,
+				      struct device_config *config)
+{
+	/* Non thread optional parameters */
+	if (strcmp(key, "maxDiscard") == 0) {
+		if (value == 0) {
+			vdo_log_error("optional parameter error: at least one max discard block required");
+			return -EINVAL;
+		}
+		/* Max discard sectors in blkdev_issue_discard is UINT_MAX >> 9 */
+		if (value > (UINT_MAX / VDO_BLOCK_SIZE)) {
+			/* The limit is an unsigned quantity, so it is printed with %u. */
+			vdo_log_error("optional parameter error: at most %u max discard blocks are allowed",
+				      UINT_MAX / VDO_BLOCK_SIZE);
+			return -EINVAL;
+		}
+		config->max_discard_blocks = value;
+		return VDO_SUCCESS;
+	}
+	/* Handles unknown key names */
+	return process_one_thread_config_spec(key, value, &config->thread_counts);
+}
+
+/**
+ * parse_one_key_value_pair() - Parse one key/value pair and update the configuration data
+ *                              structure.
+ * @key: The optional key name.
+ * @value: The optional value.
+ * @config: The configuration data to be updated.
+ *
+ * "deduplication" and "compression" take the values "on" or "off". The value for any other key
+ * must be an unsigned decimal integer.
+ *
+ * Return: VDO_SUCCESS or error.
+ */
+static int parse_one_key_value_pair(const char *key, const char *value,
+				    struct device_config *config)
+{
+	bool *flag = NULL;
+	unsigned int number;
+	int result;
+
+	if (strcmp(key, "deduplication") == 0)
+		flag = &config->deduplication;
+	else if (strcmp(key, "compression") == 0)
+		flag = &config->compression;
+
+	if (flag != NULL)
+		return parse_bool(value, "on", "off", flag);
+
+	/* The remaining arguments must have integral values. */
+	result = kstrtouint(value, 10, &number);
+	if (result != 0) {
+		vdo_log_error("optional config string error: integer value needed, found \"%s\"",
+			      value);
+		return result;
+	}
+
+	return process_one_key_value_pair(key, number, config);
+}
+
+/**
+ * parse_key_value_pairs() - Parse all key/value pairs from a list of arguments.
+ * @argc: The total number of arguments in list.
+ * @argv: The list of key/value pairs.
+ * @config: The device configuration data to update.
+ *
+ * The arguments are consumed two at a time, a key followed by its value, until the count reaches
+ * zero. Parsing stops at the first pair which produces an error.
+ *
+ * This function can't set the "reason" value the caller wants to pass back, because we'd want to
+ * format it to say which field was invalid, and we can't allocate the "reason" strings
+ * dynamically. So if an error occurs, the details are logged and the error is returned.
+ *
+ * Return: VDO_SUCCESS or error
+ */
+static int parse_key_value_pairs(int argc, char **argv, struct device_config *config)
+{
+	int result;
+
+	for (; argc != 0; argc -= 2, argv += 2) {
+		result = parse_one_key_value_pair(argv[0], argv[1], config);
+		if (result != VDO_SUCCESS)
+			return result;
+	}
+
+	return VDO_SUCCESS;
+}
+
+/**
+ * parse_optional_arguments() - Parse the optional arguments of a table line.
+ * @arg_set: The structure holding the arguments to parse.
+ * @error_ptr: Pointer to a buffer to hold the error string.
+ * @config: Pointer to device configuration data to update.
+ *
+ * For V0/V1 configurations, there will only be one optional parameter; the thread configuration.
+ * The configuration string should contain one or more comma-separated specs of the form
+ * "typename=number"; the supported type names are "cpu", "ack", "bio", "bioRotationInterval",
+ * "logical", "physical", and "hash".
+ *
+ * For V2 configurations and beyond, there could be any number of arguments. They should contain
+ * one or more key/value pairs separated by a space.
+ *
+ * Any parsing failure is reported as VDO_BAD_CONFIGURATION, with a constant description stored in
+ * *error_ptr.
+ *
+ * Return: VDO_SUCCESS or error
+ */
+static int parse_optional_arguments(struct dm_arg_set *arg_set, char **error_ptr,
+				    struct device_config *config)
+{
+	switch (config->version) {
+	case 0:
+	case 1:
+		if (parse_thread_config_string(arg_set->argv[0],
+					       &config->thread_counts) != VDO_SUCCESS) {
+			*error_ptr = "Invalid thread-count configuration";
+			return VDO_BAD_CONFIGURATION;
+		}
+		return VDO_SUCCESS;
+
+	default:
+		break;
+	}
+
+	if ((arg_set->argc % 2) != 0) {
+		*error_ptr = "Odd number of optional arguments given but they should be <key> <value> pairs";
+		return VDO_BAD_CONFIGURATION;
+	}
+
+	if (parse_key_value_pairs(arg_set->argc, arg_set->argv, config) != VDO_SUCCESS) {
+		*error_ptr = "Invalid optional argument configuration";
+		return VDO_BAD_CONFIGURATION;
+	}
+
+	return VDO_SUCCESS;
+}
+
+/**
+ * handle_parse_error() - Clean up after a failure to parse a table line.
+ * @config: The config to free; parse_device_config() may pass NULL when nothing has been
+ *          allocated yet.
+ * @error_ptr: Where to store the description of the error.
+ * @error_str: A constant string describing the error, stored in *error_ptr after the config has
+ *             been freed.
+ */
+static void handle_parse_error(struct device_config *config, char **error_ptr,
+			       char *error_str)
+{
+	free_device_config(config);
+
+	*error_ptr = error_str;
+}
+
+/**
+ * parse_device_config() - Convert the dmsetup table into a struct device_config.
+ * @argc: The number of table values.
+ * @argv: The array of table values.
+ * @ti: The target structure for this table.
+ * @config_ptr: A pointer to return the allocated config.
+ *
+ * The table values are consumed in order: the version argument (skipped only when the parsed
+ * version is at least 1), the parent device name, the physical block count (version 1 and later),
+ * the logical block size ("512" or "4096"), the page cache size, and the block map maximum age.
+ * Positions which older table versions used for settings that are no longer used are skipped, and
+ * whatever remains is handed to parse_optional_arguments(). For version 0 table lines, the
+ * physical block count is computed from the size of the opened storage device instead.
+ *
+ * On every failure path the partially built config is released through handle_parse_error(),
+ * which also stores a constant message in ti->error; *config_ptr is only written on success.
+ *
+ * Return: VDO_SUCCESS or an error code.
+ */
+static int parse_device_config(int argc, char **argv, struct dm_target *ti,
+ struct device_config **config_ptr)
+{
+ bool enable_512e;
+ size_t logical_bytes = to_bytes(ti->len);
+ struct dm_arg_set arg_set;
+ char **error_ptr = &ti->error;
+ struct device_config *config = NULL;
+ int result;
+
+ if ((logical_bytes % VDO_BLOCK_SIZE) != 0) {
+ handle_parse_error(config, error_ptr,
+ "Logical size must be a multiple of 4096");
+ return VDO_BAD_CONFIGURATION;
+ }
+
+ if (argc == 0) {
+ handle_parse_error(config, error_ptr, "Incorrect number of arguments");
+ return VDO_BAD_CONFIGURATION;
+ }
+
+ result = vdo_allocate(1, struct device_config, "device_config", &config);
+ if (result != VDO_SUCCESS) {
+ handle_parse_error(config, error_ptr,
+ "Could not allocate config structure");
+ return VDO_BAD_CONFIGURATION;
+ }
+
+ config->owning_target = ti;
+ config->logical_blocks = logical_bytes / VDO_BLOCK_SIZE;
+ INIT_LIST_HEAD(&config->config_list);
+
+ /*
+ * Save the original string: the table values joined by single spaces. vdo_status() reports
+ * this string for STATUSTYPE_TABLE.
+ */
+ result = join_strings(argv, argc, ' ', &config->original_string);
+ if (result != VDO_SUCCESS) {
+ handle_parse_error(config, error_ptr, "Could not populate string");
+ return VDO_BAD_CONFIGURATION;
+ }
+
+ vdo_log_info("table line: %s", config->original_string);
+
+ config->thread_counts = (struct thread_count_config) {
+ .bio_ack_threads = 1,
+ .bio_threads = DEFAULT_VDO_BIO_SUBMIT_QUEUE_COUNT,
+ .bio_rotation_interval = DEFAULT_VDO_BIO_SUBMIT_QUEUE_ROTATE_INTERVAL,
+ .cpu_threads = 1,
+ .logical_zones = 0,
+ .physical_zones = 0,
+ .hash_zones = 0,
+ };
+ config->max_discard_blocks = 1;
+ config->deduplication = true;
+ config->compression = false;
+
+ arg_set.argc = argc;
+ arg_set.argv = argv;
+
+ result = get_version_number(argc, argv, error_ptr, &config->version);
+ if (result != VDO_SUCCESS) {
+ /* get_version_number sets error_ptr itself. */
+ handle_parse_error(config, error_ptr, *error_ptr);
+ return result;
+ }
+ /* Move the arg pointer forward only if the argument was there. */
+ if (config->version >= 1)
+ dm_shift_arg(&arg_set);
+
+ result = vdo_duplicate_string(dm_shift_arg(&arg_set), "parent device name",
+ &config->parent_device_name);
+ if (result != VDO_SUCCESS) {
+ handle_parse_error(config, error_ptr,
+ "Could not copy parent device name");
+ return VDO_BAD_CONFIGURATION;
+ }
+
+ /* Get the physical blocks, if known. */
+ if (config->version >= 1) {
+ result = kstrtoull(dm_shift_arg(&arg_set), 10, &config->physical_blocks);
+ if (result != VDO_SUCCESS) {
+ handle_parse_error(config, error_ptr,
+ "Invalid physical block count");
+ return VDO_BAD_CONFIGURATION;
+ }
+ }
+
+ /* Get the logical block size and validate; only "512" and "4096" are accepted. */
+ result = parse_bool(dm_shift_arg(&arg_set), "512", "4096", &enable_512e);
+ if (result != VDO_SUCCESS) {
+ handle_parse_error(config, error_ptr, "Invalid logical block size");
+ return VDO_BAD_CONFIGURATION;
+ }
+ config->logical_block_size = (enable_512e ? 512 : 4096);
+
+ /* Skip past the two no longer used read cache options. */
+ if (config->version <= 1)
+ dm_consume_args(&arg_set, 2);
+
+ /* Get the page cache size. */
+ result = kstrtouint(dm_shift_arg(&arg_set), 10, &config->cache_size);
+ if (result != VDO_SUCCESS) {
+ handle_parse_error(config, error_ptr,
+ "Invalid block map page cache size");
+ return VDO_BAD_CONFIGURATION;
+ }
+
+ /* Get the block map era length. */
+ result = kstrtouint(dm_shift_arg(&arg_set), 10, &config->block_map_maximum_age);
+ if (result != VDO_SUCCESS) {
+ handle_parse_error(config, error_ptr, "Invalid block map maximum age");
+ return VDO_BAD_CONFIGURATION;
+ }
+
+ /* Skip past the no longer used MD RAID5 optimization mode */
+ if (config->version <= 2)
+ dm_consume_args(&arg_set, 1);
+
+ /* Skip past the no longer used write policy setting */
+ if (config->version <= 3)
+ dm_consume_args(&arg_set, 1);
+
+ /* Skip past the no longer used pool name for older table lines */
+ if (config->version <= 2) {
+ /*
+ * Make sure the enum to get the pool name from argv directly is still in sync with
+ * the parsing of the table line.
+ */
+ if (&arg_set.argv[0] != &argv[POOL_NAME_ARG_INDEX[config->version]]) {
+ handle_parse_error(config, error_ptr,
+ "Pool name not in expected location");
+ return VDO_BAD_CONFIGURATION;
+ }
+ dm_shift_arg(&arg_set);
+ }
+
+ /* Get the optional arguments and validate. */
+ result = parse_optional_arguments(&arg_set, error_ptr, config);
+ if (result != VDO_SUCCESS) {
+ /* parse_optional_arguments sets error_ptr itself. */
+ handle_parse_error(config, error_ptr, *error_ptr);
+ return result;
+ }
+
+ /*
+ * Logical, physical, and hash zone counts can all be zero; then we get one thread doing
+ * everything, our older configuration. If any zone count is non-zero, the others must be
+ * as well.
+ */
+ if (((config->thread_counts.logical_zones == 0) !=
+ (config->thread_counts.physical_zones == 0)) ||
+ ((config->thread_counts.physical_zones == 0) !=
+ (config->thread_counts.hash_zones == 0))) {
+ handle_parse_error(config, error_ptr,
+ "Logical, physical, and hash zones counts must all be zero or all non-zero");
+ return VDO_BAD_CONFIGURATION;
+ }
+
+ if (config->cache_size <
+ (2 * MAXIMUM_VDO_USER_VIOS * config->thread_counts.logical_zones)) {
+ handle_parse_error(config, error_ptr,
+ "Insufficient block map cache for logical zones");
+ return VDO_BAD_CONFIGURATION;
+ }
+
+ result = dm_get_device(ti, config->parent_device_name,
+ dm_table_get_mode(ti->table), &config->owned_device);
+ if (result != 0) {
+ vdo_log_error("couldn't open device \"%s\": error %d",
+ config->parent_device_name, result);
+ handle_parse_error(config, error_ptr, "Unable to open storage device");
+ return VDO_BAD_CONFIGURATION;
+ }
+
+ if (config->version == 0) {
+ u64 device_size = i_size_read(config->owned_device->bdev->bd_inode);
+
+ config->physical_blocks = device_size / VDO_BLOCK_SIZE;
+ }
+
+ *config_ptr = config;
+ return result;
+}
+
+/* Get the vdo recorded in the device_config which is stored as the target's private data. */
+static struct vdo *get_vdo_for_target(struct dm_target *ti)
+{
+	struct device_config *config = ti->private;
+
+	return config->vdo;
+}
+
+
+/*
+ * Accept a bio for the target. Every bio is counted in the bios_in statistics. Flush requests
+ * (REQ_OP_FLUSH, or any bio with REQ_PREFLUSH set) are handed to vdo_launch_flush(); everything
+ * else is launched on the data_vio pool. DM_MAPIO_SUBMITTED is returned in both cases.
+ */
+static int vdo_map_bio(struct dm_target *ti, struct bio *bio)
+{
+	struct vdo *vdo = get_vdo_for_target(ti);
+	const struct admin_state_code *code = vdo_get_admin_state_code(&vdo->admin.state);
+	struct vdo_work_queue *queue;
+	bool is_flush;
+
+	VDO_ASSERT_LOG_ONLY(code->normal, "vdo should not receive bios while in state %s",
+			    code->name);
+
+	/* Count all incoming bios. */
+	vdo_count_bios(&vdo->stats.bios_in, bio);
+
+	/* Handle empty bios. Empty flush bios are not associated with a vio. */
+	is_flush = (bio_op(bio) == REQ_OP_FLUSH) || ((bio->bi_opf & REQ_PREFLUSH) != 0);
+	if (is_flush) {
+		vdo_launch_flush(vdo, bio);
+		return DM_MAPIO_SUBMITTED;
+	}
+
+	/*
+	 * A bio arriving on a work queue which belongs to this same vdo could deadlock, so that
+	 * case is treated as a fatal bug.
+	 */
+	queue = vdo_get_current_work_queue();
+	BUG_ON((queue != NULL) && (vdo == vdo_get_work_queue_owner(queue)->vdo));
+
+	vdo_launch_bio(vdo->data_vio_pool, bio);
+	return DM_MAPIO_SUBMITTED;
+}
+
+/*
+ * Fill in the queue limits for the target: the configured logical block size, a 4k physical block
+ * size, 4k minimum and optimal I/O sizes, and the discard limits.
+ */
+static void vdo_io_hints(struct dm_target *ti, struct queue_limits *limits)
+{
+	struct device_config *config = get_vdo_for_target(ti)->device_config;
+
+	limits->logical_block_size = config->logical_block_size;
+	limits->physical_block_size = VDO_BLOCK_SIZE;
+
+	/* The minimum io size for random io */
+	blk_limits_io_min(limits, VDO_BLOCK_SIZE);
+	/* The optimal io size for streamed/sequential io */
+	blk_limits_io_opt(limits, VDO_BLOCK_SIZE);
+
+	/*
+	 * Set the largest discard which will be passed into VDO, using the value given on the
+	 * table line at dmsetup create time.
+	 *
+	 * On HD systems, 1024 is the largest usable value: a 2048 sector discard takes 31 seconds
+	 * on a busy HD system, while 1024 takes 15 to 16 seconds. Even so, large values lead to
+	 * 120 second blocked task warnings in the kernel logs, so to avoid those warnings we
+	 * choose to use the smallest reasonable value.
+	 *
+	 * dm-thin uses this value to decide whether to pass discards down, and when it is set the
+	 * block layer splits large discards on this boundary.
+	 */
+	limits->max_discard_sectors = (config->max_discard_blocks * VDO_SECTORS_PER_BLOCK);
+
+	/*
+	 * Declare a 4k discard granularity so that discards never begin or end with a partial
+	 * block.
+	 */
+	limits->discard_granularity = VDO_BLOCK_SIZE;
+}
+
+/*
+ * Invoke the callout on the single underlying device of the target, describing it as starting at
+ * sector 0 and covering the configured number of physical blocks.
+ */
+static int vdo_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn,
+			       void *data)
+{
+	struct vdo *vdo = get_vdo_for_target(ti);
+	struct device_config *config = vdo->device_config;
+
+	return fn(ti, config->owned_device, 0,
+		  config->physical_blocks * VDO_SECTORS_PER_BLOCK, data);
+}
+
+/*
+ * Report the status of the target.
+ *
+ * For STATUSTYPE_INFO, the status line is:
+ *    <device> <operating mode> <in recovery> <index state> <compression state>
+ *    <used physical blocks> <total physical blocks>
+ *
+ * For STATUSTYPE_TABLE, the original table line is reported, and for STATUSTYPE_IMA the result is
+ * an empty string.
+ */
+static void vdo_status(struct dm_target *ti, status_type_t status_type,
+		       unsigned int status_flags, char *result, unsigned int maxlen)
+{
+	struct vdo *vdo = get_vdo_for_target(ti);
+	struct vdo_statistics *stats;
+	struct device_config *device_config;
+	/* N.B.: The DMEMIT macro uses the variables named "sz", "result", "maxlen". */
+	int sz = 0;
+
+	switch (status_type) {
+	case STATUSTYPE_INFO:
+		/* Report info for dmsetup status; the shared stats buffer needs the mutex. */
+		mutex_lock(&vdo->stats_mutex);
+		stats = &vdo->stats_buffer;
+		vdo_fetch_statistics(vdo, stats);
+
+		DMEMIT("/dev/%pg %s %s %s %s %llu %llu",
+		       vdo_get_backing_device(vdo), stats->mode,
+		       stats->in_recovery_mode ? "recovering" : "-",
+		       vdo_get_dedupe_index_state_name(vdo->hash_zones),
+		       vdo_get_compressing(vdo) ? "online" : "offline",
+		       stats->data_blocks_used + stats->overhead_blocks_used,
+		       stats->physical_blocks);
+		mutex_unlock(&vdo->stats_mutex);
+		break;
+
+	case STATUSTYPE_TABLE:
+		/* Report the string actually specified in the beginning. */
+		device_config = ti->private;
+		DMEMIT("%s", device_config->original_string);
+		break;
+
+	case STATUSTYPE_IMA:
+		/* FIXME: We ought to be more detailed here, but this is what thin does. */
+		result[0] = '\0';
+		break;
+	}
+}
+
+/* Get the size of the backing device of a vdo, in units of VDO_BLOCK_SIZE (rounded down). */
+static block_count_t __must_check get_underlying_device_block_count(const struct vdo *vdo)
+{
+	return (i_size_read(vdo_get_backing_device(vdo)->bd_inode) / VDO_BLOCK_SIZE);
+}
+
+/*
+ * Process a message which must not run in parallel with other such messages. The only message
+ * handled here is "compression on" or "compression off" (case-insensitive); anything else is
+ * logged and rejected with -EINVAL.
+ */
+static int __must_check process_vdo_message_locked(struct vdo *vdo, unsigned int argc,
+						   char **argv)
+{
+	if ((argc != 2) || (strcasecmp(argv[0], "compression") != 0)) {
+		vdo_log_warning("unrecognized dmsetup message '%s' received", argv[0]);
+		return -EINVAL;
+	}
+
+	if (strcasecmp(argv[1], "on") == 0) {
+		vdo_set_compressing(vdo, true);
+		return 0;
+	}
+
+	if (strcasecmp(argv[1], "off") == 0) {
+		vdo_set_compressing(vdo, false);
+		return 0;
+	}
+
+	vdo_log_warning("invalid argument '%s' to dmsetup compression message",
+			argv[1]);
+	return -EINVAL;
+}
+
+/*
+ * Dispatch a dmsetup message. Dump, dump-on-shutdown, and index messages are processed
+ * immediately. Any other message is processed only if no other such message is in progress.
+ * Returns -EBUSY if another message is being processed.
+ */
+static int __must_check process_vdo_message(struct vdo *vdo, unsigned int argc,
+					    char **argv)
+{
+	const char *command = argv[0];
+	int result;
+
+	/*
+	 * Messages which may run in parallel with other messages are all handled before the atomic
+	 * check below. Messages which need to be exclusive belong in
+	 * process_vdo_message_locked().
+	 */
+
+	/* Dump messages should always be processed */
+	if (strcasecmp(command, "dump") == 0)
+		return vdo_dump(vdo, argc, argv, "dmsetup message");
+
+	if (argc == 1) {
+		if (strcasecmp(command, "dump-on-shutdown") == 0) {
+			vdo->dump_on_shutdown = true;
+			return 0;
+		}
+
+		/* Index messages should always be processed */
+		if ((strcasecmp(command, "index-close") == 0) ||
+		    (strcasecmp(command, "index-create") == 0) ||
+		    (strcasecmp(command, "index-disable") == 0) ||
+		    (strcasecmp(command, "index-enable") == 0))
+			return vdo_message_dedupe_index(vdo->hash_zones, command);
+	}
+
+	/* Claim the right to process an exclusive message. */
+	if (atomic_cmpxchg(&vdo->processing_message, 0, 1) != 0)
+		return -EBUSY;
+
+	result = process_vdo_message_locked(vdo, argc, argv);
+
+	/* Pairs with the implicit barrier in cmpxchg just above */
+	smp_wmb();
+	atomic_set(&vdo->processing_message, 0);
+	return result;
+}
+
+/*
+ * Handle a dmsetup message for the target. A "stats" message writes the statistics into
+ * result_buffer and returns 1; every other message goes through process_vdo_message(), and its
+ * result is converted with vdo_status_to_errno().
+ */
+static int vdo_message(struct dm_target *ti, unsigned int argc, char **argv,
+		       char *result_buffer, unsigned int maxlen)
+{
+	struct registered_thread allocating_thread, instance_thread;
+	struct vdo *vdo;
+	int result;
+
+	if (argc == 0) {
+		vdo_log_warning("unspecified dmsetup message");
+		return -EINVAL;
+	}
+
+	vdo = get_vdo_for_target(ti);
+	vdo_register_allocating_thread(&allocating_thread, NULL);
+	vdo_register_thread_device_id(&instance_thread, &vdo->instance);
+
+	if ((argc != 1) || (strcasecmp(argv[0], "stats") != 0)) {
+		result = vdo_status_to_errno(process_vdo_message(vdo, argc, argv));
+	} else {
+		/*
+		 * Stats are handled here so that the return code is not mapped. The code in
+		 * dm-ioctl expects a 1 for a return code to look at the buffer and see if it is
+		 * full or not.
+		 */
+		vdo_write_stats(vdo, result_buffer, maxlen);
+		result = 1;
+	}
+
+	vdo_unregister_thread_device_id();
+	vdo_unregister_allocating_thread();
+	return result;
+}
+
+/*
+ * Declare what the target supports: discards and flushes, with one bio for each, and a maximum
+ * I/O length of one vdo block.
+ */
+static void configure_target_capabilities(struct dm_target *ti)
+{
+	ti->discards_supported = true;
+	ti->num_discard_bios = 1;
+	ti->flush_supported = true;
+	ti->num_flush_bios = 1;
+
+	/*
+	 * If this value changes, please make sure to update the value for max_discard_sectors
+	 * accordingly.
+	 */
+	BUG_ON(dm_set_target_max_io_len(ti, VDO_SECTORS_PER_BLOCK) != 0);
+}
+
+/*
+ * Check whether a vdo's backing device is the same device as the one owned by the device_config
+ * supplied as the context.
+ *
+ * Implements vdo_filter_fn.
+ */
+static bool vdo_uses_device(struct vdo *vdo, const void *context)
+{
+	const struct device_config *config = context;
+	dev_t config_dev = config->owned_device->bdev->bd_dev;
+	dev_t vdo_dev = vdo_get_backing_device(vdo)->bd_dev;
+
+	return vdo_dev == config_dev;
+}
+
+/**
+ * get_thread_id_for_phase() - Get the thread id for the current phase of the admin operation in
+ *                             progress.
+ * @vdo: The vdo whose admin phase is examined.
+ *
+ * Return: The packer, cpu, or journal thread for the phases which name them below, and the admin
+ *         thread for every other phase.
+ */
+static thread_id_t __must_check get_thread_id_for_phase(struct vdo *vdo)
+{
+	thread_id_t thread_id;
+
+	switch (vdo->admin.phase) {
+	case RESUME_PHASE_PACKER:
+	case RESUME_PHASE_FLUSHER:
+	case SUSPEND_PHASE_PACKER:
+	case SUSPEND_PHASE_FLUSHES:
+		thread_id = vdo->thread_config.packer_thread;
+		break;
+
+	case RESUME_PHASE_DATA_VIOS:
+	case SUSPEND_PHASE_DATA_VIOS:
+		thread_id = vdo->thread_config.cpu_thread;
+		break;
+
+	case LOAD_PHASE_DRAIN_JOURNAL:
+	case RESUME_PHASE_JOURNAL:
+	case SUSPEND_PHASE_JOURNAL:
+		thread_id = vdo->thread_config.journal_thread;
+		break;
+
+	default:
+		thread_id = vdo->thread_config.admin_thread;
+		break;
+	}
+
+	return thread_id;
+}
+
+/*
+ * Set up the admin completion of a vdo to run the given callback and error handler on the thread
+ * for the current admin phase, with requeueing requested. Returns the admin completion.
+ */
+static struct vdo_completion *prepare_admin_completion(struct vdo *vdo,
+						       vdo_action_fn callback,
+						       vdo_action_fn error_handler)
+{
+	struct vdo_administrator *admin = &vdo->admin;
+
+	/*
+	 * The fields are set by hand rather than with vdo_prepare_completion_for_requeue()
+	 * because any error already in the completion must not be reset.
+	 */
+	admin->completion.callback_thread_id = get_thread_id_for_phase(vdo);
+	admin->completion.callback = callback;
+	admin->completion.error_handler = error_handler;
+	admin->completion.requeue = true;
+
+	return &admin->completion;
+}
+
+/**
+ * advance_phase() - Increment the phase of the current admin operation and prepare the admin
+ *                   completion to run on the thread for the next phase.
+ * @vdo: The vdo on which an admin operation is being performed.
+ *
+ * Return: The phase which was current before it was incremented.
+ */
+static u32 advance_phase(struct vdo *vdo)
+{
+	struct vdo_completion *completion = &vdo->admin.completion;
+	u32 phase = vdo->admin.phase;
+
+	vdo->admin.phase++;
+
+	/* The thread is chosen based on the phase which has just been entered. */
+	completion->callback_thread_id = get_thread_id_for_phase(vdo);
+	completion->requeue = true;
+	return phase;
+}
+
+/*
+ * Perform an administrative operation (load, suspend, grow logical, or grow physical). This method
+ * should not be called from vdo threads.
+ *
+ * Only one operation may run at a time; if one is already in progress, an error is logged and
+ * VDO_COMPONENT_BUSY is passed to vdo_log_error_strerror(), whose result is returned. Otherwise
+ * the admin completion is launched at the starting phase, the caller waits for callback_sync to
+ * be completed, and the result of the admin completion is returned.
+ */
+static int perform_admin_operation(struct vdo *vdo, u32 starting_phase,
+				   vdo_action_fn callback, vdo_action_fn error_handler,
+				   const char *type)
+{
+	struct vdo_administrator *admin = &vdo->admin;
+	struct vdo_completion *completion;
+	int result;
+
+	if (atomic_cmpxchg(&admin->busy, 0, 1) != 0) {
+		return vdo_log_error_strerror(VDO_COMPONENT_BUSY,
+					      "Can't start %s operation, another operation is already in progress",
+					      type);
+	}
+
+	admin->phase = starting_phase;
+	reinit_completion(&admin->callback_sync);
+	vdo_reset_completion(&admin->completion);
+	completion = prepare_admin_completion(vdo, callback, error_handler);
+	vdo_launch_completion(completion);
+
+	/*
+	 * Using the "interruptible" interface means that Linux will not log a message when we wait
+	 * for more than 120 seconds. However, if we get a signal in a user-mode process, we could
+	 * spin, so sleep briefly before each retry.
+	 */
+	while (wait_for_completion_interruptible(&admin->callback_sync) != 0)
+		fsleep(1000);
+
+	result = admin->completion.result;
+
+	/* pairs with implicit barrier in cmpxchg above */
+	smp_wmb();
+	atomic_set(&admin->busy, 0);
+	return result;
+}
+
+/*
+ * Log an assertion failure if the calling thread is not the thread on which the current admin
+ * phase is supposed to run. The "what" string identifies the caller in the message.
+ */
+static void assert_admin_phase_thread(struct vdo *vdo, const char *what)
+{
+	VDO_ASSERT_LOG_ONLY(vdo_get_callback_thread_id() == get_thread_id_for_phase(vdo),
+			    "%s on correct thread for %s",
+			    what, ADMIN_PHASE_NAMES[vdo->admin.phase]);
+}
+
+/**
+ * finish_operation_callback() - Callback to finish an admin operation.
+ * @completion: The admin_completion.
+ *
+ * Finishes the operation on the admin state with the result of the completion, then completes
+ * callback_sync, which perform_admin_operation() waits on.
+ */
+static void finish_operation_callback(struct vdo_completion *completion)
+{
+	struct vdo *vdo = completion->vdo;
+
+	vdo_finish_operation(&vdo->admin.state, completion->result);
+	complete(&vdo->admin.callback_sync);
+}
+
+/**
+ * decode_from_super_block() - Decode the VDO state from the super block and validate that it is
+ *                             correct.
+ * @vdo: The vdo being loaded.
+ *
+ * The decoded state becomes both the current state and the load state of the vdo, and once the
+ * component states have been validated, the decoded layout is copied into the vdo.
+ *
+ * On error from this method, the component states must be destroyed explicitly. If this method
+ * returns successfully, the component states must not be destroyed.
+ *
+ * Return: VDO_SUCCESS or an error.
+ */
+static int __must_check decode_from_super_block(struct vdo *vdo)
+{
+	const struct device_config *config = vdo->device_config;
+	int result;
+
+	result = vdo_decode_component_states(vdo->super_block.buffer, &vdo->geometry,
+					     &vdo->states);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	vdo->load_state = vdo->states.vdo.state;
+	vdo_set_state(vdo, vdo->states.vdo.state);
+
+	/*
+	 * A logical size in the device config which is larger than the one recorded in the super
+	 * block is simply accepted.
+	 */
+	if (config->logical_blocks > vdo->states.vdo.config.logical_blocks) {
+		vdo_log_warning("Growing logical size: a logical size of %llu blocks was specified, but that differs from the %llu blocks configured in the vdo super block",
+				(unsigned long long) config->logical_blocks,
+				(unsigned long long) vdo->states.vdo.config.logical_blocks);
+		vdo->states.vdo.config.logical_blocks = config->logical_blocks;
+	}
+
+	result = vdo_validate_component_states(&vdo->states, vdo->geometry.nonce,
+					       config->physical_blocks,
+					       config->logical_blocks);
+	if (result == VDO_SUCCESS)
+		vdo->layout = vdo->states.layout;
+
+	return result;
+}
+
+/**
+ * decode_vdo() - Decode the component data portion of a super block and fill in the corresponding
+ *                portions of the vdo being loaded.
+ * @vdo: The vdo being loaded.
+ *
+ * After the super block has been decoded, the configured block map maximum age is checked against
+ * the recovery journal length, read-only entry is enabled, and then the recovery journal, slab
+ * depot, block map, physical zones, logical zones, and hash zones are made, in that order. The
+ * first failure ends the sequence. The component states are destroyed here only when decoding the
+ * super block fails.
+ *
+ * Return: VDO_SUCCESS or an error.
+ */
+static int __must_check decode_vdo(struct vdo *vdo)
+{
+	block_count_t max_age, age_limit;
+	struct partition *partition;
+	int result;
+
+	result = decode_from_super_block(vdo);
+	if (result != VDO_SUCCESS) {
+		vdo_destroy_component_states(&vdo->states);
+		return result;
+	}
+
+	/* The maximum age may be at most half of the recovery journal length. */
+	max_age = vdo_convert_maximum_age(vdo->device_config->block_map_maximum_age);
+	age_limit =
+		vdo_get_recovery_journal_length(vdo->states.vdo.config.recovery_journal_size) / 2;
+	if (max_age > age_limit) {
+		return vdo_log_error_strerror(VDO_BAD_CONFIGURATION,
+					      "maximum age: %llu exceeds limit %llu",
+					      (unsigned long long) max_age,
+					      (unsigned long long) age_limit);
+	}
+
+	if (max_age == 0) {
+		return vdo_log_error_strerror(VDO_BAD_CONFIGURATION,
+					      "maximum age must be greater than 0");
+	}
+
+	result = vdo_enable_read_only_entry(vdo);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	partition = vdo_get_known_partition(&vdo->layout,
+					    VDO_RECOVERY_JOURNAL_PARTITION);
+	result = vdo_decode_recovery_journal(vdo->states.recovery_journal,
+					     vdo->states.vdo.nonce, vdo, partition,
+					     vdo->states.vdo.complete_recoveries,
+					     vdo->states.vdo.config.recovery_journal_size,
+					     &vdo->recovery_journal);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	partition = vdo_get_known_partition(&vdo->layout, VDO_SLAB_SUMMARY_PARTITION);
+	result = vdo_decode_slab_depot(vdo->states.slab_depot, vdo, partition,
+				       &vdo->depot);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	result = vdo_decode_block_map(vdo->states.block_map,
+				      vdo->states.vdo.config.logical_blocks, vdo,
+				      vdo->recovery_journal, vdo->states.vdo.nonce,
+				      vdo->device_config->cache_size, max_age,
+				      &vdo->block_map);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	result = vdo_make_physical_zones(vdo, &vdo->physical_zones);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	/* The logical zones depend on the physical zones already existing. */
+	result = vdo_make_logical_zones(vdo, &vdo->logical_zones);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	return vdo_make_hash_zones(vdo, &vdo->hash_zones);
+}
+
+/**
+ * pre_load_callback() - Admin callback which steps through the phases of a pre-load.
+ * @completion: The admin completion.
+ *
+ * Handed to perform_admin_operation() by vdo_initialize(). Each invocation handles the phase
+ * reported by advance_phase().
+ */
+static void pre_load_callback(struct vdo_completion *completion)
+{
+	struct vdo *vdo = completion->vdo;
+	int status;
+
+	assert_admin_phase_thread(vdo, __func__);
+
+	switch (advance_phase(vdo)) {
+	case PRE_LOAD_PHASE_START:
+		status = vdo_start_operation(&vdo->admin.state,
+					     VDO_ADMIN_STATE_PRE_LOADING);
+		if (status == VDO_SUCCESS)
+			vdo_load_super_block(vdo, completion);
+		else
+			vdo_continue_completion(completion, status);
+		return;
+
+	case PRE_LOAD_PHASE_LOAD_COMPONENTS:
+		/* Decode the components and continue with whatever status that produced. */
+		vdo_continue_completion(completion, decode_vdo(vdo));
+		return;
+
+	case PRE_LOAD_PHASE_END:
+		break;
+
+	default:
+		vdo_set_completion_result(completion, UDS_BAD_STATE);
+	}
+
+	finish_operation_callback(completion);
+}
+
+/*
+ * Return an instance number to the pool. Takes the instances lock itself. A number which is out
+ * of range or not currently allocated is reported through an assertion and otherwise ignored.
+ */
+static void release_instance(unsigned int instance)
+{
+	mutex_lock(&instances_lock);
+
+	if (instance >= instances.bit_count) {
+		VDO_ASSERT_LOG_ONLY(false,
+				    "instance number %u must be less than bit count %u",
+				    instance, instances.bit_count);
+		goto unlock;
+	}
+
+	if (test_bit(instance, instances.words) == 0) {
+		VDO_ASSERT_LOG_ONLY(false, "instance number %u must be allocated", instance);
+		goto unlock;
+	}
+
+	__clear_bit(instance, instances.words);
+	instances.count--;
+
+unlock:
+	mutex_unlock(&instances_lock);
+}
+
+/*
+ * Attach a config to a vdo and to a target: cross-link the config and the vdo, move the config
+ * to the tail of the vdo's config list, and then configure the target's capabilities.
+ */
+static void set_device_config(struct dm_target *ti, struct vdo *vdo,
+			      struct device_config *config)
+{
+	config->vdo = vdo;
+	ti->private = config;
+
+	/* Unlink the config from any list it is on before appending it to this vdo's list. */
+	list_del_init(&config->config_list);
+	list_add_tail(&config->config_list, &vdo->device_config_list);
+
+	configure_target_capabilities(ti);
+}
+
+/*
+ * Create the vdo for a new target and run its pre-load admin operation. On success the config is
+ * attached to both the target and the new vdo; on failure the vdo is destroyed, ti->error
+ * describes the problem, and a VDO status code is returned.
+ */
+static int vdo_initialize(struct dm_target *ti, unsigned int instance,
+			  struct device_config *config)
+{
+	struct vdo *vdo;
+	int status;
+	u64 physical_block_bytes = VDO_BLOCK_SIZE;
+	u64 logical_bytes = to_bytes(ti->len);
+	block_count_t logical_block_count = logical_bytes / physical_block_bytes;
+
+	vdo_log_info("loading device '%s'", vdo_get_device_name(ti));
+	vdo_log_debug("Logical block size = %llu", (u64) config->logical_block_size);
+	vdo_log_debug("Logical blocks = %llu", logical_block_count);
+	vdo_log_debug("Physical block size = %llu", (u64) physical_block_bytes);
+	vdo_log_debug("Physical blocks = %llu", config->physical_blocks);
+	vdo_log_debug("Block map cache blocks = %u", config->cache_size);
+	vdo_log_debug("Block map maximum age = %u", config->block_map_maximum_age);
+	vdo_log_debug("Deduplication = %s", (config->deduplication ? "on" : "off"));
+	vdo_log_debug("Compression = %s", (config->compression ? "on" : "off"));
+
+	/* Refuse to start if some existing vdo matches this config under vdo_uses_device. */
+	vdo = vdo_find_matching(vdo_uses_device, config);
+	if (vdo != NULL) {
+		vdo_log_error("Existing vdo already uses device %s",
+			      vdo->device_config->parent_device_name);
+		ti->error = "Cannot share storage device with already-running VDO";
+		return VDO_BAD_CONFIGURATION;
+	}
+
+	status = vdo_make(instance, config, &ti->error, &vdo);
+	if (status != VDO_SUCCESS) {
+		vdo_log_error("Could not create VDO device. (VDO error %d, message %s)",
+			      status, ti->error);
+		goto destroy;
+	}
+
+	status = perform_admin_operation(vdo, PRE_LOAD_PHASE_START, pre_load_callback,
+					 finish_operation_callback, "pre-load");
+	if (status != VDO_SUCCESS) {
+		if (status == VDO_INVALID_ADMIN_STATE)
+			ti->error = "Pre-load is only valid immediately after initialization";
+		else
+			ti->error = "Cannot load metadata from device";
+
+		vdo_log_error("Could not start VDO device. (VDO error %d, message %s)",
+			      status, ti->error);
+		goto destroy;
+	}
+
+	set_device_config(ti, vdo, config);
+	vdo->device_config = config;
+	return VDO_SUCCESS;
+
+destroy:
+	vdo_destroy(vdo);
+	return status;
+}
+
+/*
+ * Implements vdo_filter_fn: matches a vdo whose owning target's device name equals the string
+ * passed as the context.
+ */
+static bool __must_check vdo_is_named(struct vdo *vdo, const void *context)
+{
+	const char *wanted_name = context;
+	const char *actual_name = vdo_get_device_name(vdo->device_config->owning_target);
+
+	return strcmp(actual_name, wanted_name) == 0;
+}
+
+/**
+ * get_bit_array_size() - Compute the byte size of an array of unsigned longs large enough to
+ *                        hold a bit array of the given capacity.
+ * @bit_count: The number of bits the array must hold.
+ *
+ * Return: the number of bytes needed for the array representation.
+ */
+static size_t get_bit_array_size(unsigned int bit_count)
+{
+	/* Whole words needed, rounding any partial word up. */
+	size_t word_count = BITS_TO_LONGS(bit_count);
+
+	return word_count * sizeof(unsigned long);
+}
+
+/**
+ * grow_bit_array() - Re-allocate the bitmap word array so that there will be more instance
+ *                    numbers available to allocate.
+ *
+ * Because the array starts out NULL, the first call also serves to initialize it.
+ *
+ * Return: VDO_SUCCESS or an error code from the allocation
+ */
+static int grow_bit_array(void)
+{
+	unsigned long *grown_words;
+	unsigned int grown_bit_count = max(instances.bit_count + BIT_COUNT_INCREMENT,
+					   (unsigned int) BIT_COUNT_MINIMUM);
+	int status = vdo_reallocate_memory(instances.words,
+					   get_bit_array_size(instances.bit_count),
+					   get_bit_array_size(grown_bit_count),
+					   "instance number bit array", &grown_words);
+
+	if (status != VDO_SUCCESS)
+		return status;
+
+	/* Only publish the new array and capacity once the reallocation has succeeded. */
+	instances.words = grown_words;
+	instances.bit_count = grown_bit_count;
+	return VDO_SUCCESS;
+}
+
+/**
+ * allocate_instance() - Allocate an instance number.
+ * @instance_ptr: A pointer to hold the instance number
+ *
+ * The caller must hold the instances lock.
+ *
+ * Return: VDO_SUCCESS or an error code
+ */
+static int allocate_instance(unsigned int *instance_ptr)
+{
+	unsigned int instance;
+	int status;
+
+	/* Every bit is in use, so make room for more before searching. */
+	if (instances.count >= instances.bit_count) {
+		status = grow_bit_array();
+		if (status != VDO_SUCCESS)
+			return status;
+	}
+
+	/* A free bit now exists; begin the search just past the most recent allocation. */
+	instance = find_next_zero_bit(instances.words, instances.bit_count,
+				      instances.next);
+	if (instance >= instances.bit_count) {
+		/* The tail of the array is full, so start over from instance zero. */
+		instance = find_first_zero_bit(instances.words, instances.bit_count);
+		status = VDO_ASSERT(instance < instances.bit_count,
+				    "impossibly, no zero bit found");
+		if (status != VDO_SUCCESS)
+			return status;
+	}
+
+	__set_bit(instance, instances.words);
+	instances.count++;
+	instances.next = instance + 1;
+
+	*instance_ptr = instance;
+	return VDO_SUCCESS;
+}
+
+/*
+ * Parse the target's table line and initialize a new vdo from it. On either failure the
+ * instance number is released here and a negative errno is returned.
+ */
+static int construct_new_vdo_registered(struct dm_target *ti, unsigned int argc,
+					char **argv, unsigned int instance)
+{
+	struct device_config *config;
+	int status = parse_device_config(argc, argv, ti, &config);
+
+	if (status != VDO_SUCCESS) {
+		vdo_log_error_strerror(status, "parsing failed: %s", ti->error);
+		release_instance(instance);
+		return -EINVAL;
+	}
+
+	status = vdo_initialize(ti, instance, config);
+	if (status == VDO_SUCCESS)
+		return VDO_SUCCESS;
+
+	/* Initialization failed: give back the instance number and the parsed config. */
+	release_instance(instance);
+	free_device_config(config);
+	return vdo_status_to_errno(status);
+}
+
+/*
+ * Allocate an instance number for a new vdo, then construct the vdo with that instance
+ * registered as the calling thread's device id.
+ */
+static int construct_new_vdo(struct dm_target *ti, unsigned int argc, char **argv)
+{
+	struct registered_thread instance_thread;
+	unsigned int instance;
+	int status;
+
+	/* allocate_instance() must be called with the instances lock held. */
+	mutex_lock(&instances_lock);
+	status = allocate_instance(&instance);
+	mutex_unlock(&instances_lock);
+	if (status != VDO_SUCCESS)
+		return -ENOMEM;
+
+	vdo_register_thread_device_id(&instance_thread, &instance);
+	status = construct_new_vdo_registered(ti, argc, argv, instance);
+	vdo_unregister_thread_device_id();
+
+	return status;
+}
+
+/**
+ * check_may_grow_physical() - Admin callback which checks that the vdo is neither read-only nor
+ *                             in recovery mode, used by prepare_to_grow_physical().
+ * @completion: The admin completion.
+ */
+static void check_may_grow_physical(struct vdo_completion *completion)
+{
+	struct vdo *vdo = completion->vdo;
+
+	assert_admin_phase_thread(vdo, __func__);
+
+	/* Both conditions may only be examined on a vdo thread. */
+	if (vdo_is_read_only(vdo)) {
+		vdo_set_completion_result(completion, VDO_READ_ONLY);
+	}
+
+	if (vdo_in_recovery_mode(vdo)) {
+		vdo_set_completion_result(completion, VDO_RETRY_AFTER_REBUILD);
+	}
+
+	finish_operation_callback(completion);
+}
+
+/* Look up a known partition in a layout and return its block count. */
+static block_count_t get_partition_size(struct layout *layout, enum partition_id id)
+{
+	struct partition *known = vdo_get_known_partition(layout, id);
+
+	return known->count;
+}
+
+/**
+ * grow_layout() - Build the layout which a growing vdo will adopt.
+ * @vdo: The vdo preparing to grow.
+ * @old_size: The current size of the vdo.
+ * @new_size: The size to which the vdo will be grown.
+ *
+ * If a layout of the requested size has already been prepared, nothing is done. Otherwise any
+ * previously prepared layout is discarded and a new one is made. If making it fails, or the new
+ * size leaves no room for the relocated journal and summary, the partition copier is destroyed.
+ *
+ * Return: VDO_SUCCESS or an error code.
+ */
+static int grow_layout(struct vdo *vdo, block_count_t old_size, block_count_t new_size)
+{
+	block_count_t block_map_size, journal_size, summary_size, minimum_size;
+	int status;
+
+	/* A layout of exactly this size is already prepared. */
+	if (vdo->next_layout.size == new_size)
+		return VDO_SUCCESS;
+
+	/* Create the kcopyd client on first use. */
+	if (vdo->partition_copier == NULL) {
+		vdo->partition_copier = dm_kcopyd_client_create(NULL);
+		if (IS_ERR(vdo->partition_copier)) {
+			status = PTR_ERR(vdo->partition_copier);
+			vdo->partition_copier = NULL;
+			return status;
+		}
+	}
+
+	/* Throw away any earlier preparation which was never used. */
+	vdo_uninitialize_layout(&vdo->next_layout);
+
+	/*
+	 * The new layout keeps the current sizes of the block map, recovery journal, and slab
+	 * summary partitions; only the slab depot partition is sized differently.
+	 */
+	block_map_size = get_partition_size(&vdo->layout, VDO_BLOCK_MAP_PARTITION);
+	journal_size = get_partition_size(&vdo->layout, VDO_RECOVERY_JOURNAL_PARTITION);
+	summary_size = get_partition_size(&vdo->layout, VDO_SLAB_SUMMARY_PARTITION);
+	status = vdo_initialize_layout(new_size, vdo->layout.start, block_map_size,
+				       journal_size, summary_size, &vdo->next_layout);
+	if (status != VDO_SUCCESS) {
+		dm_kcopyd_client_destroy(vdo_forget(vdo->partition_copier));
+		return status;
+	}
+
+	/* The new journal and summary must fit entirely within the added blocks. */
+	minimum_size = (old_size +
+			get_partition_size(&vdo->next_layout,
+					   VDO_SLAB_SUMMARY_PARTITION) +
+			get_partition_size(&vdo->next_layout,
+					   VDO_RECOVERY_JOURNAL_PARTITION));
+	if (minimum_size <= new_size)
+		return VDO_SUCCESS;
+
+	/* Otherwise copying the journal and summary would overwrite some old metadata. */
+	vdo_uninitialize_layout(&vdo->next_layout);
+	dm_kcopyd_client_destroy(vdo_forget(vdo->partition_copier));
+	return VDO_INCREMENT_TOO_SMALL;
+}
+
+/*
+ * Prepare to grow the physical size of a vdo: run the admin operation which checks that growth
+ * is currently allowed, make the next layout, and prepare the slab depot to grow into the new
+ * slab depot partition.
+ */
+static int prepare_to_grow_physical(struct vdo *vdo, block_count_t new_physical_blocks)
+{
+	block_count_t current_physical_blocks = vdo->states.vdo.config.physical_blocks;
+	struct partition *depot_partition;
+	int status;
+
+	vdo_log_info("Preparing to resize physical to %llu",
+		     (unsigned long long) new_physical_blocks);
+	VDO_ASSERT_LOG_ONLY((new_physical_blocks > current_physical_blocks),
+			    "New physical size is larger than current physical size");
+
+	status = perform_admin_operation(vdo, PREPARE_GROW_PHYSICAL_PHASE_START,
+					 check_may_grow_physical,
+					 finish_operation_callback,
+					 "prepare grow-physical");
+	if (status != VDO_SUCCESS)
+		return status;
+
+	status = grow_layout(vdo, current_physical_blocks, new_physical_blocks);
+	if (status != VDO_SUCCESS)
+		return status;
+
+	depot_partition = vdo_get_known_partition(&vdo->next_layout,
+						  VDO_SLAB_DEPOT_PARTITION);
+	status = vdo_prepare_to_grow_slab_depot(vdo->depot, depot_partition);
+	if (status != VDO_SUCCESS) {
+		/* The prepared layout cannot be used, so discard it. */
+		vdo_uninitialize_layout(&vdo->next_layout);
+		return status;
+	}
+
+	vdo_log_info("Done preparing to resize physical");
+	return VDO_SUCCESS;
+}
+
+/**
+ * validate_new_device_config() - Check whether a new device config is an acceptable modification
+ *                                of an existing config.
+ * @to_validate: The new config to validate.
+ * @config: The existing config.
+ * @may_grow: Whether the physical size is currently permitted to grow; only the physical growth
+ *            check consults it.
+ * @error_ptr: A pointer to hold the reason for any error; left untouched on success.
+ *
+ * The checks are made in a fixed order and only the first failure is reported.
+ *
+ * Return: VDO_SUCCESS or an error.
+ */
+static int validate_new_device_config(struct device_config *to_validate,
+				      struct device_config *config, bool may_grow,
+				      char **error_ptr)
+{
+	char *reason = NULL;
+	int status = VDO_PARAMETER_MISMATCH;
+
+	if (to_validate->owning_target->begin != config->owning_target->begin) {
+		reason = "Starting sector cannot change";
+	} else if (to_validate->logical_block_size != config->logical_block_size) {
+		reason = "Logical block size cannot change";
+	} else if (to_validate->logical_blocks < config->logical_blocks) {
+		reason = "Can't shrink VDO logical size";
+	} else if (to_validate->cache_size != config->cache_size) {
+		reason = "Block map cache size cannot change";
+	} else if (to_validate->block_map_maximum_age != config->block_map_maximum_age) {
+		reason = "Block map maximum age cannot change";
+	} else if (memcmp(&to_validate->thread_counts, &config->thread_counts,
+			  sizeof(struct thread_count_config)) != 0) {
+		reason = "Thread configuration cannot change";
+	} else if (to_validate->physical_blocks < config->physical_blocks) {
+		/* Physical size problems are reported as unimplemented, not as mismatches. */
+		reason = "Removing physical storage from a VDO is not supported";
+		status = VDO_NOT_IMPLEMENTED;
+	} else if (!may_grow && (to_validate->physical_blocks > config->physical_blocks)) {
+		reason = "VDO physical size may not grow in current state";
+		status = VDO_NOT_IMPLEMENTED;
+	}
+
+	if (reason == NULL)
+		return VDO_SUCCESS;
+
+	*error_ptr = reason;
+	return status;
+}
+
+/*
+ * Validate a new config against the one a vdo is currently using, and prepare for any logical
+ * or physical growth the new config requests. Returns VDO_SUCCESS, a VDO status code, or
+ * -EINVAL.
+ */
+static int prepare_to_modify(struct dm_target *ti, struct device_config *config,
+			     struct vdo *vdo)
+{
+	struct device_config *current_config = vdo->device_config;
+	bool may_grow = (vdo_get_admin_state(vdo) != VDO_ADMIN_STATE_PRE_LOADED);
+	int status;
+
+	status = validate_new_device_config(config, current_config, may_grow, &ti->error);
+	if (status != VDO_SUCCESS)
+		return -EINVAL;
+
+	if (config->logical_blocks > current_config->logical_blocks) {
+		block_count_t logical_blocks = vdo->states.vdo.config.logical_blocks;
+
+		vdo_log_info("Preparing to resize logical to %llu",
+			     (unsigned long long) config->logical_blocks);
+		VDO_ASSERT_LOG_ONLY((config->logical_blocks > logical_blocks),
+				    "New logical size is larger than current size");
+
+		status = vdo_prepare_to_grow_block_map(vdo->block_map,
+						       config->logical_blocks);
+		if (status != VDO_SUCCESS) {
+			ti->error = "Device vdo_prepare_to_grow_logical failed";
+			return status;
+		}
+
+		vdo_log_info("Done preparing to resize logical");
+	}
+
+	if (config->physical_blocks > current_config->physical_blocks) {
+		status = prepare_to_grow_physical(vdo, config->physical_blocks);
+		if (status != VDO_SUCCESS) {
+			/*
+			 * Trap a parameter mismatch here; otherwise vdo_status_to_errno() would
+			 * remap it to -EIO, which is misleading and ahistorical.
+			 */
+			if (status == VDO_PARAMETER_MISMATCH)
+				status = -EINVAL;
+
+			ti->error = ((status == VDO_TOO_MANY_SLABS) ?
+				     "Device vdo_prepare_to_grow_physical failed (specified physical size too big based on formatted slab size)" :
+				     "Device vdo_prepare_to_grow_physical failed");
+			return status;
+		}
+	}
+
+	/* A change of backing device is only logged here. */
+	if (strcmp(config->parent_device_name, current_config->parent_device_name) != 0) {
+		vdo_log_info("Updating backing device of %s from %s to %s",
+			     vdo_get_device_name(config->owning_target),
+			     current_config->parent_device_name,
+			     config->parent_device_name);
+	}
+
+	return VDO_SUCCESS;
+}
+
+/*
+ * Parse a new table line for a vdo which already exists and, if the resulting config is an
+ * acceptable modification, attach that config to the target and the vdo.
+ */
+static int update_existing_vdo(const char *device_name, struct dm_target *ti,
+			       unsigned int argc, char **argv, struct vdo *vdo)
+{
+	struct device_config *new_config;
+	int status;
+
+	status = parse_device_config(argc, argv, ti, &new_config);
+	if (status != VDO_SUCCESS)
+		return -EINVAL;
+
+	vdo_log_info("preparing to modify device '%s'", device_name);
+	status = prepare_to_modify(ti, new_config, vdo);
+	if (status == VDO_SUCCESS) {
+		set_device_config(ti, vdo, new_config);
+		return VDO_SUCCESS;
+	}
+
+	/* The new config was rejected, so it is freed rather than attached. */
+	free_device_config(new_config);
+	return vdo_status_to_errno(status);
+}
+
+/*
+ * Device-mapper constructor. If a vdo with this target's device name already exists, the call
+ * is treated as a modification of that vdo; otherwise a new vdo is constructed.
+ */
+static int vdo_ctr(struct dm_target *ti, unsigned int argc, char **argv)
+{
+	struct registered_thread allocating_thread, instance_thread;
+	const char *device_name;
+	struct vdo *existing;
+	int status;
+
+	vdo_register_allocating_thread(&allocating_thread, NULL);
+
+	device_name = vdo_get_device_name(ti);
+	existing = vdo_find_matching(vdo_is_named, device_name);
+	if (existing != NULL) {
+		vdo_register_thread_device_id(&instance_thread, &existing->instance);
+		status = update_existing_vdo(device_name, ti, argc, argv, existing);
+		vdo_unregister_thread_device_id();
+	} else {
+		status = construct_new_vdo(ti, argc, argv);
+	}
+
+	vdo_unregister_allocating_thread();
+	return status;
+}
+
+/*
+ * Device-mapper destructor. Detaches the target's config from its vdo and frees the config. If
+ * no other config still references the vdo, the vdo is destroyed and its instance number is
+ * released.
+ */
+static void vdo_dtr(struct dm_target *ti)
+{
+	struct device_config *config = ti->private;
+	struct vdo *vdo = vdo_forget(config->vdo);
+
+	list_del_init(&config->config_list);
+	if (list_empty(&vdo->device_config_list)) {
+		/* No config references the vdo any longer, so tear it down. */
+		struct registered_thread allocating_thread, instance_thread;
+		unsigned int instance = vdo->instance;
+		const char *device_name;
+
+		vdo_register_thread_device_id(&instance_thread, &instance);
+		vdo_register_allocating_thread(&allocating_thread, NULL);
+
+		device_name = vdo_get_device_name(ti);
+		vdo_log_info("stopping device '%s'", device_name);
+		if (vdo->dump_on_shutdown)
+			vdo_dump_all(vdo, "device shutdown");
+
+		vdo_destroy(vdo_forget(vdo));
+		vdo_log_info("device '%s' stopped", device_name);
+
+		vdo_unregister_thread_device_id();
+		vdo_unregister_allocating_thread();
+		release_instance(instance);
+	} else if (config == vdo->device_config) {
+		/*
+		 * The vdo still points at the config being destroyed, so repoint it at the
+		 * first config remaining on its list.
+		 */
+		vdo->device_config = list_first_entry(&vdo->device_config_list,
+						      struct device_config, config_list);
+	}
+
+	free_device_config(config);
+	ti->private = NULL;
+}
+
+/*
+ * Record which kind of suspend the upcoming post-suspend should perform: a noflush suspend
+ * selects VDO_ADMIN_STATE_SUSPENDING and any other suspend selects VDO_ADMIN_STATE_SAVING.
+ */
+static void vdo_presuspend(struct dm_target *ti)
+{
+	struct vdo *vdo = get_vdo_for_target(ti);
+
+	if (dm_noflush_suspending(ti))
+		vdo->suspend_type = VDO_ADMIN_STATE_SUSPENDING;
+	else
+		vdo->suspend_type = VDO_ADMIN_STATE_SAVING;
+}
+
+/**
+ * write_super_block_for_suspend() - Update the VDO state and save the super block.
+ * @completion: The admin completion
+ *
+ * A dirty or new vdo is marked clean before the components are saved. A replaying or
+ * unrecognized state continues the completion with UDS_BAD_STATE and saves nothing.
+ */
+static void write_super_block_for_suspend(struct vdo_completion *completion)
+{
+	struct vdo *vdo = completion->vdo;
+
+	switch (vdo_get_state(vdo)) {
+	case VDO_DIRTY:
+	case VDO_NEW:
+		vdo_set_state(vdo, VDO_CLEAN);
+		fallthrough;
+
+	case VDO_CLEAN:
+	case VDO_READ_ONLY_MODE:
+	case VDO_FORCE_REBUILD:
+	case VDO_RECOVERING:
+	case VDO_REBUILD_FOR_UPGRADE:
+		vdo_save_components(vdo, completion);
+		return;
+
+	case VDO_REPLAYING:
+	default:
+		vdo_continue_completion(completion, UDS_BAD_STATE);
+		return;
+	}
+}
+
+/**
+ * suspend_callback() - Callback to initiate a suspend, registered in vdo_postsuspend().
+ * @completion: The sub-task completion.
+ *
+ * vdo_postsuspend() passes this function to perform_admin_operation() for both of its callback
+ * arguments. Each invocation handles the single phase reported by advance_phase(): most phases
+ * hand the completion to one component's drain function and return, while the phases that break
+ * out of the switch end the operation through finish_operation_callback(). The components are
+ * drained in the order packer, data_vio pool, hash zones, flusher, logical zones, block map,
+ * recovery journal, slab depot. The super block is written only when the admin state is not a
+ * suspending state and no error has been recorded on the completion.
+ */
+static void suspend_callback(struct vdo_completion *completion)
+{
+ struct vdo *vdo = completion->vdo;
+ struct admin_state *state = &vdo->admin.state;
+ int result;
+
+ assert_admin_phase_thread(vdo, __func__);
+
+ switch (advance_phase(vdo)) {
+ case SUSPEND_PHASE_START:
+ if (vdo_get_admin_state_code(state)->quiescent) {
+ /* Already suspended */
+ break;
+ }
+
+ vdo_continue_completion(completion,
+ vdo_start_operation(state, vdo->suspend_type));
+ return;
+
+ case SUSPEND_PHASE_PACKER:
+ /*
+ * If the VDO was already resumed from a prior suspend while read-only, some of the
+ * components may not have been resumed. By setting a read-only error here, we
+ * guarantee that the result of this suspend will be VDO_READ_ONLY and not
+ * VDO_INVALID_ADMIN_STATE in that case.
+ */
+ if (vdo_in_read_only_mode(vdo))
+ vdo_set_completion_result(completion, VDO_READ_ONLY);
+
+ vdo_drain_packer(vdo->packer, completion);
+ return;
+
+ case SUSPEND_PHASE_DATA_VIOS:
+ drain_data_vio_pool(vdo->data_vio_pool, completion);
+ return;
+
+ case SUSPEND_PHASE_DEDUPE:
+ vdo_drain_hash_zones(vdo->hash_zones, completion);
+ return;
+
+ case SUSPEND_PHASE_FLUSHES:
+ vdo_drain_flusher(vdo->flusher, completion);
+ return;
+
+ case SUSPEND_PHASE_LOGICAL_ZONES:
+ /*
+ * Attempt to flush all I/O before completing post suspend work. We believe a
+ * suspended device is expected to have persisted all data written before the
+ * suspend, even if it hasn't been flushed yet. A failed flush puts the vdo into
+ * read-only mode, but the logical zones are drained either way.
+ */
+ result = vdo_synchronous_flush(vdo);
+ if (result != VDO_SUCCESS)
+ vdo_enter_read_only_mode(vdo, result);
+
+ vdo_drain_logical_zones(vdo->logical_zones,
+ vdo_get_admin_state_code(state), completion);
+ return;
+
+ case SUSPEND_PHASE_BLOCK_MAP:
+ vdo_drain_block_map(vdo->block_map, vdo_get_admin_state_code(state),
+ completion);
+ return;
+
+ case SUSPEND_PHASE_JOURNAL:
+ vdo_drain_recovery_journal(vdo->recovery_journal,
+ vdo_get_admin_state_code(state), completion);
+ return;
+
+ case SUSPEND_PHASE_DEPOT:
+ vdo_drain_slab_depot(vdo->depot, vdo_get_admin_state_code(state),
+ completion);
+ return;
+
+ case SUSPEND_PHASE_READ_ONLY_WAIT:
+ vdo_wait_until_not_entering_read_only_mode(completion);
+ return;
+
+ case SUSPEND_PHASE_WRITE_SUPER_BLOCK:
+ if (vdo_is_state_suspending(state) || (completion->result != VDO_SUCCESS)) {
+ /* If we didn't save the VDO or there was an error, we're done. */
+ break;
+ }
+
+ write_super_block_for_suspend(completion);
+ return;
+
+ case SUSPEND_PHASE_END:
+ break;
+
+ default:
+ vdo_set_completion_result(completion, UDS_BAD_STATE);
+ }
+
+ finish_operation_callback(completion);
+}
+
+/*
+ * Device-mapper post-suspend hook: run the suspend admin operation and log how it turned out.
+ * Nothing is returned, so a failure here can only be logged.
+ */
+static void vdo_postsuspend(struct dm_target *ti)
+{
+	struct vdo *vdo = get_vdo_for_target(ti);
+	struct registered_thread instance_thread;
+	const char *device_name;
+	int status;
+
+	vdo_register_thread_device_id(&instance_thread, &vdo->instance);
+	device_name = vdo_get_device_name(vdo->device_config->owning_target);
+	vdo_log_info("suspending device '%s'", device_name);
+
+	/*
+	 * Note that an error from this operation does not stop device-mapper from suspending
+	 * the device; all of this work happens after the suspend.
+	 */
+	status = perform_admin_operation(vdo, SUSPEND_PHASE_START, suspend_callback,
+					 suspend_callback, "suspend");
+
+	if (status == VDO_INVALID_ADMIN_STATE) {
+		vdo_log_error("Suspend invoked while in unexpected state: %s",
+			      vdo_get_admin_state(vdo)->name);
+	} else if ((status != VDO_SUCCESS) && (status != VDO_READ_ONLY)) {
+		vdo_log_error_strerror(status, "Suspend of device '%s' failed",
+				       device_name);
+	} else {
+		/*
+		 * VDO_READ_ONLY counts as success, since a read-only suspension still leaves
+		 * the VDO suspended.
+		 */
+		vdo_log_info("device '%s' suspended", device_name);
+	}
+
+	vdo_unregister_thread_device_id();
+}
+
+/**
+ * was_new() - Report whether the vdo's recorded load state is VDO_NEW.
+ * @vdo: The vdo to query.
+ *
+ * Return: true if the vdo was new.
+ */
+static bool was_new(const struct vdo *vdo)
+{
+	bool loaded_as_new = (vdo->load_state == VDO_NEW);
+
+	return loaded_as_new;
+}
+
+/**
+ * requires_repair() - Check whether a vdo requires recovery or rebuild.
+ * @vdo: The vdo to query.
+ *
+ * The decision is made from the vdo's current state: dirty, force-rebuild, replaying, and
+ * rebuild-for-upgrade all call for repair.
+ *
+ * Return: true if the vdo must be repaired.
+ */
+static bool __must_check requires_repair(const struct vdo *vdo)
+{
+	bool repair_needed;
+
+	switch (vdo_get_state(vdo)) {
+	case VDO_DIRTY:
+	case VDO_FORCE_REBUILD:
+	case VDO_REPLAYING:
+	case VDO_REBUILD_FOR_UPGRADE:
+		repair_needed = true;
+		break;
+
+	default:
+		repair_needed = false;
+		break;
+	}
+
+	return repair_needed;
+}
+
+/**
+ * get_load_type() - Determine how the slab depot was loaded.
+ * @vdo: The vdo.
+ *
+ * A load state requiring a read-only rebuild takes precedence over one requiring recovery.
+ *
+ * Return: How the depot was loaded.
+ */
+static enum slab_depot_load_type get_load_type(struct vdo *vdo)
+{
+	enum slab_depot_load_type load_type = VDO_SLAB_DEPOT_NORMAL_LOAD;
+
+	if (vdo_state_requires_read_only_rebuild(vdo->load_state))
+		load_type = VDO_SLAB_DEPOT_REBUILD_LOAD;
+	else if (vdo_state_requires_recovery(vdo->load_state))
+		load_type = VDO_SLAB_DEPOT_RECOVERY_LOAD;
+
+	return load_type;
+}
+
+/**
+ * load_callback() - Callback to do the destructive parts of loading a VDO.
+ * @completion: The sub-task completion.
+ *
+ * Each invocation handles the single phase reported by advance_phase(). Phases which hand the
+ * completion to another function return immediately; phases which break out of the switch end
+ * the operation through finish_operation_callback().
+ *
+ * LOAD_PHASE_DRAIN_JOURNAL and LOAD_PHASE_WAIT_FOR_READ_ONLY are entered from
+ * handle_load_error(), which assigns vdo->admin.phase directly before calling back into this
+ * function.
+ */
+static void load_callback(struct vdo_completion *completion)
+{
+ struct vdo *vdo = completion->vdo;
+ int result;
+
+ assert_admin_phase_thread(vdo, __func__);
+
+ switch (advance_phase(vdo)) {
+ case LOAD_PHASE_START:
+ result = vdo_start_operation(&vdo->admin.state, VDO_ADMIN_STATE_LOADING);
+ if (result != VDO_SUCCESS) {
+ vdo_continue_completion(completion, result);
+ return;
+ }
+
+ /* Prepare the recovery journal for new entries. */
+ vdo_open_recovery_journal(vdo->recovery_journal, vdo->depot,
+ vdo->block_map);
+ vdo_allow_read_only_mode_entry(completion);
+ return;
+
+ case LOAD_PHASE_LOAD_DEPOT:
+ vdo_set_dedupe_state_normal(vdo->hash_zones);
+ if (vdo_is_read_only(vdo)) {
+ /*
+ * In read-only mode we don't use the allocator and it may not even be
+ * readable, so don't bother trying to load it.
+ */
+ vdo_set_completion_result(completion, VDO_READ_ONLY);
+ break;
+ }
+
+ if (requires_repair(vdo)) {
+ vdo_repair(completion);
+ return;
+ }
+
+ vdo_load_slab_depot(vdo->depot,
+ (was_new(vdo) ? VDO_ADMIN_STATE_FORMATTING :
+ VDO_ADMIN_STATE_LOADING),
+ completion, NULL);
+ return;
+
+ case LOAD_PHASE_MAKE_DIRTY:
+ vdo_set_state(vdo, VDO_DIRTY);
+ vdo_save_components(vdo, completion);
+ return;
+
+ case LOAD_PHASE_PREPARE_TO_ALLOCATE:
+ vdo_initialize_block_map_from_journal(vdo->block_map,
+ vdo->recovery_journal);
+ vdo_prepare_slab_depot_to_allocate(vdo->depot, get_load_type(vdo),
+ completion);
+ return;
+
+ case LOAD_PHASE_SCRUB_SLABS:
+ if (vdo_state_requires_recovery(vdo->load_state))
+ vdo_enter_recovery_mode(vdo);
+
+ vdo_scrub_all_unrecovered_slabs(vdo->depot, completion);
+ return;
+
+ case LOAD_PHASE_DATA_REDUCTION:
+ WRITE_ONCE(vdo->compressing, vdo->device_config->compression);
+ if (vdo->device_config->deduplication) {
+ /*
+ * Don't try to load or rebuild the index first (and log scary error
+ * messages) if this is known to be a newly-formatted volume.
+ */
+ vdo_start_dedupe_index(vdo->hash_zones, was_new(vdo));
+ }
+
+ vdo->allocations_allowed = false;
+ fallthrough;
+
+ case LOAD_PHASE_FINISHED:
+ break;
+
+ case LOAD_PHASE_DRAIN_JOURNAL:
+ vdo_drain_recovery_journal(vdo->recovery_journal, VDO_ADMIN_STATE_SAVING,
+ completion);
+ return;
+
+ case LOAD_PHASE_WAIT_FOR_READ_ONLY:
+ /*
+ * Avoid an infinite loop: clear the error handler and set the phase so that the
+ * next invocation handles LOAD_PHASE_FINISHED.
+ */
+ completion->error_handler = NULL;
+ vdo->admin.phase = LOAD_PHASE_FINISHED;
+ vdo_wait_until_not_entering_read_only_mode(completion);
+ return;
+
+ default:
+ vdo_set_completion_result(completion, UDS_BAD_STATE);
+ }
+
+ finish_operation_callback(completion);
+}
+
+/**
+ * handle_load_error() - Handle an error during the load operation.
+ * @completion: The admin completion.
+ *
+ * Whenever possible the vdo is brought online in read-only mode. The exception is a load state
+ * which requires a read-only rebuild with the admin phase at LOAD_PHASE_MAKE_DIRTY: in that case
+ * the load is aborted by redirecting it to the journal drain phase. This handler is registered
+ * in vdo_preresume_registered().
+ */
+static void handle_load_error(struct vdo_completion *completion)
+{
+	struct vdo *vdo = completion->vdo;
+	bool abort_load;
+
+	/* Error handling must run on the admin thread; requeue if we are elsewhere. */
+	if (vdo_requeue_completion_if_needed(completion,
+					     vdo->thread_config.admin_thread))
+		return;
+
+	abort_load = (vdo_state_requires_read_only_rebuild(vdo->load_state) &&
+		      (vdo->admin.phase == LOAD_PHASE_MAKE_DIRTY));
+	if (abort_load) {
+		vdo_log_error_strerror(completion->result, "aborting load");
+		vdo->admin.phase = LOAD_PHASE_DRAIN_JOURNAL;
+		load_callback(vdo_forget(completion));
+		return;
+	}
+
+	vdo_log_error_strerror(completion->result,
+			       "Entering read-only mode due to load error");
+	vdo->admin.phase = LOAD_PHASE_WAIT_FOR_READ_ONLY;
+	vdo_enter_read_only_mode(vdo, completion->result);
+
+	/* From here on the load reports VDO_READ_ONLY in place of the original error. */
+	completion->result = VDO_READ_ONLY;
+	load_callback(completion);
+}
+
+/**
+ * write_super_block_for_resume() - Update the VDO state and save the super block.
+ * @completion: The admin completion
+ *
+ * Only a clean or new vdo is marked dirty and saved. The other recognized states launch the
+ * completion without writing anything, and a replaying or unrecognized state continues the
+ * completion with UDS_BAD_STATE.
+ */
+static void write_super_block_for_resume(struct vdo_completion *completion)
+{
+	struct vdo *vdo = completion->vdo;
+
+	switch (vdo_get_state(vdo)) {
+	case VDO_CLEAN:
+	case VDO_NEW:
+		vdo_set_state(vdo, VDO_DIRTY);
+		vdo_save_components(vdo, completion);
+		break;
+
+	case VDO_DIRTY:
+	case VDO_READ_ONLY_MODE:
+	case VDO_FORCE_REBUILD:
+	case VDO_RECOVERING:
+	case VDO_REBUILD_FOR_UPGRADE:
+		/* The super block does not need to be rewritten in these states. */
+		vdo_launch_completion(completion);
+		break;
+
+	case VDO_REPLAYING:
+	default:
+		vdo_continue_completion(completion, UDS_BAD_STATE);
+		break;
+	}
+}
+
+/**
+ * resume_callback() - Callback to resume a VDO.
+ * @completion: The admin completion.
+ *
+ * Each invocation handles the single phase reported by advance_phase(). After the resuming
+ * operation has been started and the super block handled by write_super_block_for_resume(), the
+ * completion is passed in turn to vdo_allow_read_only_mode_entry() and then to the resume
+ * functions for the hash zones, slab depot, recovery journal, block map, logical zones, packer,
+ * flusher, and data_vio pool. The phases which break out of the switch end the operation through
+ * finish_operation_callback().
+ */
+static void resume_callback(struct vdo_completion *completion)
+{
+ struct vdo *vdo = completion->vdo;
+ int result;
+
+ assert_admin_phase_thread(vdo, __func__);
+
+ switch (advance_phase(vdo)) {
+ case RESUME_PHASE_START:
+ result = vdo_start_operation(&vdo->admin.state,
+ VDO_ADMIN_STATE_RESUMING);
+ if (result != VDO_SUCCESS) {
+ vdo_continue_completion(completion, result);
+ return;
+ }
+
+ write_super_block_for_resume(completion);
+ return;
+
+ case RESUME_PHASE_ALLOW_READ_ONLY_MODE:
+ vdo_allow_read_only_mode_entry(completion);
+ return;
+
+ case RESUME_PHASE_DEDUPE:
+ vdo_resume_hash_zones(vdo->hash_zones, completion);
+ return;
+
+ case RESUME_PHASE_DEPOT:
+ vdo_resume_slab_depot(vdo->depot, completion);
+ return;
+
+ case RESUME_PHASE_JOURNAL:
+ vdo_resume_recovery_journal(vdo->recovery_journal, completion);
+ return;
+
+ case RESUME_PHASE_BLOCK_MAP:
+ vdo_resume_block_map(vdo->block_map, completion);
+ return;
+
+ case RESUME_PHASE_LOGICAL_ZONES:
+ vdo_resume_logical_zones(vdo->logical_zones, completion);
+ return;
+
+ case RESUME_PHASE_PACKER:
+ {
+ bool was_enabled = vdo_get_compressing(vdo);
+ bool enable = vdo->device_config->compression; /* the setting in the current config */
+
+ if (enable != was_enabled)
+ WRITE_ONCE(vdo->compressing, enable);
+ vdo_log_info("compression is %s", (enable ? "enabled" : "disabled"));
+
+ vdo_resume_packer(vdo->packer, completion);
+ return;
+ }
+
+ case RESUME_PHASE_FLUSHER:
+ vdo_resume_flusher(vdo->flusher, completion);
+ return;
+
+ case RESUME_PHASE_DATA_VIOS:
+ resume_data_vio_pool(vdo->data_vio_pool, completion);
+ return;
+
+ case RESUME_PHASE_END:
+ break;
+
+ default:
+ vdo_set_completion_result(completion, UDS_BAD_STATE);
+ }
+
+ finish_operation_callback(completion);
+}
+
+/**
+ * grow_logical_callback() - Callback to initiate a grow logical.
+ * @completion: The admin completion.
+ *
+ * Registered in perform_grow_logical().
+ *
+ * Each invocation handles the single phase reported by advance_phase(). The start phase refuses
+ * to grow a read-only vdo; otherwise it records the block map's next_entry_count as the logical
+ * block count in the in-memory config and saves the components. The next phase grows the block
+ * map itself. GROW_LOGICAL_PHASE_ERROR is entered from handle_logical_growth_error(), which
+ * assigns vdo->admin.phase directly, and puts the vdo into read-only mode.
+ */
+static void grow_logical_callback(struct vdo_completion *completion)
+{
+ struct vdo *vdo = completion->vdo;
+ int result;
+
+ assert_admin_phase_thread(vdo, __func__);
+
+ switch (advance_phase(vdo)) {
+ case GROW_LOGICAL_PHASE_START:
+ if (vdo_is_read_only(vdo)) {
+ vdo_log_error_strerror(VDO_READ_ONLY,
+ "Can't grow logical size of a read-only VDO");
+ vdo_set_completion_result(completion, VDO_READ_ONLY);
+ break;
+ }
+
+ result = vdo_start_operation(&vdo->admin.state,
+ VDO_ADMIN_STATE_SUSPENDED_OPERATION);
+ if (result != VDO_SUCCESS) {
+ vdo_continue_completion(completion, result);
+ return;
+ }
+
+ vdo->states.vdo.config.logical_blocks = vdo->block_map->next_entry_count;
+ vdo_save_components(vdo, completion);
+ return;
+
+ case GROW_LOGICAL_PHASE_GROW_BLOCK_MAP:
+ vdo_grow_block_map(vdo->block_map, completion);
+ return;
+
+ case GROW_LOGICAL_PHASE_END:
+ break;
+
+ case GROW_LOGICAL_PHASE_ERROR:
+ vdo_enter_read_only_mode(vdo, completion->result);
+ break;
+
+ default:
+ vdo_set_completion_result(completion, UDS_BAD_STATE);
+ }
+
+ finish_operation_callback(completion);
+}
+
+/**
+ * handle_logical_growth_error() - Handle an error during the grow logical process.
+ * @completion: The admin completion.
+ *
+ * Passed to perform_admin_operation() by perform_grow_logical(). Redirects the operation to
+ * GROW_LOGICAL_PHASE_ERROR after undoing the in-memory size change when that applies.
+ */
+static void handle_logical_growth_error(struct vdo_completion *completion)
+{
+	struct vdo *vdo = completion->vdo;
+	bool super_block_write_failed =
+		(vdo->admin.phase == GROW_LOGICAL_PHASE_GROW_BLOCK_MAP);
+
+	if (super_block_write_failed) {
+		/*
+		 * The new size never made it into the super block, so restore the old size in
+		 * the in-memory config and give up on the prepared growth.
+		 */
+		vdo->states.vdo.config.logical_blocks = vdo->block_map->entry_count;
+		vdo_abandon_block_map_growth(vdo->block_map);
+	}
+
+	vdo->admin.phase = GROW_LOGICAL_PHASE_ERROR;
+	grow_logical_callback(completion);
+}
+
+/**
+ * perform_grow_logical() - Grow the logical size of the vdo.
+ * @vdo: The vdo to grow.
+ * @new_logical_blocks: The size to which the vdo should be grown.
+ *
+ * Context: This method may only be called when the vdo has been suspended and must not be called
+ * from a base thread.
+ *
+ * Return: VDO_SUCCESS or an error; VDO_PARAMETER_MISMATCH if the block map was not prepared to
+ *         grow to exactly the requested size.
+ */
+static int perform_grow_logical(struct vdo *vdo, block_count_t new_logical_blocks)
+{
+	int status;
+
+	if (vdo->device_config->logical_blocks == new_logical_blocks) {
+		/*
+		 * Growth was prepared for some loaded table, but the table being resumed does
+		 * not include that growth, so drop the preparation.
+		 */
+		vdo_abandon_block_map_growth(vdo->block_map);
+		return VDO_SUCCESS;
+	}
+
+	vdo_log_info("Resizing logical to %llu",
+		     (unsigned long long) new_logical_blocks);
+	if (vdo->block_map->next_entry_count != new_logical_blocks)
+		return VDO_PARAMETER_MISMATCH;
+
+	status = perform_admin_operation(vdo, GROW_LOGICAL_PHASE_START,
+					 grow_logical_callback,
+					 handle_logical_growth_error, "grow logical");
+	if (status == VDO_SUCCESS) {
+		vdo_log_info("Logical blocks now %llu",
+			     (unsigned long long) new_logical_blocks);
+	}
+
+	return status;
+}
+
+/*
+ * Completion callback passed to dm_kcopyd_copy() by copy_partition(). Any read or write error is
+ * reported to the waiting completion as -EIO; otherwise the completion continues with
+ * VDO_SUCCESS.
+ */
+static void copy_callback(int read_err, unsigned long write_err, void *context)
+{
+	struct vdo_completion *parent = context;
+	bool failed = (read_err != 0) || (write_err != 0);
+
+	vdo_continue_completion(parent, failed ? -EIO : VDO_SUCCESS);
+}
+
+/*
+ * Describe a partition as a dm_io_region on the vdo's backing device. The partition offset is
+ * reduced by the geometry's bio_offset and block quantities are scaled to sectors.
+ */
+static void partition_to_region(struct partition *partition, struct vdo *vdo,
+				struct dm_io_region *region)
+{
+	physical_block_number_t first_block = partition->offset - vdo->geometry.bio_offset;
+	struct dm_io_region translated = {
+		.bdev = vdo_get_backing_device(vdo),
+		.sector = first_block * VDO_SECTORS_PER_BLOCK,
+		.count = partition->count * VDO_SECTORS_PER_BLOCK,
+	};
+
+	*region = translated;
+}
+
+/**
+ * copy_partition() - Copy a partition from the location specified in the current layout to that in
+ *                    the next layout.
+ * @vdo: The vdo preparing to grow.
+ * @id: The ID of the partition to copy.
+ * @parent: The completion to notify when the copy is complete.
+ *
+ * The copy is submitted through the vdo's partition_copier; copy_callback() continues @parent
+ * with the outcome.
+ */
+static void copy_partition(struct vdo *vdo, enum partition_id id,
+			   struct vdo_completion *parent)
+{
+	struct partition *source_partition = vdo_get_known_partition(&vdo->layout, id);
+	struct partition *target_partition = vdo_get_known_partition(&vdo->next_layout, id);
+	struct dm_io_region source;
+	struct dm_io_region destination;
+
+	partition_to_region(source_partition, vdo, &source);
+	partition_to_region(target_partition, vdo, &destination);
+
+	/* One destination region, no kcopyd flags. */
+	dm_kcopyd_copy(vdo->partition_copier, &source, 1, &destination, 0,
+		       copy_callback, parent);
+}
+
+/**
+ * grow_physical_callback() - Callback to initiate a grow physical.
+ * @completion: The admin completion.
+ *
+ * Registered in perform_grow_physical().
+ *
+ * Dispatches on the phase returned by advance_phase(). Phases which start further work hand
+ * @completion to that work and return. Every path that leaves the switch uninitializes
+ * next_layout and then calls finish_operation_callback().
+ */
+static void grow_physical_callback(struct vdo_completion *completion)
+{
+ struct vdo *vdo = completion->vdo;
+ int result;
+
+ assert_admin_phase_thread(vdo, __func__);
+
+ switch (advance_phase(vdo)) {
+ case GROW_PHYSICAL_PHASE_START:
+ /* A read-only vdo is not grown; record the error and finish the operation. */
+ if (vdo_is_read_only(vdo)) {
+ vdo_log_error_strerror(VDO_READ_ONLY,
+ "Can't grow physical size of a read-only VDO");
+ vdo_set_completion_result(completion, VDO_READ_ONLY);
+ break;
+ }
+
+ result = vdo_start_operation(&vdo->admin.state,
+ VDO_ADMIN_STATE_SUSPENDED_OPERATION);
+ if (result != VDO_SUCCESS) {
+ vdo_continue_completion(completion, result);
+ return;
+ }
+
+ /* Copy the journal into the new layout. */
+ copy_partition(vdo, VDO_RECOVERY_JOURNAL_PARTITION, completion);
+ return;
+
+ case GROW_PHYSICAL_PHASE_COPY_SUMMARY:
+ /* Copy the slab summary into the new layout. */
+ copy_partition(vdo, VDO_SLAB_SUMMARY_PARTITION, completion);
+ return;
+
+ case GROW_PHYSICAL_PHASE_UPDATE_COMPONENTS:
+ /*
+ * Replace the current layout with the next one, then save the components with the
+ * new physical size. NOTE(review): vdo_forget() on next_layout.head presumably
+ * detaches it so a later vdo_uninitialize_layout(&vdo->next_layout) does not release
+ * what vdo->layout now owns; confirm against vdo_forget().
+ */
+ vdo_uninitialize_layout(&vdo->layout);
+ vdo->layout = vdo->next_layout;
+ vdo_forget(vdo->next_layout.head);
+ vdo->states.vdo.config.physical_blocks = vdo->layout.size;
+ vdo_update_slab_depot_size(vdo->depot);
+ vdo_save_components(vdo, completion);
+ return;
+
+ case GROW_PHYSICAL_PHASE_USE_NEW_SLABS:
+ vdo_use_new_slabs(vdo->depot, completion);
+ return;
+
+ case GROW_PHYSICAL_PHASE_END:
+ /* Point the depot and the journal at their partitions in the (now current) layout. */
+ vdo->depot->summary_origin =
+ vdo_get_known_partition(&vdo->layout,
+ VDO_SLAB_SUMMARY_PARTITION)->offset;
+ vdo->recovery_journal->origin =
+ vdo_get_known_partition(&vdo->layout,
+ VDO_RECOVERY_JOURNAL_PARTITION)->offset;
+ break;
+
+ case GROW_PHYSICAL_PHASE_ERROR:
+ /* Entered via handle_physical_growth_error(), which sets this phase explicitly. */
+ vdo_enter_read_only_mode(vdo, completion->result);
+ break;
+
+ default:
+ vdo_set_completion_result(completion, UDS_BAD_STATE);
+ }
+
+ vdo_uninitialize_layout(&vdo->next_layout);
+ finish_operation_callback(completion);
+}
+
+/**
+ * handle_physical_growth_error() - Handle an error during the grow physical process.
+ * @completion: The sub-task completion.
+ *
+ * Forces the admin phase to GROW_PHYSICAL_PHASE_ERROR and re-enters grow_physical_callback().
+ */
+static void handle_physical_growth_error(struct vdo_completion *completion)
+{
+	struct vdo *vdo = completion->vdo;
+
+	vdo->admin.phase = GROW_PHYSICAL_PHASE_ERROR;
+	grow_physical_callback(completion);
+}
+
+/**
+ * perform_grow_physical() - Grow the physical size of the vdo.
+ * @vdo: The vdo to resize.
+ * @new_physical_blocks: The new physical size in blocks.
+ *
+ * Context: This method may only be called when the vdo has been suspended and must not be called
+ *          from a base thread.
+ *
+ * Return: VDO_SUCCESS or an error; VDO_PARAMETER_MISMATCH if the vdo was not prepared to grow to
+ *         @new_physical_blocks.
+ */
+static int perform_grow_physical(struct vdo *vdo, block_count_t new_physical_blocks)
+{
+	const block_count_t old_physical_blocks = vdo->states.vdo.config.physical_blocks;
+	block_count_t expected_depot_size;
+	block_count_t prepared_depot_size = 0;
+	int status;
+
+	/* Nothing to do when the size is not changing. */
+	if (new_physical_blocks == old_physical_blocks)
+		return VDO_SUCCESS;
+
+	if (vdo->next_layout.size != new_physical_blocks) {
+		/*
+		 * Either the VDO isn't prepared to grow, or it was prepared to grow to a different
+		 * size. Doing this check here relies on the fact that the call to this method is
+		 * done under the dmsetup message lock.
+		 */
+		vdo_uninitialize_layout(&vdo->next_layout);
+		vdo_abandon_new_slabs(vdo->depot);
+		return VDO_PARAMETER_MISMATCH;
+	}
+
+	/* The prepared slabs must match the depot partition of the next layout. */
+	expected_depot_size =
+		vdo_get_known_partition(&vdo->next_layout, VDO_SLAB_DEPOT_PARTITION)->count;
+	if (vdo->depot->new_slabs != NULL)
+		prepared_depot_size = vdo->depot->new_size;
+
+	if (expected_depot_size != prepared_depot_size)
+		return VDO_PARAMETER_MISMATCH;
+
+	status = perform_admin_operation(vdo, GROW_PHYSICAL_PHASE_START,
+					 grow_physical_callback,
+					 handle_physical_growth_error, "grow physical");
+	if (status != VDO_SUCCESS)
+		return status;
+
+	vdo_log_info("Physical block count was %llu, now %llu",
+		     (unsigned long long) old_physical_blocks,
+		     (unsigned long long) new_physical_blocks);
+	return VDO_SUCCESS;
+}
+
+/**
+ * apply_new_vdo_configuration() - Attempt to make any configuration changes from the table being
+ *                                 resumed.
+ * @vdo: The vdo being resumed.
+ * @config: The new device configuration derived from the table with which the vdo is being
+ *          resumed.
+ *
+ * The logical size is handled first; the physical size is only attempted if that succeeds.
+ *
+ * Return: VDO_SUCCESS or an error.
+ */
+static int __must_check apply_new_vdo_configuration(struct vdo *vdo,
+						    struct device_config *config)
+{
+	int status = perform_grow_logical(vdo, config->logical_blocks);
+
+	if (status != VDO_SUCCESS) {
+		vdo_log_error("grow logical operation failed, result = %d", status);
+		return status;
+	}
+
+	status = perform_grow_physical(vdo, config->physical_blocks);
+	if (status == VDO_SUCCESS)
+		return VDO_SUCCESS;
+
+	vdo_log_error("resize operation failed, result = %d", status);
+	return status;
+}
+
+/**
+ * vdo_preresume_registered() - Load the vdo if it has only been pre-loaded, apply any size changes
+ *                              from the table being resumed, and resume the vdo.
+ * @ti: The target being resumed; its private field holds the new device_config.
+ * @vdo: The vdo to resume.
+ *
+ * Called from vdo_preresume() after the calling thread has been registered.
+ *
+ * Return: VDO_SUCCESS, -EINVAL if the backing device is smaller than the configured physical
+ *         size, or another error.
+ */
+static int vdo_preresume_registered(struct dm_target *ti, struct vdo *vdo)
+{
+ struct device_config *config = ti->private;
+ const char *device_name = vdo_get_device_name(ti);
+ block_count_t backing_blocks;
+ int result;
+
+ backing_blocks = get_underlying_device_block_count(vdo);
+ if (backing_blocks < config->physical_blocks) {
+ /* FIXME: can this still happen? */
+ vdo_log_error("resume of device '%s' failed: backing device has %llu blocks but VDO physical size is %llu blocks",
+ device_name, (unsigned long long) backing_blocks,
+ (unsigned long long) config->physical_blocks);
+ return -EINVAL;
+ }
+
+ /* A vdo which has only been pre-loaded must be fully loaded before it can be resumed. */
+ if (vdo_get_admin_state(vdo) == VDO_ADMIN_STATE_PRE_LOADED) {
+ vdo_log_info("starting device '%s'", device_name);
+ result = perform_admin_operation(vdo, LOAD_PHASE_START, load_callback,
+ handle_load_error, "load");
+ if ((result != VDO_SUCCESS) && (result != VDO_READ_ONLY)) {
+ /*
+ * Something has gone very wrong. Make sure everything has drained and
+ * leave the device in an unresumable state.
+ */
+ vdo_log_error_strerror(result,
+ "Start failed, could not load VDO metadata");
+ vdo->suspend_type = VDO_ADMIN_STATE_STOPPING;
+ /* The result of this suspend is not checked; the load error is returned. */
+ perform_admin_operation(vdo, SUSPEND_PHASE_START,
+ suspend_callback, suspend_callback,
+ "suspend");
+ return result;
+ }
+
+ /* Even if the VDO is read-only, it is now able to handle read requests. */
+ vdo_log_info("device '%s' started", device_name);
+ }
+
+ vdo_log_info("resuming device '%s'", device_name);
+
+ /* If this fails, the VDO was not in a state to be resumed. This should never happen. */
+ result = apply_new_vdo_configuration(vdo, config);
+ BUG_ON(result == VDO_INVALID_ADMIN_STATE);
+
+ /*
+ * Now that we've tried to modify the vdo, the new config *is* the config, whether the
+ * modifications worked or not.
+ */
+ vdo->device_config = config;
+
+ /*
+ * Any error here is highly unexpected and the state of the vdo is questionable, so we mark
+ * it read-only in memory. Because we are suspended, the read-only state will not be
+ * written to disk.
+ */
+ if (result != VDO_SUCCESS) {
+ vdo_log_error_strerror(result,
+ "Commit of modifications to device '%s' failed",
+ device_name);
+ vdo_enter_read_only_mode(vdo, result);
+ return result;
+ }
+
+ if (vdo_get_admin_state(vdo)->normal) {
+ /* The VDO was just started, so we don't need to resume it. */
+ return VDO_SUCCESS;
+ }
+
+ /* The same callback serves as both the action and the error handler for the resume. */
+ result = perform_admin_operation(vdo, RESUME_PHASE_START, resume_callback,
+ resume_callback, "resume");
+ BUG_ON(result == VDO_INVALID_ADMIN_STATE);
+ if (result == VDO_READ_ONLY) {
+ /* Even if the vdo is read-only, it has still resumed. */
+ result = VDO_SUCCESS;
+ }
+
+ if (result != VDO_SUCCESS)
+ vdo_log_error("resume of device '%s' failed with error: %d", device_name,
+ result);
+
+ return result;
+}
+
+/*
+ * The preresume hook of the vdo target. Registers the calling thread against the vdo's instance
+ * for the duration of the work, reports parameter mismatches and invalid admin states as -EINVAL,
+ * and converts the final status with vdo_status_to_errno().
+ */
+static int vdo_preresume(struct dm_target *ti)
+{
+	struct vdo *vdo = get_vdo_for_target(ti);
+	struct registered_thread instance_thread;
+	int result;
+
+	vdo_register_thread_device_id(&instance_thread, &vdo->instance);
+	result = vdo_preresume_registered(ti, vdo);
+	switch (result) {
+	case VDO_PARAMETER_MISMATCH:
+	case VDO_INVALID_ADMIN_STATE:
+		result = -EINVAL;
+		break;
+
+	default:
+		break;
+	}
+
+	vdo_unregister_thread_device_id();
+	return vdo_status_to_errno(result);
+}
+
+/*
+ * The resume hook of the vdo target. It only logs that the device has resumed, with the calling
+ * thread registered against the vdo's instance while it does so.
+ */
+static void vdo_resume(struct dm_target *ti)
+{
+	struct registered_thread instance_thread;
+	struct vdo *vdo = get_vdo_for_target(ti);
+
+	vdo_register_thread_device_id(&instance_thread, &vdo->instance);
+	vdo_log_info("device '%s' resumed", vdo_get_device_name(ti));
+	vdo_unregister_thread_device_id();
+}
+
+/*
+ * The device-mapper target type for "vdo". It is registered in vdo_init() and unregistered in
+ * vdo_module_destroy().
+ *
+ * If anything changes that affects how user tools will interact with vdo, update the version
+ * number and make sure documentation about the change is complete so tools can properly update
+ * their management code.
+ */
+static struct target_type vdo_target_bio = {
+ .features = DM_TARGET_SINGLETON,
+ .name = "vdo",
+ .version = { 9, 0, 0 },
+ .module = THIS_MODULE,
+ .ctr = vdo_ctr,
+ .dtr = vdo_dtr,
+ .io_hints = vdo_io_hints,
+ .iterate_devices = vdo_iterate_devices,
+ .map = vdo_map_bio,
+ .message = vdo_message,
+ .status = vdo_status,
+ .presuspend = vdo_presuspend,
+ .postsuspend = vdo_postsuspend,
+ .preresume = vdo_preresume,
+ .resume = vdo_resume,
+};
+
+/* Set by vdo_init() once dm_register_target() succeeds; checked by vdo_module_destroy(). */
+static bool dm_registered;
+
+/*
+ * Undo module initialization: unregister the target if it was registered, then release and reset
+ * the instance tracker. Used by both the vdo_init() failure paths and vdo_exit().
+ */
+static void vdo_module_destroy(void)
+{
+ vdo_log_debug("unloading");
+
+ /* vdo_init() may fail before the target is registered. */
+ if (dm_registered)
+ dm_unregister_target(&vdo_target_bio);
+
+ VDO_ASSERT_LOG_ONLY(instances.count == 0,
+ "should have no instance numbers still in use, but have %u",
+ instances.count);
+ vdo_free(instances.words);
+ memset(&instances, 0, sizeof(struct instance_tracker));
+}
+
+/*
+ * Module entry point: set up the module-wide state, register the VDO status codes, and register
+ * the device-mapper target. On failure, vdo_module_destroy() is called before returning.
+ *
+ * NOTE(review): on a vdo_register_status_codes() failure the VDO status code is returned as-is.
+ * Module init functions are expected to return 0 or a negative errno; confirm that this status can
+ * never be a positive value, or convert it (e.g. with vdo_status_to_errno()).
+ */
+static int __init vdo_init(void)
+{
+ int result = 0;
+
+ /* Memory tracking must be initialized first for accurate accounting. */
+ vdo_memory_init();
+ vdo_initialize_threads_mutex();
+ vdo_initialize_thread_device_registry();
+ vdo_initialize_device_registry_once();
+
+ /* Add VDO errors to the set of errors registered by the indexer. */
+ result = vdo_register_status_codes();
+ if (result != VDO_SUCCESS) {
+ vdo_log_error("vdo_register_status_codes failed %d", result);
+ vdo_module_destroy();
+ return result;
+ }
+
+ result = dm_register_target(&vdo_target_bio);
+ if (result < 0) {
+ vdo_log_error("dm_register_target failed %d", result);
+ vdo_module_destroy();
+ return result;
+ }
+ /* Let vdo_module_destroy() know that there is a target to unregister. */
+ dm_registered = true;
+
+ return result;
+}
+
+/* Module exit point: tear down everything vdo_init() set up, then stop memory tracking. */
+static void __exit vdo_exit(void)
+{
+ vdo_module_destroy();
+ /* Memory tracking cleanup must be done last. */
+ vdo_memory_exit();
+}
+
+/* Module entry and exit hooks. */
+module_init(vdo_init);
+module_exit(vdo_exit);
+
+/* Expose vdo_log_level as the "log_level" module parameter (mode 0644). */
+module_param_named(log_level, vdo_log_level, uint, 0644);
+MODULE_PARM_DESC(log_level, "Log level for log messages");
+
+MODULE_DESCRIPTION(DM_NAME " target for transparent deduplication");
+MODULE_AUTHOR("Red Hat, Inc.");
+MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-vdo/dump.c b/drivers/md/dm-vdo/dump.c
new file mode 100644
index 000000000000..00e575d7d773
--- /dev/null
+++ b/drivers/md/dm-vdo/dump.c
@@ -0,0 +1,275 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#include "dump.h"
+
+#include <linux/module.h>
+
+#include "memory-alloc.h"
+#include "string-utils.h"
+
+#include "constants.h"
+#include "data-vio.h"
+#include "dedupe.h"
+#include "funnel-workqueue.h"
+#include "io-submitter.h"
+#include "logger.h"
+#include "types.h"
+#include "vdo.h"
+
+/* Bit positions of the individual dump options; enum dump_option_flags holds the bit masks. */
+enum dump_options {
+ /* Work queues */
+ SHOW_QUEUES,
+ /* Memory pools */
+ SHOW_VIO_POOL,
+ /* Others */
+ SHOW_VDO_STATUS,
+ /* This one means an option overrides the "default" choices, instead of altering them. */
+ SKIP_DEFAULT
+};
+
+/* Bit masks for the dump options, one bit per value of enum dump_options. */
+enum dump_option_flags {
+ /* Work queues */
+ FLAG_SHOW_QUEUES = (1 << SHOW_QUEUES),
+ /* Memory pools */
+ FLAG_SHOW_VIO_POOL = (1 << SHOW_VIO_POOL),
+ /* Others */
+ FLAG_SHOW_VDO_STATUS = (1 << SHOW_VDO_STATUS),
+ /* Special */
+ FLAG_SKIP_DEFAULT = (1 << SKIP_DEFAULT)
+};
+
+/* Every memory pool option. */
+#define FLAGS_ALL_POOLS (FLAG_SHOW_VIO_POOL)
+/* The options used when no requested option sets FLAG_SKIP_DEFAULT. */
+#define DEFAULT_DUMP_FLAGS (FLAG_SHOW_QUEUES | FLAG_SHOW_VDO_STATUS)
+/*
+ * Used to size the static buffers in dump_data_vio(): the number of decimal digits needed for a
+ * u64. Each byte contributes log10(256) = 2.408+ digits, rounded up.
+ */
+#define DIGITS_PER_U64 (1 + sizeof(u64) * 2409 / 1000)
+
+/*
+ * Check whether an argument names the given option. The comparison ignores case and only covers
+ * the length of @this_option, so an argument which merely begins with the option name matches.
+ */
+static inline bool is_arg_string(const char *arg, const char *this_option)
+{
+	size_t option_length = strlen(this_option);
+
+	/* convention seems to be case-independent options */
+	return strncasecmp(arg, this_option, option_length) == 0;
+}
+
+/*
+ * Log a dump of the vdo. The request summary, hash zones, data_vio pool, and memory usage are
+ * always logged; work queues and the vdo status are logged only when their flags are set, and
+ * FLAG_SHOW_VIO_POOL is passed on to dump_data_vio_pool().
+ */
+static void do_dump(struct vdo *vdo, unsigned int dump_options_requested,
+		    const char *why)
+{
+	const bool show_queues = (dump_options_requested & FLAG_SHOW_QUEUES) != 0;
+	const bool show_vio_pool = (dump_options_requested & FLAG_SHOW_VIO_POOL) != 0;
+	const bool show_status = (dump_options_requested & FLAG_SHOW_VDO_STATUS) != 0;
+	u32 active_requests, maximum_requests;
+	s64 outstanding_bios;
+
+	vdo_log_info("%s dump triggered via %s", VDO_LOGGING_MODULE_NAME, why);
+	active_requests = get_data_vio_pool_active_requests(vdo->data_vio_pool);
+	maximum_requests = get_data_vio_pool_maximum_requests(vdo->data_vio_pool);
+	outstanding_bios = (atomic64_read(&vdo->stats.bios_submitted) -
+			    atomic64_read(&vdo->stats.bios_completed));
+	vdo_log_info("%u device requests outstanding (max %u), %lld bio requests outstanding, device '%s'",
+		     active_requests, maximum_requests, outstanding_bios,
+		     vdo_get_device_name(vdo->device_config->owning_target));
+
+	/* The thread array may not exist yet, in which case there are no queues to dump. */
+	if (show_queues && (vdo->threads != NULL)) {
+		thread_id_t thread;
+
+		for (thread = 0; thread < vdo->thread_config.thread_count; thread++)
+			vdo_dump_work_queue(vdo->threads[thread].queue);
+	}
+
+	vdo_dump_hash_zones(vdo->hash_zones);
+	dump_data_vio_pool(vdo->data_vio_pool, show_vio_pool);
+	if (show_status)
+		vdo_dump_status(vdo);
+
+	vdo_report_memory_usage();
+	vdo_log_info("end of %s dump", VDO_LOGGING_MODULE_NAME);
+}
+
+/*
+ * Translate dump option names into a set of dump flags. The first argument is not examined. Every
+ * unknown name is logged; if any was seen, -EINVAL is returned and the output is left untouched.
+ * The default flags are added unless some option set FLAG_SKIP_DEFAULT.
+ */
+static int parse_dump_options(unsigned int argc, char *const *argv,
+			      unsigned int *dump_options_requested_ptr)
+{
+	static const struct {
+		const char *name;
+		unsigned int flags;
+	} option_names[] = {
+		{ "viopool", FLAG_SKIP_DEFAULT | FLAG_SHOW_VIO_POOL },
+		{ "vdo", FLAG_SKIP_DEFAULT | FLAG_SHOW_VDO_STATUS },
+		{ "pools", FLAG_SKIP_DEFAULT | FLAGS_ALL_POOLS },
+		{ "queues", FLAG_SKIP_DEFAULT | FLAG_SHOW_QUEUES },
+		{ "threads", FLAG_SKIP_DEFAULT | FLAG_SHOW_QUEUES },
+		{ "default", FLAG_SKIP_DEFAULT | DEFAULT_DUMP_FLAGS },
+		{ "all", ~0 },
+	};
+	unsigned int requested = 0;
+	unsigned int unknown_count = 0;
+	unsigned int arg;
+
+	for (arg = 1; arg < argc; arg++) {
+		bool matched = false;
+		unsigned int entry;
+
+		/* The first table entry which matches wins. */
+		for (entry = 0; !matched && (entry < ARRAY_SIZE(option_names)); entry++) {
+			if (is_arg_string(argv[arg], option_names[entry].name)) {
+				requested |= option_names[entry].flags;
+				matched = true;
+			}
+		}
+
+		if (!matched) {
+			vdo_log_warning("dump option name '%s' unknown", argv[arg]);
+			unknown_count++;
+		}
+	}
+
+	if (unknown_count > 0)
+		return -EINVAL;
+
+	if ((requested & FLAG_SKIP_DEFAULT) == 0)
+		requested |= DEFAULT_DUMP_FLAGS;
+
+	*dump_options_requested_ptr = requested;
+	return 0;
+}
+
+/*
+ * Dump as specified by zero or more string arguments. Nothing is dumped if the arguments cannot
+ * be parsed; the parse result is returned in either case.
+ */
+int vdo_dump(struct vdo *vdo, unsigned int argc, char *const *argv, const char *why)
+{
+	unsigned int dump_options_requested = 0;
+	int result;
+
+	result = parse_dump_options(argc, argv, &dump_options_requested);
+	if (result == 0)
+		do_dump(vdo, dump_options_requested, why);
+
+	return result;
+}
+
+/* Dump everything we know how to dump, by requesting every option flag. */
+void vdo_dump_all(struct vdo *vdo, const char *why)
+{
+ do_dump(vdo, ~0, why);
+}
+
+/*
+ * Dump out the data_vio waiters on a waitq.
+ *
+ * @waitq: The wait queue whose waiters should be logged; nothing is logged if it has no waiters.
+ * @wait_on: The label to print for the queue (e.g. "lbn"). It is only read, so it is
+ *           const-qualified and callers may pass string literals.
+ */
+static void dump_vio_waiters(struct vdo_wait_queue *waitq, const char *wait_on)
+{
+	struct vdo_waiter *waiter, *first = vdo_waitq_get_first_waiter(waitq);
+	struct data_vio *data_vio;
+
+	if (first == NULL)
+		return;
+
+	data_vio = vdo_waiter_as_data_vio(first);
+
+	vdo_log_info(" %s is locked. Waited on by: vio %px pbn %llu lbn %llu d-pbn %llu lastOp %s",
+		     wait_on, data_vio, data_vio->allocation.pbn, data_vio->logical.lbn,
+		     data_vio->duplicate.pbn, get_data_vio_operation_name(data_vio));
+
+	/* The waiters are linked in a ring, so stop once the walk returns to the first one. */
+	for (waiter = first->next_waiter; waiter != first; waiter = waiter->next_waiter) {
+		data_vio = vdo_waiter_as_data_vio(waiter);
+		vdo_log_info(" ... and : vio %px pbn %llu lbn %llu d-pbn %llu lastOp %s",
+			     data_vio, data_vio->allocation.pbn, data_vio->logical.lbn,
+			     data_vio->duplicate.pbn,
+			     get_data_vio_operation_name(data_vio));
+	}
+}
+
+/*
+ * Encode various attributes of a data_vio as a string of one-character flags. This encoding is for
+ * logging brevity:
+ *
+ * R => vio completion result not VDO_SUCCESS
+ * W => vio is on a waitq
+ * D => vio is a duplicate
+ * p => vio is a partial block operation
+ * z => vio is a zero block
+ * d => vio is a discard
+ *
+ * The common case of no flags set will result in an empty, null-terminated buffer. If any flags
+ * are encoded, the first character in the string will be a space character. The buffer holds the
+ * worst case exactly: the space, all six flags, and the terminator.
+ */
+static void encode_vio_dump_flags(struct data_vio *data_vio, char buffer[8])
+{
+	size_t used = 0;
+
+	buffer[used++] = ' ';
+	if (data_vio->vio.completion.result != VDO_SUCCESS)
+		buffer[used++] = 'R';
+	if (data_vio->waiter.next_waiter != NULL)
+		buffer[used++] = 'W';
+	if (data_vio->is_duplicate)
+		buffer[used++] = 'D';
+	if (data_vio->is_partial)
+		buffer[used++] = 'p';
+	if (data_vio->is_zero)
+		buffer[used++] = 'z';
+	if (data_vio->remaining_discard > 0)
+		buffer[used++] = 'd';
+
+	/* Only the leading blank was written, so there are no flags: emit an empty string. */
+	if (used == 1)
+		used = 0;
+
+	buffer[used] = '\0';
+}
+
+/*
+ * Implements buffer_dump_function.
+ *
+ * Logs one brief line describing a data_vio, followed by any waiters on its logical block.
+ */
+void dump_data_vio(void *data)
+{
+	/*
+	 * The completion buffer just needs to be big enough to hold a queue (thread) name and a
+	 * function name (plus a separator character and NUL). The latter is limited only by taste.
+	 *
+	 * In making these static, we're assuming only one "dump" will run at a time. If more than
+	 * one does run, the log output will be garbled anyway.
+	 */
+	static char completion_text[100 + MAX_VDO_WORK_QUEUE_NAME_LEN];
+	static char block_number_text[sizeof("P L D") + 3 * DIGITS_PER_U64];
+	static char flush_generation_text[sizeof(" FG") + DIGITS_PER_U64];
+	static char flags_text[8];
+	struct data_vio *data_vio = data;
+
+	/*
+	 * We're likely to be logging a couple thousand of these lines, and in some circumstances
+	 * syslogd may have trouble keeping up, so keep it BRIEF rather than user-friendly.
+	 */
+	vdo_dump_completion_to_buffer(&data_vio->vio.completion, completion_text,
+				      sizeof(completion_text));
+
+	/* Only print the block numbers which are meaningful for this data_vio. */
+	if (data_vio->is_duplicate) {
+		snprintf(block_number_text, sizeof(block_number_text), "P%llu L%llu D%llu",
+			 data_vio->allocation.pbn, data_vio->logical.lbn,
+			 data_vio->duplicate.pbn);
+	} else if (data_vio_has_allocation(data_vio)) {
+		snprintf(block_number_text, sizeof(block_number_text), "P%llu L%llu",
+			 data_vio->allocation.pbn, data_vio->logical.lbn);
+	} else {
+		snprintf(block_number_text, sizeof(block_number_text), "L%llu",
+			 data_vio->logical.lbn);
+	}
+
+	/* A zero flush generation is not printed at all. */
+	if (data_vio->flush_generation == 0) {
+		flush_generation_text[0] = 0;
+	} else {
+		snprintf(flush_generation_text, sizeof(flush_generation_text), " FG%llu",
+			 data_vio->flush_generation);
+	}
+
+	encode_vio_dump_flags(data_vio, flags_text);
+
+	vdo_log_info(" vio %px %s%s %s %s%s", data_vio,
+		     block_number_text,
+		     flush_generation_text,
+		     get_data_vio_operation_name(data_vio),
+		     completion_text,
+		     flags_text);
+	/*
+	 * might want info on: wantUDSAnswer / operation / status
+	 * might want info on: bio / bios_merged
+	 */
+
+	dump_vio_waiters(&data_vio->logical.waiters, "lbn");
+
+	/* might want to dump more info from vio here */
+}
diff --git a/drivers/md/dm-vdo/dump.h b/drivers/md/dm-vdo/dump.h
new file mode 100644
index 000000000000..ad47c70cca78
--- /dev/null
+++ b/drivers/md/dm-vdo/dump.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#ifndef VDO_DUMP_H
+#define VDO_DUMP_H
+
+#include "types.h"
+
+/* Dump as specified by the option names in argv; returns 0 or an error if an option is unknown. */
+int vdo_dump(struct vdo *vdo, unsigned int argc, char *const *argv, const char *why);
+
+/* Dump everything that can be dumped; why is a description of what triggered the dump. */
+void vdo_dump_all(struct vdo *vdo, const char *why);
+
+/* Log a single data_vio; data must point to a struct data_vio. */
+void dump_data_vio(void *data);
+
+#endif /* VDO_DUMP_H */
diff --git a/drivers/md/dm-vdo/encodings.c b/drivers/md/dm-vdo/encodings.c
new file mode 100644
index 000000000000..a34ea0229d53
--- /dev/null
+++ b/drivers/md/dm-vdo/encodings.c
@@ -0,0 +1,1483 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#include "encodings.h"
+
+#include <linux/log2.h>
+
+#include "logger.h"
+#include "memory-alloc.h"
+#include "permassert.h"
+
+#include "constants.h"
+#include "status-codes.h"
+#include "types.h"
+
+/** The maximum logical space is 4 petabytes, which is 1 terablock (2^40 blocks). */
+static const block_count_t MAXIMUM_VDO_LOGICAL_BLOCKS = 1024ULL * 1024 * 1024 * 1024;
+
+/** The maximum physical space is 256 terabytes, which is 64 gigablocks (2^36 blocks). */
+static const block_count_t MAXIMUM_VDO_PHYSICAL_BLOCKS = 1024ULL * 1024 * 1024 * 64;
+
+/*
+ * The fixed fields of an encoded geometry block: the magic number, the packed header, and a
+ * checksum. Its size is part of the expected size in the geometry block headers.
+ */
+struct geometry_block {
+ char magic_number[VDO_GEOMETRY_MAGIC_NUMBER_SIZE];
+ struct packed_header header;
+ u32 checksum;
+} __packed;
+
+/* The expected header of a version 5.0 geometry block. */
+static const struct header GEOMETRY_BLOCK_HEADER_5_0 = {
+ .id = VDO_GEOMETRY_BLOCK,
+ .version = {
+ .major_version = 5,
+ .minor_version = 0,
+ },
+ /*
+ * Note: this size isn't just the payload size following the header, like it is everywhere
+ * else in VDO.
+ */
+ .size = sizeof(struct geometry_block) + sizeof(struct volume_geometry),
+};
+
+/* The expected header of a version 4.0 geometry block. */
+static const struct header GEOMETRY_BLOCK_HEADER_4_0 = {
+ .id = VDO_GEOMETRY_BLOCK,
+ .version = {
+ .major_version = 4,
+ .minor_version = 0,
+ },
+ /*
+ * Note: this size isn't just the payload size following the header, like it is everywhere
+ * else in VDO.
+ */
+ .size = sizeof(struct geometry_block) + sizeof(struct volume_geometry_4_0),
+};
+
+/* The magic number at the start of a geometry block; the array has one extra byte for the NUL. */
+const u8 VDO_GEOMETRY_MAGIC_NUMBER[VDO_GEOMETRY_MAGIC_NUMBER_SIZE + 1] = "dmvdo001";
+
+/* The size of struct block_map_page_header, enforced in vdo_validate_block_map_page(). */
+#define PAGE_HEADER_4_1_SIZE (8 + 8 + 8 + 1 + 1 + 1 + 1)
+
+/* The version written to, and required of, block map pages. */
+static const struct version_number BLOCK_MAP_4_1 = {
+ .major_version = 4,
+ .minor_version = 1,
+};
+
+/* The expected header of the encoded block map state, version 2.0. */
+const struct header VDO_BLOCK_MAP_HEADER_2_0 = {
+ .id = VDO_BLOCK_MAP,
+ .version = {
+ .major_version = 2,
+ .minor_version = 0,
+ },
+ .size = sizeof(struct block_map_state_2_0),
+};
+
+/* The expected header of the encoded recovery journal state, version 7.0. */
+const struct header VDO_RECOVERY_JOURNAL_HEADER_7_0 = {
+ .id = VDO_RECOVERY_JOURNAL,
+ .version = {
+ .major_version = 7,
+ .minor_version = 0,
+ },
+ .size = sizeof(struct recovery_journal_state_7_0),
+};
+
+/* The expected header of the encoded slab depot state, version 2.0. */
+const struct header VDO_SLAB_DEPOT_HEADER_2_0 = {
+ .id = VDO_SLAB_DEPOT,
+ .version = {
+ .major_version = 2,
+ .minor_version = 0,
+ },
+ .size = sizeof(struct slab_depot_state_2_0),
+};
+
+/* The expected header of the encoded layout, version 3.0, sized for VDO_PARTITION_COUNT partitions. */
+static const struct header VDO_LAYOUT_HEADER_3_0 = {
+ .id = VDO_LAYOUT,
+ .version = {
+ .major_version = 3,
+ .minor_version = 0,
+ },
+ .size = sizeof(struct layout_3_0) + (sizeof(struct partition_3_0) * VDO_PARTITION_COUNT),
+};
+
+/* The IDs of the partitions which are required (presumably checked when a layout is decoded). */
+static const enum partition_id REQUIRED_PARTITIONS[] = {
+ VDO_BLOCK_MAP_PARTITION,
+ VDO_SLAB_DEPOT_PARTITION,
+ VDO_RECOVERY_JOURNAL_PARTITION,
+ VDO_SLAB_SUMMARY_PARTITION,
+};
+
+/*
+ * The current version for the data encoded in the super block. This must be changed any time there
+ * is a change to encoding of the component data of any VDO component.
+ */
+static const struct version_number VDO_COMPONENT_DATA_41_0 = {
+ .major_version = 41,
+ .minor_version = 0,
+};
+
+/* The volume version, 67.0. */
+const struct version_number VDO_VOLUME_VERSION_67_0 = {
+ .major_version = 67,
+ .minor_version = 0,
+};
+
+/* The expected header of a version 12.0 super block. */
+static const struct header SUPER_BLOCK_HEADER_12_0 = {
+ .id = VDO_SUPER_BLOCK,
+ .version = {
+ .major_version = 12,
+ .minor_version = 0,
+ },
+
+ /* This is the minimum size, if the super block contains no components. */
+ .size = VDO_SUPER_BLOCK_FIXED_SIZE - VDO_ENCODED_HEADER_SIZE,
+};
+
+/**
+ * validate_version() - Check whether a version matches an expected version.
+ * @expected_version: The expected version.
+ * @actual_version: The version being validated.
+ * @component_name: The name of the component or the calling function (for error logging).
+ *
+ * Logs an error describing a mismatch.
+ *
+ * Return: VDO_SUCCESS if the versions are the same,
+ *         VDO_UNSUPPORTED_VERSION if the versions don't match.
+ */
+static int __must_check validate_version(struct version_number expected_version,
+					 struct version_number actual_version,
+					 const char *component_name)
+{
+	if (vdo_are_same_version(expected_version, actual_version))
+		return VDO_SUCCESS;
+
+	return vdo_log_error_strerror(VDO_UNSUPPORTED_VERSION,
+				      "%s version mismatch, expected %d.%d, got %d.%d",
+				      component_name,
+				      expected_version.major_version,
+				      expected_version.minor_version,
+				      actual_version.major_version,
+				      actual_version.minor_version);
+}
+
+/**
+ * vdo_validate_header() - Check whether a header matches expectations.
+ * @expected_header: The expected header.
+ * @actual_header: The header being validated.
+ * @exact_size: If true, the size fields of the two headers must be the same, otherwise it is
+ *              required that actual_header.size >= expected_header.size.
+ * @name: The name of the component or the calling function (for error logging).
+ *
+ * The ID is checked first, then the version, then the size. Logs an error describing the first
+ * mismatch found.
+ *
+ * Return: VDO_SUCCESS if the header meets expectations,
+ *         VDO_INCORRECT_COMPONENT if the component ids don't match,
+ *         VDO_UNSUPPORTED_VERSION if the versions or sizes don't match.
+ */
+int vdo_validate_header(const struct header *expected_header,
+			const struct header *actual_header, bool exact_size,
+			const char *name)
+{
+	bool size_ok;
+	int result;
+
+	if (actual_header->id != expected_header->id) {
+		return vdo_log_error_strerror(VDO_INCORRECT_COMPONENT,
+					      "%s ID mismatch, expected %d, got %d",
+					      name, expected_header->id,
+					      actual_header->id);
+	}
+
+	result = validate_version(expected_header->version, actual_header->version,
+				  name);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	if (exact_size)
+		size_ok = (actual_header->size == expected_header->size);
+	else
+		size_ok = (actual_header->size >= expected_header->size);
+
+	if (size_ok)
+		return VDO_SUCCESS;
+
+	return vdo_log_error_strerror(VDO_UNSUPPORTED_VERSION,
+				      "%s size mismatch, expected %zu, got %zu",
+				      name, expected_header->size,
+				      actual_header->size);
+}
+
+/* Pack a version number into the buffer at *offset and advance *offset past it. */
+static void encode_version_number(u8 *buffer, size_t *offset,
+				  struct version_number version)
+{
+	struct packed_version_number packed = vdo_pack_version_number(version);
+	u8 *destination = buffer + *offset;
+
+	memcpy(destination, &packed, sizeof(packed));
+	*offset += sizeof(packed);
+}
+
+/* Pack a header into the buffer at *offset and advance *offset past it. */
+void vdo_encode_header(u8 *buffer, size_t *offset, const struct header *header)
+{
+	struct packed_header packed = vdo_pack_header(header);
+	u8 *destination = buffer + *offset;
+
+	memcpy(destination, &packed, sizeof(packed));
+	*offset += sizeof(packed);
+}
+
+/* Unpack a version number from the buffer at *offset and advance *offset past it. */
+static void decode_version_number(u8 *buffer, size_t *offset,
+				  struct version_number *version)
+{
+	struct packed_version_number packed;
+	const u8 *source = buffer + *offset;
+
+	memcpy(&packed, source, sizeof(packed));
+	*offset += sizeof(packed);
+	*version = vdo_unpack_version_number(packed);
+}
+
+/* Unpack a header from the buffer at *offset and advance *offset past it. */
+void vdo_decode_header(u8 *buffer, size_t *offset, struct header *header)
+{
+	struct packed_header packed;
+	const u8 *source = buffer + *offset;
+
+	memcpy(&packed, source, sizeof(packed));
+	*offset += sizeof(packed);
+
+	*header = vdo_unpack_header(&packed);
+}
+
+/**
+ * decode_volume_geometry() - Decode the on-disk representation of a volume geometry from a buffer.
+ * @buffer: A buffer to decode from.
+ * @offset: The offset in the buffer at which to decode; advanced past the decoded fields.
+ * @geometry: The structure to receive the decoded fields.
+ * @version: The geometry block version to decode.
+ */
+static void decode_volume_geometry(u8 *buffer, size_t *offset,
+ struct volume_geometry *geometry, u32 version)
+{
+ u32 unused, mem;
+ enum volume_region_id id;
+ nonce_t nonce;
+ block_count_t bio_offset = 0;
+ bool sparse;
+
+ /* This is for backwards compatibility. */
+ decode_u32_le(buffer, offset, &unused);
+ geometry->unused = unused;
+
+ decode_u64_le(buffer, offset, &nonce);
+ geometry->nonce = nonce;
+
+ memcpy((unsigned char *) &geometry->uuid, buffer + *offset, sizeof(uuid_t));
+ *offset += sizeof(uuid_t);
+
+ /* Only versions after 4 encode a bio_offset; for older versions it stays 0. */
+ if (version > 4)
+ decode_u64_le(buffer, offset, &bio_offset);
+ geometry->bio_offset = bio_offset;
+
+ for (id = 0; id < VDO_VOLUME_REGION_COUNT; id++) {
+ physical_block_number_t start_block;
+ enum volume_region_id saved_id;
+
+ decode_u32_le(buffer, offset, &saved_id);
+ decode_u64_le(buffer, offset, &start_block);
+
+ geometry->regions[id] = (struct volume_region) {
+ .id = saved_id,
+ .start_block = start_block,
+ };
+ }
+
+ decode_u32_le(buffer, offset, &mem);
+ /* Skip four bytes without decoding them (presumably a no-longer-used field; confirm). */
+ *offset += sizeof(u32);
+ sparse = buffer[(*offset)++];
+
+ geometry->index_config = (struct index_config) {
+ .mem = mem,
+ .sparse = sparse,
+ };
+}
+
+/**
+ * vdo_parse_geometry_block() - Decode and validate an encoded geometry block.
+ * @block: The encoded geometry block.
+ * @geometry: The structure to receive the decoded fields.
+ *
+ * Return: VDO_SUCCESS, VDO_BAD_MAGIC if the block does not start with the geometry magic number,
+ *         VDO_CHECKSUM_MISMATCH if the stored checksum does not match, or another error if the
+ *         header is not valid.
+ */
+int __must_check vdo_parse_geometry_block(u8 *block, struct volume_geometry *geometry)
+{
+	const struct header *expected_header;
+	struct header header;
+	u32 computed_checksum, stored_checksum;
+	size_t offset = VDO_GEOMETRY_MAGIC_NUMBER_SIZE;
+	int result;
+
+	if (memcmp(block, VDO_GEOMETRY_MAGIC_NUMBER, VDO_GEOMETRY_MAGIC_NUMBER_SIZE) != 0)
+		return VDO_BAD_MAGIC;
+
+	/* The header follows the magic number; its major version selects the expected layout. */
+	vdo_decode_header(block, &offset, &header);
+	expected_header = ((header.version.major_version <= 4) ?
+			   &GEOMETRY_BLOCK_HEADER_4_0 : &GEOMETRY_BLOCK_HEADER_5_0);
+	result = vdo_validate_header(expected_header, &header, true, __func__);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	decode_volume_geometry(block, &offset, geometry, header.version.major_version);
+
+	result = VDO_ASSERT(header.size == offset + sizeof(u32),
+			    "should have decoded up to the geometry checksum");
+	if (result != VDO_SUCCESS)
+		return result;
+
+	/* The checksum covers everything which precedes it in the block. */
+	computed_checksum = vdo_crc32(block, offset);
+	decode_u32_le(block, &offset, &stored_checksum);
+	if (computed_checksum != stored_checksum)
+		return VDO_CHECKSUM_MISMATCH;
+
+	return VDO_SUCCESS;
+}
+
+/**
+ * vdo_format_block_map_page() - Format a buffer as an empty block map page.
+ * @buffer: The memory to format; VDO_BLOCK_SIZE bytes of it are overwritten.
+ * @nonce: The nonce to record in the page header.
+ * @pbn: The physical block number to record in the page header.
+ * @initialized: The value to record in the header's initialized flag.
+ *
+ * Return: @buffer, viewed as a block map page.
+ */
+struct block_map_page *vdo_format_block_map_page(void *buffer, nonce_t nonce,
+						 physical_block_number_t pbn,
+						 bool initialized)
+{
+	struct block_map_page *formatted = buffer;
+
+	/* Clear the whole block first so that everything not set below is zero. */
+	memset(formatted, 0, VDO_BLOCK_SIZE);
+
+	formatted->header.initialized = initialized;
+	formatted->header.pbn = __cpu_to_le64(pbn);
+	formatted->header.nonce = __cpu_to_le64(nonce);
+	formatted->version = vdo_pack_version_number(BLOCK_MAP_4_1);
+
+	return formatted;
+}
+
+/**
+ * vdo_validate_block_map_page() - Classify a block map page read from storage.
+ * @page: The page to check.
+ * @nonce: The nonce the page is expected to carry.
+ * @pbn: The physical block number the page is expected to record.
+ *
+ * Return: VDO_BLOCK_MAP_PAGE_INVALID if the version is not 4.1, the page is not marked
+ *         initialized, or the nonce differs; VDO_BLOCK_MAP_PAGE_BAD if only the recorded pbn
+ *         differs; VDO_BLOCK_MAP_PAGE_VALID otherwise.
+ */
+enum block_map_page_validity vdo_validate_block_map_page(struct block_map_page *page,
+							 nonce_t nonce,
+							 physical_block_number_t pbn)
+{
+	BUILD_BUG_ON(sizeof(struct block_map_page_header) != PAGE_HEADER_4_1_SIZE);
+
+	if (!vdo_are_same_version(BLOCK_MAP_4_1, vdo_unpack_version_number(page->version)))
+		return VDO_BLOCK_MAP_PAGE_INVALID;
+
+	if (!page->header.initialized)
+		return VDO_BLOCK_MAP_PAGE_INVALID;
+
+	if (__le64_to_cpu(page->header.nonce) != nonce)
+		return VDO_BLOCK_MAP_PAGE_INVALID;
+
+	/* The page looks like one of ours; make sure it is the one that was asked for. */
+	if (vdo_get_block_map_page_pbn(page) != pbn)
+		return VDO_BLOCK_MAP_PAGE_BAD;
+
+	return VDO_BLOCK_MAP_PAGE_VALID;
+}
+
+/**
+ * decode_block_map_state_2_0() - Decode block map component state version 2.0 from a buffer.
+ * @buffer: The buffer containing the encoded state.
+ * @offset: The position in @buffer to decode from; advanced past the bytes consumed.
+ * @state: Receives the decoded state; written only when decoding succeeds.
+ *
+ * Return: VDO_SUCCESS or an error code.
+ */
+static int decode_block_map_state_2_0(u8 *buffer, size_t *offset,
+				      struct block_map_state_2_0 *state)
+{
+	size_t initial_offset;
+	block_count_t flat_page_count, root_count;
+	physical_block_number_t flat_page_origin, root_origin;
+	struct header header;
+	int result;
+
+	vdo_decode_header(buffer, offset, &header);
+	result = vdo_validate_header(&VDO_BLOCK_MAP_HEADER_2_0, &header, true, __func__);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	initial_offset = *offset;
+
+	/*
+	 * Report the value that was just decoded. *state is an output which has not been
+	 * written at this point, so it must not be read for the message.
+	 */
+	decode_u64_le(buffer, offset, &flat_page_origin);
+	result = VDO_ASSERT(flat_page_origin == VDO_BLOCK_MAP_FLAT_PAGE_ORIGIN,
+			    "Flat page origin must be %u (recorded as %llu)",
+			    VDO_BLOCK_MAP_FLAT_PAGE_ORIGIN,
+			    (unsigned long long) flat_page_origin);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	decode_u64_le(buffer, offset, &flat_page_count);
+	result = VDO_ASSERT(flat_page_count == 0,
+			    "Flat page count must be 0 (recorded as %llu)",
+			    (unsigned long long) flat_page_count);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	decode_u64_le(buffer, offset, &root_origin);
+	decode_u64_le(buffer, offset, &root_count);
+
+	result = VDO_ASSERT(VDO_BLOCK_MAP_HEADER_2_0.size == *offset - initial_offset,
+			    "decoded block map component size must match header size");
+	if (result != VDO_SUCCESS)
+		return result;
+
+	*state = (struct block_map_state_2_0) {
+		.flat_page_origin = flat_page_origin,
+		.flat_page_count = flat_page_count,
+		.root_origin = root_origin,
+		.root_count = root_count,
+	};
+
+	return VDO_SUCCESS;
+}
+
+/**
+ * encode_block_map_state_2_0() - Encode block map component state version 2.0 into a buffer.
+ * @buffer: The buffer to encode into.
+ * @offset: The position in @buffer to encode at; advanced past the bytes written.
+ * @state: The state to encode.
+ */
+static void encode_block_map_state_2_0(u8 *buffer, size_t *offset,
+				       struct block_map_state_2_0 state)
+{
+	/* The payload fields, in the order they are written after the header. */
+	const u64 fields[] = {
+		state.flat_page_origin,
+		state.flat_page_count,
+		state.root_origin,
+		state.root_count,
+	};
+	size_t initial_offset;
+	size_t i;
+
+	vdo_encode_header(buffer, offset, &VDO_BLOCK_MAP_HEADER_2_0);
+
+	initial_offset = *offset;
+	for (i = 0; i < sizeof(fields) / sizeof(fields[0]); i++)
+		encode_u64_le(buffer, offset, fields[i]);
+
+	VDO_ASSERT_LOG_ONLY(VDO_BLOCK_MAP_HEADER_2_0.size == *offset - initial_offset,
+			    "encoded block map component size must match header size");
+}
+
+/**
+ * vdo_compute_new_forest_pages() - Compute the number of pages which must be allocated at each
+ *                                  level in order to grow the forest to a new number of entries.
+ * @root_count: The number of block map roots; the leaf pages are divided among them.
+ * @old_sizes: The current per-level sizes of the forest, or NULL if there is no current forest.
+ * @entries: The new number of entries the block map must address.
+ * @new_sizes: Receives the per-level sizes of the resized forest.
+ *
+ * Return: The total number of non-leaf pages required.
+ */
+block_count_t vdo_compute_new_forest_pages(root_count_t root_count,
+					   struct boundary *old_sizes,
+					   block_count_t entries,
+					   struct boundary *new_sizes)
+{
+	page_count_t leaf_pages = max(vdo_compute_block_map_page_count(entries), 1U);
+	page_count_t level_size = DIV_ROUND_UP(leaf_pages, root_count);
+	block_count_t total_pages = 0;
+	height_t height;
+
+	for (height = 0; height < VDO_BLOCK_MAP_TREE_HEIGHT; height++) {
+		/* Pages a single tree already has at this level; none when starting fresh. */
+		block_count_t existing = 0;
+
+		if (old_sizes != NULL)
+			existing = old_sizes->levels[height];
+
+		/* Each level needs one page per VDO_BLOCK_MAP_ENTRIES_PER_PAGE pages below it. */
+		level_size = DIV_ROUND_UP(level_size, VDO_BLOCK_MAP_ENTRIES_PER_PAGE);
+		new_sizes->levels[height] = level_size;
+
+		total_pages += (((block_count_t) level_size - existing) * root_count);
+	}
+
+	return total_pages;
+}
+
+/**
+ * encode_recovery_journal_state_7_0() - Encode the state of a recovery journal.
+ * @buffer: The buffer to encode into.
+ * @offset: The position in @buffer to encode at; advanced past the bytes written.
+ * @state: The recovery journal state to encode.
+ */
+static void encode_recovery_journal_state_7_0(u8 *buffer, size_t *offset,
+					      struct recovery_journal_state_7_0 state)
+{
+	/* The payload fields, in the order they are written after the header. */
+	const u64 fields[] = {
+		state.journal_start,
+		state.logical_blocks_used,
+		state.block_map_data_blocks,
+	};
+	size_t initial_offset;
+	size_t i;
+
+	vdo_encode_header(buffer, offset, &VDO_RECOVERY_JOURNAL_HEADER_7_0);
+
+	initial_offset = *offset;
+	for (i = 0; i < sizeof(fields) / sizeof(fields[0]); i++)
+		encode_u64_le(buffer, offset, fields[i]);
+
+	VDO_ASSERT_LOG_ONLY(VDO_RECOVERY_JOURNAL_HEADER_7_0.size == *offset - initial_offset,
+			    "encoded recovery journal component size must match header size");
+}
+
+/**
+ * decode_recovery_journal_state_7_0() - Decode the state of a recovery journal saved in a buffer.
+ * @buffer: The buffer containing the saved state.
+ * @offset: The position in @buffer to decode from; advanced past the bytes consumed.
+ * @state: A pointer to a recovery journal state to hold the result of a successful decode.
+ *
+ * Return: VDO_SUCCESS or an error code.
+ */
+static int __must_check decode_recovery_journal_state_7_0(u8 *buffer, size_t *offset,
+							  struct recovery_journal_state_7_0 *state)
+{
+	sequence_number_t journal_start;
+	block_count_t logical_blocks_used;
+	block_count_t block_map_data_blocks;
+	size_t initial_offset;
+	struct header header;
+	int result;
+
+	vdo_decode_header(buffer, offset, &header);
+	result = vdo_validate_header(&VDO_RECOVERY_JOURNAL_HEADER_7_0, &header, true,
+				     __func__);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	/* Remember where the payload starts so its decoded length can be checked. */
+	initial_offset = *offset;
+
+	decode_u64_le(buffer, offset, &journal_start);
+	decode_u64_le(buffer, offset, &logical_blocks_used);
+	decode_u64_le(buffer, offset, &block_map_data_blocks);
+
+	result = VDO_ASSERT(VDO_RECOVERY_JOURNAL_HEADER_7_0.size == *offset - initial_offset,
+			    "decoded recovery journal component size must match header size");
+	if (result != VDO_SUCCESS)
+		return result;
+
+	/* Only publish the result once everything has been decoded and checked. */
+	*state = (struct recovery_journal_state_7_0) {
+		.journal_start = journal_start,
+		.logical_blocks_used = logical_blocks_used,
+		.block_map_data_blocks = block_map_data_blocks,
+	};
+
+	return VDO_SUCCESS;
+}
+
+/**
+ * vdo_get_journal_operation_name() - Get the name of a journal operation.
+ * @operation: The operation to name.
+ *
+ * Return: A constant string naming the operation; a generic "unknown" string for any value
+ *         other than the two remapping operations.
+ */
+const char *vdo_get_journal_operation_name(enum journal_operation operation)
+{
+	if (operation == VDO_JOURNAL_DATA_REMAPPING)
+		return "data remapping";
+
+	if (operation == VDO_JOURNAL_BLOCK_MAP_REMAPPING)
+		return "block map remapping";
+
+	return "unknown journal operation";
+}
+
+/**
+ * encode_slab_depot_state_2_0() - Encode the state of a slab depot into a buffer.
+ * @buffer: The buffer to encode into.
+ * @offset: The position in @buffer to encode at; advanced past the bytes written.
+ * @state: The slab depot state to encode.
+ */
+static void encode_slab_depot_state_2_0(u8 *buffer, size_t *offset,
+					struct slab_depot_state_2_0 state)
+{
+	size_t initial_offset;
+
+	vdo_encode_header(buffer, offset, &VDO_SLAB_DEPOT_HEADER_2_0);
+
+	initial_offset = *offset;
+	encode_u64_le(buffer, offset, state.slab_config.slab_blocks);
+	encode_u64_le(buffer, offset, state.slab_config.data_blocks);
+	encode_u64_le(buffer, offset, state.slab_config.reference_count_blocks);
+	encode_u64_le(buffer, offset, state.slab_config.slab_journal_blocks);
+	encode_u64_le(buffer, offset, state.slab_config.slab_journal_flushing_threshold);
+	encode_u64_le(buffer, offset, state.slab_config.slab_journal_blocking_threshold);
+	encode_u64_le(buffer, offset, state.slab_config.slab_journal_scrubbing_threshold);
+	encode_u64_le(buffer, offset, state.first_block);
+	encode_u64_le(buffer, offset, state.last_block);
+	/* The zone count is stored as a single byte. */
+	buffer[(*offset)++] = state.zone_count;
+
+	/* Name the right component so a failure here is not mistaken for a block map problem. */
+	VDO_ASSERT_LOG_ONLY(VDO_SLAB_DEPOT_HEADER_2_0.size == *offset - initial_offset,
+			    "encoded slab depot component size must match header size");
+}
+
+/**
+ * decode_slab_depot_state_2_0() - Decode slab depot component state version 2.0 from a buffer.
+ * @buffer: The buffer containing the encoded state.
+ * @offset: The position in @buffer to decode from; advanced past the bytes consumed.
+ * @state: Receives the decoded state; written only when decoding succeeds.
+ *
+ * Return: VDO_SUCCESS or an error code.
+ */
+static int decode_slab_depot_state_2_0(u8 *buffer, size_t *offset,
+				       struct slab_depot_state_2_0 *state)
+{
+	physical_block_number_t first_block, last_block;
+	struct slab_config slab_config;
+	zone_count_t zone_count;
+	block_count_t value;
+	size_t initial_offset;
+	struct header header;
+	int result;
+
+	vdo_decode_header(buffer, offset, &header);
+	result = vdo_validate_header(&VDO_SLAB_DEPOT_HEADER_2_0, &header, true,
+				     __func__);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	initial_offset = *offset;
+
+	/* The seven slab configuration values come first, each as a 64-bit field. */
+	decode_u64_le(buffer, offset, &value);
+	slab_config.slab_blocks = value;
+	decode_u64_le(buffer, offset, &value);
+	slab_config.data_blocks = value;
+	decode_u64_le(buffer, offset, &value);
+	slab_config.reference_count_blocks = value;
+	decode_u64_le(buffer, offset, &value);
+	slab_config.slab_journal_blocks = value;
+	decode_u64_le(buffer, offset, &value);
+	slab_config.slab_journal_flushing_threshold = value;
+	decode_u64_le(buffer, offset, &value);
+	slab_config.slab_journal_blocking_threshold = value;
+	decode_u64_le(buffer, offset, &value);
+	slab_config.slab_journal_scrubbing_threshold = value;
+
+	/* Then the bounds of the depot, followed by the zone count in a single byte. */
+	decode_u64_le(buffer, offset, &first_block);
+	decode_u64_le(buffer, offset, &last_block);
+	zone_count = buffer[*offset];
+	*offset += 1;
+
+	result = VDO_ASSERT(VDO_SLAB_DEPOT_HEADER_2_0.size == *offset - initial_offset,
+			    "decoded slab depot component size must match header size");
+	if (result != VDO_SUCCESS)
+		return result;
+
+	*state = (struct slab_depot_state_2_0) {
+		.slab_config = slab_config,
+		.first_block = first_block,
+		.last_block = last_block,
+		.zone_count = zone_count,
+	};
+
+	return VDO_SUCCESS;
+}
+
+/**
+ * vdo_configure_slab_depot() - Configure the slab depot.
+ * @partition: The slab depot partition
+ * @slab_config: The configuration of a single slab.
+ * @zone_count: The number of zones the depot will use.
+ * @state: The state structure to be configured.
+ *
+ * Fits as many whole slabs as possible into the partition and records the resulting depot bounds,
+ * slab configuration and zone count in @state. @state is left untouched on failure.
+ *
+ * Return: VDO_SUCCESS, VDO_NO_SPACE if not even one slab fits, or VDO_TOO_MANY_SLABS if more than
+ *         MAX_VDO_SLABS would fit.
+ */
+int vdo_configure_slab_depot(const struct partition *partition,
+			     struct slab_config slab_config, zone_count_t zone_count,
+			     struct slab_depot_state_2_0 *state)
+{
+	block_count_t slab_size = slab_config.slab_blocks;
+	block_count_t total_slab_blocks, total_data_blocks;
+	physical_block_number_t last_block;
+	size_t slab_count;
+
+	vdo_log_debug("slabDepot %s(block_count=%llu, first_block=%llu, slab_size=%llu, zone_count=%u)",
+		      __func__, (unsigned long long) partition->count,
+		      (unsigned long long) partition->offset,
+		      (unsigned long long) slab_size, zone_count);
+
+	/* Partial slabs are not allowed, so anything short of a full slab is left unused. */
+	slab_count = (partition->count / slab_size);
+	if (slab_count == 0)
+		return VDO_NO_SPACE;
+	if (slab_count > MAX_VDO_SLABS)
+		return VDO_TOO_MANY_SLABS;
+
+	total_data_blocks = slab_count * slab_config.data_blocks;
+	total_slab_blocks = slab_count * slab_config.slab_blocks;
+	last_block = partition->offset + total_slab_blocks;
+
+	*state = (struct slab_depot_state_2_0) {
+		.slab_config = slab_config,
+		.first_block = partition->offset,
+		.last_block = last_block,
+		.zone_count = zone_count,
+	};
+
+	vdo_log_debug("slab_depot last_block=%llu, total_data_blocks=%llu, slab_count=%zu, left_over=%llu",
+		      (unsigned long long) last_block,
+		      (unsigned long long) total_data_blocks, slab_count,
+		      (unsigned long long) (partition->count - (last_block - partition->offset)));
+
+	return VDO_SUCCESS;
+}
+
+/**
+ * vdo_configure_slab() - Measure and initialize the configuration to use for each slab.
+ * @slab_size: The number of blocks per slab.
+ * @slab_journal_blocks: The number of blocks for the slab journal.
+ * @slab_config: The slab configuration to initialize.
+ *
+ * Return: VDO_SUCCESS, or VDO_BAD_CONFIGURATION if the slab journal, or the slab journal plus
+ *         the reference count blocks, would leave no room for data.
+ */
+int vdo_configure_slab(block_count_t slab_size, block_count_t slab_journal_blocks,
+		       struct slab_config *slab_config)
+{
+	block_count_t ref_blocks, meta_blocks, data_blocks;
+	block_count_t flushing_threshold, blocking_threshold, scrubbing_threshold;
+	block_count_t unflushed, minimal_extra_space;
+
+	if (slab_journal_blocks >= slab_size)
+		return VDO_BAD_CONFIGURATION;
+
+	/*
+	 * Strictly speaking this should be solved as a recurrence, but the metadata currently
+	 * amounts to less than one block of ref_counts, so iterating further would recover at
+	 * most one data block per slab.
+	 */
+	ref_blocks = vdo_get_saved_reference_count_size(slab_size - slab_journal_blocks);
+	meta_blocks = (ref_blocks + slab_journal_blocks);
+
+	/* Guard against test code configuring slabs that are too small. */
+	if (meta_blocks >= slab_size)
+		return VDO_BAD_CONFIGURATION;
+
+	/*
+	 * A very small slab is assumed to belong to a unit test, and its data block count is
+	 * rounded down to a power of two (wasting the remainder). Many tests need data_blocks to
+	 * equal the exact capacity of the configured volume, which used to hold automatically
+	 * when slab sizes were powers of two and every block in a slab was a data block.
+	 *
+	 * TODO: Try to figure out some way of structuring testParameters and unit tests so this
+	 * hack isn't needed without having to edit several unit tests every time the metadata size
+	 * changes by one block.
+	 */
+	data_blocks = slab_size - meta_blocks;
+	if ((slab_size < 1024) && !is_power_of_2(data_blocks))
+		data_blocks = ((block_count_t) 1 << ilog2(data_blocks));
+
+	/*
+	 * Slab journal thresholds. Production flushes at 168 of 224 blocks, i.e. 3/4ths, and the
+	 * same ratio (rounded up) is applied to every journal size.
+	 */
+	flushing_threshold = ((slab_journal_blocks * 3) + 3) / 4;
+
+	/*
+	 * Blocking starts 5/7ths of the way from the flushing threshold to the end of the
+	 * journal: far enough past flushing to avoid needless delays, and far enough from the
+	 * end to survive several successive recovery failures.
+	 */
+	unflushed = slab_journal_blocks - flushing_threshold;
+	blocking_threshold = flushing_threshold + ((unflushed * 5) / 7);
+
+	/* The scrubbing threshold should be at least 2048 entries before the end of the journal. */
+	minimal_extra_space = 1 + (MAXIMUM_VDO_USER_VIOS / VDO_SLAB_JOURNAL_FULL_ENTRIES_PER_BLOCK);
+	if (slab_journal_blocks > minimal_extra_space)
+		scrubbing_threshold = slab_journal_blocks - minimal_extra_space;
+	else
+		scrubbing_threshold = blocking_threshold;
+
+	/* Blocking may never be set later than scrubbing. */
+	if (blocking_threshold > scrubbing_threshold)
+		blocking_threshold = scrubbing_threshold;
+
+	*slab_config = (struct slab_config) {
+		.slab_blocks = slab_size,
+		.data_blocks = data_blocks,
+		.reference_count_blocks = ref_blocks,
+		.slab_journal_blocks = slab_journal_blocks,
+		.slab_journal_flushing_threshold = flushing_threshold,
+		.slab_journal_blocking_threshold = blocking_threshold,
+		.slab_journal_scrubbing_threshold = scrubbing_threshold,
+	};
+
+	return VDO_SUCCESS;
+}
+
+/**
+ * vdo_decode_slab_journal_entry() - Decode a slab journal entry.
+ * @block: The journal block holding the entry.
+ * @entry_count: The number of the entry.
+ *
+ * Return: The decoded entry. Its operation is overridden to VDO_JOURNAL_BLOCK_MAP_REMAPPING when
+ *         the block header reports block map increments and the entry's bit is set in the
+ *         block's entry type bitmap.
+ */
+struct slab_journal_entry vdo_decode_slab_journal_entry(struct packed_slab_journal_block *block,
+							journal_entry_count_t entry_count)
+{
+	struct slab_journal_entry decoded =
+		vdo_unpack_slab_journal_entry(&block->payload.entries[entry_count]);
+
+	/* The entry type bitmap is only consulted when the header flags block map increments. */
+	if (!block->header.has_block_map_increments)
+		return decoded;
+
+	/* One bit per entry: byte (entry_count / 8), bit (entry_count % 8). */
+	if ((block->payload.full_entries.entry_types[entry_count / 8] &
+	     ((u8) 1 << (entry_count % 8))) != 0)
+		decoded.operation = VDO_JOURNAL_BLOCK_MAP_REMAPPING;
+
+	return decoded;
+}
+
+/**
+ * allocate_partition() - Allocate a partition and add it to a layout.
+ * @layout: The layout containing the partition.
+ * @id: The id of the partition.
+ * @offset: The offset into the layout at which the partition begins.
+ * @size: The size of the partition in blocks.
+ *
+ * The new partition becomes the head of the layout's partition list.
+ *
+ * Return: VDO_SUCCESS or an error.
+ */
+static int allocate_partition(struct layout *layout, u8 id,
+			      physical_block_number_t offset, block_count_t size)
+{
+	struct partition *partition;
+	int result = vdo_allocate(1, struct partition, __func__, &partition);
+
+	if (result != VDO_SUCCESS)
+		return result;
+
+	partition->id = id;
+	partition->count = size;
+	partition->offset = offset;
+
+	/* Push onto the front of the list. */
+	partition->next = layout->head;
+	layout->head = partition;
+
+	return VDO_SUCCESS;
+}
+
+/**
+ * make_partition() - Create a new partition from the beginning or end of the unused space in a
+ *                    layout.
+ * @layout: The layout.
+ * @id: The id of the partition to make.
+ * @size: The number of blocks to carve out; if 0, all remaining space will be used.
+ * @beginning: True if the partition should start at the beginning of the unused space.
+ *
+ * Return: A success or error code, particularly VDO_NO_SPACE if there are fewer than size blocks
+ *         remaining (or none at all when size is 0), and VDO_PARTITION_EXISTS if the lookup of
+ *         @id returns anything other than VDO_UNKNOWN_PARTITION.
+ */
+static int __must_check make_partition(struct layout *layout, enum partition_id id,
+				       block_count_t size, bool beginning)
+{
+	block_count_t free_blocks = layout->last_free - layout->first_free;
+	physical_block_number_t offset;
+	int result;
+
+	/* A request for zero blocks means "everything that is left". */
+	if (size == 0)
+		size = free_blocks;
+
+	/* Still zero here only if nothing is left; otherwise the request must fit. */
+	if ((size == 0) || (size > free_blocks))
+		return VDO_NO_SPACE;
+
+	if (vdo_get_partition(layout, id, NULL) != VDO_UNKNOWN_PARTITION)
+		return VDO_PARTITION_EXISTS;
+
+	if (beginning)
+		offset = layout->first_free;
+	else
+		offset = layout->last_free - size;
+
+	result = allocate_partition(layout, id, offset, size);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	/* Shrink the free region from whichever end was used. */
+	layout->num_partitions++;
+	if (beginning)
+		layout->first_free += size;
+	else
+		layout->last_free -= size;
+
+	return VDO_SUCCESS;
+}
+
+/**
+ * vdo_initialize_layout() - Lay out the partitions of a vdo.
+ * @size: The entire size of the vdo.
+ * @offset: The start of the layout on the underlying storage in blocks.
+ * @block_map_blocks: The size of the block map partition.
+ * @journal_blocks: The size of the journal partition.
+ * @summary_blocks: The size of the slab summary partition.
+ * @layout: The layout to initialize.
+ *
+ * The block map is placed at the start of the free space, the slab summary and then the recovery
+ * journal are carved from its end, and the slab depot receives whatever remains. If any
+ * partition cannot be made, the layout is uninitialized again before returning.
+ *
+ * Return: VDO_SUCCESS or an error.
+ */
+int vdo_initialize_layout(block_count_t size, physical_block_number_t offset,
+			  block_count_t block_map_blocks, block_count_t journal_blocks,
+			  block_count_t summary_blocks, struct layout *layout)
+{
+	block_count_t necessary_size =
+		(offset + block_map_blocks + journal_blocks + summary_blocks);
+	int result;
+
+	if (necessary_size > size)
+		return vdo_log_error_strerror(VDO_NO_SPACE,
+					      "Not enough space to make a VDO");
+
+	*layout = (struct layout) {
+		.start = offset,
+		.size = size,
+		.first_free = offset,
+		.last_free = size,
+		.num_partitions = 0,
+		.head = NULL,
+	};
+
+	/* Each step runs only if every previous one succeeded. */
+	result = make_partition(layout, VDO_BLOCK_MAP_PARTITION, block_map_blocks, true);
+	if (result == VDO_SUCCESS)
+		result = make_partition(layout, VDO_SLAB_SUMMARY_PARTITION,
+					summary_blocks, false);
+	if (result == VDO_SUCCESS)
+		result = make_partition(layout, VDO_RECOVERY_JOURNAL_PARTITION,
+					journal_blocks, false);
+	if (result == VDO_SUCCESS)
+		result = make_partition(layout, VDO_SLAB_DEPOT_PARTITION, 0, true);
+
+	if (result != VDO_SUCCESS)
+		vdo_uninitialize_layout(layout);
+
+	return result;
+}
+
+/**
+ * vdo_uninitialize_layout() - Clean up a layout.
+ * @layout: The layout to clean up.
+ *
+ * Frees every partition on the layout's list and zeroes the layout structure. All partitions
+ * created by this layout become invalid pointers.
+ */
+void vdo_uninitialize_layout(struct layout *layout)
+{
+	struct partition *doomed;
+
+	for (doomed = layout->head; doomed != NULL; doomed = layout->head) {
+		/* Unlink before freeing so the list never refers to freed memory. */
+		layout->head = doomed->next;
+		vdo_free(doomed);
+	}
+
+	memset(layout, 0, sizeof(struct layout));
+}
+
+/**
+ * vdo_get_partition() - Get a partition by id.
+ * @layout: The layout from which to get a partition.
+ * @id: The id of the partition.
+ * @partition_ptr: A pointer to hold the partition; may be NULL to only test for existence.
+ *
+ * Return: VDO_SUCCESS if the partition was found, VDO_UNKNOWN_PARTITION otherwise.
+ */
+int vdo_get_partition(struct layout *layout, enum partition_id id,
+		      struct partition **partition_ptr)
+{
+	struct partition *candidate = layout->head;
+
+	while (candidate != NULL) {
+		if (candidate->id != id) {
+			candidate = candidate->next;
+			continue;
+		}
+
+		if (partition_ptr != NULL)
+			*partition_ptr = candidate;
+		return VDO_SUCCESS;
+	}
+
+	return VDO_UNKNOWN_PARTITION;
+}
+
+/**
+ * vdo_get_known_partition() - Get a partition by id from a validated layout.
+ * @layout: The layout from which to get a partition.
+ * @id: The id of the partition.
+ *
+ * The partition is expected to exist; a failed lookup is reported through a log-only assertion.
+ *
+ * Return: the partition, or NULL if the lookup unexpectedly failed.
+ */
+struct partition *vdo_get_known_partition(struct layout *layout, enum partition_id id)
+{
+	/*
+	 * vdo_get_partition() only stores through its output pointer on success, so start from
+	 * NULL to avoid returning an indeterminate pointer when the lookup fails.
+	 */
+	struct partition *partition = NULL;
+	int result = vdo_get_partition(layout, id, &partition);
+
+	VDO_ASSERT_LOG_ONLY(result == VDO_SUCCESS, "layout has expected partition: %u", id);
+
+	return partition;
+}
+
+/**
+ * encode_layout() - Encode a layout and its partitions into a buffer.
+ * @buffer: The buffer to encode into.
+ * @offset: The position in @buffer to encode at; advanced past the bytes written.
+ * @layout: The layout to encode.
+ *
+ * Writes the version 3.0 layout header, the free-space bounds and partition count, and then one
+ * record per partition in list order.
+ */
+static void encode_layout(u8 *buffer, size_t *offset, const struct layout *layout)
+{
+	struct header header = VDO_LAYOUT_HEADER_3_0;
+	const struct partition *partition = layout->head;
+	size_t initial_offset;
+
+	BUILD_BUG_ON(sizeof(enum partition_id) != sizeof(u8));
+	VDO_ASSERT_LOG_ONLY(layout->num_partitions <= U8_MAX,
+			    "layout partition count must fit in a byte");
+
+	vdo_encode_header(buffer, offset, &header);
+
+	initial_offset = *offset;
+	encode_u64_le(buffer, offset, layout->first_free);
+	encode_u64_le(buffer, offset, layout->last_free);
+	buffer[*offset] = layout->num_partitions;
+	*offset += 1;
+
+	VDO_ASSERT_LOG_ONLY(sizeof(struct layout_3_0) == *offset - initial_offset,
+			    "encoded size of a layout header must match structure");
+
+	while (partition != NULL) {
+		buffer[*offset] = partition->id;
+		*offset += 1;
+		encode_u64_le(buffer, offset, partition->offset);
+		/* A zero placeholder for a field retained only for backwards compatibility. */
+		encode_u64_le(buffer, offset, 0);
+		encode_u64_le(buffer, offset, partition->count);
+		partition = partition->next;
+	}
+
+	VDO_ASSERT_LOG_ONLY(header.size == *offset - initial_offset,
+			    "encoded size of a layout must match header size");
+}
+
+/**
+ * decode_layout() - Decode a layout and its partitions from a buffer.
+ * @buffer: The buffer containing the encoded layout.
+ * @offset: The position in @buffer to decode from; advanced past the bytes consumed.
+ * @start: The block at which the layout begins.
+ * @size: The block the partitions are expected to reach when their sizes are added to @start.
+ * @layout: The layout to fill in.
+ *
+ * Decoded partitions are pushed onto @layout's existing list head.
+ * NOTE(review): presumably the caller passes a zeroed layout so that the list starts out
+ * empty -- confirm against the caller.
+ *
+ * Return: VDO_SUCCESS or an error. Once partition allocation has begun, any failure
+ *         uninitializes @layout before returning.
+ */
+static int decode_layout(u8 *buffer, size_t *offset, physical_block_number_t start,
+			 block_count_t size, struct layout *layout)
+{
+	physical_block_number_t first_free, last_free;
+	struct layout_3_0 layout_header;
+	struct partition *partition;
+	struct header header;
+	size_t initial_offset;
+	u8 partition_count;
+	int result;
+	u8 i;
+
+	vdo_decode_header(buffer, offset, &header);
+	/* An encoded layout varies in length, so only its minimum size is enforced here. */
+	result = vdo_validate_header(&VDO_LAYOUT_HEADER_3_0, &header, false, __func__);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	initial_offset = *offset;
+	decode_u64_le(buffer, offset, &first_free);
+	decode_u64_le(buffer, offset, &last_free);
+	partition_count = buffer[*offset];
+	*offset += 1;
+
+	layout_header = (struct layout_3_0) {
+		.first_free = first_free,
+		.last_free = last_free,
+		.partition_count = partition_count,
+	};
+
+	result = VDO_ASSERT(sizeof(struct layout_3_0) == *offset - initial_offset,
+			    "decoded size of a layout header must match structure");
+	if (result != VDO_SUCCESS)
+		return result;
+
+	layout->start = start;
+	layout->size = size;
+	layout->first_free = layout_header.first_free;
+	layout->last_free = layout_header.last_free;
+	layout->num_partitions = layout_header.partition_count;
+
+	if (layout->num_partitions > VDO_PARTITION_COUNT)
+		return vdo_log_error_strerror(VDO_UNKNOWN_PARTITION,
+					      "layout has extra partitions");
+
+	for (i = 0; i < layout->num_partitions; i++) {
+		u64 partition_offset, count;
+		u8 id = buffer[*offset];
+
+		*offset += 1;
+		decode_u64_le(buffer, offset, &partition_offset);
+		/* Skip the 64-bit field stored between the offset and the count. */
+		*offset += sizeof(u64);
+		decode_u64_le(buffer, offset, &count);
+
+		result = allocate_partition(layout, id, partition_offset, count);
+		if (result != VDO_SUCCESS) {
+			vdo_uninitialize_layout(layout);
+			return result;
+		}
+	}
+
+	/*
+	 * Every required partition must be present. Their sizes are accumulated onto start so
+	 * that total coverage can be checked afterwards.
+	 */
+	for (i = 0; i < VDO_PARTITION_COUNT; i++) {
+		result = vdo_get_partition(layout, REQUIRED_PARTITIONS[i], &partition);
+		if (result != VDO_SUCCESS) {
+			vdo_uninitialize_layout(layout);
+			return vdo_log_error_strerror(result,
+						      "layout is missing required partition %u",
+						      REQUIRED_PARTITIONS[i]);
+		}
+
+		start += partition->count;
+	}
+
+	if (start != size) {
+		vdo_uninitialize_layout(layout);
+		return vdo_log_error_strerror(UDS_BAD_STATE,
+					      "partitions do not cover the layout");
+	}
+
+	return VDO_SUCCESS;
+}
+
+/**
+ * pack_vdo_config() - Convert a vdo_config to its packed on-disk representation.
+ * @config: The vdo config to convert.
+ *
+ * Return: The config with every field converted to little-endian.
+ */
+static struct packed_vdo_config pack_vdo_config(struct vdo_config config)
+{
+	struct packed_vdo_config packed = {
+		.logical_blocks = __cpu_to_le64(config.logical_blocks),
+		.physical_blocks = __cpu_to_le64(config.physical_blocks),
+		.slab_size = __cpu_to_le64(config.slab_size),
+		.recovery_journal_size = __cpu_to_le64(config.recovery_journal_size),
+		.slab_journal_blocks = __cpu_to_le64(config.slab_journal_blocks),
+	};
+
+	return packed;
+}
+
+/**
+ * pack_vdo_component() - Convert a vdo_component to its packed on-disk representation.
+ * @component: The VDO component data to convert.
+ *
+ * Return: The component with every field converted to little-endian.
+ */
+static struct packed_vdo_component_41_0 pack_vdo_component(const struct vdo_component component)
+{
+	struct packed_vdo_component_41_0 packed = {
+		.state = __cpu_to_le32(component.state),
+		.complete_recoveries = __cpu_to_le64(component.complete_recoveries),
+		.read_only_recoveries = __cpu_to_le64(component.read_only_recoveries),
+		.config = pack_vdo_config(component.config),
+		.nonce = __cpu_to_le64(component.nonce),
+	};
+
+	return packed;
+}
+
+/**
+ * encode_vdo_component() - Encode the component data for the vdo itself into a buffer.
+ * @buffer: The buffer to encode into.
+ * @offset: The position in @buffer to encode at; advanced past the bytes written.
+ * @component: The component data to encode.
+ *
+ * Writes the 41.0 version number followed by the packed component.
+ */
+static void encode_vdo_component(u8 *buffer, size_t *offset,
+				 struct vdo_component component)
+{
+	const struct packed_vdo_component_41_0 packed = pack_vdo_component(component);
+
+	encode_version_number(buffer, offset, VDO_COMPONENT_DATA_41_0);
+
+	/* The packed form is already in on-disk byte order, so it is copied verbatim. */
+	memcpy(buffer + *offset, &packed, sizeof(packed));
+	*offset += sizeof(packed);
+}
+
+/**
+ * unpack_vdo_config() - Convert a packed_vdo_config to its native in-memory representation.
+ * @config: The packed vdo config to convert.
+ *
+ * Return: The config with every field converted from little-endian to CPU byte order.
+ */
+static struct vdo_config unpack_vdo_config(struct packed_vdo_config config)
+{
+	struct vdo_config unpacked = {
+		.logical_blocks = __le64_to_cpu(config.logical_blocks),
+		.physical_blocks = __le64_to_cpu(config.physical_blocks),
+		.slab_size = __le64_to_cpu(config.slab_size),
+		.recovery_journal_size = __le64_to_cpu(config.recovery_journal_size),
+		.slab_journal_blocks = __le64_to_cpu(config.slab_journal_blocks),
+	};
+
+	return unpacked;
+}
+
+/**
+ * unpack_vdo_component_41_0() - Convert a packed_vdo_component_41_0 to its native in-memory
+ *				 representation.
+ * @component: The packed vdo component data to convert.
+ *
+ * Return: The component with every field converted from little-endian to CPU byte order.
+ */
+static struct vdo_component unpack_vdo_component_41_0(struct packed_vdo_component_41_0 component)
+{
+	struct vdo_component unpacked = {
+		.state = __le32_to_cpu(component.state),
+		.complete_recoveries = __le64_to_cpu(component.complete_recoveries),
+		.read_only_recoveries = __le64_to_cpu(component.read_only_recoveries),
+		.config = unpack_vdo_config(component.config),
+		.nonce = __le64_to_cpu(component.nonce),
+	};
+
+	return unpacked;
+}
+
+/**
+ * decode_vdo_component() - Decode the component data for the vdo itself out of the super block.
+ * @buffer: The buffer containing the encoded component.
+ * @offset: The position in @buffer to decode from; advanced past the bytes consumed.
+ * @component: Receives the decoded component; written only if the version is 41.0.
+ *
+ * Return: VDO_SUCCESS or an error.
+ */
+static int decode_vdo_component(u8 *buffer, size_t *offset, struct vdo_component *component)
+{
+	struct packed_vdo_component_41_0 packed;
+	struct version_number version;
+	int result;
+
+	decode_version_number(buffer, offset, &version);
+	result = validate_version(version, VDO_COMPONENT_DATA_41_0,
+				  "VDO component data");
+	if (result != VDO_SUCCESS)
+		return result;
+
+	/* Copy the packed bytes out of the buffer before converting them. */
+	memcpy(&packed, buffer + *offset, sizeof(packed));
+	*offset += sizeof(packed);
+
+	*component = unpack_vdo_component_41_0(packed);
+	return VDO_SUCCESS;
+}
+
+/**
+ * vdo_validate_config() - Validate constraints on a VDO config.
+ * @config: The VDO config.
+ * @physical_block_count: The minimum block count of the underlying storage.
+ * @logical_block_count: The expected logical size of the VDO, or 0 if the logical size may be
+ *			 unspecified.
+ *
+ * Return: A success or error code; VDO_OUT_OF_RANGE if the physical block count exceeds the
+ *         maximum, and VDO_PARAMETER_MISMATCH if a supplied size differs from the configured one.
+ */
+int vdo_validate_config(const struct vdo_config *config,
+			block_count_t physical_block_count,
+			block_count_t logical_block_count)
+{
+	struct slab_config slab_config;
+	int result;
+
+	/* Slab geometry. */
+	result = VDO_ASSERT(config->slab_size > 0, "slab size unspecified");
+	if (result != VDO_SUCCESS)
+		return result;
+
+	result = VDO_ASSERT(is_power_of_2(config->slab_size),
+			    "slab size must be a power of two");
+	if (result != VDO_SUCCESS)
+		return result;
+
+	result = VDO_ASSERT(config->slab_size <= (1 << MAX_VDO_SLAB_BITS),
+			    "slab size must be less than or equal to 2^%d",
+			    MAX_VDO_SLAB_BITS);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	/* Slab journal bounds. */
+	result = VDO_ASSERT(config->slab_journal_blocks >= MINIMUM_VDO_SLAB_JOURNAL_BLOCKS,
+			    "slab journal size meets minimum size");
+	if (result != VDO_SUCCESS)
+		return result;
+
+	result = VDO_ASSERT(config->slab_journal_blocks <= config->slab_size,
+			    "slab journal size is within expected bound");
+	if (result != VDO_SUCCESS)
+		return result;
+
+	/* The slab must actually be configurable and leave room for data. */
+	result = vdo_configure_slab(config->slab_size, config->slab_journal_blocks,
+				    &slab_config);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	result = VDO_ASSERT((slab_config.data_blocks >= 1),
+			    "slab must be able to hold at least one block");
+	if (result != VDO_SUCCESS)
+		return result;
+
+	/* Physical size. */
+	result = VDO_ASSERT(config->physical_blocks > 0, "physical blocks unspecified");
+	if (result != VDO_SUCCESS)
+		return result;
+
+	result = VDO_ASSERT(config->physical_blocks <= MAXIMUM_VDO_PHYSICAL_BLOCKS,
+			    "physical block count %llu exceeds maximum %llu",
+			    (unsigned long long) config->physical_blocks,
+			    (unsigned long long) MAXIMUM_VDO_PHYSICAL_BLOCKS);
+	if (result != VDO_SUCCESS)
+		return VDO_OUT_OF_RANGE;
+
+	if (config->physical_blocks != physical_block_count) {
+		vdo_log_error("A physical size of %llu blocks was specified, not the %llu blocks configured in the vdo super block",
+			      (unsigned long long) physical_block_count,
+			      (unsigned long long) config->physical_blocks);
+		return VDO_PARAMETER_MISMATCH;
+	}
+
+	/* Logical size; only compared when the caller supplied an expected value. */
+	if (logical_block_count > 0) {
+		result = VDO_ASSERT((config->logical_blocks > 0),
+				    "logical blocks unspecified");
+		if (result != VDO_SUCCESS)
+			return result;
+
+		if (config->logical_blocks != logical_block_count) {
+			vdo_log_error("A logical size of %llu blocks was specified, but that differs from the %llu blocks configured in the vdo super block",
+				      (unsigned long long) logical_block_count,
+				      (unsigned long long) config->logical_blocks);
+			return VDO_PARAMETER_MISMATCH;
+		}
+	}
+
+	result = VDO_ASSERT(config->logical_blocks <= MAXIMUM_VDO_LOGICAL_BLOCKS,
+			    "logical blocks too large");
+	if (result != VDO_SUCCESS)
+		return result;
+
+	/* Recovery journal size. */
+	result = VDO_ASSERT(config->recovery_journal_size > 0,
+			    "recovery journal size unspecified");
+	if (result != VDO_SUCCESS)
+		return result;
+
+	/* The last check's outcome is the overall outcome. */
+	return VDO_ASSERT(is_power_of_2(config->recovery_journal_size),
+			  "recovery journal size must be a power of two");
+}
+
+/**
+ * vdo_destroy_component_states() - Clean up any allocations in a vdo_component_states.
+ * @states: The component states to destroy; may be NULL, in which case nothing is done.
+ */
+void vdo_destroy_component_states(struct vdo_component_states *states)
+{
+	/* The layout is the only member cleaned up here. */
+	if (states != NULL)
+		vdo_uninitialize_layout(&states->layout);
+}
+
+/**
+ * decode_components() - Decode the components now that we know the component data is a version we
+ * understand.
+ * @buffer: The buffer being decoded.
+ * @offset: The offset to start decoding from; it is handed to each decoder in turn.
+ * @geometry: The vdo geometry; only the start of its data region is read here.
+ * @states: An object to hold the successfully decoded state.
+ *
+ * The components are decoded in the order vdo component, layout, recovery journal, slab depot,
+ * block map, which is the same order vdo_encode_component_states() writes them in.
+ *
+ * On failure this function returns immediately and does not uninitialize states->layout; its
+ * caller, vdo_decode_component_states(), does that.
+ *
+ * Return: VDO_SUCCESS or an error.
+ */
+static int __must_check decode_components(u8 *buffer, size_t *offset,
+ struct volume_geometry *geometry,
+ struct vdo_component_states *states)
+{
+ int result;
+
+ decode_vdo_component(buffer, offset, &states->vdo);
+
+ result = decode_layout(buffer, offset, vdo_get_data_region_start(*geometry) + 1,
+ states->vdo.config.physical_blocks, &states->layout);
+ if (result != VDO_SUCCESS)
+ return result;
+
+ result = decode_recovery_journal_state_7_0(buffer, offset,
+ &states->recovery_journal);
+ if (result != VDO_SUCCESS)
+ return result;
+
+ result = decode_slab_depot_state_2_0(buffer, offset, &states->slab_depot);
+ if (result != VDO_SUCCESS)
+ return result;
+
+ result = decode_block_map_state_2_0(buffer, offset, &states->block_map);
+ if (result != VDO_SUCCESS)
+ return result;
+
+ VDO_ASSERT_LOG_ONLY(*offset == VDO_COMPONENT_DATA_OFFSET + VDO_COMPONENT_DATA_SIZE,
+ "All decoded component data was used");
+ return VDO_SUCCESS;
+}
+
+/**
+ * vdo_decode_component_states() - Decode the component data carried in a super block.
+ * @buffer: The buffer containing the encoded super block contents.
+ * @geometry: The vdo geometry.
+ * @states: Where the decoded states are stored.
+ *
+ * Decoding starts at VDO_COMPONENT_DATA_OFFSET. If decoding the components fails, the layout in
+ * @states is uninitialized before the error is returned.
+ *
+ * Return: VDO_SUCCESS or an error.
+ */
+int vdo_decode_component_states(u8 *buffer, struct volume_geometry *geometry,
+				struct vdo_component_states *states)
+{
+	size_t offset = VDO_COMPONENT_DATA_OFFSET;
+	int result;
+
+	/* A leading 32-bit field is kept only for backwards compatibility. */
+	decode_u32_le(buffer, &offset, &states->unused);
+
+	/* The volume version must be acceptable before any component is decoded. */
+	decode_version_number(buffer, &offset, &states->volume_version);
+	result = validate_version(VDO_VOLUME_VERSION_67_0, states->volume_version,
+				  "volume");
+	if (result != VDO_SUCCESS)
+		return result;
+
+	result = decode_components(buffer, &offset, geometry, states);
+	if (result == VDO_SUCCESS)
+		return VDO_SUCCESS;
+
+	vdo_uninitialize_layout(&states->layout);
+	return result;
+}
+
+/**
+ * vdo_validate_component_states() - Check a decoded super block configuration for consistency.
+ * @states: The state decoded from the super block.
+ * @geometry_nonce: The nonce from the geometry block.
+ * @physical_size: The minimum block count of the underlying storage.
+ * @logical_size: The expected logical size of the VDO, or 0 if the logical size may be
+ *                unspecified.
+ *
+ * The geometry nonce must equal the nonce stored in the vdo component; if it does, the rest of
+ * the check is delegated to vdo_validate_config().
+ *
+ * Return: VDO_SUCCESS or an error if the configuration is invalid.
+ */
+int vdo_validate_component_states(struct vdo_component_states *states,
+				  nonce_t geometry_nonce, block_count_t physical_size,
+				  block_count_t logical_size)
+{
+	if (geometry_nonce == states->vdo.nonce)
+		return vdo_validate_config(&states->vdo.config, physical_size,
+					   logical_size);
+
+	return vdo_log_error_strerror(VDO_BAD_NONCE,
+				      "Geometry nonce %llu does not match superblock nonce %llu",
+				      (unsigned long long) geometry_nonce,
+				      (unsigned long long) states->vdo.nonce);
+}
+
+/**
+ * vdo_encode_component_states() - Encode the state of all vdo components in the super block.
+ * @buffer: The buffer to encode into.
+ * @offset: The offset in @buffer at which to start encoding; it is handed to each encoder in
+ *          turn.
+ * @states: The component states to encode.
+ *
+ * The fields are written in the order: the unused compatibility word, the volume version, the
+ * vdo component, the layout, the recovery journal state, the slab depot state, and the block map
+ * state. Afterwards the offset is expected to be VDO_COMPONENT_DATA_OFFSET +
+ * VDO_COMPONENT_DATA_SIZE; a mismatch is only logged.
+ */
+static void vdo_encode_component_states(u8 *buffer, size_t *offset,
+ const struct vdo_component_states *states)
+{
+ /* This is for backwards compatibility. */
+ encode_u32_le(buffer, offset, states->unused);
+ encode_version_number(buffer, offset, states->volume_version);
+ encode_vdo_component(buffer, offset, states->vdo);
+ encode_layout(buffer, offset, &states->layout);
+ encode_recovery_journal_state_7_0(buffer, offset, states->recovery_journal);
+ encode_slab_depot_state_2_0(buffer, offset, states->slab_depot);
+ encode_block_map_state_2_0(buffer, offset, states->block_map);
+
+ VDO_ASSERT_LOG_ONLY(*offset == VDO_COMPONENT_DATA_OFFSET + VDO_COMPONENT_DATA_SIZE,
+ "All super block component data was encoded");
+}
+
+/**
+ * vdo_encode_super_block() - Produce the on-disk form of a super block.
+ * @buffer: The buffer to encode into, starting at offset 0.
+ * @states: The component states to store in the super block.
+ *
+ * The encoding is a header, then the component data, then a checksum of everything before it.
+ */
+void vdo_encode_super_block(u8 *buffer, struct vdo_component_states *states)
+{
+	size_t offset = 0;
+	struct header header = SUPER_BLOCK_HEADER_12_0;
+	u32 crc;
+
+	/* The size in the header template is extended by the size of the component data. */
+	header.size += VDO_COMPONENT_DATA_SIZE;
+	vdo_encode_header(buffer, &offset, &header);
+	vdo_encode_component_states(buffer, &offset, states);
+
+	/* The checksum covers every byte encoded so far and is stored right after them. */
+	crc = vdo_crc32(buffer, offset);
+	encode_u32_le(buffer, &offset, crc);
+
+	/*
+	 * The buffer is a full block, but a torn write could corrupt anything beyond the first
+	 * sector, so the whole encoding has to fit within that sector.
+	 */
+	VDO_ASSERT_LOG_ONLY(offset <= VDO_SECTOR_SIZE,
+			    "entire superblock must fit in one sector");
+}
+
+/**
+ * vdo_decode_super_block() - Check the header and checksum of an on-disk super block.
+ * @buffer: The buffer holding the encoded super block, starting at offset 0.
+ *
+ * The component data is skipped over rather than decoded here; see
+ * vdo_decode_component_states() for that.
+ *
+ * Return: VDO_SUCCESS, VDO_UNSUPPORTED_VERSION if the recorded size is too large,
+ *         VDO_CHECKSUM_MISMATCH if the stored checksum does not match, or another error from
+ *         header validation.
+ */
+int vdo_decode_super_block(u8 *buffer)
+{
+	struct header header;
+	size_t offset = 0;
+	u32 computed_checksum, stored_checksum;
+	int result;
+
+	/* The header has to describe a super block we know how to read. */
+	vdo_decode_header(buffer, &offset, &header);
+	result = vdo_validate_header(&SUPER_BLOCK_HEADER_12_0, &header, false, __func__);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	if (header.size > VDO_COMPONENT_DATA_SIZE + sizeof(u32)) {
+		/*
+		 * Neither the release version nor the checksum can be checked until the
+		 * content size is known, so an unexpected size is reported as a version
+		 * mismatch.
+		 */
+		return vdo_log_error_strerror(VDO_UNSUPPORTED_VERSION,
+					      "super block contents too large: %zu",
+					      header.size);
+	}
+
+	/* Step over the component data; the checksum covers everything before it. */
+	offset += VDO_COMPONENT_DATA_SIZE;
+	computed_checksum = vdo_crc32(buffer, offset);
+	decode_u32_le(buffer, &offset, &stored_checksum);
+
+	result = VDO_ASSERT(offset == VDO_SUPER_BLOCK_FIXED_SIZE + VDO_COMPONENT_DATA_SIZE,
+			    "must have decoded entire superblock payload");
+	if (result != VDO_SUCCESS)
+		return result;
+
+	if (computed_checksum != stored_checksum)
+		return VDO_CHECKSUM_MISMATCH;
+
+	return VDO_SUCCESS;
+}
diff --git a/drivers/md/dm-vdo/encodings.h b/drivers/md/dm-vdo/encodings.h
new file mode 100644
index 000000000000..e5ff2b0aaa79
--- /dev/null
+++ b/drivers/md/dm-vdo/encodings.h
@@ -0,0 +1,1298 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#ifndef VDO_ENCODINGS_H
+#define VDO_ENCODINGS_H
+
+#include <linux/blk_types.h>
+#include <linux/crc32.h>
+#include <linux/limits.h>
+#include <linux/uuid.h>
+
+#include "numeric.h"
+
+#include "constants.h"
+#include "types.h"
+
+/*
+ * An in-memory representation of a version number for versioned structures on disk.
+ *
+ * A version number consists of two portions, a major version and a minor version. Any format
+ * change which does not require an explicit upgrade step from the previous version should
+ * increment the minor version. Any format change which either requires an explicit upgrade step,
+ * or is wholly incompatible (i.e. can not be upgraded to), should increment the major version, and
+ * set the minor version to 0.
+ *
+ * Both fields are in native byte order; struct packed_version_number is the on-disk form.
+ */
+struct version_number {
+ u32 major_version;
+ u32 minor_version;
+};
+
+/*
+ * A packed, machine-independent, on-disk representation of a version_number. Both fields are
+ * stored in little-endian byte order.
+ *
+ * vdo_pack_version_number() and vdo_unpack_version_number() convert to and from the in-memory
+ * struct version_number.
+ */
+struct packed_version_number {
+ __le32 major_version;
+ __le32 minor_version;
+} __packed;
+
+/*
+ * The registry of component ids for use in headers. These are the values carried in the id field
+ * of struct header, which names the component a header belongs to.
+ */
+#define VDO_SUPER_BLOCK 0
+#define VDO_LAYOUT 1
+#define VDO_RECOVERY_JOURNAL 2
+#define VDO_SLAB_DEPOT 3
+#define VDO_BLOCK_MAP 4
+#define VDO_GEOMETRY_BLOCK 5
+
+/*
+ * The header for versioned data stored on disk. This is the native in-memory form; struct
+ * packed_header is the on-disk form, in which the size is a 64-bit little-endian value rather
+ * than a size_t.
+ */
+struct header {
+ u32 id; /* The component this is a header for */
+ struct version_number version; /* The version of the data format */
+ size_t size; /* The size of the data following this header */
+};
+
+/*
+ * A packed, machine-independent, on-disk representation of a component header. Its size is
+ * VDO_ENCODED_HEADER_SIZE; vdo_pack_header() and vdo_unpack_header() convert to and from struct
+ * header.
+ */
+struct packed_header {
+ __le32 id; /* The component id, little-endian */
+ struct packed_version_number version; /* The packed data format version */
+ __le64 size; /* The size of the data following the header, little-endian */
+} __packed;
+
+/*
+ * Constants describing the geometry block. The VDO_GEOMETRY_MAGIC_NUMBER array is declared with
+ * VDO_GEOMETRY_MAGIC_NUMBER_SIZE + 1 elements, so the size here excludes that extra byte
+ * (presumably a NUL terminator -- confirm against the definition).
+ */
+enum {
+ VDO_GEOMETRY_BLOCK_LOCATION = 0,
+ VDO_GEOMETRY_MAGIC_NUMBER_SIZE = 8,
+ VDO_DEFAULT_GEOMETRY_BLOCK_VERSION = 5,
+};
+
+/*
+ * The index configuration embedded in a volume geometry.
+ * NOTE(review): the units of mem are not visible here -- confirm against the indexer.
+ */
+struct index_config {
+ u32 mem;
+ u32 unused;
+ bool sparse;
+} __packed;
+
+/*
+ * The ids of the regions described by a volume geometry. The values are also used as indexes
+ * into the regions array of struct volume_geometry, which VDO_VOLUME_REGION_COUNT sizes.
+ */
+enum volume_region_id {
+ VDO_INDEX_REGION = 0,
+ VDO_DATA_REGION = 1,
+ VDO_VOLUME_REGION_COUNT,
+};
+
+/* One region of a volume, as recorded in the regions array of a volume geometry. */
+struct volume_region {
+ /* The ID of the region */
+ enum volume_region_id id;
+ /*
+ * The absolute starting offset on the device. The region continues until the next region
+ * begins.
+ */
+ physical_block_number_t start_block;
+} __packed;
+
+/*
+ * The geometry of a volume: its identity (nonce and uuid), the offset applied to bios, where
+ * each region starts, and the index configuration. Compared with struct volume_geometry_4_0,
+ * this form additionally carries bio_offset.
+ */
+struct volume_geometry {
+ /* For backwards compatibility */
+ u32 unused;
+ /* The nonce of this volume */
+ nonce_t nonce;
+ /* The uuid of this volume */
+ uuid_t uuid;
+ /* The block offset to be applied to bios */
+ block_count_t bio_offset;
+ /* The regions in ID order */
+ struct volume_region regions[VDO_VOLUME_REGION_COUNT];
+ /* The index config */
+ struct index_config index_config;
+} __packed;
+
+/*
+ * This volume geometry struct is used for sizing only. It has the same fields as struct
+ * volume_geometry except that it lacks bio_offset.
+ */
+struct volume_geometry_4_0 {
+ /* For backwards compatibility */
+ u32 unused;
+ /* The nonce of this volume */
+ nonce_t nonce;
+ /* The uuid of this volume */
+ uuid_t uuid;
+ /* The regions in ID order */
+ struct volume_region regions[VDO_VOLUME_REGION_COUNT];
+ /* The index config */
+ struct index_config index_config;
+} __packed;
+
+extern const u8 VDO_GEOMETRY_MAGIC_NUMBER[VDO_GEOMETRY_MAGIC_NUMBER_SIZE + 1];
+
+/**
+ * DOC: Block map entries
+ *
+ * The entry for each logical block in the block map is encoded into five bytes, which saves space
+ * in both the on-disk and in-memory layouts. It consists of the 36 low-order bits of a
+ * physical_block_number_t (addressing 256 terabytes with a 4KB block size) and a 4-bit encoding of
+ * a block_mapping_state.
+ *
+ * Of the 8 high bits of the 5-byte structure:
+ *
+ * Bits 7..4: The four highest bits of the 36-bit physical block number
+ * Bits 3..0: The 4-bit block_mapping_state
+ *
+ * The following 4 bytes are the low order bytes of the physical block number, in little-endian
+ * order.
+ *
+ * Conversion functions to and from a data location are provided: vdo_pack_block_map_entry() and
+ * vdo_unpack_block_map_entry().
+ *
+ * The two bit-fields are declared in opposite order depending on the compiler-reported byte
+ * order, presumably so that the bit positions listed above hold on either kind of machine.
+ */
+struct block_map_entry {
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+ unsigned mapping_state : 4;
+ unsigned pbn_high_nibble : 4;
+#else
+ unsigned pbn_high_nibble : 4;
+ unsigned mapping_state : 4;
+#endif
+
+ __le32 pbn_low_word;
+} __packed;
+
+/*
+ * The header of a block map page, found between the page's version and its entries (see struct
+ * block_map_page). The nonce and pbn are stored in little-endian byte order.
+ */
+struct block_map_page_header {
+ __le64 nonce;
+ __le64 pbn;
+
+ /* May be non-zero on disk */
+ u8 unused_long_word[8];
+
+ /* Whether this page has been written twice to disk */
+ bool initialized;
+
+ /* Always zero on disk */
+ u8 unused_byte1;
+
+ /* May be non-zero on disk */
+ u8 unused_byte2;
+ u8 unused_byte3;
+} __packed;
+
+/*
+ * A block map page: a packed version number, the page header, and then the block map entries,
+ * which are declared as a flexible array and so contribute nothing to sizeof().
+ */
+struct block_map_page {
+ struct packed_version_number version;
+ struct block_map_page_header header;
+ struct block_map_entry entries[];
+} __packed;
+
+/* The possible outcomes of checking a block map page. */
+enum block_map_page_validity {
+ VDO_BLOCK_MAP_PAGE_VALID,
+ VDO_BLOCK_MAP_PAGE_INVALID,
+ /* Valid page found in the wrong location on disk */
+ VDO_BLOCK_MAP_PAGE_BAD,
+};
+
+/*
+ * The state of the block map as held in struct vdo_component_states. Its size, plus that of an
+ * encoded header, makes up BLOCK_MAP_COMPONENT_ENCODED_SIZE.
+ */
+struct block_map_state_2_0 {
+ physical_block_number_t flat_page_origin;
+ block_count_t flat_page_count;
+ physical_block_number_t root_origin;
+ block_count_t root_count;
+} __packed;
+
+/*
+ * One page number for each of the VDO_BLOCK_MAP_TREE_HEIGHT levels of a block map tree.
+ * NOTE(review): what the page numbers delimit is not visible here -- confirm against the users.
+ */
+struct boundary {
+ page_number_t levels[VDO_BLOCK_MAP_TREE_HEIGHT];
+};
+
+extern const struct header VDO_BLOCK_MAP_HEADER_2_0;
+
+/*
+ * The state of the recovery journal as encoded in the VDO super block. Its size, plus that of an
+ * encoded header, makes up RECOVERY_JOURNAL_COMPONENT_ENCODED_SIZE.
+ */
+struct recovery_journal_state_7_0 {
+ /* Sequence number to start the journal */
+ sequence_number_t journal_start;
+ /* Number of logical blocks used by VDO */
+ block_count_t logical_blocks_used;
+ /* Number of block map pages allocated */
+ block_count_t block_map_data_blocks;
+} __packed;
+
+extern const struct header VDO_RECOVERY_JOURNAL_HEADER_7_0;
+
+/* A count of journal entries; used for the entry counts in journal headers and journal points. */
+typedef u16 journal_entry_count_t;
+
+/*
+ * A recovery journal entry stores three physical locations: a data location that is the value of a
+ * single mapping in the block map tree, and the two locations of the block map pages and slots
+ * that are acquiring and releasing a reference to the location. The journal entry also stores an
+ * operation code that says whether the mapping is for a logical block or for the block map tree
+ * itself.
+ *
+ * This is the unpacked form; struct packed_recovery_journal_entry is the on-disk form.
+ */
+struct recovery_journal_entry {
+ struct block_map_slot slot;
+ struct data_location mapping;
+ struct data_location unmapping;
+ enum journal_operation operation;
+};
+
+/*
+ * The packed, on-disk representation of a recovery journal entry. Per the bit ranges documented
+ * on its fields, an entry occupies 128 bits.
+ */
+struct packed_recovery_journal_entry {
+ /*
+ * In little-endian bit order:
+ * Bits 15..12: The four highest bits of the 36-bit physical block number of the block map
+ * tree page
+ * Bits 11..2: The 10-bit block map page slot number
+ * Bits 1..0: The journal_operation of the entry (this actually only requires 1 bit, but
+ * it is convenient to keep the extra bit as part of this field).
+ */
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+ unsigned operation : 2;
+ unsigned slot_low : 6;
+ unsigned slot_high : 4;
+ unsigned pbn_high_nibble : 4;
+#else
+ unsigned slot_low : 6;
+ unsigned operation : 2;
+ unsigned pbn_high_nibble : 4;
+ unsigned slot_high : 4;
+#endif
+
+ /*
+ * Bits 47..16: The 32 low-order bits of the block map page PBN, in little-endian byte
+ * order
+ */
+ __le32 pbn_low_word;
+
+ /*
+ * Bits 87..48: The five-byte block map entry encoding the location that will be stored in
+ * the block map page slot
+ */
+ struct block_map_entry mapping;
+
+ /*
+ * Bits 127..88: The five-byte block map entry encoding the location that was stored in the
+ * block map page slot
+ */
+ struct block_map_entry unmapping;
+} __packed;
+
+/*
+ * The packed, on-disk representation of an old format recovery journal entry. Unlike struct
+ * packed_recovery_journal_entry, it carries a single block map entry rather than separate
+ * mapping and unmapping entries.
+ */
+struct packed_recovery_journal_entry_1 {
+ /*
+ * In little-endian bit order:
+ * Bits 15..12: The four highest bits of the 36-bit physical block number of the block map
+ * tree page
+ * Bits 11..2: The 10-bit block map page slot number
+ * Bits 1..0: The 2-bit journal_operation of the entry
+ *
+ */
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+ unsigned operation : 2;
+ unsigned slot_low : 6;
+ unsigned slot_high : 4;
+ unsigned pbn_high_nibble : 4;
+#else
+ unsigned slot_low : 6;
+ unsigned operation : 2;
+ unsigned pbn_high_nibble : 4;
+ unsigned slot_high : 4;
+#endif
+
+ /*
+ * Bits 47..16: The 32 low-order bits of the block map page PBN, in little-endian byte
+ * order
+ */
+ __le32 pbn_low_word;
+
+ /*
+ * Bits 87..48: The five-byte block map entry encoding the location that was or will be
+ * stored in the block map page slot
+ */
+ struct block_map_entry block_map_entry;
+} __packed;
+
+/*
+ * Journal operation codes; all four values fit in a 2-bit field. Judging by the _1 suffix these
+ * belong with the old format struct packed_recovery_journal_entry_1 -- confirm against the users.
+ */
+enum journal_operation_1 {
+ VDO_JOURNAL_DATA_DECREMENT = 0,
+ VDO_JOURNAL_DATA_INCREMENT = 1,
+ VDO_JOURNAL_BLOCK_MAP_DECREMENT = 2,
+ VDO_JOURNAL_BLOCK_MAP_INCREMENT = 3,
+} __packed;
+
+/*
+ * The unpacked form of a recovery journal block header; struct packed_journal_header is the
+ * on-disk form. Note that the two declare their fields in different orders.
+ */
+struct recovery_block_header {
+ sequence_number_t block_map_head; /* Block map head sequence number */
+ sequence_number_t slab_journal_head; /* Slab journal head seq. number */
+ sequence_number_t sequence_number; /* Sequence number for this block */
+ nonce_t nonce; /* A given VDO instance's nonce */
+ block_count_t logical_blocks_used; /* Logical blocks in use */
+ block_count_t block_map_data_blocks; /* Allocated block map pages */
+ journal_entry_count_t entry_count; /* Number of entries written */
+ u8 check_byte; /* The protection check byte */
+ u8 recovery_count; /* Number of recoveries completed */
+ enum vdo_metadata_type metadata_type; /* Metadata type */
+};
+
+/*
+ * The packed, on-disk representation of a recovery journal block header. All fields are kept in
+ * little-endian byte order.
+ *
+ * The unpacked counterpart is struct recovery_block_header. Because this structure is packed,
+ * the field order below is the on-disk order.
+ */
+struct packed_journal_header {
+ /* Block map head 64-bit sequence number */
+ __le64 block_map_head;
+
+ /* Slab journal head 64-bit sequence number */
+ __le64 slab_journal_head;
+
+ /* The 64-bit sequence number for this block */
+ __le64 sequence_number;
+
+ /* A given VDO instance's 64-bit nonce */
+ __le64 nonce;
+
+ /* 8-bit metadata type (should always be one for the recovery journal) */
+ u8 metadata_type;
+
+ /* 16-bit count of the entries encoded in the block */
+ __le16 entry_count;
+
+ /* 64-bit count of the logical blocks used when this block was opened */
+ __le64 logical_blocks_used;
+
+ /* 64-bit count of the block map blocks used when this block was opened */
+ __le64 block_map_data_blocks;
+
+ /* The protection check byte */
+ u8 check_byte;
+
+ /* The number of recoveries completed */
+ u8 recovery_count;
+} __packed;
+
+/*
+ * The on-disk form of one sector of a recovery journal block: three one-byte fields followed by
+ * the sector's entries. The entries are a flexible array, so sizeof() this structure covers only
+ * the three leading bytes.
+ */
+struct packed_journal_sector {
+ /* The protection check byte */
+ u8 check_byte;
+
+ /* The number of recoveries completed */
+ u8 recovery_count;
+
+ /* The number of entries in this sector */
+ u8 entry_count;
+
+ /* Journal entries for this sector */
+ struct packed_recovery_journal_entry entries[];
+} __packed;
+
+/*
+ * Entry capacities of recovery journal sectors and blocks, for both the current and the v1 entry
+ * formats. NOTE(review): the factor of 7 below is presumably the number of sectors in a block
+ * less one -- confirm.
+ */
+enum {
+ /* The number of entries in each sector (except the last) when filled */
+ RECOVERY_JOURNAL_ENTRIES_PER_SECTOR =
+ ((VDO_SECTOR_SIZE - sizeof(struct packed_journal_sector)) /
+ sizeof(struct packed_recovery_journal_entry)),
+ RECOVERY_JOURNAL_ENTRIES_PER_BLOCK = RECOVERY_JOURNAL_ENTRIES_PER_SECTOR * 7,
+ /* The number of entries in a v1 recovery journal block. */
+ RECOVERY_JOURNAL_1_ENTRIES_PER_BLOCK = 311,
+ /* The number of entries in each v1 sector (except the last) when filled */
+ RECOVERY_JOURNAL_1_ENTRIES_PER_SECTOR =
+ ((VDO_SECTOR_SIZE - sizeof(struct packed_journal_sector)) /
+ sizeof(struct packed_recovery_journal_entry_1)),
+ /* The number of entries in the last sector when a block is full */
+ RECOVERY_JOURNAL_1_ENTRIES_IN_LAST_SECTOR =
+ (RECOVERY_JOURNAL_1_ENTRIES_PER_BLOCK % RECOVERY_JOURNAL_1_ENTRIES_PER_SECTOR),
+};
+
+/*
+ * A type representing a reference count of a block. It is a single byte, and some of its values
+ * have special meanings (EMPTY_REFERENCE_COUNT, PROVISIONAL_REFERENCE_COUNT).
+ */
+typedef u8 vdo_refcount_t;
+
+/*
+ * The absolute position of an entry in a recovery journal or slab journal. The on-disk form is
+ * struct packed_journal_point.
+ */
+struct journal_point {
+ sequence_number_t sequence_number;
+ journal_entry_count_t entry_count;
+};
+
+/* A packed, platform-independent encoding of a struct journal_point, eight bytes in size. */
+struct packed_journal_point {
+ /*
+ * The packed representation is the little-endian 64-bit representation of the low-order 48
+ * bits of the sequence number, shifted up 16 bits, or'ed with the 16-bit entry count.
+ *
+ * Very long-term, the top 16 bits of the sequence number may not always be zero, as this
+ * encoding assumes--see BZ 1523240.
+ */
+ __le64 encoded_point;
+} __packed;
+
+/*
+ * Special vdo_refcount_t values. Together these bracket the whole range of the one-byte type:
+ * 0 is empty, 254 is the maximum, and 255 is the provisional value.
+ */
+#define EMPTY_REFERENCE_COUNT 0
+enum {
+ MAXIMUM_REFERENCE_COUNT = 254,
+ PROVISIONAL_REFERENCE_COUNT = 255,
+};
+
+/*
+ * The number of reference counts stored per sector and per block. Each sector gives up the size
+ * of one packed journal point, which matches the commit_point at the start of struct
+ * packed_reference_sector.
+ */
+enum {
+ COUNTS_PER_SECTOR =
+ ((VDO_SECTOR_SIZE - sizeof(struct packed_journal_point)) / sizeof(vdo_refcount_t)),
+ COUNTS_PER_BLOCK = COUNTS_PER_SECTOR * VDO_SECTORS_PER_BLOCK,
+};
+
+/*
+ * The format of each sector of a reference_block on disk: a packed journal point followed by
+ * COUNTS_PER_SECTOR reference counts.
+ */
+struct packed_reference_sector {
+ struct packed_journal_point commit_point;
+ vdo_refcount_t counts[COUNTS_PER_SECTOR];
+} __packed;
+
+/* A reference block on disk: VDO_SECTORS_PER_BLOCK packed reference sectors. */
+struct packed_reference_block {
+ struct packed_reference_sector sectors[VDO_SECTORS_PER_BLOCK];
+};
+
+/*
+ * The state of the slab depot as held in struct vdo_component_states. Its size, plus that of an
+ * encoded header, makes up SLAB_DEPOT_COMPONENT_ENCODED_SIZE.
+ */
+struct slab_depot_state_2_0 {
+ struct slab_config slab_config;
+ physical_block_number_t first_block;
+ physical_block_number_t last_block;
+ zone_count_t zone_count;
+} __packed;
+
+extern const struct header VDO_SLAB_DEPOT_HEADER_2_0;
+
+/*
+ * Slab journal blocks may have one of two formats, depending upon whether or not any of the
+ * entries in the block are block map increments. Since the steady state for a VDO is that all of
+ * the necessary block map pages will be allocated, most slab journal blocks will have only data
+ * entries. Such blocks can hold more entries, hence the two formats.
+ */
+
+/* A single slab journal entry, in its unpacked form (see packed_slab_journal_entry). */
+struct slab_journal_entry {
+ slab_block_number sbn;
+ enum journal_operation operation;
+ bool increment;
+};
+
+/*
+ * A single slab journal entry in its on-disk form: a 23-bit offset split across offset_low8,
+ * offset_mid8 and offset_high7, plus a one-bit increment flag sharing the last byte.
+ */
+typedef struct {
+ u8 offset_low8;
+ u8 offset_mid8;
+
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+ unsigned offset_high7 : 7;
+ unsigned increment : 1;
+#else
+ unsigned increment : 1;
+ unsigned offset_high7 : 7;
+#endif
+} __packed packed_slab_journal_entry;
+
+/*
+ * The unpacked representation of the header of a slab journal block. The on-disk form is struct
+ * packed_slab_journal_block_header.
+ */
+struct slab_journal_block_header {
+ /* Sequence number for head of journal */
+ sequence_number_t head;
+ /* Sequence number for this block */
+ sequence_number_t sequence_number;
+ /* The nonce for a given VDO instance */
+ nonce_t nonce;
+ /* Recovery journal point for last entry */
+ struct journal_point recovery_point;
+ /* Metadata type */
+ enum vdo_metadata_type metadata_type;
+ /* Whether this block contains block map increments */
+ bool has_block_map_increments;
+ /* The number of entries in the block */
+ journal_entry_count_t entry_count;
+};
+
+/*
+ * The packed, on-disk representation of a slab journal block header. All fields are kept in
+ * little-endian byte order.
+ *
+ * The size of this structure determines VDO_SLAB_JOURNAL_PAYLOAD_SIZE, the space left in a block
+ * for entries.
+ */
+struct packed_slab_journal_block_header {
+ /* 64-bit sequence number for head of journal */
+ __le64 head;
+ /* 64-bit sequence number for this block */
+ __le64 sequence_number;
+ /* Recovery journal point for the last entry, packed into 64 bits */
+ struct packed_journal_point recovery_point;
+ /* The 64-bit nonce for a given VDO instance */
+ __le64 nonce;
+ /* 8-bit metadata type (should always be two, for the slab journal) */
+ u8 metadata_type;
+ /* Whether this block contains block map increments */
+ bool has_block_map_increments;
+ /* 16-bit count of the entries encoded in the block */
+ __le16 entry_count;
+} __packed;
+
+/*
+ * Sizes for the two slab journal block formats. The payload is whatever is left of a block after
+ * the packed header. A "full" block spends 25 bits per entry (a packed entry plus one bit in the
+ * entry_types bit map of struct full_slab_journal_entries), and the bit map is rounded up to
+ * whole bytes. A block with only data entries holds as many packed entries as fit in the
+ * payload.
+ */
+enum {
+ VDO_SLAB_JOURNAL_PAYLOAD_SIZE =
+ VDO_BLOCK_SIZE - sizeof(struct packed_slab_journal_block_header),
+ VDO_SLAB_JOURNAL_FULL_ENTRIES_PER_BLOCK = (VDO_SLAB_JOURNAL_PAYLOAD_SIZE * 8) / 25,
+ VDO_SLAB_JOURNAL_ENTRY_TYPES_SIZE =
+ ((VDO_SLAB_JOURNAL_FULL_ENTRIES_PER_BLOCK - 1) / 8) + 1,
+ VDO_SLAB_JOURNAL_ENTRIES_PER_BLOCK =
+ (VDO_SLAB_JOURNAL_PAYLOAD_SIZE / sizeof(packed_slab_journal_entry)),
+};
+
+/*
+ * The payload of a slab journal block which has block map increments: the entries, followed by
+ * a bit map sized to give each entry one bit.
+ */
+struct full_slab_journal_entries {
+ /* The entries themselves */
+ packed_slab_journal_entry entries[VDO_SLAB_JOURNAL_FULL_ENTRIES_PER_BLOCK];
+ /* The bit map indicating which entries are block map increments */
+ u8 entry_types[VDO_SLAB_JOURNAL_ENTRY_TYPES_SIZE];
+} __packed;
+
+/*
+ * The payload of a slab journal block, viewed in either of its two formats. Which view applies
+ * is presumably indicated by has_block_map_increments in the block header -- confirm.
+ */
+typedef union {
+ /* Entries which include block map increments */
+ struct full_slab_journal_entries full_entries;
+ /* Entries which are only data updates */
+ packed_slab_journal_entry entries[VDO_SLAB_JOURNAL_ENTRIES_PER_BLOCK];
+ /* Ensure the payload fills to the end of the block */
+ u8 space[VDO_SLAB_JOURNAL_PAYLOAD_SIZE];
+} __packed slab_journal_payload;
+
+/* A slab journal block on disk: the packed header followed by the payload. */
+struct packed_slab_journal_block {
+ struct packed_slab_journal_block_header header;
+ slab_journal_payload payload;
+} __packed;
+
+/* The offset of a slab journal tail block. It occupies bits 7..0 of a slab_summary_entry. */
+typedef u8 tail_block_offset_t;
+
+/*
+ * The summary of one slab. Per the bit ranges documented on its fields, an entry occupies 16
+ * bits: a one-byte tail block offset followed by three bit-fields totalling eight bits. The
+ * bit-fields are declared in opposite order depending on the compiler-reported byte order.
+ */
+struct slab_summary_entry {
+ /* Bits 7..0: The offset of the tail block within the slab journal */
+ tail_block_offset_t tail_block_offset;
+
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+ /* Bits 13..8: A hint about the fullness of the slab */
+ unsigned int fullness_hint : 6;
+ /* Bit 14: Whether the ref_counts must be loaded from the layer */
+ unsigned int load_ref_counts : 1;
+ /* Bit 15: The believed cleanliness of this slab */
+ unsigned int is_dirty : 1;
+#else
+ /* Bit 15: The believed cleanliness of this slab */
+ unsigned int is_dirty : 1;
+ /* Bit 14: Whether the ref_counts must be loaded from the layer */
+ unsigned int load_ref_counts : 1;
+ /* Bits 13..8: A hint about the fullness of the slab */
+ unsigned int fullness_hint : 6;
+#endif
+} __packed;
+
+/*
+ * Sizes for the slab summary. VDO_SLAB_SUMMARY_FULLNESS_HINT_BITS matches the width of the
+ * fullness_hint bit-field in struct slab_summary_entry. The block counts are derived from how
+ * many entries fit in one block, the maximum number of slabs, and the maximum number of physical
+ * zones.
+ */
+enum {
+ VDO_SLAB_SUMMARY_FULLNESS_HINT_BITS = 6,
+ VDO_SLAB_SUMMARY_ENTRIES_PER_BLOCK = VDO_BLOCK_SIZE / sizeof(struct slab_summary_entry),
+ VDO_SLAB_SUMMARY_BLOCKS_PER_ZONE = MAX_VDO_SLABS / VDO_SLAB_SUMMARY_ENTRIES_PER_BLOCK,
+ VDO_SLAB_SUMMARY_BLOCKS = VDO_SLAB_SUMMARY_BLOCKS_PER_ZONE * MAX_VDO_PHYSICAL_ZONES,
+};
+
+/*
+ * The in-memory description of how the underlying storage is partitioned (see the layout member
+ * of struct vdo_component_states). The partitions hang off head as a linked list chained through
+ * the next pointer of struct partition.
+ */
+struct layout {
+ physical_block_number_t start;
+ block_count_t size;
+ physical_block_number_t first_free;
+ physical_block_number_t last_free;
+ size_t num_partitions;
+ struct partition *head;
+};
+
+/* One partition of a layout; partitions are chained into a singly linked list through next. */
+struct partition {
+ enum partition_id id; /* The id of this partition */
+ physical_block_number_t offset; /* The offset into the layout of this partition */
+ block_count_t count; /* The number of blocks in the partition */
+ struct partition *next; /* A pointer to the next partition in the layout */
+};
+
+/*
+ * The packed layout fields that are counted in VDO_LAYOUT_ENCODED_SIZE. The partition count here
+ * is a single byte, whereas struct layout keeps it as a size_t.
+ */
+struct layout_3_0 {
+ physical_block_number_t first_free;
+ physical_block_number_t last_free;
+ u8 partition_count;
+} __packed;
+
+/*
+ * The packed form of one partition; VDO_LAYOUT_ENCODED_SIZE counts VDO_PARTITION_COUNT of these.
+ * Unlike struct partition, it still carries the unused base field.
+ */
+struct partition_3_0 {
+ enum partition_id id;
+ physical_block_number_t offset;
+ physical_block_number_t base; /* unused but retained for backwards compatibility */
+ block_count_t count;
+} __packed;
+
+/*
+ * The configuration of the VDO service. This is the native in-memory form; struct
+ * packed_vdo_config is the on-disk form with the same five fields.
+ */
+struct vdo_config {
+ block_count_t logical_blocks; /* number of logical blocks */
+ block_count_t physical_blocks; /* number of physical blocks */
+ block_count_t slab_size; /* number of blocks in a slab */
+ block_count_t recovery_journal_size; /* number of recovery journal blocks */
+ block_count_t slab_journal_blocks; /* number of slab journal blocks */
+};
+
+/*
+ * This is the structure that captures the vdo fields saved as a super block component. The
+ * on-disk form is struct packed_vdo_component_41_0.
+ */
+struct vdo_component {
+ enum vdo_state state;
+ u64 complete_recoveries;
+ u64 read_only_recoveries;
+ struct vdo_config config;
+ nonce_t nonce;
+};
+
+/*
+ * A packed, machine-independent, on-disk representation of the vdo_config in the VDO component
+ * data in the super block. Every field is a 64-bit little-endian value.
+ */
+struct packed_vdo_config {
+ __le64 logical_blocks;
+ __le64 physical_blocks;
+ __le64 slab_size;
+ __le64 recovery_journal_size;
+ __le64 slab_journal_blocks;
+} __packed;
+
+/*
+ * A packed, machine-independent, on-disk representation of version 41.0 of the VDO component data
+ * in the super block. The state is stored as a 32-bit little-endian value; the in-memory struct
+ * vdo_component holds it as an enum vdo_state.
+ */
+struct packed_vdo_component_41_0 {
+ __le32 state;
+ __le64 complete_recoveries;
+ __le64 read_only_recoveries;
+ struct packed_vdo_config config;
+ __le64 nonce;
+} __packed;
+
+/*
+ * The version of the on-disk format of a VDO volume. This should be incremented any time the
+ * on-disk representation of any VDO structure changes. Changes which require only online upgrade
+ * steps should increment the minor version. Changes which require an offline upgrade or which can
+ * not be upgraded to at all should increment the major version and set the minor version to 0.
+ */
+extern const struct version_number VDO_VOLUME_VERSION_67_0;
+
+/*
+ * Encoded sizes of the super block and its components. Each *_COMPONENT_ENCODED_SIZE and
+ * VDO_LAYOUT_ENCODED_SIZE is an encoded header plus that component's packed state.
+ * VDO_COMPONENT_DATA_SIZE adds up, in the order they are encoded, the unused compatibility word,
+ * the volume version, the vdo component, the layout, the recovery journal, the slab depot, and
+ * the block map. VDO_SUPER_BLOCK_FIXED_SIZE is the super block's own header plus its 32-bit
+ * checksum.
+ */
+enum {
+ VDO_ENCODED_HEADER_SIZE = sizeof(struct packed_header),
+ BLOCK_MAP_COMPONENT_ENCODED_SIZE =
+ VDO_ENCODED_HEADER_SIZE + sizeof(struct block_map_state_2_0),
+ RECOVERY_JOURNAL_COMPONENT_ENCODED_SIZE =
+ VDO_ENCODED_HEADER_SIZE + sizeof(struct recovery_journal_state_7_0),
+ SLAB_DEPOT_COMPONENT_ENCODED_SIZE =
+ VDO_ENCODED_HEADER_SIZE + sizeof(struct slab_depot_state_2_0),
+ VDO_PARTITION_COUNT = 4,
+ VDO_LAYOUT_ENCODED_SIZE = (VDO_ENCODED_HEADER_SIZE +
+ sizeof(struct layout_3_0) +
+ (sizeof(struct partition_3_0) * VDO_PARTITION_COUNT)),
+ VDO_SUPER_BLOCK_FIXED_SIZE = VDO_ENCODED_HEADER_SIZE + sizeof(u32),
+ VDO_MAX_COMPONENT_DATA_SIZE = VDO_SECTOR_SIZE - VDO_SUPER_BLOCK_FIXED_SIZE,
+ VDO_COMPONENT_ENCODED_SIZE =
+ (sizeof(struct packed_version_number) + sizeof(struct packed_vdo_component_41_0)),
+ VDO_COMPONENT_DATA_OFFSET = VDO_ENCODED_HEADER_SIZE,
+ VDO_COMPONENT_DATA_SIZE = (sizeof(u32) +
+ sizeof(struct packed_version_number) +
+ VDO_COMPONENT_ENCODED_SIZE +
+ VDO_LAYOUT_ENCODED_SIZE +
+ RECOVERY_JOURNAL_COMPONENT_ENCODED_SIZE +
+ SLAB_DEPOT_COMPONENT_ENCODED_SIZE +
+ BLOCK_MAP_COMPONENT_ENCODED_SIZE),
+};
+
+/*
+ * The entirety of the component data encoded in the VDO super block, in its decoded in-memory
+ * form. The member order here is not the encoding order. The layout owns allocations, which
+ * vdo_destroy_component_states() releases.
+ */
+struct vdo_component_states {
+ /* For backwards compatibility */
+ u32 unused;
+
+ /* The VDO volume version */
+ struct version_number volume_version;
+
+ /* Components */
+ struct vdo_component vdo;
+ struct block_map_state_2_0 block_map;
+ struct recovery_journal_state_7_0 recovery_journal;
+ struct slab_depot_state_2_0 slab_depot;
+
+ /* Our partitioning of the underlying storage */
+ struct layout layout;
+};
+
+/**
+ * vdo_are_same_version() - Test two version numbers for equality.
+ * @version_a: The first version.
+ * @version_b: The second version.
+ *
+ * Return: true only if both the major and the minor numbers match.
+ */
+static inline bool vdo_are_same_version(struct version_number version_a,
+					struct version_number version_b)
+{
+	if (version_a.major_version != version_b.major_version)
+		return false;
+
+	return version_a.minor_version == version_b.minor_version;
+}
+
+/**
+ * vdo_is_upgradable_version() - Test whether an actual version can be upgraded to an expected
+ *                               version.
+ * @expected_version: The expected version.
+ * @actual_version: The version being validated.
+ *
+ * The actual version is upgradable when it shares the expected major number and its minor number
+ * is strictly lower than the expected one. Identical versions are therefore not upgradable.
+ *
+ * Return: true if the actual version is upgradable.
+ */
+static inline bool vdo_is_upgradable_version(struct version_number expected_version,
+					     struct version_number actual_version)
+{
+	if (expected_version.major_version != actual_version.major_version)
+		return false;
+
+	return actual_version.minor_version < expected_version.minor_version;
+}
+
+int __must_check vdo_validate_header(const struct header *expected_header,
+ const struct header *actual_header, bool exact_size,
+ const char *component_name);
+
+void vdo_encode_header(u8 *buffer, size_t *offset, const struct header *header);
+void vdo_decode_header(u8 *buffer, size_t *offset, struct header *header);
+
+/**
+ * vdo_pack_version_number() - Produce the packed on-disk form of a version_number.
+ * @version: The version number to convert.
+ *
+ * Return: The version with both fields converted to little-endian byte order.
+ */
+static inline struct packed_version_number vdo_pack_version_number(struct version_number version)
+{
+	struct packed_version_number packed = {
+		.major_version = __cpu_to_le32(version.major_version),
+		.minor_version = __cpu_to_le32(version.minor_version),
+	};
+
+	return packed;
+}
+
+/**
+ * vdo_unpack_version_number() - Produce the native in-memory form of a packed_version_number.
+ * @version: The packed version number to convert.
+ *
+ * Return: The version with both fields converted from little-endian to native byte order.
+ */
+static inline struct version_number vdo_unpack_version_number(struct packed_version_number version)
+{
+	struct version_number unpacked = {
+		.major_version = __le32_to_cpu(version.major_version),
+		.minor_version = __le32_to_cpu(version.minor_version),
+	};
+
+	return unpacked;
+}
+
+/**
+ * vdo_pack_header() - Produce the packed on-disk form of a component header.
+ * @header: The header to convert.
+ *
+ * Return: The header with its id and size in little-endian byte order and its version packed by
+ *         vdo_pack_version_number().
+ */
+static inline struct packed_header vdo_pack_header(const struct header *header)
+{
+	struct packed_header packed = {
+		.id = __cpu_to_le32(header->id),
+		.version = vdo_pack_version_number(header->version),
+		.size = __cpu_to_le64(header->size),
+	};
+
+	return packed;
+}
+
+/**
+ * vdo_unpack_header() - Produce the native in-memory form of a packed_header.
+ * @header: The packed header to convert.
+ *
+ * Return: The header with its id and size converted from little-endian to native byte order and
+ *         its version unpacked by vdo_unpack_version_number().
+ */
+static inline struct header vdo_unpack_header(const struct packed_header *header)
+{
+	struct header unpacked = {
+		.id = __le32_to_cpu(header->id),
+		.version = vdo_unpack_version_number(header->version),
+		.size = __le64_to_cpu(header->size),
+	};
+
+	return unpacked;
+}
+
+/**
+ * vdo_get_index_region_start() - Look up where the index region begins.
+ * @geometry: The volume geometry (passed by value).
+ *
+ * Return: The start block recorded for VDO_INDEX_REGION in @geometry.
+ */
+static inline physical_block_number_t __must_check
+vdo_get_index_region_start(struct volume_geometry geometry)
+{
+	physical_block_number_t start = geometry.regions[VDO_INDEX_REGION].start_block;
+
+	return start;
+}
+
+/**
+ * vdo_get_data_region_start() - Look up where the data region begins.
+ * @geometry: The volume geometry (passed by value).
+ *
+ * Return: The start block recorded for VDO_DATA_REGION in @geometry.
+ */
+static inline physical_block_number_t __must_check
+vdo_get_data_region_start(struct volume_geometry geometry)
+{
+	physical_block_number_t start = geometry.regions[VDO_DATA_REGION].start_block;
+
+	return start;
+}
+
+/**
+ * vdo_get_index_region_size() - Compute the size of the index region from a geometry.
+ * @geometry: The volume geometry (passed by value).
+ *
+ * The size is the distance from the start of the index region to the start of the data region.
+ *
+ * Return: The number of blocks between the two region starts.
+ */
+static inline physical_block_number_t __must_check
+vdo_get_index_region_size(struct volume_geometry geometry)
+{
+	physical_block_number_t index_start = vdo_get_index_region_start(geometry);
+	physical_block_number_t data_start = vdo_get_data_region_start(geometry);
+
+	return data_start - index_start;
+}
+
+int __must_check vdo_parse_geometry_block(unsigned char *block,
+ struct volume_geometry *geometry);
+
+/**
+ * vdo_is_state_compressed() - Check whether a mapping state denotes a compressed mapping.
+ * @mapping_state: The mapping state to check.
+ *
+ * Return: true if @mapping_state is greater than VDO_MAPPING_STATE_UNCOMPRESSED.
+ */
+static inline bool vdo_is_state_compressed(const enum block_mapping_state mapping_state)
+{
+	return VDO_MAPPING_STATE_UNCOMPRESSED < mapping_state;
+}
+
+/**
+ * vdo_pack_block_map_entry() - Encode a block map entry in its packed form.
+ * @pbn: The physical block number to encode; only its low 36 bits are kept.
+ * @mapping_state: The mapping state to encode; only its low 4 bits are kept.
+ *
+ * Return: The packed entry, with bits 32-35 of @pbn in pbn_high_nibble and the low 32 bits
+ *         stored little-endian in pbn_low_word.
+ */
+static inline struct block_map_entry
+vdo_pack_block_map_entry(physical_block_number_t pbn, enum block_mapping_state mapping_state)
+{
+	struct block_map_entry packed = {
+		.mapping_state = (mapping_state & 0x0F),
+		.pbn_high_nibble = ((pbn >> 32) & 0x0F),
+		.pbn_low_word = __cpu_to_le32(pbn & UINT_MAX),
+	};
+
+	return packed;
+}
+
+/**
+ * vdo_unpack_block_map_entry() - Decode a packed block map entry.
+ * @entry: The packed entry to decode.
+ *
+ * Return: A data_location whose pbn is rebuilt from pbn_high_nibble (bits 32 and up) and
+ *         pbn_low_word (low 32 bits), and whose state is the entry's mapping_state.
+ */
+static inline struct data_location vdo_unpack_block_map_entry(const struct block_map_entry *entry)
+{
+	physical_block_number_t pbn = entry->pbn_high_nibble;
+	physical_block_number_t low_word = __le32_to_cpu(entry->pbn_low_word);
+	struct data_location location = {
+		.pbn = ((pbn << 32) | low_word),
+		.state = entry->mapping_state,
+	};
+
+	return location;
+}
+
+/**
+ * vdo_is_mapped_location() - Check whether a data location is mapped.
+ * @location: The location to check.
+ *
+ * Return: true unless the location's state is VDO_MAPPING_STATE_UNMAPPED.
+ */
+static inline bool vdo_is_mapped_location(const struct data_location *location)
+{
+	return !(location->state == VDO_MAPPING_STATE_UNMAPPED);
+}
+
+/**
+ * vdo_is_valid_location() - Check whether a data location is self-consistent.
+ * @location: The location to check.
+ *
+ * Return: For a location whose pbn is VDO_ZERO_BLOCK, true if its state is not compressed;
+ *         for any other pbn, true if the location is mapped.
+ */
+static inline bool vdo_is_valid_location(const struct data_location *location)
+{
+	if (location->pbn != VDO_ZERO_BLOCK)
+		return vdo_is_mapped_location(location);
+
+	return !vdo_is_state_compressed(location->state);
+}
+
+/**
+ * vdo_get_block_map_page_pbn() - Read the pbn recorded in a block map page header.
+ * @page: The block map page.
+ *
+ * Return: The header's pbn field, converted from little-endian to native byte order.
+ */
+static inline physical_block_number_t __must_check
+vdo_get_block_map_page_pbn(const struct block_map_page *page)
+{
+	physical_block_number_t pbn = __le64_to_cpu(page->header.pbn);
+
+	return pbn;
+}
+
+struct block_map_page *vdo_format_block_map_page(void *buffer, nonce_t nonce,
+ physical_block_number_t pbn,
+ bool initialized);
+
+enum block_map_page_validity __must_check vdo_validate_block_map_page(struct block_map_page *page,
+ nonce_t nonce,
+ physical_block_number_t pbn);
+
+/**
+ * vdo_compute_block_map_page_count() - Compute how many block map pages hold a number of entries.
+ * @entries: The number of block map entries.
+ *
+ * Return: @entries divided by VDO_BLOCK_MAP_ENTRIES_PER_PAGE, rounded up.
+ */
+static inline page_count_t vdo_compute_block_map_page_count(block_count_t entries)
+{
+	page_count_t pages = DIV_ROUND_UP(entries, VDO_BLOCK_MAP_ENTRIES_PER_PAGE);
+
+	return pages;
+}
+
+block_count_t __must_check vdo_compute_new_forest_pages(root_count_t root_count,
+ struct boundary *old_sizes,
+ block_count_t entries,
+ struct boundary *new_sizes);
+
+/**
+ * vdo_pack_recovery_journal_entry() - Encode a recovery journal entry in its packed, on-disk
+ *                                     form.
+ * @entry: The journal entry to pack.
+ *
+ * The slot number is split into a 6-bit low part and a 4-bit high part, and the slot pbn into a
+ * 4-bit high nibble and a little-endian 32-bit low word. The mapping and unmapping locations are
+ * packed with vdo_pack_block_map_entry().
+ *
+ * Return: The packed representation of the journal entry.
+ */
+static inline struct packed_recovery_journal_entry
+vdo_pack_recovery_journal_entry(const struct recovery_journal_entry *entry)
+{
+	struct packed_recovery_journal_entry packed = {
+		.operation = entry->operation,
+		.slot_low = entry->slot.slot & 0x3F,
+		.slot_high = (entry->slot.slot >> 6) & 0x0F,
+		.pbn_high_nibble = (entry->slot.pbn >> 32) & 0x0F,
+		.pbn_low_word = __cpu_to_le32(entry->slot.pbn & UINT_MAX),
+		.mapping = vdo_pack_block_map_entry(entry->mapping.pbn,
+						    entry->mapping.state),
+		.unmapping = vdo_pack_block_map_entry(entry->unmapping.pbn,
+						      entry->unmapping.state),
+	};
+
+	return packed;
+}
+
+/**
+ * vdo_unpack_recovery_journal_entry() - Decode the packed, on-disk form of a recovery journal
+ *                                       entry.
+ * @entry: The packed recovery journal entry to decode.
+ *
+ * The slot pbn is rebuilt from pbn_high_nibble and pbn_low_word, and the slot number from
+ * slot_low and slot_high. The mapping and unmapping locations are decoded with
+ * vdo_unpack_block_map_entry().
+ *
+ * Return: The unpacked entry.
+ */
+static inline struct recovery_journal_entry
+vdo_unpack_recovery_journal_entry(const struct packed_recovery_journal_entry *entry)
+{
+	physical_block_number_t slot_pbn = entry->pbn_high_nibble;
+	physical_block_number_t low_word = __le32_to_cpu(entry->pbn_low_word);
+	struct recovery_journal_entry unpacked = {
+		.operation = entry->operation,
+		.slot = {
+			.pbn = ((slot_pbn << 32) | low_word),
+			.slot = (entry->slot_low | (entry->slot_high << 6)),
+		},
+		.mapping = vdo_unpack_block_map_entry(&entry->mapping),
+		.unmapping = vdo_unpack_block_map_entry(&entry->unmapping),
+	};
+
+	return unpacked;
+}
+
+const char * __must_check vdo_get_journal_operation_name(enum journal_operation operation);
+
+/**
+ * vdo_is_valid_recovery_journal_sector() - Determine whether the header of the given sector could
+ * describe a valid sector for the given journal block
+ * header.
+ * @header: The unpacked block header to compare against.
+ * @sector: The packed sector to check.
+ * @sector_number: The number of the sector being checked.
+ *
+ * The sector's check byte and recovery count must both equal those of the block header. The
+ * sector's entry count must then not exceed a limit which depends on the block's metadata type:
+ * RECOVERY_JOURNAL_ENTRIES_PER_SECTOR for VDO_METADATA_RECOVERY_JOURNAL_2 blocks; for any other
+ * type, RECOVERY_JOURNAL_1_ENTRIES_IN_LAST_SECTOR when @sector_number is 7, and
+ * RECOVERY_JOURNAL_1_ENTRIES_PER_SECTOR for every other sector.
+ *
+ * Return: true if the sector matches the block header.
+ */
+static inline bool __must_check
+vdo_is_valid_recovery_journal_sector(const struct recovery_block_header *header,
+ const struct packed_journal_sector *sector,
+ u8 sector_number)
+{
+ if ((header->check_byte != sector->check_byte) ||
+ (header->recovery_count != sector->recovery_count))
+ return false;
+
+ if (header->metadata_type == VDO_METADATA_RECOVERY_JOURNAL_2)
+ return sector->entry_count <= RECOVERY_JOURNAL_ENTRIES_PER_SECTOR;
+
+ if (sector_number == 7)
+ return sector->entry_count <= RECOVERY_JOURNAL_1_ENTRIES_IN_LAST_SECTOR;
+
+ return sector->entry_count <= RECOVERY_JOURNAL_1_ENTRIES_PER_SECTOR;
+}
+
+/**
+ * vdo_compute_recovery_journal_block_number() - Compute the block number of the recovery
+ * journal block which would have a given sequence
+ * number.
+ * @journal_size: The size of the journal; must be a power of two for the result to be a modulus.
+ * @sequence_number: The sequence number.
+ *
+ * Return: The block number of the journal block which would have the specified sequence number,
+ *         computed as @sequence_number masked with (@journal_size - 1).
+ */
+static inline physical_block_number_t __must_check
+vdo_compute_recovery_journal_block_number(block_count_t journal_size,
+ sequence_number_t sequence_number)
+{
+ /*
+ * Since journal size is a power of two, the block number modulus can just be extracted
+ * from the low-order bits of the sequence.
+ */
+ return (sequence_number & (journal_size - 1));
+}
+
+/**
+ * vdo_get_journal_block_sector() - Locate a sector within a recovery journal block.
+ * @header: The header of the recovery journal block, which is also the start of the block.
+ * @sector_number: The index of the sector (1-based).
+ *
+ * Return: A pointer VDO_SECTOR_SIZE * @sector_number bytes past @header, as a packed recovery
+ *         journal sector.
+ */
+static inline struct packed_journal_sector * __must_check
+vdo_get_journal_block_sector(struct packed_journal_header *header, int sector_number)
+{
+	char *block_start = (char *) header;
+
+	return (struct packed_journal_sector *)
+		(block_start + (VDO_SECTOR_SIZE * sector_number));
+}
+
+/**
+ * vdo_pack_recovery_block_header() - Encode a recovery block header in its packed form.
+ * @header: The header containing the values to encode.
+ * @packed: The header into which to pack the values; it is overwritten entirely.
+ *
+ * Multi-byte fields are converted to little-endian; the single-byte fields are copied as-is.
+ */
+static inline void vdo_pack_recovery_block_header(const struct recovery_block_header *header,
+						  struct packed_journal_header *packed)
+{
+	struct packed_journal_header encoded = {
+		.block_map_head = __cpu_to_le64(header->block_map_head),
+		.slab_journal_head = __cpu_to_le64(header->slab_journal_head),
+		.sequence_number = __cpu_to_le64(header->sequence_number),
+		.nonce = __cpu_to_le64(header->nonce),
+		.logical_blocks_used = __cpu_to_le64(header->logical_blocks_used),
+		.block_map_data_blocks = __cpu_to_le64(header->block_map_data_blocks),
+		.entry_count = __cpu_to_le16(header->entry_count),
+		.check_byte = header->check_byte,
+		.recovery_count = header->recovery_count,
+		.metadata_type = header->metadata_type,
+	};
+
+	*packed = encoded;
+}
+
+/**
+ * vdo_unpack_recovery_block_header() - Decode the packed form of a recovery block header.
+ * @packed: The packed header to decode.
+ *
+ * Multi-byte fields are converted from little-endian to native byte order; the single-byte
+ * fields are copied as-is.
+ *
+ * Return: The unpacked header.
+ */
+static inline struct recovery_block_header
+vdo_unpack_recovery_block_header(const struct packed_journal_header *packed)
+{
+	struct recovery_block_header header = {
+		.block_map_head = __le64_to_cpu(packed->block_map_head),
+		.slab_journal_head = __le64_to_cpu(packed->slab_journal_head),
+		.sequence_number = __le64_to_cpu(packed->sequence_number),
+		.nonce = __le64_to_cpu(packed->nonce),
+		.logical_blocks_used = __le64_to_cpu(packed->logical_blocks_used),
+		.block_map_data_blocks = __le64_to_cpu(packed->block_map_data_blocks),
+		.entry_count = __le16_to_cpu(packed->entry_count),
+		.check_byte = packed->check_byte,
+		.recovery_count = packed->recovery_count,
+		.metadata_type = packed->metadata_type,
+	};
+
+	return header;
+}
+
+/**
+ * vdo_compute_slab_count() - Compute how many slabs a depot with the given parameters would have.
+ * @first_block: PBN of the first data block.
+ * @last_block: PBN of the last data block.
+ * @slab_size_shift: Exponent for the number of blocks per slab.
+ *
+ * Return: (@last_block - @first_block) shifted right by @slab_size_shift, as a slab count.
+ */
+static inline slab_count_t vdo_compute_slab_count(physical_block_number_t first_block,
+						  physical_block_number_t last_block,
+						  unsigned int slab_size_shift)
+{
+	physical_block_number_t span = last_block - first_block;
+
+	return (slab_count_t) (span >> slab_size_shift);
+}
+
+int __must_check vdo_configure_slab_depot(const struct partition *partition,
+ struct slab_config slab_config,
+ zone_count_t zone_count,
+ struct slab_depot_state_2_0 *state);
+
+int __must_check vdo_configure_slab(block_count_t slab_size,
+ block_count_t slab_journal_blocks,
+ struct slab_config *slab_config);
+
+/**
+ * vdo_get_saved_reference_count_size() - Get the number of blocks required to save the reference
+ * count state covering the specified number of data
+ * blocks.
+ * @block_count: The number of physical data blocks that can be referenced.
+ *
+ * Return: @block_count divided by COUNTS_PER_BLOCK, rounded up; i.e. the number of blocks
+ *         required to save reference counts for the given block count.
+ */
+static inline block_count_t vdo_get_saved_reference_count_size(block_count_t block_count)
+{
+ return DIV_ROUND_UP(block_count, COUNTS_PER_BLOCK);
+}
+
+/**
+ * vdo_get_slab_journal_start_block() - Get the physical block number of the start of the slab
+ * journal relative to the start of the block allocator
+ * partition.
+ * @slab_config: The slab configuration of the VDO.
+ * @origin: The first block of the slab.
+ *
+ * Return: @origin plus the slab's data block count and reference count block count.
+ */
+static inline physical_block_number_t __must_check
+vdo_get_slab_journal_start_block(const struct slab_config *slab_config,
+ physical_block_number_t origin)
+{
+ return origin + slab_config->data_blocks + slab_config->reference_count_blocks;
+}
+
+/**
+ * vdo_advance_journal_point() - Step a journal point forward by one entry.
+ * @point: The journal point to adjust.
+ * @entries_per_block: The number of entries in one full block.
+ *
+ * When the entry count reaches @entries_per_block, the point rolls over to entry 0 of the next
+ * sequence number.
+ */
+static inline void vdo_advance_journal_point(struct journal_point *point,
+					     journal_entry_count_t entries_per_block)
+{
+	point->entry_count++;
+	if (point->entry_count != entries_per_block)
+		return;
+
+	/* The block is full, so continue at the first entry of the next block. */
+	point->entry_count = 0;
+	point->sequence_number++;
+}
+
+/**
+ * vdo_before_journal_point() - Check whether one journal point strictly precedes another.
+ * @first: The first journal point.
+ * @second: The second journal point.
+ *
+ * Points are ordered by sequence number, then by entry count within the same sequence number.
+ *
+ * Return: true if the first point precedes the second point.
+ */
+static inline bool vdo_before_journal_point(const struct journal_point *first,
+					    const struct journal_point *second)
+{
+	if (first->sequence_number != second->sequence_number)
+		return first->sequence_number < second->sequence_number;
+
+	return first->entry_count < second->entry_count;
+}
+
+/**
+ * vdo_pack_journal_point() - Encode a journal_point as a packed_journal_point.
+ * @unpacked: The unpacked input point.
+ * @packed: The packed output point.
+ *
+ * The sequence number is shifted left by 16 bits and combined with the entry count; the result
+ * is stored little-endian.
+ */
+static inline void vdo_pack_journal_point(const struct journal_point *unpacked,
+					  struct packed_journal_point *packed)
+{
+	u64 encoded = (unpacked->sequence_number << 16) | unpacked->entry_count;
+
+	packed->encoded_point = __cpu_to_le64(encoded);
+}
+
+/**
+ * vdo_unpack_journal_point() - Decode a packed_journal_point into a journal_point.
+ * @packed: The packed input point.
+ * @unpacked: The unpacked output point.
+ *
+ * The low 16 bits of the encoded value are the entry count; the remaining high bits are the
+ * sequence number.
+ */
+static inline void vdo_unpack_journal_point(const struct packed_journal_point *packed,
+					    struct journal_point *unpacked)
+{
+	u64 encoded = __le64_to_cpu(packed->encoded_point);
+
+	unpacked->entry_count = (encoded & 0xffff);
+	unpacked->sequence_number = (encoded >> 16);
+}
+
+/**
+ * vdo_pack_slab_journal_block_header() - Encode a slab journal block header in its packed form.
+ * @header: The header containing the values to encode.
+ * @packed: The header into which to pack the values.
+ *
+ * Fields are written one at a time; the recovery point is encoded with
+ * vdo_pack_journal_point().
+ */
+static inline void
+vdo_pack_slab_journal_block_header(const struct slab_journal_block_header *header,
+				   struct packed_slab_journal_block_header *packed)
+{
+	/* Single-byte fields are copied without conversion. */
+	packed->metadata_type = header->metadata_type;
+	packed->has_block_map_increments = header->has_block_map_increments;
+
+	/* Multi-byte fields are stored little-endian. */
+	packed->entry_count = __cpu_to_le16(header->entry_count);
+	packed->head = __cpu_to_le64(header->head);
+	packed->sequence_number = __cpu_to_le64(header->sequence_number);
+	packed->nonce = __cpu_to_le64(header->nonce);
+
+	vdo_pack_journal_point(&header->recovery_point, &packed->recovery_point);
+}
+
+/**
+ * vdo_unpack_slab_journal_block_header() - Decode the packed form of a slab journal block
+ *                                          header.
+ * @packed: The packed header to decode.
+ * @header: The header into which to unpack the values; it is overwritten entirely.
+ */
+static inline void
+vdo_unpack_slab_journal_block_header(const struct packed_slab_journal_block_header *packed,
+				     struct slab_journal_block_header *header)
+{
+	struct slab_journal_block_header unpacked = {
+		.head = __le64_to_cpu(packed->head),
+		.sequence_number = __le64_to_cpu(packed->sequence_number),
+		.nonce = __le64_to_cpu(packed->nonce),
+		.entry_count = __le16_to_cpu(packed->entry_count),
+		.metadata_type = packed->metadata_type,
+		.has_block_map_increments = packed->has_block_map_increments,
+	};
+
+	vdo_unpack_journal_point(&packed->recovery_point, &unpacked.recovery_point);
+	*header = unpacked;
+}
+
+/**
+ * vdo_pack_slab_journal_entry() - Generate the packed encoding of a slab journal entry.
+ * @packed: The entry into which to pack the values.
+ * @sbn: The slab block number of the entry to encode.
+ * @is_increment: The increment flag.
+ *
+ * Bits 0-7 of @sbn are stored in offset_low8, bits 8-15 in offset_mid8, and bits 16-22 in
+ * offset_high7; any higher bits of @sbn are discarded by the masks. The increment field is set
+ * to 1 if @is_increment is true and to 0 otherwise.
+ */
+static inline void vdo_pack_slab_journal_entry(packed_slab_journal_entry *packed,
+ slab_block_number sbn, bool is_increment)
+{
+ packed->offset_low8 = (sbn & 0x0000FF);
+ packed->offset_mid8 = (sbn & 0x00FF00) >> 8;
+ packed->offset_high7 = (sbn & 0x7F0000) >> 16;
+ packed->increment = is_increment ? 1 : 0;
+}
+
+/**
+ * vdo_unpack_slab_journal_entry() - Decode the packed form of a slab journal entry.
+ * @packed: The packed entry to decode.
+ *
+ * The slab block number is reassembled from offset_high7, offset_mid8 and offset_low8 (most
+ * significant part first). The operation is always VDO_JOURNAL_DATA_REMAPPING.
+ *
+ * Return: The decoded slab journal entry.
+ */
+static inline struct slab_journal_entry __must_check
+vdo_unpack_slab_journal_entry(const packed_slab_journal_entry *packed)
+{
+	struct slab_journal_entry entry;
+
+	entry.operation = VDO_JOURNAL_DATA_REMAPPING;
+	entry.increment = packed->increment;
+	entry.sbn = ((packed->offset_high7 << 16) |
+		     (packed->offset_mid8 << 8) |
+		     packed->offset_low8);
+	return entry;
+}
+
+struct slab_journal_entry __must_check
+vdo_decode_slab_journal_entry(struct packed_slab_journal_block *block,
+ journal_entry_count_t entry_count);
+
+/**
+ * vdo_get_slab_summary_hint_shift() - Compute the shift for slab summary hints.
+ * @slab_size_shift: Exponent for the number of blocks per slab.
+ *
+ * Return: @slab_size_shift less VDO_SLAB_SUMMARY_FULLNESS_HINT_BITS, or 0 if @slab_size_shift
+ *         does not exceed VDO_SLAB_SUMMARY_FULLNESS_HINT_BITS.
+ */
+static inline u8 __must_check vdo_get_slab_summary_hint_shift(unsigned int slab_size_shift)
+{
+	if (slab_size_shift <= VDO_SLAB_SUMMARY_FULLNESS_HINT_BITS)
+		return 0;
+
+	return slab_size_shift - VDO_SLAB_SUMMARY_FULLNESS_HINT_BITS;
+}
+
+int __must_check vdo_initialize_layout(block_count_t size,
+ physical_block_number_t offset,
+ block_count_t block_map_blocks,
+ block_count_t journal_blocks,
+ block_count_t summary_blocks,
+ struct layout *layout);
+
+void vdo_uninitialize_layout(struct layout *layout);
+
+int __must_check vdo_get_partition(struct layout *layout, enum partition_id id,
+ struct partition **partition_ptr);
+
+struct partition * __must_check vdo_get_known_partition(struct layout *layout,
+ enum partition_id id);
+
+int vdo_validate_config(const struct vdo_config *config,
+ block_count_t physical_block_count,
+ block_count_t logical_block_count);
+
+void vdo_destroy_component_states(struct vdo_component_states *states);
+
+int __must_check vdo_decode_component_states(u8 *buffer,
+ struct volume_geometry *geometry,
+ struct vdo_component_states *states);
+
+int __must_check vdo_validate_component_states(struct vdo_component_states *states,
+ nonce_t geometry_nonce,
+ block_count_t physical_size,
+ block_count_t logical_size);
+
+void vdo_encode_super_block(u8 *buffer, struct vdo_component_states *states);
+int __must_check vdo_decode_super_block(u8 *buffer);
+
+/*
+ * Compute a checksum over @len bytes at @buf using crc32().
+ *
+ * We start with 0L and postcondition with ~0L to match our historical usage in userspace: the
+ * seed passed to crc32() is 0 and the result is XORed with all-ones before being returned.
+ */
+static inline u32 vdo_crc32(const void *buf, unsigned long len)
+{
+ return (crc32(0L, buf, len) ^ ~0L);
+}
+
+#endif /* VDO_ENCODINGS_H */
diff --git a/drivers/md/dm-vdo/errors.c b/drivers/md/dm-vdo/errors.c
new file mode 100644
index 000000000000..6f89eb1c63a3
--- /dev/null
+++ b/drivers/md/dm-vdo/errors.c
@@ -0,0 +1,307 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#include "errors.h"
+
+#include <linux/compiler.h>
+#include <linux/errno.h>
+
+#include "logger.h"
+#include "permassert.h"
+#include "string-utils.h"
+
+/* The error info which get_error_info() reports for UDS_SUCCESS. */
+static const struct error_info successful = { "UDS_SUCCESS", "Success" };
+
+/*
+ * Messages for system error numbers, indexed by errno value. system_string_error() consults this
+ * table; an errno with no entry here is reported as "Unknown error <n>".
+ */
+static const char *const message_table[] = {
+ [EPERM] = "Operation not permitted",
+ [ENOENT] = "No such file or directory",
+ [ESRCH] = "No such process",
+ [EINTR] = "Interrupted system call",
+ [EIO] = "Input/output error",
+ [ENXIO] = "No such device or address",
+ [E2BIG] = "Argument list too long",
+ [ENOEXEC] = "Exec format error",
+ [EBADF] = "Bad file descriptor",
+ [ECHILD] = "No child processes",
+ [EAGAIN] = "Resource temporarily unavailable",
+ [ENOMEM] = "Cannot allocate memory",
+ [EACCES] = "Permission denied",
+ [EFAULT] = "Bad address",
+ [ENOTBLK] = "Block device required",
+ [EBUSY] = "Device or resource busy",
+ [EEXIST] = "File exists",
+ [EXDEV] = "Invalid cross-device link",
+ [ENODEV] = "No such device",
+ [ENOTDIR] = "Not a directory",
+ [EISDIR] = "Is a directory",
+ [EINVAL] = "Invalid argument",
+ [ENFILE] = "Too many open files in system",
+ [EMFILE] = "Too many open files",
+ [ENOTTY] = "Inappropriate ioctl for device",
+ [ETXTBSY] = "Text file busy",
+ [EFBIG] = "File too large",
+ [ENOSPC] = "No space left on device",
+ [ESPIPE] = "Illegal seek",
+ [EROFS] = "Read-only file system",
+ [EMLINK] = "Too many links",
+ [EPIPE] = "Broken pipe",
+ [EDOM] = "Numerical argument out of domain",
+ [ERANGE] = "Numerical result out of range"
+};
+
+/*
+ * Names and messages for the UDS error codes. get_error_info() indexes this array by
+ * (error code - UDS_ERROR_CODE_BASE), so the entries must stay in the same order as the
+ * corresponding enumerators of enum uds_status_codes.
+ */
+static const struct error_info error_list[] = {
+ { "UDS_OVERFLOW", "Index overflow" },
+ { "UDS_INVALID_ARGUMENT", "Invalid argument passed to internal routine" },
+ { "UDS_BAD_STATE", "UDS data structures are in an invalid state" },
+ { "UDS_DUPLICATE_NAME", "Attempt to enter the same name into a delta index twice" },
+ { "UDS_ASSERTION_FAILED", "Assertion failed" },
+ { "UDS_QUEUED", "Request queued" },
+ { "UDS_ALREADY_REGISTERED", "Error range already registered" },
+ { "UDS_OUT_OF_RANGE", "Cannot access data outside specified limits" },
+ { "UDS_DISABLED", "UDS library context is disabled" },
+ { "UDS_UNSUPPORTED_VERSION", "Unsupported version" },
+ { "UDS_CORRUPT_DATA", "Some index structure is corrupt" },
+ { "UDS_NO_INDEX", "No index found" },
+ { "UDS_INDEX_NOT_SAVED_CLEANLY", "Index not saved cleanly" },
+};
+
+/* A registered, contiguous range of error codes and the info describing them. */
+struct error_block {
+ /* The name of the block, used when formatting errors from it */
+ const char *name;
+ /* The first error code in the block */
+ int base;
+ /* One past the last error code which has an entry in infos */
+ int last;
+ /* One past the last error code reserved for the block */
+ int max;
+ /* The info for the codes from base up to (not including) last, indexed by (code - base) */
+ const struct error_info *infos;
+};
+
+/* The capacity of the table of registered error blocks. */
+#define MAX_ERROR_BLOCKS 6
+
+/*
+ * The table of registered error blocks. It starts out holding only the UDS error block;
+ * uds_register_error_block() appends further blocks until the table is full.
+ */
+static struct {
+ int allocated;
+ int count;
+ struct error_block blocks[MAX_ERROR_BLOCKS];
+} registered_errors = {
+ .allocated = MAX_ERROR_BLOCKS,
+ .count = 1,
+ .blocks = { {
+ .name = "UDS Error",
+ .base = UDS_ERROR_CODE_BASE,
+ .last = UDS_ERROR_CODE_LAST,
+ .max = UDS_ERROR_CODE_BLOCK_END,
+ .infos = error_list,
+ } },
+};
+
+/*
+ * Look up the error info for an error number.
+ *
+ * For UDS_SUCCESS, *info_ptr is set to the success info and NULL is returned. For a code inside a
+ * registered block, the block name is returned and *info_ptr is set to the matching info, or to
+ * NULL if the code is reserved by the block but has no info. For any other code, NULL is returned
+ * and *info_ptr is left untouched.
+ */
+static const char *get_error_info(int errnum, const struct error_info **info_ptr)
+{
+	int i;
+
+	if (errnum == UDS_SUCCESS) {
+		*info_ptr = &successful;
+		return NULL;
+	}
+
+	for (i = 0; i < registered_errors.count; i++) {
+		const struct error_block *block = &registered_errors.blocks[i];
+
+		if ((errnum >= block->base) && (errnum < block->last)) {
+			*info_ptr = &block->infos[errnum - block->base];
+			return block->name;
+		}
+
+		if ((errnum >= block->last) && (errnum < block->max)) {
+			*info_ptr = NULL;
+			return block->name;
+		}
+	}
+
+	return NULL;
+}
+
+/*
+ * Return a string describing a system error number.
+ *
+ * @errnum: the (positive) errno value to describe
+ * @buf: the buffer in which to format the message
+ * @buflen: the size of the buffer
+ *
+ * Returns buf if the whole message fit in it. Otherwise the buffer is emptied (when it has room
+ * for a terminator at all) and the constant string "System error" is returned instead.
+ */
+static const char *system_string_error(int errnum, char *buf, size_t buflen)
+{
+	int len;
+	const char *error_string = NULL;
+
+	if ((errnum > 0) && (errnum < ARRAY_SIZE(message_table)))
+		error_string = message_table[errnum];
+
+	len = ((error_string == NULL) ?
+	       snprintf(buf, buflen, "Unknown error %d", errnum) :
+	       snprintf(buf, buflen, "%s", error_string));
+	if ((len >= 0) && ((size_t) len < buflen))
+		return buf;
+
+	/* A zero-length buffer has no room for even the terminator, so don't write to it. */
+	if (buflen > 0)
+		buf[0] = '\0';
+	return "System error";
+}
+
+/*
+ * Format a description of an error code into a buffer.
+ *
+ * A negative errnum is negated first. Codes from a registered block are formatted as
+ * "<block name>: <message>", or "Unknown <block name> <code>" when the block has no info for the
+ * code; success is formatted as its message alone; anything else is treated as a system error.
+ *
+ * Returns buf, or NULL if buf is NULL.
+ */
+const char *uds_string_error(int errnum, char *buf, size_t buflen)
+{
+	char *cursor = buf;
+	char *limit;
+	const struct error_info *info = NULL;
+	const char *block_name;
+
+	if (buf == NULL)
+		return NULL;
+
+	limit = buf + buflen;
+	if (errnum < 0)
+		errnum = -errnum;
+
+	block_name = get_error_info(errnum, &info);
+	if ((block_name == NULL) && (info == NULL)) {
+		const char *message = system_string_error(errnum, cursor, limit - cursor);
+
+		if (message != cursor)
+			cursor = vdo_append_to_buffer(cursor, limit, "%s", message);
+		else
+			cursor += strlen(message);
+	} else if (block_name == NULL) {
+		cursor = vdo_append_to_buffer(cursor, limit, "%s", info->message);
+	} else if (info == NULL) {
+		cursor = vdo_append_to_buffer(cursor, limit, "Unknown %s %d",
+					      block_name, errnum);
+	} else {
+		cursor = vdo_append_to_buffer(cursor, limit, "%s: %s",
+					      block_name, info->message);
+	}
+
+	return buf;
+}
+
+/*
+ * Format the name of an error code into a buffer.
+ *
+ * A negative errnum is negated first. Codes with a known info are formatted as the info's name;
+ * codes reserved by a registered block but lacking info as "<block name> <code>"; anything else
+ * is treated as a system error.
+ *
+ * Returns buf, or NULL if buf is NULL (matching uds_string_error()).
+ */
+const char *uds_string_error_name(int errnum, char *buf, size_t buflen)
+{
+	char *buffer = buf;
+	char *buf_end;
+	const struct error_info *info = NULL;
+	const char *block_name;
+
+	if (buf == NULL)
+		return NULL;
+
+	buf_end = buf + buflen;
+	if (errnum < 0)
+		errnum = -errnum;
+
+	block_name = get_error_info(errnum, &info);
+	if (block_name != NULL) {
+		if (info != NULL) {
+			buffer = vdo_append_to_buffer(buffer, buf_end, "%s", info->name);
+		} else {
+			buffer = vdo_append_to_buffer(buffer, buf_end, "%s %d",
+						      block_name, errnum);
+		}
+	} else if (info != NULL) {
+		buffer = vdo_append_to_buffer(buffer, buf_end, "%s", info->name);
+	} else {
+		const char *tmp;
+
+		tmp = system_string_error(errnum, buffer, buf_end - buffer);
+		if (tmp != buffer)
+			buffer = vdo_append_to_buffer(buffer, buf_end, "%s", tmp);
+		else
+			buffer += strlen(tmp);
+	}
+
+	return buf;
+}
+
+/*
+ * Translate an error code into a value acceptable to the kernel. The input error code may be a
+ * system-generated value (such as -EIO), or an internal UDS status code. The result will be a
+ * negative errno value (or 0 for success).
+ *
+ * Positive values below UDS_ERROR_CODE_BASE are assumed to be errno values and are negated.
+ * Internal codes without a specific mapping are logged and reported as -EIO.
+ */
+int uds_status_to_errno(int error)
+{
+	char error_name[VDO_MAX_ERROR_NAME_SIZE];
+	char error_message[VDO_MAX_ERROR_MESSAGE_SIZE];
+
+	/* 0 is success, and negative values are already system error codes. */
+	if (likely(error <= 0))
+		return error;
+
+	if (error < UDS_ERROR_CODE_BASE) {
+		/* This is probably an errno from userspace. */
+		return -error;
+	}
+
+	/* Internal UDS errors */
+	switch (error) {
+	case UDS_NO_INDEX:
+	case UDS_CORRUPT_DATA:
+		/* The index doesn't exist or can't be recovered. */
+		return -ENOENT;
+
+	case UDS_INDEX_NOT_SAVED_CLEANLY:
+	case UDS_UNSUPPORTED_VERSION:
+		/*
+		 * The index exists, but can't be loaded. Tell the client it exists so they don't
+		 * destroy it inadvertently.
+		 */
+		return -EEXIST;
+
+	case UDS_DISABLED:
+		/* The session is unusable; only returned by requests. */
+		return -EIO;
+
+	default:
+		/* Translate an unexpected error into something generic. */
+		vdo_log_info("%s: mapping status code %d (%s: %s) to -EIO",
+			     __func__, error,
+			     uds_string_error_name(error, error_name,
+						   sizeof(error_name)),
+			     uds_string_error(error, error_message,
+					      sizeof(error_message)));
+		return -EIO;
+	}
+}
+
+/*
+ * Register a block of error codes.
+ *
+ * @block_name: the name of the block of error codes
+ * @first_error: the first error code in the block
+ * @next_free_error: one past the highest possible error in the block
+ * @infos: a pointer to the error info array for the block
+ * @info_size: the size of the error info array, in bytes
+ *
+ * Returns UDS_SUCCESS if the block was added; an assertion failure code if the range is empty or
+ * inverted; UDS_OVERFLOW if the table is full; UDS_DUPLICATE_NAME if a block with the same name
+ * is already registered; or UDS_ALREADY_REGISTERED if the range overlaps a registered block.
+ */
+int uds_register_error_block(const char *block_name, int first_error,
+			     int next_free_error, const struct error_info *infos,
+			     size_t info_size)
+{
+	int result;
+	int i;
+
+	result = VDO_ASSERT(first_error < next_free_error,
+			    "well-defined error block range");
+	if (result != VDO_SUCCESS)
+		return result;
+
+	if (registered_errors.count == registered_errors.allocated) {
+		/* This should never happen. */
+		return UDS_OVERFLOW;
+	}
+
+	for (i = 0; i < registered_errors.count; i++) {
+		const struct error_block *existing = &registered_errors.blocks[i];
+
+		if (strcmp(block_name, existing->name) == 0)
+			return UDS_DUPLICATE_NAME;
+
+		/* Ensure error ranges do not overlap. */
+		if ((first_error < existing->max) && (next_free_error > existing->base))
+			return UDS_ALREADY_REGISTERED;
+	}
+
+	registered_errors.blocks[registered_errors.count++] = (struct error_block) {
+		.name = block_name,
+		.base = first_error,
+		.last = first_error + (info_size / sizeof(struct error_info)),
+		.max = next_free_error,
+		.infos = infos,
+	};
+	return UDS_SUCCESS;
+}
diff --git a/drivers/md/dm-vdo/errors.h b/drivers/md/dm-vdo/errors.h
new file mode 100644
index 000000000000..24e0e745fd5f
--- /dev/null
+++ b/drivers/md/dm-vdo/errors.h
@@ -0,0 +1,73 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#ifndef UDS_ERRORS_H
+#define UDS_ERRORS_H
+
+#include <linux/compiler.h>
+#include <linux/types.h>
+
+/* Custom error codes and error-related utilities */
+#define VDO_SUCCESS 0
+
+/*
+ * Valid status codes for internal UDS functions. The error codes are consecutive, starting at
+ * UDS_ERROR_CODE_BASE; the error_list table in errors.c is indexed by
+ * (code - UDS_ERROR_CODE_BASE), so it must be kept in the same order as these enumerators.
+ */
+enum uds_status_codes {
+ /* Successful return */
+ UDS_SUCCESS = VDO_SUCCESS,
+ /* Used as a base value for reporting internal errors */
+ UDS_ERROR_CODE_BASE = 1024,
+ /* Index overflow */
+ UDS_OVERFLOW = UDS_ERROR_CODE_BASE,
+ /* Invalid argument passed to internal routine */
+ UDS_INVALID_ARGUMENT,
+ /* UDS data structures are in an invalid state */
+ UDS_BAD_STATE,
+ /* Attempt to enter the same name into an internal structure twice */
+ UDS_DUPLICATE_NAME,
+ /* An assertion failed */
+ UDS_ASSERTION_FAILED,
+ /* A request has been queued for later processing (not an error) */
+ UDS_QUEUED,
+ /* This error range has already been registered */
+ UDS_ALREADY_REGISTERED,
+ /* Attempt to read or write data outside the valid range */
+ UDS_OUT_OF_RANGE,
+ /* The index session is disabled */
+ UDS_DISABLED,
+ /* The index configuration or volume format is no longer supported */
+ UDS_UNSUPPORTED_VERSION,
+ /* Some index structure is corrupt */
+ UDS_CORRUPT_DATA,
+ /* No index state found */
+ UDS_NO_INDEX,
+ /* Attempt to access incomplete index save data */
+ UDS_INDEX_NOT_SAVED_CLEANLY,
+ /* One more than the last defined UDS error code */
+ UDS_ERROR_CODE_LAST,
+ /* One more than the last error this block will ever use */
+ UDS_ERROR_CODE_BLOCK_END = UDS_ERROR_CODE_BASE + 440,
+};
+
+/*
+ * Sizes of the buffers used to format error names and error messages (see
+ * uds_string_error_name() and uds_string_error()).
+ */
+enum {
+ VDO_MAX_ERROR_NAME_SIZE = 80,
+ VDO_MAX_ERROR_MESSAGE_SIZE = 128,
+};
+
+/* The symbolic name and the human-readable message for one error code. */
+struct error_info {
+ /* The name of the error code, e.g. "UDS_OVERFLOW" */
+ const char *name;
+ /* The message describing the error */
+ const char *message;
+};
+
+const char * __must_check uds_string_error(int errnum, char *buf, size_t buflen);
+
+const char *uds_string_error_name(int errnum, char *buf, size_t buflen);
+
+int uds_status_to_errno(int error);
+
+/*
+ * Register a block of error codes named block_name, covering first_error up to (not including)
+ * next_free_error, described by the infos array of info_size bytes.
+ */
+int uds_register_error_block(const char *block_name, int first_error,
+			     int next_free_error, const struct error_info *infos,
+			     size_t info_size);
+
+#endif /* UDS_ERRORS_H */
diff --git a/drivers/md/dm-vdo/flush.c b/drivers/md/dm-vdo/flush.c
new file mode 100644
index 000000000000..57e87f0d7069
--- /dev/null
+++ b/drivers/md/dm-vdo/flush.c
@@ -0,0 +1,560 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#include "flush.h"
+
+#include <linux/mempool.h>
+#include <linux/spinlock.h>
+
+#include "logger.h"
+#include "memory-alloc.h"
+#include "permassert.h"
+
+#include "admin-state.h"
+#include "completion.h"
+#include "io-submitter.h"
+#include "logical-zone.h"
+#include "slab-depot.h"
+#include "types.h"
+#include "vdo.h"
+
+/*
+ * The flush state for a single vdo. A flush request first waits on the notifiers queue while the
+ * logical zones and the packer are told about its generation, and then on the pending_flushes
+ * queue until it can be completed.
+ */
+struct flusher {
+ /* The completion used to run the notification callbacks on each thread in turn */
+ struct vdo_completion completion;
+ /* The vdo to which this flusher belongs */
+ struct vdo *vdo;
+ /* The administrative state of the flusher */
+ struct admin_state state;
+ /* The current flush generation of the vdo */
+ sequence_number_t flush_generation;
+ /* The first unacknowledged flush generation */
+ sequence_number_t first_unacknowledged_generation;
+ /* The queue of flush requests waiting to notify other threads */
+ struct vdo_wait_queue notifiers;
+ /* The queue of flush requests waiting for VIOs to complete */
+ struct vdo_wait_queue pending_flushes;
+ /* The flush generation for which notifications are being sent */
+ sequence_number_t notify_generation;
+ /* The logical zone to notify next */
+ struct logical_zone *logical_zone_to_notify;
+ /* The ID of the thread on which flush requests should be made */
+ thread_id_t thread_id;
+ /* The pool of flush requests */
+ mempool_t *flush_pool;
+ /* Bios waiting for a flush request to become available */
+ struct bio_list waiting_flush_bios;
+ /* The lock protecting waiting_flush_bios (the only field accessed under it in this file) */
+ spinlock_t lock;
+ /* The rotor for selecting the bio queue for submitting flush bios */
+ zone_count_t bio_queue_rotor;
+ /* The number of flushes submitted to the current bio queue */
+ int flush_count;
+};
+
+/**
+ * assert_on_flusher_thread() - Check that we are on the flusher thread.
+ * @flusher: The flusher.
+ * @caller: The function which is asserting.
+ *
+ * Compares the current callback thread ID with the flusher's thread ID. The check is made with
+ * VDO_ASSERT_LOG_ONLY() and nothing is returned, so the caller proceeds whatever the outcome.
+ */
+static inline void assert_on_flusher_thread(struct flusher *flusher, const char *caller)
+{
+ VDO_ASSERT_LOG_ONLY((vdo_get_callback_thread_id() == flusher->thread_id),
+ "%s() called from flusher thread", caller);
+}
+
+/**
+ * as_flusher() - Recover the flusher which embeds a given completion.
+ * @completion: The completion embedded in a flusher.
+ *
+ * The completion's type is checked against VDO_FLUSH_NOTIFICATION_COMPLETION before converting.
+ *
+ * Return: The flusher containing the completion.
+ */
+static struct flusher *as_flusher(struct vdo_completion *completion)
+{
+	struct flusher *flusher;
+
+	vdo_assert_completion_type(completion, VDO_FLUSH_NOTIFICATION_COMPLETION);
+	flusher = container_of(completion, struct flusher, completion);
+	return flusher;
+}
+
+/**
+ * completion_as_vdo_flush() - Recover the vdo_flush which embeds a given completion.
+ * @completion: The completion embedded in a vdo_flush.
+ *
+ * The completion's type is checked against VDO_FLUSH_COMPLETION before converting.
+ *
+ * Return: The vdo_flush containing the completion.
+ */
+static inline struct vdo_flush *completion_as_vdo_flush(struct vdo_completion *completion)
+{
+	struct vdo_flush *flush;
+
+	vdo_assert_completion_type(completion, VDO_FLUSH_COMPLETION);
+	flush = container_of(completion, struct vdo_flush, completion);
+	return flush;
+}
+
+/**
+ * vdo_waiter_as_flush() - Recover the vdo_flush which embeds a given wait queue entry.
+ * @waiter: The wait queue entry embedded in a vdo_flush.
+ *
+ * Return: The vdo_flush containing the wait queue entry.
+ */
+static struct vdo_flush *vdo_waiter_as_flush(struct vdo_waiter *waiter)
+{
+	struct vdo_flush *flush = container_of(waiter, struct vdo_flush, waiter);
+
+	return flush;
+}
+
+/**
+ * allocate_flush() - Allocate and initialize a vdo_flush; the flush pool's allocation function.
+ * @gfp_mask: The allocation flags for this request.
+ * @pool_data: The flusher which owns the pool.
+ *
+ * When the mask includes all of the GFP_NOWAIT bits, the no-wait allocator is used and nothing is
+ * logged here on failure; otherwise a failed allocation is logged.
+ *
+ * Return: The new vdo_flush with its completion initialized, or NULL if the allocation failed.
+ */
+static void *allocate_flush(gfp_t gfp_mask, void *pool_data)
+{
+	struct flusher *flusher = pool_data;
+	struct vdo_flush *flush = NULL;
+	bool nowait = ((gfp_mask & GFP_NOWAIT) == GFP_NOWAIT);
+
+	if (nowait) {
+		flush = vdo_allocate_memory_nowait(sizeof(struct vdo_flush), __func__);
+	} else {
+		int result = vdo_allocate(1, struct vdo_flush, __func__, &flush);
+
+		if (result != VDO_SUCCESS)
+			vdo_log_error_strerror(result, "failed to allocate spare flush");
+	}
+
+	if (flush == NULL)
+		return NULL;
+
+	vdo_initialize_completion(&flush->completion, flusher->vdo, VDO_FLUSH_COMPLETION);
+	return flush;
+}
+
+/*
+ * The flush pool's free function: release a vdo_flush obtained from allocate_flush(). The pool
+ * data is not needed.
+ */
+static void free_flush(void *element, void *pool_data __always_unused)
+{
+ vdo_free(element);
+}
+
+/**
+ * vdo_make_flusher() - Allocate and set up the flusher for a vdo.
+ * @vdo: The vdo which will own the flusher; the new flusher is stored in vdo->flusher.
+ *
+ * The flusher runs on the vdo's packer thread and starts out in normal operation. Its flush pool
+ * is created with a single reserved element.
+ *
+ * Return: VDO_SUCCESS, the allocation error, or -ENOMEM if the flush pool could not be created
+ *         (in which case vdo->flusher remains set).
+ */
+int vdo_make_flusher(struct vdo *vdo)
+{
+	struct flusher *flusher;
+	int result;
+
+	result = vdo_allocate(1, struct flusher, __func__, &vdo->flusher);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	flusher = vdo->flusher;
+	flusher->vdo = vdo;
+	flusher->thread_id = vdo->thread_config.packer_thread;
+	vdo_set_admin_state_code(&flusher->state, VDO_ADMIN_STATE_NORMAL_OPERATION);
+	vdo_initialize_completion(&flusher->completion, vdo,
+				  VDO_FLUSH_NOTIFICATION_COMPLETION);
+
+	spin_lock_init(&flusher->lock);
+	bio_list_init(&flusher->waiting_flush_bios);
+	flusher->flush_pool = mempool_create(1, allocate_flush, free_flush, flusher);
+	if (flusher->flush_pool == NULL)
+		return -ENOMEM;
+
+	return VDO_SUCCESS;
+}
+
+/**
+ * vdo_free_flusher() - Destroy a flusher and its flush pool.
+ * @flusher: The flusher to free; may be NULL, in which case nothing is done.
+ */
+void vdo_free_flusher(struct flusher *flusher)
+{
+	if (flusher != NULL) {
+		/* The pool may be absent if vdo_make_flusher() failed part-way through. */
+		if (flusher->flush_pool != NULL)
+			mempool_destroy(vdo_forget(flusher->flush_pool));
+
+		vdo_free(flusher);
+	}
+}
+
+/**
+ * vdo_get_flusher_thread_id() - Get the ID of the thread on which flusher functions should be
+ * called.
+ * @flusher: The flusher to query.
+ *
+ * The ID is the one recorded by vdo_make_flusher(), which uses the vdo's packer thread.
+ *
+ * Return: The ID of the thread which handles the flusher.
+ */
+thread_id_t vdo_get_flusher_thread_id(struct flusher *flusher)
+{
+ return flusher->thread_id;
+}
+
+static void notify_flush(struct flusher *flusher);
+static void vdo_complete_flush(struct vdo_flush *flush);
+
+/**
+ * finish_notification() - Wrap up one round of flush notification.
+ * @completion: The flusher completion.
+ *
+ * Moves the flush whose notification just finished from the notifiers queue to the pending queue,
+ * completes whatever flushes are now eligible, and then begins notifying for the next flush
+ * request if one arrived in the meantime. This callback is registered in flush_packer_callback().
+ */
+static void finish_notification(struct vdo_completion *completion)
+{
+	struct flusher *flusher = as_flusher(completion);
+	struct vdo_waiter *notified;
+
+	assert_on_flusher_thread(flusher, __func__);
+
+	notified = vdo_waitq_dequeue_waiter(&flusher->notifiers);
+	vdo_waitq_enqueue_waiter(&flusher->pending_flushes, notified);
+	vdo_complete_flushes(flusher);
+
+	if (!vdo_waitq_has_waiters(&flusher->notifiers))
+		return;
+
+	notify_flush(flusher);
+}
+
+/**
+ * flush_packer_callback() - Advance the packer's flush generation.
+ * @completion: The flusher completion.
+ *
+ * Runs once increment_generation() has visited the last logical zone, and then hands off to
+ * finish_notification() on the flusher thread. This callback is registered in
+ * increment_generation().
+ */
+static void flush_packer_callback(struct vdo_completion *completion)
+{
+	struct flusher *flusher = as_flusher(completion);
+	struct vdo *vdo = flusher->vdo;
+
+	vdo_increment_packer_flush_generation(vdo->packer);
+	vdo_launch_completion_callback(completion, finish_notification, flusher->thread_id);
+}
+
+/**
+ * increment_generation() - Increment the flush generation in a logical zone.
+ * @completion: The flusher as a completion.
+ *
+ * Notifies the zone recorded in the flusher of the generation being announced. If another logical
+ * zone follows, the completion is relaunched on that zone's thread; after the last zone it moves
+ * on to flush_packer_callback() on the flusher thread. This callback is registered both in
+ * notify_flush() and in itself.
+ */
+static void increment_generation(struct vdo_completion *completion)
+{
+	struct flusher *flusher = as_flusher(completion);
+	struct logical_zone *zone = flusher->logical_zone_to_notify;
+	struct logical_zone *next_zone;
+
+	vdo_increment_logical_zone_flush_generation(zone, flusher->notify_generation);
+
+	next_zone = zone->next;
+	if (next_zone != NULL) {
+		flusher->logical_zone_to_notify = next_zone;
+		vdo_launch_completion_callback(completion, increment_generation,
+					       next_zone->thread_id);
+		return;
+	}
+
+	vdo_launch_completion_callback(completion, flush_packer_callback, flusher->thread_id);
+}
+
+/**
+ * notify_flush() - Launch a flush notification.
+ * @flusher: The flusher doing the notification.
+ *
+ * Takes the generation of the first flush on the notifiers queue (which stays queued) and starts
+ * walking the logical zones, beginning with zone 0, via increment_generation().
+ */
+static void notify_flush(struct flusher *flusher)
+{
+	struct vdo_waiter *first = vdo_waitq_get_first_waiter(&flusher->notifiers);
+	struct vdo_flush *flush = vdo_waiter_as_flush(first);
+	struct logical_zone *first_zone = &flusher->vdo->logical_zones->zones[0];
+
+	flusher->notify_generation = flush->flush_generation;
+	flusher->logical_zone_to_notify = first_zone;
+	flusher->completion.requeue = true;
+	vdo_launch_completion_callback(&flusher->completion, increment_generation,
+				       first_zone->thread_id);
+}
+
+/**
+ * flush_vdo() - Begin processing a flush request on the flusher thread.
+ * @completion: A flush request (as a vdo_completion)
+ *
+ * Stamps the request with the current flush generation (advancing the flusher's counter), queues
+ * it behind any earlier requests awaiting notification, and starts notifying if no notification
+ * was already under way. If the flusher is not in normal operation, the vdo is put into read-only
+ * mode and the request is completed immediately instead.
+ *
+ * This callback is registered in launch_flush().
+ */
+static void flush_vdo(struct vdo_completion *completion)
+{
+	struct vdo_flush *flush = completion_as_vdo_flush(completion);
+	struct flusher *flusher = completion->vdo->flusher;
+	bool notifier_idle;
+	int result;
+
+	assert_on_flusher_thread(flusher, __func__);
+	result = VDO_ASSERT(vdo_is_state_normal(&flusher->state),
+			    "flusher is in normal operation");
+	if (result != VDO_SUCCESS) {
+		vdo_enter_read_only_mode(flusher->vdo, result);
+		vdo_complete_flush(flush);
+		return;
+	}
+
+	flush->flush_generation = flusher->flush_generation;
+	flusher->flush_generation++;
+
+	/* Sample the queue before enqueueing: only the first waiter starts a notification. */
+	notifier_idle = !vdo_waitq_has_waiters(&flusher->notifiers);
+	vdo_waitq_enqueue_waiter(&flusher->notifiers, &flush->waiter);
+	if (notifier_idle)
+		notify_flush(flusher);
+}
+
+/**
+ * check_for_drain_complete() - Check whether the flusher has drained.
+ * @flusher: The flusher.
+ *
+ * A draining flusher is finished once it has no pending flushes and no flush bios are waiting for
+ * a vdo_flush. Does nothing if the flusher is not draining.
+ */
+static void check_for_drain_complete(struct flusher *flusher)
+{
+	bool drained;
+
+	if (!vdo_is_state_draining(&flusher->state))
+		return;
+
+	if (vdo_waitq_has_waiters(&flusher->pending_flushes))
+		return;
+
+	/* The bio list is shared with vdo_launch_flush() callers, so examine it under the lock. */
+	spin_lock(&flusher->lock);
+	drained = bio_list_empty(&flusher->waiting_flush_bios);
+	spin_unlock(&flusher->lock);
+
+	if (!drained)
+		return;
+
+	vdo_finish_draining(&flusher->state);
+}
+
+/**
+ * vdo_complete_flushes() - Attempt to complete any flushes which might have finished.
+ * @flusher: The flusher.
+ *
+ * Finds the oldest active generation across all logical zones and completes, in order, every
+ * pending flush whose generation is older than that. If a pending flush must still wait, the
+ * function returns at once without checking whether a drain has completed.
+ */
+void vdo_complete_flushes(struct flusher *flusher)
+{
+	sequence_number_t oldest_active_generation = U64_MAX;
+	struct logical_zone *zone;
+
+	assert_on_flusher_thread(flusher, __func__);
+
+	zone = &flusher->vdo->logical_zones->zones[0];
+	while (zone != NULL) {
+		oldest_active_generation =
+			min(oldest_active_generation,
+			    READ_ONCE(zone->oldest_active_generation));
+		zone = zone->next;
+	}
+
+	while (vdo_waitq_has_waiters(&flusher->pending_flushes)) {
+		struct vdo_waiter *waiter =
+			vdo_waitq_get_first_waiter(&flusher->pending_flushes);
+		struct vdo_flush *flush = vdo_waiter_as_flush(waiter);
+
+		/* Some zone still has activity from this generation or an earlier one. */
+		if (flush->flush_generation >= oldest_active_generation)
+			return;
+
+		VDO_ASSERT_LOG_ONLY((flush->flush_generation ==
+				     flusher->first_unacknowledged_generation),
+				    "acknowledged next expected flush, %llu, was: %llu",
+				    (unsigned long long) flusher->first_unacknowledged_generation,
+				    (unsigned long long) flush->flush_generation);
+		vdo_waitq_dequeue_waiter(&flusher->pending_flushes);
+		vdo_complete_flush(flush);
+		flusher->first_unacknowledged_generation++;
+	}
+
+	check_for_drain_complete(flusher);
+}
+
+/**
+ * vdo_dump_flusher() - Dump the flusher, in a thread-unsafe fashion.
+ * @flusher: The flusher.
+ *
+ * Logs the two generation counters and whether each of the two wait queues is empty.
+ */
+void vdo_dump_flusher(const struct flusher *flusher)
+{
+	const char *notifiers_state;
+	const char *pending_state;
+
+	vdo_log_info("struct flusher");
+	vdo_log_info(" flush_generation=%llu first_unacknowledged_generation=%llu",
+		     (unsigned long long) flusher->flush_generation,
+		     (unsigned long long) flusher->first_unacknowledged_generation);
+
+	notifiers_state = (vdo_waitq_has_waiters(&flusher->notifiers) ? "not empty" : "empty");
+	pending_state = (vdo_waitq_has_waiters(&flusher->pending_flushes) ? "not empty" : "empty");
+	vdo_log_info(" notifiers queue is %s; pending_flushes queue is %s",
+		     notifiers_state, pending_state);
+}
+
+/**
+ * initialize_flush() - Initialize a vdo_flush structure.
+ * @flush: The flush to initialize.
+ * @vdo: The vdo being flushed.
+ *
+ * Resets the flush's bio list and moves every bio from the flusher's waiting_flush_bios list onto
+ * it, leaving the flusher's list empty. The caller MUST already hold the flusher's lock.
+ */
+static void initialize_flush(struct vdo_flush *flush, struct vdo *vdo)
+{
+	struct bio_list *waiting = &vdo->flusher->waiting_flush_bios;
+
+	bio_list_init(&flush->bios);
+	bio_list_merge(&flush->bios, waiting);
+	bio_list_init(waiting);
+}
+
+/**
+ * launch_flush() - Send a prepared flush request to the packer thread for processing.
+ * @flush: The flush request, already holding the bios it covers.
+ *
+ * flush_vdo() is registered as both the callback and the error handler. The packer thread is the
+ * thread vdo_make_flusher() records as the flusher thread.
+ */
+static void launch_flush(struct vdo_flush *flush)
+{
+	struct vdo_completion *completion = &flush->completion;
+	thread_id_t packer_thread = completion->vdo->thread_config.packer_thread;
+
+	vdo_prepare_completion(completion, flush_vdo, flush_vdo, packer_thread, NULL);
+	vdo_enqueue_completion(completion, VDO_DEFAULT_Q_FLUSH_PRIORITY);
+}
+
+/**
+ * vdo_launch_flush() - Function called to start processing a flush request.
+ * @vdo: The vdo.
+ * @bio: The bio containing an empty flush request.
+ *
+ * This is called when we receive an empty flush bio from the block layer, and before acknowledging
+ * a non-empty bio with the FUA flag set.
+ *
+ * The bio is always added to the flusher's waiting list. If a vdo_flush could be allocated without
+ * waiting, it takes over every waiting bio and is launched; otherwise the bio stays on the list
+ * until release_flush() picks it up.
+ */
+void vdo_launch_flush(struct vdo *vdo, struct bio *bio)
+{
+	struct flusher *flusher = vdo->flusher;
+	/*
+	 * Try to allocate a vdo_flush to represent the flush request. If the allocation fails,
+	 * we'll deal with it later.
+	 */
+	struct vdo_flush *flush = mempool_alloc(flusher->flush_pool, GFP_NOWAIT);
+	const struct admin_state_code *code = vdo_get_admin_state_code(&flusher->state);
+
+	VDO_ASSERT_LOG_ONLY(!code->quiescent, "Flushing not allowed in state %s",
+			    code->name);
+
+	spin_lock(&flusher->lock);
+
+	/* We have a new bio to start. Add it to the list. */
+	bio_list_add(&flusher->waiting_flush_bios, bio);
+
+	/* If we got a vdo_flush, capture all of the waiting bios in it while we hold the lock. */
+	if (flush != NULL)
+		initialize_flush(flush, vdo);
+
+	spin_unlock(&flusher->lock);
+
+	/* Finish launching the flushes. */
+	if (flush != NULL)
+		launch_flush(flush);
+}
+
+/**
+ * release_flush() - Release a vdo_flush structure that has completed its work.
+ * @flush: The completed flush structure to re-use or free.
+ *
+ * If flush bios accumulated while no vdo_flush could be allocated for them, the released structure
+ * is re-used at once to launch them. Otherwise it is handed back to the flush pool.
+ */
+static void release_flush(struct vdo_flush *flush)
+{
+	struct flusher *flusher = flush->completion.vdo->flusher;
+	bool relaunch_flush;
+
+	spin_lock(&flusher->lock);
+	relaunch_flush = !bio_list_empty(&flusher->waiting_flush_bios);
+	if (relaunch_flush) {
+		/* We have flushes to start. Capture them in a flush request. */
+		initialize_flush(flush, flusher->vdo);
+	}
+	spin_unlock(&flusher->lock);
+
+	if (!relaunch_flush) {
+		mempool_free(flush, flusher->flush_pool);
+		return;
+	}
+
+	/* Finish launching the flushes. */
+	launch_flush(flush);
+}
+
+/**
+ * vdo_complete_flush_callback() - Function called to complete and free a flush request, registered
+ * in vdo_complete_flush().
+ * @completion: The flush request.
+ *
+ * Every bio held by the request is counted, redirected to the backing device, and submitted; the
+ * vdo_flush itself is then released.
+ */
+static void vdo_complete_flush_callback(struct vdo_completion *completion)
+{
+	struct vdo_flush *flush = completion_as_vdo_flush(completion);
+	struct vdo *vdo = completion->vdo;
+	struct bio *bio;
+
+	for (bio = bio_list_pop(&flush->bios); bio != NULL; bio = bio_list_pop(&flush->bios)) {
+		/*
+		 * We're not acknowledging this bio now, but we'll never touch it again, so this is
+		 * the last chance to account for it.
+		 */
+		vdo_count_bios(&vdo->stats.bios_acknowledged, bio);
+
+		/* Update the device, and send it on down... */
+		bio_set_dev(bio, vdo_get_backing_device(vdo));
+		atomic64_inc(&vdo->stats.flush_out);
+		submit_bio_noacct(bio);
+	}
+
+	/*
+	 * Release the flush structure, freeing it, re-using it as the spare, or using it to launch
+	 * any flushes that had to wait when allocations failed.
+	 */
+	release_flush(flush);
+}
+
+/**
+ * select_bio_queue() - Select the bio queue on which to finish a flush request.
+ * @flusher: The flusher finishing the request.
+ *
+ * With a single bio thread, that thread is always used. Otherwise the rotor advances to the next
+ * bio thread each time the count of flushes sent to the current one reaches the configured
+ * rotation interval.
+ *
+ * Return: The ID of the bio thread to use.
+ */
+static thread_id_t select_bio_queue(struct flusher *flusher)
+{
+	struct vdo *vdo = flusher->vdo;
+	zone_count_t bio_threads = vdo->thread_config.bio_thread_count;
+	int interval;
+
+	if (bio_threads == 1)
+		return vdo->thread_config.bio_threads[0];
+
+	interval = vdo->device_config->thread_counts.bio_rotation_interval;
+	if (flusher->flush_count != interval) {
+		flusher->flush_count++;
+	} else {
+		/* This flush is the first one on the next queue. */
+		flusher->flush_count = 1;
+		flusher->bio_queue_rotor = ((flusher->bio_queue_rotor + 1) % bio_threads);
+	}
+
+	return vdo->thread_config.bio_threads[flusher->bio_queue_rotor];
+}
+
+/**
+ * vdo_complete_flush() - Complete and free a vdo flush request.
+ * @flush: The flush request.
+ *
+ * The work itself is done by vdo_complete_flush_callback(), which is enqueued here on the bio
+ * thread chosen by select_bio_queue().
+ */
+static void vdo_complete_flush(struct vdo_flush *flush)
+{
+	struct vdo_completion *completion = &flush->completion;
+	thread_id_t bio_thread = select_bio_queue(completion->vdo->flusher);
+
+	vdo_prepare_completion(completion, vdo_complete_flush_callback,
+			       vdo_complete_flush_callback, bio_thread, NULL);
+	vdo_enqueue_completion(completion, BIO_Q_FLUSH_PRIORITY);
+}
+
+/**
+ * initiate_drain() - Initiate a drain.
+ * @state: The admin state embedded in the flusher being drained.
+ *
+ * Implements vdo_admin_initiator_fn. The drain may complete immediately if the flusher is already
+ * idle.
+ */
+static void initiate_drain(struct admin_state *state)
+{
+	struct flusher *flusher = container_of(state, struct flusher, state);
+
+	check_for_drain_complete(flusher);
+}
+
+/**
+ * vdo_drain_flusher() - Drain the flusher.
+ * @flusher: The flusher to drain.
+ * @completion: The completion to finish when the flusher has drained.
+ *
+ * Drains the flusher by preventing any more VIOs from entering the flusher and then flushing. The
+ * flusher will be left in the suspended state.
+ *
+ * Must be called on the flusher thread. The drain is considered finished by
+ * check_for_drain_complete() once no flushes are pending and no flush bios are waiting.
+ */
+void vdo_drain_flusher(struct flusher *flusher, struct vdo_completion *completion)
+{
+ assert_on_flusher_thread(flusher, __func__);
+ vdo_start_draining(&flusher->state, VDO_ADMIN_STATE_SUSPENDING, completion,
+ initiate_drain);
+}
+
+/**
+ * vdo_resume_flusher() - Resume a flusher which has been suspended.
+ * @flusher: The flusher to resume.
+ * @parent: The completion to finish when the flusher has resumed.
+ *
+ * Must be called on the flusher thread. The parent is continued with whatever
+ * vdo_resume_if_quiescent() returns for the flusher's state.
+ */
+void vdo_resume_flusher(struct flusher *flusher, struct vdo_completion *parent)
+{
+ assert_on_flusher_thread(flusher, __func__);
+ vdo_continue_completion(parent, vdo_resume_if_quiescent(&flusher->state));
+}
diff --git a/drivers/md/dm-vdo/flush.h b/drivers/md/dm-vdo/flush.h
new file mode 100644
index 000000000000..97252d6656e0
--- /dev/null
+++ b/drivers/md/dm-vdo/flush.h
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#ifndef VDO_FLUSH_H
+#define VDO_FLUSH_H
+
+#include "funnel-workqueue.h"
+#include "types.h"
+#include "vio.h"
+#include "wait-queue.h"
+
+/*
+ * A marker for tracking which journal entries are affected by a flush request. One vdo_flush may
+ * carry several flush bios: it collects every bio that was waiting when it was initialized.
+ */
+struct vdo_flush {
+ /* The completion for enqueueing this flush request. */
+ struct vdo_completion completion;
+ /* The flush bios covered by this request */
+ struct bio_list bios;
+ /* The wait queue entry for this flush */
+ struct vdo_waiter waiter;
+ /* Which flush this struct represents */
+ sequence_number_t flush_generation;
+};
+
+struct flusher;
+
+int __must_check vdo_make_flusher(struct vdo *vdo);
+
+void vdo_free_flusher(struct flusher *flusher);
+
+thread_id_t __must_check vdo_get_flusher_thread_id(struct flusher *flusher);
+
+void vdo_complete_flushes(struct flusher *flusher);
+
+void vdo_dump_flusher(const struct flusher *flusher);
+
+void vdo_launch_flush(struct vdo *vdo, struct bio *bio);
+
+void vdo_drain_flusher(struct flusher *flusher, struct vdo_completion *completion);
+
+void vdo_resume_flusher(struct flusher *flusher, struct vdo_completion *parent);
+
+#endif /* VDO_FLUSH_H */
diff --git a/drivers/md/dm-vdo/funnel-queue.c b/drivers/md/dm-vdo/funnel-queue.c
new file mode 100644
index 000000000000..a63b2f2bfd7d
--- /dev/null
+++ b/drivers/md/dm-vdo/funnel-queue.c
@@ -0,0 +1,170 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#include "funnel-queue.h"
+
+#include "cpu.h"
+#include "memory-alloc.h"
+#include "permassert.h"
+
+/*
+ * Allocate a funnel queue and return it through queue_ptr. The new queue contains only its stub
+ * entry. Returns VDO_SUCCESS or the allocation error, in which case *queue_ptr is not written.
+ */
+int vdo_make_funnel_queue(struct funnel_queue **queue_ptr)
+{
+	struct funnel_queue *queue;
+	int result = vdo_allocate(1, struct funnel_queue, "funnel queue", &queue);
+
+	if (result != VDO_SUCCESS)
+		return result;
+
+	/*
+	 * Start with the stub as the only entry, at both ends of the queue. From here on, neither
+	 * queue->oldest nor queue->newest is ever null.
+	 */
+	queue->stub.next = NULL;
+	queue->oldest = &queue->stub;
+	queue->newest = &queue->stub;
+
+	*queue_ptr = queue;
+	return VDO_SUCCESS;
+}
+
+/*
+ * Free a funnel queue. Only the queue structure itself is released; entries are owned by the
+ * callers and are not touched.
+ */
+void vdo_free_funnel_queue(struct funnel_queue *queue)
+{
+ vdo_free(queue);
+}
+
+/*
+ * Find the oldest retrievable (non-stub) entry without removing it. Returns NULL when the queue is
+ * empty or when a producer is part-way through adding an entry so that the needed link is not yet
+ * visible. This is not a pure read: it may advance queue->oldest past the stub, and it may put the
+ * stub back on the queue so that the returned entry always has a successor.
+ */
+static struct funnel_queue_entry *get_oldest(struct funnel_queue *queue)
+{
+ /*
+ * Barrier requirements: We need a read barrier between reading a "next" field pointer
+ * value and reading anything it points to. There's an accompanying barrier in
+ * vdo_funnel_queue_put() between its caller setting up the entry and making it visible.
+ */
+ struct funnel_queue_entry *oldest = queue->oldest;
+ struct funnel_queue_entry *next = READ_ONCE(oldest->next);
+
+ if (oldest == &queue->stub) {
+ /*
+ * When the oldest entry is the stub and it has no successor, the queue is
+ * logically empty.
+ */
+ if (next == NULL)
+ return NULL;
+ /*
+ * The stub entry has a successor, so the stub can be dequeued and ignored without
+ * breaking the queue invariants.
+ */
+ oldest = next;
+ queue->oldest = oldest;
+ next = READ_ONCE(oldest->next);
+ }
+
+ /*
+ * We have a non-stub candidate to dequeue. If it lacks a successor, we'll need to put the
+ * stub entry back on the queue first.
+ */
+ if (next == NULL) {
+ struct funnel_queue_entry *newest = READ_ONCE(queue->newest);
+
+ if (oldest != newest) {
+ /*
+ * Another thread has already swung queue->newest atomically, but not yet
+ * assigned previous->next. The queue is really still empty.
+ */
+ return NULL;
+ }
+
+ /*
+ * Put the stub entry back on the queue, ensuring a successor will eventually be
+ * seen.
+ */
+ vdo_funnel_queue_put(queue, &queue->stub);
+
+ /* Check again for a successor. */
+ next = READ_ONCE(oldest->next);
+ if (next == NULL) {
+ /*
+ * We lost a race with a producer who swapped queue->newest before we did,
+ * but who hasn't yet updated previous->next. Try again later.
+ */
+ return NULL;
+ }
+ }
+
+ return oldest;
+}
+
+/*
+ * Poll a queue, removing the oldest entry if the queue is not empty. This function must only be
+ * called from a single consumer thread.
+ *
+ * Returns the removed entry with its "next" pointer reset to NULL, or NULL if get_oldest() found
+ * nothing retrievable.
+ */
+struct funnel_queue_entry *vdo_funnel_queue_poll(struct funnel_queue *queue)
+{
+ struct funnel_queue_entry *oldest = get_oldest(queue);
+
+ if (oldest == NULL)
+ return oldest;
+
+ /*
+ * Dequeue the oldest entry and return it. Only one consumer thread may call this function,
+ * so no locking, atomic operations, or fences are needed; queue->oldest is owned by the
+ * consumer and oldest->next is never used by a producer thread after it is swung from NULL
+ * to non-NULL.
+ */
+ queue->oldest = READ_ONCE(oldest->next);
+ /*
+ * Make sure the caller sees the proper stored data for this entry. Since we've already
+ * fetched the entry pointer we stored in "queue->oldest", this also ensures that on entry
+ * to the next call we'll properly see the dependent data.
+ */
+ smp_rmb();
+ /*
+ * If "oldest" is a very light-weight work item, we'll be looking for the next one very
+ * soon, so prefetch it now.
+ */
+ uds_prefetch_address(queue->oldest, true);
+ WRITE_ONCE(oldest->next, NULL);
+ return oldest;
+}
+
+/*
+ * Report whether the funnel queue has no retrievable entry. A queue caught in a transition state,
+ * with one or more entries being added so that the list view is incomplete, is reported as empty.
+ *
+ * This uses get_oldest(), which may update the consumer's end of the queue, so it is not a pure
+ * read of the queue.
+ */
+bool vdo_is_funnel_queue_empty(struct funnel_queue *queue)
+{
+	struct funnel_queue_entry *oldest = get_oldest(queue);
+
+	return (oldest == NULL);
+}
+
+/*
+ * Report whether the funnel queue is idle. A queue with entries available for retrieval is not
+ * idle. Neither is a queue in a transition state, with one or more entries being added so that
+ * the list view is incomplete, even though vdo_funnel_queue_poll() may be unable to retrieve
+ * anything from it yet.
+ */
+bool vdo_is_funnel_queue_idle(struct funnel_queue *queue)
+{
+	/*
+	 * If oldest is not the stub then some other entry is present, though it can't be retrieved
+	 * yet if its next pointer is still NULL.
+	 */
+	bool consumer_at_stub = (queue->oldest == &queue->stub);
+
+	/*
+	 * If oldest is the stub but newest has been moved off it by _put(), then either a
+	 * retrievable entry is present or an entry is in the middle of being added. Whether it is
+	 * retrievable depends on stub.next having been updated and made visible to us, which does
+	 * not matter for idleness. Due to the memory ordering in _put(), the update to newest is
+	 * visible to us at the same time or sooner.
+	 */
+	return consumer_at_stub && (READ_ONCE(queue->newest) == &queue->stub);
+}
diff --git a/drivers/md/dm-vdo/funnel-queue.h b/drivers/md/dm-vdo/funnel-queue.h
new file mode 100644
index 000000000000..bde0f1deff98
--- /dev/null
+++ b/drivers/md/dm-vdo/funnel-queue.h
@@ -0,0 +1,110 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#ifndef VDO_FUNNEL_QUEUE_H
+#define VDO_FUNNEL_QUEUE_H
+
+#include <linux/atomic.h>
+#include <linux/cache.h>
+
+/*
+ * A funnel queue is a simple (almost) lock-free queue that accepts entries from multiple threads
+ * (multi-producer) and delivers them to a single thread (single-consumer). "Funnel" is an attempt
+ * to evoke the image of requests from more than one producer being "funneled down" to a single
+ * consumer.
+ *
+ * This is an unsynchronized but thread-safe data structure when used as intended. There is no
+ * mechanism to ensure that only one thread is consuming from the queue. If more than one thread
+ * attempts to consume from the queue, the resulting behavior is undefined. Clients must not
+ * directly access or manipulate the internals of the queue, which are only exposed for the purpose
+ * of allowing the very simple enqueue operation to be inlined.
+ *
+ * The implementation requires that a funnel_queue_entry structure (a link pointer) is embedded in
+ * the queue entries, and pointers to those structures are used exclusively by the queue. No macros
+ * are defined to template the queue, so the offset of the funnel_queue_entry in the records placed
+ * in the queue must all be the same so the client can derive their structure pointer from the
+ * entry pointer returned by vdo_funnel_queue_poll().
+ *
+ * Callers are wholly responsible for allocating and freeing the entries. Entries may be freed as
+ * soon as they are returned since this queue is not susceptible to the "ABA problem" present in
+ * many lock-free data structures. The queue is dynamically allocated to ensure cache-line
+ * alignment, but no other dynamic allocation is used.
+ *
+ * The algorithm is not actually 100% lock-free. There is a single point in vdo_funnel_queue_put()
+ * at which a preempted producer will prevent the consumer from seeing items added to the queue by
+ * later producers, and only if the queue is short enough or the consumer fast enough for it to
+ * reach what was the end of the queue at the time of the preemption.
+ *
+ * The consumer function, vdo_funnel_queue_poll(), will return NULL when the queue is empty. To
+ * wait for data to consume, spin (if safe) or combine the queue with a struct event_count to
+ * signal the presence of new entries.
+ */
+
+/*
+ * This queue link structure must be embedded in client entries. The link is set to NULL when an
+ * entry is put on the queue and again when it is returned by vdo_funnel_queue_poll().
+ */
+struct funnel_queue_entry {
+ /* The next (newer) entry in the queue. */
+ struct funnel_queue_entry *next;
+};
+
+/*
+ * The dynamically allocated queue structure, which is allocated on a cache line boundary so the
+ * producer and consumer fields in the structure will land on separate cache lines. This should be
+ * considered opaque but it is exposed here so vdo_funnel_queue_put() can be inlined.
+ */
+struct __aligned(L1_CACHE_BYTES) funnel_queue {
+ /*
+ * The producers' end of the queue, an atomically exchanged pointer that will never be
+ * NULL.
+ */
+ struct funnel_queue_entry *newest;
+
+ /* The consumer's end of the queue, which is owned by the consumer and never NULL. */
+ struct funnel_queue_entry *oldest __aligned(L1_CACHE_BYTES);
+
+ /* A dummy entry used to provide the non-NULL invariants above. */
+ struct funnel_queue_entry stub;
+};
+
+int __must_check vdo_make_funnel_queue(struct funnel_queue **queue_ptr);
+
+void vdo_free_funnel_queue(struct funnel_queue *queue);
+
+/*
+ * Put an entry on the end of the queue.
+ *
+ * The entry pointer must be to the struct funnel_queue_entry embedded in the caller's data
+ * structure. The caller must be able to derive the address of the start of their data structure
+ * from the pointer that is passed in here, so every entry in the queue must have the struct
+ * funnel_queue_entry at the same offset within the client's structure.
+ */
+static inline void vdo_funnel_queue_put(struct funnel_queue *queue,
+ struct funnel_queue_entry *entry)
+{
+ struct funnel_queue_entry *previous;
+
+ /*
+ * Barrier requirements: All stores relating to the entry ("next" pointer, containing data
+ * structure fields) must happen before the previous->next store making it visible to the
+ * consumer. Also, the entry's "next" field initialization to NULL must happen before any
+ * other producer threads can see the entry (the xchg) and try to update the "next" field.
+ *
+ * xchg implements a full barrier.
+ */
+ WRITE_ONCE(entry->next, NULL);
+ previous = xchg(&queue->newest, entry);
+ /*
+ * Preemptions between these two statements hide the rest of the queue from the consumer,
+ * preventing consumption until the following assignment runs.
+ */
+ WRITE_ONCE(previous->next, entry);
+}
+
+struct funnel_queue_entry *__must_check vdo_funnel_queue_poll(struct funnel_queue *queue);
+
+bool __must_check vdo_is_funnel_queue_empty(struct funnel_queue *queue);
+
+bool __must_check vdo_is_funnel_queue_idle(struct funnel_queue *queue);
+
+#endif /* VDO_FUNNEL_QUEUE_H */
diff --git a/drivers/md/dm-vdo/funnel-workqueue.c b/drivers/md/dm-vdo/funnel-workqueue.c
new file mode 100644
index 000000000000..ae11941c90a9
--- /dev/null
+++ b/drivers/md/dm-vdo/funnel-workqueue.c
@@ -0,0 +1,638 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#include "funnel-workqueue.h"
+
+#include <linux/atomic.h>
+#include <linux/cache.h>
+#include <linux/completion.h>
+#include <linux/err.h>
+#include <linux/kthread.h>
+#include <linux/percpu.h>
+
+#include "funnel-queue.h"
+#include "logger.h"
+#include "memory-alloc.h"
+#include "numeric.h"
+#include "permassert.h"
+#include "string-utils.h"
+
+#include "completion.h"
+#include "status-codes.h"
+
+static DEFINE_PER_CPU(unsigned int, service_queue_rotor);
+
+/**
+ * DOC: Work queue definition.
+ *
+ * There are two types of work queues: simple, with one worker thread, and round-robin, which uses
+ * a group of the former to do the work, and assigns work to them in round-robin fashion (roughly).
+ * Externally, both are represented via the same common sub-structure, though there's actually not
+ * a great deal of overlap between the two types internally.
+ */
+struct vdo_work_queue {
+ /* Name of just the work queue (e.g., "cpuQ12") */
+ char *name;
+ bool round_robin_mode; /* true when embedded in a round_robin_work_queue, else a simple_work_queue */
+ struct vdo_thread *owner; /* supplied at creation; returned by vdo_get_work_queue_owner() */
+ /* Life cycle functions, etc */
+ const struct vdo_work_queue_type *type;
+};
+
+struct simple_work_queue {
+ struct vdo_work_queue common; /* generic part; as_simple_work_queue() maps it back to this structure */
+ struct funnel_queue *priority_lists[VDO_WORK_Q_MAX_PRIORITY + 1]; /* one funnel queue per priority, indexed by completion priority */
+ void *private; /* context passed to the queue type's start and finish hooks */
+
+ /*
+ * The fields above are unchanged after setup but often read, and are good candidates for
+ * caching -- and if the max priority is 2, just fit in one x86-64 cache line if aligned.
+ * The fields below are often modified as we sleep and wake, so we want a separate cache
+ * line for performance.
+ */
+
+ /* Any (0 or 1) worker threads waiting for new work to do */
+ wait_queue_head_t waiting_worker_threads ____cacheline_aligned;
+ /* Hack to reduce wakeup calls if the worker thread is running */
+ atomic_t idle; /* set to 1 by the worker before it may sleep; cleared by the worker or by a waking producer */
+
+ /* These are infrequently used so in terms of performance we don't care where they land. */
+ struct task_struct *thread; /* the worker thread; NULL once the queue has been finished */
+ /* Notify creator once worker has initialized */
+ struct completion *started;
+};
+
+struct round_robin_work_queue {
+ struct vdo_work_queue common; /* generic part; as_round_robin_work_queue() maps it back to this structure */
+ struct simple_work_queue **service_queues; /* table of the simple queues that actually run the work */
+ unsigned int num_service_queues; /* number of valid entries in service_queues */
+};
+
+/*
+ * Convert a generic work queue pointer into the simple work queue that contains it. A NULL
+ * pointer is passed through as NULL. No check is made that the queue really is a simple queue.
+ */
+static inline struct simple_work_queue *as_simple_work_queue(struct vdo_work_queue *queue)
+{
+	if (queue == NULL)
+		return NULL;
+
+	return container_of(queue, struct simple_work_queue, common);
+}
+
+/*
+ * Convert a generic work queue pointer into the round-robin work queue that contains it. A NULL
+ * pointer is passed through as NULL. No check is made that the queue is in round-robin mode.
+ */
+static inline struct round_robin_work_queue *as_round_robin_work_queue(struct vdo_work_queue *queue)
+{
+	if (queue == NULL)
+		return NULL;
+
+	return container_of(queue, struct round_robin_work_queue, common);
+}
+
+/* Processing normal completions. */
+
+/*
+ * Fetch the next queued completion, or return NULL if every priority list polled is empty.
+ *
+ * Each list is polled once, starting at the queue type's maximum priority and working down to
+ * priority 0. Since this is a single pass, a completion can land on a higher-priority list just
+ * after that list was polled, and a lower-priority completion may then be returned ahead of it;
+ * the higher-priority one is found by the next call. Strict priority ordering would require
+ * changing this function.
+ */
+static struct vdo_completion *poll_for_completion(struct simple_work_queue *queue)
+{
+	int priority = queue->common.type->max_priority;
+
+	while (priority >= 0) {
+		struct funnel_queue_entry *entry;
+
+		entry = vdo_funnel_queue_poll(queue->priority_lists[priority]);
+		if (entry != NULL)
+			return container_of(entry, struct vdo_completion,
+					    work_queue_entry_link);
+
+		priority--;
+	}
+
+	return NULL;
+}
+
+/*
+ * Add a completion to one of a simple work queue's priority lists and, if the worker thread may
+ * be idle, wake it.
+ *
+ * A completion carrying VDO_WORK_Q_DEFAULT_PRIORITY is given the queue type's default priority.
+ * A priority above the queue type's maximum fails an assertion and is replaced with priority 0.
+ */
+static void enqueue_work_queue_completion(struct simple_work_queue *queue,
+					  struct vdo_completion *completion)
+{
+	const struct vdo_work_queue_type *type = queue->common.type;
+	int in_range;
+
+	VDO_ASSERT_LOG_ONLY(completion->my_queue == NULL,
+			    "completion %px (fn %px) to enqueue (%px) is not already queued (%px)",
+			    completion, completion->callback, queue, completion->my_queue);
+
+	if (completion->priority == VDO_WORK_Q_DEFAULT_PRIORITY)
+		completion->priority = type->default_priority;
+
+	in_range = VDO_ASSERT(completion->priority <= type->max_priority,
+			      "priority is in range for queue");
+	if (in_range != VDO_SUCCESS)
+		completion->priority = 0;
+
+	completion->my_queue = &queue->common;
+
+	/* The funnel queue takes care of synchronizing the put itself. */
+	vdo_funnel_queue_put(queue->priority_lists[completion->priority],
+			     &completion->work_queue_entry_link);
+
+	/*
+	 * Funnel queue synchronization is nothing more than atomic operations, so the simplest
+	 * safe approach would be to wake any waiting thread after every enqueue: even when the
+	 * funnel queue is not empty as an item is added, the consumer is not guaranteed to share
+	 * the producer's view of the queue and may not see it.
+	 *
+	 * That would be wasteful, so we try to keep thread wakeups to a minimum instead. With an
+	 * idle flag and carefully ordered memory barriers, we should be able to tell when the
+	 * worker thread might be asleep or about to sleep. The cmpxchg lets one producer claim
+	 * (against other producers) the job of waking the worker, so that several wakeups are not
+	 * attempted at once.
+	 *
+	 * Reading the flag before the cmpxchg was tuned on some x86 machines that were at hand;
+	 * whether it helps or hurts on other platforms, or other x86 configurations, is untested.
+	 */
+	smp_mb();
+	if ((atomic_read(&queue->idle) == 1) && (atomic_cmpxchg(&queue->idle, 1, 0) == 1)) {
+		/* At most one thread is ever on this wait queue. */
+		wake_up(&queue->waiting_worker_threads);
+	}
+}
+
+/* Call the queue type's start hook, if it has one, with the queue's private context. */
+static void run_start_hook(struct simple_work_queue *queue)
+{
+	const struct vdo_work_queue_type *type = queue->common.type;
+
+	if (type->start == NULL)
+		return;
+
+	type->start(queue->private);
+}
+
+/* Call the queue type's finish hook, if it has one, with the queue's private context. */
+static void run_finish_hook(struct simple_work_queue *queue)
+{
+	const struct vdo_work_queue_type *type = queue->common.type;
+
+	if (type->finish == NULL)
+		return;
+
+	type->finish(queue->private);
+}
+
+/*
+ * Wait for the next completion to process, or until kthread_should_stop indicates that it's time
+ * for us to shut down.
+ *
+ * If kthread_should_stop says it's time to stop but we have pending completions return a
+ * completion.
+ *
+ * Returns the next completion, or NULL only when no completion was found and
+ * kthread_should_stop() is true. On return the thread has been removed from the wait queue and
+ * the idle flag is clear.
+ */
+static struct vdo_completion *wait_for_next_completion(struct simple_work_queue *queue)
+{
+ struct vdo_completion *completion;
+ DEFINE_WAIT(wait);
+
+ while (true) {
+ prepare_to_wait(&queue->waiting_worker_threads, &wait,
+ TASK_INTERRUPTIBLE);
+ /*
+ * Don't set the idle flag until a wakeup will not be lost.
+ *
+ * Force synchronization between setting the idle flag and checking the funnel
+ * queue; the producer side will do them in the reverse order.
+ *
+ * NOTE(review): the sleep below is a plain schedule() with no timeout, so nothing
+ * here would recover from a lost wakeup; the barriers in this function and in
+ * enqueue_work_queue_completion() presumably have to prevent one on their own --
+ * confirm.
+ */
+ atomic_set(&queue->idle, 1);
+ smp_mb(); /* store-load barrier between "idle" and funnel queue */
+
+ completion = poll_for_completion(queue);
+ if (completion != NULL)
+ break;
+
+ /*
+ * We need to check for thread-stop after setting TASK_INTERRUPTIBLE state up
+ * above. Otherwise, schedule() will put the thread to sleep and might miss a
+ * wakeup from kthread_stop() call in vdo_finish_work_queue().
+ */
+ if (kthread_should_stop())
+ break;
+
+ schedule();
+
+ /*
+ * Most of the time when we wake, it should be because there's work to do. If it
+ * was a spurious wakeup, continue looping.
+ */
+ completion = poll_for_completion(queue);
+ if (completion != NULL)
+ break;
+ }
+
+ finish_wait(&queue->waiting_worker_threads, &wait);
+ atomic_set(&queue->idle, 0);
+
+ return completion;
+}
+
+/*
+ * Run a completion that was taken off this queue. The completion's my_queue field is cleared
+ * first, but only when it names this queue as expected; the completion is run either way.
+ */
+static void process_completion(struct simple_work_queue *queue,
+			       struct vdo_completion *completion)
+{
+	int result = VDO_ASSERT(completion->my_queue == &queue->common,
+				"completion %px from queue %px marked as being in this queue (%px)",
+				completion, queue, completion->my_queue);
+
+	if (result == VDO_SUCCESS)
+		completion->my_queue = NULL;
+
+	vdo_run_completion(completion);
+}
+
+/*
+ * The body of a worker thread: run the start hook, process completions until there are none left
+ * and the thread has been told to stop, then run the finish hook.
+ */
+static void service_work_queue(struct simple_work_queue *queue)
+{
+	struct vdo_completion *completion;
+
+	run_start_hook(queue);
+
+	for (;;) {
+		completion = poll_for_completion(queue);
+		if (completion == NULL) {
+			completion = wait_for_next_completion(queue);
+			/* Still nothing to do, so kthread_should_stop() was triggered. */
+			if (completion == NULL)
+				break;
+		}
+
+		process_completion(queue, completion);
+
+		/*
+		 * Yield when the kernel asks us to, so a CPU with other work to do can get to
+		 * it. This speeds up some performance tests; the "other work" may well be other
+		 * VDO threads.
+		 */
+		if (need_resched())
+			cond_resched();
+	}
+
+	run_finish_hook(queue);
+}
+
+/*
+ * Thread function for a work queue's worker thread. The argument is the simple work queue the
+ * thread is to service. Always returns 0.
+ */
+static int work_queue_runner(void *ptr)
+{
+	struct simple_work_queue *work_queue = ptr;
+
+	/* Tell the creator, which is waiting on this completion, that the thread is running. */
+	complete(work_queue->started);
+
+	service_work_queue(work_queue);
+
+	return 0;
+}
+
+/* Creation & teardown */
+
+/*
+ * Release a simple work queue: every slot of its priority list table, its name, and the queue
+ * structure itself. The worker thread is not stopped here.
+ */
+static void free_simple_work_queue(struct simple_work_queue *queue)
+{
+	unsigned int priority = 0;
+
+	while (priority <= VDO_WORK_Q_MAX_PRIORITY) {
+		vdo_free_funnel_queue(queue->priority_lists[priority]);
+		priority++;
+	}
+
+	vdo_free(queue->common.name);
+	vdo_free(queue);
+}
+
+/*
+ * Release a round-robin work queue: each of its service queues, the table that holds them, its
+ * name, and the queue structure itself.
+ */
+static void free_round_robin_work_queue(struct round_robin_work_queue *queue)
+{
+	struct simple_work_queue **services = queue->service_queues;
+	unsigned int service_count = queue->num_service_queues;
+	unsigned int index = 0;
+
+	/* Detach the table from the queue before its contents are released. */
+	queue->service_queues = NULL;
+
+	while (index < service_count) {
+		free_simple_work_queue(services[index]);
+		index++;
+	}
+
+	vdo_free(services);
+	vdo_free(queue->common.name);
+	vdo_free(queue);
+}
+
+/*
+ * Stop a work queue's worker threads and release the queue. A NULL queue is ignored.
+ */
+void vdo_free_work_queue(struct vdo_work_queue *queue)
+{
+	if (queue == NULL)
+		return;
+
+	vdo_finish_work_queue(queue);
+
+	if (!queue->round_robin_mode) {
+		free_simple_work_queue(as_simple_work_queue(queue));
+		return;
+	}
+
+	free_round_robin_work_queue(as_round_robin_work_queue(queue));
+}
+
+/*
+ * Create a simple work queue and start its worker thread.
+ *
+ * The thread is named "<thread_name_prefix>:<name>". This function does not return successfully
+ * until the new thread has signalled that it is running. On success the queue is stored in
+ * *queue_ptr and VDO_SUCCESS is returned; on failure everything allocated here is released and
+ * an error code is returned.
+ */
+static int make_simple_work_queue(const char *thread_name_prefix, const char *name,
+				  struct vdo_thread *owner, void *private,
+				  const struct vdo_work_queue_type *type,
+				  struct simple_work_queue **queue_ptr)
+{
+	DECLARE_COMPLETION_ONSTACK(started);
+	struct simple_work_queue *new_queue;
+	struct task_struct *worker;
+	int priority;
+	int status;
+
+	VDO_ASSERT_LOG_ONLY((type->max_priority <= VDO_WORK_Q_MAX_PRIORITY),
+			    "queue priority count %u within limit %u", type->max_priority,
+			    VDO_WORK_Q_MAX_PRIORITY);
+
+	status = vdo_allocate(1, struct simple_work_queue, "simple work queue", &new_queue);
+	if (status != VDO_SUCCESS)
+		return status;
+
+	new_queue->private = private;
+	new_queue->started = &started;
+	new_queue->common.type = type;
+	new_queue->common.owner = owner;
+	init_waitqueue_head(&new_queue->waiting_worker_threads);
+
+	status = vdo_duplicate_string(name, "queue name", &new_queue->common.name);
+	if (status != VDO_SUCCESS) {
+		vdo_free(new_queue);
+		return -ENOMEM;
+	}
+
+	for (priority = 0; priority <= type->max_priority; priority++) {
+		status = vdo_make_funnel_queue(&new_queue->priority_lists[priority]);
+		if (status == VDO_SUCCESS)
+			continue;
+
+		free_simple_work_queue(new_queue);
+		return status;
+	}
+
+	worker = kthread_run(work_queue_runner, new_queue, "%s:%s", thread_name_prefix,
+			     new_queue->common.name);
+	if (IS_ERR(worker)) {
+		free_simple_work_queue(new_queue);
+		return (int) PTR_ERR(worker);
+	}
+
+	new_queue->thread = worker;
+
+	/*
+	 * Wait until the thread is known to be running VDO code. Without this, a quick
+	 * kthread_stop (due to errors elsewhere) could keep it from ever reaching VDO code,
+	 * skipping the cleanup code.
+	 *
+	 * Eventually that path should simply be made safe as well, at which point this
+	 * synchronization will no longer be needed.
+	 */
+	wait_for_completion(&started);
+
+	*queue_ptr = new_queue;
+	return VDO_SUCCESS;
+}
+
+/**
+ * vdo_make_work_queue() - Create a work queue; if multiple threads are requested, completions will
+ * be distributed to them in round-robin fashion.
+ * @thread_name_prefix: The first part of each worker thread's name; threads are named
+ *                      "<prefix>:<queue name>".
+ * @name: The name of the queue. On the round-robin path, each service queue is named by appending
+ *        its index to this name, limited to what fits in TASK_COMM_LEN bytes.
+ * @owner: The vdo_thread recorded as the owner of the queue and of every service queue.
+ * @type: The queue type used for every worker.
+ * @thread_count: The number of worker threads. Exactly 1 creates a simple queue; any other value
+ *                takes the round-robin path.
+ * @thread_privates: Private context pointers indexed by thread number, or NULL to give every
+ *                   worker a NULL context.
+ * @queue_ptr: Where the new queue is stored.
+ *
+ * Each queue is associated with a struct vdo_thread which has a single vdo thread id. Regardless
+ * of the actual number of queues and threads allocated here, code outside of the queue
+ * implementation will treat this as a single zone.
+ *
+ * NOTE(review): on the round-robin path *queue_ptr is assigned before the service queues are
+ * created; if one of them fails, the queue is handed to vdo_free_work_queue() via vdo_forget(),
+ * which presumably also resets *queue_ptr to NULL -- confirm.
+ *
+ * Return: VDO_SUCCESS or an error code.
+ */
+int vdo_make_work_queue(const char *thread_name_prefix, const char *name,
+ struct vdo_thread *owner, const struct vdo_work_queue_type *type,
+ unsigned int thread_count, void *thread_privates[],
+ struct vdo_work_queue **queue_ptr)
+{
+ struct round_robin_work_queue *queue;
+ int result;
+ char thread_name[TASK_COMM_LEN];
+ unsigned int i;
+
+ if (thread_count == 1) {
+ struct simple_work_queue *simple_queue;
+ void *context = ((thread_privates != NULL) ? thread_privates[0] : NULL);
+
+ result = make_simple_work_queue(thread_name_prefix, name, owner, context,
+ type, &simple_queue);
+ if (result == VDO_SUCCESS)
+ *queue_ptr = &simple_queue->common;
+ return result;
+ }
+
+ result = vdo_allocate(1, struct round_robin_work_queue, "round-robin work queue",
+ &queue);
+ if (result != VDO_SUCCESS)
+ return result;
+
+ result = vdo_allocate(thread_count, struct simple_work_queue *,
+ "subordinate work queues", &queue->service_queues);
+ if (result != VDO_SUCCESS) {
+ vdo_free(queue);
+ return result;
+ }
+
+ queue->num_service_queues = thread_count;
+ queue->common.round_robin_mode = true;
+ queue->common.owner = owner;
+
+ result = vdo_duplicate_string(name, "queue name", &queue->common.name);
+ if (result != VDO_SUCCESS) {
+ vdo_free(queue->service_queues);
+ vdo_free(queue);
+ return -ENOMEM;
+ }
+
+ *queue_ptr = &queue->common;
+
+ for (i = 0; i < thread_count; i++) {
+ void *context = ((thread_privates != NULL) ? thread_privates[i] : NULL);
+
+ snprintf(thread_name, sizeof(thread_name), "%s%u", name, i);
+ result = make_simple_work_queue(thread_name_prefix, thread_name, owner,
+ context, type, &queue->service_queues[i]);
+ if (result != VDO_SUCCESS) {
+ queue->num_service_queues = i; /* only the first i service queues were created */
+ /* Destroy previously created subordinates. */
+ vdo_free_work_queue(vdo_forget(*queue_ptr));
+ return result;
+ }
+ }
+
+ return VDO_SUCCESS;
+}
+
+/*
+ * Stop a simple work queue's worker thread, if it still has one, and forget the thread so that a
+ * repeated call does nothing.
+ */
+static void finish_simple_work_queue(struct simple_work_queue *queue)
+{
+	struct task_struct *worker = queue->thread;
+
+	if (worker == NULL)
+		return;
+
+	/* This asks the worker thread to shut down and then waits until it has exited. */
+	kthread_stop(worker);
+	queue->thread = NULL;
+}
+
+/* Stop the worker thread of every service queue belonging to a round-robin work queue. */
+static void finish_round_robin_work_queue(struct round_robin_work_queue *queue)
+{
+	struct simple_work_queue **services = queue->service_queues;
+	unsigned int service_count = queue->num_service_queues;
+	unsigned int index = 0;
+
+	while (index < service_count) {
+		finish_simple_work_queue(services[index]);
+		index++;
+	}
+}
+
+/*
+ * Stop the worker thread or threads of a work queue. A NULL queue is ignored. Once this has been
+ * called, no more completions should be enqueued on the queue.
+ */
+void vdo_finish_work_queue(struct vdo_work_queue *queue)
+{
+	if (queue == NULL)
+		return;
+
+	if (!queue->round_robin_mode) {
+		finish_simple_work_queue(as_simple_work_queue(queue));
+		return;
+	}
+
+	finish_round_robin_work_queue(as_round_robin_work_queue(queue));
+}
+
+/* Debugging dumps */
+
+/*
+ * Log one line describing a simple work queue: its address and name, whether its worker thread is
+ * idle or running (or absent), and the thread's task state character.
+ */
+static void dump_simple_work_queue(struct simple_work_queue *queue)
+{
+	const char *status;
+	char state;
+
+	if (queue->thread == NULL) {
+		status = "no threads";
+		state = '-';
+	} else {
+		state = task_state_to_char(queue->thread);
+		status = (atomic_read(&queue->idle) != 0) ? "idle" : "running";
+	}
+
+	vdo_log_info("workQ %px (%s) %s (%c)", &queue->common, queue->common.name,
+		     status, state);
+
+	/* Open question: also report whether anyone is on ->waiting_worker_threads? */
+}
+
+/*
+ * Log the state of a work queue. For a round-robin queue, each of its service queues is logged in
+ * turn; otherwise the single simple queue is logged. The queue must not be NULL.
+ */
+void vdo_dump_work_queue(struct vdo_work_queue *queue)
+{
+ if (queue->round_robin_mode) {
+ struct round_robin_work_queue *round_robin = as_round_robin_work_queue(queue);
+ unsigned int i;
+
+ for (i = 0; i < round_robin->num_service_queues; i++)
+ dump_simple_work_queue(round_robin->service_queues[i]);
+ } else {
+ dump_simple_work_queue(as_simple_work_queue(queue));
+ }
+}
+
+/*
+ * Write the symbol name of a function pointer into a buffer, for logging. A NULL pointer is
+ * written as "-". Only the text before the first space of the "%ps" output is kept. Nothing is
+ * done if the buffer has no room at all.
+ */
+static void get_function_name(void *pointer, char *buffer, size_t buffer_length)
+{
+	char *space;
+
+	/* With no room for even a terminator, the buffer must not be scanned below. */
+	if (buffer_length == 0)
+		return;
+
+	if (pointer == NULL) {
+		/*
+		 * Format "%ps" logs a null pointer as "(null)" with a bunch of leading spaces. We
+		 * sometimes use this when logging lots of data; don't be so verbose.
+		 */
+		strscpy(buffer, "-", buffer_length);
+		return;
+	}
+
+	/*
+	 * Use a pragma to defeat gcc's format checking, which doesn't understand that "%ps"
+	 * actually does support a precision spec in Linux kernel code.
+	 *
+	 * A '*' precision is read from the argument list as an int, so the size_t length is
+	 * converted explicitly rather than passed through the variable arguments as-is.
+	 */
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wformat"
+	snprintf(buffer, buffer_length, "%.*ps", (int) (buffer_length - 1), pointer);
+#pragma GCC diagnostic pop
+
+	space = strchr(buffer, ' ');
+	if (space != NULL)
+		*space = '\0';
+}
+
+/*
+ * Write a short description of a completion into a buffer, for logging: the name of the work
+ * queue it is on ("-" if it is on none), a '/', and then, if room remains, the name of its
+ * callback function.
+ */
+void vdo_dump_completion_to_buffer(struct vdo_completion *completion, char *buffer,
+				   size_t length)
+{
+	size_t used;
+
+	used = scnprintf(buffer, length, "%.*s/", TASK_COMM_LEN,
+			 (completion->my_queue == NULL ? "-" : completion->my_queue->name));
+	if (used >= length - 1)
+		return;
+
+	get_function_name((void *) completion->callback, buffer + used, length - used);
+}
+
+/* Completion submission */
+/*
+ * Enqueue a completion on a work queue. A simple queue receives the completion directly; for a
+ * round-robin queue, one of its service queues is chosen using a per-CPU rotor.
+ */
+void vdo_enqueue_work_queue(struct vdo_work_queue *queue,
+ struct vdo_completion *completion)
+{
+ /*
+ * Convert the provided generic vdo_work_queue to the simple_work_queue to actually queue
+ * on.
+ */
+ struct simple_work_queue *simple_queue = NULL;
+
+ if (!queue->round_robin_mode) {
+ simple_queue = as_simple_work_queue(queue);
+ } else {
+ struct round_robin_work_queue *round_robin = as_round_robin_work_queue(queue);
+
+ /*
+ * It shouldn't be a big deal if the same rotor gets used for multiple work queues.
+ * Any patterns that might develop are likely to be disrupted by random ordering of
+ * multiple completions and migration between cores, unless the load is so light as
+ * to be regular in ordering of tasks and the threads are confined to individual
+ * cores; with a load that light we won't care.
+ */
+ unsigned int rotor = this_cpu_inc_return(service_queue_rotor);
+ unsigned int index = rotor % round_robin->num_service_queues;
+
+ simple_queue = round_robin->service_queues[index];
+ }
+
+ enqueue_work_queue_completion(simple_queue, completion);
+}
+
+/* Misc */
+
+/*
+ * Return the simple work queue serviced by the current thread, or NULL if the caller is in
+ * interrupt context or the current thread is not a VDO work queue thread. A work queue thread is
+ * recognized by its thread function being work_queue_runner; its queue is the thread's kthread
+ * data, which make_simple_work_queue() sets to the queue when it starts the thread.
+ */
+static struct simple_work_queue *get_current_thread_work_queue(void)
+{
+ /*
+ * In interrupt context, if a vdo thread is what got interrupted, the calls below will find
+ * the queue for the thread which was interrupted. However, the interrupted thread may have
+ * been processing a completion, in which case starting to process another would violate
+ * our concurrency assumptions.
+ */
+ if (in_interrupt())
+ return NULL;
+
+ if (kthread_func(current) != work_queue_runner)
+ /* Not a VDO work queue thread. */
+ return NULL;
+
+ return kthread_data(current);
+}
+
+/*
+ * Return the generic work queue of the current thread, or NULL if the current thread has no work
+ * queue.
+ */
+struct vdo_work_queue *vdo_get_current_work_queue(void)
+{
+	struct simple_work_queue *current_queue = get_current_thread_work_queue();
+
+	if (current_queue == NULL)
+		return NULL;
+
+	return &current_queue->common;
+}
+
+/* Return the vdo_thread stored as the owner of a work queue. The queue must not be NULL. */
+struct vdo_thread *vdo_get_work_queue_owner(struct vdo_work_queue *queue)
+{
+	struct vdo_thread *owner = queue->owner;
+
+	return owner;
+}
+
+/**
+ * vdo_get_work_queue_private_data() - Get the private data of the current thread's work queue.
+ *
+ * Return: The private pointer of the work queue serviced by the current thread, or NULL if the
+ *         current thread is not a work queue thread (or the queue's private pointer is NULL).
+ */
+void *vdo_get_work_queue_private_data(void)
+{
+	struct simple_work_queue *current_queue = get_current_thread_work_queue();
+
+	if (current_queue == NULL)
+		return NULL;
+
+	return current_queue->private;
+}
+
+/* Check whether the type pointer stored in a work queue is the given type. */
+bool vdo_work_queue_type_is(struct vdo_work_queue *queue,
+			    const struct vdo_work_queue_type *type)
+{
+	const struct vdo_work_queue_type *queue_type = queue->type;
+
+	return (queue_type == type);
+}
diff --git a/drivers/md/dm-vdo/funnel-workqueue.h b/drivers/md/dm-vdo/funnel-workqueue.h
new file mode 100644
index 000000000000..b5be6e9e83bc
--- /dev/null
+++ b/drivers/md/dm-vdo/funnel-workqueue.h
@@ -0,0 +1,51 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#ifndef VDO_WORK_QUEUE_H
+#define VDO_WORK_QUEUE_H
+
+#include <linux/sched.h> /* for TASK_COMM_LEN */
+
+#include "types.h"
+
+enum {
+ MAX_VDO_WORK_QUEUE_NAME_LEN = TASK_COMM_LEN, /* presumably the longest usable queue name, matching the task name size -- confirm against users */
+};
+
+struct vdo_work_queue_type {
+ void (*start)(void *context); /* optional; called by the worker thread with the queue's private data before it services completions */
+ void (*finish)(void *context); /* optional; called by the worker thread with the queue's private data after its service loop ends */
+ enum vdo_completion_priority max_priority; /* highest priority the queue accepts; expected not to exceed VDO_WORK_Q_MAX_PRIORITY */
+ enum vdo_completion_priority default_priority; /* substituted for VDO_WORK_Q_DEFAULT_PRIORITY when a completion is enqueued */
+};
+
+struct vdo_completion;
+struct vdo_thread;
+struct vdo_work_queue;
+
+int vdo_make_work_queue(const char *thread_name_prefix, const char *name,
+ struct vdo_thread *owner, const struct vdo_work_queue_type *type,
+ unsigned int thread_count, void *thread_privates[],
+ struct vdo_work_queue **queue_ptr);
+
+void vdo_enqueue_work_queue(struct vdo_work_queue *queue, struct vdo_completion *completion);
+
+void vdo_finish_work_queue(struct vdo_work_queue *queue);
+
+void vdo_free_work_queue(struct vdo_work_queue *queue);
+
+void vdo_dump_work_queue(struct vdo_work_queue *queue);
+
+void vdo_dump_completion_to_buffer(struct vdo_completion *completion, char *buffer,
+ size_t length);
+
+void *vdo_get_work_queue_private_data(void);
+struct vdo_work_queue *vdo_get_current_work_queue(void);
+struct vdo_thread *vdo_get_work_queue_owner(struct vdo_work_queue *queue);
+
+bool __must_check vdo_work_queue_type_is(struct vdo_work_queue *queue,
+ const struct vdo_work_queue_type *type);
+
+#endif /* VDO_WORK_QUEUE_H */
diff --git a/drivers/md/dm-vdo/indexer/chapter-index.c b/drivers/md/dm-vdo/indexer/chapter-index.c
new file mode 100644
index 000000000000..7e32a25d3f2f
--- /dev/null
+++ b/drivers/md/dm-vdo/indexer/chapter-index.c
@@ -0,0 +1,293 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#include "chapter-index.h"
+
+#include "errors.h"
+#include "logger.h"
+#include "memory-alloc.h"
+#include "permassert.h"
+
+#include "hash-utils.h"
+#include "indexer.h"
+
+/*
+ * Allocate an open chapter index and initialize its delta index from the geometry. On success the
+ * new index is stored in *chapter_index. On failure the partially built index is freed, nothing
+ * is stored, and the error code is returned.
+ */
+int uds_make_open_chapter_index(struct open_chapter_index **chapter_index,
+				const struct index_geometry *geometry, u64 volume_nonce)
+{
+	struct open_chapter_index *new_index;
+	size_t delta_memory;
+	int status;
+
+	status = vdo_allocate(1, struct open_chapter_index, "open chapter index", &new_index);
+	if (status != VDO_SUCCESS)
+		return status;
+
+	/*
+	 * Give the chapter index one page more than it strictly needs, since the delta index
+	 * rebalances its delta lists when memory gets tight.
+	 */
+	delta_memory = ((geometry->index_pages_per_chapter + 1) * geometry->bytes_per_page);
+	new_index->geometry = geometry;
+	new_index->volume_nonce = volume_nonce;
+
+	status = uds_initialize_delta_index(&new_index->delta_index, 1,
+					    geometry->delta_lists_per_chapter,
+					    geometry->chapter_mean_delta,
+					    geometry->chapter_payload_bits,
+					    delta_memory, 'm');
+	if (status != UDS_SUCCESS) {
+		vdo_free(new_index);
+		return status;
+	}
+
+	new_index->memory_size = new_index->delta_index.memory_size +
+				 sizeof(struct open_chapter_index);
+	*chapter_index = new_index;
+	return UDS_SUCCESS;
+}
+
+/* Release an open chapter index along with its delta index. A NULL index is ignored. */
+void uds_free_open_chapter_index(struct open_chapter_index *chapter_index)
+{
+	if (chapter_index != NULL) {
+		uds_uninitialize_delta_index(&chapter_index->delta_index);
+		vdo_free(chapter_index);
+	}
+}
+
+/*
+ * Prepare an open chapter index for a new chapter: reset its delta index and record the new
+ * chapter's virtual chapter number.
+ */
+void uds_empty_open_chapter_index(struct open_chapter_index *chapter_index,
+				  u64 virtual_chapter_number)
+{
+	struct delta_index *delta_index = &chapter_index->delta_index;
+
+	uds_reset_delta_index(delta_index);
+	chapter_index->virtual_chapter_number = virtual_chapter_number;
+}
+
+/*
+ * Check whether a delta index search stopped on an entry for the given address, as opposed to
+ * reaching the end of the list or stopping on an entry with a different key.
+ */
+static inline bool was_entry_found(const struct delta_index_entry *entry, u32 address)
+{
+	if (entry->at_end)
+		return false;
+
+	return (entry->key == address);
+}
+
+/*
+ * Associate a record name with the record page containing its metadata.
+ *
+ * Returns UDS_INVALID_ARGUMENT if page_number is not below the geometry's
+ * record_pages_per_chapter, UDS_BAD_STATE if the search lands on a matching entry that is already
+ * a collision entry, and otherwise the result of the delta index lookup or put operation.
+ */
+int uds_put_open_chapter_index_record(struct open_chapter_index *chapter_index,
+ const struct uds_record_name *name,
+ u32 page_number)
+{
+ int result;
+ struct delta_index_entry entry;
+ u32 address;
+ u32 list_number;
+ const u8 *found_name;
+ bool found;
+ const struct index_geometry *geometry = chapter_index->geometry;
+ u64 chapter_number = chapter_index->virtual_chapter_number;
+ u32 record_pages = geometry->record_pages_per_chapter;
+
+ result = VDO_ASSERT(page_number < record_pages,
+ "Page number within chapter (%u) exceeds the maximum value %u",
+ page_number, record_pages);
+ if (result != VDO_SUCCESS)
+ return UDS_INVALID_ARGUMENT;
+
+ address = uds_hash_to_chapter_delta_address(name, geometry);
+ list_number = uds_hash_to_chapter_delta_list(name, geometry);
+ result = uds_get_delta_index_entry(&chapter_index->delta_index, list_number,
+ address, name->name, &entry);
+ if (result != UDS_SUCCESS)
+ return result;
+
+ found = was_entry_found(&entry, address);
+ result = VDO_ASSERT(!(found && entry.is_collision),
+ "Chunk appears more than once in chapter %llu",
+ (unsigned long long) chapter_number);
+ if (result != VDO_SUCCESS)
+ return UDS_BAD_STATE;
+
+ found_name = (found ? name->name : NULL); /* presumably a non-NULL name makes the new entry a collision entry -- confirm against uds_put_delta_index_entry() */
+ return uds_put_delta_index_entry(&entry, address, page_number, found_name);
+}
+
+/*
+ * Pack a section of an open chapter index into a chapter index page. A range of delta lists
+ * (starting with a specified list index) is copied from the open chapter index into a memory page.
+ * The number of lists copied onto the page is returned to the caller on success.
+ *
+ * When a list cannot be packed as required, all the entries of one non-empty delta list are
+ * removed from the chapter index and packing is attempted again; warnings are logged about the
+ * removals.
+ *
+ * @chapter_index: The open chapter index
+ * @memory: The memory page to use
+ * @first_list: The first delta list number to be copied
+ * @last_page: If true, this is the last page of the chapter index and all the remaining lists must
+ *             be packed onto this page
+ * @lists_packed: The number of delta lists that were packed onto this page
+ *
+ * Returns UDS_SUCCESS, UDS_OVERFLOW if no non-empty list could be found to remove, or an error
+ * code from the delta index.
+ */
+int uds_pack_open_chapter_index_page(struct open_chapter_index *chapter_index,
+				     u8 *memory, u32 first_list, bool last_page,
+				     u32 *lists_packed)
+{
+	int result;
+	struct delta_index *delta_index = &chapter_index->delta_index;
+	struct delta_index_stats stats;
+	u64 nonce = chapter_index->volume_nonce;
+	u64 chapter_number = chapter_index->virtual_chapter_number;
+	const struct index_geometry *geometry = chapter_index->geometry;
+	u32 list_count = geometry->delta_lists_per_chapter;
+	unsigned int removals = 0;
+	struct delta_index_entry entry;
+	u32 next_list;
+	s32 list_number;
+
+	for (;;) {
+		result = uds_pack_delta_index_page(delta_index, nonce, memory,
+						   geometry->bytes_per_page,
+						   chapter_number, first_list,
+						   lists_packed);
+		if (result != UDS_SUCCESS)
+			return result;
+
+		if ((first_list + *lists_packed) == list_count) {
+			/* All lists are packed. */
+			break;
+		} else if (*lists_packed == 0) {
+			/*
+			 * The next delta list does not fit on a page. This delta list will be
+			 * removed.
+			 */
+		} else if (last_page) {
+			/*
+			 * This is the last page and there are lists left unpacked, but all of the
+			 * remaining lists must fit on the page. Find a list that contains entries
+			 * and remove the entire list. Try the first list that does not fit. If it
+			 * is empty, we will select the last list that already fits and has any
+			 * entries.
+			 */
+		} else {
+			/* This page is done. */
+			break;
+		}
+
+		/* Log the index statistics once, before the first removal. */
+		if (removals == 0) {
+			uds_get_delta_index_stats(delta_index, &stats);
+			vdo_log_warning("The chapter index for chapter %llu contains %llu entries with %llu collisions",
+					(unsigned long long) chapter_number,
+					(unsigned long long) stats.record_count,
+					(unsigned long long) stats.collision_count);
+		}
+
+		/*
+		 * Starting with the first list that did not fit and working backwards, find a
+		 * list that has at least one entry.
+		 */
+		list_number = *lists_packed;
+		do {
+			if (list_number < 0)
+				return UDS_OVERFLOW;
+
+			next_list = first_list + list_number--;
+			result = uds_start_delta_index_search(delta_index, next_list, 0,
+							      &entry);
+			if (result != UDS_SUCCESS)
+				return result;
+
+			result = uds_next_delta_index_entry(&entry);
+			if (result != UDS_SUCCESS)
+				return result;
+		} while (entry.at_end);
+
+		/* Remove every entry of that list, then go around and pack again. */
+		do {
+			result = uds_remove_delta_index_entry(&entry);
+			if (result != UDS_SUCCESS)
+				return result;
+
+			removals++;
+		} while (!entry.at_end);
+	}
+
+	if (removals > 0) {
+		vdo_log_warning("To avoid chapter index page overflow in chapter %llu, %u entries were removed from the chapter index",
+				(unsigned long long) chapter_number, removals);
+	}
+
+	return UDS_SUCCESS;
+}
+
+/*
+ * Set up a chapter index page over the data in a page buffer, using the chapter mean delta,
+ * payload bits and page size from the geometry. Returns the result of initializing the
+ * underlying delta index page.
+ */
+int uds_initialize_chapter_index_page(struct delta_index_page *index_page,
+				      const struct index_geometry *geometry,
+				      u8 *page_buffer, u64 volume_nonce)
+{
+	int result;
+
+	result = uds_initialize_delta_index_page(index_page, volume_nonce,
+						 geometry->chapter_mean_delta,
+						 geometry->chapter_payload_bits,
+						 page_buffer, geometry->bytes_per_page);
+	return result;
+}
+
+/*
+ * Validate a chapter index page read during rebuild.
+ *
+ * Every delta list on the page, from lowest_list_number through highest_list_number, is walked
+ * to its end. Returns UDS_CORRUPT_DATA if any entry's value is not below the geometry's
+ * record_pages_per_chapter, any error reported while walking a list, and UDS_SUCCESS otherwise.
+ */
+int uds_validate_chapter_index_page(const struct delta_index_page *index_page,
+ const struct index_geometry *geometry)
+{
+ int result;
+ const struct delta_index *delta_index = &index_page->delta_index;
+ u32 first = index_page->lowest_list_number;
+ u32 last = index_page->highest_list_number;
+ u32 list_number;
+
+ /* We walk every delta list from start to finish. */
+ for (list_number = first; list_number <= last; list_number++) {
+ struct delta_index_entry entry;
+
+ result = uds_start_delta_index_search(delta_index, list_number - first, /* lists are addressed relative to the page's first list */
+ 0, &entry);
+ if (result != UDS_SUCCESS)
+ return result;
+
+ for (;;) {
+ result = uds_next_delta_index_entry(&entry);
+ if (result != UDS_SUCCESS) {
+ /*
+ * A random bit stream is highly likely to arrive here when we go
+ * past the end of the delta list.
+ */
+ return result;
+ }
+
+ if (entry.at_end)
+ break;
+
+ /* Also make sure that the record page field contains a plausible value. */
+ if (uds_get_delta_entry_value(&entry) >=
+ geometry->record_pages_per_chapter) {
+ /*
+ * Do not log this as an error. It happens in normal operation when
+ * we are doing a rebuild but haven't written the entire volume
+ * once.
+ */
+ return UDS_CORRUPT_DATA;
+ }
+ }
+ }
+ return UDS_SUCCESS;
+}
+
+/*
+ * Look up a record name in a chapter index page. On success, *record_page_ptr is set to the
+ * number of the record page that may contain the name, or to NO_CHAPTER_INDEX_ENTRY if the page
+ * has no entry for the name. If the lookup itself fails, its error code is returned and
+ * *record_page_ptr is left untouched.
+ */
+int uds_search_chapter_index_page(struct delta_index_page *index_page,
+				  const struct index_geometry *geometry,
+				  const struct uds_record_name *name,
+				  u16 *record_page_ptr)
+{
+	struct delta_index_entry entry;
+	u32 address = uds_hash_to_chapter_delta_address(name, geometry);
+	u32 delta_list_number = uds_hash_to_chapter_delta_list(name, geometry);
+	/* Lists on a page are numbered relative to the page's lowest list number. */
+	u32 sub_list_number = delta_list_number - index_page->lowest_list_number;
+	int result;
+
+	result = uds_get_delta_index_entry(&index_page->delta_index, sub_list_number,
+					   address, name->name, &entry);
+	if (result != UDS_SUCCESS)
+		return result;
+
+	*record_page_ptr = (was_entry_found(&entry, address) ?
+			    uds_get_delta_entry_value(&entry) :
+			    NO_CHAPTER_INDEX_ENTRY);
+
+	return UDS_SUCCESS;
+}
diff --git a/drivers/md/dm-vdo/indexer/chapter-index.h b/drivers/md/dm-vdo/indexer/chapter-index.h
new file mode 100644
index 000000000000..be8bf2b675b1
--- /dev/null
+++ b/drivers/md/dm-vdo/indexer/chapter-index.h
@@ -0,0 +1,61 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#ifndef UDS_CHAPTER_INDEX_H
+#define UDS_CHAPTER_INDEX_H
+
+#include <linux/limits.h>
+
+#include "delta-index.h"
+#include "geometry.h"
+
+/*
+ * A chapter index for an open chapter is a mutable structure that tracks all the records that have
+ * been added to the chapter. A chapter index for a closed chapter is similar except that it is
+ * immutable because the contents of a closed chapter can never change, and the immutable structure
+ * is more efficient. Both types of chapter index are implemented with a delta index.
+ */
+
+/* The value returned when no entry is found in the chapter index. */
+#define NO_CHAPTER_INDEX_ENTRY U16_MAX
+
+/*
+ * The chapter index for an open chapter, built on a delta index. NOTE(review): the per-field
+ * notes marked "presumably" are inferred from the field names; confirm against chapter-index.c.
+ */
+struct open_chapter_index {
+ /* Geometry used with this index; not modifiable through this pointer */
+ const struct index_geometry *geometry;
+ /* The delta index that holds the chapter's entries */
+ struct delta_index delta_index;
+ /* Presumably the virtual chapter currently being indexed -- TODO confirm */
+ u64 virtual_chapter_number;
+ /* Presumably the nonce of the volume that owns the chapter -- TODO confirm */
+ u64 volume_nonce;
+ /* Presumably the memory used by this chapter index -- TODO confirm */
+ size_t memory_size;
+};
+
+int __must_check uds_make_open_chapter_index(struct open_chapter_index **chapter_index,
+ const struct index_geometry *geometry,
+ u64 volume_nonce);
+
+void uds_free_open_chapter_index(struct open_chapter_index *chapter_index);
+
+void uds_empty_open_chapter_index(struct open_chapter_index *chapter_index,
+ u64 virtual_chapter_number);
+
+int __must_check uds_put_open_chapter_index_record(struct open_chapter_index *chapter_index,
+ const struct uds_record_name *name,
+ u32 page_number);
+
+int __must_check uds_pack_open_chapter_index_page(struct open_chapter_index *chapter_index,
+ u8 *memory, u32 first_list,
+ bool last_page, u32 *lists_packed);
+
+int __must_check uds_initialize_chapter_index_page(struct delta_index_page *index_page,
+ const struct index_geometry *geometry,
+ u8 *page_buffer, u64 volume_nonce);
+
+int __must_check uds_validate_chapter_index_page(const struct delta_index_page *index_page,
+ const struct index_geometry *geometry);
+
+/*
+ * Search a chapter index page for a record name. On UDS_SUCCESS, *record_page_ptr is the value
+ * stored for the name, or NO_CHAPTER_INDEX_ENTRY if the page has no entry for it. Any other
+ * return value is the error from the delta index lookup, and *record_page_ptr is not written.
+ */
+int __must_check uds_search_chapter_index_page(struct delta_index_page *index_page,
+ const struct index_geometry *geometry,
+ const struct uds_record_name *name,
+ u16 *record_page_ptr);
+
+#endif /* UDS_CHAPTER_INDEX_H */
diff --git a/drivers/md/dm-vdo/indexer/config.c b/drivers/md/dm-vdo/indexer/config.c
new file mode 100644
index 000000000000..5532371b952f
--- /dev/null
+++ b/drivers/md/dm-vdo/indexer/config.c
@@ -0,0 +1,376 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#include "config.h"
+
+#include "logger.h"
+#include "memory-alloc.h"
+#include "numeric.h"
+#include "string-utils.h"
+#include "thread-utils.h"
+
+/* A saved configuration begins with the magic string, then one of the two version strings. */
+static const u8 INDEX_CONFIG_MAGIC[] = "ALBIC";
+static const u8 INDEX_CONFIG_VERSION_6_02[] = "06.02";
+static const u8 INDEX_CONFIG_VERSION_8_02[] = "08.02";
+
+/* The read thread count used when none is requested, and the largest count accepted. */
+#define DEFAULT_VOLUME_READ_THREADS 2
+#define MAX_VOLUME_READ_THREADS 16
+/* Lengths exclude the terminating NUL, which is never written. Both versions have one length. */
+#define INDEX_CONFIG_MAGIC_LENGTH (sizeof(INDEX_CONFIG_MAGIC) - 1)
+#define INDEX_CONFIG_VERSION_LENGTH ((int)(sizeof(INDEX_CONFIG_VERSION_6_02) - 1))
+
+/* Report whether the version bytes held in buffer are exactly the given version string. */
+static bool is_version(const u8 *version, u8 *buffer)
+{
+	int difference = memcmp(version, buffer, INDEX_CONFIG_VERSION_LENGTH);
+
+	return difference == 0;
+}
+
+/*
+ * Compare a saved configuration and geometry against the user's configuration. Every field is
+ * checked and every mismatch is logged, so one call reports all of the differences.
+ */
+static bool are_matching_configurations(struct uds_configuration *saved_config,
+					struct index_geometry *saved_geometry,
+					struct uds_configuration *user)
+{
+	struct index_geometry *user_geometry = user->geometry;
+	unsigned int mismatches = 0;
+
+	if (saved_geometry->record_pages_per_chapter !=
+	    user_geometry->record_pages_per_chapter) {
+		vdo_log_error("Record pages per chapter (%u) does not match (%u)",
+			      saved_geometry->record_pages_per_chapter,
+			      user_geometry->record_pages_per_chapter);
+		mismatches++;
+	}
+
+	if (saved_geometry->chapters_per_volume != user_geometry->chapters_per_volume) {
+		vdo_log_error("Chapter count (%u) does not match (%u)",
+			      saved_geometry->chapters_per_volume,
+			      user_geometry->chapters_per_volume);
+		mismatches++;
+	}
+
+	if (saved_geometry->sparse_chapters_per_volume !=
+	    user_geometry->sparse_chapters_per_volume) {
+		vdo_log_error("Sparse chapter count (%u) does not match (%u)",
+			      saved_geometry->sparse_chapters_per_volume,
+			      user_geometry->sparse_chapters_per_volume);
+		mismatches++;
+	}
+
+	if (saved_config->cache_chapters != user->cache_chapters) {
+		vdo_log_error("Cache size (%u) does not match (%u)",
+			      saved_config->cache_chapters, user->cache_chapters);
+		mismatches++;
+	}
+
+	if (saved_config->volume_index_mean_delta != user->volume_index_mean_delta) {
+		vdo_log_error("Volume index mean delta (%u) does not match (%u)",
+			      saved_config->volume_index_mean_delta,
+			      user->volume_index_mean_delta);
+		mismatches++;
+	}
+
+	if (saved_geometry->bytes_per_page != user_geometry->bytes_per_page) {
+		vdo_log_error("Bytes per page value (%zu) does not match (%zu)",
+			      saved_geometry->bytes_per_page,
+			      user_geometry->bytes_per_page);
+		mismatches++;
+	}
+
+	if (saved_config->sparse_sample_rate != user->sparse_sample_rate) {
+		vdo_log_error("Sparse sample rate (%u) does not match (%u)",
+			      saved_config->sparse_sample_rate,
+			      user->sparse_sample_rate);
+		mismatches++;
+	}
+
+	if (saved_config->nonce != user->nonce) {
+		vdo_log_error("Nonce (%llu) does not match (%llu)",
+			      (unsigned long long) saved_config->nonce,
+			      (unsigned long long) user->nonce);
+		mismatches++;
+	}
+
+	return mismatches == 0;
+}
+
+/*
+ * Read a saved configuration from the reader and check it against the supplied one. The magic
+ * and version are verified first; the fixed 6.02 fields are then decoded and, for an 8.02
+ * save, the chapter remapping is read into user_config->geometry (a 6.02 save zeroes it).
+ * Returns UDS_NO_INDEX if the saved values do not match user_config.
+ */
+int uds_validate_config_contents(struct buffered_reader *reader,
+				 struct uds_configuration *user_config)
+{
+	struct uds_configuration saved;
+	struct index_geometry saved_geometry;
+	u8 version[INDEX_CONFIG_VERSION_LENGTH];
+	u8 encoded[sizeof(struct uds_configuration_6_02)];
+	u32 page_bytes;
+	size_t position = 0;
+	bool is_6_02;
+	int status;
+
+	status = uds_verify_buffered_data(reader, INDEX_CONFIG_MAGIC,
+					  INDEX_CONFIG_MAGIC_LENGTH);
+	if (status != UDS_SUCCESS)
+		return status;
+
+	status = uds_read_from_buffered_reader(reader, version,
+					       INDEX_CONFIG_VERSION_LENGTH);
+	if (status != UDS_SUCCESS)
+		return vdo_log_error_strerror(status, "cannot read index config version");
+
+	is_6_02 = is_version(INDEX_CONFIG_VERSION_6_02, version);
+	if (!is_6_02 && !is_version(INDEX_CONFIG_VERSION_8_02, version)) {
+		return vdo_log_error_strerror(UDS_CORRUPT_DATA,
+					      "unsupported configuration version: '%.*s'",
+					      INDEX_CONFIG_VERSION_LENGTH,
+					      version);
+	}
+
+	status = uds_read_from_buffered_reader(reader, encoded, sizeof(encoded));
+	if (status != UDS_SUCCESS)
+		return vdo_log_error_strerror(status, "cannot read config data");
+
+	/* The fields are decoded in the order in which they are stored. */
+	decode_u32_le(encoded, &position, &saved_geometry.record_pages_per_chapter);
+	decode_u32_le(encoded, &position, &saved_geometry.chapters_per_volume);
+	decode_u32_le(encoded, &position, &saved_geometry.sparse_chapters_per_volume);
+	decode_u32_le(encoded, &position, &saved.cache_chapters);
+	/* Step over the unused u32 field. */
+	position += sizeof(u32);
+	decode_u32_le(encoded, &position, &saved.volume_index_mean_delta);
+	decode_u32_le(encoded, &position, &page_bytes);
+	saved_geometry.bytes_per_page = page_bytes;
+	decode_u32_le(encoded, &position, &saved.sparse_sample_rate);
+	decode_u64_le(encoded, &position, &saved.nonce);
+
+	status = VDO_ASSERT(position == sizeof(struct uds_configuration_6_02),
+			    "%zu bytes read but not decoded",
+			    sizeof(struct uds_configuration_6_02) - position);
+	if (status != VDO_SUCCESS)
+		return UDS_CORRUPT_DATA;
+
+	if (!is_6_02) {
+		/* Only the 8.02 format carries the remapping after the common fields. */
+		u8 remap_data[sizeof(u64) + sizeof(u64)];
+
+		status = uds_read_from_buffered_reader(reader, remap_data,
+						       sizeof(remap_data));
+		if (status != UDS_SUCCESS)
+			return vdo_log_error_strerror(status, "cannot read converted config");
+
+		position = 0;
+		decode_u64_le(remap_data, &position,
+			      &user_config->geometry->remapped_virtual);
+		decode_u64_le(remap_data, &position,
+			      &user_config->geometry->remapped_physical);
+	} else {
+		user_config->geometry->remapped_virtual = 0;
+		user_config->geometry->remapped_physical = 0;
+	}
+
+	if (are_matching_configurations(&saved, &saved_geometry, user_config))
+		return UDS_SUCCESS;
+
+	vdo_log_warning("Supplied configuration does not match save");
+	return UDS_NO_INDEX;
+}
+
+/*
+ * Write the configuration to stable storage. A superblock version below 4 means the index has
+ * not been reduced by a chapter, so the 6.02 format is written to stay compatible with older
+ * versions of UDS. Otherwise the 8.02 format is written, which appends the chapter remapping.
+ */
+int uds_write_config_contents(struct buffered_writer *writer,
+			      struct uds_configuration *config, u32 version)
+{
+	struct index_geometry *layout = config->geometry;
+	bool has_remapping = (version >= 4);
+	const u8 *version_string = (has_remapping ? INDEX_CONFIG_VERSION_8_02 :
+				    INDEX_CONFIG_VERSION_6_02);
+	u8 encoded[sizeof(struct uds_configuration_8_02)];
+	size_t length = 0;
+	int status;
+
+	status = uds_write_to_buffered_writer(writer, INDEX_CONFIG_MAGIC,
+					      INDEX_CONFIG_MAGIC_LENGTH);
+	if (status != UDS_SUCCESS)
+		return status;
+
+	status = uds_write_to_buffered_writer(writer, version_string,
+					      INDEX_CONFIG_VERSION_LENGTH);
+	if (status != UDS_SUCCESS)
+		return status;
+
+	/* These fields are common to both formats. */
+	encode_u32_le(encoded, &length, layout->record_pages_per_chapter);
+	encode_u32_le(encoded, &length, layout->chapters_per_volume);
+	encode_u32_le(encoded, &length, layout->sparse_chapters_per_volume);
+	encode_u32_le(encoded, &length, config->cache_chapters);
+	/* The unused field is always written as zero. */
+	encode_u32_le(encoded, &length, 0);
+	encode_u32_le(encoded, &length, config->volume_index_mean_delta);
+	encode_u32_le(encoded, &length, layout->bytes_per_page);
+	encode_u32_le(encoded, &length, config->sparse_sample_rate);
+	encode_u64_le(encoded, &length, config->nonce);
+
+	status = VDO_ASSERT(length == sizeof(struct uds_configuration_6_02),
+			    "%zu bytes encoded, of %zu expected", length,
+			    sizeof(struct uds_configuration_6_02));
+	if (status != VDO_SUCCESS)
+		return status;
+
+	if (has_remapping) {
+		encode_u64_le(encoded, &length, layout->remapped_virtual);
+		encode_u64_le(encoded, &length, layout->remapped_physical);
+	}
+
+	return uds_write_to_buffered_writer(writer, encoded, length);
+}
+
+/*
+ * Derive the chapter counts and the record pages per chapter from the memory size setting.
+ * The "reduced" settings give the same layout as their regular counterparts with one fewer
+ * chapter. Returns -EINVAL, writing none of the outputs, if the setting is not recognized.
+ */
+static int compute_memory_sizes(uds_memory_config_size_t mem_gb, bool sparse,
+				u32 *chapters_per_volume, u32 *record_pages_per_chapter,
+				u32 *sparse_chapters_per_volume)
+{
+	u32 chapters;
+	u32 record_pages;
+	u32 sparse_chapters = 0;
+	u32 removed_chapters = 0;
+
+	if (mem_gb == UDS_MEMORY_CONFIG_256MB) {
+		chapters = DEFAULT_CHAPTERS_PER_VOLUME;
+		record_pages = SMALL_RECORD_PAGES_PER_CHAPTER;
+	} else if (mem_gb == UDS_MEMORY_CONFIG_512MB) {
+		chapters = DEFAULT_CHAPTERS_PER_VOLUME;
+		record_pages = 2 * SMALL_RECORD_PAGES_PER_CHAPTER;
+	} else if (mem_gb == UDS_MEMORY_CONFIG_768MB) {
+		chapters = DEFAULT_CHAPTERS_PER_VOLUME;
+		record_pages = 3 * SMALL_RECORD_PAGES_PER_CHAPTER;
+	} else if ((mem_gb >= 1) && (mem_gb <= UDS_MEMORY_CONFIG_MAX)) {
+		chapters = mem_gb * DEFAULT_CHAPTERS_PER_VOLUME;
+		record_pages = DEFAULT_RECORD_PAGES_PER_CHAPTER;
+	} else if (mem_gb == UDS_MEMORY_CONFIG_REDUCED_256MB) {
+		removed_chapters = 1;
+		chapters = DEFAULT_CHAPTERS_PER_VOLUME;
+		record_pages = SMALL_RECORD_PAGES_PER_CHAPTER;
+	} else if (mem_gb == UDS_MEMORY_CONFIG_REDUCED_512MB) {
+		removed_chapters = 1;
+		chapters = DEFAULT_CHAPTERS_PER_VOLUME;
+		record_pages = 2 * SMALL_RECORD_PAGES_PER_CHAPTER;
+	} else if (mem_gb == UDS_MEMORY_CONFIG_REDUCED_768MB) {
+		removed_chapters = 1;
+		chapters = DEFAULT_CHAPTERS_PER_VOLUME;
+		record_pages = 3 * SMALL_RECORD_PAGES_PER_CHAPTER;
+	} else if ((mem_gb >= 1 + UDS_MEMORY_CONFIG_REDUCED) &&
+		   (mem_gb <= UDS_MEMORY_CONFIG_REDUCED_MAX)) {
+		removed_chapters = 1;
+		chapters = ((mem_gb - UDS_MEMORY_CONFIG_REDUCED) *
+			    DEFAULT_CHAPTERS_PER_VOLUME);
+		record_pages = DEFAULT_RECORD_PAGES_PER_CHAPTER;
+	} else {
+		vdo_log_error("received invalid memory size");
+		return -EINVAL;
+	}
+
+	if (sparse) {
+		/* Make 95% of chapters sparse, allowing 10x more records. */
+		sparse_chapters = (19 * chapters) / 2;
+		chapters *= 10;
+	}
+
+	*record_pages_per_chapter = record_pages;
+	*sparse_chapters_per_volume = sparse_chapters;
+	*chapters_per_volume = chapters - removed_chapters;
+	return UDS_SUCCESS;
+}
+
+/*
+ * Choose the number of indexing zones. A request of zero selects half of the online CPUs; the
+ * result is always limited to the range 1 to MAX_ZONES, and the chosen count is logged.
+ */
+static unsigned int __must_check normalize_zone_count(unsigned int requested)
+{
+	unsigned int zones = (requested != 0) ? requested : num_online_cpus() / 2;
+
+	if (zones > MAX_ZONES)
+		zones = MAX_ZONES;
+	else if (zones < 1)
+		zones = 1;
+
+	vdo_log_info("Using %u indexing zone%s for concurrency.",
+		     zones, zones == 1 ? "" : "s");
+	return zones;
+}
+
+/*
+ * Choose the number of volume read threads: the default when none is requested, and never more
+ * than MAX_VOLUME_READ_THREADS.
+ */
+static unsigned int __must_check normalize_read_threads(unsigned int requested)
+{
+	if (requested < 1)
+		return DEFAULT_VOLUME_READ_THREADS;
+
+	if (requested > MAX_VOLUME_READ_THREADS)
+		return MAX_VOLUME_READ_THREADS;
+
+	return requested;
+}
+
+/*
+ * Allocate a configuration and fill it in from the caller's parameters. The geometry is derived
+ * from the memory size and sparseness, the thread counts are normalized, and the remaining
+ * values are either fixed defaults or copied from params. On success the new configuration is
+ * stored in *config_ptr; release it with uds_free_configuration().
+ */
+int uds_make_configuration(const struct uds_parameters *params,
+			   struct uds_configuration **config_ptr)
+{
+	struct uds_configuration *new_config;
+	u32 chapters = 0;
+	u32 record_pages = 0;
+	u32 sparse_chapters = 0;
+	int status;
+
+	status = compute_memory_sizes(params->memory_size, params->sparse,
+				      &chapters, &record_pages, &sparse_chapters);
+	if (status != UDS_SUCCESS)
+		return status;
+
+	status = vdo_allocate(1, struct uds_configuration, __func__, &new_config);
+	if (status != VDO_SUCCESS)
+		return status;
+
+	status = uds_make_index_geometry(DEFAULT_BYTES_PER_PAGE, record_pages,
+					 chapters, sparse_chapters,
+					 0, 0, &new_config->geometry);
+	if (status != UDS_SUCCESS) {
+		uds_free_configuration(new_config);
+		return status;
+	}
+
+	/* Thread counts come from the caller but are forced into the supported ranges. */
+	new_config->zone_count = normalize_zone_count(params->zone_count);
+	new_config->read_threads = normalize_read_threads(params->read_threads);
+
+	/* Fixed defaults; sampling applies only to a sparse index. */
+	new_config->cache_chapters = DEFAULT_CACHE_CHAPTERS;
+	new_config->volume_index_mean_delta = DEFAULT_VOLUME_INDEX_MEAN_DELTA;
+	if (params->sparse)
+		new_config->sparse_sample_rate = DEFAULT_SPARSE_SAMPLE_RATE;
+	else
+		new_config->sparse_sample_rate = 0;
+
+	/* Values taken directly from the caller. */
+	new_config->nonce = params->nonce;
+	new_config->bdev = params->bdev;
+	new_config->offset = params->offset;
+	new_config->size = params->size;
+
+	*config_ptr = new_config;
+	return UDS_SUCCESS;
+}
+
+/* Release a configuration and the geometry it owns. A NULL configuration is ignored. */
+void uds_free_configuration(struct uds_configuration *config)
+{
+	if (config == NULL)
+		return;
+
+	uds_free_index_geometry(config->geometry);
+	vdo_free(config);
+}
+
+/* Write the saved configuration values to the log at debug level, one value per line. */
+void uds_log_configuration(struct uds_configuration *config)
+{
+	struct index_geometry *layout = config->geometry;
+
+	vdo_log_debug("Configuration:");
+	vdo_log_debug(" Record pages per chapter: %10u",
+		      layout->record_pages_per_chapter);
+	vdo_log_debug(" Chapters per volume: %10u",
+		      layout->chapters_per_volume);
+	vdo_log_debug(" Sparse chapters per volume: %10u",
+		      layout->sparse_chapters_per_volume);
+	vdo_log_debug(" Cache size (chapters): %10u",
+		      config->cache_chapters);
+	vdo_log_debug(" Volume index mean delta: %10u",
+		      config->volume_index_mean_delta);
+	vdo_log_debug(" Bytes per page: %10zu",
+		      layout->bytes_per_page);
+	vdo_log_debug(" Sparse sample rate: %10u",
+		      config->sparse_sample_rate);
+	vdo_log_debug(" Nonce: %llu",
+		      (unsigned long long) config->nonce);
+}
diff --git a/drivers/md/dm-vdo/indexer/config.h b/drivers/md/dm-vdo/indexer/config.h
new file mode 100644
index 000000000000..08507dc2f7a1
--- /dev/null
+++ b/drivers/md/dm-vdo/indexer/config.h
@@ -0,0 +1,124 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#ifndef UDS_CONFIG_H
+#define UDS_CONFIG_H
+
+#include "geometry.h"
+#include "indexer.h"
+#include "io-factory.h"
+
+/*
+ * The uds_configuration records a variety of parameters used to configure a new UDS index. Some
+ * parameters are provided by the client, while others are fixed or derived from user-supplied
+ * values. It is created when an index is created, and it is recorded in the index metadata.
+ */
+
+/* Fixed configuration values. uds_make_configuration() applies the defaults to new indexes. */
+enum {
+ DEFAULT_VOLUME_INDEX_MEAN_DELTA = 4096,
+ DEFAULT_CACHE_CHAPTERS = 7,
+ /* Applied only to sparse indexes; a dense index gets a sample rate of 0. */
+ DEFAULT_SPARSE_SAMPLE_RATE = 32,
+ /* The largest zone count a configuration may use. */
+ MAX_ZONES = 16,
+};
+
+/*
+ * A set of configuration parameters for the indexer. uds_write_config_contents() saves only
+ * cache_chapters, volume_index_mean_delta, sparse_sample_rate, nonce, and values taken from the
+ * geometry; the other fields are not part of the saved configuration.
+ */
+struct uds_configuration {
+ /* Storage device for the index */
+ struct block_device *bdev;
+
+ /* The maximum allowable size of the index */
+ size_t size;
+
+ /* The offset where the index should start */
+ off_t offset;
+
+ /* Parameters for the volume */
+
+ /* The volume layout */
+ struct index_geometry *geometry;
+
+ /* Index owner's nonce */
+ u64 nonce;
+
+ /* The number of threads used to process index requests */
+ unsigned int zone_count;
+
+ /* The number of threads used to read volume pages */
+ unsigned int read_threads;
+
+ /* Size of the page cache and sparse chapter index cache in chapters */
+ u32 cache_chapters;
+
+ /* Parameters for the volume index */
+
+ /* The mean delta for the volume index */
+ u32 volume_index_mean_delta;
+
+ /* Sampling rate for sparse indexing */
+ u32 sparse_sample_rate;
+};
+
+/*
+ * On-disk structure of data for a version 8.02 index. uds_write_config_contents() encodes the
+ * fields as little-endian values in declaration order, and writes this format when the
+ * superblock version is 4 or greater. It is the 6.02 layout followed by the chapter remapping.
+ */
+struct uds_configuration_8_02 {
+ /* Smaller (16), Small (64) or large (256) indices */
+ u32 record_pages_per_chapter;
+ /* Total number of chapters per volume */
+ u32 chapters_per_volume;
+ /* Number of sparse chapters per volume */
+ u32 sparse_chapters_per_volume;
+ /* Size of the page cache, in chapters */
+ u32 cache_chapters;
+ /* Unused field; written as zero and skipped when read */
+ u32 unused;
+ /* The volume index mean delta to use */
+ u32 volume_index_mean_delta;
+ /* Size of a page, used for both record pages and index pages */
+ u32 bytes_per_page;
+ /* Sampling rate for sparse indexing */
+ u32 sparse_sample_rate;
+ /* Index owner's nonce */
+ u64 nonce;
+ /* Virtual chapter remapped from physical chapter 0 */
+ u64 remapped_virtual;
+ /* New physical chapter which remapped chapter was moved to */
+ u64 remapped_physical;
+} __packed;
+
+/*
+ * On-disk structure of data for a version 6.02 index. uds_write_config_contents() encodes the
+ * fields as little-endian values in declaration order. The size of this structure is also the
+ * number of bytes uds_validate_config_contents() decodes for both the 6.02 and 8.02 formats.
+ */
+struct uds_configuration_6_02 {
+ /* Smaller (16), Small (64) or large (256) indices */
+ u32 record_pages_per_chapter;
+ /* Total number of chapters per volume */
+ u32 chapters_per_volume;
+ /* Number of sparse chapters per volume */
+ u32 sparse_chapters_per_volume;
+ /* Size of the page cache, in chapters */
+ u32 cache_chapters;
+ /* Unused field; written as zero and skipped when read */
+ u32 unused;
+ /* The volume index mean delta to use */
+ u32 volume_index_mean_delta;
+ /* Size of a page, used for both record pages and index pages */
+ u32 bytes_per_page;
+ /* Sampling rate for sparse indexing */
+ u32 sparse_sample_rate;
+ /* Index owner's nonce */
+ u64 nonce;
+} __packed;
+
+/*
+ * Allocate a configuration derived from params and store it in *config_ptr. Returns -EINVAL if
+ * the memory size in params is not recognized.
+ */
+int __must_check uds_make_configuration(const struct uds_parameters *params,
+ struct uds_configuration **config_ptr);
+
+/* Free a configuration and its geometry. A NULL configuration is ignored. */
+void uds_free_configuration(struct uds_configuration *config);
+
+/*
+ * Read a saved configuration and compare it with config. Returns UDS_NO_INDEX if they differ and
+ * UDS_CORRUPT_DATA if the saved version is not supported. The remapping fields of
+ * config->geometry are set from the saved data (zero for a 6.02 save).
+ */
+int __must_check uds_validate_config_contents(struct buffered_reader *reader,
+ struct uds_configuration *config);
+
+/*
+ * Write the configuration using the 8.02 format when version is 4 or greater, and the 6.02
+ * format otherwise.
+ */
+int __must_check uds_write_config_contents(struct buffered_writer *writer,
+ struct uds_configuration *config, u32 version);
+
+/* Log the saved configuration values at debug level. */
+void uds_log_configuration(struct uds_configuration *config);
+
+#endif /* UDS_CONFIG_H */
diff --git a/drivers/md/dm-vdo/indexer/delta-index.c b/drivers/md/dm-vdo/indexer/delta-index.c
new file mode 100644
index 000000000000..0ac2443f0df3
--- /dev/null
+++ b/drivers/md/dm-vdo/indexer/delta-index.c
@@ -0,0 +1,1970 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2023 Red Hat
+ */
+#include "delta-index.h"
+
+#include <linux/bitops.h>
+#include <linux/bits.h>
+#include <linux/compiler.h>
+#include <linux/limits.h>
+#include <linux/log2.h>
+
+#include "cpu.h"
+#include "errors.h"
+#include "logger.h"
+#include "memory-alloc.h"
+#include "numeric.h"
+#include "permassert.h"
+#include "string-utils.h"
+#include "time-utils.h"
+
+#include "config.h"
+#include "indexer.h"
+
+/*
+ * The entries in a delta index could be stored in a single delta list, but to reduce search times
+ * and update costs it uses multiple delta lists. These lists are stored in a single chunk of
+ * memory managed by the delta_zone structure. The delta_zone can move the data around within its
+ * memory, so the location of each delta list is recorded as a bit offset into the memory. Because
+ * the volume index can contain over a million delta lists, we want to be efficient with the size
+ * of the delta list header information. This information is encoded into 16 bytes per list. The
+ * volume index delta list memory can easily exceed 4 gigabits, so a 64 bit value is needed to
+ * address the memory. The volume index delta lists average around 6 kilobits, so 16 bits are
+ * sufficient to store the size of a delta list.
+ *
+ * Each delta list is stored as a bit stream. Within the delta list encoding, bits and bytes are
+ * numbered in little endian order. Within a byte, bit 0 is the least significant bit (0x1), and
+ * bit 7 is the most significant bit (0x80). Within a bit stream, bit 7 is the most significant bit
+ * of byte 0, and bit 8 is the least significant bit of byte 1. Within a byte array, a byte's
+ * number corresponds to its index in the array.
+ *
+ * A standard delta list entry is stored as a fixed length payload (the value) followed by a
+ * variable length key (the delta). A collision entry is used when two block names have the same
+ * delta list address. A collision entry always follows a standard entry for the hash with which it
+ * collides, and is encoded with DELTA == 0 with an additional 256 bits field at the end,
+ * containing the full block name. An entry with a delta of 0 at the beginning of a delta list
+ * indicates a normal entry.
+ *
+ * The delta in each entry is encoded with a variable-length Huffman code to minimize the memory
+ * used by small deltas. The Huffman code is specified by three parameters, which can be computed
+ * from the desired mean delta when the index is full. (See compute_coding_constants() for
+ * details.)
+ *
+ * The bit field utilities used to read and write delta entries assume that it is possible to read
+ * some bytes beyond the end of the bit field, so a delta_zone memory allocation is guarded by two
+ * invalid delta lists to prevent reading outside the delta_zone memory. The valid delta lists are
+ * numbered 1 to N, and the guard lists are numbered 0 and N+1. The function to decode the bit
+ * stream includes a step that skips over bits set to 0 until the first 1 bit is found. A corrupted
+ * delta list could cause this step to run off the end of the delta_zone memory, so as extra
+ * protection against this happening, the tail guard list is set to all ones.
+ *
+ * The delta_index supports two different forms. The mutable form is created by
+ * uds_initialize_delta_index(), and is used for the volume index and for open chapter indexes. The
+ * immutable form is created by uds_initialize_delta_index_page(), and is used for closed (and
+ * cached) chapter index pages. The immutable form does not allocate delta list headers or
+ * temporary offsets, and thus is somewhat more memory efficient.
+ */
+
+/*
+ * This is the largest field size supported by get_field() and set_field(). Any field that is
+ * larger is not guaranteed to fit in a single byte-aligned u32. (With 8 bits per byte this is
+ * 25 bits: three whole bytes plus one bit.)
+ */
+#define MAX_FIELD_BITS ((sizeof(u32) - 1) * BITS_PER_BYTE + 1)
+
+/*
+ * This is the largest field size supported by get_big_field() and set_big_field(). Any field that
+ * is larger is not guaranteed to fit in a single byte-aligned u64. (With 8 bits per byte this is
+ * 57 bits: seven whole bytes plus one bit.)
+ */
+#define MAX_BIG_FIELD_BITS ((sizeof(u64) - 1) * BITS_PER_BYTE + 1)
+
+/*
+ * This is the number of guard bytes needed at the end of the memory byte array when using the bit
+ * utilities. These utilities call get_big_field() and set_big_field(), which can access up to 7
+ * bytes beyond the end of the desired field. The definition is written to make it clear how this
+ * value is derived.
+ */
+#define POST_FIELD_GUARD_BYTES (sizeof(u64) - 1)
+
+/* The number of guard bits that are needed in the tail guard list */
+#define GUARD_BITS (POST_FIELD_GUARD_BYTES * BITS_PER_BYTE)
+
+/*
+ * The maximum size of a single delta list in bytes. We count guard bytes in this value because a
+ * buffer of this size can be used with move_bits(). The U16_MAX term reflects the 16-bit size
+ * stored for each delta list.
+ */
+#define DELTA_LIST_MAX_BYTE_COUNT \
+ ((U16_MAX + BITS_PER_BYTE) / BITS_PER_BYTE + POST_FIELD_GUARD_BYTES)
+
+/* The number of extra bytes and bits needed to store a collision entry */
+#define COLLISION_BYTES UDS_RECORD_NAME_SIZE
+#define COLLISION_BITS (COLLISION_BYTES * BITS_PER_BYTE)
+
+/*
+ * Immutable delta lists are packed into pages containing a header that encodes the delta list
+ * information into 19 bits per list (64KB bit offset).
+ */
+#define IMMUTABLE_HEADER_SIZE 19
+
+/*
+ * Constants and structures for the saved delta index. "DI" is for delta_index, and -##### is a
+ * number to increment when the format of the data changes.
+ */
+#define MAGIC_SIZE 8
+
+/* The string has exactly MAGIC_SIZE characters; the terminating NUL is not counted. */
+static const char DELTA_INDEX_MAGIC[] = "DI-00002";
+
+/*
+ * Header for a saved delta index. NOTE(review): the code that reads and writes this header is
+ * not visible here; the field notes below are inferred from the names -- confirm.
+ */
+struct delta_index_header {
+ /* Presumably holds DELTA_INDEX_MAGIC, without a terminating NUL */
+ char magic[MAGIC_SIZE];
+ /* Presumably which zone this is, out of zone_count zones */
+ u32 zone_number;
+ u32 zone_count;
+ /* Presumably the first delta list saved, and how many lists follow */
+ u32 first_list;
+ u32 list_count;
+ /* Presumably the record and collision counts of the saved zone */
+ u64 record_count;
+ u64 collision_count;
+};
+
+/*
+ * Header data used for immutable delta index pages. This data is followed by the delta list offset
+ * table. The structure is packed, so the header occupies 20 bytes (8 + 8 + 2 + 2) with no padding.
+ */
+struct delta_page_header {
+ /* Externally-defined nonce */
+ u64 nonce;
+ /* The virtual chapter number */
+ u64 virtual_chapter_number;
+ /* Index of the first delta list on the page */
+ u16 first_list;
+ /* Number of delta lists on the page */
+ u16 list_count;
+} __packed;
+
+/* Return the index of the byte that contains the first bit of the delta list. */
+static inline u64 get_delta_list_byte_start(const struct delta_list *delta_list)
+{
+	u64 first_bit = delta_list->start;
+
+	return first_bit / BITS_PER_BYTE;
+}
+
+/*
+ * Return the number of bytes the delta list touches, counting the partial byte in which it
+ * starts as well as the one in which it ends.
+ */
+static inline u16 get_delta_list_byte_size(const struct delta_list *delta_list)
+{
+	/* Bits of the first byte that precede the list still belong to the span. */
+	unsigned int leading_bits = delta_list->start % BITS_PER_BYTE;
+	unsigned int span_bits = leading_bits + delta_list->size;
+
+	return BITS_TO_BYTES(span_bits);
+}
+
+/*
+ * Move the delta lists numbered first through last to the bit offsets recorded in new_offsets.
+ * The range is split in half recursively; the half that is processed first depends on which way
+ * the middle list is moving.
+ */
+static void rebalance_delta_zone(const struct delta_zone *delta_zone, u32 first,
+				 u32 last)
+{
+	u32 middle;
+
+	if (first == last) {
+		/* Only one list is moving, and we know there is space. */
+		struct delta_list *list = &delta_zone->delta_lists[first];
+		u64 target = delta_zone->new_offsets[first];
+		u64 old_byte;
+		u64 new_byte;
+
+		if (list->start == target)
+			return;
+
+		old_byte = get_delta_list_byte_start(list);
+		list->start = target;
+		new_byte = get_delta_list_byte_start(list);
+		/* The byte count is taken after the move, so it reflects the new bit offset. */
+		memmove(delta_zone->memory + new_byte, delta_zone->memory + old_byte,
+			get_delta_list_byte_size(list));
+		return;
+	}
+
+	/*
+	 * There is more than one list, so split the range. After this computation,
+	 * first <= middle and middle < last, so both halves are non-empty.
+	 */
+	middle = (first + last) / 2;
+
+	if (delta_zone->new_offsets[middle] > delta_zone->delta_lists[middle].start) {
+		/* The middle list moves up: handle the upper half before the lower half. */
+		rebalance_delta_zone(delta_zone, middle + 1, last);
+		rebalance_delta_zone(delta_zone, first, middle);
+	} else {
+		/* Otherwise handle the lower half before the upper half. */
+		rebalance_delta_zone(delta_zone, first, middle);
+		rebalance_delta_zone(delta_zone, middle + 1, last);
+	}
+}
+
+/* Return an equal share of memory_size for each zone, rounded up to a multiple of 64K. */
+static inline size_t get_zone_memory_size(unsigned int zone_count, size_t memory_size)
+{
+	const size_t boundary = 64 * 1024;
+	size_t share = memory_size / zone_count;
+
+	return (share + boundary - 1) & ~(boundary - 1);
+}
+
+/*
+ * Empty every delta list in every zone. Each zone keeps two extra delta list descriptors, one
+ * before the first real list and one after the last, so that the gap before and after any real
+ * list can be computed without bounds checking the array access.
+ */
+void uds_reset_delta_index(const struct delta_index *delta_index)
+{
+	unsigned int zone_number;
+
+	for (zone_number = 0; zone_number < delta_index->zone_count; zone_number++) {
+		struct delta_zone *zone = &delta_index->delta_zones[zone_number];
+		struct delta_list *lists = zone->delta_lists;
+		struct delta_list *tail_guard = &lists[zone->list_count + 1];
+		/* The bits available to real lists: everything ahead of the tail guard. */
+		u64 usable_bits = (u64) zone->size * BITS_PER_BYTE - GUARD_BITS;
+		u64 gap;
+		u64 next_start;
+		unsigned int list;
+
+		/* Zeroing the delta list headers initializes the head guard list correctly. */
+		memset(lists, 0, (zone->list_count + 2) * sizeof(struct delta_list));
+
+		/* Set all the bits in the end guard list. */
+		tail_guard->start = usable_bits;
+		tail_guard->size = GUARD_BITS;
+		memset(zone->memory + (usable_bits / BITS_PER_BYTE), ~0,
+		       POST_FIELD_GUARD_BYTES);
+
+		/* Evenly space out the real delta lists by setting regular offsets. */
+		gap = usable_bits / zone->list_count;
+		next_start = gap / 2;
+		for (list = 1; list <= zone->list_count; list++) {
+			lists[list].start = next_start;
+			next_start += gap;
+		}
+
+		/* Every record that was in the zone now counts as discarded. */
+		zone->discard_count += zone->record_count;
+		zone->record_count = 0;
+		zone->collision_count = 0;
+	}
+}
+
+/*
+ * Compute the Huffman coding parameters for the given mean delta. The code is described by
+ * three parameters:
+ *
+ * MINBITS The number of bits in the smallest code
+ * BASE The number of values coded using a code of length MINBITS
+ * INCR The number of values coded by using one additional bit
+ *
+ * which are related by
+ *
+ * BASE + INCR == 1 << MINBITS
+ *
+ * For an exponential distribution, the math for the Huffman code gives
+ *
+ * INCR = log(2) * MEAN_DELTA
+ *
+ * MINBITS is then the smallest value satisfying
+ *
+ * (1 << MINBITS) > INCR
+ *
+ * and finally
+ *
+ * BASE = (1 << MINBITS) - INCR
+ *
+ * The resulting code uses
+ * - MINBITS bits for the first BASE values,
+ * - MINBITS+1 bits for the next INCR values,
+ * - MINBITS+2 bits for the next INCR values,
+ * - (and so on).
+ *
+ * The results are returned as *min_bits (MINBITS), *min_keys (BASE), and *incr_keys (INCR).
+ */
+static void compute_coding_constants(u32 mean_delta, u16 *min_bits, u32 *min_keys, u32 *incr_keys)
+{
+	/*
+	 * This is the rounded value of log(2) * mean_delta. Floating point is not always
+	 * available, so a really good integer approximation is used instead.
+	 */
+	u32 increment = (836158UL * mean_delta + 603160UL) / 1206321UL;
+	u16 bits = bits_per(increment + 1);
+
+	*incr_keys = increment;
+	*min_bits = bits;
+	*min_keys = (1 << bits) - increment;
+}
+
+/*
+ * Free all of the memory owned by a delta index and zero the structure. Nothing is done if the
+ * index has no zones allocated.
+ */
+void uds_uninitialize_delta_index(struct delta_index *delta_index)
+{
+	struct delta_zone *zone;
+	struct delta_zone *end;
+
+	if (delta_index->delta_zones == NULL)
+		return;
+
+	end = delta_index->delta_zones + delta_index->zone_count;
+	for (zone = delta_index->delta_zones; zone < end; zone++) {
+		vdo_free(vdo_forget(zone->new_offsets));
+		vdo_free(vdo_forget(zone->delta_lists));
+		vdo_free(vdo_forget(zone->memory));
+	}
+
+	vdo_free(delta_index->delta_zones);
+	memset(delta_index, 0, sizeof(*delta_index));
+}
+
+/*
+ * Allocate the memory for one delta zone and set up its bookkeeping. If an allocation fails,
+ * earlier allocations are left in the zone for the caller to release.
+ */
+static int initialize_delta_zone(struct delta_zone *delta_zone, size_t size,
+				 u32 first_list, u32 list_count, u32 mean_delta,
+				 u32 payload_bits, u8 tag)
+{
+	/* Each per-list array has room for the real lists plus the two guard lists. */
+	const u32 descriptor_count = list_count + 2;
+	int status;
+
+	status = vdo_allocate(size, u8, "delta list", &delta_zone->memory);
+	if (status != VDO_SUCCESS)
+		return status;
+
+	status = vdo_allocate(descriptor_count, u64, "delta list temp",
+			      &delta_zone->new_offsets);
+	if (status != VDO_SUCCESS)
+		return status;
+
+	/* Allocate the delta lists. */
+	status = vdo_allocate(descriptor_count, struct delta_list, "delta lists",
+			      &delta_zone->delta_lists);
+	if (status != VDO_SUCCESS)
+		return status;
+
+	/* The shape of the zone. */
+	delta_zone->size = size;
+	delta_zone->first_list = first_list;
+	delta_zone->list_count = list_count;
+	delta_zone->tag = tag;
+	delta_zone->buffered_writer = NULL;
+
+	/* The encoding of each entry. */
+	delta_zone->value_bits = payload_bits;
+	compute_coding_constants(mean_delta, &delta_zone->min_bits,
+				 &delta_zone->min_keys, &delta_zone->incr_keys);
+
+	/* All statistics start at zero. */
+	delta_zone->rebalance_time = 0;
+	delta_zone->rebalance_count = 0;
+	delta_zone->record_count = 0;
+	delta_zone->collision_count = 0;
+	delta_zone->discard_count = 0;
+	delta_zone->overflow_count = 0;
+
+	return UDS_SUCCESS;
+}
+
+/*
+ * Set up a mutable delta index with zone_count zones sharing list_count delta lists.
+ *
+ * Each zone is given DIV_ROUND_UP(list_count, zone_count) lists, except the last zone, which takes
+ * whatever remains. If nothing remains for the last zone the request is rejected with
+ * UDS_INVALID_ARGUMENT. On any failure after the zone array is allocated, the index is
+ * uninitialized again before the error is returned. On success the index is reset and
+ * memory_size holds the total accounted for all zones.
+ */
+int uds_initialize_delta_index(struct delta_index *delta_index, unsigned int zone_count,
+			       u32 list_count, u32 mean_delta, u32 payload_bits,
+			       size_t memory_size, u8 tag)
+{
+	int result;
+	unsigned int zone;
+
+	result = vdo_allocate(zone_count, struct delta_zone, "Delta Index Zones",
+			      &delta_index->delta_zones);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	delta_index->zone_count = zone_count;
+	delta_index->list_count = list_count;
+	delta_index->lists_per_zone = DIV_ROUND_UP(list_count, zone_count);
+	delta_index->memory_size = 0;
+	delta_index->mutable = true;
+	delta_index->tag = tag;
+
+	for (zone = 0; zone < zone_count; zone++) {
+		const bool last_zone = (zone == zone_count - 1);
+		u32 zone_lists = delta_index->lists_per_zone;
+		u32 zone_first_list = zone * zone_lists;
+		size_t zone_memory;
+
+		/*
+		 * The last zone gets the leftover lists when zone_count does not divide
+		 * list_count evenly. The subtraction below would underflow if no lists were
+		 * left over, so refuse that configuration.
+		 */
+		if (last_zone && (delta_index->list_count <= zone_first_list)) {
+			uds_uninitialize_delta_index(delta_index);
+			return vdo_log_error_strerror(UDS_INVALID_ARGUMENT,
+						      "%u delta lists not enough for %u zones",
+						      list_count, zone_count);
+		}
+
+		if (last_zone)
+			zone_lists = delta_index->list_count - zone_first_list;
+
+		zone_memory = get_zone_memory_size(zone_count, memory_size);
+		result = initialize_delta_zone(&delta_index->delta_zones[zone], zone_memory,
+					       zone_first_list, zone_lists,
+					       mean_delta, payload_bits, tag);
+		if (result != UDS_SUCCESS) {
+			uds_uninitialize_delta_index(delta_index);
+			return result;
+		}
+
+		/* Account for the zone structure, its memory, and its two per-list arrays. */
+		delta_index->memory_size +=
+			(sizeof(struct delta_zone) + zone_memory +
+			 (zone_lists + 2) * (sizeof(struct delta_list) + sizeof(u64)));
+	}
+
+	uds_reset_delta_index(delta_index);
+	return UDS_SUCCESS;
+}
+
+/*
+ * Read a bit field of the given size (in bits) that begins at an arbitrary bit offset. The field
+ * is taken from the 32-bit little-endian word loaded at the byte containing the offset.
+ */
+static inline u32 get_field(const u8 *memory, u64 offset, u8 size)
+{
+	const u8 *byte_addr = memory + offset / BITS_PER_BYTE;
+	u32 word = get_unaligned_le32(byte_addr);
+
+	/* Drop the bits below the field, then keep only the low size bits. */
+	word >>= (offset % BITS_PER_BYTE);
+	return word & ((1 << size) - 1);
+}
+
+/*
+ * Write a bit field of the given size (in bits) at an arbitrary bit offset, using a
+ * read-modify-write of the 32-bit little-endian word at the byte containing the offset. The value
+ * is not masked here, so it must already fit in size bits.
+ */
+static inline void set_field(u32 value, u8 *memory, u64 offset, u8 size)
+{
+	u8 *byte_addr = memory + offset / BITS_PER_BYTE;
+	int bit_shift = offset % BITS_PER_BYTE;
+	u32 field_mask = ((1 << size) - 1) << bit_shift;
+	u32 word = get_unaligned_le32(byte_addr);
+
+	word = (word & ~field_mask) | (value << bit_shift);
+	put_unaligned_le32(word, byte_addr);
+}
+
+/*
+ * Get the bit offset of the immutable delta list header for list_number. The headers are
+ * IMMUTABLE_HEADER_SIZE bits each and follow directly after the delta_page_header.
+ */
+static inline u32 get_immutable_header_offset(u32 list_number)
+{
+	const size_t page_header_bits = sizeof(struct delta_page_header) * BITS_PER_BYTE;
+
+	return page_header_bits + list_number * IMMUTABLE_HEADER_SIZE;
+}
+
+/*
+ * Get the bit offset at which the bit stream of an immutable delta list starts, as recorded in
+ * that list's header.
+ */
+static inline u32 get_immutable_start(const u8 *memory, u32 list_number)
+{
+	u32 header_offset = get_immutable_header_offset(list_number);
+
+	return get_field(memory, header_offset, IMMUTABLE_HEADER_SIZE);
+}
+
+/*
+ * Record, in the header for list_number, the bit offset at which that immutable delta list's bit
+ * stream starts.
+ */
+static inline void set_immutable_start(u8 *memory, u32 list_number, u32 start)
+{
+	u32 header_offset = get_immutable_header_offset(list_number);
+
+	set_field(start, memory, header_offset, IMMUTABLE_HEADER_SIZE);
+}
+
+/*
+ * Decide whether a memory page holds a well-formed immutable delta index page, using header
+ * values the caller has already decoded. Returns true only when every check passes.
+ */
+static bool verify_delta_index_page(u64 nonce, u16 list_count, u64 expected_nonce,
+				    u8 *memory, size_t memory_size)
+{
+	const size_t header_capacity = ((memory_size - sizeof(struct delta_page_header)) *
+					BITS_PER_BYTE / IMMUTABLE_HEADER_SIZE);
+	const size_t data_limit = (memory_size - POST_FIELD_GUARD_BYTES) * BITS_PER_BYTE;
+	const u8 *guard = memory + memory_size - POST_FIELD_GUARD_BYTES;
+	unsigned int n;
+
+	/*
+	 * The nonce must match. A mismatch can happen during rebuild if the entire volume has
+	 * not been written at least once.
+	 */
+	if (nonce != expected_nonce)
+		return false;
+
+	/* The page must be able to hold this many delta list headers. */
+	if (list_count > header_capacity)
+		return false;
+
+	/* The first delta list must begin right after the last delta list header. */
+	if (get_immutable_start(memory, 0) != get_immutable_header_offset(list_count + 1))
+		return false;
+
+	/* The list start offsets must never decrease. */
+	for (n = 0; n < list_count; n++) {
+		if (get_immutable_start(memory, n) > get_immutable_start(memory, n + 1))
+			return false;
+	}
+
+	/* The last list must end on the page, leaving room for the post-field guard bytes. */
+	if (get_immutable_start(memory, list_count) > data_limit)
+		return false;
+
+	/* Every guard byte must be all ones. */
+	for (n = 0; n < POST_FIELD_GUARD_BYTES; n++) {
+		if (guard[n] != (u8) ~0)
+			return false;
+	}
+
+	return true;
+}
+
+/*
+ * Initialize a delta index page to refer to a supplied page of memory.
+ *
+ * The page header is decoded as little endian first and, if the page does not verify that way, as
+ * big endian. If neither decoding verifies, UDS_CORRUPT_DATA is returned and delta_index_page is
+ * not modified. Otherwise the page is set up as a single-zone, immutable delta index whose zone
+ * memory is the supplied page; no memory is allocated.
+ */
+int uds_initialize_delta_index_page(struct delta_index_page *delta_index_page,
+				    u64 expected_nonce, u32 mean_delta, u32 payload_bits,
+				    u8 *memory, size_t memory_size)
+{
+	struct delta_page_header *header = (struct delta_page_header *) memory;
+	const u8 *nonce_addr = (const u8 *) &header->nonce;
+	const u8 *vcn_addr = (const u8 *) &header->virtual_chapter_number;
+	const u8 *first_list_addr = (const u8 *) &header->first_list;
+	const u8 *list_count_addr = (const u8 *) &header->list_count;
+	struct delta_index *index = &delta_index_page->delta_index;
+	struct delta_zone *delta_zone = &delta_index_page->delta_zone;
+	u64 nonce;
+	u64 vcn;
+	u64 first_list;
+	u64 list_count;
+	bool valid;
+
+	/* Try the little endian interpretation of the header first. */
+	nonce = get_unaligned_le64(nonce_addr);
+	vcn = get_unaligned_le64(vcn_addr);
+	first_list = get_unaligned_le16(first_list_addr);
+	list_count = get_unaligned_le16(list_count_addr);
+	valid = verify_delta_index_page(nonce, list_count, expected_nonce, memory,
+					memory_size);
+	if (!valid) {
+		/* Fall back to the big endian interpretation. */
+		nonce = get_unaligned_be64(nonce_addr);
+		vcn = get_unaligned_be64(vcn_addr);
+		first_list = get_unaligned_be16(first_list_addr);
+		list_count = get_unaligned_be16(list_count_addr);
+		valid = verify_delta_index_page(nonce, list_count, expected_nonce, memory,
+						memory_size);
+	}
+
+	/*
+	 * Neither interpretation verified. This is deliberately not logged as an error, because
+	 * it can happen during a rebuild if the entire volume has not been written at least
+	 * once.
+	 */
+	if (!valid)
+		return UDS_CORRUPT_DATA;
+
+	index->delta_zones = delta_zone;
+	index->zone_count = 1;
+	index->list_count = list_count;
+	index->lists_per_zone = list_count;
+	index->mutable = false;
+	index->tag = 'p';
+
+	delta_index_page->virtual_chapter_number = vcn;
+	delta_index_page->lowest_list_number = first_list;
+	delta_index_page->highest_list_number = first_list + list_count - 1;
+
+	/* The zone uses the caller's page directly and has no list or offset arrays. */
+	delta_zone->memory = memory;
+	delta_zone->size = memory_size;
+	delta_zone->delta_lists = NULL;
+	delta_zone->new_offsets = NULL;
+	delta_zone->buffered_writer = NULL;
+	delta_zone->first_list = 0;
+	delta_zone->list_count = list_count;
+	delta_zone->tag = 'p';
+
+	delta_zone->value_bits = payload_bits;
+	compute_coding_constants(mean_delta, &delta_zone->min_bits,
+				 &delta_zone->min_keys, &delta_zone->incr_keys);
+
+	delta_zone->rebalance_time = 0;
+	delta_zone->rebalance_count = 0;
+	delta_zone->record_count = 0;
+	delta_zone->collision_count = 0;
+	delta_zone->discard_count = 0;
+	delta_zone->overflow_count = 0;
+
+	return UDS_SUCCESS;
+}
+
+/*
+ * Read a large bit field from an arbitrary bit boundary.
+ *
+ * The field is taken from the 64-bit little-endian word loaded at the byte containing the offset.
+ * Up to 7 low bits of that word precede the field, so a field can be at most 57 bits wide.
+ *
+ * The mask is built from a 64-bit constant (1ULL). An unsigned long constant is only 32 bits wide
+ * on ILP32 targets, where shifting it by a wider field size would be undefined.
+ */
+static inline u64 get_big_field(const u8 *memory, u64 offset, u8 size)
+{
+	const void *addr = memory + offset / BITS_PER_BYTE;
+
+	return (get_unaligned_le64(addr) >> (offset % BITS_PER_BYTE)) & ((1ULL << size) - 1);
+}
+
+/*
+ * Write a large bit field to an arbitrary bit boundary.
+ *
+ * This is a read-modify-write of the 64-bit little-endian word at the byte containing the offset:
+ * the size bits of the field are cleared and the value is ORed in. The value is not masked here,
+ * so it must already fit in size bits.
+ *
+ * The mask is built from a 64-bit constant (1ULL). An unsigned long constant is only 32 bits wide
+ * on ILP32 targets, where shifting it by a wider field size would be undefined.
+ */
+static inline void set_big_field(u64 value, u8 *memory, u64 offset, u8 size)
+{
+	void *addr = memory + offset / BITS_PER_BYTE;
+	u8 shift = offset % BITS_PER_BYTE;
+	u64 data = get_unaligned_le64(addr);
+
+	data &= ~(((1ULL << size) - 1) << shift);
+	data |= value << shift;
+	put_unaligned_le64(data, addr);
+}
+
+/*
+ * Set a sequence of bits to all zeros. Clears size bits of memory starting at bit position
+ * offset. A size of zero leaves memory untouched.
+ */
+static inline void set_zero(u8 *memory, u64 offset, u32 size)
+{
+ if (size > 0) {
+ u8 *addr = memory + offset / BITS_PER_BYTE;
+ u8 shift = offset % BITS_PER_BYTE;
+ /* Bits to clear in the first byte: up to the end of that byte, or all of size. */
+ u32 count = size + shift > BITS_PER_BYTE ? (u32) BITS_PER_BYTE - shift : size;
+
+ *addr++ &= ~(((1 << count) - 1) << shift);
+ /* Zero whole bytes for as long as more than one byte's worth of bits remains. */
+ for (size -= count; size > BITS_PER_BYTE; size -= BITS_PER_BYTE)
+ *addr++ = 0;
+
+ /* Clear the low size bits of the final byte, keeping any bits above them. */
+ if (size > 0)
+ *addr &= 0xFF << size;
+ }
+}
+
+/*
+ * Move several bits from a higher to a lower address, moving the lower addressed bits first. The
+ * size and memory offsets are measured in bits.
+ *
+ * The first step always moves a leading field without checking size against it. The only caller
+ * in view, move_bits(), calls this function only when size is greater than MAX_BIG_FIELD_BITS.
+ */
+static void move_bits_down(const u8 *from, u64 from_offset, u8 *to, u64 to_offset, u32 size)
+{
+ const u8 *source;
+ u8 *destination;
+ u8 offset;
+ u8 count;
+ u64 field;
+
+ /*
+ * Start by moving one field that ends on a destination int boundary: count is chosen so that
+ * to_offset + count is a multiple of 32.
+ */
+ count = (MAX_BIG_FIELD_BITS - ((to_offset + MAX_BIG_FIELD_BITS) % BITS_PER_TYPE(u32)));
+ field = get_big_field(from, from_offset, count);
+ set_big_field(field, to, to_offset, count);
+ from_offset += count;
+ to_offset += count;
+ size -= count;
+
+ /*
+ * Now do the main loop to copy 32 bit chunks that are int-aligned at the destination. Each
+ * pass loads 64 bits from the source, shifts out the offset bits that precede the wanted
+ * data, and stores the low 32 bits of what is left.
+ */
+ offset = from_offset % BITS_PER_TYPE(u32);
+ source = from + (from_offset - offset) / BITS_PER_BYTE;
+ destination = to + to_offset / BITS_PER_BYTE;
+ while (size > MAX_BIG_FIELD_BITS) {
+ put_unaligned_le32(get_unaligned_le64(source) >> offset, destination);
+ source += sizeof(u32);
+ destination += sizeof(u32);
+ from_offset += BITS_PER_TYPE(u32);
+ to_offset += BITS_PER_TYPE(u32);
+ size -= BITS_PER_TYPE(u32);
+ }
+
+ /* Finish up by moving any remaining bits. */
+ if (size > 0) {
+ field = get_big_field(from, from_offset, size);
+ set_big_field(field, to, to_offset, size);
+ }
+}
+
+/*
+ * Move several bits from a lower to a higher address, moving the higher addressed bits first. The
+ * size and memory offsets are measured in bits.
+ */
+static void move_bits_up(const u8 *from, u64 from_offset, u8 *to, u64 to_offset, u32 size)
+{
+ const u8 *source;
+ u8 *destination;
+ u8 offset;
+ u8 count;
+ u64 field;
+
+ /*
+ * Start by moving one field that begins on a destination int boundary. These are the
+ * count highest-addressed bits; afterwards to_offset + size is a multiple of 32.
+ */
+ count = (to_offset + size) % BITS_PER_TYPE(u32);
+ if (count > 0) {
+ size -= count;
+ field = get_big_field(from, from_offset + size, count);
+ set_big_field(field, to, to_offset + size, count);
+ }
+
+ /*
+ * Now do the main loop to copy 32 bit chunks that are int-aligned at the destination. The
+ * pointers start just past the bits still to be moved and step backwards, so the highest
+ * remaining chunk is always copied next.
+ */
+ offset = (from_offset + size) % BITS_PER_TYPE(u32);
+ source = from + (from_offset + size - offset) / BITS_PER_BYTE;
+ destination = to + (to_offset + size) / BITS_PER_BYTE;
+ while (size > MAX_BIG_FIELD_BITS) {
+ source -= sizeof(u32);
+ destination -= sizeof(u32);
+ size -= BITS_PER_TYPE(u32);
+ put_unaligned_le32(get_unaligned_le64(source) >> offset, destination);
+ }
+
+ /* Finish up by moving any remaining bits, which are the lowest-addressed ones. */
+ if (size > 0) {
+ field = get_big_field(from, from_offset, size);
+ set_big_field(field, to, to_offset, size);
+ }
+}
+
+/*
+ * Copy size bits from one bit field to another. Overlapping fields are handled as if the bits
+ * were first copied out to a temporary and then copied in to the destination. The size and both
+ * offsets are measured in bits.
+ */
+static void move_bits(const u8 *from, u64 from_offset, u8 *to, u64 to_offset, u32 size)
+{
+	if (size == 0)
+		return;
+
+	/* Anything that fits in one big field is moved with a single read and write. */
+	if (size <= MAX_BIG_FIELD_BITS) {
+		u64 field = get_big_field(from, from_offset, size);
+
+		set_big_field(field, to, to_offset, size);
+		return;
+	}
+
+	/* Pick the copy direction from the relative position of source and destination. */
+	if (from_offset > to_offset)
+		move_bits_down(from, from_offset, to, to_offset, size);
+	else
+		move_bits_up(from, from_offset, to, to_offset, size);
+}
+
+/*
+ * Pack delta lists from a mutable delta index into an immutable delta index page.
+ *
+ * Starting at first_list, as many consecutive delta lists from zone 0 as will fit are copied into
+ * the supplied memory page, preceded by the page header and the table of list start offsets and
+ * followed by guard bytes set to all ones. The number of lists packed is returned in list_count.
+ *
+ * Returns UDS_OVERFLOW, without writing list_count or the page, if the page cannot hold even one
+ * delta list offset. Otherwise returns UDS_SUCCESS.
+ */
+int uds_pack_delta_index_page(const struct delta_index *delta_index, u64 header_nonce,
+			      u8 *memory, size_t memory_size, u64 virtual_chapter_number,
+			      u32 first_list, u32 *list_count)
+{
+	const struct delta_zone *delta_zone = &delta_index->delta_zones[0];
+	struct delta_list *delta_lists = &delta_zone->delta_lists[first_list + 1];
+	struct delta_page_header *header = (struct delta_page_header *) memory;
+	const u32 max_lists = delta_index->list_count - first_list;
+	u32 n_lists;
+	u32 next_start;
+	u32 list;
+	int free_bits;
+
+	/*
+	 * The space available for delta lists is the page size less the fixed header, one delta
+	 * list offset, and the guard bytes.
+	 */
+	free_bits = memory_size * BITS_PER_BYTE;
+	free_bits -= get_immutable_header_offset(1);
+	free_bits -= GUARD_BITS;
+	if (free_bits < IMMUTABLE_HEADER_SIZE) {
+		/* This page is too small to store any delta lists. */
+		return vdo_log_error_strerror(UDS_OVERFLOW,
+					      "Chapter Index Page of %zu bytes is too small",
+					      memory_size);
+	}
+
+	/* Count the lists that fit; each one costs an offset entry plus its own data. */
+	for (n_lists = 0; n_lists < max_lists; n_lists++) {
+		int needed_bits = IMMUTABLE_HEADER_SIZE + delta_lists[n_lists].size;
+
+		if (needed_bits > free_bits)
+			break;
+
+		free_bits -= needed_bits;
+	}
+
+	*list_count = n_lists;
+
+	/* The page header is always stored little endian. */
+	put_unaligned_le64(header_nonce, (u8 *) &header->nonce);
+	put_unaligned_le64(virtual_chapter_number,
+			   (u8 *) &header->virtual_chapter_number);
+	put_unaligned_le16(first_list, (u8 *) &header->first_list);
+	put_unaligned_le16(n_lists, (u8 *) &header->list_count);
+
+	/* Build the offset table; entry n_lists marks the end of the last list. */
+	next_start = get_immutable_header_offset(n_lists + 1);
+	set_immutable_start(memory, 0, next_start);
+	for (list = 0; list < n_lists; list++) {
+		next_start += delta_lists[list].size;
+		set_immutable_start(memory, list + 1, next_start);
+	}
+
+	/* Copy each list's bits to the position recorded for it in the table. */
+	for (list = 0; list < n_lists; list++) {
+		move_bits(delta_zone->memory, delta_lists[list].start, memory,
+			  get_immutable_start(memory, list), delta_lists[list].size);
+	}
+
+	/* Set all the bits in the guard bytes. */
+	memset(memory + memory_size - POST_FIELD_GUARD_BYTES, ~0,
+	       POST_FIELD_GUARD_BYTES);
+	return UDS_SUCCESS;
+}
+
+/*
+ * Compute the new offsets of the delta lists, storing them in delta_zone->new_offsets. The
+ * delta lists themselves are not modified here.
+ *
+ * used_space is the number of bytes currently occupied by the lists. The unused bytes are divided
+ * evenly into a per-list spacing. The list at growing_index is given growing_size additional
+ * bytes in front of it.
+ */
+static void compute_new_list_offsets(struct delta_zone *delta_zone, u32 growing_index,
+ size_t growing_size, size_t used_space)
+{
+ size_t spacing;
+ u32 i;
+ struct delta_list *delta_lists = delta_zone->delta_lists;
+ u32 tail_guard_index = delta_zone->list_count + 1;
+
+ spacing = (delta_zone->size - used_space) / delta_zone->list_count;
+ delta_zone->new_offsets[0] = 0;
+ for (i = 0; i <= delta_zone->list_count; i++) {
+ /* Entry i + 1 is first computed in bytes: list i's offset, its byte size, and a gap. */
+ delta_zone->new_offsets[i + 1] =
+ (delta_zone->new_offsets[i] +
+ get_delta_list_byte_size(&delta_lists[i]) + spacing);
+ /*
+ * Entry i is final now: convert it from bytes to bits and keep the list's current
+ * bit position within its byte.
+ */
+ delta_zone->new_offsets[i] *= BITS_PER_BYTE;
+ delta_zone->new_offsets[i] += delta_lists[i].start % BITS_PER_BYTE;
+ /* The gap after list 0 is only half the normal spacing. */
+ if (i == 0)
+ delta_zone->new_offsets[i + 1] -= spacing / 2;
+ /* Entry i + 1 is still in bytes here, so growing_size is added as bytes. */
+ if (i + 1 == growing_index)
+ delta_zone->new_offsets[i + 1] += growing_size;
+ }
+
+ /* The tail guard list is placed so that it ends exactly at the end of the zone memory. */
+ delta_zone->new_offsets[tail_guard_index] =
+ (delta_zone->size * BITS_PER_BYTE - delta_lists[tail_guard_index].size);
+}
+
+/*
+ * Spread the delta lists of a zone evenly across the zone's memory. Only the start offsets of
+ * lists 1 through list_count + 1 are updated; no list data is moved.
+ */
+static void rebalance_lists(struct delta_zone *delta_zone)
+{
+	struct delta_list *lists = delta_zone->delta_lists;
+	size_t bytes_in_use = 0;
+	u32 n;
+
+	/* Total the space occupied by every list, including entries 0 and list_count + 1. */
+	for (n = 0; n <= delta_zone->list_count + 1; n++)
+		bytes_in_use += get_delta_list_byte_size(&lists[n]);
+
+	/* No list is growing, so the free space is simply shared out. */
+	compute_new_list_offsets(delta_zone, 0, 0, bytes_in_use);
+
+	for (n = 1; n <= delta_zone->list_count + 1; n++)
+		lists[n].start = delta_zone->new_offsets[n];
+}
+
+/*
+ * Start restoring a delta index from multiple input streams.
+ *
+ * One delta_index_header is read from each reader and cross-checked: magic number, zone count,
+ * zone number, contiguous list ranges, total list count, and collisions not exceeding records.
+ * The index is then reset, and the saved size of every delta list is read into the zone that owns
+ * the list. The number of non-empty lists supplied by each reader is counted in load_lists[].
+ * Finally every zone is rebalanced so that it is ready to receive the list data.
+ *
+ * Returns UDS_SUCCESS; UDS_INVALID_ARGUMENT if reader_count exceeds MAX_ZONES, the capacity of
+ * the per-zone arrays used here; UDS_CORRUPT_DATA for an inconsistent header; or the error from
+ * a failed read or assertion.
+ */
+int uds_start_restoring_delta_index(struct delta_index *delta_index,
+				    struct buffered_reader **buffered_readers,
+				    unsigned int reader_count)
+{
+	int result;
+	unsigned int zone_count = reader_count;
+	u64 record_count = 0;
+	u64 collision_count = 0;
+	u32 first_list[MAX_ZONES];
+	u32 list_count[MAX_ZONES];
+	unsigned int z;
+	u32 list_next = 0;
+	const struct delta_zone *delta_zone;
+
+	/* The arrays above are indexed by reader number, so never accept more readers. */
+	if (reader_count > MAX_ZONES) {
+		return vdo_log_warning_strerror(UDS_INVALID_ARGUMENT,
+						"%u delta index readers exceed the limit of %u",
+						reader_count, (unsigned int) MAX_ZONES);
+	}
+
+	/* Read and validate each header. */
+	for (z = 0; z < zone_count; z++) {
+		struct delta_index_header header;
+		u8 buffer[sizeof(struct delta_index_header)];
+		size_t offset = 0;
+
+		result = uds_read_from_buffered_reader(buffered_readers[z], buffer,
+						       sizeof(buffer));
+		if (result != UDS_SUCCESS) {
+			return vdo_log_warning_strerror(result,
+							"failed to read delta index header");
+		}
+
+		/* The saved header is the magic followed by little-endian fields. */
+		memcpy(&header.magic, buffer, MAGIC_SIZE);
+		offset += MAGIC_SIZE;
+		decode_u32_le(buffer, &offset, &header.zone_number);
+		decode_u32_le(buffer, &offset, &header.zone_count);
+		decode_u32_le(buffer, &offset, &header.first_list);
+		decode_u32_le(buffer, &offset, &header.list_count);
+		decode_u64_le(buffer, &offset, &header.record_count);
+		decode_u64_le(buffer, &offset, &header.collision_count);
+
+		result = VDO_ASSERT(offset == sizeof(struct delta_index_header),
+				    "%zu bytes decoded of %zu expected", offset,
+				    sizeof(struct delta_index_header));
+		if (result != VDO_SUCCESS) {
+			return vdo_log_warning_strerror(result,
+							"failed to read delta index header");
+		}
+
+		if (memcmp(header.magic, DELTA_INDEX_MAGIC, MAGIC_SIZE) != 0) {
+			return vdo_log_warning_strerror(UDS_CORRUPT_DATA,
+							"delta index file has bad magic number");
+		}
+
+		if (zone_count != header.zone_count) {
+			return vdo_log_warning_strerror(UDS_CORRUPT_DATA,
+							"delta index files contain mismatched zone counts (%u,%u)",
+							zone_count, header.zone_count);
+		}
+
+		if (header.zone_number != z) {
+			return vdo_log_warning_strerror(UDS_CORRUPT_DATA,
+							"delta index zone %u found in slot %u",
+							header.zone_number, z);
+		}
+
+		first_list[z] = header.first_list;
+		list_count[z] = header.list_count;
+		record_count += header.record_count;
+		collision_count += header.collision_count;
+
+		/* Each saved zone must start where the previous one ended. */
+		if (first_list[z] != list_next) {
+			return vdo_log_warning_strerror(UDS_CORRUPT_DATA,
+							"delta index file for zone %u starts with list %u instead of list %u",
+							z, first_list[z], list_next);
+		}
+
+		list_next += list_count[z];
+	}
+
+	if (list_next != delta_index->list_count) {
+		return vdo_log_warning_strerror(UDS_CORRUPT_DATA,
+						"delta index files contain %u delta lists instead of %u delta lists",
+						list_next, delta_index->list_count);
+	}
+
+	if (collision_count > record_count) {
+		return vdo_log_warning_strerror(UDS_CORRUPT_DATA,
+						"delta index files contain %llu collisions and %llu records",
+						(unsigned long long) collision_count,
+						(unsigned long long) record_count);
+	}
+
+	/* The totals from all of the saved zones are recorded in zone 0. */
+	uds_reset_delta_index(delta_index);
+	delta_index->delta_zones[0].record_count = record_count;
+	delta_index->delta_zones[0].collision_count = collision_count;
+
+	/* Read the delta lists and distribute them to the proper zones. */
+	for (z = 0; z < zone_count; z++) {
+		u32 i;
+
+		delta_index->load_lists[z] = 0;
+		for (i = 0; i < list_count[z]; i++) {
+			u16 delta_list_size;
+			u32 list_number;
+			unsigned int zone_number;
+			u8 size_data[sizeof(u16)];
+
+			result = uds_read_from_buffered_reader(buffered_readers[z],
+							       size_data,
+							       sizeof(size_data));
+			if (result != UDS_SUCCESS) {
+				return vdo_log_warning_strerror(result,
+								"failed to read delta index size");
+			}
+
+			/* Only non-empty lists have data to load from this reader later. */
+			delta_list_size = get_unaligned_le16(size_data);
+			if (delta_list_size > 0)
+				delta_index->load_lists[z] += 1;
+
+			/*
+			 * The zone that owns the list now is derived from this index's own
+			 * lists_per_zone, not from the saved zone number.
+			 */
+			list_number = first_list[z] + i;
+			zone_number = list_number / delta_index->lists_per_zone;
+			delta_zone = &delta_index->delta_zones[zone_number];
+			list_number -= delta_zone->first_list;
+			delta_zone->delta_lists[list_number + 1].size = delta_list_size;
+		}
+	}
+
+	/* Prepare each zone to start receiving the delta list data. */
+	for (z = 0; z < delta_index->zone_count; z++)
+		rebalance_lists(&delta_index->delta_zones[z]);
+
+	return UDS_SUCCESS;
+}
+
+/*
+ * Copy the saved bits of one delta list into its place in a zone's memory.
+ *
+ * The list named by save_info must belong to this zone, must have been given a non-zero size, and
+ * its saved byte count must match the count implied by that size and the saved bit offset.
+ * Returns UDS_CORRUPT_DATA if any of these checks fails, and UDS_SUCCESS otherwise.
+ */
+static int restore_delta_list_to_zone(struct delta_zone *delta_zone,
+				      const struct delta_list_save_info *save_info,
+				      const u8 *data)
+{
+	struct delta_list *delta_list;
+	/*
+	 * These are deliberately wider than 16 bits. The list size plus the bit offset can
+	 * exceed U16_MAX, and a wrapped total could match a small saved byte count and allow a
+	 * copy much longer than the data that was read.
+	 */
+	u32 bit_count;
+	u32 byte_count;
+	u32 list_number = save_info->index - delta_zone->first_list;
+
+	if (list_number >= delta_zone->list_count) {
+		return vdo_log_warning_strerror(UDS_CORRUPT_DATA,
+						"invalid delta list number %u not in range [%u,%u)",
+						save_info->index, delta_zone->first_list,
+						delta_zone->first_list + delta_zone->list_count);
+	}
+
+	delta_list = &delta_zone->delta_lists[list_number + 1];
+	if (delta_list->size == 0) {
+		return vdo_log_warning_strerror(UDS_CORRUPT_DATA,
+						"unexpected delta list number %u",
+						save_info->index);
+	}
+
+	bit_count = (u32) delta_list->size + save_info->bit_offset;
+	byte_count = BITS_TO_BYTES(bit_count);
+	if (save_info->byte_count != byte_count) {
+		return vdo_log_warning_strerror(UDS_CORRUPT_DATA,
+						"unexpected delta list size %u != %u",
+						save_info->byte_count, byte_count);
+	}
+
+	move_bits(data, save_info->bit_offset, delta_zone->memory, delta_list->start,
+		  delta_list->size);
+	return UDS_SUCCESS;
+}
+
+/*
+ * Read one saved delta list (its save info record followed by its data bytes) from a reader and
+ * restore it into the zone that owns it. The data bytes are read into the caller's buffer. The
+ * count of lists still expected from load_zone is decremented once the data has been read.
+ */
+static int restore_delta_list_data(struct delta_index *delta_index, unsigned int load_zone,
+				   struct buffered_reader *buffered_reader, u8 *data)
+{
+	u8 raw_info[sizeof(struct delta_list_save_info)];
+	struct delta_list_save_info save_info;
+	struct delta_zone *owner;
+	int result;
+
+	result = uds_read_from_buffered_reader(buffered_reader, raw_info, sizeof(raw_info));
+	if (result != UDS_SUCCESS) {
+		return vdo_log_warning_strerror(result,
+						"failed to read delta list data");
+	}
+
+	/* The record layout is tag, bit offset, 16-bit byte count, 32-bit list index. */
+	save_info = (struct delta_list_save_info) {
+		.tag = raw_info[0],
+		.bit_offset = raw_info[1],
+		.byte_count = get_unaligned_le16(&raw_info[2]),
+		.index = get_unaligned_le32(&raw_info[4]),
+	};
+
+	if ((save_info.byte_count > DELTA_LIST_MAX_BYTE_COUNT) ||
+	    (save_info.bit_offset >= BITS_PER_BYTE)) {
+		return vdo_log_warning_strerror(UDS_CORRUPT_DATA,
+						"corrupt delta list data");
+	}
+
+	/* Make sure the data is intended for this delta index. */
+	if (save_info.tag != delta_index->tag)
+		return UDS_CORRUPT_DATA;
+
+	if (save_info.index >= delta_index->list_count) {
+		return vdo_log_warning_strerror(UDS_CORRUPT_DATA,
+						"invalid delta list number %u of %u",
+						save_info.index,
+						delta_index->list_count);
+	}
+
+	result = uds_read_from_buffered_reader(buffered_reader, data,
+					       save_info.byte_count);
+	if (result != UDS_SUCCESS) {
+		return vdo_log_warning_strerror(result,
+						"failed to read delta list data");
+	}
+
+	delta_index->load_lists[load_zone] -= 1;
+	owner = &delta_index->delta_zones[save_info.index / delta_index->lists_per_zone];
+	return restore_delta_list_to_zone(owner, &save_info, data);
+}
+
+/*
+ * Restore delta lists from saved data. For each reader, lists are read until that reader's
+ * load_lists count reaches zero or a restore fails. A failure ends the work on that reader only;
+ * the remaining readers are still processed, and the most recent failure is returned.
+ */
+int uds_finish_restoring_delta_index(struct delta_index *delta_index,
+				     struct buffered_reader **buffered_readers,
+				     unsigned int reader_count)
+{
+	int saved_result = UDS_SUCCESS;
+	unsigned int reader;
+	u8 *data;
+	int result;
+
+	/* One scratch buffer, big enough for any single list, is shared by all reads. */
+	result = vdo_allocate(DELTA_LIST_MAX_BYTE_COUNT, u8, __func__, &data);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	for (reader = 0; reader < reader_count; reader++) {
+		result = UDS_SUCCESS;
+		while ((result == UDS_SUCCESS) && (delta_index->load_lists[reader] > 0)) {
+			result = restore_delta_list_data(delta_index, reader,
+							 buffered_readers[reader], data);
+		}
+
+		if (result != UDS_SUCCESS)
+			saved_result = result;
+	}
+
+	vdo_free(data);
+	return saved_result;
+}
+
+/*
+ * Read one save info record from each reader and confirm that it is a guard record, recognized by
+ * a tag byte of 'z'. Returns the read error, or UDS_CORRUPT_DATA for the first record that is not
+ * a guard.
+ */
+int uds_check_guard_delta_lists(struct buffered_reader **buffered_readers,
+				unsigned int reader_count)
+{
+	u8 record[sizeof(struct delta_list_save_info)];
+	unsigned int reader;
+
+	for (reader = 0; reader < reader_count; reader++) {
+		int result = uds_read_from_buffered_reader(buffered_readers[reader], record,
+							   sizeof(record));
+
+		if (result != UDS_SUCCESS)
+			return result;
+
+		if (record[0] != 'z')
+			return UDS_CORRUPT_DATA;
+	}
+
+	return UDS_SUCCESS;
+}
+
+/*
+ * Write one delta list of a zone to the zone's buffered writer: first a save info record (tag,
+ * bit offset within the first byte, byte count, and index-wide list number), then the bytes of
+ * zone memory that the list occupies.
+ */
+static int flush_delta_list(struct delta_zone *zone, u32 flush_index)
+{
+	struct delta_list *list = &zone->delta_lists[flush_index + 1];
+	u8 record[sizeof(struct delta_list_save_info)];
+	int result;
+
+	record[0] = zone->tag;
+	record[1] = list->start % BITS_PER_BYTE;
+	put_unaligned_le16(get_delta_list_byte_size(list), &record[2]);
+	put_unaligned_le32(zone->first_list + flush_index, &record[4]);
+
+	result = uds_write_to_buffered_writer(zone->buffered_writer, record,
+					      sizeof(record));
+	if (result == UDS_SUCCESS) {
+		result = uds_write_to_buffered_writer(zone->buffered_writer,
+						      zone->memory + get_delta_list_byte_start(list),
+						      get_delta_list_byte_size(list));
+	}
+
+	/* Either write failing is reported the same way. */
+	if (result != UDS_SUCCESS)
+		vdo_log_warning_strerror(result, "failed to write delta list memory");
+
+	return result;
+}
+
+/*
+ * Start saving a delta index zone to a buffered output stream. This writes the zone's
+ * delta_index_header followed by the 16-bit size of each of its delta lists, and then remembers
+ * the writer in the zone. The writer is only remembered if every write succeeded.
+ */
+int uds_start_saving_delta_index(const struct delta_index *delta_index,
+				 unsigned int zone_number,
+				 struct buffered_writer *buffered_writer)
+{
+	struct delta_zone *delta_zone = &delta_index->delta_zones[zone_number];
+	u8 header[sizeof(struct delta_index_header)];
+	size_t encoded = MAGIC_SIZE;
+	u32 list;
+	int result;
+
+	/* The header is the magic followed by little-endian fields. */
+	memcpy(header, DELTA_INDEX_MAGIC, MAGIC_SIZE);
+	encode_u32_le(header, &encoded, zone_number);
+	encode_u32_le(header, &encoded, delta_index->zone_count);
+	encode_u32_le(header, &encoded, delta_zone->first_list);
+	encode_u32_le(header, &encoded, delta_zone->list_count);
+	encode_u64_le(header, &encoded, delta_zone->record_count);
+	encode_u64_le(header, &encoded, delta_zone->collision_count);
+
+	result = VDO_ASSERT(encoded == sizeof(struct delta_index_header),
+			    "%zu bytes encoded of %zu expected", encoded,
+			    sizeof(struct delta_index_header));
+	if (result != VDO_SUCCESS)
+		return result;
+
+	result = uds_write_to_buffered_writer(buffered_writer, header, encoded);
+	if (result != UDS_SUCCESS) {
+		return vdo_log_warning_strerror(result,
+						"failed to write delta index header");
+	}
+
+	/* The real lists occupy entries 1 through list_count of the list array. */
+	for (list = 0; list < delta_zone->list_count; list++) {
+		u8 size_data[sizeof(u16)];
+
+		put_unaligned_le16(delta_zone->delta_lists[list + 1].size, size_data);
+		result = uds_write_to_buffered_writer(buffered_writer, size_data,
+						      sizeof(size_data));
+		if (result != UDS_SUCCESS) {
+			return vdo_log_warning_strerror(result,
+							"failed to write delta list size");
+		}
+	}
+
+	delta_zone->buffered_writer = buffered_writer;
+	return UDS_SUCCESS;
+}
+
+/*
+ * Write every non-empty delta list of a zone to the zone's buffered writer, then detach the
+ * writer from the zone. A failed write does not stop the remaining lists from being attempted;
+ * the first error seen is the one returned.
+ */
+int uds_finish_saving_delta_index(const struct delta_index *delta_index,
+				  unsigned int zone_number)
+{
+	struct delta_zone *delta_zone = &delta_index->delta_zones[zone_number];
+	int first_error = UDS_SUCCESS;
+	u32 list;
+
+	for (list = 0; list < delta_zone->list_count; list++) {
+		if (delta_zone->delta_lists[list + 1].size > 0) {
+			int result = flush_delta_list(delta_zone, list);
+
+			if ((first_error == UDS_SUCCESS) && (result != UDS_SUCCESS))
+				first_error = result;
+		}
+	}
+
+	delta_zone->buffered_writer = NULL;
+	return first_error;
+}
+
+/*
+ * Write a guard record to a buffered writer. The guard is a delta_list_save_info sized record
+ * that is all zeros except for a tag byte of 'z'.
+ *
+ * Returns UDS_SUCCESS, or the error from the write. The error is also logged as a warning. It is
+ * returned rather than swallowed so that a caller checking the result learns that the guard was
+ * not written.
+ */
+int uds_write_guard_delta_list(struct buffered_writer *buffered_writer)
+{
+	int result;
+	u8 buffer[sizeof(struct delta_list_save_info)];
+
+	memset(buffer, 0, sizeof(struct delta_list_save_info));
+	buffer[0] = 'z';
+
+	result = uds_write_to_buffered_writer(buffered_writer, buffer, sizeof(buffer));
+	if (result != UDS_SUCCESS)
+		vdo_log_warning_strerror(result, "failed to write guard delta list");
+
+	return result;
+}
+
+/*
+ * Compute the number of bytes to allow for saving a delta index: one header, a save info record
+ * plus one more byte for every list, and the memory of a single-zone index.
+ */
+size_t uds_compute_delta_index_save_bytes(u32 list_count, size_t memory_size)
+{
+	const size_t header_bytes = sizeof(struct delta_index_header);
+	const size_t bytes_per_list = sizeof(struct delta_list_save_info) + 1;
+
+	/* One zone will use at least as much memory as other zone counts. */
+	return (header_bytes + list_count * bytes_per_list +
+		get_zone_memory_size(1, memory_size));
+}
+
+/*
+ * Assert that a delta index entry is not positioned at the end of its delta list. A failed
+ * assertion is reported to the caller as UDS_BAD_STATE.
+ */
+static int assert_not_at_end(const struct delta_index_entry *delta_entry)
+{
+	int result;
+
+	result = VDO_ASSERT(!delta_entry->at_end,
+			    "operation is invalid because the list entry is at the end of the delta list");
+	return (result == VDO_SUCCESS) ? result : UDS_BAD_STATE;
+}
+
+/*
+ * Prepare to search for an entry in the specified delta list.
+ *
+ * This is always the first function to be called when dealing with delta index entries, and it is
+ * always followed by calls to uds_next_delta_index_entry() to walk the delta list. On return the
+ * delta_index_entry is positioned for iteration but does not yet describe any entry of the list.
+ *
+ * Returns UDS_CORRUPT_DATA if list_number is out of range for the index or for its zone.
+ */
+int uds_start_delta_index_search(const struct delta_index *delta_index, u32 list_number,
+				 u32 key, struct delta_index_entry *delta_entry)
+{
+	struct delta_zone *delta_zone;
+	struct delta_list *delta_list;
+	unsigned int zone_number;
+	int result;
+
+	result = VDO_ASSERT((list_number < delta_index->list_count),
+			    "Delta list number (%u) is out of range (%u)", list_number,
+			    delta_index->list_count);
+	if (result != VDO_SUCCESS)
+		return UDS_CORRUPT_DATA;
+
+	/* From here on list_number is relative to the first list of the owning zone. */
+	zone_number = list_number / delta_index->lists_per_zone;
+	delta_zone = &delta_index->delta_zones[zone_number];
+	list_number -= delta_zone->first_list;
+	result = VDO_ASSERT((list_number < delta_zone->list_count),
+			    "Delta list number (%u) is out of range (%u) for zone (%u)",
+			    list_number, delta_zone->list_count, zone_number);
+	if (result != VDO_SUCCESS)
+		return UDS_CORRUPT_DATA;
+
+	if (!delta_index->mutable) {
+		/*
+		 * An immutable page stores only start offsets, so build a temporary full delta
+		 * list header inside the entry from this list's start and the next list's start.
+		 */
+		u32 list_start = get_immutable_start(delta_zone->memory, list_number);
+		u32 list_end = get_immutable_start(delta_zone->memory, list_number + 1);
+
+		delta_list = &delta_entry->temp_delta_list;
+		delta_list->start = list_start;
+		delta_list->size = list_end - list_start;
+		delta_list->save_key = 0;
+		delta_list->save_offset = 0;
+	} else {
+		delta_list = &delta_zone->delta_lists[list_number + 1];
+	}
+
+	if (key <= delta_list->save_key) {
+		/* The remembered position is of no use, so start from the head of the list. */
+		delta_entry->key = 0;
+		delta_entry->offset = 0;
+		if (key == 0) {
+			/*
+			 * This usually means the whole delta list is about to be walked, so
+			 * pull all of it into the CPU cache.
+			 */
+			uds_prefetch_range(&delta_zone->memory[delta_list->start / BITS_PER_BYTE],
+					   delta_list->size / BITS_PER_BYTE, false);
+		}
+	} else {
+		/* Resume from the position remembered in the list header. */
+		delta_entry->key = delta_list->save_key;
+		delta_entry->offset = delta_list->save_offset;
+	}
+
+	delta_entry->delta_zone = delta_zone;
+	delta_entry->delta_list = delta_list;
+	delta_entry->list_number = list_number;
+	delta_entry->value_bits = delta_zone->value_bits;
+	delta_entry->entry_bits = 0;
+	delta_entry->at_end = false;
+	delta_entry->is_collision = false;
+	delta_entry->list_overflow = false;
+	return UDS_SUCCESS;
+}
+
+/* Get the bit offset of an entry within its zone's memory: the list start plus the entry offset. */
+static inline u64 get_delta_entry_offset(const struct delta_index_entry *delta_entry)
+{
+	const struct delta_list *list = delta_entry->delta_list;
+
+	return list->start + delta_entry->offset;
+}
+
+/*
+ * Decode a delta index entry delta value. The delta_index_entry basically describes the previous
+ * list entry, and has had its offset field changed to point to the subsequent entry. We decode the
+ * bit stream and update the delta_list_entry to describe the entry.
+ *
+ * On return the entry's delta, key, is_collision, and entry_bits fields have been updated.
+ */
+static inline void decode_delta(struct delta_index_entry *delta_entry)
+{
+ int key_bits;
+ u32 delta;
+ const struct delta_zone *delta_zone = delta_entry->delta_zone;
+ const u8 *memory = delta_zone->memory;
+ /* The coded delta is stored after the value_bits bits of the entry's value. */
+ u64 delta_offset = get_delta_entry_offset(delta_entry) + delta_entry->value_bits;
+ const u8 *addr = memory + delta_offset / BITS_PER_BYTE;
+ int offset = delta_offset % BITS_PER_BYTE;
+ u32 data = get_unaligned_le32(addr) >> offset;
+
+ addr += sizeof(u32);
+ /* The low min_bits bits are the whole code unless their value reaches min_keys. */
+ key_bits = delta_zone->min_bits;
+ delta = data & ((1 << key_bits) - 1);
+ if (delta >= delta_zone->min_keys) {
+ /*
+ * A longer code: the bits that follow are zeros ended by a one bit. Each bit beyond
+ * the first adds incr_keys to the delta.
+ */
+ data >>= key_bits;
+ if (data == 0) {
+ /* No one bit in the first word, so keep scanning a 32-bit word at a time. */
+ key_bits = sizeof(u32) * BITS_PER_BYTE - offset;
+ while ((data = get_unaligned_le32(addr)) == 0) {
+ addr += sizeof(u32);
+ key_bits += sizeof(u32) * BITS_PER_BYTE;
+ }
+ }
+ /* ffs() gives the 1-based position of the terminating one bit. */
+ key_bits += ffs(data);
+ delta += ((key_bits - delta_zone->min_bits - 1) * delta_zone->incr_keys);
+ }
+ delta_entry->delta = delta;
+ delta_entry->key += delta;
+
+ /* Check for a collision, a delta of zero after the start. */
+ if (unlikely((delta == 0) && (delta_entry->offset > 0))) {
+ delta_entry->is_collision = true;
+ /* A collision entry is COLLISION_BITS longer than an ordinary entry. */
+ delta_entry->entry_bits = delta_entry->value_bits + key_bits + COLLISION_BITS;
+ } else {
+ delta_entry->is_collision = false;
+ delta_entry->entry_bits = delta_entry->value_bits + key_bits;
+ }
+}
+
+/*
+ * Advance an iterator to the next entry of its delta list and decode that entry.
+ *
+ * Returns UDS_SUCCESS when the iterator describes the next entry or has just reached the end of
+ * the list (at_end is then set), the error from assert_not_at_end() if the iterator was already
+ * at the end, or UDS_CORRUPT_DATA if an offset or decoded entry runs past the end of the list.
+ */
+noinline int uds_next_delta_index_entry(struct delta_index_entry *delta_entry)
+{
+	u32 entry_end;
+	u16 size;
+	int status = assert_not_at_end(delta_entry);
+
+	if (status != UDS_SUCCESS)
+		return status;
+
+	/* Step over the entry that the iterator currently describes. */
+	size = delta_entry->delta_list->size;
+	delta_entry->offset += delta_entry->entry_bits;
+	if (unlikely(delta_entry->offset >= size)) {
+		delta_entry->at_end = true;
+		delta_entry->delta = 0;
+		delta_entry->is_collision = false;
+		status = VDO_ASSERT((delta_entry->offset == size),
+				    "next offset past end of delta list");
+		return (status != VDO_SUCCESS) ? UDS_CORRUPT_DATA : status;
+	}
+
+	decode_delta(delta_entry);
+
+	entry_end = delta_entry->offset + delta_entry->entry_bits;
+	if (entry_end <= size)
+		return UDS_SUCCESS;
+
+	/*
+	 * This is logged instead of asserted because uds_validate_chapter_index_page() wants to
+	 * handle this error.
+	 */
+	vdo_log_warning("Decoded past the end of the delta list");
+	return UDS_CORRUPT_DATA;
+}
+
+/*
+ * Record the position of a non-collision entry in its delta list header: the key of the entry
+ * before it and the entry's bit offset. Returns the assertion failure code if the entry is a
+ * collision, otherwise UDS_SUCCESS.
+ */
+int uds_remember_delta_index_offset(const struct delta_index_entry *delta_entry)
+{
+	struct delta_list *list = delta_entry->delta_list;
+	int status = VDO_ASSERT(!delta_entry->is_collision, "entry is not a collision");
+
+	if (status == VDO_SUCCESS) {
+		/* The saved key is the key of the entry preceding this one. */
+		list->save_key = delta_entry->key - delta_entry->delta;
+		list->save_offset = delta_entry->offset;
+		return UDS_SUCCESS;
+	}
+
+	return status;
+}
+
+/*
+ * Set the delta of an entry and recompute its size in bits (value field plus delta code). The
+ * delta code takes min_bits bits plus (incr_keys - min_keys + delta) / incr_keys further bits.
+ * Collision name bits are not included; callers add COLLISION_BITS themselves.
+ */
+static void set_delta(struct delta_index_entry *delta_entry, u32 delta)
+{
+	const struct delta_zone *zone = delta_entry->delta_zone;
+	u32 extra_bits = (zone->incr_keys - zone->min_keys + delta) / zone->incr_keys;
+	u32 code_bits = zone->min_bits + extra_bits;
+
+	delta_entry->entry_bits = delta_entry->value_bits + code_bits;
+	delta_entry->delta = delta;
+}
+
+/*
+ * Copy the COLLISION_BYTES-long name stored in the last COLLISION_BITS of a collision entry into
+ * name. The name is generally not byte-aligned, so each output byte is taken from a 16-bit load.
+ */
+static void get_collision_name(const struct delta_index_entry *entry, u8 *name)
+{
+	u64 name_start = get_delta_entry_offset(entry) + entry->entry_bits - COLLISION_BITS;
+	const u8 *cursor = entry->delta_zone->memory + name_start / BITS_PER_BYTE;
+	int shift = name_start % BITS_PER_BYTE;
+	int count = COLLISION_BYTES;
+	int i;
+
+	for (i = 0; i < count; i++)
+		name[i] = get_unaligned_le16(&cursor[i]) >> shift;
+}
+
+/*
+ * Store the COLLISION_BYTES-long name into the last COLLISION_BITS of a collision entry. Each
+ * name byte is merged into a 16-bit window so that the neighboring bits are left untouched.
+ */
+static void set_collision_name(const struct delta_index_entry *entry, const u8 *name)
+{
+	u64 name_start = get_delta_entry_offset(entry) + entry->entry_bits - COLLISION_BITS;
+	u8 *cursor = entry->delta_zone->memory + name_start / BITS_PER_BYTE;
+	int shift = name_start % BITS_PER_BYTE;
+	/* Clears the eight bits of the window that the name byte will occupy. */
+	u16 keep_mask = ~((u16) 0xFF << shift);
+	int count = COLLISION_BYTES;
+	int i;
+
+	for (i = 0; i < count; i++) {
+		u16 merged = get_unaligned_le16(&cursor[i]) & keep_mask;
+
+		merged |= name[i] << shift;
+		put_unaligned_le16(merged, &cursor[i]);
+	}
+}
+
+/*
+ * Search a delta list for a key. On success delta_entry describes either the matching entry, or
+ * the position where the key would be inserted (possibly the end of the list). When the key
+ * matches, the collision entries that follow are also examined, and delta_entry is moved to the
+ * one whose stored name equals name, if there is one.
+ *
+ * Returns UDS_SUCCESS, or the first error reported while starting the search, stepping through
+ * the list, or remembering the found offset.
+ */
+int uds_get_delta_index_entry(const struct delta_index *delta_index, u32 list_number,
+			      u32 key, const u8 *name,
+			      struct delta_index_entry *delta_entry)
+{
+	struct delta_index_entry candidate;
+	u8 stored_name[COLLISION_BYTES];
+	int status;
+
+	status = uds_start_delta_index_search(delta_index, list_number, key, delta_entry);
+	if (status != UDS_SUCCESS)
+		return status;
+
+	/* Walk forward until the end of the list or an entry whose key is not below the target. */
+	for (;;) {
+		status = uds_next_delta_index_entry(delta_entry);
+		if (status != UDS_SUCCESS)
+			return status;
+
+		if (delta_entry->at_end || (key <= delta_entry->key))
+			break;
+	}
+
+	status = uds_remember_delta_index_offset(delta_entry);
+	if (status != UDS_SUCCESS)
+		return status;
+
+	/* Without an exact key match there are no collision entries to consider. */
+	if (delta_entry->at_end || (key != delta_entry->key))
+		return UDS_SUCCESS;
+
+	candidate = *delta_entry;
+	for (;;) {
+		status = uds_next_delta_index_entry(&candidate);
+		if (status != UDS_SUCCESS)
+			return status;
+
+		if (candidate.at_end || !candidate.is_collision)
+			return UDS_SUCCESS;
+
+		get_collision_name(&candidate, stored_name);
+		if (memcmp(stored_name, name, COLLISION_BYTES) == 0) {
+			*delta_entry = candidate;
+			return UDS_SUCCESS;
+		}
+	}
+}
+
+/*
+ * Copy the full name stored in a collision entry into name. Returns the error from
+ * assert_not_at_end() if the entry is at the end of its list, UDS_BAD_STATE if the entry is not a
+ * collision entry, and UDS_SUCCESS otherwise.
+ */
+int uds_get_delta_entry_collision(const struct delta_index_entry *delta_entry, u8 *name)
+{
+	int status = assert_not_at_end(delta_entry);
+
+	if (status != UDS_SUCCESS)
+		return status;
+
+	status = VDO_ASSERT(delta_entry->is_collision,
+			    "Cannot get full block name from a non-collision delta index entry");
+	if (status == VDO_SUCCESS) {
+		get_collision_name(delta_entry, name);
+		return UDS_SUCCESS;
+	}
+
+	return UDS_BAD_STATE;
+}
+
+/* Read the value_bits-wide value field stored at the start of an entry. */
+u32 uds_get_delta_entry_value(const struct delta_index_entry *delta_entry)
+{
+	u8 *memory = delta_entry->delta_zone->memory;
+	u64 value_start = get_delta_entry_offset(delta_entry);
+
+	return get_field(memory, value_start, delta_entry->value_bits);
+}
+
+/*
+ * Verify that an entry belongs to a mutable delta index, i.e. that its delta list is not the
+ * temporary list embedded in the entry itself. Returns UDS_BAD_STATE when the check fails.
+ */
+static int assert_mutable_entry(const struct delta_index_entry *delta_entry)
+{
+	int result = VDO_ASSERT((delta_entry->delta_list != &delta_entry->temp_delta_list),
+				"delta index is mutable");
+
+	return (result == VDO_SUCCESS) ? result : UDS_BAD_STATE;
+}
+
+/*
+ * Overwrite the value field of an existing entry in a mutable delta index.
+ *
+ * Returns UDS_SUCCESS, the error from assert_mutable_entry() or assert_not_at_end(), or
+ * UDS_INVALID_ARGUMENT when value does not fit in the entry's value_bits.
+ */
+int uds_set_delta_entry_value(const struct delta_index_entry *delta_entry, u32 value)
+{
+	u32 value_mask = (1 << delta_entry->value_bits) - 1;
+	int status = assert_mutable_entry(delta_entry);
+
+	if (status == UDS_SUCCESS)
+		status = assert_not_at_end(delta_entry);
+
+	if (status != UDS_SUCCESS)
+		return status;
+
+	status = VDO_ASSERT((value & value_mask) == value,
+			    "Value (%u) being set in a delta index is too large (must fit in %u bits)",
+			    value, delta_entry->value_bits);
+	if (status != VDO_SUCCESS)
+		return UDS_INVALID_ARGUMENT;
+
+	set_field(value, delta_entry->delta_zone->memory,
+		  get_delta_entry_offset(delta_entry), delta_entry->value_bits);
+	return UDS_SUCCESS;
+}
+
+/*
+ * Make room in the delta list memory by adding growing_size bytes before the list indicated by
+ * growing_index, then rebalancing the lists. Returns UDS_OVERFLOW if the zone memory cannot hold
+ * the existing lists plus the requested growth. On success the rebalance statistics are updated.
+ */
+static int extend_delta_zone(struct delta_zone *delta_zone, u32 growing_index,
+			     size_t growing_size)
+{
+	ktime_t began = current_time_ns(CLOCK_MONOTONIC);
+	ktime_t finished;
+	struct delta_list *lists = delta_zone->delta_lists;
+	u32 last_list = delta_zone->list_count + 1;
+	size_t needed = growing_size;
+	u32 n;
+
+	/* Total the space that is or will be in use, including both guard lists. */
+	for (n = 0; n <= last_list; n++)
+		needed += get_delta_list_byte_size(&lists[n]);
+
+	if (delta_zone->size < needed)
+		return UDS_OVERFLOW;
+
+	/* Compute the new offsets of the delta lists. */
+	compute_new_list_offsets(delta_zone, growing_index, growing_size, needed);
+
+	/*
+	 * The rebalancing includes the end guard list, because it contains the end guard data,
+	 * which must be copied.
+	 */
+	rebalance_delta_zone(delta_zone, 1, last_list);
+	finished = current_time_ns(CLOCK_MONOTONIC);
+	delta_zone->rebalance_count++;
+	delta_zone->rebalance_time += ktime_sub(finished, began);
+	return UDS_SUCCESS;
+}
+
+/*
+ * Open a gap of size bits at the entry's offset within its delta list, by shifting either the
+ * bits before the offset downward or the bits after the offset upward into adjacent free space.
+ * If neither adjacent gap is large enough, the zone is rebalanced with extend_delta_zone() first.
+ *
+ * Returns UDS_SUCCESS, UDS_OVERFLOW if the list would grow beyond U16_MAX bits (list_overflow is
+ * set on the entry and the zone's overflow_count is incremented), or the error returned by
+ * extend_delta_zone().
+ */
+static int insert_bits(struct delta_index_entry *delta_entry, u16 size)
+{
+ u64 free_before;
+ u64 free_after;
+ u64 source;
+ u64 destination;
+ u32 count;
+ bool before_flag;
+ u8 *memory;
+ struct delta_zone *delta_zone = delta_entry->delta_zone;
+ struct delta_list *delta_list = delta_entry->delta_list;
+ /* Compute bits in use before and after the inserted bits. */
+ u32 total_size = delta_list->size;
+ u32 before_size = delta_entry->offset;
+ u32 after_size = total_size - delta_entry->offset;
+
+ /* The list size is stored in a u16, so the list cannot grow past U16_MAX bits. */
+ if (total_size + size > U16_MAX) {
+ delta_entry->list_overflow = true;
+ delta_zone->overflow_count++;
+ return UDS_OVERFLOW;
+ }
+
+ /*
+ * Compute bits available before and after the delta list. The list headers are treated as
+ * adjacent array elements, so [-1] and [1] are the neighboring lists.
+ */
+ free_before = (delta_list[0].start - (delta_list[-1].start + delta_list[-1].size));
+ free_after = (delta_list[1].start - (delta_list[0].start + delta_list[0].size));
+
+ if ((size <= free_before) && (size <= free_after)) {
+ /*
+ * We have enough space to use either before or after the list. Select the smaller
+ * amount of data. If it is exactly the same, try to take from the larger amount of
+ * free space.
+ */
+ if (before_size < after_size)
+ before_flag = true;
+ else if (after_size < before_size)
+ before_flag = false;
+ else
+ before_flag = free_before > free_after;
+ } else if (size <= free_before) {
+ /* There is space before but not after. */
+ before_flag = true;
+ } else if (size <= free_after) {
+ /* There is space after but not before. */
+ before_flag = false;
+ } else {
+ /*
+ * Neither of the surrounding spaces is large enough for this request. Extend
+ * and/or rebalance the delta list memory choosing to move the least amount of
+ * data.
+ */
+ int result;
+ /*
+ * NOTE(review): list_number + 1 looks like this list's index in delta_lists, with
+ * index 0 being a guard list - confirm. Growing before the following list instead
+ * puts the new space after this list.
+ */
+ u32 growing_index = delta_entry->list_number + 1;
+
+ before_flag = before_size < after_size;
+ if (!before_flag)
+ growing_index++;
+ result = extend_delta_zone(delta_zone, growing_index,
+ BITS_TO_BYTES(size));
+ if (result != UDS_SUCCESS)
+ return result;
+ }
+
+ delta_list->size += size;
+ if (before_flag) {
+ /* Move the leading bits down; the list now starts size bits earlier. */
+ source = delta_list->start;
+ destination = source - size;
+ delta_list->start -= size;
+ count = before_size;
+ } else {
+ /* Move the trailing bits up; the list start does not change. */
+ source = delta_list->start + delta_entry->offset;
+ destination = source + size;
+ count = after_size;
+ }
+
+ memory = delta_zone->memory;
+ move_bits(memory, source, memory, destination, count);
+ return UDS_SUCCESS;
+}
+
+/*
+ * Write the delta code of an entry into the zone memory, immediately after the value field. A
+ * delta below min_keys is written directly in min_bits bits. A larger delta is written as a
+ * min_bits code of at least min_keys, followed by a run of zero bits and a terminating one bit.
+ */
+static void encode_delta(const struct delta_index_entry *delta_entry)
+{
+	const struct delta_zone *zone = delta_entry->delta_zone;
+	u8 *memory = zone->memory;
+	u64 code_start = get_delta_entry_offset(delta_entry) + delta_entry->value_bits;
+	u32 excess;
+	u32 code;
+	u32 zero_run;
+
+	if (delta_entry->delta < zone->min_keys) {
+		set_field(delta_entry->delta, memory, code_start, zone->min_bits);
+		return;
+	}
+
+	excess = delta_entry->delta - zone->min_keys;
+	code = (excess % zone->incr_keys) + zone->min_keys;
+	zero_run = excess / zone->incr_keys;
+
+	set_field(code, memory, code_start, zone->min_bits);
+	code_start += zone->min_bits;
+	set_zero(memory, code_start, zero_run);
+	set_field(1, memory, code_start + zero_run, 1);
+}
+
+/*
+ * Write a complete entry into the zone memory: the value field, the delta code, and, when name
+ * is not NULL, the collision name.
+ */
+static void encode_entry(const struct delta_index_entry *delta_entry, u32 value,
+			 const u8 *name)
+{
+	set_field(value, delta_entry->delta_zone->memory,
+		  get_delta_entry_offset(delta_entry), delta_entry->value_bits);
+	encode_delta(delta_entry);
+	if (name == NULL)
+		return;
+
+	set_collision_name(delta_entry, name);
+}
+
+/*
+ * Create a new entry in the delta index. If the entry is a collision, the full 256 bit name must
+ * be provided.
+ *
+ * delta_entry is the result of a search and identifies the insertion point. There are three
+ * cases: a collision entry is inserted after the entry found (name is not NULL), a new entry is
+ * appended at the end of the list (at_end is set), or a new entry is inserted before the entry
+ * found, which requires re-encoding that following entry with a smaller delta. On success
+ * delta_entry describes the new entry and the zone's record counters are updated.
+ *
+ * Returns UDS_SUCCESS, UDS_DUPLICATE_NAME if delta_entry is itself a collision entry, an
+ * assertion failure code if the arguments are inconsistent with the insertion point, or the
+ * error from insert_bits() (such as UDS_OVERFLOW).
+ */
+int uds_put_delta_index_entry(struct delta_index_entry *delta_entry, u32 key, u32 value,
+ const u8 *name)
+{
+ int result;
+ struct delta_zone *delta_zone;
+
+ result = assert_mutable_entry(delta_entry);
+ if (result != UDS_SUCCESS)
+ return result;
+
+ if (delta_entry->is_collision) {
+ /*
+ * The caller wants us to insert a collision entry onto a collision entry. This
+ * happens when we find a collision and attempt to add the name again to the index.
+ * This is normally a fatal error unless we are replaying a closed chapter while we
+ * are rebuilding a volume index.
+ */
+ return UDS_DUPLICATE_NAME;
+ }
+
+ if (delta_entry->offset < delta_entry->delta_list->save_offset) {
+ /*
+ * The saved entry offset is after the new entry and will no longer be valid, so
+ * replace it with the insertion point.
+ */
+ result = uds_remember_delta_index_offset(delta_entry);
+ if (result != UDS_SUCCESS)
+ return result;
+ }
+
+ if (name != NULL) {
+ /* Insert a collision entry which is placed after this entry. */
+ result = assert_not_at_end(delta_entry);
+ if (result != UDS_SUCCESS)
+ return result;
+
+ result = VDO_ASSERT((key == delta_entry->key),
+ "incorrect key for collision entry");
+ if (result != VDO_SUCCESS)
+ return result;
+
+ /* Step past the existing entry; the collision has a delta of zero plus a name. */
+ delta_entry->offset += delta_entry->entry_bits;
+ set_delta(delta_entry, 0);
+ delta_entry->is_collision = true;
+ delta_entry->entry_bits += COLLISION_BITS;
+ result = insert_bits(delta_entry, delta_entry->entry_bits);
+ } else if (delta_entry->at_end) {
+ /* Insert a new entry at the end of the delta list. */
+ result = VDO_ASSERT((key >= delta_entry->key), "key past end of list");
+ if (result != VDO_SUCCESS)
+ return result;
+
+ /* The delta is taken relative to the key the entry held when the end was reached. */
+ set_delta(delta_entry, key - delta_entry->key);
+ delta_entry->key = key;
+ delta_entry->at_end = false;
+ result = insert_bits(delta_entry, delta_entry->entry_bits);
+ } else {
+ u16 old_entry_size;
+ u16 additional_size;
+ struct delta_index_entry next_entry;
+ u32 next_value;
+
+ /*
+ * Insert a new entry which requires the delta in the following entry to be
+ * updated.
+ */
+ result = VDO_ASSERT((key < delta_entry->key),
+ "key precedes following entry");
+ if (result != VDO_SUCCESS)
+ return result;
+
+ result = VDO_ASSERT((key >= delta_entry->key - delta_entry->delta),
+ "key effects following entry's delta");
+ if (result != VDO_SUCCESS)
+ return result;
+
+ /*
+ * Save the following entry's value before any bits move, then describe both the
+ * new entry (at the current offset) and the re-encoded following entry (right
+ * after the new one).
+ */
+ old_entry_size = delta_entry->entry_bits;
+ next_entry = *delta_entry;
+ next_value = uds_get_delta_entry_value(&next_entry);
+ set_delta(delta_entry, key - (delta_entry->key - delta_entry->delta));
+ delta_entry->key = key;
+ set_delta(&next_entry, next_entry.key - key);
+ next_entry.offset += delta_entry->entry_bits;
+ /* The two new entries are always bigger than the single entry being replaced. */
+ additional_size = (delta_entry->entry_bits +
+ next_entry.entry_bits - old_entry_size);
+ result = insert_bits(delta_entry, additional_size);
+ if (result != UDS_SUCCESS)
+ return result;
+
+ encode_entry(&next_entry, next_value, NULL);
+ }
+
+ /* Covers the insert_bits() result of the first two cases. */
+ if (result != UDS_SUCCESS)
+ return result;
+
+ encode_entry(delta_entry, value, name);
+ delta_zone = delta_entry->delta_zone;
+ delta_zone->record_count++;
+ delta_zone->collision_count += delta_entry->is_collision ? 1 : 0;
+ return UDS_SUCCESS;
+}
+
+/*
+ * Remove size bits at the entry's offset from its delta list, closing the gap by moving either
+ * the bits before the offset upward (the list start then advances by size) or the bits after the
+ * deleted range downward. The list size is reduced by size.
+ */
+static void delete_bits(const struct delta_index_entry *delta_entry, int size)
+{
+ u64 source;
+ u64 destination;
+ u32 count;
+ bool before_flag;
+ struct delta_list *delta_list = delta_entry->delta_list;
+ u8 *memory = delta_entry->delta_zone->memory;
+ /* Compute bits retained before and after the deleted bits. */
+ u32 total_size = delta_list->size;
+ u32 before_size = delta_entry->offset;
+ u32 after_size = total_size - delta_entry->offset - size;
+
+ /*
+ * Determine whether to add to the available space either before or after the delta list.
+ * We prefer to move the least amount of data. If it is exactly the same, try to add to the
+ * smaller amount of free space.
+ */
+ if (before_size < after_size) {
+ before_flag = true;
+ } else if (after_size < before_size) {
+ before_flag = false;
+ } else {
+ /* The list headers are adjacent array elements, so [-1] and [1] are the neighbors. */
+ u64 free_before =
+ (delta_list[0].start - (delta_list[-1].start + delta_list[-1].size));
+ u64 free_after =
+ (delta_list[1].start - (delta_list[0].start + delta_list[0].size));
+
+ before_flag = (free_before < free_after);
+ }
+
+ delta_list->size -= size;
+ if (before_flag) {
+ /* Slide the leading bits up over the deleted range. */
+ source = delta_list->start;
+ destination = source + size;
+ delta_list->start += size;
+ count = before_size;
+ } else {
+ /* Slide the trailing bits down over the deleted range. */
+ destination = delta_list->start + delta_entry->offset;
+ source = destination + size;
+ count = after_size;
+ }
+
+ move_bits(memory, source, memory, destination, count);
+}
+
+/*
+ * Remove the entry described by delta_entry from a mutable delta list. On success delta_entry is
+ * replaced with a description of the entry that followed the removed one (or of the end of the
+ * list), and the zone's record and discard counters are updated.
+ *
+ * Returns UDS_SUCCESS, or the error from assert_mutable_entry() or from stepping to the following
+ * entry with uds_next_delta_index_entry().
+ */
+int uds_remove_delta_index_entry(struct delta_index_entry *delta_entry)
+{
+ int result;
+ struct delta_index_entry next_entry;
+ struct delta_zone *delta_zone;
+ struct delta_list *delta_list;
+
+ result = assert_mutable_entry(delta_entry);
+ if (result != UDS_SUCCESS)
+ return result;
+
+ /* Decode the following entry before any bits are moved. */
+ next_entry = *delta_entry;
+ result = uds_next_delta_index_entry(&next_entry);
+ if (result != UDS_SUCCESS)
+ return result;
+
+ delta_zone = delta_entry->delta_zone;
+
+ if (delta_entry->is_collision) {
+ /* This is a collision entry, so just remove it. */
+ delete_bits(delta_entry, delta_entry->entry_bits);
+ next_entry.offset = delta_entry->offset;
+ delta_zone->collision_count -= 1;
+ } else if (next_entry.at_end) {
+ /* This entry is at the end of the list, so just remove it. */
+ delete_bits(delta_entry, delta_entry->entry_bits);
+ /* The end marker reverts to the key of the entry before the removed one. */
+ next_entry.key -= delta_entry->delta;
+ next_entry.offset = delta_entry->offset;
+ } else {
+ /* The delta in the next entry needs to be updated. */
+ u32 next_value = uds_get_delta_entry_value(&next_entry);
+ u16 old_size = delta_entry->entry_bits + next_entry.entry_bits;
+
+ if (next_entry.is_collision) {
+ /*
+ * The following collision entry becomes an ordinary entry; set_delta() below
+ * sizes it without the collision name bits.
+ */
+ next_entry.is_collision = false;
+ delta_zone->collision_count -= 1;
+ }
+
+ set_delta(&next_entry, delta_entry->delta + next_entry.delta);
+ next_entry.offset = delta_entry->offset;
+ /* The one new entry is always smaller than the two entries being replaced. */
+ delete_bits(delta_entry, old_size - next_entry.entry_bits);
+ encode_entry(&next_entry, next_value, NULL);
+ }
+
+ delta_zone->record_count--;
+ delta_zone->discard_count++;
+ *delta_entry = next_entry;
+
+ delta_list = delta_entry->delta_list;
+ if (delta_entry->offset < delta_list->save_offset) {
+ /* The saved entry offset is no longer valid. */
+ delta_list->save_key = 0;
+ delta_list->save_offset = 0;
+ }
+
+ return UDS_SUCCESS;
+}
+
+/*
+ * Fill stats with the sum of the per-zone counters of a delta index. All fields are zeroed
+ * first; memory_allocated is not accumulated here and is left at zero.
+ */
+void uds_get_delta_index_stats(const struct delta_index *delta_index,
+			       struct delta_index_stats *stats)
+{
+	unsigned int zone_number = 0;
+
+	memset(stats, 0, sizeof(*stats));
+	while (zone_number < delta_index->zone_count) {
+		const struct delta_zone *zone = &delta_index->delta_zones[zone_number++];
+
+		stats->rebalance_time += zone->rebalance_time;
+		stats->rebalance_count += zone->rebalance_count;
+		stats->record_count += zone->record_count;
+		stats->collision_count += zone->collision_count;
+		stats->discard_count += zone->discard_count;
+		stats->overflow_count += zone->overflow_count;
+		stats->list_count += zone->list_count;
+	}
+}
+
+/*
+ * Estimate the number of bits needed to store entry_count entries with the given mean delta and
+ * payload size.
+ *
+ * The arithmetic is done in size_t. The operands are all 32 bits wide or narrower, so without the
+ * widening the product would be computed modulo 2^32 and could wrap before being converted to
+ * the size_t return type.
+ */
+size_t uds_compute_delta_index_size(u32 entry_count, u32 mean_delta, u32 payload_bits)
+{
+	u16 min_bits;
+	u32 incr_keys;
+	u32 min_keys;
+	size_t bits_per_entry;
+
+	compute_coding_constants(mean_delta, &min_bits, &min_keys, &incr_keys);
+	/* On average, each delta is encoded into about min_bits + 1.5 bits. */
+	bits_per_entry = (size_t) payload_bits + min_bits + 1;
+	return (size_t) entry_count * bits_per_entry + entry_count / 2;
+}
+
+/*
+ * Compute the number of immutable index pages of bytes_per_page bytes needed to hold a delta
+ * index with the given number of entries and delta lists.
+ */
+u32 uds_get_delta_index_page_count(u32 entry_count, u32 list_count, u32 mean_delta,
+				   u32 payload_bits, size_t bytes_per_page)
+{
+	/* The expected number of bits needed for all the entries. */
+	size_t index_bits = uds_compute_delta_index_size(entry_count, mean_delta,
+							 payload_bits);
+	/* The expected size of one delta list, before any headers are counted. */
+	unsigned int list_bits = index_bits / list_count;
+	/* The number of usable bits on an immutable index page. */
+	unsigned int page_bits =
+		((bytes_per_page - sizeof(struct delta_page_header)) * BITS_PER_BYTE);
+
+	/* Add in the immutable delta list headers. */
+	index_bits += list_count * IMMUTABLE_HEADER_SIZE;
+
+	/*
+	 * Allow for internal fragmentation by giving up one immutable delta list header and one
+	 * delta list on every page.
+	 */
+	page_bits -= IMMUTABLE_HEADER_SIZE + list_bits;
+
+	return DIV_ROUND_UP(index_bits, page_bits);
+}
+
+/*
+ * Log a rate-limited, one-line description of a delta index entry, then clear the entry's
+ * list_overflow flag.
+ */
+void uds_log_delta_index_entry(struct delta_index_entry *delta_entry)
+{
+	const char *end_note = delta_entry->at_end ? " end" : "";
+	const char *collision_note = delta_entry->is_collision ? " collision" : "";
+	const char *overflow_note = delta_entry->list_overflow ? " overflow" : "";
+
+	vdo_log_ratelimit(vdo_log_info,
+			  "List 0x%X Key 0x%X Offset 0x%X%s%s List_size 0x%X%s",
+			  delta_entry->list_number, delta_entry->key,
+			  delta_entry->offset, end_note, collision_note,
+			  delta_entry->delta_list->size, overflow_note);
+	delta_entry->list_overflow = false;
+}
diff --git a/drivers/md/dm-vdo/indexer/delta-index.h b/drivers/md/dm-vdo/indexer/delta-index.h
new file mode 100644
index 000000000000..53f6c6ac0bc7
--- /dev/null
+++ b/drivers/md/dm-vdo/indexer/delta-index.h
@@ -0,0 +1,279 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#ifndef UDS_DELTA_INDEX_H
+#define UDS_DELTA_INDEX_H
+
+#include <linux/cache.h>
+
+#include "numeric.h"
+#include "time-utils.h"
+
+#include "config.h"
+#include "io-factory.h"
+
+/*
+ * A delta index is a key-value store, where each entry maps an address (the key) to a payload (the
+ * value). The entries are sorted by address, and only the delta between successive addresses is
+ * stored in the entry. The addresses are assumed to be uniformly distributed, and the deltas are
+ * therefore exponentially distributed.
+ *
+ * A delta_index can either be mutable or immutable depending on its expected use. The immutable
+ * form of a delta index is used for the indexes of closed chapters committed to the volume. The
+ * mutable form of a delta index is used by the volume index, and also by the chapter index in an
+ * open chapter. Like the index as a whole, each mutable delta index is divided into a number of
+ * independent zones.
+ */
+
+/*
+ * The header of a single delta list: where its bits are located in the delta zone memory, how
+ * many bits it holds, and the position remembered from the last search. Because size is a u16, a
+ * list cannot hold more than U16_MAX bits. The insertion and deletion code indexes neighboring
+ * headers relative to a list's own header, so headers must be kept in a contiguous array.
+ */
+struct delta_list {
+ /* The offset of the delta list start, in bits */
+ u64 start;
+ /* The number of bits in the delta list */
+ u16 size;
+ /* Where the last search "found" the key, in bits */
+ u16 save_offset;
+ /* The key for the record just before save_offset */
+ u32 save_key;
+};
+
+/*
+ * The state of one zone of a delta index: the bit memory holding the delta lists, the list
+ * headers, the constants used to encode deltas (min_bits, min_keys, incr_keys), and statistics.
+ * The structure is aligned to a cache line.
+ *
+ * NOTE(review): the rebalancing code walks delta_lists from index 0 through list_count + 1, so
+ * the array appears to hold a guard list on each side of the list_count real lists - confirm
+ * against the allocation code.
+ */
+struct delta_zone {
+ /* The delta list memory */
+ u8 *memory;
+ /* The delta list headers */
+ struct delta_list *delta_lists;
+ /* Temporary starts of delta lists */
+ u64 *new_offsets;
+ /* Buffered writer for saving an index */
+ struct buffered_writer *buffered_writer;
+ /* The size of delta list memory */
+ size_t size;
+ /* Nanoseconds spent rebalancing */
+ ktime_t rebalance_time;
+ /* Number of memory rebalances */
+ u32 rebalance_count;
+ /* The number of bits in a stored value */
+ u8 value_bits;
+ /* The number of bits in the minimal key code */
+ u16 min_bits;
+ /* The number of keys used in a minimal code */
+ u32 min_keys;
+ /* The number of keys used for another code bit */
+ u32 incr_keys;
+ /* The number of records in the index */
+ u64 record_count;
+ /* The number of collision records */
+ u64 collision_count;
+ /* The number of records removed */
+ u64 discard_count;
+ /* The number of UDS_OVERFLOW errors detected */
+ u64 overflow_count;
+ /* The index of the first delta list */
+ u32 first_list;
+ /* The number of delta lists */
+ u32 list_count;
+ /* Tag belonging to this delta index */
+ u8 tag;
+} __aligned(L1_CACHE_BYTES);
+
+/*
+ * A description of one saved delta list. The structure is packed, so it has no padding between
+ * fields.
+ * NOTE(review): presumably this is the per-list record of the saved index format - confirm
+ * against the save and restore code before changing any field.
+ */
+struct delta_list_save_info {
+ /* Tag identifying which delta index this list is in */
+ u8 tag;
+ /* Bit offset of the start of the list data */
+ u8 bit_offset;
+ /* Number of bytes of list data */
+ u16 byte_count;
+ /* The delta list number within the delta index */
+ u32 index;
+} __packed;
+
+/*
+ * The top-level description of a delta index: an array of zone_count zones together with the
+ * parameters shared by all of them.
+ */
+struct delta_index {
+ /* The zones */
+ struct delta_zone *delta_zones;
+ /* The number of zones */
+ unsigned int zone_count;
+ /* The number of delta lists */
+ u32 list_count;
+ /* Maximum lists per zone */
+ u32 lists_per_zone;
+ /* Total memory allocated to this index */
+ size_t memory_size;
+ /* The number of non-empty lists at load time per zone */
+ u32 load_lists[MAX_ZONES];
+ /* True if this index is mutable */
+ bool mutable;
+ /* Tag belonging to this delta index */
+ u8 tag;
+};
+
+/*
+ * A delta_index_page describes a single page of a chapter index. The delta_index field allows the
+ * page to be treated as an immutable delta_index. The embedded delta_zone field lets the chapter
+ * index page be treated as a single-zone index without an additional memory allocation.
+ */
+struct delta_index_page {
+ struct delta_index delta_index;
+ /* These values are loaded from the delta_page_header */
+ u32 lowest_list_number;
+ u32 highest_list_number;
+ u64 virtual_chapter_number;
+ /* This structure describes the single zone of a delta index page. */
+ struct delta_zone delta_zone;
+};
+
+/*
+ * Notes on the delta_index_entries:
+ *
+ * The fields documented as "public" can be read by any code that uses a delta_index. The fields
+ * documented as "private" carry information between delta_index method calls and should not be
+ * used outside the delta_index module.
+ *
+ * (1) The delta_index_entry is used like an iterator when searching a delta list.
+ *
+ * (2) It is also the result of a successful search and can be used to refer to the element found
+ * by the search.
+ *
+ * (3) It is also the result of an unsuccessful search and can be used to refer to the insertion
+ * point for a new record.
+ *
+ * (4) If at_end is true, the delta_index_entry can only be used as the insertion point for a new
+ * record at the end of the list.
+ *
+ * (5) If at_end is false and is_collision is true, the delta_index_entry fields refer to a
+ * collision entry in the list, and the delta_index_entry can be used as a reference to this
+ * entry.
+ *
+ * (6) If at_end is false and is_collision is false, the delta_index_entry fields refer to a
+ * non-collision entry in the list. Such a delta_index_entry can be used as a reference to a
+ * found entry, or an insertion point for a non-collision entry before this entry, or an
+ * insertion point for a collision entry that collides with this entry.
+ */
+struct delta_index_entry {
+ /* Public fields */
+ /* The key for this entry */
+ u32 key;
+ /* We are after the last list entry */
+ bool at_end;
+ /* This record is a collision */
+ bool is_collision;
+
+ /* Private fields */
+ /* This delta list overflowed */
+ bool list_overflow;
+ /* The number of bits used for the value */
+ u8 value_bits;
+ /* The number of bits used for the entire entry */
+ u16 entry_bits;
+ /* The delta index zone */
+ struct delta_zone *delta_zone;
+ /* The delta list containing the entry */
+ struct delta_list *delta_list;
+ /* The delta list number */
+ u32 list_number;
+ /* Bit offset of this entry within the list */
+ u16 offset;
+ /* The delta between this and previous entry */
+ u32 delta;
+ /*
+ * Temporary delta list for immutable indices. An entry whose delta_list points at this
+ * field is treated as belonging to an immutable index and is rejected by modifying calls.
+ */
+ struct delta_list temp_delta_list;
+};
+
+/*
+ * Statistics for a whole delta index, as reported by uds_get_delta_index_stats(). That function
+ * sums every field across the zones except memory_allocated, which it leaves at zero.
+ */
+struct delta_index_stats {
+ /* Number of bytes allocated */
+ size_t memory_allocated;
+ /* Nanoseconds spent rebalancing */
+ ktime_t rebalance_time;
+ /* Number of memory rebalances */
+ u32 rebalance_count;
+ /* The number of records in the index */
+ u64 record_count;
+ /* The number of collision records */
+ u64 collision_count;
+ /* The number of records removed */
+ u64 discard_count;
+ /* The number of UDS_OVERFLOW errors detected */
+ u64 overflow_count;
+ /* The number of delta lists */
+ u32 list_count;
+};
+
+/*
+ * Setup, teardown, page packing, and save/restore entry points. Functions marked __must_check
+ * return a status that the caller is required to examine.
+ * NOTE(review): the implementations of these functions are not next to these declarations;
+ * consult delta-index.c for their exact contracts.
+ */
+int __must_check uds_initialize_delta_index(struct delta_index *delta_index,
+ unsigned int zone_count, u32 list_count,
+ u32 mean_delta, u32 payload_bits,
+ size_t memory_size, u8 tag);
+
+int __must_check uds_initialize_delta_index_page(struct delta_index_page *delta_index_page,
+ u64 expected_nonce, u32 mean_delta,
+ u32 payload_bits, u8 *memory,
+ size_t memory_size);
+
+void uds_uninitialize_delta_index(struct delta_index *delta_index);
+
+void uds_reset_delta_index(const struct delta_index *delta_index);
+
+int __must_check uds_pack_delta_index_page(const struct delta_index *delta_index,
+ u64 header_nonce, u8 *memory,
+ size_t memory_size,
+ u64 virtual_chapter_number, u32 first_list,
+ u32 *list_count);
+
+int __must_check uds_start_restoring_delta_index(struct delta_index *delta_index,
+ struct buffered_reader **buffered_readers,
+ unsigned int reader_count);
+
+int __must_check uds_finish_restoring_delta_index(struct delta_index *delta_index,
+ struct buffered_reader **buffered_readers,
+ unsigned int reader_count);
+
+int __must_check uds_check_guard_delta_lists(struct buffered_reader **buffered_readers,
+ unsigned int reader_count);
+
+int __must_check uds_start_saving_delta_index(const struct delta_index *delta_index,
+ unsigned int zone_number,
+ struct buffered_writer *buffered_writer);
+
+int __must_check uds_finish_saving_delta_index(const struct delta_index *delta_index,
+ unsigned int zone_number);
+
+int __must_check uds_write_guard_delta_list(struct buffered_writer *buffered_writer);
+
+size_t __must_check uds_compute_delta_index_save_bytes(u32 list_count,
+ size_t memory_size);
+
+/*
+ * Prepare iterator for a search of one delta list. The iterator resumes from the list's
+ * remembered position when key is greater than the saved key, and from the start otherwise.
+ */
+int __must_check uds_start_delta_index_search(const struct delta_index *delta_index,
+ u32 list_number, u32 key,
+ struct delta_index_entry *iterator);
+
+/* Advance to and decode the next entry; sets at_end when the end of the list is reached. */
+int __must_check uds_next_delta_index_entry(struct delta_index_entry *delta_entry);
+
+/* Remember this entry's position in its list header; the entry must not be a collision. */
+int __must_check uds_remember_delta_index_offset(const struct delta_index_entry *delta_entry);
+
+/*
+ * Search a list for key. On success delta_entry is the match or the insertion point; among
+ * colliding entries, the one whose stored name equals name is selected if present.
+ */
+int __must_check uds_get_delta_index_entry(const struct delta_index *delta_index,
+ u32 list_number, u32 key, const u8 *name,
+ struct delta_index_entry *delta_entry);
+
+/* Copy the full name out of a collision entry; UDS_BAD_STATE if it is not a collision. */
+int __must_check uds_get_delta_entry_collision(const struct delta_index_entry *delta_entry,
+ u8 *name);
+
+/* Read the value stored in an entry. */
+u32 __must_check uds_get_delta_entry_value(const struct delta_index_entry *delta_entry);
+
+/* Overwrite an entry's value; UDS_INVALID_ARGUMENT if value does not fit in value_bits. */
+int __must_check uds_set_delta_entry_value(const struct delta_index_entry *delta_entry, u32 value);
+
+/* Insert a new entry at the position found by a search; name is non-NULL for a collision. */
+int __must_check uds_put_delta_index_entry(struct delta_index_entry *delta_entry, u32 key,
+ u32 value, const u8 *name);
+
+/* Remove an entry; delta_entry then describes the entry that followed it. */
+int __must_check uds_remove_delta_index_entry(struct delta_index_entry *delta_entry);
+
+/* Sum the per-zone counters of a delta index into stats. */
+void uds_get_delta_index_stats(const struct delta_index *delta_index,
+ struct delta_index_stats *stats);
+
+/* Estimate the number of bits needed to store entry_count entries. */
+size_t __must_check uds_compute_delta_index_size(u32 entry_count, u32 mean_delta,
+ u32 payload_bits);
+
+/* Compute the number of immutable index pages needed for the given index parameters. */
+u32 uds_get_delta_index_page_count(u32 entry_count, u32 list_count, u32 mean_delta,
+ u32 payload_bits, size_t bytes_per_page);
+
+/* Log a rate-limited description of an entry and clear its list_overflow flag. */
+void uds_log_delta_index_entry(struct delta_index_entry *delta_entry);
+
+#endif /* UDS_DELTA_INDEX_H */
diff --git a/drivers/md/dm-vdo/indexer/funnel-requestqueue.c b/drivers/md/dm-vdo/indexer/funnel-requestqueue.c
new file mode 100644
index 000000000000..1a5735375ddc
--- /dev/null
+++ b/drivers/md/dm-vdo/indexer/funnel-requestqueue.c
@@ -0,0 +1,279 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#include "funnel-requestqueue.h"
+
+#include <linux/atomic.h>
+#include <linux/compiler.h>
+#include <linux/wait.h>
+
+#include "funnel-queue.h"
+#include "logger.h"
+#include "memory-alloc.h"
+#include "thread-utils.h"
+
+/*
+ * This queue will attempt to handle requests in reasonably sized batches instead of reacting
+ * immediately to each new request. The wait time between batches is dynamically adjusted up or
+ * down to try to balance responsiveness against wasted thread run time.
+ *
+ * If the wait time becomes long enough, the queue will become dormant and must be explicitly
+ * awoken when a new request is enqueued. The enqueue operation updates "newest" in the funnel
+ * queue via xchg (which is a memory barrier), and later checks "dormant" to decide whether to do a
+ * wakeup of the worker thread.
+ *
+ * When deciding to go to sleep, the worker thread sets "dormant" and then examines "newest" to
+ * decide if the funnel queue is idle. In dormant mode, the last examination of "newest" before
+ * going to sleep is done inside the wait_event_interruptible() macro, after a point where one or
+ * more memory barriers have been issued. (Preparing to sleep uses spin locks.) Even if the funnel
+ * queue's "next" field update isn't visible yet to make the entry accessible, its existence will
+ * kick the worker thread out of dormant mode and back into timer-based mode.
+ *
+ * Unbatched requests are used to communicate between different zone threads and will also cause
+ * the queue to awaken immediately.
+ */
+
+/*
+ * Tuning constants for the request queue worker. Wait times are expressed in nanoseconds (the
+ * worker's wait time is converted with ns_to_ktime()); the batch thresholds are compared against
+ * the worker's count of processed requests.
+ */
+enum {
+ NANOSECOND = 1,
+ MICROSECOND = 1000 * NANOSECOND,
+ MILLISECOND = 1000 * MICROSECOND,
+ /* The initial wait time, also restored when the worker leaves dormancy */
+ DEFAULT_WAIT_TIME = 20 * MICROSECOND,
+ /* The wait time is never reduced below this value */
+ MINIMUM_WAIT_TIME = DEFAULT_WAIT_TIME / 2,
+ /* Once the wait time grows to this value, the worker becomes dormant */
+ MAXIMUM_WAIT_TIME = MILLISECOND,
+ /* A batch smaller than this causes the wait time to increase */
+ MINIMUM_BATCH = 32,
+ /* A batch larger than this causes the wait time to decrease */
+ MAXIMUM_BATCH = 64,
+};
+
+/*
+ * A request queue: a worker thread takes requests from two funnel queues (retries first, then
+ * new requests) and hands each one to the processor function.
+ */
+struct uds_request_queue {
+ /* Wait queue for synchronizing producers and consumer */
+ struct wait_queue_head wait_head;
+ /* Function to process a request */
+ uds_request_queue_processor_fn processor;
+ /* Queue of new incoming requests */
+ struct funnel_queue *main_queue;
+ /* Queue of old requests to retry */
+ struct funnel_queue *retry_queue;
+ /* Handle for the worker thread, joined by uds_request_queue_finish() */
+ struct thread *thread;
+ /* True if the worker was started */
+ bool started;
+ /* True until uds_request_queue_finish() clears it to tell the worker to exit */
+ bool running;
+ /* A flag set when the worker is waiting without a timeout */
+ atomic_t dormant;
+};
+
+/*
+ * Fetch the next available request without blocking. The retry queue is polled first, so requeued
+ * requests are preferred over new ones. Returns NULL if neither queue yields an entry.
+ */
+static inline struct uds_request *poll_queues(struct uds_request_queue *queue)
+{
+	struct funnel_queue_entry *next = vdo_funnel_queue_poll(queue->retry_queue);
+
+	/* Only look at new requests when no retry is ready. */
+	if (next == NULL)
+		next = vdo_funnel_queue_poll(queue->main_queue);
+
+	if (next == NULL)
+		return NULL;
+
+	return container_of(next, struct uds_request, queue_link);
+}
+
+/* Report whether both the retry queue and the main queue are idle. */
+static inline bool are_queues_idle(struct uds_request_queue *queue)
+{
+	if (!vdo_is_funnel_queue_idle(queue->retry_queue))
+		return false;
+
+	return vdo_is_funnel_queue_idle(queue->main_queue);
+}
+
+/*
+ * Try to fetch the next request; this is the condition evaluated by the wait_event() macros.
+ *
+ * Returns true when the worker should stop waiting: either a request was found (and stored in
+ * *request_ptr), or the queue is no longer running (*request_ptr is NULL). Returns false, with
+ * *request_ptr set to NULL and *waited_ptr set to true, when there is nothing to do and the
+ * worker may sleep. This function never clears *waited_ptr.
+ */
+static inline bool dequeue_request(struct uds_request_queue *queue,
+				   struct uds_request **request_ptr, bool *waited_ptr)
+{
+	struct uds_request *found = poll_queues(queue);
+
+	*request_ptr = found;
+	if (found != NULL)
+		return true;
+
+	/* Nothing is queued. If the queue is shutting down, wake the worker so it can exit. */
+	if (!READ_ONCE(queue->running))
+		return true;
+
+	*waited_ptr = true;
+	return false;
+}
+
+/*
+ * Wait until dequeue_request() says the worker should proceed. In dormant mode the wait has no
+ * timeout and also ends as soon as either funnel queue is no longer idle; otherwise the wait is
+ * bounded by "timeout", in nanoseconds. dequeue_request() stores the request to process (or NULL)
+ * in *request and may set *waited. The return values of the wait macros are ignored, so the
+ * caller must decide what to do from *request alone.
+ */
+static void wait_for_request(struct uds_request_queue *queue, bool dormant,
+ unsigned long timeout, struct uds_request **request,
+ bool *waited)
+{
+ if (dormant) {
+ wait_event_interruptible(queue->wait_head,
+ (dequeue_request(queue, request, waited) ||
+ !are_queues_idle(queue)));
+ return;
+ }
+
+ wait_event_interruptible_hrtimeout(queue->wait_head,
+ dequeue_request(queue, request, waited),
+ ns_to_ktime(timeout));
+}
+
+/*
+ * The body of the worker thread. It repeatedly waits for a request and hands it to the queue's
+ * processor, adjusting the wait time between batches, until the queue stops running. Requests
+ * still queued at shutdown are processed before the thread returns.
+ *
+ * "arg" is the struct uds_request_queue this worker serves.
+ */
+static void request_queue_worker(void *arg)
+{
+	struct uds_request_queue *queue = arg;
+	struct uds_request *request = NULL;
+	unsigned long time_batch = DEFAULT_WAIT_TIME;
+	bool dormant = atomic_read(&queue->dormant);
+	bool waited;
+	long current_batch = 0;
+
+	for (;;) {
+		/*
+		 * dequeue_request() only ever sets this flag, so it must be cleared before every
+		 * wait. If it were left set, one idle period would make every later request look
+		 * like the end of a batch, and the wait time would be adjusted after each single
+		 * request instead of after each batch.
+		 */
+		waited = false;
+		wait_for_request(queue, dormant, time_batch, &request, &waited);
+		if (likely(request != NULL)) {
+			current_batch++;
+			queue->processor(request);
+		} else if (!READ_ONCE(queue->running)) {
+			break;
+		}
+
+		if (dormant) {
+			/*
+			 * The queue has been roused from dormancy. Clear the flag so enqueuers can
+			 * stop broadcasting. No fence is needed for this transition.
+			 */
+			atomic_set(&queue->dormant, false);
+			dormant = false;
+			time_batch = DEFAULT_WAIT_TIME;
+		} else if (waited) {
+			/*
+			 * We waited for this request to show up. Adjust the wait time to smooth
+			 * out the batch size.
+			 */
+			if (current_batch < MINIMUM_BATCH) {
+				/*
+				 * If the last batch of requests was too small, increase the wait
+				 * time.
+				 */
+				time_batch += time_batch / 4;
+				if (time_batch >= MAXIMUM_WAIT_TIME) {
+					atomic_set(&queue->dormant, true);
+					dormant = true;
+				}
+			} else if (current_batch > MAXIMUM_BATCH) {
+				/*
+				 * If the last batch of requests was too large, decrease the wait
+				 * time.
+				 */
+				time_batch -= time_batch / 4;
+				if (time_batch < MINIMUM_WAIT_TIME)
+					time_batch = MINIMUM_WAIT_TIME;
+			}
+			current_batch = 0;
+		}
+	}
+
+	/*
+	 * Ensure that we process any remaining requests that were enqueued before trying to shut
+	 * down. The corresponding write barrier is in uds_request_queue_finish().
+	 */
+	smp_rmb();
+	while ((request = poll_queues(queue)) != NULL)
+		queue->processor(request);
+}
+
+/*
+ * Allocate a request queue, create its two funnel queues, and start its worker thread.
+ *
+ * On success the new queue is stored in *queue_ptr and UDS_SUCCESS is returned. On failure the
+ * partially built queue is torn down with uds_request_queue_finish(), the failing call's result
+ * is returned, and *queue_ptr is not written.
+ */
+int uds_make_request_queue(const char *queue_name,
+			   uds_request_queue_processor_fn processor,
+			   struct uds_request_queue **queue_ptr)
+{
+	struct uds_request_queue *new_queue;
+	int status;
+
+	status = vdo_allocate(1, struct uds_request_queue, __func__, &new_queue);
+	if (status != VDO_SUCCESS)
+		return status;
+
+	new_queue->processor = processor;
+	new_queue->running = true;
+	atomic_set(&new_queue->dormant, false);
+	init_waitqueue_head(&new_queue->wait_head);
+
+	status = vdo_make_funnel_queue(&new_queue->main_queue);
+	if (status != VDO_SUCCESS)
+		goto abandon;
+
+	status = vdo_make_funnel_queue(&new_queue->retry_queue);
+	if (status != VDO_SUCCESS)
+		goto abandon;
+
+	status = vdo_create_thread(request_queue_worker, new_queue, queue_name,
+				   &new_queue->thread);
+	if (status != VDO_SUCCESS)
+		goto abandon;
+
+	new_queue->started = true;
+	*queue_ptr = new_queue;
+	return UDS_SUCCESS;
+
+abandon:
+	/*
+	 * NOTE(review): cleanup reads "started" and the queue pointers, so this presumably relies
+	 * on vdo_allocate() returning zeroed memory - confirm.
+	 */
+	uds_request_queue_finish(new_queue);
+	return status;
+}
+
+/* Wake the worker thread, but only if something is currently sleeping on the wait queue. */
+static inline void wake_up_worker(struct uds_request_queue *queue)
+{
+ if (wq_has_sleeper(&queue->wait_head))
+ wake_up(&queue->wait_head);
+}
+
+/*
+ * Add a request to the queue. Requests marked "requeued" go on the retry queue and all others go
+ * on the main queue. The worker is woken if it is dormant or if the request is unbatched;
+ * otherwise the request waits for the worker's next timed poll.
+ */
+void uds_request_queue_enqueue(struct uds_request_queue *queue,
+ struct uds_request *request)
+{
+ struct funnel_queue *sub_queue;
+ /*
+ * The flag is read before the request is queued. NOTE(review): presumably the request must
+ * not be touched after the put because the worker may already be processing it - confirm.
+ */
+ bool unbatched = request->unbatched;
+
+ sub_queue = request->requeued ? queue->retry_queue : queue->main_queue;
+ vdo_funnel_queue_put(sub_queue, &request->queue_link);
+
+ /*
+ * We must wake the worker thread when it is dormant. A read fence isn't needed here since
+ * we know the queue operation acts as one.
+ */
+ if (atomic_read(&queue->dormant) || unbatched)
+ wake_up_worker(queue);
+}
+
+/*
+ * Stop the worker thread (if it was started), wait for it to exit, and free the queue. A NULL
+ * queue is ignored. This is also the cleanup path for a partially constructed queue in
+ * uds_make_request_queue(), so the funnel queue pointers may not have been set yet.
+ * NOTE(review): that presumes vdo_free_funnel_queue() accepts NULL - confirm.
+ */
+void uds_request_queue_finish(struct uds_request_queue *queue)
+{
+ if (queue == NULL)
+ return;
+
+ /*
+ * This memory barrier ensures that any requests we queued will be seen. The point is that
+ * when dequeue_request() sees the following update to the running flag, it will also be
+ * able to see any change we made to a next field in the funnel queue entry. The
+ * corresponding read barrier is in request_queue_worker().
+ */
+ smp_wmb();
+ WRITE_ONCE(queue->running, false);
+
+ if (queue->started) {
+ wake_up_worker(queue);
+ vdo_join_threads(queue->thread);
+ }
+
+ vdo_free_funnel_queue(queue->main_queue);
+ vdo_free_funnel_queue(queue->retry_queue);
+ vdo_free(queue);
+}
diff --git a/drivers/md/dm-vdo/indexer/funnel-requestqueue.h b/drivers/md/dm-vdo/indexer/funnel-requestqueue.h
new file mode 100644
index 000000000000..9b0f53939b4d
--- /dev/null
+++ b/drivers/md/dm-vdo/indexer/funnel-requestqueue.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#ifndef UDS_REQUEST_QUEUE_H
+#define UDS_REQUEST_QUEUE_H
+
+#include "indexer.h"
+
+/*
+ * A simple request queue which will handle new requests in the order in which they are received,
+ * and will attempt to handle requeued requests before new ones. However, the nature of the
+ * implementation means that it cannot guarantee this ordering; the prioritization is merely a
+ * hint.
+ */
+
+struct uds_request_queue;
+
+typedef void (*uds_request_queue_processor_fn)(struct uds_request *);
+
+int __must_check uds_make_request_queue(const char *queue_name,
+ uds_request_queue_processor_fn processor,
+ struct uds_request_queue **queue_ptr);
+
+void uds_request_queue_enqueue(struct uds_request_queue *queue,
+ struct uds_request *request);
+
+void uds_request_queue_finish(struct uds_request_queue *queue);
+
+#endif /* UDS_REQUEST_QUEUE_H */
diff --git a/drivers/md/dm-vdo/indexer/geometry.c b/drivers/md/dm-vdo/indexer/geometry.c
new file mode 100644
index 000000000000..c0575612e820
--- /dev/null
+++ b/drivers/md/dm-vdo/indexer/geometry.c
@@ -0,0 +1,201 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#include "geometry.h"
+
+#include <linux/compiler.h>
+#include <linux/log2.h>
+
+#include "errors.h"
+#include "logger.h"
+#include "memory-alloc.h"
+#include "permassert.h"
+
+#include "delta-index.h"
+#include "indexer.h"
+
+/*
+ * An index volume is divided into a fixed number of fixed-size chapters, each consisting of a
+ * fixed number of fixed-size pages. The volume layout is defined by two constants and four
+ * parameters. The constants are that index records are 32 bytes long (16-byte block name plus
+ * 16-byte metadata) and that open chapter index hash slots are one byte long. The four parameters
+ * are the number of bytes in a page, the number of record pages in a chapter, the number of
+ * chapters in a volume, and the number of chapters that are sparse. From these parameters, we can
+ * derive the rest of the layout and other index properties.
+ *
+ * The index volume is sized by its maximum memory footprint. For a dense index, the persistent
+ * storage is about 10 times the size of the memory footprint. For a sparse index, the persistent
+ * storage is about 100 times the size of the memory footprint.
+ *
+ * For a small index with a memory footprint less than 1GB, there are three possible memory
+ * configurations: 0.25GB, 0.5GB and 0.75GB. The default geometry for each is 1024 index records
+ * per 32 KB page, 1024 chapters per volume, and either 64, 128, or 192 record pages per chapter
+ * (resulting in 6, 13, or 20 index pages per chapter) depending on the memory configuration. For
+ * the VDO default of a 0.25 GB index, this yields a deduplication window of 256 GB using about 2.5
+ * GB for the persistent storage and 256 MB of RAM.
+ *
+ * For a larger index with a memory footprint that is a multiple of 1 GB, the geometry is 1024
+ * index records per 32 KB page, 256 record pages per chapter, 26 index pages per chapter, and 1024
+ * chapters for every GB of memory footprint. For a 1 GB volume, this yields a deduplication window
+ * of 1 TB using about 9GB of persistent storage and 1 GB of RAM.
+ *
+ * The above numbers hold for volumes which have no sparse chapters. A sparse volume has 10 times
+ * as many chapters as the corresponding non-sparse volume, which provides 10 times the
+ * deduplication window while using 10 times as much persistent storage as the equivalent
+ * non-sparse volume with the same memory footprint.
+ *
+ * If the volume has been converted from a non-lvm format to an lvm volume, the number of chapters
+ * per volume will have been reduced by one by eliminating physical chapter 0, and the virtual
+ * chapter that formerly mapped to physical chapter 0 may be remapped to another physical chapter.
+ * This remapping is expressed by storing which virtual chapter was remapped, and which physical
+ * chapter it was moved to.
+ */
+
+/*
+ * Allocate an index_geometry and fill it in from the layout parameters and the chapter remapping
+ * information, computing every derived field.
+ *
+ * On success the new geometry is stored in *geometry_ptr and UDS_SUCCESS is returned; it can be
+ * released with uds_free_index_geometry(). If the allocation fails, its result is returned and
+ * *geometry_ptr is not written.
+ *
+ * NOTE(review): the parameters are not validated here (for example, a zero
+ * record_pages_per_chapter or sparse_chapters_per_volume > chapters_per_volume would wrap);
+ * presumably callers pass values that have already been checked.
+ */
+int uds_make_index_geometry(size_t bytes_per_page, u32 record_pages_per_chapter,
+ u32 chapters_per_volume, u32 sparse_chapters_per_volume,
+ u64 remapped_virtual, u64 remapped_physical,
+ struct index_geometry **geometry_ptr)
+{
+ int result;
+ struct index_geometry *geometry;
+
+ result = vdo_allocate(1, struct index_geometry, "geometry", &geometry);
+ if (result != VDO_SUCCESS)
+ return result;
+
+ geometry->bytes_per_page = bytes_per_page;
+ geometry->record_pages_per_chapter = record_pages_per_chapter;
+ geometry->chapters_per_volume = chapters_per_volume;
+ geometry->sparse_chapters_per_volume = sparse_chapters_per_volume;
+ geometry->dense_chapters_per_volume = chapters_per_volume - sparse_chapters_per_volume;
+ geometry->remapped_virtual = remapped_virtual;
+ geometry->remapped_physical = remapped_physical;
+
+ geometry->records_per_page = bytes_per_page / BYTES_PER_RECORD;
+ geometry->records_per_chapter = geometry->records_per_page * record_pages_per_chapter;
+ /* Widen before multiplying so the per-volume record count is computed in 64 bits. */
+ geometry->records_per_volume = (u64) geometry->records_per_chapter * chapters_per_volume;
+
+ geometry->chapter_mean_delta = 1 << DEFAULT_CHAPTER_MEAN_DELTA_BITS;
+ geometry->chapter_payload_bits = bits_per(record_pages_per_chapter - 1);
+ /*
+ * We want 1 delta list for every 64 records in the chapter.
+ * The "| 077" ensures that the chapter_delta_list_bits computation
+ * does not underflow.
+ */
+ geometry->chapter_delta_list_bits =
+ bits_per((geometry->records_per_chapter - 1) | 077) - 6;
+ geometry->delta_lists_per_chapter = 1 << geometry->chapter_delta_list_bits;
+ /* We need enough address bits to achieve the desired mean delta. */
+ geometry->chapter_address_bits =
+ (DEFAULT_CHAPTER_MEAN_DELTA_BITS -
+ geometry->chapter_delta_list_bits +
+ bits_per(geometry->records_per_chapter - 1));
+ geometry->index_pages_per_chapter =
+ uds_get_delta_index_page_count(geometry->records_per_chapter,
+ geometry->delta_lists_per_chapter,
+ geometry->chapter_mean_delta,
+ geometry->chapter_payload_bits,
+ bytes_per_page);
+
+ /* A chapter holds its index pages plus its record pages; the volume adds header pages. */
+ geometry->pages_per_chapter = geometry->index_pages_per_chapter + record_pages_per_chapter;
+ geometry->pages_per_volume = geometry->pages_per_chapter * chapters_per_volume;
+ geometry->bytes_per_volume =
+ bytes_per_page * (geometry->pages_per_volume + HEADER_PAGES_PER_VOLUME);
+
+ *geometry_ptr = geometry;
+ return UDS_SUCCESS;
+}
+
+/*
+ * Make a new geometry from the parameters stored in an existing one. The derived fields are
+ * recomputed by uds_make_index_geometry() rather than copied, and its result is returned.
+ */
+int uds_copy_index_geometry(struct index_geometry *source,
+ struct index_geometry **geometry_ptr)
+{
+ return uds_make_index_geometry(source->bytes_per_page,
+ source->record_pages_per_chapter,
+ source->chapters_per_volume,
+ source->sparse_chapters_per_volume,
+ source->remapped_virtual, source->remapped_physical,
+ geometry_ptr);
+}
+
+/* Release a geometry made by uds_make_index_geometry() or uds_copy_index_geometry(). */
+void uds_free_index_geometry(struct index_geometry *geometry)
+{
+ vdo_free(geometry);
+}
+
+/*
+ * Translate a virtual chapter number into the physical chapter that holds it.
+ *
+ * For a geometry that has not been reduced, this is the virtual chapter number modulo the
+ * number of chapters in the volume. For a reduced geometry, the mapping is computed relative to
+ * the remapped virtual chapter, which itself lives at the remapped physical chapter.
+ */
+u32 __must_check uds_map_to_physical_chapter(const struct index_geometry *geometry,
+					     u64 virtual_chapter)
+{
+	const u64 moved_virtual = geometry->remapped_virtual;
+	const u32 chapter_count = geometry->chapters_per_volume;
+	u64 distance;
+
+	if (!uds_is_reduced_index_geometry(geometry))
+		return virtual_chapter % chapter_count;
+
+	/* The common case: a chapter newer than the remapped one. */
+	if (likely(virtual_chapter > moved_virtual)) {
+		distance = virtual_chapter - moved_virtual;
+		return likely(distance > geometry->remapped_physical) ?
+			distance % chapter_count : distance - 1;
+	}
+
+	distance = moved_virtual - virtual_chapter;
+	if (distance == 0)
+		return geometry->remapped_physical;
+
+	/* If the chapter is too old to fall in this range, the answer doesn't matter. */
+	return (distance < chapter_count) ? chapter_count - distance : 0;
+}
+
+/*
+ * Report whether any sparse chapters are in use: true only for a sparse geometry in which the
+ * range of virtual chapters from oldest to newest, inclusive, covers more chapters than the
+ * volume has dense chapters.
+ */
+bool uds_has_sparse_chapters(const struct index_geometry *geometry,
+			     u64 oldest_virtual_chapter, u64 newest_virtual_chapter)
+{
+	u64 chapters_in_use;
+
+	if (!uds_is_sparse_index_geometry(geometry))
+		return false;
+
+	chapters_in_use = newest_virtual_chapter - oldest_virtual_chapter + 1;
+	return chapters_in_use > geometry->dense_chapters_per_volume;
+}
+
+/*
+ * Report whether the given virtual chapter is sparse: sparse chapters must be in use, and the
+ * chapter must be at least dense_chapters_per_volume chapters older than the newest chapter.
+ */
+bool uds_is_chapter_sparse(const struct index_geometry *geometry,
+			   u64 oldest_virtual_chapter, u64 newest_virtual_chapter,
+			   u64 virtual_chapter_number)
+{
+	if (!uds_has_sparse_chapters(geometry, oldest_virtual_chapter,
+				     newest_virtual_chapter))
+		return false;
+
+	return (virtual_chapter_number + geometry->dense_chapters_per_volume) <=
+		newest_virtual_chapter;
+}
+
+/*
+ * Calculate how many chapters to expire after opening the newest chapter. The answer is 0 until
+ * the index is full and normally 1 afterwards; a remapped chapter changes it to 2 or 0 at two
+ * specific points.
+ */
+u32 uds_chapters_to_expire(const struct index_geometry *geometry, u64 newest_chapter)
+{
+	u64 oldest_chapter;
+
+	/* Nothing expires until the index is full. */
+	if (newest_chapter < geometry->chapters_per_volume)
+		return 0;
+
+	/* With no chapter out of order, always expire exactly one. */
+	if (geometry->remapped_physical == 0)
+		return 1;
+
+	oldest_chapter = newest_chapter - geometry->chapters_per_volume;
+
+	/*
+	 * Expiring the moved chapter does not free the physical space the new chapter needs, so
+	 * expire an extra chapter along with it.
+	 */
+	if (oldest_chapter == geometry->remapped_virtual)
+		return 2;
+
+	/*
+	 * The new chapter will use the physical chapter that was freed by expiring the moved
+	 * chapter, so nothing needs to be expired.
+	 */
+	if (oldest_chapter == (geometry->remapped_virtual + geometry->remapped_physical))
+		return 0;
+
+	return 1;
+}
diff --git a/drivers/md/dm-vdo/indexer/geometry.h b/drivers/md/dm-vdo/indexer/geometry.h
new file mode 100644
index 000000000000..a2ecdb238cf2
--- /dev/null
+++ b/drivers/md/dm-vdo/indexer/geometry.h
@@ -0,0 +1,140 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#ifndef UDS_INDEX_GEOMETRY_H
+#define UDS_INDEX_GEOMETRY_H
+
+#include "indexer.h"
+
+/*
+ * The index_geometry records parameters that define the layout of a UDS index volume, and the size and
+ * shape of various index structures. It is created when the index is created, and is referenced by
+ * many index sub-components.
+ */
+
+struct index_geometry {
+ /* Size of a chapter page, in bytes */
+ size_t bytes_per_page;
+ /* Number of record pages in a chapter */
+ u32 record_pages_per_chapter;
+ /* Total number of chapters in a volume */
+ u32 chapters_per_volume;
+ /* Number of sparsely-indexed chapters in a volume */
+ u32 sparse_chapters_per_volume;
+ /*
+ * Number of bits used to determine delta list numbers. Although it is listed with the
+ * parameters, this is computed from records_per_chapter by uds_make_index_geometry().
+ */
+ u8 chapter_delta_list_bits;
+ /* Virtual chapter remapped from physical chapter 0 */
+ u64 remapped_virtual;
+ /* New physical chapter where the remapped chapter can be found */
+ u64 remapped_physical;
+
+ /*
+ * The following properties are derived from the ones above, but they are computed and
+ * recorded as fields for convenience.
+ */
+ /* Total number of pages in a volume, excluding the header */
+ u32 pages_per_volume;
+ /* Total number of bytes in a volume, including the header */
+ size_t bytes_per_volume;
+ /* Number of pages in a chapter */
+ u32 pages_per_chapter;
+ /* Number of index pages in a chapter index */
+ u32 index_pages_per_chapter;
+ /* Number of records that fit on a page */
+ u32 records_per_page;
+ /* Number of records that fit in a chapter */
+ u32 records_per_chapter;
+ /* Number of records that fit in a volume */
+ u64 records_per_volume;
+ /* Number of delta lists per chapter index */
+ u32 delta_lists_per_chapter;
+ /* Mean delta for chapter indexes */
+ u32 chapter_mean_delta;
+ /* Number of bits needed for record page numbers */
+ u8 chapter_payload_bits;
+ /* Number of bits used to compute addresses for chapter delta lists */
+ u8 chapter_address_bits;
+ /* Number of densely-indexed chapters in a volume */
+ u32 dense_chapters_per_volume;
+};
+
+/* The record size and the default values for the index geometry parameters. */
+enum {
+ /* The number of bytes in a record (name + metadata) */
+ BYTES_PER_RECORD = (UDS_RECORD_NAME_SIZE + UDS_RECORD_DATA_SIZE),
+
+ /* The default length of a page in a chapter, in bytes (room for 1024 records) */
+ DEFAULT_BYTES_PER_PAGE = 1024 * BYTES_PER_RECORD,
+
+ /* The default maximum number of records per page (evaluates to 1024) */
+ DEFAULT_RECORDS_PER_PAGE = DEFAULT_BYTES_PER_PAGE / BYTES_PER_RECORD,
+
+ /* The default number of record pages in a chapter */
+ DEFAULT_RECORD_PAGES_PER_CHAPTER = 256,
+
+ /* The default number of record pages in a chapter for a small index */
+ SMALL_RECORD_PAGES_PER_CHAPTER = 64,
+
+ /* The default number of chapters in a volume */
+ DEFAULT_CHAPTERS_PER_VOLUME = 1024,
+
+ /* The default number of sparsely-indexed chapters in a volume */
+ DEFAULT_SPARSE_CHAPTERS_PER_VOLUME = 0,
+
+ /* The log2 of the default mean delta */
+ DEFAULT_CHAPTER_MEAN_DELTA_BITS = 16,
+
+ /* The log2 of the number of delta lists in a large chapter */
+ DEFAULT_CHAPTER_DELTA_LIST_BITS = 12,
+
+ /* The log2 of the number of delta lists in a small chapter */
+ SMALL_CHAPTER_DELTA_LIST_BITS = 10,
+
+ /* The number of header pages per volume */
+ HEADER_PAGES_PER_VOLUME = 1,
+};
+
+int __must_check uds_make_index_geometry(size_t bytes_per_page, u32 record_pages_per_chapter,
+ u32 chapters_per_volume,
+ u32 sparse_chapters_per_volume, u64 remapped_virtual,
+ u64 remapped_physical,
+ struct index_geometry **geometry_ptr);
+
+int __must_check uds_copy_index_geometry(struct index_geometry *source,
+ struct index_geometry **geometry_ptr);
+
+void uds_free_index_geometry(struct index_geometry *geometry);
+
+u32 __must_check uds_map_to_physical_chapter(const struct index_geometry *geometry,
+ u64 virtual_chapter);
+
+/*
+ * Report whether this geometry has been reduced by one chapter, which is detected by an odd
+ * chapters_per_volume. This will only be true if the volume was converted from a non-lvm volume
+ * to an lvm volume.
+ */
+static inline bool __must_check
+uds_is_reduced_index_geometry(const struct index_geometry *geometry)
+{
+	return (geometry->chapters_per_volume % 2) != 0;
+}
+
+/* Report whether this geometry has any sparsely-indexed chapters. */
+static inline bool __must_check
+uds_is_sparse_index_geometry(const struct index_geometry *geometry)
+{
+	return geometry->sparse_chapters_per_volume != 0;
+}
+
+bool __must_check uds_has_sparse_chapters(const struct index_geometry *geometry,
+ u64 oldest_virtual_chapter,
+ u64 newest_virtual_chapter);
+
+bool __must_check uds_is_chapter_sparse(const struct index_geometry *geometry,
+ u64 oldest_virtual_chapter,
+ u64 newest_virtual_chapter,
+ u64 virtual_chapter_number);
+
+u32 __must_check uds_chapters_to_expire(const struct index_geometry *geometry,
+ u64 newest_chapter);
+
+#endif /* UDS_INDEX_GEOMETRY_H */
diff --git a/drivers/md/dm-vdo/indexer/hash-utils.h b/drivers/md/dm-vdo/indexer/hash-utils.h
new file mode 100644
index 000000000000..6a8dd8ffea6c
--- /dev/null
+++ b/drivers/md/dm-vdo/indexer/hash-utils.h
@@ -0,0 +1,66 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#ifndef UDS_HASH_UTILS_H
+#define UDS_HASH_UTILS_H
+
+#include "numeric.h"
+
+#include "geometry.h"
+#include "indexer.h"
+
+/* Utilities for extracting portions of a request name for various uses. */
+
+/*
+ * How various portions of a record name are apportioned. Offsets and counts are in bytes within
+ * the name array of a struct uds_record_name.
+ */
+enum {
+ /* Bytes 0-7, read by uds_extract_volume_index_bytes() */
+ VOLUME_INDEX_BYTES_OFFSET = 0,
+ VOLUME_INDEX_BYTES_COUNT = 8,
+ /* Bytes 8-13, read by uds_extract_chapter_index_bytes() */
+ CHAPTER_INDEX_BYTES_OFFSET = 8,
+ CHAPTER_INDEX_BYTES_COUNT = 6,
+ /* Bytes 14-15, read by uds_extract_sampling_bytes() */
+ SAMPLE_BYTES_OFFSET = 14,
+ SAMPLE_BYTES_COUNT = 2,
+};
+
+/*
+ * Read the six chapter index bytes of a record name as a big-endian value, returned in the low
+ * 48 bits of the result.
+ */
+static inline u64 uds_extract_chapter_index_bytes(const struct uds_record_name *name)
+{
+	const u8 *field = &name->name[CHAPTER_INDEX_BYTES_OFFSET];
+	u64 high = get_unaligned_be16(field);
+	u64 low = get_unaligned_be32(field + 2);
+
+	return (high << 32) | low;
+}
+
+/* Read the eight volume index bytes of a record name as a big-endian 64-bit value. */
+static inline u64 uds_extract_volume_index_bytes(const struct uds_record_name *name)
+{
+	const u8 *field = &name->name[VOLUME_INDEX_BYTES_OFFSET];
+
+	return get_unaligned_be64(field);
+}
+
+/* Read the two sampling bytes of a record name as a big-endian 16-bit value. */
+static inline u32 uds_extract_sampling_bytes(const struct uds_record_name *name)
+{
+	const u8 *field = &name->name[SAMPLE_BYTES_OFFSET];
+
+	return get_unaligned_be16(field);
+}
+
+/*
+ * Compute the chapter delta list for a given name: the chapter_delta_list_bits bits of the
+ * name's chapter index bytes that lie just above the low chapter_address_bits bits.
+ */
+static inline u32 uds_hash_to_chapter_delta_list(const struct uds_record_name *name,
+						 const struct index_geometry *geometry)
+{
+	u64 chapter_bytes = uds_extract_chapter_index_bytes(name);
+	int list_mask = (1 << geometry->chapter_delta_list_bits) - 1;
+
+	return (chapter_bytes >> geometry->chapter_address_bits) & list_mask;
+}
+
+/*
+ * Compute the chapter delta address for a given name: the low chapter_address_bits bits of the
+ * name's chapter index bytes.
+ */
+static inline u32 uds_hash_to_chapter_delta_address(const struct uds_record_name *name,
+						    const struct index_geometry *geometry)
+{
+	u64 chapter_bytes = uds_extract_chapter_index_bytes(name);
+	int address_mask = (1 << geometry->chapter_address_bits) - 1;
+
+	return chapter_bytes & address_mask;
+}
+
+/*
+ * Map a name to a hash slot in the range [0, slot_count) by reducing its chapter index bytes
+ * modulo slot_count. The slot_count must not be zero.
+ */
+static inline unsigned int uds_name_to_hash_slot(const struct uds_record_name *name,
+						 unsigned int slot_count)
+{
+	u64 chapter_bytes = uds_extract_chapter_index_bytes(name);
+
+	return (unsigned int) (chapter_bytes % slot_count);
+}
+
+#endif /* UDS_HASH_UTILS_H */
diff --git a/drivers/md/dm-vdo/indexer/index-layout.c b/drivers/md/dm-vdo/indexer/index-layout.c
new file mode 100644
index 000000000000..627adc24af3b
--- /dev/null
+++ b/drivers/md/dm-vdo/indexer/index-layout.c
@@ -0,0 +1,1765 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#include "index-layout.h"
+
+#include <linux/random.h>
+
+#include "logger.h"
+#include "memory-alloc.h"
+#include "murmurhash3.h"
+#include "numeric.h"
+#include "time-utils.h"
+
+#include "config.h"
+#include "open-chapter.h"
+#include "volume-index.h"
+
+/*
+ * The UDS layout on storage media is divided into a number of fixed-size regions, the sizes of
+ * which are computed when the index is created. Every header and region begins on a 4K block
+ * boundary. Save regions are further sub-divided into regions of their own.
+ *
+ * Each region has a kind and an instance number. Some kinds only have one instance and therefore
+ * use RL_SOLE_INSTANCE (-1) as the instance number. The RL_KIND_INDEX used to use instances to
+ * represent sub-indices; now, however there is only ever one sub-index and therefore one instance.
+ * The RL_KIND_VOLUME_INDEX uses instances to record which zone is being saved.
+ *
+ * Every region header has a type and version.
+ *
+ * +-+-+---------+--------+--------+-+
+ * | | | I N D E X 0 101, 0 | |
+ * |H|C+---------+--------+--------+S|
+ * |D|f| Volume | Save | Save |e|
+ * |R|g| Region | Region | Region |a|
+ * | | | 201, -1 | 202, 0 | 202, 1 |l|
+ * +-+-+--------+---------+--------+-+
+ *
+ * The header contains the encoded region layout table as well as some index configuration data.
+ * The sub-index region and its subdivisions are maintained in the same table.
+ *
+ * There are two save regions to preserve the old state in case saving the new state is incomplete.
+ * They are used in alternation. Each save region is further divided into sub-regions.
+ *
+ * +-+-----+------+------+-----+-----+
+ * |H| IPM | MI | MI | | OC |
+ * |D| | zone | zone | ... | |
+ * |R| 301 | 302 | 302 | | 303 |
+ * | | -1 | 0 | 1 | | -1 |
+ * +-+-----+------+------+-----+-----+
+ *
+ * The header contains the encoded region layout table as well as index state data for that save.
+ * Each save also has a unique nonce.
+ */
+
+/* The size, in bytes, of the magic label in the super block */
+#define MAGIC_SIZE 32
+/* The size, in bytes, of the nonce info buffer in the super block */
+#define NONCE_INFO_SIZE 32
+/* The number of save regions in a layout; compute_sizes() uses this as the save count */
+#define MAX_SAVES 2
+
+/*
+ * The kinds of region. The numeric values are the ones that appear in the layout diagrams in
+ * this file (for example 101 for the index, 201 for the volume, and 202 for a save).
+ */
+enum region_kind {
+ RL_KIND_EMPTY = 0,
+ RL_KIND_HEADER = 1,
+ RL_KIND_CONFIG = 100,
+ RL_KIND_INDEX = 101,
+ RL_KIND_SEAL = 102,
+ RL_KIND_VOLUME = 201,
+ RL_KIND_SAVE = 202,
+ RL_KIND_INDEX_PAGE_MAP = 301,
+ RL_KIND_VOLUME_INDEX = 302,
+ RL_KIND_OPEN_CHAPTER = 303,
+};
+
+/*
+ * The region types. Some region types are historical and are no longer used.
+ * NOTE(review): presumably these are the values stored in the type field of a region header -
+ * confirm against the code that reads and writes headers.
+ */
+enum region_type {
+ RH_TYPE_FREE = 0, /* unused */
+ RH_TYPE_SUPER = 1,
+ RH_TYPE_SAVE = 2,
+ RH_TYPE_CHECKPOINT = 3, /* unused */
+ RH_TYPE_UNSAVED = 4,
+};
+
+/*
+ * The instance number for kinds that only have one instance. The layout notes write this as -1;
+ * 65535 is that value as stored in the u16 instance field of a layout region.
+ */
+#define RL_SOLE_INSTANCE 65535
+
+/*
+ * Super block version 2 is the first released version.
+ *
+ * Super block version 3 is the normal version used from RHEL 8.2 onwards.
+ *
+ * Super block versions 4 through 6 were incremental development versions and
+ * are not supported.
+ *
+ * Super block version 7 is used for volumes which have been reduced in size by one chapter in
+ * order to make room to prepend LVM metadata to a volume originally created without LVM. This
+ * allows the index to retain most of its deduplication records.
+ */
+#define SUPER_VERSION_MINIMUM 3
+#define SUPER_VERSION_CURRENT 3
+#define SUPER_VERSION_MAXIMUM 7
+
+/*
+ * The magic label is exactly MAGIC_SIZE characters long, so the array holds no terminating NUL
+ * and must be handled as a fixed-size byte buffer, not as a C string.
+ */
+static const u8 LAYOUT_MAGIC[MAGIC_SIZE] = "*ALBIREO*SINGLE*FILE*LAYOUT*001*";
+static const u64 REGION_MAGIC = 0x416c6252676e3031; /* 'AlbRgn01' */
+
+/*
+ * The header of a region table (see struct region_table). As the layout notes say, every region
+ * header has a type and a version.
+ */
+struct region_header {
+ u64 magic;
+ u64 region_blocks;
+ u16 type;
+ /* Currently always version 1 */
+ u16 version;
+ u16 region_count;
+ u16 payload;
+};
+
+/*
+ * One region of the layout. Each region has a kind (enum region_kind) and an instance number;
+ * kinds with only one instance use RL_SOLE_INSTANCE.
+ */
+struct layout_region {
+ u64 start_block;
+ u64 block_count;
+ u32 __unused;
+ u16 kind;
+ u16 instance;
+};
+
+/*
+ * A region table: a header followed by a variable number of regions (a flexible array member).
+ * NOTE(review): presumably header.region_count gives the length of regions[] - confirm.
+ */
+struct region_table {
+ size_t encoded_size;
+ struct region_header header;
+ struct layout_region regions[];
+};
+
+/* Data recorded for a save. As the layout notes say, each save has a unique nonce. */
+struct index_save_data {
+ u64 timestamp;
+ u64 nonce;
+ /* Currently always version 1 */
+ u32 version;
+ u32 unused__;
+};
+
+/* A signature and version id pair; it is the first member of struct index_state_data301. */
+struct index_state_version {
+ s32 signature;
+ s32 version_id;
+};
+
+/* NOTE(review): presumably the version recorded in (and expected of) index_state_data301. */
+static const struct index_state_version INDEX_STATE_VERSION_301 = {
+ .signature = -1,
+ .version_id = 301,
+};
+
+/* The index state data kept in each save layout (see struct index_save_layout). */
+struct index_state_data301 {
+ struct index_state_version version;
+ u64 newest_chapter;
+ u64 oldest_chapter;
+ u64 last_save;
+ u32 unused;
+ u32 padding;
+};
+
+/*
+ * The layout of one save region. Its sub-regions correspond to the save diagram in the layout
+ * notes: a header, the index page map, one volume index region per zone, and the open chapter.
+ */
+struct index_save_layout {
+ unsigned int zone_count;
+ struct layout_region index_save;
+ struct layout_region header;
+ struct layout_region index_page_map;
+ struct layout_region free_space;
+ struct layout_region volume_index_zones[MAX_ZONES];
+ struct layout_region open_chapter;
+ struct index_save_data save_data;
+ struct index_state_data301 state_data;
+};
+
+/*
+ * The layout of the index region, which per the layout notes contains a volume region and the
+ * save regions. NOTE(review): "saves" is presumably an array with one entry per save - confirm.
+ */
+struct sub_index_layout {
+ u64 nonce;
+ struct layout_region sub_index;
+ struct layout_region volume;
+ struct index_save_layout *saves;
+};
+
+/*
+ * The index configuration data kept with the super block. The magic label and nonce info are
+ * fixed-size byte buffers of MAGIC_SIZE and NONCE_INFO_SIZE bytes.
+ */
+struct super_block_data {
+ u8 magic_label[MAGIC_SIZE];
+ u8 nonce_info[NONCE_INFO_SIZE];
+ u64 nonce;
+ u32 version;
+ u32 block_size;
+ u16 index_count;
+ u16 max_saves;
+ /* Padding reflects a blank field on permanent storage */
+ u8 padding[4];
+ u64 open_chapter_blocks;
+ u64 page_map_blocks;
+ u64 volume_offset;
+ u64 start_offset;
+};
+
+/*
+ * The complete layout. The region members correspond to the top-level diagram in the layout
+ * notes: a header, a config region, the index, and a seal.
+ */
+struct index_layout {
+ struct io_factory *factory;
+ size_t factory_size;
+ off_t offset;
+ struct super_block_data super;
+ struct layout_region header;
+ struct layout_region config;
+ struct sub_index_layout index;
+ struct layout_region seal;
+ u64 total_blocks;
+};
+
+/* The region sizes computed by compute_sizes(). All "_blocks" fields count block_size blocks. */
+struct save_layout_sizes {
+ /* Set to MAX_SAVES */
+ unsigned int save_count;
+ /* Set to UDS_BLOCK_SIZE */
+ size_t block_size;
+ u64 volume_blocks;
+ u64 volume_index_blocks;
+ u64 page_map_blocks;
+ u64 open_chapter_blocks;
+ /* One block plus the volume index, page map, and open chapter blocks */
+ u64 save_blocks;
+ /* The volume blocks plus save_count saves */
+ u64 sub_index_blocks;
+ /* Three blocks plus sub_index_blocks */
+ u64 total_blocks;
+ /* total_blocks expressed in bytes */
+ size_t total_size;
+};
+
+/*
+ * Report whether the super block is version 7, the version used for volumes that have been
+ * reduced by one chapter (see the super block version notes in this file).
+ */
+static inline bool is_converted_super_block(struct super_block_data *super)
+{
+ return super->version == 7;
+}
+
+/*
+ * Compute the sizes of all the layout regions for a configuration and store them in *sls.
+ *
+ * Returns UDS_SUCCESS, or the logged error result if the volume index save size cannot be
+ * computed. On failure *sls is only partially filled in.
+ */
+static int __must_check compute_sizes(const struct uds_configuration *config,
+				      struct save_layout_sizes *sls)
+{
+	struct index_geometry *geometry = config->geometry;
+	int status;
+
+	memset(sls, 0, sizeof(*sls));
+	sls->save_count = MAX_SAVES;
+	sls->block_size = UDS_BLOCK_SIZE;
+	sls->volume_blocks = geometry->bytes_per_volume / sls->block_size;
+
+	status = uds_compute_volume_index_save_blocks(config, sls->block_size,
+						      &sls->volume_index_blocks);
+	if (status != UDS_SUCCESS)
+		return vdo_log_error_strerror(status, "cannot compute index save size");
+
+	sls->page_map_blocks = DIV_ROUND_UP(uds_compute_index_page_map_save_size(geometry),
+					    sls->block_size);
+	sls->open_chapter_blocks = DIV_ROUND_UP(uds_compute_saved_open_chapter_size(geometry),
+						sls->block_size);
+
+	/* Each save needs one extra block in addition to its three component regions. */
+	sls->save_blocks = 1 + (sls->volume_index_blocks + sls->page_map_blocks +
+				sls->open_chapter_blocks);
+	sls->sub_index_blocks = sls->volume_blocks + (sls->save_count * sls->save_blocks);
+
+	/* The whole layout needs three more blocks than the sub-index. */
+	sls->total_blocks = 3 + sls->sub_index_blocks;
+	sls->total_size = sls->total_blocks * sls->block_size;
+
+	return UDS_SUCCESS;
+}
+
+/*
+ * Compute the total size in bytes of an index built from @parameters and store it in
+ * @index_size. Returns UDS_SUCCESS, -EINVAL if @index_size is NULL, or an errno converted from
+ * the failing UDS status.
+ */
+int uds_compute_index_size(const struct uds_parameters *parameters, u64 *index_size)
+{
+	struct save_layout_sizes sizes;
+	struct uds_configuration *index_config;
+	int result;
+
+	if (index_size == NULL) {
+		vdo_log_error("Missing output size pointer");
+		return -EINVAL;
+	}
+
+	result = uds_make_configuration(parameters, &index_config);
+	if (result != UDS_SUCCESS) {
+		vdo_log_error_strerror(result, "cannot compute index size");
+		return uds_status_to_errno(result);
+	}
+
+	/* The configuration is only needed for the size computation itself. */
+	result = compute_sizes(index_config, &sizes);
+	uds_free_configuration(index_config);
+	if (result == UDS_SUCCESS) {
+		*index_size = sizes.total_size;
+		return UDS_SUCCESS;
+	}
+
+	return uds_status_to_errno(result);
+}
+
+/*
+ * Fill @buffer (NONCE_INFO_SIZE bytes) with unique data: the current time, then a pseudorandom
+ * number, then repeated copies of what has already been written.
+ */
+static void create_unique_nonce_data(u8 *buffer)
+{
+	ktime_t now = current_time_ns(CLOCK_REALTIME);
+	u32 rand;
+	size_t filled;
+
+	get_random_bytes(&rand, sizeof(rand));
+	memcpy(buffer, &now, sizeof(now));
+	memcpy(buffer + sizeof(now), &rand, sizeof(rand));
+	filled = sizeof(now) + sizeof(rand);
+
+	/* Each pass copies the filled prefix, capped at the space remaining. */
+	while (filled < NONCE_INFO_SIZE) {
+		size_t len = min(NONCE_INFO_SIZE - filled, filled);
+
+		memcpy(buffer + filled, buffer, len);
+		filled += len;
+	}
+}
+
+/*
+ * Hash @len bytes of @data with a 32-bit seed folded from @start, and return eight bytes of the
+ * 128-bit digest (starting at byte 4) as a little-endian value.
+ */
+static u64 hash_stuff(u64 start, const void *data, size_t len)
+{
+	u8 digest[16];
+	u32 seed = start ^ (start >> 27);
+
+	murmurhash3_128(data, len, seed, digest);
+	return get_unaligned_le64(&digest[4]);
+}
+
+/* Produce a primary nonce by hashing the provided data from a fixed starting value. */
+static u64 generate_primary_nonce(const void *data, size_t len)
+{
+	const u64 primary_start = 0xa1b1e0fc;
+
+	return hash_stuff(primary_start, data, len);
+}
+
+/*
+ * Derive a secondary nonce deterministically from an existing nonce and some arbitrary data: the
+ * existing nonce (plus one) is the starting value for a hash of the data, and the hash result is
+ * the new nonce.
+ */
+static u64 generate_secondary_nonce(u64 nonce, const void *data, size_t len)
+{
+	u64 start = nonce + 1;
+
+	return hash_stuff(start, data, len);
+}
+
+/*
+ * Create a buffered reader covering the blocks of @lr, with the region's starting block shifted
+ * by @offset. The new reader is returned through @reader_ptr.
+ */
+static int __must_check open_layout_reader(struct index_layout *layout,
+					   struct layout_region *lr, off_t offset,
+					   struct buffered_reader **reader_ptr)
+{
+	struct io_factory *factory = layout->factory;
+
+	return uds_make_buffered_reader(factory, lr->start_block + offset, lr->block_count,
+					reader_ptr);
+}
+
+/* Open a reader on @region, shifting its position back by the superblock's start offset. */
+static int open_region_reader(struct index_layout *layout, struct layout_region *region,
+			      struct buffered_reader **reader_ptr)
+{
+	off_t offset = -layout->super.start_offset;
+
+	return open_layout_reader(layout, region, offset, reader_ptr);
+}
+
+/*
+ * Create a buffered writer covering the blocks of @lr, with the region's starting block shifted
+ * by @offset. The new writer is returned through @writer_ptr.
+ */
+static int __must_check open_layout_writer(struct index_layout *layout,
+					   struct layout_region *lr, off_t offset,
+					   struct buffered_writer **writer_ptr)
+{
+	struct io_factory *factory = layout->factory;
+
+	return uds_make_buffered_writer(factory, lr->start_block + offset, lr->block_count,
+					writer_ptr);
+}
+
+/* Open a writer on @region, shifting its position back by the superblock's start offset. */
+static int open_region_writer(struct index_layout *layout, struct layout_region *region,
+			      struct buffered_writer **writer_ptr)
+{
+	off_t offset = -layout->super.start_offset;
+
+	return open_layout_writer(layout, region, offset, writer_ptr);
+}
+
+/*
+ * Initialize @super for a brand new layout: the magic label, fresh nonce data with its primary
+ * nonce, the current version, and the sizes taken from @sls. Both offsets are zero.
+ */
+static void generate_super_block_data(struct save_layout_sizes *sls,
+				      struct super_block_data *super)
+{
+	memset(super, 0, sizeof(*super));
+
+	/* Identity: magic label, unique nonce data, and the nonce derived from that data. */
+	memcpy(super->magic_label, LAYOUT_MAGIC, MAGIC_SIZE);
+	create_unique_nonce_data(super->nonce_info);
+	super->nonce = generate_primary_nonce(super->nonce_info, sizeof(super->nonce_info));
+
+	/* Format description. */
+	super->version = SUPER_VERSION_CURRENT;
+	super->index_count = 1;
+	super->block_size = sls->block_size;
+	super->max_saves = sls->save_count;
+
+	/* Region sizes. */
+	super->page_map_blocks = sls->page_map_blocks;
+	super->open_chapter_blocks = sls->open_chapter_blocks;
+
+	/* A newly created layout has no offsets. */
+	super->start_offset = 0;
+	super->volume_offset = 0;
+}
+
+/*
+ * Compute the sub-index nonce as a secondary nonce of the superblock nonce, hashed over the
+ * sub-index starting block and an index id of zero.
+ */
+static void define_sub_index_nonce(struct index_layout *layout)
+{
+	struct sub_index_nonce_data {
+		u64 offset;
+		u16 index_id;
+	};
+	u8 encoded[sizeof(struct sub_index_nonce_data)] = { 0 };
+	struct sub_index_layout *sil = &layout->index;
+	size_t used = 0;
+	u64 nonce;
+
+	encode_u64_le(encoded, &used, sil->sub_index.start_block);
+	encode_u16_le(encoded, &used, 0);
+
+	/* The whole zero-initialized buffer is hashed, not just the encoded bytes. */
+	nonce = generate_secondary_nonce(layout->super.nonce, encoded, sizeof(encoded));
+	if (nonce == 0) {
+		/* Avoid a zero nonce by hashing again from a different starting value. */
+		nonce = generate_secondary_nonce(~layout->super.nonce + 1, encoded,
+						 sizeof(encoded));
+	}
+
+	sil->nonce = nonce;
+}
+
+/*
+ * Lay out the sub-index beginning at @start_block: the enclosing index region, the volume region
+ * at the same starting block, and the save slots packed immediately after the volume. The
+ * sub-index nonce is defined once the regions are in place.
+ */
+static void setup_sub_index(struct index_layout *layout, u64 start_block,
+			    struct save_layout_sizes *sls)
+{
+	struct sub_index_layout *sil = &layout->index;
+	u64 save_start = start_block + sls->volume_blocks;
+	unsigned int slot;
+
+	sil->sub_index = (struct layout_region) {
+		.start_block = start_block,
+		.block_count = sls->sub_index_blocks,
+		.kind = RL_KIND_INDEX,
+		.instance = 0,
+	};
+
+	sil->volume = (struct layout_region) {
+		.start_block = start_block,
+		.block_count = sls->volume_blocks,
+		.kind = RL_KIND_VOLUME,
+		.instance = RL_SOLE_INSTANCE,
+	};
+
+	for (slot = 0; slot < sls->save_count; slot++, save_start += sls->save_blocks) {
+		sil->saves[slot].index_save = (struct layout_region) {
+			.start_block = save_start,
+			.block_count = sls->save_blocks,
+			.kind = RL_KIND_SAVE,
+			.instance = slot,
+		};
+	}
+
+	define_sub_index_nonce(layout);
+}
+
+/*
+ * Set up a new layout from @sls: generate the superblock data, then place the header block, the
+ * config block, the sub-index, and finally the seal block in consecutive positions.
+ */
+static void initialize_layout(struct index_layout *layout, struct save_layout_sizes *sls)
+{
+	u64 header_block = layout->offset / sls->block_size;
+	u64 config_block = header_block + 1;
+	u64 sub_index_start = config_block + 1;
+	u64 seal_block = sub_index_start + sls->sub_index_blocks;
+
+	layout->total_blocks = sls->total_blocks;
+
+	/* The superblock nonce must exist before the sub-index nonce is derived from it. */
+	generate_super_block_data(sls, &layout->super);
+
+	layout->header = (struct layout_region) {
+		.start_block = header_block,
+		.block_count = 1,
+		.kind = RL_KIND_HEADER,
+		.instance = RL_SOLE_INSTANCE,
+	};
+
+	layout->config = (struct layout_region) {
+		.start_block = config_block,
+		.block_count = 1,
+		.kind = RL_KIND_CONFIG,
+		.instance = RL_SOLE_INSTANCE,
+	};
+
+	setup_sub_index(layout, sub_index_start, sls);
+
+	layout->seal = (struct layout_region) {
+		.start_block = seal_block,
+		.block_count = 1,
+		.kind = RL_KIND_SEAL,
+		.instance = RL_SOLE_INSTANCE,
+	};
+}
+
+/*
+ * Allocate and fill the region table describing one index save slot. A slot with zones lists the
+ * header, page map, each volume index zone, the open chapter, and any free space; a slot without
+ * zones is recorded as unsaved. The table is returned through @table_ptr.
+ */
+static int __must_check make_index_save_region_table(struct index_save_layout *isl,
+						     struct region_table **table_ptr)
+{
+	const bool has_zones = (isl->zone_count > 0);
+	const bool has_free_space = (isl->free_space.block_count > 0);
+	struct region_table *table;
+	unsigned int next = 0;
+	unsigned int z;
+	u16 region_count;
+	size_t payload;
+	size_t type;
+	int result;
+
+	if (has_zones) {
+		/*
+		 * Normal save regions: header, page map, volume index zones,
+		 * open chapter, and possibly free space.
+		 */
+		region_count = 3 + isl->zone_count;
+		if (has_free_space)
+			region_count++;
+
+		payload = sizeof(isl->save_data) + sizeof(isl->state_data);
+		type = RH_TYPE_SAVE;
+	} else {
+		/* Empty save regions: header, page map, free space. */
+		region_count = 3;
+		payload = sizeof(isl->save_data);
+		type = RH_TYPE_UNSAVED;
+	}
+
+	result = vdo_allocate_extended(struct region_table, region_count,
+				       struct layout_region,
+				       "layout region table for ISL", &table);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	table->regions[next++] = isl->header;
+	table->regions[next++] = isl->index_page_map;
+	for (z = 0; z < isl->zone_count; z++)
+		table->regions[next++] = isl->volume_index_zones[z];
+
+	if (has_zones)
+		table->regions[next++] = isl->open_chapter;
+
+	if (has_free_space)
+		table->regions[next++] = isl->free_space;
+
+	table->header = (struct region_header) {
+		.magic = REGION_MAGIC,
+		.region_blocks = isl->index_save.block_count,
+		.type = type,
+		.version = 1,
+		.region_count = region_count,
+		.payload = payload,
+	};
+
+	table->encoded_size = (sizeof(struct region_header) + payload +
+			       region_count * sizeof(struct layout_region));
+	*table_ptr = table;
+	return UDS_SUCCESS;
+}
+
+/*
+ * Encode a region table into @buffer at *@offset, advancing *@offset: the header fields first,
+ * then one record per region. Each region record includes a 32-bit zero field between the block
+ * count and the kind.
+ */
+static void encode_region_table(u8 *buffer, size_t *offset, struct region_table *table)
+{
+	const struct region_header *header = &table->header;
+	unsigned int i;
+
+	encode_u64_le(buffer, offset, REGION_MAGIC);
+	encode_u64_le(buffer, offset, header->region_blocks);
+	encode_u16_le(buffer, offset, header->type);
+	encode_u16_le(buffer, offset, header->version);
+	encode_u16_le(buffer, offset, header->region_count);
+	encode_u16_le(buffer, offset, header->payload);
+
+	for (i = 0; i < header->region_count; i++) {
+		const struct layout_region *region = &table->regions[i];
+
+		encode_u64_le(buffer, offset, region->start_block);
+		encode_u64_le(buffer, offset, region->block_count);
+		encode_u32_le(buffer, offset, 0);
+		encode_u16_le(buffer, offset, region->kind);
+		encode_u16_le(buffer, offset, region->instance);
+	}
+}
+
+/*
+ * Encode the region table and save data for an index save slot, write them through @writer, and
+ * flush the writer. The index state data is only included when the save has zones.
+ */
+static int __must_check write_index_save_header(struct index_save_layout *isl,
+ struct region_table *table,
+ struct buffered_writer *writer)
+{
+ int result;
+ u8 *buffer;
+ size_t offset = 0;
+
+ result = vdo_allocate(table->encoded_size, u8, "index save data", &buffer);
+ if (result != VDO_SUCCESS)
+ return result;
+
+ encode_region_table(buffer, &offset, table);
+ /* The save data: timestamp, nonce, version, and a 32-bit zero field. */
+ encode_u64_le(buffer, &offset, isl->save_data.timestamp);
+ encode_u64_le(buffer, &offset, isl->save_data.nonce);
+ encode_u32_le(buffer, &offset, isl->save_data.version);
+ encode_u32_le(buffer, &offset, 0);
+ if (isl->zone_count > 0) {
+ /* The state data: version signature and id, chapter numbers, and a 64-bit zero field. */
+ encode_u32_le(buffer, &offset, INDEX_STATE_VERSION_301.signature);
+ encode_u32_le(buffer, &offset, INDEX_STATE_VERSION_301.version_id);
+ encode_u64_le(buffer, &offset, isl->state_data.newest_chapter);
+ encode_u64_le(buffer, &offset, isl->state_data.oldest_chapter);
+ encode_u64_le(buffer, &offset, isl->state_data.last_save);
+ encode_u64_le(buffer, &offset, 0);
+ }
+
+ /* Only the bytes actually encoded are written. */
+ result = uds_write_to_buffered_writer(writer, buffer, offset);
+ vdo_free(buffer);
+ if (result != UDS_SUCCESS)
+ return result;
+
+ return uds_flush_buffered_writer(writer);
+}
+
+/*
+ * Write the header of an index save slot: build its region table, open a writer on the slot's
+ * header region, and write the encoded header. The table and writer are released before
+ * returning.
+ */
+static int write_index_save_layout(struct index_layout *layout,
+				   struct index_save_layout *isl)
+{
+	struct buffered_writer *writer;
+	struct region_table *table;
+	int result;
+
+	result = make_index_save_region_table(isl, &table);
+	if (result != UDS_SUCCESS)
+		return result;
+
+	result = open_region_writer(layout, &isl->header, &writer);
+	if (result == UDS_SUCCESS) {
+		result = write_index_save_header(isl, table, writer);
+		uds_free_buffered_writer(writer);
+	}
+
+	vdo_free(table);
+	return result;
+}
+
+/*
+ * Return a save slot to its empty state: no zones, cleared save data, a one-block header, a page
+ * map of @page_map_blocks blocks, and the rest of the slot marked as free space.
+ */
+static void reset_index_save_layout(struct index_save_layout *isl, u64 page_map_blocks)
+{
+	u64 header_block = isl->index_save.start_block;
+	u64 page_map_start = header_block + 1;
+	u64 free_start = page_map_start + page_map_blocks;
+	u64 free_blocks = isl->index_save.block_count - page_map_blocks - 1;
+
+	isl->zone_count = 0;
+	memset(&isl->save_data, 0, sizeof(isl->save_data));
+
+	isl->header = (struct layout_region) {
+		.start_block = header_block,
+		.block_count = 1,
+		.kind = RL_KIND_HEADER,
+		.instance = RL_SOLE_INSTANCE,
+	};
+
+	isl->index_page_map = (struct layout_region) {
+		.start_block = page_map_start,
+		.block_count = page_map_blocks,
+		.kind = RL_KIND_INDEX_PAGE_MAP,
+		.instance = RL_SOLE_INSTANCE,
+	};
+
+	isl->free_space = (struct layout_region) {
+		.start_block = free_start,
+		.block_count = free_blocks,
+		.kind = RL_KIND_EMPTY,
+		.instance = RL_SOLE_INSTANCE,
+	};
+}
+
+/* Reset a save slot to the empty state in memory, then write that state to storage. */
+static int __must_check invalidate_old_save(struct index_layout *layout,
+					    struct index_save_layout *isl)
+{
+	u64 page_map_blocks = layout->super.page_map_blocks;
+
+	reset_index_save_layout(isl, page_map_blocks);
+	return write_index_save_layout(layout, isl);
+}
+
+/*
+ * Invalidate every index save slot in the layout. Every slot is attempted even if an earlier one
+ * fails. Returns UDS_SUCCESS only if all slots were invalidated; otherwise logs and returns the
+ * most recent failure.
+ */
+static int discard_index_state_data(struct index_layout *layout)
+{
+	int result;
+	int saved_result = UDS_SUCCESS;
+	unsigned int i;
+
+	for (i = 0; i < layout->super.max_saves; i++) {
+		result = invalidate_old_save(layout, &layout->index.saves[i]);
+		if (result != UDS_SUCCESS)
+			saved_result = result;
+	}
+
+	if (saved_result != UDS_SUCCESS) {
+		/*
+		 * Report saved_result, not result: result only holds the status of the final
+		 * slot, which may be UDS_SUCCESS even though an earlier slot failed.
+		 */
+		return vdo_log_error_strerror(saved_result,
+					      "%s: cannot destroy all index saves",
+					      __func__);
+	}
+
+	return UDS_SUCCESS;
+}
+
+/*
+ * Allocate and fill the region table for the whole layout: header, config, index, volume, every
+ * save slot, and the seal. The payload size includes the two offset fields only for a converted
+ * superblock. The table is returned through @table_ptr.
+ */
+static int __must_check make_layout_region_table(struct index_layout *layout,
+						 struct region_table **table_ptr)
+{
+	/* Regions: header, config, index, volume, saves, seal */
+	u16 region_count = 5 + layout->super.max_saves;
+	struct region_table *table;
+	unsigned int next = 0;
+	unsigned int slot;
+	u16 payload;
+	int result;
+
+	result = vdo_allocate_extended(struct region_table, region_count,
+				       struct layout_region, "layout region table",
+				       &table);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	table->regions[next++] = layout->header;
+	table->regions[next++] = layout->config;
+	table->regions[next++] = layout->index.sub_index;
+	table->regions[next++] = layout->index.volume;
+
+	for (slot = 0; slot < layout->super.max_saves; slot++)
+		table->regions[next++] = layout->index.saves[slot].index_save;
+
+	table->regions[next++] = layout->seal;
+
+	payload = sizeof(struct super_block_data);
+	if (!is_converted_super_block(&layout->super)) {
+		/* Only converted superblocks store the two offsets. */
+		payload = (sizeof(struct super_block_data) -
+			   sizeof(layout->super.volume_offset) -
+			   sizeof(layout->super.start_offset));
+	}
+
+	table->header = (struct region_header) {
+		.magic = REGION_MAGIC,
+		.region_blocks = layout->total_blocks,
+		.type = RH_TYPE_SUPER,
+		.version = 1,
+		.region_count = region_count,
+		.payload = payload,
+	};
+
+	table->encoded_size = (sizeof(struct region_header) + payload +
+			       region_count * sizeof(struct layout_region));
+	*table_ptr = table;
+	return UDS_SUCCESS;
+}
+
+/*
+ * Encode the layout region table followed by the superblock data, write them through @writer,
+ * and flush the writer. The volume and start offsets are only written for a converted
+ * superblock.
+ */
+static int __must_check write_layout_header(struct index_layout *layout,
+ struct region_table *table,
+ struct buffered_writer *writer)
+{
+ int result;
+ u8 *buffer;
+ size_t offset = 0;
+
+ result = vdo_allocate(table->encoded_size, u8, "layout data", &buffer);
+ if (result != VDO_SUCCESS)
+ return result;
+
+ encode_region_table(buffer, &offset, table);
+ /* The magic label and nonce info are raw byte arrays, copied as-is. */
+ memcpy(buffer + offset, &layout->super.magic_label, MAGIC_SIZE);
+ offset += MAGIC_SIZE;
+ memcpy(buffer + offset, &layout->super.nonce_info, NONCE_INFO_SIZE);
+ offset += NONCE_INFO_SIZE;
+ encode_u64_le(buffer, &offset, layout->super.nonce);
+ encode_u32_le(buffer, &offset, layout->super.version);
+ encode_u32_le(buffer, &offset, layout->super.block_size);
+ encode_u16_le(buffer, &offset, layout->super.index_count);
+ encode_u16_le(buffer, &offset, layout->super.max_saves);
+ /* A 32-bit zero is written in place of the padding field. */
+ encode_u32_le(buffer, &offset, 0);
+ encode_u64_le(buffer, &offset, layout->super.open_chapter_blocks);
+ encode_u64_le(buffer, &offset, layout->super.page_map_blocks);
+
+ if (is_converted_super_block(&layout->super)) {
+ encode_u64_le(buffer, &offset, layout->super.volume_offset);
+ encode_u64_le(buffer, &offset, layout->super.start_offset);
+ }
+
+ /* Only the bytes actually encoded are written. */
+ result = uds_write_to_buffered_writer(writer, buffer, offset);
+ vdo_free(buffer);
+ if (result != UDS_SUCCESS)
+ return result;
+
+ return uds_flush_buffered_writer(writer);
+}
+
+/*
+ * Write @config into the layout's config region, shifted by @offset, and flush it. Each failure
+ * is logged with a message naming the step that failed; the writer is always released.
+ */
+static int __must_check write_uds_index_config(struct index_layout *layout,
+					       struct uds_configuration *config,
+					       off_t offset)
+{
+	struct buffered_writer *writer = NULL;
+	int result;
+
+	result = open_layout_writer(layout, &layout->config, offset, &writer);
+	if (result != UDS_SUCCESS)
+		return vdo_log_error_strerror(result, "failed to open config region");
+
+	result = uds_write_config_contents(writer, config, layout->super.version);
+	if (result != UDS_SUCCESS) {
+		uds_free_buffered_writer(writer);
+		return vdo_log_error_strerror(result, "failed to write config region");
+	}
+
+	/* The writer is finished with after the flush, whether or not the flush succeeds. */
+	result = uds_flush_buffered_writer(writer);
+	uds_free_buffered_writer(writer);
+	if (result != UDS_SUCCESS)
+		return vdo_log_error_strerror(result, "cannot flush config writer");
+
+	return UDS_SUCCESS;
+}
+
+/*
+ * Write the layout header: build the layout region table, open a writer on the header region
+ * shifted by @offset, and write the encoded header. The table and writer are released before
+ * returning.
+ */
+static int __must_check save_layout(struct index_layout *layout, off_t offset)
+{
+	struct buffered_writer *writer = NULL;
+	struct region_table *table;
+	int result;
+
+	result = make_layout_region_table(layout, &table);
+	if (result != UDS_SUCCESS)
+		return result;
+
+	result = open_layout_writer(layout, &layout->header, offset, &writer);
+	if (result == UDS_SUCCESS) {
+		result = write_layout_header(layout, table, writer);
+		uds_free_buffered_writer(writer);
+	}
+
+	vdo_free(table);
+	return result;
+}
+
+/*
+ * Create a new layout for @config: compute its sizes, allocate the save slots, initialize the
+ * regions, then write out the empty save slots, the configuration, and the layout header in that
+ * order. The first failure stops the sequence and is returned.
+ */
+static int create_index_layout(struct index_layout *layout, struct uds_configuration *config)
+{
+	struct save_layout_sizes sizes;
+	int result;
+
+	result = compute_sizes(config, &sizes);
+	if (result != UDS_SUCCESS)
+		return result;
+
+	result = vdo_allocate(sizes.save_count, struct index_save_layout, __func__,
+			      &layout->index.saves);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	initialize_layout(layout, &sizes);
+
+	result = discard_index_state_data(layout);
+	if (result == UDS_SUCCESS)
+		result = write_uds_index_config(layout, config, 0);
+
+	if (result == UDS_SUCCESS)
+		result = save_layout(layout, 0);
+
+	return result;
+}
+
+/*
+ * Compute the nonce for an index save as a secondary nonce of @volume_nonce, hashed over the
+ * save's timestamp and version and the starting block of the save region.
+ */
+static u64 generate_index_save_nonce(u64 volume_nonce, struct index_save_layout *isl)
+{
+	struct save_nonce_data {
+		struct index_save_data data;
+		u64 offset;
+	} nonce_data;
+	u8 encoded[sizeof(nonce_data)];
+	size_t used = 0;
+
+	encode_u64_le(encoded, &used, isl->save_data.timestamp);
+	/* A zero is hashed rather than the stored nonce value. */
+	encode_u64_le(encoded, &used, 0);
+	encode_u32_le(encoded, &used, isl->save_data.version);
+	encode_u32_le(encoded, &used, 0U);
+	encode_u64_le(encoded, &used, isl->index_save.start_block);
+	VDO_ASSERT_LOG_ONLY(used == sizeof(nonce_data),
+			    "%zu bytes encoded of %zu expected",
+			    used, sizeof(nonce_data));
+
+	return generate_secondary_nonce(volume_nonce, encoded, sizeof(encoded));
+}
+
+/*
+ * Check whether a save slot holds a valid save. Returns the save's timestamp if it has zones, a
+ * non-zero timestamp, and a nonce matching the one regenerated from @volume_nonce; returns 0
+ * otherwise.
+ */
+static u64 validate_index_save_layout(struct index_save_layout *isl, u64 volume_nonce)
+{
+	if (isl->zone_count == 0)
+		return 0;
+
+	if (isl->save_data.timestamp == 0)
+		return 0;
+
+	if (generate_index_save_nonce(volume_nonce, isl) != isl->save_data.nonce)
+		return 0;
+
+	return isl->save_data.timestamp;
+}
+
+/*
+ * Find the valid save slot with the greatest timestamp and return it through @isl_ptr. Returns
+ * UDS_INDEX_NOT_SAVED_CLEANLY (after logging) if no slot holds a valid save.
+ */
+static int find_latest_uds_index_save_slot(struct index_layout *layout,
+					   struct index_save_layout **isl_ptr)
+{
+	struct index_save_layout *latest = NULL;
+	u64 latest_time = 0;
+	unsigned int slot;
+
+	for (slot = 0; slot < layout->super.max_saves; slot++) {
+		struct index_save_layout *candidate = &layout->index.saves[slot];
+		u64 save_time = validate_index_save_layout(candidate, layout->index.nonce);
+
+		/* Invalid slots report a time of zero and so are never selected. */
+		if (save_time <= latest_time)
+			continue;
+
+		latest = candidate;
+		latest_time = save_time;
+	}
+
+	if (latest != NULL) {
+		*isl_ptr = latest;
+		return UDS_SUCCESS;
+	}
+
+	vdo_log_error("No valid index save found");
+	return UDS_INDEX_NOT_SAVED_CLEANLY;
+}
+
+/*
+ * Discard the saved open chapter of the most recent valid save by writing UDS_BLOCK_SIZE bytes
+ * from a NULL source at the start of its open chapter region and flushing the writer.
+ */
+int uds_discard_open_chapter(struct index_layout *layout)
+{
+	struct buffered_writer *writer;
+	struct index_save_layout *isl;
+	int result;
+
+	result = find_latest_uds_index_save_slot(layout, &isl);
+	if (result != UDS_SUCCESS)
+		return result;
+
+	result = open_region_writer(layout, &isl->open_chapter, &writer);
+	if (result != UDS_SUCCESS)
+		return result;
+
+	result = uds_write_to_buffered_writer(writer, NULL, UDS_BLOCK_SIZE);
+	if (result == UDS_SUCCESS)
+		result = uds_flush_buffered_writer(writer);
+
+	uds_free_buffered_writer(writer);
+	return result;
+}
+
+/*
+ * Load the index state from the most recent valid save slot: the chapter counters, then the open
+ * chapter, then the volume index zones, and finally the index page map. Returns the first error
+ * encountered, or the result of reading the index page map.
+ */
+int uds_load_index_state(struct index_layout *layout, struct uds_index *index)
+{
+ int result;
+ unsigned int zone;
+ struct index_save_layout *isl;
+ struct buffered_reader *readers[MAX_ZONES];
+
+ result = find_latest_uds_index_save_slot(layout, &isl);
+ if (result != UDS_SUCCESS)
+ return result;
+
+ index->newest_virtual_chapter = isl->state_data.newest_chapter;
+ index->oldest_virtual_chapter = isl->state_data.oldest_chapter;
+ index->last_save = isl->state_data.last_save;
+
+ /* readers[0] is reused for each single-region load. */
+ result = open_region_reader(layout, &isl->open_chapter, &readers[0]);
+ if (result != UDS_SUCCESS)
+ return result;
+
+ result = uds_load_open_chapter(index, readers[0]);
+ uds_free_buffered_reader(readers[0]);
+ if (result != UDS_SUCCESS)
+ return result;
+
+ /* The volume index is loaded from one reader per saved zone. */
+ for (zone = 0; zone < isl->zone_count; zone++) {
+ result = open_region_reader(layout, &isl->volume_index_zones[zone],
+ &readers[zone]);
+ if (result != UDS_SUCCESS) {
+ /* Release the readers already opened for earlier zones. */
+ for (; zone > 0; zone--)
+ uds_free_buffered_reader(readers[zone - 1]);
+
+ return result;
+ }
+ }
+
+ result = uds_load_volume_index(index->volume_index, readers, isl->zone_count);
+ for (zone = 0; zone < isl->zone_count; zone++)
+ uds_free_buffered_reader(readers[zone]);
+ if (result != UDS_SUCCESS)
+ return result;
+
+ result = open_region_reader(layout, &isl->index_page_map, &readers[0]);
+ if (result != UDS_SUCCESS)
+ return result;
+
+ result = uds_read_index_page_map(index->volume->index_page_map, readers[0]);
+ uds_free_buffered_reader(readers[0]);
+
+ return result;
+}
+
+/*
+ * Choose the save slot with the smallest validation time. Invalid slots validate to a time of
+ * zero, so they are chosen ahead of any valid save. Ties keep the earliest slot.
+ */
+static struct index_save_layout *select_oldest_index_save_layout(struct index_layout *layout)
+{
+	struct index_save_layout *oldest = NULL;
+	u64 oldest_time = 0;
+	unsigned int slot;
+
+	for (slot = 0; slot < layout->super.max_saves; slot++) {
+		struct index_save_layout *candidate = &layout->index.saves[slot];
+		u64 save_time = validate_index_save_layout(candidate, layout->index.nonce);
+
+		if ((oldest != NULL) && (save_time >= oldest_time))
+			continue;
+
+		oldest = candidate;
+		oldest_time = save_time;
+	}
+
+	return oldest;
+}
+
+/*
+ * Prepare a save slot for a new save with @zone_count zones: stamp the save data with the
+ * current time, version 1, and a freshly generated nonce, then divide the slot into a header
+ * block, the page map, one volume index region per zone, the open chapter, and free space.
+ */
+static void instantiate_index_save_layout(struct index_save_layout *isl,
+ struct super_block_data *super,
+ u64 volume_nonce, unsigned int zone_count)
+{
+ unsigned int z;
+ u64 next_block;
+ u64 free_blocks;
+ u64 volume_index_blocks;
+
+ isl->zone_count = zone_count;
+ memset(&isl->save_data, 0, sizeof(isl->save_data));
+ isl->save_data.timestamp = ktime_to_ms(current_time_ns(CLOCK_REALTIME));
+ isl->save_data.version = 1;
+ /* The nonce is computed from the timestamp and version, so they must be set first. */
+ isl->save_data.nonce = generate_index_save_nonce(volume_nonce, isl);
+
+ next_block = isl->index_save.start_block;
+ isl->header = (struct layout_region) {
+ .start_block = next_block++,
+ .block_count = 1,
+ .kind = RL_KIND_HEADER,
+ .instance = RL_SOLE_INSTANCE,
+ };
+
+ isl->index_page_map = (struct layout_region) {
+ .start_block = next_block,
+ .block_count = super->page_map_blocks,
+ .kind = RL_KIND_INDEX_PAGE_MAP,
+ .instance = RL_SOLE_INSTANCE,
+ };
+ next_block += super->page_map_blocks;
+
+ /*
+ * The blocks left after the header, page map, and open chapter are split evenly among
+ * the zones; any remainder from the division ends up as free space.
+ */
+ free_blocks = (isl->index_save.block_count - 1 -
+ super->page_map_blocks -
+ super->open_chapter_blocks);
+ volume_index_blocks = free_blocks / isl->zone_count;
+ for (z = 0; z < isl->zone_count; z++) {
+ isl->volume_index_zones[z] = (struct layout_region) {
+ .start_block = next_block,
+ .block_count = volume_index_blocks,
+ .kind = RL_KIND_VOLUME_INDEX,
+ .instance = z,
+ };
+
+ next_block += volume_index_blocks;
+ free_blocks -= volume_index_blocks;
+ }
+
+ isl->open_chapter = (struct layout_region) {
+ .start_block = next_block,
+ .block_count = super->open_chapter_blocks,
+ .kind = RL_KIND_OPEN_CHAPTER,
+ .instance = RL_SOLE_INSTANCE,
+ };
+
+ next_block += super->open_chapter_blocks;
+
+ isl->free_space = (struct layout_region) {
+ .start_block = next_block,
+ .block_count = free_blocks,
+ .kind = RL_KIND_EMPTY,
+ .instance = RL_SOLE_INSTANCE,
+ };
+}
+
+/*
+ * Pick the oldest save slot, invalidate it on storage, and set it up in memory for a new save
+ * with @zone_count zones. The prepared slot is returned through @isl_ptr.
+ */
+static int setup_uds_index_save_slot(struct index_layout *layout,
+				     unsigned int zone_count,
+				     struct index_save_layout **isl_ptr)
+{
+	struct index_save_layout *isl = select_oldest_index_save_layout(layout);
+	int result = invalidate_old_save(layout, isl);
+
+	if (result != UDS_SUCCESS)
+		return result;
+
+	instantiate_index_save_layout(isl, &layout->super, layout->index.nonce,
+				      zone_count);
+
+	*isl_ptr = isl;
+	return UDS_SUCCESS;
+}
+
+/* Abandon an in-progress save by clearing the slot's zone count, save data, and state data. */
+static void cancel_uds_index_save(struct index_save_layout *isl)
+{
+	isl->zone_count = 0;
+	memset(&isl->state_data, 0, sizeof(isl->state_data));
+	memset(&isl->save_data, 0, sizeof(isl->save_data));
+}
+
+/*
+ * Save the index state into the oldest save slot: the open chapter, then the volume index zones,
+ * then the index page map, and finally the save header. If any step before the header write
+ * fails, the in-memory save is cancelled and the error is returned.
+ */
+int uds_save_index_state(struct index_layout *layout, struct uds_index *index)
+{
+ int result;
+ unsigned int zone;
+ struct index_save_layout *isl;
+ struct buffered_writer *writers[MAX_ZONES];
+
+ result = setup_uds_index_save_slot(layout, index->zone_count, &isl);
+ if (result != UDS_SUCCESS)
+ return result;
+
+ isl->state_data = (struct index_state_data301) {
+ .newest_chapter = index->newest_virtual_chapter,
+ .oldest_chapter = index->oldest_virtual_chapter,
+ .last_save = index->last_save,
+ };
+
+ /* writers[0] is reused for each single-region save. */
+ result = open_region_writer(layout, &isl->open_chapter, &writers[0]);
+ if (result != UDS_SUCCESS) {
+ cancel_uds_index_save(isl);
+ return result;
+ }
+
+ result = uds_save_open_chapter(index, writers[0]);
+ uds_free_buffered_writer(writers[0]);
+ if (result != UDS_SUCCESS) {
+ cancel_uds_index_save(isl);
+ return result;
+ }
+
+ /* The volume index is saved through one writer per zone. */
+ for (zone = 0; zone < index->zone_count; zone++) {
+ result = open_region_writer(layout, &isl->volume_index_zones[zone],
+ &writers[zone]);
+ if (result != UDS_SUCCESS) {
+ /* Release the writers already opened for earlier zones. */
+ for (; zone > 0; zone--)
+ uds_free_buffered_writer(writers[zone - 1]);
+
+ cancel_uds_index_save(isl);
+ return result;
+ }
+ }
+
+ result = uds_save_volume_index(index->volume_index, writers, index->zone_count);
+ for (zone = 0; zone < index->zone_count; zone++)
+ uds_free_buffered_writer(writers[zone]);
+ if (result != UDS_SUCCESS) {
+ cancel_uds_index_save(isl);
+ return result;
+ }
+
+ result = open_region_writer(layout, &isl->index_page_map, &writers[0]);
+ if (result != UDS_SUCCESS) {
+ cancel_uds_index_save(isl);
+ return result;
+ }
+
+ result = uds_write_index_page_map(index->volume->index_page_map, writers[0]);
+ uds_free_buffered_writer(writers[0]);
+ if (result != UDS_SUCCESS) {
+ cancel_uds_index_save(isl);
+ return result;
+ }
+
+ /* The header is written last, after all of the save's contents. */
+ return write_index_save_layout(layout, isl);
+}
+
+/*
+ * Read and decode a region table from @reader. On success the newly allocated table is returned
+ * through @table_ptr. Returns UDS_NO_INDEX if the magic number does not match, and logs an error
+ * for an unsupported table version or a failed read.
+ */
+static int __must_check load_region_table(struct buffered_reader *reader,
+ struct region_table **table_ptr)
+{
+ int result;
+ unsigned int i;
+ struct region_header header;
+ struct region_table *table;
+ u8 buffer[sizeof(struct region_header)];
+ size_t offset = 0;
+
+ result = uds_read_from_buffered_reader(reader, buffer, sizeof(buffer));
+ if (result != UDS_SUCCESS)
+ return vdo_log_error_strerror(result, "cannot read region table header");
+
+ decode_u64_le(buffer, &offset, &header.magic);
+ decode_u64_le(buffer, &offset, &header.region_blocks);
+ decode_u16_le(buffer, &offset, &header.type);
+ decode_u16_le(buffer, &offset, &header.version);
+ decode_u16_le(buffer, &offset, &header.region_count);
+ decode_u16_le(buffer, &offset, &header.payload);
+
+ /* A magic number mismatch is reported without logging. */
+ if (header.magic != REGION_MAGIC)
+ return UDS_NO_INDEX;
+
+ if (header.version != 1) {
+ return vdo_log_error_strerror(UDS_UNSUPPORTED_VERSION,
+ "unknown region table version %hu",
+ header.version);
+ }
+
+ /* The table is sized by the region count read from the header. */
+ result = vdo_allocate_extended(struct region_table, header.region_count,
+ struct layout_region,
+ "single file layout region table", &table);
+ if (result != VDO_SUCCESS)
+ return result;
+
+ table->header = header;
+ for (i = 0; i < header.region_count; i++) {
+ u8 region_buffer[sizeof(struct layout_region)];
+
+ offset = 0;
+ result = uds_read_from_buffered_reader(reader, region_buffer,
+ sizeof(region_buffer));
+ if (result != UDS_SUCCESS) {
+ vdo_free(table);
+ /* Any failure to read a region record is reported as corrupt data. */
+ return vdo_log_error_strerror(UDS_CORRUPT_DATA,
+ "cannot read region table layouts");
+ }
+
+ decode_u64_le(region_buffer, &offset, &table->regions[i].start_block);
+ decode_u64_le(region_buffer, &offset, &table->regions[i].block_count);
+ /* Skip the 32-bit field that encode_region_table() writes as zero. */
+ offset += sizeof(u32);
+ decode_u16_le(region_buffer, &offset, &table->regions[i].kind);
+ decode_u16_le(region_buffer, &offset, &table->regions[i].instance);
+ }
+
+ *table_ptr = table;
+ return UDS_SUCCESS;
+}
+
+/*
+ * Read @saved_size bytes of superblock data from @reader and decode them into layout->super,
+ * then validate the result.
+ *
+ * @saved_size comes from the region table stored on disk, so it is checked against the number of
+ * bytes that will be decoded before any decoding happens; a payload that is too small is
+ * reported as UDS_CORRUPT_DATA instead of being read past the end of the buffer.
+ *
+ * Returns UDS_SUCCESS, an allocation or read error, UDS_CORRUPT_DATA for a short payload, a bad
+ * magic label, inconsistent offsets, a bad sub-index count, or a nonce mismatch, or
+ * UDS_UNSUPPORTED_VERSION for an unknown version number.
+ */
+static int __must_check read_super_block_data(struct buffered_reader *reader,
+					      struct index_layout *layout,
+					      size_t saved_size)
+{
+	int result;
+	struct super_block_data *super = &layout->super;
+	/* Size of a superblock that does not store the two offset fields. */
+	const size_t base_size = (sizeof(struct super_block_data) -
+				  sizeof(super->volume_offset) -
+				  sizeof(super->start_offset));
+	u8 *buffer;
+	size_t offset = 0;
+
+	if (saved_size < base_size) {
+		return vdo_log_error_strerror(UDS_CORRUPT_DATA,
+					      "superblock payload too small (%zu bytes)",
+					      saved_size);
+	}
+
+	result = vdo_allocate(saved_size, u8, "super block data", &buffer);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	result = uds_read_from_buffered_reader(reader, buffer, saved_size);
+	if (result != UDS_SUCCESS) {
+		vdo_free(buffer);
+		return vdo_log_error_strerror(result, "cannot read superblock data");
+	}
+
+	memcpy(&super->magic_label, buffer, MAGIC_SIZE);
+	offset += MAGIC_SIZE;
+	memcpy(&super->nonce_info, buffer + offset, NONCE_INFO_SIZE);
+	offset += NONCE_INFO_SIZE;
+	decode_u64_le(buffer, &offset, &super->nonce);
+	decode_u32_le(buffer, &offset, &super->version);
+	decode_u32_le(buffer, &offset, &super->block_size);
+	decode_u16_le(buffer, &offset, &super->index_count);
+	decode_u16_le(buffer, &offset, &super->max_saves);
+	/* Skip the blank padding field. */
+	offset += sizeof(u32);
+	decode_u64_le(buffer, &offset, &super->open_chapter_blocks);
+	decode_u64_le(buffer, &offset, &super->page_map_blocks);
+
+	if (is_converted_super_block(super)) {
+		/* A converted superblock also stores the offsets; make sure they are present. */
+		if (saved_size < sizeof(struct super_block_data)) {
+			vdo_free(buffer);
+			return vdo_log_error_strerror(UDS_CORRUPT_DATA,
+						      "converted superblock payload too small (%zu bytes)",
+						      saved_size);
+		}
+
+		decode_u64_le(buffer, &offset, &super->volume_offset);
+		decode_u64_le(buffer, &offset, &super->start_offset);
+	} else {
+		super->volume_offset = 0;
+		super->start_offset = 0;
+	}
+
+	vdo_free(buffer);
+
+	if (memcmp(super->magic_label, LAYOUT_MAGIC, MAGIC_SIZE) != 0)
+		return vdo_log_error_strerror(UDS_CORRUPT_DATA,
+					      "unknown superblock magic label");
+
+	if ((super->version < SUPER_VERSION_MINIMUM) ||
+	    (super->version == 4) || (super->version == 5) || (super->version == 6) ||
+	    (super->version > SUPER_VERSION_MAXIMUM)) {
+		return vdo_log_error_strerror(UDS_UNSUPPORTED_VERSION,
+					      "unknown superblock version number %u",
+					      super->version);
+	}
+
+	if (super->volume_offset < super->start_offset) {
+		return vdo_log_error_strerror(UDS_CORRUPT_DATA,
+					      "inconsistent offsets (start %llu, volume %llu)",
+					      (unsigned long long) super->start_offset,
+					      (unsigned long long) super->volume_offset);
+	}
+
+	/* Sub-indexes are no longer used but the layout retains this field. */
+	if (super->index_count != 1) {
+		return vdo_log_error_strerror(UDS_CORRUPT_DATA,
+					      "invalid subindex count %u",
+					      super->index_count);
+	}
+
+	if (generate_primary_nonce(super->nonce_info, sizeof(super->nonce_info)) != super->nonce) {
+		return vdo_log_error_strerror(UDS_CORRUPT_DATA,
+					      "inconsistent superblock nonce");
+	}
+
+	return UDS_SUCCESS;
+}
+
+/*
+ * Check that a region begins at the expected block and has the expected kind and instance. The
+ * first mismatch found is logged and returned as UDS_CORRUPT_DATA.
+ */
+static int __must_check verify_region(struct layout_region *lr, u64 start_block,
+				      enum region_kind kind, unsigned int instance)
+{
+	int result = UDS_SUCCESS;
+
+	if (lr->start_block != start_block) {
+		result = vdo_log_error_strerror(UDS_CORRUPT_DATA,
+						"incorrect layout region offset");
+	} else if (lr->kind != kind) {
+		result = vdo_log_error_strerror(UDS_CORRUPT_DATA,
+						"incorrect layout region kind");
+	} else if (lr->instance != instance) {
+		result = vdo_log_error_strerror(UDS_CORRUPT_DATA,
+						"incorrect layout region instance");
+	}
+
+	return result;
+}
+
+/*
+ * Copy the sub-index regions out of @table (the index at entry 2, the volume at entry 3, and the
+ * save slots from entry 4 onward) and verify that each has the expected position, kind, and
+ * instance. Also defines the sub-index nonce once the index region has been verified.
+ */
+static int __must_check verify_sub_index(struct index_layout *layout, u64 start_block,
+ struct region_table *table)
+{
+ int result;
+ unsigned int i;
+ struct sub_index_layout *sil = &layout->index;
+ u64 next_block = start_block;
+
+ sil->sub_index = table->regions[2];
+ result = verify_region(&sil->sub_index, next_block, RL_KIND_INDEX, 0);
+ if (result != UDS_SUCCESS)
+ return result;
+
+ define_sub_index_nonce(layout);
+
+ /* The volume region is expected at the same starting block as the index region. */
+ sil->volume = table->regions[3];
+ result = verify_region(&sil->volume, next_block, RL_KIND_VOLUME,
+ RL_SOLE_INSTANCE);
+ if (result != UDS_SUCCESS)
+ return result;
+
+ /* The save slots are expected at positions shifted forward by the volume offset. */
+ next_block += sil->volume.block_count + layout->super.volume_offset;
+
+ for (i = 0; i < layout->super.max_saves; i++) {
+ sil->saves[i].index_save = table->regions[i + 4];
+ result = verify_region(&sil->saves[i].index_save, next_block,
+ RL_KIND_SAVE, i);
+ if (result != UDS_SUCCESS)
+ return result;
+
+ next_block += sil->saves[i].index_save.block_count;
+ }
+
+ /* Undo the volume offset shift before comparing against the size of the index region. */
+ next_block -= layout->super.volume_offset;
+ if (next_block != start_block + sil->sub_index.block_count) {
+ return vdo_log_error_strerror(UDS_CORRUPT_DATA,
+ "sub index region does not span all saves");
+ }
+
+ return UDS_SUCCESS;
+}
+
+/*
+ * Rebuild the in-memory layout from a region table loaded from storage, verifying that each
+ * region has the expected position, kind, and instance and that the regions span the recorded
+ * total block count.
+ *
+ * The region count comes from storage, so it is checked first: the table must hold at least the
+ * header, config, index, and volume regions, one region per save slot, and the seal. Without
+ * that check the fixed-index accesses below (and those in verify_sub_index()) could read past
+ * the end of the table.
+ *
+ * Returns UDS_SUCCESS, an allocation error, or UDS_CORRUPT_DATA if the table is inconsistent.
+ */
+static int __must_check reconstitute_layout(struct index_layout *layout,
+					    struct region_table *table, u64 first_block)
+{
+	int result;
+	u64 next_block = first_block;
+	/* Regions: header, config, index, volume, saves, seal */
+	unsigned int required_regions = 5 + (unsigned int) layout->super.max_saves;
+
+	if (table->header.region_count < required_regions) {
+		return vdo_log_error_strerror(UDS_CORRUPT_DATA,
+					      "layout table has %u regions, expected at least %u",
+					      (unsigned int) table->header.region_count,
+					      required_regions);
+	}
+
+	result = vdo_allocate(layout->super.max_saves, struct index_save_layout,
+			      __func__, &layout->index.saves);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	layout->total_blocks = table->header.region_blocks;
+
+	layout->header = table->regions[0];
+	result = verify_region(&layout->header, next_block++, RL_KIND_HEADER,
+			       RL_SOLE_INSTANCE);
+	if (result != UDS_SUCCESS)
+		return result;
+
+	layout->config = table->regions[1];
+	result = verify_region(&layout->config, next_block++, RL_KIND_CONFIG,
+			       RL_SOLE_INSTANCE);
+	if (result != UDS_SUCCESS)
+		return result;
+
+	result = verify_sub_index(layout, next_block, table);
+	if (result != UDS_SUCCESS)
+		return result;
+
+	next_block += layout->index.sub_index.block_count;
+
+	/* The seal is the final region in the table, shifted forward by the volume offset. */
+	layout->seal = table->regions[table->header.region_count - 1];
+	result = verify_region(&layout->seal, next_block + layout->super.volume_offset,
+			       RL_KIND_SEAL, RL_SOLE_INSTANCE);
+	if (result != UDS_SUCCESS)
+		return result;
+
+	if (++next_block != (first_block + layout->total_blocks)) {
+		return vdo_log_error_strerror(UDS_CORRUPT_DATA,
+					      "layout table does not span total blocks");
+	}
+
+	return UDS_SUCCESS;
+}
+
+/*
+ * Read the superblock region table and superblock data, then rebuild the layout from them.
+ *
+ * @layout: the layout to populate
+ * @block_size: the block size the caller expects; it must match the saved block size
+ * @first_block: the block number of the superblock before offset adjustment
+ * @reader: the reader from which the region table and superblock data are read
+ *
+ * Return: UDS_SUCCESS or an error code. The region table is always freed before returning.
+ */
+static int __must_check load_super_block(struct index_layout *layout, size_t block_size,
+					 u64 first_block, struct buffered_reader *reader)
+{
+	struct super_block_data *super = &layout->super;
+	struct region_table *table = NULL;
+	int result;
+
+	result = load_region_table(reader, &table);
+	if (result != UDS_SUCCESS)
+		return result;
+
+	if (table->header.type != RH_TYPE_SUPER) {
+		result = vdo_log_error_strerror(UDS_CORRUPT_DATA,
+						"not a superblock region table");
+		goto free_table;
+	}
+
+	result = read_super_block_data(reader, layout, table->header.payload);
+	if (result != UDS_SUCCESS) {
+		result = vdo_log_error_strerror(result, "unknown superblock format");
+		goto free_table;
+	}
+
+	if (super->block_size != block_size) {
+		result = vdo_log_error_strerror(UDS_CORRUPT_DATA,
+						"superblock saved block_size %u differs from supplied block_size %zu",
+						super->block_size, block_size);
+		goto free_table;
+	}
+
+	/* Adjust the expected first block by the gap between the volume and start offsets. */
+	first_block -= (super->volume_offset - super->start_offset);
+	result = reconstitute_layout(layout, table, first_block);
+
+free_table:
+	vdo_free(table);
+	return result;
+}
+
+/*
+ * Read and decode the payload that accompanies an index save's region table.
+ *
+ * The payload is a fixed-size little-endian record: a timestamp, a nonce, a save version, four
+ * skipped bytes, the index state signature and version id, and then the newest chapter, oldest
+ * chapter, and last save values.
+ *
+ * @reader: the reader supplying the payload
+ * @isl: the save layout whose save_data and state_data are filled in
+ * @saved_size: the payload size recorded in the region table header
+ *
+ * Return: UDS_SUCCESS or an error code
+ */
+static int __must_check read_index_save_data(struct buffered_reader *reader,
+					     struct index_save_layout *isl,
+					     size_t saved_size)
+{
+	int result;
+	size_t pos = 0;
+	struct index_state_version state_version;
+	u8 raw[sizeof(struct index_save_data) + sizeof(struct index_state_data301)];
+
+	if (saved_size != sizeof(raw)) {
+		return vdo_log_error_strerror(UDS_CORRUPT_DATA,
+					      "unexpected index save data size %zu",
+					      saved_size);
+	}
+
+	result = uds_read_from_buffered_reader(reader, raw, sizeof(raw));
+	if (result != UDS_SUCCESS)
+		return vdo_log_error_strerror(result, "cannot read index save data");
+
+	decode_u64_le(raw, &pos, &isl->save_data.timestamp);
+	decode_u64_le(raw, &pos, &isl->save_data.nonce);
+	decode_u32_le(raw, &pos, &isl->save_data.version);
+	/* Step over four bytes that are not decoded. */
+	pos += sizeof(u32);
+
+	if (isl->save_data.version > 1) {
+		return vdo_log_error_strerror(UDS_UNSUPPORTED_VERSION,
+					      "unknown index save version number %u",
+					      isl->save_data.version);
+	}
+
+	decode_s32_le(raw, &pos, &state_version.signature);
+	decode_s32_le(raw, &pos, &state_version.version_id);
+
+	/* Only the 301 index state format is accepted. */
+	if (!((state_version.signature == INDEX_STATE_VERSION_301.signature) &&
+	      (state_version.version_id == INDEX_STATE_VERSION_301.version_id))) {
+		return vdo_log_error_strerror(UDS_UNSUPPORTED_VERSION,
+					      "index state version %d,%d is unsupported",
+					      state_version.signature,
+					      state_version.version_id);
+	}
+
+	decode_u64_le(raw, &pos, &isl->state_data.newest_chapter);
+	decode_u64_le(raw, &pos, &isl->state_data.oldest_chapter);
+	decode_u64_le(raw, &pos, &isl->state_data.last_save);
+	/* Skip past some historical fields that are now unused */
+	pos += sizeof(u32) + sizeof(u32);
+	return UDS_SUCCESS;
+}
+
+/*
+ * Rebuild one index save layout from its region table, checking that the regions are contiguous
+ * and exactly fill the save.
+ *
+ * The table is consumed positionally: regions[0] is the header, regions[1] is the index page map,
+ * the next zone_count regions are volume index zones, and the open chapter follows them. If the
+ * last region is an empty region it is recorded as the save's free space; otherwise a zero-length
+ * free space region is placed at the end of the save.
+ *
+ * @isl: the save layout to populate; isl->index_save must already describe the save's extent
+ * @table: the region table read from the save
+ *
+ * Return: UDS_SUCCESS or an error code
+ */
+static int __must_check reconstruct_index_save(struct index_save_layout *isl,
+					       struct region_table *table)
+{
+	int result;
+	unsigned int z;
+	struct layout_region *last_region;
+	u64 next_block = isl->index_save.start_block;
+	u64 last_block = next_block + isl->index_save.block_count;
+
+	/*
+	 * The region count comes from storage. A save needs at least a header, an index page map,
+	 * and an open chapter; anything less would underflow the zone count and index outside the
+	 * regions the table describes.
+	 */
+	if (table->header.region_count < 3) {
+		return vdo_log_error_strerror(UDS_CORRUPT_DATA,
+					      "index save layout table has too few regions");
+	}
+
+	isl->zone_count = table->header.region_count - 3;
+
+	last_region = &table->regions[table->header.region_count - 1];
+	if (last_region->kind == RL_KIND_EMPTY) {
+		/* The trailing free space region is not a zone, so one slot must remain for it. */
+		if (isl->zone_count == 0) {
+			return vdo_log_error_strerror(UDS_CORRUPT_DATA,
+						      "index save layout table has too few regions");
+		}
+
+		isl->free_space = *last_region;
+		isl->zone_count--;
+	} else {
+		isl->free_space = (struct layout_region) {
+			.start_block = last_block,
+			.block_count = 0,
+			.kind = RL_KIND_EMPTY,
+			.instance = RL_SOLE_INSTANCE,
+		};
+	}
+
+	/*
+	 * NOTE(review): zone_count is not checked here against the capacity of
+	 * isl->volume_index_zones; confirm that it is bounded elsewhere.
+	 */
+
+	isl->header = table->regions[0];
+	result = verify_region(&isl->header, next_block++, RL_KIND_HEADER,
+			       RL_SOLE_INSTANCE);
+	if (result != UDS_SUCCESS)
+		return result;
+
+	isl->index_page_map = table->regions[1];
+	result = verify_region(&isl->index_page_map, next_block, RL_KIND_INDEX_PAGE_MAP,
+			       RL_SOLE_INSTANCE);
+	if (result != UDS_SUCCESS)
+		return result;
+
+	next_block += isl->index_page_map.block_count;
+
+	for (z = 0; z < isl->zone_count; z++) {
+		isl->volume_index_zones[z] = table->regions[z + 2];
+		result = verify_region(&isl->volume_index_zones[z], next_block,
+				       RL_KIND_VOLUME_INDEX, z);
+		if (result != UDS_SUCCESS)
+			return result;
+
+		next_block += isl->volume_index_zones[z].block_count;
+	}
+
+	isl->open_chapter = table->regions[isl->zone_count + 2];
+	result = verify_region(&isl->open_chapter, next_block, RL_KIND_OPEN_CHAPTER,
+			       RL_SOLE_INSTANCE);
+	if (result != UDS_SUCCESS)
+		return result;
+
+	next_block += isl->open_chapter.block_count;
+
+	result = verify_region(&isl->free_space, next_block, RL_KIND_EMPTY,
+			       RL_SOLE_INSTANCE);
+	if (result != UDS_SUCCESS)
+		return result;
+
+	/* With the free space included, the regions must end exactly at the end of the save. */
+	next_block += isl->free_space.block_count;
+	if (next_block != last_block) {
+		return vdo_log_error_strerror(UDS_CORRUPT_DATA,
+					      "index save layout table incomplete");
+	}
+
+	return UDS_SUCCESS;
+}
+
+/*
+ * Load one index save slot: read its region table, and for a slot that holds a save, read the
+ * save data and rebuild the slot's layout. A slot whose table is marked unsaved is reset instead.
+ *
+ * @isl: the save layout for this slot; isl->index_save must describe the slot's extent
+ * @reader: the reader for the slot's region
+ * @instance: the slot number, used only in log messages
+ *
+ * Return: UDS_SUCCESS or an error code. The region table is always freed before returning.
+ */
+static int __must_check load_index_save(struct index_save_layout *isl,
+					struct buffered_reader *reader,
+					unsigned int instance)
+{
+	struct region_table *table = NULL;
+	int result;
+
+	result = load_region_table(reader, &table);
+	if (result != UDS_SUCCESS) {
+		return vdo_log_error_strerror(result, "cannot read index save %u header",
+					      instance);
+	}
+
+	if (table->header.region_blocks != isl->index_save.block_count) {
+		result = vdo_log_error_strerror(UDS_CORRUPT_DATA,
+						"unexpected index save %u region block count %llu",
+						instance,
+						(unsigned long long) table->header.region_blocks);
+		goto free_table;
+	}
+
+	if (table->header.type == RH_TYPE_UNSAVED) {
+		/* Nothing has been saved in this slot. */
+		reset_index_save_layout(isl, 0);
+		result = UDS_SUCCESS;
+		goto free_table;
+	}
+
+	if (table->header.type != RH_TYPE_SAVE) {
+		vdo_log_error_strerror(UDS_CORRUPT_DATA,
+				       "unexpected index save %u header type %u",
+				       instance, table->header.type);
+		result = UDS_CORRUPT_DATA;
+		goto free_table;
+	}
+
+	result = read_index_save_data(reader, isl, table->header.payload);
+	if (result != UDS_SUCCESS) {
+		result = vdo_log_error_strerror(result,
+						"unknown index save %u data format",
+						instance);
+		goto free_table;
+	}
+
+	result = reconstruct_index_save(isl, table);
+	if (result != UDS_SUCCESS) {
+		result = vdo_log_error_strerror(result, "cannot reconstruct index save %u",
+						instance);
+	}
+
+free_table:
+	vdo_free(table);
+	return result;
+}
+
+/*
+ * Load every index save slot of the layout. A slot that fails to load is reset and skipped; only
+ * a failure to open a slot's reader aborts the whole load.
+ *
+ * @layout: the layout whose save slots are loaded
+ *
+ * Return: UDS_SUCCESS, or the error from open_region_reader()
+ */
+static int __must_check load_sub_index_regions(struct index_layout *layout)
+{
+	unsigned int slot;
+
+	for (slot = 0; slot < layout->super.max_saves; slot++) {
+		struct index_save_layout *isl = &layout->index.saves[slot];
+		struct buffered_reader *reader;
+		int result;
+
+		result = open_region_reader(layout, &isl->index_save, &reader);
+		if (result != UDS_SUCCESS) {
+			vdo_log_error_strerror(result,
+					       "cannot get reader for index 0 save %u",
+					       slot);
+			return result;
+		}
+
+		result = load_index_save(isl, reader, slot);
+		uds_free_buffered_reader(reader);
+
+		/* Another save slot might be valid. */
+		if (result != UDS_SUCCESS)
+			reset_index_save_layout(isl, 0);
+	}
+
+	return UDS_SUCCESS;
+}
+
+/*
+ * Check the configuration stored in the layout's config region against the supplied one.
+ *
+ * @layout: the loaded layout
+ * @config: the configuration to validate the stored contents against
+ *
+ * Return: UDS_SUCCESS or an error code
+ */
+static int __must_check verify_uds_index_config(struct index_layout *layout,
+						struct uds_configuration *config)
+{
+	struct buffered_reader *reader = NULL;
+	u64 offset = layout->super.volume_offset - layout->super.start_offset;
+	int result;
+
+	result = open_layout_reader(layout, &layout->config, offset, &reader);
+	if (result != UDS_SUCCESS)
+		return vdo_log_error_strerror(result, "failed to open config reader");
+
+	result = uds_validate_config_contents(reader, config);
+	/* The reader is no longer needed whether or not validation succeeded. */
+	uds_free_buffered_reader(reader);
+	if (result != UDS_SUCCESS)
+		return vdo_log_error_strerror(result, "failed to read config region");
+
+	return UDS_SUCCESS;
+}
+
+/*
+ * Load an existing layout from storage: read the superblock, check the stored configuration,
+ * and then load each index save slot.
+ *
+ * @layout: the layout to populate; its factory and offset must already be set
+ * @config: the configuration the stored index must match
+ *
+ * Return: UDS_SUCCESS or an error code
+ */
+static int load_index_layout(struct index_layout *layout, struct uds_configuration *config)
+{
+	struct buffered_reader *reader;
+	int result;
+
+	result = uds_make_buffered_reader(layout->factory,
+					  layout->offset / UDS_BLOCK_SIZE, 1, &reader);
+	if (result != UDS_SUCCESS)
+		return vdo_log_error_strerror(result, "unable to read superblock");
+
+	result = load_super_block(layout, UDS_BLOCK_SIZE,
+				  layout->offset / UDS_BLOCK_SIZE, reader);
+	uds_free_buffered_reader(reader);
+
+	/* Each later step runs only if everything before it succeeded. */
+	if (result == UDS_SUCCESS)
+		result = verify_uds_index_config(layout, config);
+
+	if (result == UDS_SUCCESS)
+		result = load_sub_index_regions(layout);
+
+	return result;
+}
+
+/*
+ * Create the I/O factory for the layout's block device and record the storage size and offset
+ * that the layout will use.
+ *
+ * @layout: the layout receiving the factory, factory_size, and offset
+ * @config: the configuration naming the block device, size, and offset
+ *
+ * Return: UDS_SUCCESS, the error from uds_make_io_factory(), or -ENOSPC if the device is smaller
+ *         than the configured size plus offset
+ */
+static int create_layout_factory(struct index_layout *layout,
+				 const struct uds_configuration *config)
+{
+	struct io_factory *factory = NULL;
+	size_t usable_size;
+	int result;
+
+	result = uds_make_io_factory(config->bdev, &factory);
+	if (result != UDS_SUCCESS)
+		return result;
+
+	/* Mask off any partial block (assuming UDS_BLOCK_SIZE is a power of two). */
+	usable_size = uds_get_writable_size(factory) & -UDS_BLOCK_SIZE;
+	if (config->size + config->offset > usable_size) {
+		uds_put_io_factory(factory);
+		vdo_log_error("index storage (%zu) is smaller than the requested size %zu",
+			      usable_size, config->size + config->offset);
+		return -ENOSPC;
+	}
+
+	layout->factory = factory;
+	layout->offset = config->offset;
+
+	/* A configured size of zero means the whole usable size is used. */
+	if (config->size > 0)
+		layout->factory_size = config->size;
+	else
+		layout->factory_size = usable_size;
+
+	return UDS_SUCCESS;
+}
+
+/*
+ * Allocate an index layout and either create a new on-disk layout or load the existing one.
+ *
+ * @config: the index configuration
+ * @new_layout: true to create a new layout, false to load the one already on storage
+ * @layout_ptr: receives the layout on success; it is not written on failure
+ *
+ * Return: UDS_SUCCESS or an error code; -ENOSPC if the storage is smaller than the computed
+ *         total size
+ */
+int uds_make_index_layout(struct uds_configuration *config, bool new_layout,
+			  struct index_layout **layout_ptr)
+{
+	struct save_layout_sizes sizes;
+	struct index_layout *layout = NULL;
+	int result;
+
+	result = compute_sizes(config, &sizes);
+	if (result != UDS_SUCCESS)
+		return result;
+
+	result = vdo_allocate(1, struct index_layout, __func__, &layout);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	result = create_layout_factory(layout, config);
+	if (result != UDS_SUCCESS)
+		goto free_layout;
+
+	if (layout->factory_size < sizes.total_size) {
+		vdo_log_error("index storage (%zu) is smaller than the required size %llu",
+			      layout->factory_size,
+			      (unsigned long long) sizes.total_size);
+		result = -ENOSPC;
+		goto free_layout;
+	}
+
+	result = new_layout ? create_index_layout(layout, config) :
+			      load_index_layout(layout, config);
+	if (result != UDS_SUCCESS)
+		goto free_layout;
+
+	*layout_ptr = layout;
+	return UDS_SUCCESS;
+
+free_layout:
+	uds_free_index_layout(layout);
+	return result;
+}
+
+/*
+ * Free a layout, its array of save layouts, and (if one was created) put its I/O factory.
+ * A NULL layout is ignored.
+ */
+void uds_free_index_layout(struct index_layout *layout)
+{
+	if (layout != NULL) {
+		vdo_free(layout->index.saves);
+
+		/* The factory is absent if layout creation failed before it was made. */
+		if (layout->factory != NULL)
+			uds_put_io_factory(layout->factory);
+
+		vdo_free(layout);
+	}
+}
+
+/*
+ * Hand a block device to the layout's I/O factory via uds_replace_storage().
+ *
+ * Return: the result of uds_replace_storage()
+ */
+int uds_replace_index_layout_storage(struct index_layout *layout,
+ struct block_device *bdev)
+{
+ return uds_replace_storage(layout->factory, bdev);
+}
+
+/*
+ * Obtain a dm_bufio_client for the volume region.
+ *
+ * The client is made at the volume region's start block, adjusted by the difference between the
+ * superblock's volume_offset and start_offset.
+ *
+ * Return: the result of uds_make_bufio()
+ */
+int uds_open_volume_bufio(struct index_layout *layout, size_t block_size,
+ unsigned int reserved_buffers,
+ struct dm_bufio_client **client_ptr)
+{
+ off_t offset = (layout->index.volume.start_block +
+ layout->super.volume_offset -
+ layout->super.start_offset);
+
+ return uds_make_bufio(layout->factory, offset, block_size, reserved_buffers,
+ client_ptr);
+}
+
+/* Return the nonce stored in the layout's sub-index. */
+u64 uds_get_volume_nonce(struct index_layout *layout)
+{
+ return layout->index.nonce;
+}
diff --git a/drivers/md/dm-vdo/indexer/index-layout.h b/drivers/md/dm-vdo/indexer/index-layout.h
new file mode 100644
index 000000000000..e9ac6f4302d6
--- /dev/null
+++ b/drivers/md/dm-vdo/indexer/index-layout.h
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#ifndef UDS_INDEX_LAYOUT_H
+#define UDS_INDEX_LAYOUT_H
+
+#include "config.h"
+#include "indexer.h"
+#include "io-factory.h"
+
+/*
+ * The index layout describes the format of the index on the underlying storage, and is responsible
+ * for creating those structures when the index is first created. It also validates the index data
+ * when loading a saved index, and updates it when saving the index.
+ */
+
+/* The layout structure is opaque to users of this header. */
+struct index_layout;
+
+/*
+ * Allocate an index layout, creating a new on-disk layout when new_layout is true and loading
+ * the existing one otherwise. On success the layout is returned through layout_ptr.
+ */
+int __must_check uds_make_index_layout(struct uds_configuration *config, bool new_layout,
+ struct index_layout **layout_ptr);
+
+/* Free a layout and its save array, and put its I/O factory. A NULL layout is ignored. */
+void uds_free_index_layout(struct index_layout *layout);
+
+/* Hand a block device to the layout's I/O factory via uds_replace_storage(). */
+int __must_check uds_replace_index_layout_storage(struct index_layout *layout,
+ struct block_device *bdev);
+
+int __must_check uds_load_index_state(struct index_layout *layout,
+ struct uds_index *index);
+
+int __must_check uds_save_index_state(struct index_layout *layout,
+ struct uds_index *index);
+
+int __must_check uds_discard_open_chapter(struct index_layout *layout);
+
+/* Return the nonce stored in the layout's sub-index. */
+u64 __must_check uds_get_volume_nonce(struct index_layout *layout);
+
+/* Obtain a dm_bufio_client for the volume region. */
+int __must_check uds_open_volume_bufio(struct index_layout *layout, size_t block_size,
+ unsigned int reserved_buffers,
+ struct dm_bufio_client **client_ptr);
+
+#endif /* UDS_INDEX_LAYOUT_H */
diff --git a/drivers/md/dm-vdo/indexer/index-page-map.c b/drivers/md/dm-vdo/indexer/index-page-map.c
new file mode 100644
index 000000000000..00b44e07d0c1
--- /dev/null
+++ b/drivers/md/dm-vdo/indexer/index-page-map.c
@@ -0,0 +1,173 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#include "index-page-map.h"
+
+#include "errors.h"
+#include "logger.h"
+#include "memory-alloc.h"
+#include "numeric.h"
+#include "permassert.h"
+#include "string-utils.h"
+#include "thread-utils.h"
+
+#include "hash-utils.h"
+#include "indexer.h"
+
+/*
+ * The index page map is conceptually a two-dimensional array indexed by chapter number and index
+ * page number within the chapter. Each entry contains the number of the last delta list on that
+ * index page. In order to save memory, the information for the last page in each chapter is not
+ * recorded, as it is known from the geometry.
+ */
+
+/* The magic string that begins a saved index page map. */
+static const u8 PAGE_MAP_MAGIC[] = "ALBIPM02";
+
+/* The length of the magic string as saved, excluding the terminating NUL. */
+#define PAGE_MAP_MAGIC_LENGTH (sizeof(PAGE_MAP_MAGIC) - 1)
+
+/*
+ * Compute the total number of map entries: one for each index page of each chapter, except the
+ * last index page of each chapter, which is not recorded.
+ */
+static inline u32 get_entry_count(const struct index_geometry *geometry)
+{
+ return geometry->chapters_per_volume * (geometry->index_pages_per_chapter - 1);
+}
+
+/*
+ * Allocate an index page map sized for the given geometry.
+ *
+ * @geometry: the index geometry; the map keeps this pointer
+ * @map_ptr: receives the new map on success; it is not written on failure
+ *
+ * Return: UDS_SUCCESS or an allocation error
+ */
+int uds_make_index_page_map(const struct index_geometry *geometry,
+			    struct index_page_map **map_ptr)
+{
+	struct index_page_map *new_map;
+	int result;
+
+	result = vdo_allocate(1, struct index_page_map, "page map", &new_map);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	new_map->geometry = geometry;
+	/* The last index page of each chapter has no entry. */
+	new_map->entries_per_chapter = geometry->index_pages_per_chapter - 1;
+
+	result = vdo_allocate(get_entry_count(geometry), u16, "Index Page Map Entries",
+			      &new_map->entries);
+	if (result == VDO_SUCCESS) {
+		*map_ptr = new_map;
+		return UDS_SUCCESS;
+	}
+
+	uds_free_index_page_map(new_map);
+	return result;
+}
+
+/* Free an index page map and its entry array. A NULL map is ignored. */
+void uds_free_index_page_map(struct index_page_map *map)
+{
+	if (map == NULL)
+		return;
+
+	vdo_free(map->entries);
+	vdo_free(map);
+}
+
+/*
+ * Record the given delta list number as the entry for one index page of a chapter, and note the
+ * virtual chapter number as the map's last update.
+ *
+ * @map: the map to update
+ * @virtual_chapter_number: stored in map->last_update
+ * @chapter_number: the chapter whose row of entries is updated
+ * @index_page_number: the index page within the chapter
+ * @delta_list_number: the value stored in the entry (the entry is a u16)
+ */
+void uds_update_index_page_map(struct index_page_map *map, u64 virtual_chapter_number,
+ u32 chapter_number, u32 index_page_number,
+ u32 delta_list_number)
+{
+ size_t slot;
+
+ map->last_update = virtual_chapter_number;
+ /* The last index page of a chapter has no entry, so there is nothing to store. */
+ if (index_page_number == map->entries_per_chapter)
+ return;
+
+ slot = (chapter_number * map->entries_per_chapter) + index_page_number;
+ map->entries[slot] = delta_list_number;
+}
+
+/*
+ * Find the index page of a chapter that covers the delta list for a record name.
+ *
+ * @map: the index page map
+ * @name: the record name, hashed to a chapter delta list number
+ * @chapter_number: the chapter to search
+ *
+ * Return: the first page whose entry is at least the delta list number, or entries_per_chapter
+ *         (the unrecorded last page) if no entry is
+ */
+u32 uds_find_index_page_number(const struct index_page_map *map,
+			       const struct uds_record_name *name, u32 chapter_number)
+{
+	const u32 list_number = uds_hash_to_chapter_delta_list(name, map->geometry);
+	const u32 first_slot = chapter_number * map->entries_per_chapter;
+	u32 page = 0;
+
+	/* Each entry holds the last delta list on its page; advance past pages that end too early. */
+	while ((page < map->entries_per_chapter) &&
+	       (list_number > map->entries[first_slot + page]))
+		page++;
+
+	return page;
+}
+
+/*
+ * Report the range of delta list numbers held by one index page of a chapter.
+ *
+ * @map: the index page map
+ * @chapter_number: the chapter containing the page
+ * @index_page_number: the index page within the chapter
+ * @lowest_list: receives the first delta list number on the page
+ * @highest_list: receives the last delta list number on the page
+ */
+void uds_get_list_number_bounds(const struct index_page_map *map, u32 chapter_number,
+				u32 index_page_number, u32 *lowest_list,
+				u32 *highest_list)
+{
+	u32 slot = chapter_number * map->entries_per_chapter;
+
+	/* A page starts one list past the end of the previous page; the first starts at zero. */
+	if (index_page_number == 0)
+		*lowest_list = 0;
+	else
+		*lowest_list = map->entries[slot + index_page_number - 1] + 1;
+
+	/* The last page has no entry; it ends at the last delta list of the chapter. */
+	if (index_page_number < map->entries_per_chapter)
+		*highest_list = map->entries[slot + index_page_number];
+	else
+		*highest_list = map->geometry->delta_lists_per_chapter - 1;
+}
+
+/*
+ * Compute the number of bytes in a saved index page map: the magic string, a u64 (the last
+ * update), and one u16 per map entry.
+ */
+u64 uds_compute_index_page_map_save_size(const struct index_geometry *geometry)
+{
+ return PAGE_MAP_MAGIC_LENGTH + sizeof(u64) + sizeof(u16) * get_entry_count(geometry);
+}
+
+/*
+ * Encode the index page map (magic string, last update, then every entry in little-endian
+ * form), write it to the writer, and flush the writer.
+ *
+ * @map: the map to save
+ * @writer: the destination writer
+ *
+ * Return: UDS_SUCCESS or an error code
+ */
+int uds_write_index_page_map(struct index_page_map *map, struct buffered_writer *writer)
+{
+	const u32 entry_count = get_entry_count(map->geometry);
+	const u64 saved_size = uds_compute_index_page_map_save_size(map->geometry);
+	size_t offset = 0;
+	u8 *buffer;
+	u32 i;
+	int result;
+
+	result = vdo_allocate(saved_size, u8, "page map data", &buffer);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	memcpy(buffer, PAGE_MAP_MAGIC, PAGE_MAP_MAGIC_LENGTH);
+	offset += PAGE_MAP_MAGIC_LENGTH;
+	encode_u64_le(buffer, &offset, map->last_update);
+	for (i = 0; i < entry_count; i++)
+		encode_u16_le(buffer, &offset, map->entries[i]);
+
+	result = uds_write_to_buffered_writer(writer, buffer, offset);
+	vdo_free(buffer);
+	if (result == UDS_SUCCESS)
+		result = uds_flush_buffered_writer(writer);
+
+	return result;
+}
+
+/*
+ * Read a saved index page map from the reader, check its magic string, and decode the last
+ * update and every entry into the map.
+ *
+ * @map: the map to fill in; it must already be sized for its geometry
+ * @reader: the source reader
+ *
+ * Return: UDS_SUCCESS, a read or allocation error, or UDS_CORRUPT_DATA if the magic string does
+ *         not match
+ */
+int uds_read_index_page_map(struct index_page_map *map, struct buffered_reader *reader)
+{
+	const u32 entry_count = get_entry_count(map->geometry);
+	const u64 saved_size = uds_compute_index_page_map_save_size(map->geometry);
+	size_t offset = 0;
+	u8 *buffer;
+	u32 i;
+	int result;
+
+	result = vdo_allocate(saved_size, u8, "page map data", &buffer);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	result = uds_read_from_buffered_reader(reader, buffer, saved_size);
+	if (result != UDS_SUCCESS)
+		goto free_buffer;
+
+	/* The saved data must begin with the magic string. */
+	if (memcmp(buffer, PAGE_MAP_MAGIC, PAGE_MAP_MAGIC_LENGTH) != 0) {
+		result = UDS_CORRUPT_DATA;
+		goto free_buffer;
+	}
+
+	offset += PAGE_MAP_MAGIC_LENGTH;
+	decode_u64_le(buffer, &offset, &map->last_update);
+	for (i = 0; i < entry_count; i++)
+		decode_u16_le(buffer, &offset, &map->entries[i]);
+
+	vdo_log_debug("read index page map, last update %llu",
+		      (unsigned long long) map->last_update);
+
+free_buffer:
+	vdo_free(buffer);
+	return result;
+}
diff --git a/drivers/md/dm-vdo/indexer/index-page-map.h b/drivers/md/dm-vdo/indexer/index-page-map.h
new file mode 100644
index 000000000000..b327c0bb9656
--- /dev/null
+++ b/drivers/md/dm-vdo/indexer/index-page-map.h
@@ -0,0 +1,50 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#ifndef UDS_INDEX_PAGE_MAP_H
+#define UDS_INDEX_PAGE_MAP_H
+
+#include "geometry.h"
+#include "io-factory.h"
+
+/*
+ * The index maintains a page map which records how the chapter delta lists are distributed among
+ * the index pages for each chapter, allowing the volume to be efficient about reading only pages
+ * that it knows it will need.
+ */
+
+struct index_page_map {
+ /* The geometry supplied when the map was made */
+ const struct index_geometry *geometry;
+ /* The virtual chapter number given to the most recent update */
+ u64 last_update;
+ /* The number of entries recorded per chapter (index pages per chapter, minus one) */
+ u32 entries_per_chapter;
+ /* For each recorded index page, the number of the last delta list on that page */
+ u16 *entries;
+};
+
+/* Allocate an index page map sized for the given geometry. */
+int __must_check uds_make_index_page_map(const struct index_geometry *geometry,
+ struct index_page_map **map_ptr);
+
+/* Free an index page map and its entries. A NULL map is ignored. */
+void uds_free_index_page_map(struct index_page_map *map);
+
+/* Read a saved index page map from the reader into the map. */
+int __must_check uds_read_index_page_map(struct index_page_map *map,
+ struct buffered_reader *reader);
+
+/* Write the index page map to the writer and flush the writer. */
+int __must_check uds_write_index_page_map(struct index_page_map *map,
+ struct buffered_writer *writer);
+
+/* Record the last delta list number for one index page of a chapter. */
+void uds_update_index_page_map(struct index_page_map *map, u64 virtual_chapter_number,
+ u32 chapter_number, u32 index_page_number,
+ u32 delta_list_number);
+
+/* Find the index page of a chapter that covers the delta list for a record name. */
+u32 __must_check uds_find_index_page_number(const struct index_page_map *map,
+ const struct uds_record_name *name,
+ u32 chapter_number);
+
+/* Report the lowest and highest delta list numbers held by one index page of a chapter. */
+void uds_get_list_number_bounds(const struct index_page_map *map, u32 chapter_number,
+ u32 index_page_number, u32 *lowest_list,
+ u32 *highest_list);
+
+/* Compute the number of bytes needed to save the index page map. */
+u64 uds_compute_index_page_map_save_size(const struct index_geometry *geometry);
+
+#endif /* UDS_INDEX_PAGE_MAP_H */
diff --git a/drivers/md/dm-vdo/indexer/index-session.c b/drivers/md/dm-vdo/indexer/index-session.c
new file mode 100644
index 000000000000..aee0914d604a
--- /dev/null
+++ b/drivers/md/dm-vdo/indexer/index-session.c
@@ -0,0 +1,739 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#include "index-session.h"
+
+#include <linux/atomic.h>
+
+#include "logger.h"
+#include "memory-alloc.h"
+#include "time-utils.h"
+
+#include "funnel-requestqueue.h"
+#include "index.h"
+#include "index-layout.h"
+
+/*
+ * The index session contains a lock (the request_mutex) which ensures that only one thread can
+ * change the state of its index at a time. The state field indicates the current state of the
+ * index through a set of descriptive flags. The request_cond must be signaled whenever a
+ * non-transient state flag is cleared. The request_mutex also protects the count of requests
+ * currently in progress so that they can be drained when suspending or closing the index.
+ *
+ * If the index session is suspended shortly after opening an index, it may have to suspend during
+ * a rebuild. Depending on the size of the index, a rebuild may take a significant amount of time,
+ * so UDS allows the rebuild to be paused in order to suspend the session in a timely manner. When
+ * the index session is resumed, the rebuild can continue from where it left off. If the index
+ * session is shut down with a suspended rebuild, the rebuild progress is abandoned and the rebuild
+ * will start from the beginning the next time the index is loaded. The mutex and status fields in
+ * the index_load_context are used to record the state of any interrupted rebuild.
+ */
+
+/* Bit positions of the session state flags; the first flag occupies bit 8. */
+enum index_session_flag_bit {
+ IS_FLAG_BIT_START = 8,
+ /* The session has started loading an index but not completed it. */
+ IS_FLAG_BIT_LOADING = IS_FLAG_BIT_START,
+ /* The session has loaded an index, which can handle requests. */
+ IS_FLAG_BIT_LOADED,
+ /* The session's index has been permanently disabled. */
+ IS_FLAG_BIT_DISABLED,
+ /* The session's index is suspended. */
+ IS_FLAG_BIT_SUSPENDED,
+ /* The session is handling some index state change. */
+ IS_FLAG_BIT_WAITING,
+ /* The session's index is closing and draining requests. */
+ IS_FLAG_BIT_CLOSING,
+ /* The session is being destroyed and is draining requests. */
+ IS_FLAG_BIT_DESTROYING,
+};
+
+/*
+ * Session state flag masks, one for each bit position in enum index_session_flag_bit. These are
+ * the values combined in the session's state field.
+ */
+enum index_session_flag {
+ IS_FLAG_LOADED = (1 << IS_FLAG_BIT_LOADED),
+ IS_FLAG_LOADING = (1 << IS_FLAG_BIT_LOADING),
+ IS_FLAG_DISABLED = (1 << IS_FLAG_BIT_DISABLED),
+ IS_FLAG_SUSPENDED = (1 << IS_FLAG_BIT_SUSPENDED),
+ IS_FLAG_WAITING = (1 << IS_FLAG_BIT_WAITING),
+ IS_FLAG_CLOSING = (1 << IS_FLAG_BIT_CLOSING),
+ IS_FLAG_DESTROYING = (1 << IS_FLAG_BIT_DESTROYING),
+};
+
+/*
+ * Release a reference to an index session. When the request count drops to zero, every thread
+ * waiting on the request_cond is woken.
+ */
+static void release_index_session(struct uds_index_session *index_session)
+{
+ mutex_lock(&index_session->request_mutex);
+ if (--index_session->request_count == 0)
+ uds_broadcast_cond(&index_session->request_cond);
+ mutex_unlock(&index_session->request_mutex);
+}
+
+/*
+ * Take a reference to the index session on behalf of an asynchronous index request. On success
+ * the caller owns the reference and must eventually drop it with release_index_session(). On
+ * failure the reference has already been dropped.
+ *
+ * Return: UDS_SUCCESS if the session state is exactly IS_FLAG_LOADED; otherwise UDS_DISABLED if
+ *         the index is disabled, -EBUSY if it is loading, suspended, or waiting, and
+ *         UDS_NO_INDEX in any other state
+ */
+static int get_index_session(struct uds_index_session *index_session)
+{
+	const unsigned int busy_flags =
+		IS_FLAG_LOADING | IS_FLAG_SUSPENDED | IS_FLAG_WAITING;
+	unsigned int state;
+	int result;
+
+	/* Count the request first, then examine a snapshot of the state outside the lock. */
+	mutex_lock(&index_session->request_mutex);
+	index_session->request_count++;
+	state = index_session->state;
+	mutex_unlock(&index_session->request_mutex);
+
+	if (state == IS_FLAG_LOADED)
+		return UDS_SUCCESS;
+
+	if (state & IS_FLAG_DISABLED)
+		result = UDS_DISABLED;
+	else if (state & busy_flags)
+		result = -EBUSY;
+	else
+		result = UDS_NO_INDEX;
+
+	release_index_session(index_session);
+	return result;
+}
+
+/*
+ * Validate a request and queue it for processing by the session's index.
+ *
+ * All fields of the request from zone_number to the end of the structure are zeroed before the
+ * session is consulted, so they are reset even when the request is rejected because of the
+ * session's state.
+ *
+ * @request: the request to launch; its callback, type, and session must be set by the caller
+ *
+ * Return: UDS_SUCCESS if the request was queued, -EINVAL if the callback is missing or the
+ *         request type is not recognized, or the error from get_index_session()
+ */
+int uds_launch_request(struct uds_request *request)
+{
+	size_t internal_offset;
+	int result;
+
+	if (request->callback == NULL) {
+		vdo_log_error("missing required callback");
+		return -EINVAL;
+	}
+
+	switch (request->type) {
+	case UDS_DELETE:
+	case UDS_POST:
+	case UDS_QUERY:
+	case UDS_QUERY_NO_UPDATE:
+	case UDS_UPDATE:
+		break;
+	default:
+		/* The value being checked here is the request type, not the callback. */
+		vdo_log_error("received invalid request type");
+		return -EINVAL;
+	}
+
+	/* Reset all internal fields before processing. */
+	internal_offset = offsetof(struct uds_request, zone_number);
+	// FIXME should be using struct_group for this instead
+	memset((char *) request + internal_offset, 0, sizeof(*request) - internal_offset);
+
+	result = get_index_session(request->session);
+	if (result != UDS_SUCCESS)
+		return result;
+
+	request->found = false;
+	request->unbatched = false;
+	request->index = request->session->index;
+
+	uds_enqueue_request(request, STAGE_TRIAGE);
+	return UDS_SUCCESS;
+}
+
+/*
+ * Move a request onto the session's callback queue. If the request finished with an error, the
+ * session is marked disabled first. initialize_index_session() hands this function to
+ * uds_make_index().
+ */
+static void enter_callback_stage(struct uds_request *request)
+{
+ if (request->status != UDS_SUCCESS) {
+ /* All request errors are considered unrecoverable */
+ mutex_lock(&request->session->request_mutex);
+ request->session->state |= IS_FLAG_DISABLED;
+ mutex_unlock(&request->session->request_mutex);
+ }
+
+ uds_request_queue_enqueue(request->session->callback_queue, request);
+}
+
+/*
+ * Increment a statistics counter using a single READ_ONCE() and a single WRITE_ONCE(). The
+ * read-modify-write as a whole is not atomic.
+ */
+static inline void count_once(u64 *count_ptr)
+{
+ WRITE_ONCE(*count_ptr, READ_ONCE(*count_ptr) + 1);
+}
+
+/*
+ * Update the session statistics for a finished request: the total request count, the found or
+ * not-found counter for the request's type, and for posts, the counter for where the record was
+ * found. An unrecognized request type sets the request's status from a failed assertion.
+ */
+static void update_session_stats(struct uds_request *request)
+{
+	struct session_stats *stats = &request->session->stats;
+
+	count_once(&stats->requests);
+
+	switch (request->type) {
+	case UDS_POST:
+		count_once(request->found ? &stats->posts_found : &stats->posts_not_found);
+
+		/* The location counters are independent of the found flag. */
+		if (request->location == UDS_LOCATION_IN_OPEN_CHAPTER)
+			count_once(&stats->posts_found_open_chapter);
+		else if (request->location == UDS_LOCATION_IN_DENSE)
+			count_once(&stats->posts_found_dense);
+		else if (request->location == UDS_LOCATION_IN_SPARSE)
+			count_once(&stats->posts_found_sparse);
+		break;
+
+	case UDS_UPDATE:
+		count_once(request->found ? &stats->updates_found :
+					    &stats->updates_not_found);
+		break;
+
+	case UDS_DELETE:
+		count_once(request->found ? &stats->deletions_found :
+					    &stats->deletions_not_found);
+		break;
+
+	case UDS_QUERY:
+	case UDS_QUERY_NO_UPDATE:
+		count_once(request->found ? &stats->queries_found :
+					    &stats->queries_not_found);
+		break;
+
+	default:
+		request->status = VDO_ASSERT(false, "unknown request type: %d",
+					     request->type);
+	}
+}
+
+/*
+ * Finish a request taken from the callback queue: update the session statistics if the request
+ * succeeded, convert its status with uds_status_to_errno(), invoke the request's callback, and
+ * release the session reference taken when the request was launched.
+ */
+static void handle_callbacks(struct uds_request *request)
+{
+ /*
+ * NOTE(review): the session pointer is saved before the callback runs, presumably because
+ * the request may not be safe to touch afterwards -- confirm.
+ */
+ struct uds_index_session *index_session = request->session;
+
+ if (request->status == UDS_SUCCESS)
+ update_session_stats(request);
+
+ request->status = uds_status_to_errno(request->status);
+ request->callback(request);
+ release_index_session(index_session);
+}
+
+/*
+ * Allocate an index session with no index: initialize its mutexes and condition variables and
+ * create its callback queue, which is served by handle_callbacks().
+ *
+ * @index_session_ptr: receives the new session on success; it is not written on failure
+ *
+ * Return: UDS_SUCCESS or an error code
+ */
+static int __must_check make_empty_index_session(struct uds_index_session **index_session_ptr)
+{
+ int result;
+ struct uds_index_session *session;
+
+ result = vdo_allocate(1, struct uds_index_session, __func__, &session);
+ if (result != VDO_SUCCESS)
+ return result;
+
+ mutex_init(&session->request_mutex);
+ uds_init_cond(&session->request_cond);
+ mutex_init(&session->load_context.mutex);
+ uds_init_cond(&session->load_context.cond);
+
+ result = uds_make_request_queue("callbackW", &handle_callbacks,
+ &session->callback_queue);
+ if (result != UDS_SUCCESS) {
+ vdo_free(session);
+ return result;
+ }
+
+ *index_session_ptr = session;
+ return UDS_SUCCESS;
+}
+
+/*
+ * Create an index session with no index.
+ *
+ * @session: receives the new session
+ *
+ * Return: -EINVAL if session is NULL; otherwise the result of make_empty_index_session()
+ *         converted by uds_status_to_errno()
+ */
+int uds_create_index_session(struct uds_index_session **session)
+{
+ if (session == NULL) {
+ vdo_log_error("missing session pointer");
+ return -EINVAL;
+ }
+
+ return uds_status_to_errno(make_empty_index_session(session));
+}
+
+/*
+ * Mark the session as loading an index. This succeeds only if the session has no state flags
+ * set.
+ *
+ * Return: UDS_SUCCESS, or -EBUSY if the session is suspended or has any other state flag set
+ */
+static int __must_check start_loading_index_session(struct uds_index_session *index_session)
+{
+	int result = UDS_SUCCESS;
+
+	mutex_lock(&index_session->request_mutex);
+	if (index_session->state & IS_FLAG_SUSPENDED) {
+		vdo_log_info("Index session is suspended");
+		result = -EBUSY;
+	} else if (index_session->state != 0) {
+		vdo_log_info("Index is already loaded");
+		result = -EBUSY;
+	} else {
+		index_session->state |= IS_FLAG_LOADING;
+	}
+	mutex_unlock(&index_session->request_mutex);
+
+	return result;
+}
+
+/*
+ * Clear the session's loading flag, mark the session loaded if the load succeeded, and wake
+ * every thread waiting on the request_cond.
+ *
+ * @index_session: the session that was loading
+ * @result: the result of the load attempt
+ */
+static void finish_loading_index_session(struct uds_index_session *index_session,
+ int result)
+{
+ mutex_lock(&index_session->request_mutex);
+ index_session->state &= ~IS_FLAG_LOADING;
+ if (result == UDS_SUCCESS)
+ index_session->state |= IS_FLAG_LOADED;
+
+ uds_broadcast_cond(&index_session->request_cond);
+ mutex_unlock(&index_session->request_mutex);
+}
+
+/*
+ * Build a configuration from the session's parameters, reset the session statistics, and make
+ * the session's index. The configuration is freed before returning.
+ *
+ * @index_session: the session; its parameters must already be set
+ * @open_type: how the index should be opened
+ *
+ * Return: UDS_SUCCESS or an error code
+ */
+static int initialize_index_session(struct uds_index_session *index_session,
+ enum uds_open_index_type open_type)
+{
+ int result;
+ struct uds_configuration *config;
+
+ result = uds_make_configuration(&index_session->parameters, &config);
+ if (result != UDS_SUCCESS) {
+ vdo_log_error_strerror(result, "Failed to allocate config");
+ return result;
+ }
+
+ memset(&index_session->stats, 0, sizeof(index_session->stats));
+ result = uds_make_index(config, open_type, &index_session->load_context,
+ enter_callback_stage, &index_session->index);
+ if (result != UDS_SUCCESS)
+ vdo_log_error_strerror(result, "Failed to make index");
+ else
+ uds_log_configuration(config);
+
+ uds_free_configuration(config);
+ return result;
+}
+
+/* Describe an open type for log messages. */
+static const char *get_open_type_string(enum uds_open_index_type open_type)
+{
+	if (open_type == UDS_CREATE)
+		return "creating index";
+
+	if (open_type == UDS_LOAD)
+		return "loading or rebuilding index";
+
+	if (open_type == UDS_NO_REBUILD)
+		return "loading index";
+
+	return "unknown open method";
+}
+
+/*
+ * Open an index under the given session. This operation will fail if the
+ * index session is suspended, or if there is already an open index.
+ *
+ * @open_type: how the index should be opened
+ * @parameters: the index parameters, copied into the session; the block device must be set
+ * @session: the session that will own the index
+ *
+ * Return: 0 on success or an error code converted by uds_status_to_errno(); -EINVAL if a
+ *         required argument is missing
+ */
+int uds_open_index(enum uds_open_index_type open_type,
+ const struct uds_parameters *parameters,
+ struct uds_index_session *session)
+{
+ int result;
+ char name[BDEVNAME_SIZE];
+
+ if (parameters == NULL) {
+ vdo_log_error("missing required parameters");
+ return -EINVAL;
+ }
+ if (parameters->bdev == NULL) {
+ vdo_log_error("missing required block device");
+ return -EINVAL;
+ }
+ if (session == NULL) {
+ vdo_log_error("missing required session pointer");
+ return -EINVAL;
+ }
+
+ result = start_loading_index_session(session);
+ if (result != UDS_SUCCESS)
+ return uds_status_to_errno(result);
+
+ session->parameters = *parameters;
+ format_dev_t(name, parameters->bdev->bd_dev);
+ vdo_log_info("%s: %s", get_open_type_string(open_type), name);
+
+ result = initialize_index_session(session, open_type);
+ if (result != UDS_SUCCESS)
+ vdo_log_error_strerror(result, "Failed %s",
+ get_open_type_string(open_type));
+
+ /* The loading flag is cleared whether or not the index was opened. */
+ finish_loading_index_session(session, result);
+ return uds_status_to_errno(result);
+}
+
+/* Block until the session's request count has dropped to zero. */
+static void wait_for_no_requests_in_progress(struct uds_index_session *index_session)
+{
+ mutex_lock(&index_session->request_mutex);
+ /* release_index_session() broadcasts on request_cond when the count reaches zero. */
+ while (index_session->request_count > 0) {
+ uds_wait_cond(&index_session->request_cond,
+ &index_session->request_mutex);
+ }
+ mutex_unlock(&index_session->request_mutex);
+}
+
+/*
+ * Wait for all requests in progress to finish, then save the session's index.
+ *
+ * Return: the result of uds_save_index()
+ */
+static int __must_check save_index(struct uds_index_session *index_session)
+{
+ wait_for_no_requests_in_progress(index_session);
+ return uds_save_index(index_session->index);
+}
+
+/*
+ * If the session's index is still opening, ask it to suspend and wait until the load context
+ * reports that the index is either suspended or ready. If the index is already ready, nothing is
+ * done. Any other load context status is logged as an assertion failure.
+ */
+static void suspend_rebuild(struct uds_index_session *session)
+{
+ mutex_lock(&session->load_context.mutex);
+ switch (session->load_context.status) {
+ case INDEX_OPENING:
+ session->load_context.status = INDEX_SUSPENDING;
+
+ /* Wait until the index indicates that it is not replaying. */
+ while ((session->load_context.status != INDEX_SUSPENDED) &&
+ (session->load_context.status != INDEX_READY)) {
+ uds_wait_cond(&session->load_context.cond,
+ &session->load_context.mutex);
+ }
+
+ break;
+
+ case INDEX_READY:
+ /* Index load does not need to be suspended. */
+ break;
+
+ case INDEX_SUSPENDED:
+ case INDEX_SUSPENDING:
+ case INDEX_FREEING:
+ default:
+ /* These cases should not happen. */
+ VDO_ASSERT_LOG_ONLY(false, "Bad load context state %u",
+ session->load_context.status);
+ break;
+ }
+ mutex_unlock(&session->load_context.mutex);
+}
+
+/*
+ * Suspend index operation, draining all current index requests and preventing new index requests
+ * from starting. Optionally saves all index data before returning.
+ *
+ * Returns -EBUSY (via uds_status_to_errno()) if another state change is waiting or the session is
+ * being destroyed. If a load is still in progress, the load is paused and the save flag is not
+ * consulted.
+ */
+int uds_suspend_index_session(struct uds_index_session *session, bool save)
+{
+	int result = UDS_SUCCESS;
+	bool done = false;
+	bool rebuilding = false;
+
+	mutex_lock(&session->request_mutex);
+	/* Wait for any current index state change to complete. */
+	while (session->state & IS_FLAG_CLOSING)
+		uds_wait_cond(&session->request_cond, &session->request_mutex);
+
+	if (session->state & (IS_FLAG_WAITING | IS_FLAG_DESTROYING)) {
+		vdo_log_info("Index session is already changing state");
+		result = -EBUSY;
+		done = true;
+	} else if (session->state & IS_FLAG_SUSPENDED) {
+		/* Already suspended; succeed without doing anything. */
+		done = true;
+	} else if (session->state & IS_FLAG_LOADING) {
+		session->state |= IS_FLAG_WAITING;
+		rebuilding = true;
+	} else if (session->state & IS_FLAG_LOADED) {
+		session->state |= IS_FLAG_WAITING;
+	} else {
+		/* Neither loading nor loaded: just mark the session suspended. */
+		session->state |= IS_FLAG_SUSPENDED;
+		uds_broadcast_cond(&session->request_cond);
+		done = true;
+	}
+	mutex_unlock(&session->request_mutex);
+
+	if (done)
+		return uds_status_to_errno(result);
+
+	/* IS_FLAG_WAITING is set, so the slow work can run without holding the mutex. */
+	if (rebuilding) {
+		suspend_rebuild(session);
+	} else {
+		result = (save ? save_index(session) :
+				 uds_flush_index_session(session));
+	}
+
+	mutex_lock(&session->request_mutex);
+	session->state = (session->state & ~IS_FLAG_WAITING) | IS_FLAG_SUSPENDED;
+	uds_broadcast_cond(&session->request_cond);
+	mutex_unlock(&session->request_mutex);
+	return uds_status_to_errno(result);
+}
+
+/*
+ * Point the session's index at a new backing device. The device recorded in the session
+ * parameters is only updated when uds_replace_index_storage() succeeds.
+ */
+static int replace_device(struct uds_index_session *session, struct block_device *bdev)
+{
+	int result = uds_replace_index_storage(session->index, bdev);
+
+	if (result == UDS_SUCCESS)
+		session->parameters.bdev = bdev;
+
+	return result;
+}
+
+/*
+ * Resume index operation after being suspended. If the index is suspended and the supplied block
+ * device differs from the current backing store, the index will start using the new backing store.
+ *
+ * Returns -EBUSY if another state change is waiting, UDS_SUCCESS if the session was not suspended
+ * or was resumed, or the translated error if switching the backing store fails. On that failure
+ * the session stays suspended.
+ */
+int uds_resume_index_session(struct uds_index_session *session,
+			     struct block_device *bdev)
+{
+	int result = UDS_SUCCESS;
+	bool proceed = false;
+	bool resume_replay = false;
+
+	mutex_lock(&session->request_mutex);
+	if (session->state & IS_FLAG_WAITING) {
+		vdo_log_info("Index session is already changing state");
+		result = -EBUSY;
+	} else if (session->state & IS_FLAG_SUSPENDED) {
+		session->state |= IS_FLAG_WAITING;
+		resume_replay = ((session->state & IS_FLAG_LOADING) != 0);
+		proceed = true;
+	}
+	/* If not suspended, just succeed. */
+	mutex_unlock(&session->request_mutex);
+
+	if (!proceed)
+		return result;
+
+	if ((session->index != NULL) && (bdev != session->parameters.bdev)) {
+		result = replace_device(session, bdev);
+		if (result != UDS_SUCCESS) {
+			/* Give up the state change but leave the session suspended. */
+			mutex_lock(&session->request_mutex);
+			session->state &= ~IS_FLAG_WAITING;
+			uds_broadcast_cond(&session->request_cond);
+			mutex_unlock(&session->request_mutex);
+			return uds_status_to_errno(result);
+		}
+	}
+
+	if (resume_replay) {
+		struct index_load_context *context = &session->load_context;
+
+		mutex_lock(&context->mutex);
+		if (context->status == INDEX_SUSPENDED) {
+			context->status = INDEX_OPENING;
+			/* Notify the index to start replaying again. */
+			uds_broadcast_cond(&context->cond);
+		} else if (context->status != INDEX_READY) {
+			/* These cases should not happen; do nothing. */
+			VDO_ASSERT_LOG_ONLY(false, "Bad load context state %u",
+					    context->status);
+		}
+		/* With INDEX_READY there is no index rebuild to resume. */
+		mutex_unlock(&context->mutex);
+	}
+
+	mutex_lock(&session->request_mutex);
+	session->state &= ~(IS_FLAG_WAITING | IS_FLAG_SUSPENDED);
+	uds_broadcast_cond(&session->request_cond);
+	mutex_unlock(&session->request_mutex);
+	return UDS_SUCCESS;
+}
+
+/*
+ * Free the session's index, saving it first unless the session is suspended, and reset the
+ * session state that belonged to that index. A failed save is logged and does not prevent the
+ * index from being freed, but its error is still the return value. Returns UDS_SUCCESS when
+ * there is no index or when the save was skipped.
+ */
+static int save_and_free_index(struct uds_index_session *index_session)
+{
+	struct uds_index *index = index_session->index;
+	int save_result = UDS_SUCCESS;
+	bool was_suspended;
+
+	if (index == NULL)
+		return UDS_SUCCESS;
+
+	mutex_lock(&index_session->request_mutex);
+	was_suspended = ((index_session->state & IS_FLAG_SUSPENDED) != 0);
+	mutex_unlock(&index_session->request_mutex);
+
+	if (!was_suspended) {
+		save_result = uds_save_index(index);
+		if (save_result != UDS_SUCCESS) {
+			vdo_log_warning_strerror(save_result,
+						 "ignoring error from save_index");
+		}
+	}
+
+	uds_free_index(index);
+	index_session->index = NULL;
+
+	/*
+	 * Reset all index state that happens to be in the index
+	 * session, so it doesn't affect any future index.
+	 */
+	mutex_lock(&index_session->load_context.mutex);
+	index_session->load_context.status = INDEX_OPENING;
+	mutex_unlock(&index_session->load_context.mutex);
+
+	/* Clear every state flag except the suspend bit, which remains relevant. */
+	mutex_lock(&index_session->request_mutex);
+	index_session->state &= IS_FLAG_SUSPENDED;
+	mutex_unlock(&index_session->request_mutex);
+
+	return save_result;
+}
+
+/*
+ * Save and close the current index. Fails with -EBUSY if the session is suspended, and with
+ * UDS_NO_INDEX (translated by uds_status_to_errno()) if there is no loaded index or the session
+ * is being destroyed.
+ */
+int uds_close_index(struct uds_index_session *index_session)
+{
+	int result = UDS_SUCCESS;
+
+	mutex_lock(&index_session->request_mutex);
+	/* Wait for any current index state change to complete. */
+	while (index_session->state & (IS_FLAG_WAITING | IS_FLAG_CLOSING)) {
+		uds_wait_cond(&index_session->request_cond,
+			      &index_session->request_mutex);
+	}
+
+	if (index_session->state & IS_FLAG_SUSPENDED) {
+		vdo_log_info("Index session is suspended");
+		result = -EBUSY;
+	} else if (!(index_session->state & IS_FLAG_DESTROYING) &&
+		   (index_session->state & IS_FLAG_LOADED)) {
+		index_session->state |= IS_FLAG_CLOSING;
+	} else {
+		/* The index doesn't exist, hasn't finished loading, or is being destroyed. */
+		result = UDS_NO_INDEX;
+	}
+	mutex_unlock(&index_session->request_mutex);
+
+	if (result != UDS_SUCCESS)
+		return uds_status_to_errno(result);
+
+	vdo_log_debug("Closing index");
+	wait_for_no_requests_in_progress(index_session);
+	result = save_and_free_index(index_session);
+	vdo_log_debug("Closed index");
+
+	/* Clear the closing flag and wake any thread waiting for the close to finish. */
+	mutex_lock(&index_session->request_mutex);
+	index_session->state &= ~IS_FLAG_CLOSING;
+	uds_broadcast_cond(&index_session->request_cond);
+	mutex_unlock(&index_session->request_mutex);
+	return uds_status_to_errno(result);
+}
+
+/*
+ * Destroy the session, saving and closing an open index first. Returns -EBUSY if the session is
+ * already being destroyed; otherwise the session is freed and must not be used again.
+ */
+int uds_destroy_index_session(struct uds_index_session *index_session)
+{
+	int result;
+	bool load_pending;
+
+	vdo_log_debug("Destroying index session");
+
+	mutex_lock(&index_session->request_mutex);
+	/* Wait for any current index state change to complete. */
+	while (index_session->state & (IS_FLAG_WAITING | IS_FLAG_CLOSING)) {
+		uds_wait_cond(&index_session->request_cond,
+			      &index_session->request_mutex);
+	}
+
+	if (index_session->state & IS_FLAG_DESTROYING) {
+		mutex_unlock(&index_session->request_mutex);
+		vdo_log_info("Index session is already closing");
+		return -EBUSY;
+	}
+
+	index_session->state |= IS_FLAG_DESTROYING;
+	/* A load that was suspended part-way must be told to stop before it can exit. */
+	load_pending = ((index_session->state & IS_FLAG_LOADING) &&
+			(index_session->state & IS_FLAG_SUSPENDED));
+	mutex_unlock(&index_session->request_mutex);
+
+	if (load_pending) {
+		struct index_load_context *context = &index_session->load_context;
+
+		/* Tell the index to terminate the rebuild. */
+		mutex_lock(&context->mutex);
+		if (context->status == INDEX_SUSPENDED) {
+			context->status = INDEX_FREEING;
+			uds_broadcast_cond(&context->cond);
+		}
+		mutex_unlock(&context->mutex);
+
+		/* Wait until the load exits before proceeding. */
+		mutex_lock(&index_session->request_mutex);
+		while (index_session->state & IS_FLAG_LOADING) {
+			uds_wait_cond(&index_session->request_cond,
+				      &index_session->request_mutex);
+		}
+		mutex_unlock(&index_session->request_mutex);
+	}
+
+	wait_for_no_requests_in_progress(index_session);
+	result = save_and_free_index(index_session);
+	uds_request_queue_finish(index_session->callback_queue);
+	index_session->callback_queue = NULL;
+	vdo_log_debug("Destroyed index session");
+	vdo_free(index_session);
+	return uds_status_to_errno(result);
+}
+
+/*
+ * Wait until all callbacks for index operations are complete: first until no requests are in
+ * progress, then until the index reports that it is idle. Always returns UDS_SUCCESS.
+ */
+int uds_flush_index_session(struct uds_index_session *index_session)
+{
+	struct uds_index *index;
+
+	wait_for_no_requests_in_progress(index_session);
+	index = index_session->index;
+	uds_wait_for_idle_index(index);
+	return UDS_SUCCESS;
+}
+
+/*
+ * Copy the session's request counters into the caller's stats structure, stamped with the
+ * current wall-clock time in seconds. Statistics collection is intended to be thread-safe, so
+ * each counter is read individually with READ_ONCE().
+ */
+static void collect_stats(const struct uds_index_session *index_session,
+			  struct uds_index_stats *stats)
+{
+	const struct session_stats *counters = &index_session->stats;
+
+	stats->current_time = ktime_to_seconds(current_time_ns(CLOCK_REALTIME));
+
+	/* Post requests; the open chapter count is reported as the "in memory" count. */
+	stats->posts_found = READ_ONCE(counters->posts_found);
+	stats->in_memory_posts_found = READ_ONCE(counters->posts_found_open_chapter);
+	stats->dense_posts_found = READ_ONCE(counters->posts_found_dense);
+	stats->sparse_posts_found = READ_ONCE(counters->posts_found_sparse);
+	stats->posts_not_found = READ_ONCE(counters->posts_not_found);
+
+	/* Update, delete, and query requests. */
+	stats->updates_found = READ_ONCE(counters->updates_found);
+	stats->updates_not_found = READ_ONCE(counters->updates_not_found);
+	stats->deletions_found = READ_ONCE(counters->deletions_found);
+	stats->deletions_not_found = READ_ONCE(counters->deletions_not_found);
+	stats->queries_found = READ_ONCE(counters->queries_found);
+	stats->queries_not_found = READ_ONCE(counters->queries_not_found);
+
+	stats->requests = READ_ONCE(counters->requests);
+}
+
+/*
+ * Fill in the statistics for a session. The session counters are always reported; the per-index
+ * values come from the index when one exists and are reported as zero otherwise. Returns -EINVAL
+ * if stats is NULL.
+ */
+int uds_get_index_session_stats(struct uds_index_session *index_session,
+				struct uds_index_stats *stats)
+{
+	if (stats == NULL) {
+		vdo_log_error("received a NULL index stats pointer");
+		return -EINVAL;
+	}
+
+	collect_stats(index_session, stats);
+
+	if (index_session->index == NULL) {
+		/* With no index there is nothing to measure. */
+		stats->entries_indexed = 0;
+		stats->memory_used = 0;
+		stats->collisions = 0;
+		stats->entries_discarded = 0;
+		return UDS_SUCCESS;
+	}
+
+	uds_get_index_stats(index_session->index, stats);
+	return UDS_SUCCESS;
+}
+
+/*
+ * Sleep on a condition variable. The mutex is released while the caller sleeps and is held
+ * again when this function returns. The callers in this file re-test their condition in a loop
+ * after each return.
+ */
+void uds_wait_cond(struct cond_var *cv, struct mutex *mutex)
+{
+	DEFINE_WAIT(wait_entry);
+
+	/*
+	 * Join the wait queue before releasing the mutex. NOTE(review): this ordering is presumably
+	 * what keeps a broadcast issued between the unlock and schedule() from being missed; confirm
+	 * against the wait queue documentation.
+	 */
+	prepare_to_wait(&cv->wait_queue, &wait_entry, TASK_IDLE);
+	mutex_unlock(mutex);
+	schedule();
+	finish_wait(&cv->wait_queue, &wait_entry);
+	mutex_lock(mutex);
+}
diff --git a/drivers/md/dm-vdo/indexer/index-session.h b/drivers/md/dm-vdo/indexer/index-session.h
new file mode 100644
index 000000000000..066648f6e062
--- /dev/null
+++ b/drivers/md/dm-vdo/indexer/index-session.h
@@ -0,0 +1,85 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#ifndef UDS_INDEX_SESSION_H
+#define UDS_INDEX_SESSION_H
+
+#include <linux/atomic.h>
+#include <linux/cache.h>
+
+#include "thread-utils.h"
+
+#include "config.h"
+#include "indexer.h"
+
+/*
+ * The index session mediates all interactions with a UDS index. Once the index session is created,
+ * it can be used to open, close, suspend, or recreate an index. It implements the majority of the
+ * functions in the top-level UDS API.
+ *
+ * If any deduplication request fails due to an internal error, the index is marked disabled. It
+ * will not accept any further requests and can only be closed. Closing the index will clear the
+ * disabled flag, and the index can then be reopened and recovered using the same index session.
+ */
+
+struct __aligned(L1_CACHE_BYTES) session_stats { /* Request counters kept for one index session */
+ /* Post requests that found an entry */
+ u64 posts_found;
+ /* Post requests found in the open chapter (reported as in_memory_posts_found) */
+ u64 posts_found_open_chapter;
+ /* Post requests found in the dense index */
+ u64 posts_found_dense;
+ /* Post requests found in the sparse index */
+ u64 posts_found_sparse;
+ /* Post requests that did not find an entry */
+ u64 posts_not_found;
+ /* Update requests that found an entry */
+ u64 updates_found;
+ /* Update requests that did not find an entry */
+ u64 updates_not_found;
+ /* Delete requests that found an entry */
+ u64 deletions_found;
+ /* Delete requests that did not find an entry */
+ u64 deletions_not_found;
+ /* Query requests that found an entry */
+ u64 queries_found;
+ /* Query requests that did not find an entry */
+ u64 queries_not_found;
+ /* Total number of requests */
+ u64 requests;
+};
+
+enum index_suspend_status { /* Stage of an index load, stored in struct index_load_context */
+ /*
+ * An index load has started but the index is not ready for use. This is also the value the
+ * session resets the status to after freeing an index.
+ */
+ INDEX_OPENING = 0,
+ /* The index is able to handle requests. */
+ INDEX_READY,
+ /* The index is attempting to suspend a rebuild; set by suspend_rebuild(). */
+ INDEX_SUSPENDING,
+ /* An index rebuild has been suspended; resuming the session sets INDEX_OPENING again. */
+ INDEX_SUSPENDED,
+ /* An index rebuild is being stopped in order to shut down; set when destroying the session. */
+ INDEX_FREEING,
+};
+
+struct index_load_context { /* Coordinates suspending and resuming an index load */
+ struct mutex mutex; /* Held by the session code whenever it reads or changes status */
+ struct cond_var cond; /* Waited on, and broadcast, when status changes */
+ enum index_suspend_status status; /* Current stage of the load */
+};
+
+struct uds_index_session { /* One session; see the description at the top of this file */
+ unsigned int state; /* Bitmask of IS_FLAG_* values; changed under request_mutex */
+ struct uds_index *index; /* The current index, or NULL when there is none */
+ struct uds_request_queue *callback_queue; /* Finished when the session is destroyed */
+ struct uds_parameters parameters; /* Copy of the parameters passed to uds_open_index() */
+ struct index_load_context load_context; /* Load status used to suspend or stop a rebuild */
+ struct mutex request_mutex; /* Protects state and request_count */
+ struct cond_var request_cond; /* Broadcast on state changes; also awaited for request_count */
+ int request_count; /* Requests in progress; waiters block while this is positive */
+ struct session_stats stats; /* Request counters reported by uds_get_index_session_stats() */
+};
+
+#endif /* UDS_INDEX_SESSION_H */
diff --git a/drivers/md/dm-vdo/indexer/index.c b/drivers/md/dm-vdo/indexer/index.c
new file mode 100644
index 000000000000..1ba767144426
--- /dev/null
+++ b/drivers/md/dm-vdo/indexer/index.c
@@ -0,0 +1,1388 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2023 Red Hat
+ */
+
+
+#include "index.h"
+
+#include "logger.h"
+#include "memory-alloc.h"
+
+#include "funnel-requestqueue.h"
+#include "hash-utils.h"
+#include "sparse-cache.h"
+
+static const u64 NO_LAST_SAVE = U64_MAX; /* NOTE(review): presumably a "never saved" sentinel; confirm at the use sites */
+
+/*
+ * When searching for deduplication records, the index first searches the volume index, and then
+ * searches the chapter index for the relevant chapter. If the chapter has been fully committed to
+ * storage, the chapter pages are loaded into the page cache. If the chapter has not yet been
+ * committed (either the open chapter or a recently closed one), the index searches the in-memory
+ * representation of the chapter. Finally, if the volume index does not find a record and the index
+ * is sparse, the index will search the sparse cache.
+ *
+ * The index sends two kinds of messages to coordinate between zones: chapter close messages for the
+ * chapter writer, and sparse cache barrier messages for the sparse cache.
+ *
+ * The chapter writer is responsible for committing chapters of records to storage. Since zones can
+ * get different numbers of records, some zones may fall behind others. Each time a zone fills up
+ * its available space in a chapter, it informs the chapter writer that the chapter is complete,
+ * and also informs all other zones that it has closed the chapter. Each other zone will then close
+ * the chapter immediately, regardless of how full it is, in order to minimize skew between zones.
+ * Once every zone has closed the chapter, the chapter writer will commit that chapter to storage.
+ *
+ * The last zone to close the chapter also removes the oldest chapter from the volume index.
+ * Although that chapter is invalid for zones that have moved on, the existence of the open chapter
+ * means that those zones will never ask the volume index about it. No zone is allowed to get more
+ * than one chapter ahead of any other. If a zone is so far ahead that it tries to close another
+ * chapter before the previous one has been closed by all zones, it is forced to wait.
+ *
+ * The sparse cache relies on having the same set of chapter indexes available to all zones. When a
+ * request wants to add a chapter to the sparse cache, it sends a barrier message to each zone
+ * during the triage stage that acts as a rendezvous. Once every zone has reached the barrier and
+ * paused its operations, the cache membership is changed and each zone is then informed that it
+ * can proceed. More details can be found in the sparse cache documentation.
+ *
+ * If a sparse cache has only one zone, it will not create a triage queue, but it still needs the
+ * barrier message to change the sparse cache membership, so the index simulates the message by
+ * invoking the handler directly.
+ */
+
+struct chapter_writer { /* State shared by the index zones and the chapter writing thread */
+ /* The index to which we belong */
+ struct uds_index *index;
+ /* The thread to do the writing */
+ struct thread *thread;
+ /* The lock protecting the following fields */
+ struct mutex mutex;
+ /* The condition signalled on state changes; zones also wait on it for a write to finish */
+ struct cond_var cond;
+ /* Set to true to stop the thread */
+ bool stop;
+ /* The result from the most recent write; checked by a zone before it swaps chapters */
+ int result;
+ /* The number of bytes allocated by the chapter writer */
+ size_t memory_size;
+ /* The number of zones which have submitted a chapter for writing */
+ unsigned int zones_to_write;
+ /* Open chapter index used by uds_close_open_chapter() */
+ struct open_chapter_index *open_chapter_index;
+ /* Collated records used by uds_close_open_chapter() */
+ struct uds_volume_record *collated_records;
+ /* The chapters to write (one per zone), indexed by zone number */
+ struct open_chapter_zone *chapters[];
+};
+
+/*
+ * Ask the volume geometry whether a virtual chapter counts as sparse, judged against the oldest
+ * and newest virtual chapters known to this zone.
+ */
+static bool is_zone_chapter_sparse(const struct index_zone *zone, u64 virtual_chapter)
+{
+	u64 oldest = zone->oldest_virtual_chapter;
+	u64 newest = zone->newest_virtual_chapter;
+
+	return uds_is_chapter_sparse(zone->index->volume->geometry, oldest, newest,
+				     virtual_chapter);
+}
+
+/*
+ * Allocate a request carrying the given zone message and enqueue it at STAGE_MESSAGE for the
+ * numbered zone. Returns the allocation error if the request cannot be allocated.
+ */
+static int launch_zone_message(struct uds_zone_message message, unsigned int zone,
+			       struct uds_index *index)
+{
+	struct uds_request *request;
+	int result = vdo_allocate(1, struct uds_request, __func__, &request);
+
+	if (result != VDO_SUCCESS)
+		return result;
+
+	request->zone_message = message;
+	request->zone_number = zone;
+	request->index = index;
+	request->unbatched = true;
+
+	uds_enqueue_request(request, STAGE_MESSAGE);
+	return UDS_SUCCESS;
+}
+
+/*
+ * Send a sparse cache barrier message for the given virtual chapter to every zone of the index.
+ * A message that cannot be launched is only reported through the assertion log; the remaining
+ * zones are still sent their messages.
+ */
+static void enqueue_barrier_messages(struct uds_index *index, u64 virtual_chapter)
+{
+	unsigned int zone = 0;
+	struct uds_zone_message message = {
+		.type = UDS_MESSAGE_SPARSE_CACHE_BARRIER,
+		.virtual_chapter = virtual_chapter,
+	};
+
+	while (zone < index->zone_count) {
+		int result = launch_zone_message(message, zone, index);
+
+		VDO_ASSERT_LOG_ONLY((result == UDS_SUCCESS), "barrier message allocation");
+		zone++;
+	}
+}
+
+/*
+ * Determine whether this request should trigger a sparse cache barrier message to change the
+ * membership of the sparse cache. If a change in membership is desired, the function returns the
+ * chapter number to add; otherwise it returns NO_CHAPTER.
+ */
+static u64 triage_index_request(struct uds_index *index, struct uds_request *request)
+{
+	u64 virtual_chapter = uds_lookup_volume_index_name(index->volume_index,
+							   &request->record_name);
+
+	/*
+	 * FIXME: Optimize for a common case by remembering the chapter from the most recent
+	 * barrier message and skipping this chapter if it is the same.
+	 */
+
+	/* Only a name found in a chapter that is sparse for this zone warrants a barrier. */
+	if ((virtual_chapter != NO_CHAPTER) &&
+	    is_zone_chapter_sparse(index->zones[request->zone_number], virtual_chapter))
+		return virtual_chapter;
+
+	return NO_CHAPTER;
+}
+
+/*
+ * Simulate a message to change the sparse cache membership for a single-zone sparse index. This
+ * allows us to forgo the complicated locking required by a multi-zone sparse index. Any other kind
+ * of index does nothing here.
+ */
+static int simulate_index_zone_barrier_message(struct index_zone *zone,
+					       struct uds_request *request)
+{
+	struct uds_index *index = zone->index;
+	u64 chapter_to_add;
+
+	if (index->zone_count > 1)
+		return UDS_SUCCESS;
+
+	if (!uds_is_sparse_index_geometry(index->volume->geometry))
+		return UDS_SUCCESS;
+
+	chapter_to_add = triage_index_request(index, request);
+	if (chapter_to_add != NO_CHAPTER)
+		return uds_update_sparse_cache(zone, chapter_to_add);
+
+	return UDS_SUCCESS;
+}
+
+/*
+ * This is the request processing function for the triage queue. It sends barrier messages to
+ * all zones when the request calls for a sparse cache change, then passes the request on to
+ * STAGE_INDEX.
+ */
+static void triage_request(struct uds_request *request)
+{
+	struct uds_index *index = request->index;
+	u64 chapter_to_add = triage_index_request(index, request);
+
+	if (chapter_to_add != NO_CHAPTER)
+		enqueue_barrier_messages(index, chapter_to_add);
+
+	uds_enqueue_request(request, STAGE_INDEX);
+}
+
+/*
+ * Wait on the chapter writer until the index's newest virtual chapter has reached the given
+ * chapter number, then report the writer's most recent result. A failed write is logged and its
+ * error returned.
+ */
+static int finish_previous_chapter(struct uds_index *index, u64 current_chapter_number)
+{
+	struct chapter_writer *writer = index->chapter_writer;
+	int write_result;
+
+	mutex_lock(&writer->mutex);
+	while (index->newest_virtual_chapter < current_chapter_number)
+		uds_wait_cond(&writer->cond, &writer->mutex);
+	write_result = writer->result;
+	mutex_unlock(&writer->mutex);
+
+	if (write_result == UDS_SUCCESS)
+		return UDS_SUCCESS;
+
+	return vdo_log_error_strerror(write_result,
+				      "Writing of previous open chapter failed");
+}
+
+/*
+ * Exchange the zone's open chapter and writing chapter buffers, after first waiting for the
+ * previous chapter write to finish. Nothing is swapped if that write failed.
+ */
+static int swap_open_chapter(struct index_zone *zone)
+{
+	int result = finish_previous_chapter(zone->index, zone->newest_virtual_chapter);
+
+	if (result == UDS_SUCCESS) {
+		struct open_chapter_zone *filled_chapter = zone->open_chapter;
+
+		zone->open_chapter = zone->writing_chapter;
+		zone->writing_chapter = filled_chapter;
+	}
+
+	return result;
+}
+
+/*
+ * Inform the chapter writer that this zone is done with this chapter. The chapter won't start
+ * writing until all zones have closed it. Returns the number of zones that have submitted the
+ * chapter so far, including this one.
+ */
+static unsigned int start_closing_chapter(struct uds_index *index,
+					  unsigned int zone_number,
+					  struct open_chapter_zone *chapter)
+{
+	struct chapter_writer *writer = index->chapter_writer;
+	unsigned int submitted;
+
+	mutex_lock(&writer->mutex);
+	writer->chapters[zone_number] = chapter;
+	writer->zones_to_write++;
+	submitted = writer->zones_to_write;
+	uds_broadcast_cond(&writer->cond);
+	mutex_unlock(&writer->mutex);
+
+	return submitted;
+}
+
+/*
+ * Send a chapter closed message for the given chapter to every zone except the sender. Stops at
+ * the first message that cannot be launched and returns that error.
+ */
+static int announce_chapter_closed(struct index_zone *zone, u64 closed_chapter)
+{
+	struct uds_index *index = zone->index;
+	unsigned int other;
+	struct uds_zone_message zone_message = {
+		.type = UDS_MESSAGE_ANNOUNCE_CHAPTER_CLOSED,
+		.virtual_chapter = closed_chapter,
+	};
+
+	for (other = 0; other < index->zone_count; other++) {
+		if (other != zone->id) {
+			int result = launch_zone_message(zone_message, other, index);
+
+			if (result != UDS_SUCCESS)
+				return result;
+		}
+	}
+
+	return UDS_SUCCESS;
+}
+
+/*
+ * Close this zone's open chapter and start a new one. The filled chapter is handed to the chapter
+ * writer. In a multi-zone index, the first zone to close a chapter announces it to the other
+ * zones, and the last zone to close it forgets the chapters that have expired from the volume.
+ */
+static int open_next_chapter(struct index_zone *zone)
+{
+	struct uds_index *index = zone->index;
+	int result;
+	u64 closed_chapter;
+	u64 expiring;
+	unsigned int finished_zones;
+	u32 expire_chapters;
+
+	vdo_log_debug("closing chapter %llu of zone %u after %u entries (%u short)",
+		      (unsigned long long) zone->newest_virtual_chapter, zone->id,
+		      zone->open_chapter->size,
+		      zone->open_chapter->capacity - zone->open_chapter->size);
+
+	result = swap_open_chapter(zone);
+	if (result != UDS_SUCCESS)
+		return result;
+
+	/* After the swap, writing_chapter holds the filled chapter and open_chapter is reused. */
+	closed_chapter = zone->newest_virtual_chapter;
+	zone->newest_virtual_chapter++;
+	uds_set_volume_index_zone_open_chapter(index->volume_index, zone->id,
+					       zone->newest_virtual_chapter);
+	uds_reset_open_chapter(zone->open_chapter);
+
+	finished_zones = start_closing_chapter(index, zone->id, zone->writing_chapter);
+	if ((finished_zones == 1) && (index->zone_count > 1)) {
+		result = announce_chapter_closed(zone, closed_chapter);
+		if (result != UDS_SUCCESS)
+			return result;
+	}
+
+	expiring = zone->oldest_virtual_chapter;
+	expire_chapters = uds_chapters_to_expire(index->volume->geometry,
+						 zone->newest_virtual_chapter);
+	zone->oldest_virtual_chapter += expire_chapters;
+
+	/* Only the last zone to close the chapter forgets the expired chapters. */
+	if (finished_zones >= index->zone_count) {
+		for (; expire_chapters > 0; expire_chapters--)
+			uds_forget_chapter(index->volume, expiring++);
+	}
+
+	return UDS_SUCCESS;
+}
+
+/*
+ * React to another zone announcing that it closed a chapter. If that chapter is still this
+ * zone's newest, close it here as well; otherwise there is nothing to do.
+ */
+static int handle_chapter_closed(struct index_zone *zone, u64 virtual_chapter)
+{
+	if (zone->newest_virtual_chapter != virtual_chapter)
+		return UDS_SUCCESS;
+
+	return open_next_chapter(zone);
+}
+
+/*
+ * Act on the zone message carried by a request, in the zone the request names. An unrecognized
+ * message type is logged and rejected with UDS_INVALID_ARGUMENT.
+ */
+static int dispatch_index_zone_control_request(struct uds_request *request)
+{
+	struct index_zone *zone = request->index->zones[request->zone_number];
+	struct uds_zone_message *message = &request->zone_message;
+	int result;
+
+	switch (message->type) {
+	case UDS_MESSAGE_SPARSE_CACHE_BARRIER:
+		result = uds_update_sparse_cache(zone, message->virtual_chapter);
+		break;
+
+	case UDS_MESSAGE_ANNOUNCE_CHAPTER_CLOSED:
+		result = handle_chapter_closed(zone, message->virtual_chapter);
+		break;
+
+	default:
+		vdo_log_error("invalid message type: %d", message->type);
+		result = UDS_INVALID_ARGUMENT;
+		break;
+	}
+
+	return result;
+}
+
+/*
+ * Record where a request's record was located. The request counts as found only for the open
+ * chapter, dense, and sparse locations.
+ */
+static void set_request_location(struct uds_request *request,
+				 enum uds_index_region new_location)
+{
+	request->location = new_location;
+
+	switch (new_location) {
+	case UDS_LOCATION_IN_OPEN_CHAPTER:
+	case UDS_LOCATION_IN_DENSE:
+	case UDS_LOCATION_IN_SPARSE:
+		request->found = true;
+		break;
+
+	default:
+		request->found = false;
+		break;
+	}
+}
+
+/*
+ * Mark a request as found and classify the chapter holding its record: the open chapter if it is
+ * the zone's newest chapter, otherwise sparse or dense according to is_zone_chapter_sparse().
+ */
+static void set_chapter_location(struct uds_request *request,
+				 const struct index_zone *zone, u64 virtual_chapter)
+{
+	enum uds_index_region location;
+
+	if (virtual_chapter == zone->newest_virtual_chapter)
+		location = UDS_LOCATION_IN_OPEN_CHAPTER;
+	else if (is_zone_chapter_sparse(zone, virtual_chapter))
+		location = UDS_LOCATION_IN_SPARSE;
+	else
+		location = UDS_LOCATION_IN_DENSE;
+
+	request->location = location;
+	request->found = true;
+}
+
+/*
+ * Look for the request's record name in the sparse cache and, if a chapter is identified, search
+ * the cached record page for the record itself.
+ *
+ * Note that *found is written only by the record page search. When the sparse cache search fails
+ * or identifies no chapter, this function returns without touching *found, so callers must
+ * initialize it.
+ */
+static int search_sparse_cache_in_zone(struct index_zone *zone, struct uds_request *request,
+				       u64 virtual_chapter, bool *found)
+{
+	struct volume *volume = zone->index->volume;
+	u16 record_page_number;
+	u32 physical_chapter;
+	int result;
+
+	result = uds_search_sparse_cache(zone, &request->record_name, &virtual_chapter,
+					 &record_page_number);
+	if (result != UDS_SUCCESS)
+		return result;
+
+	if (virtual_chapter == NO_CHAPTER)
+		return result;
+
+	request->virtual_chapter = virtual_chapter;
+	physical_chapter = uds_map_to_physical_chapter(volume->geometry, virtual_chapter);
+	return uds_search_cached_record_page(volume, request, physical_chapter,
+					     record_page_number, found);
+}
+
+/*
+ * Search for the request's record in the chapter named by request->virtual_chapter. A location
+ * already recorded on the request settles the answer without a search. Otherwise the open
+ * chapter, the chapter still being written, the sparse cache, and the volume page cache are
+ * tried, in that order of preference.
+ */
+static int get_record_from_zone(struct index_zone *zone, struct uds_request *request,
+				bool *found)
+{
+	struct volume *volume;
+	u64 newest = zone->newest_virtual_chapter;
+
+	switch (request->location) {
+	case UDS_LOCATION_RECORD_PAGE_LOOKUP:
+		*found = true;
+		return UDS_SUCCESS;
+
+	case UDS_LOCATION_UNAVAILABLE:
+		*found = false;
+		return UDS_SUCCESS;
+
+	default:
+		break;
+	}
+
+	if (request->virtual_chapter == newest) {
+		uds_search_open_chapter(zone->open_chapter, &request->record_name,
+					&request->old_metadata, found);
+		return UDS_SUCCESS;
+	}
+
+	/* The previous chapter may still be in memory, waiting to be written. */
+	if ((newest > 0) && (request->virtual_chapter == (newest - 1)) &&
+	    (zone->writing_chapter->size > 0)) {
+		uds_search_open_chapter(zone->writing_chapter, &request->record_name,
+					&request->old_metadata, found);
+		return UDS_SUCCESS;
+	}
+
+	volume = zone->index->volume;
+	if (is_zone_chapter_sparse(zone, request->virtual_chapter) &&
+	    uds_sparse_cache_contains(volume->sparse_cache, request->virtual_chapter,
+				      request->zone_number)) {
+		return search_sparse_cache_in_zone(zone, request,
+						   request->virtual_chapter, found);
+	}
+
+	return uds_search_volume_page_cache(volume, request, found);
+}
+
+/*
+ * Add a record to the zone's open chapter. When uds_put_open_chapter() reports that no space
+ * remains, the chapter is closed and the next one opened.
+ */
+static int put_record_in_zone(struct index_zone *zone, struct uds_request *request,
+			      const struct uds_record_data *metadata)
+{
+	unsigned int space_left = uds_put_open_chapter(zone->open_chapter,
+						       &request->record_name, metadata);
+
+	if (space_left != 0)
+		return UDS_SUCCESS;
+
+	return open_next_chapter(zone);
+}
+
+/*
+ * Handle a post, update, or query request in its zone: look the name up in the volume index,
+ * resolve it to an actual record where possible, and, unless the request type forbids updates,
+ * point the volume index at the open chapter and add the record there.
+ */
+static int search_index_zone(struct index_zone *zone, struct uds_request *request)
+{
+	int result;
+	struct volume_index_record record;
+	bool found = false;
+	bool overflowed;
+	u64 open_chapter;
+
+	result = uds_get_volume_index_record(zone->index->volume_index,
+					     &request->record_name, &record);
+	if (result != UDS_SUCCESS)
+		return result;
+
+	if (record.is_found) {
+		/* A requeued request's saved location only applies to the chapter it was for. */
+		if (request->requeued && request->virtual_chapter != record.virtual_chapter)
+			set_request_location(request, UDS_LOCATION_UNKNOWN);
+
+		request->virtual_chapter = record.virtual_chapter;
+		result = get_record_from_zone(zone, request, &found);
+		if (result != UDS_SUCCESS)
+			return result;
+	}
+
+	if (found)
+		set_chapter_location(request, zone, record.virtual_chapter);
+
+	/*
+	 * If a record has overflowed a chapter index in more than one chapter (or overflowed in
+	 * one chapter and collided with an existing record), it will exist as a collision record
+	 * in the volume index, but we won't find it in the volume. This case needs special
+	 * handling.
+	 */
+	overflowed = (record.is_found && record.is_collision && !found);
+	open_chapter = zone->newest_virtual_chapter;
+	if (found || overflowed) {
+		if ((request->type == UDS_QUERY_NO_UPDATE) ||
+		    ((request->type == UDS_QUERY) && overflowed)) {
+			/* There is nothing left to do. */
+			return UDS_SUCCESS;
+		}
+
+		if (record.virtual_chapter != open_chapter) {
+			/*
+			 * Update the volume index to reference the new chapter for the block. If
+			 * the record had been deleted or dropped from the chapter index, it will
+			 * be back.
+			 */
+			result = uds_set_volume_index_record_chapter(&record, open_chapter);
+		} else if (request->type != UDS_UPDATE) {
+			/* The record is already in the open chapter. */
+			return UDS_SUCCESS;
+		}
+	} else {
+		/*
+		 * The record wasn't in the volume index, so check whether the
+		 * name is in a cached sparse chapter. If we found the name on
+		 * a previous search, use that result instead.
+		 */
+		switch (request->location) {
+		case UDS_LOCATION_RECORD_PAGE_LOOKUP:
+			found = true;
+			break;
+
+		case UDS_LOCATION_UNAVAILABLE:
+			found = false;
+			break;
+
+		default:
+			if (uds_is_sparse_index_geometry(zone->index->volume->geometry) &&
+			    !uds_is_volume_index_sample(zone->index->volume_index,
+							&request->record_name)) {
+				result = search_sparse_cache_in_zone(zone, request,
+								     NO_CHAPTER, &found);
+				if (result != UDS_SUCCESS)
+					return result;
+			}
+			break;
+		}
+
+		if (found)
+			set_request_location(request, UDS_LOCATION_IN_SPARSE);
+
+		if ((request->type == UDS_QUERY_NO_UPDATE) ||
+		    ((request->type == UDS_QUERY) && !found)) {
+			/* There is nothing left to do. */
+			return UDS_SUCCESS;
+		}
+
+		/*
+		 * Add a new entry to the volume index referencing the open chapter. This needs to
+		 * be done both for new records, and for records from cached sparse chapters.
+		 */
+		result = uds_put_volume_index_record(&record, open_chapter);
+	}
+
+	if (result == UDS_OVERFLOW) {
+		/*
+		 * The volume index encountered a delta list overflow. The condition was already
+		 * logged. We will go on without adding the record to the open chapter.
+		 */
+		return UDS_SUCCESS;
+	}
+
+	if (result != UDS_SUCCESS)
+		return result;
+
+	/*
+	 * A new record, or an update to an existing one, stores the new metadata. Otherwise the
+	 * existing record is moved to the open chapter with its old metadata.
+	 */
+	return put_record_in_zone(zone, request,
+				  (!found || (request->type == UDS_UPDATE)) ?
+				  &request->new_metadata : &request->old_metadata);
+}
+
+/*
+ * Handle a delete request in its zone. If the volume index has an entry for the name (and, for a
+ * non-collision entry, the record really is in the indicated chapter), the entry is removed from
+ * the volume index and the record is removed from the open chapter if it is there. A name that
+ * is not present is not an error.
+ */
+static int remove_from_index_zone(struct index_zone *zone, struct uds_request *request)
+{
+	int result;
+	struct volume_index_record record;
+
+	result = uds_get_volume_index_record(zone->index->volume_index,
+					     &request->record_name, &record);
+	if (result != UDS_SUCCESS)
+		return result;
+
+	if (!record.is_found)
+		return UDS_SUCCESS;
+
+	if (!record.is_collision) {
+		/*
+		 * Non-collision records are hints, so resolve the name in the chapter.
+		 *
+		 * Start from "not found": the sparse cache search can return success without
+		 * writing to this flag, and it must not be read while indeterminate.
+		 */
+		bool found = false;
+
+		/* If the request was requeued, check whether the saved state is still valid. */
+		if (request->requeued && request->virtual_chapter != record.virtual_chapter)
+			set_request_location(request, UDS_LOCATION_UNKNOWN);
+
+		request->virtual_chapter = record.virtual_chapter;
+		result = get_record_from_zone(zone, request, &found);
+		if (result != UDS_SUCCESS)
+			return result;
+
+		if (!found) {
+			/* There is no record to remove. */
+			return UDS_SUCCESS;
+		}
+	}
+
+	/* Collision records are exact matches, so they reach this point without a search. */
+	set_chapter_location(request, zone, record.virtual_chapter);
+
+	/*
+	 * Delete the volume index entry for the named record only. Note that a later search might
+	 * return stale advice if there is a colliding name in the same chapter, but it's a very
+	 * rare case (1 in 2^21).
+	 */
+	result = uds_remove_volume_index_record(&record);
+	if (result != UDS_SUCCESS)
+		return result;
+
+	/*
+	 * If the record is in the open chapter, we must remove it or mark it deleted to avoid
+	 * trouble if the record is added again later.
+	 */
+	if (request->location == UDS_LOCATION_IN_OPEN_CHAPTER)
+		uds_remove_from_open_chapter(zone->open_chapter, &request->record_name);
+
+	return UDS_SUCCESS;
+}
+
+/*
+ * Route a request to the handler for its type within the request's zone. A request that has
+ * not been requeued first goes through simulate_index_zone_barrier_message(). Returns the
+ * handler's result, or the logged UDS_INVALID_ARGUMENT result for an unknown request type.
+ */
+static int dispatch_index_request(struct uds_index *index, struct uds_request *request)
+{
+	struct index_zone *zone = index->zones[request->zone_number];
+
+	if (!request->requeued) {
+		int barrier_result = simulate_index_zone_barrier_message(zone, request);
+
+		if (barrier_result != UDS_SUCCESS)
+			return barrier_result;
+	}
+
+	switch (request->type) {
+	case UDS_POST:
+	case UDS_UPDATE:
+	case UDS_QUERY:
+	case UDS_QUERY_NO_UPDATE:
+		/* All of these request types begin with a search of the zone. */
+		return search_index_zone(zone, request);
+
+	case UDS_DELETE:
+		return remove_from_index_zone(zone, request);
+
+	default:
+		return vdo_log_warning_strerror(UDS_INVALID_ARGUMENT,
+						"invalid request type: %d",
+						request->type);
+	}
+}
+
+/*
+ * This is the request processing function invoked by each zone's thread. Zone control
+ * messages are dispatched and then freed here. Every other request is dispatched to the index
+ * and, unless it was requeued, completed by invoking the index callback.
+ */
+static void execute_zone_request(struct uds_request *request)
+{
+ int result;
+ struct uds_index *index = request->index;
+
+ if (request->zone_message.type != UDS_MESSAGE_NONE) {
+ result = dispatch_index_zone_control_request(request);
+ if (result != UDS_SUCCESS) {
+ vdo_log_error_strerror(result, "error executing message: %d",
+ request->zone_message.type);
+ }
+
+ /* Once the message is processed it can be freed. */
+ vdo_free(vdo_forget(request));
+ return;
+ }
+
+ /* Any non-message request marks the index as needing a save. */
+ index->need_to_save = true;
+ if (request->requeued && (request->status != UDS_SUCCESS)) {
+ /* A requeued request that already carries an error is completed without dispatch. */
+ set_request_location(request, UDS_LOCATION_UNAVAILABLE);
+ index->callback(request);
+ return;
+ }
+
+ result = dispatch_index_request(index, request);
+ if (result == UDS_QUEUED) {
+ /* The request has been requeued so don't let it complete. */
+ return;
+ }
+
+ if (!request->found)
+ set_request_location(request, UDS_LOCATION_UNAVAILABLE);
+
+ /* Record the dispatch result in the request before handing it to the callback. */
+ request->status = result;
+ index->callback(request);
+}
+
+/*
+ * Create one request queue per zone, plus a triage queue when the index has more than one
+ * zone and a sparse geometry. Returns the first queue creation error, or UDS_SUCCESS.
+ */
+static int initialize_index_queues(struct uds_index *index,
+				   const struct index_geometry *geometry)
+{
+	unsigned int zone;
+	int result;
+
+	for (zone = 0; zone < index->zone_count; zone++) {
+		result = uds_make_request_queue("indexW", &execute_zone_request,
+						&index->zone_queues[zone]);
+		if (result != UDS_SUCCESS)
+			return result;
+	}
+
+	/* Only sparse multi-zone indexes need a triage queue. */
+	if ((index->zone_count <= 1) || !uds_is_sparse_index_geometry(geometry))
+		return UDS_SUCCESS;
+
+	result = uds_make_request_queue("triageW", &triage_request,
+					&index->triage_queue);
+	if (result != UDS_SUCCESS)
+		return result;
+
+	return UDS_SUCCESS;
+}
+
+/*
+ * This is the driver function for the chapter writer thread. It sleeps until every zone has
+ * submitted its part of the open chapter (zones_to_write reaches zone_count), writes the
+ * chapter, advances the index's chapter range, and then wakes any waiters. It exits once
+ * writer->stop is set and no zones are waiting to write.
+ */
+static void close_chapters(void *arg)
+{
+ int result;
+ struct chapter_writer *writer = arg;
+ struct uds_index *index = writer->index;
+
+ vdo_log_debug("chapter writer starting");
+ mutex_lock(&writer->mutex);
+ for (;;) {
+ while (writer->zones_to_write < index->zone_count) {
+ if (writer->stop && (writer->zones_to_write == 0)) {
+ /*
+ * We've been told to stop, and all of the zones are in the same
+ * open chapter, so we can exit now.
+ */
+ mutex_unlock(&writer->mutex);
+ vdo_log_debug("chapter writer stopping");
+ return;
+ }
+ uds_wait_cond(&writer->cond, &writer->mutex);
+ }
+
+ /*
+ * Release the lock while closing a chapter. We probably don't need to do this, but
+ * it seems safer in principle. It's OK to access the chapter and chapter_number
+ * fields without the lock since those aren't allowed to change until we're done.
+ */
+ mutex_unlock(&writer->mutex);
+
+ if (index->has_saved_open_chapter) {
+ /*
+ * Remove the saved open chapter the first time we close an open chapter
+ * after loading from a clean shutdown, or after doing a clean save. The
+ * lack of the saved open chapter will indicate that a recovery is
+ * necessary.
+ */
+ index->has_saved_open_chapter = false;
+ /*
+ * NOTE(review): a failure to discard is not reported; result is overwritten
+ * by the close below. Confirm this is intended.
+ */
+ result = uds_discard_open_chapter(index->layout);
+ if (result == UDS_SUCCESS)
+ vdo_log_debug("Discarding saved open chapter");
+ }
+
+ result = uds_close_open_chapter(writer->chapters, index->zone_count,
+ index->volume,
+ writer->open_chapter_index,
+ writer->collated_records,
+ index->newest_virtual_chapter);
+
+ /* Retake the lock to publish the result and the new chapter range to waiters. */
+ mutex_lock(&writer->mutex);
+ index->newest_virtual_chapter++;
+ index->oldest_virtual_chapter +=
+ uds_chapters_to_expire(index->volume->geometry,
+ index->newest_virtual_chapter);
+ writer->result = result;
+ writer->zones_to_write = 0;
+ uds_broadcast_cond(&writer->cond);
+ }
+}
+
+/*
+ * Ask the chapter writer thread to stop and wait for it to exit. The thread pointer is
+ * cleared under the mutex, so a later call finds no thread and does nothing.
+ */
+static void stop_chapter_writer(struct chapter_writer *writer)
+{
+	struct thread *thread_to_join;
+
+	mutex_lock(&writer->mutex);
+	thread_to_join = writer->thread;
+	if (thread_to_join != NULL) {
+		writer->thread = NULL;
+		writer->stop = true;
+		uds_broadcast_cond(&writer->cond);
+	}
+	mutex_unlock(&writer->mutex);
+
+	/* Join outside the lock; the writer thread takes the same mutex. */
+	if (thread_to_join == NULL)
+		return;
+
+	vdo_join_threads(thread_to_join);
+}
+
+/* Stop the chapter writer thread, then release the writer and the buffers it owns. */
+static void free_chapter_writer(struct chapter_writer *writer)
+{
+	if (writer != NULL) {
+		stop_chapter_writer(writer);
+		uds_free_open_chapter_index(writer->open_chapter_index);
+		vdo_free(writer->collated_records);
+		vdo_free(writer);
+	}
+}
+
+/*
+ * Allocate a chapter writer for the index, along with its collated record buffer and open
+ * chapter index, and start the writer thread. On success the new writer is stored in
+ * *writer_ptr; on failure everything allocated here is released and the error is returned.
+ */
+static int make_chapter_writer(struct uds_index *index,
+			       struct chapter_writer **writer_ptr)
+{
+	int result;
+	struct chapter_writer *writer;
+	size_t collated_records_size =
+		(sizeof(struct uds_volume_record) * index->volume->geometry->records_per_chapter);
+
+	result = vdo_allocate_extended(struct chapter_writer, index->zone_count,
+				       struct open_chapter_zone *, "Chapter Writer",
+				       &writer);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	writer->index = index;
+	mutex_init(&writer->mutex);
+	uds_init_cond(&writer->cond);
+
+	result = vdo_allocate_cache_aligned(collated_records_size, "collated records",
+					    &writer->collated_records);
+	if (result != VDO_SUCCESS)
+		goto fail;
+
+	result = uds_make_open_chapter_index(&writer->open_chapter_index,
+					     index->volume->geometry,
+					     index->volume->nonce);
+	if (result != UDS_SUCCESS)
+		goto fail;
+
+	/* Account for the writer, its per-zone pointer array, and both buffers. */
+	writer->memory_size = (sizeof(struct chapter_writer) +
+			       index->zone_count * sizeof(struct open_chapter_zone *) +
+			       collated_records_size +
+			       writer->open_chapter_index->memory_size);
+
+	result = vdo_create_thread(close_chapters, writer, "writer", &writer->thread);
+	if (result != VDO_SUCCESS)
+		goto fail;
+
+	*writer_ptr = writer;
+	return UDS_SUCCESS;
+
+fail:
+	free_chapter_writer(writer);
+	return result;
+}
+
+/*
+ * Load the saved index state from the index layout.
+ *
+ * Returns UDS_SUCCESS if the saved state was loaded, -ENOMEM if the load failed for lack of
+ * memory, or UDS_INDEX_NOT_SAVED_CLEANLY for any other load failure.
+ */
+static int load_index(struct uds_index *index)
+{
+	int result;
+	u64 last_save_chapter;
+
+	result = uds_load_index_state(index->layout, index);
+	if (result == -ENOMEM) {
+		/*
+		 * Pass allocation failures through unchanged. The caller has a distinct
+		 * -ENOMEM case that declines to rebuild, which can never be reached if every
+		 * failure is reported as an unclean save.
+		 */
+		return result;
+	}
+
+	if (result != UDS_SUCCESS)
+		return UDS_INDEX_NOT_SAVED_CLEANLY;
+
+	/* Report chapter 0 when the loaded state records no previous save. */
+	last_save_chapter = ((index->last_save != NO_LAST_SAVE) ? index->last_save : 0);
+
+	vdo_log_info("loaded index from chapter %llu through chapter %llu",
+		     (unsigned long long) index->oldest_virtual_chapter,
+		     (unsigned long long) last_save_chapter);
+
+	return UDS_SUCCESS;
+}
+
+/*
+ * Rebuild the index page map entries for one chapter by reading each of its index pages in
+ * order. Each page must begin with the delta list that follows the previous page's last
+ * list; otherwise the chapter is reported as UDS_CORRUPT_DATA.
+ */
+static int rebuild_index_page_map(struct uds_index *index, u64 vcn)
+{
+	struct index_geometry *geometry = index->volume->geometry;
+	u32 physical = uds_map_to_physical_chapter(geometry, vcn);
+	u32 next_list = 0;
+	u32 page;
+
+	for (page = 0; page < geometry->index_pages_per_chapter; page++) {
+		struct delta_index_page *index_page;
+		u32 first_list;
+		u32 last_list;
+		int result;
+
+		result = uds_get_volume_index_page(index->volume, physical, page,
+						   &index_page);
+		if (result != UDS_SUCCESS) {
+			return vdo_log_error_strerror(result,
+						      "failed to read index page %u in chapter %u",
+						      page, physical);
+		}
+
+		first_list = index_page->lowest_list_number;
+		last_list = index_page->highest_list_number;
+		if (first_list != next_list) {
+			return vdo_log_error_strerror(UDS_CORRUPT_DATA,
+						      "chapter %u index page %u is corrupt",
+						      physical, page);
+		}
+
+		uds_update_index_page_map(index->volume->index_page_map, vcn, physical,
+					  page, last_list);
+		next_list = last_list + 1;
+	}
+
+	return UDS_SUCCESS;
+}
+
+/*
+ * Re-add one record name, read from the given virtual chapter, to the volume index during a
+ * rebuild. Non-sample names in chapters that will be sparse are skipped. An existing volume
+ * index entry is either redirected to this chapter or left alone and a new entry added,
+ * depending on whether it is judged to refer to the same name. UDS_DUPLICATE_NAME and
+ * UDS_OVERFLOW from the volume index are treated as success.
+ */
+static int replay_record(struct uds_index *index, const struct uds_record_name *name,
+ u64 virtual_chapter, bool will_be_sparse_chapter)
+{
+ int result;
+ struct volume_index_record record;
+ bool update_record;
+
+ if (will_be_sparse_chapter &&
+ !uds_is_volume_index_sample(index->volume_index, name)) {
+ /*
+ * This entry will be in a sparse chapter after the rebuild completes, and it is
+ * not a sample, so just skip over it.
+ */
+ return UDS_SUCCESS;
+ }
+
+ result = uds_get_volume_index_record(index->volume_index, name, &record);
+ if (result != UDS_SUCCESS)
+ return result;
+
+ if (record.is_found) {
+ if (record.is_collision) {
+ if (record.virtual_chapter == virtual_chapter) {
+ /* The record is already correct. */
+ return UDS_SUCCESS;
+ }
+
+ /* A collision record names this exact record, so just move it. */
+ update_record = true;
+ } else if (record.virtual_chapter == virtual_chapter) {
+ /*
+ * There is a volume index entry pointing to the current chapter, but we
+ * don't know if it is for the same name as the one we are currently
+ * working on or not. For now, we're just going to assume that it isn't.
+ * This will create one extra collision record if there was a deleted
+ * record in the current chapter.
+ */
+ update_record = false;
+ } else {
+ /*
+ * If we're rebuilding, we don't normally want to go to disk to see if the
+ * record exists, since we will likely have just read the record from disk
+ * (i.e. we know it's there). The exception to this is when we find an
+ * entry in the volume index that has a different chapter. In this case, we
+ * need to search that chapter to determine if the volume index entry was
+ * for the same record or a different one.
+ */
+ result = uds_search_volume_page_cache_for_rebuild(index->volume,
+ name,
+ record.virtual_chapter,
+ &update_record);
+ if (result != UDS_SUCCESS)
+ return result;
+ }
+ } else {
+ update_record = false;
+ }
+
+ if (update_record) {
+ /*
+ * Update the volume index to reference the new chapter for the block. If the
+ * record had been deleted or dropped from the chapter index, it will be back.
+ */
+ result = uds_set_volume_index_record_chapter(&record, virtual_chapter);
+ } else {
+ /*
+ * Add a new entry to the volume index referencing the open chapter. This should be
+ * done regardless of whether we are a brand new record or a sparse record, i.e.
+ * one that doesn't exist in the index but does on disk, since for a sparse record,
+ * we would want to un-sparsify if it did exist.
+ */
+ result = uds_put_volume_index_record(&record, virtual_chapter);
+ }
+
+ if ((result == UDS_DUPLICATE_NAME) || (result == UDS_OVERFLOW)) {
+ /* The rebuilt index will lose these records. */
+ return UDS_SUCCESS;
+ }
+
+ return result;
+}
+
+/*
+ * If the load context says a suspend has been requested, acknowledge it and block until the
+ * status changes to INDEX_OPENING or INDEX_FREEING. Returns true only if the wait ended
+ * because the status became INDEX_FREEING; returns false if there is no load context or no
+ * suspend was pending.
+ */
+static bool check_for_suspend(struct uds_index *index)
+{
+ bool closing;
+
+ if (index->load_context == NULL)
+ return false;
+
+ mutex_lock(&index->load_context->mutex);
+ if (index->load_context->status != INDEX_SUSPENDING) {
+ mutex_unlock(&index->load_context->mutex);
+ return false;
+ }
+
+ /* Notify that we are suspended and wait for the resume. */
+ index->load_context->status = INDEX_SUSPENDED;
+ uds_broadcast_cond(&index->load_context->cond);
+
+ while ((index->load_context->status != INDEX_OPENING) &&
+ (index->load_context->status != INDEX_FREEING))
+ uds_wait_cond(&index->load_context->cond, &index->load_context->mutex);
+
+ /* Sample the status while still holding the mutex. */
+ closing = (index->load_context->status == INDEX_FREEING);
+ mutex_unlock(&index->load_context->mutex);
+ return closing;
+}
+
+/*
+ * Replay one on-disk chapter into the volume index during a rebuild.
+ *
+ * The index page map for the chapter is rebuilt first, then every record name on every record
+ * page is passed to replay_record(). Returns -EBUSY if check_for_suspend() reports that the
+ * index is being freed, the first error encountered otherwise, or UDS_SUCCESS.
+ */
+static int replay_chapter(struct uds_index *index, u64 virtual, bool sparse)
+{
+	int result;
+	u32 i;
+	u32 j;
+	const struct index_geometry *geometry;
+	u32 physical_chapter;
+
+	if (check_for_suspend(index)) {
+		vdo_log_info("Replay interrupted by index shutdown at chapter %llu",
+			     (unsigned long long) virtual);
+		return -EBUSY;
+	}
+
+	geometry = index->volume->geometry;
+	physical_chapter = uds_map_to_physical_chapter(geometry, virtual);
+	uds_prefetch_volume_chapter(index->volume, physical_chapter);
+	uds_set_volume_index_open_chapter(index->volume_index, virtual);
+
+	result = rebuild_index_page_map(index, virtual);
+	if (result != UDS_SUCCESS) {
+		return vdo_log_error_strerror(result,
+					      "could not rebuild index page map for chapter %u",
+					      physical_chapter);
+	}
+
+	for (i = 0; i < geometry->record_pages_per_chapter; i++) {
+		u8 *record_page;
+		u32 record_page_number;
+
+		/* Record pages follow the index pages within a chapter. */
+		record_page_number = geometry->index_pages_per_chapter + i;
+		result = uds_get_volume_record_page(index->volume, physical_chapter,
+						    record_page_number, &record_page);
+		if (result != UDS_SUCCESS) {
+			/* record_page_number is a u32, so it must be printed with %u. */
+			return vdo_log_error_strerror(result, "could not get page %u",
+						      record_page_number);
+		}
+
+		for (j = 0; j < geometry->records_per_page; j++) {
+			const u8 *name_bytes;
+			struct uds_record_name name;
+
+			/* Each record occupies BYTES_PER_RECORD bytes and starts with its name. */
+			name_bytes = record_page + (j * BYTES_PER_RECORD);
+			memcpy(&name.name, name_bytes, UDS_RECORD_NAME_SIZE);
+			result = replay_record(index, &name, virtual, sparse);
+			if (result != UDS_SUCCESS)
+				return result;
+		}
+	}
+
+	return UDS_SUCCESS;
+}
+
+/*
+ * Replay every chapter from the oldest virtual chapter up to, but not including, the newest
+ * one, re-adding its records to the volume index and rebuilding the index page map.
+ */
+static int replay_volume(struct uds_index *index)
+{
+	const u64 first_chapter = index->oldest_virtual_chapter;
+	const u64 open_chapter = index->newest_virtual_chapter;
+	u64 map_update_before;
+	u64 map_update_after;
+	u64 chapter;
+
+	vdo_log_info("Replaying volume from chapter %llu through chapter %llu",
+		     (unsigned long long) first_chapter,
+		     (unsigned long long) open_chapter);
+
+	/*
+	 * The index failed to load, so the volume index is empty. Records are added back in
+	 * chapter order, and non-hooks in chapters that will be sparse are skipped to save
+	 * time.
+	 *
+	 * Every record page of every chapter is visited, and nothing should be written to the
+	 * open chapter or the on-disk volume. The on-disk chapter corresponding to
+	 * open_chapter is not replayed, since it would already have been purged from the
+	 * volume index when that chapter was opened.
+	 *
+	 * The index pages of each chapter are visited as well, to rebuild the index page map.
+	 */
+	map_update_before = index->volume->index_page_map->last_update;
+	for (chapter = first_chapter; chapter < open_chapter; chapter++) {
+		bool sparse = uds_is_chapter_sparse(index->volume->geometry,
+						    first_chapter, open_chapter,
+						    chapter);
+		int result = replay_chapter(index, chapter, sparse);
+
+		if (result != UDS_SUCCESS)
+			return result;
+	}
+
+	/* Also reap the chapter being replaced by the open chapter. */
+	uds_set_volume_index_open_chapter(index->volume_index, open_chapter);
+
+	map_update_after = index->volume->index_page_map->last_update;
+	if (map_update_after == map_update_before)
+		return UDS_SUCCESS;
+
+	vdo_log_info("replay changed index page map update from %llu to %llu",
+		     (unsigned long long) map_update_before,
+		     (unsigned long long) map_update_after);
+	return UDS_SUCCESS;
+}
+
+/*
+ * Rebuild the volume index from the chapters on disk. The chapter range is taken from the
+ * volume's chapter boundaries, and the volume is put in LOOKUP_FOR_REBUILD mode while the
+ * chapters are replayed.
+ */
+static int rebuild_index(struct uds_index *index)
+{
+	struct volume *volume = index->volume;
+	u32 chapters_per_volume = volume->geometry->chapters_per_volume;
+	bool is_empty = false;
+	u64 lowest;
+	u64 highest;
+	int result;
+
+	volume->lookup_mode = LOOKUP_FOR_REBUILD;
+	result = uds_find_volume_chapter_boundaries(volume, &lowest, &highest,
+						    &is_empty);
+	if (result != UDS_SUCCESS) {
+		return vdo_log_fatal_strerror(result,
+					      "cannot rebuild index: unknown volume chapter boundaries");
+	}
+
+	if (is_empty) {
+		/* Nothing to replay: start from an empty chapter range. */
+		index->newest_virtual_chapter = 0;
+		index->oldest_virtual_chapter = 0;
+		volume->lookup_mode = LOOKUP_NORMAL;
+		return UDS_SUCCESS;
+	}
+
+	index->oldest_virtual_chapter = lowest;
+	index->newest_virtual_chapter = highest + 1;
+	if (index->newest_virtual_chapter ==
+	    (index->oldest_virtual_chapter + chapters_per_volume)) {
+		/* Skip the chapter shadowed by the open chapter. */
+		index->oldest_virtual_chapter++;
+	}
+
+	result = replay_volume(index);
+	if (result != UDS_SUCCESS)
+		return result;
+
+	volume->lookup_mode = LOOKUP_NORMAL;
+	return UDS_SUCCESS;
+}
+
+/* Release a zone and both of its open chapter structures. A NULL zone is ignored. */
+static void free_index_zone(struct index_zone *zone)
+{
+	if (zone != NULL) {
+		uds_free_open_chapter(zone->open_chapter);
+		uds_free_open_chapter(zone->writing_chapter);
+		vdo_free(zone);
+	}
+}
+
+/*
+ * Allocate the zone with the given number, give it an open chapter and a writing chapter, and
+ * install it in index->zones. On failure the partially built zone is freed.
+ */
+static int make_index_zone(struct uds_index *index, unsigned int zone_number)
+{
+	struct index_zone *zone;
+	int result;
+
+	result = vdo_allocate(1, struct index_zone, "index zone", &zone);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	result = uds_make_open_chapter(index->volume->geometry, index->zone_count,
+				       &zone->open_chapter);
+	if (result == UDS_SUCCESS) {
+		result = uds_make_open_chapter(index->volume->geometry,
+					       index->zone_count,
+					       &zone->writing_chapter);
+	}
+
+	if (result != UDS_SUCCESS) {
+		free_index_zone(zone);
+		return result;
+	}
+
+	zone->index = index;
+	zone->id = zone_number;
+	index->zones[zone_number] = zone;
+
+	return UDS_SUCCESS;
+}
+
+/*
+ * Create an index: allocate it, build its layout, volume, zones, volume index, request queues
+ * and chapter writer, and then, unless open_type is UDS_CREATE, load the saved state. If the
+ * load fails and open_type is UDS_LOAD, a rebuild from the volume is attempted. On success the
+ * new index is stored in *new_index; on any failure the partially built index is freed and an
+ * error is returned.
+ */
+int uds_make_index(struct uds_configuration *config, enum uds_open_index_type open_type,
+ struct index_load_context *load_context, index_callback_fn callback,
+ struct uds_index **new_index)
+{
+ int result;
+ bool loaded = false;
+ bool new = (open_type == UDS_CREATE);
+ struct uds_index *index = NULL;
+ struct index_zone *zone;
+ u64 nonce;
+ unsigned int z;
+
+ /* The index is allocated with room for one zone queue pointer per zone. */
+ result = vdo_allocate_extended(struct uds_index, config->zone_count,
+ struct uds_request_queue *, "index", &index);
+ if (result != VDO_SUCCESS)
+ return result;
+
+ index->zone_count = config->zone_count;
+
+ result = uds_make_index_layout(config, new, &index->layout);
+ if (result != UDS_SUCCESS) {
+ uds_free_index(index);
+ return result;
+ }
+
+ result = vdo_allocate(index->zone_count, struct index_zone *, "zones",
+ &index->zones);
+ if (result != VDO_SUCCESS) {
+ uds_free_index(index);
+ return result;
+ }
+
+ result = uds_make_volume(config, index->layout, &index->volume);
+ if (result != UDS_SUCCESS) {
+ uds_free_index(index);
+ return result;
+ }
+
+ index->volume->lookup_mode = LOOKUP_NORMAL;
+ for (z = 0; z < index->zone_count; z++) {
+ result = make_index_zone(index, z);
+ if (result != UDS_SUCCESS) {
+ uds_free_index(index);
+ return vdo_log_error_strerror(result,
+ "Could not create index zone");
+ }
+ }
+
+ nonce = uds_get_volume_nonce(index->layout);
+ result = uds_make_volume_index(config, nonce, &index->volume_index);
+ if (result != UDS_SUCCESS) {
+ uds_free_index(index);
+ return vdo_log_error_strerror(result, "could not make volume index");
+ }
+
+ index->load_context = load_context;
+ index->callback = callback;
+
+ result = initialize_index_queues(index, config->geometry);
+ if (result != UDS_SUCCESS) {
+ uds_free_index(index);
+ return result;
+ }
+
+ result = make_chapter_writer(index, &index->chapter_writer);
+ if (result != UDS_SUCCESS) {
+ uds_free_index(index);
+ return result;
+ }
+
+ if (!new) {
+ result = load_index(index);
+ switch (result) {
+ case UDS_SUCCESS:
+ loaded = true;
+ break;
+ case -ENOMEM:
+ /* We should not try a rebuild for this error. */
+ vdo_log_error_strerror(result, "index could not be loaded");
+ break;
+ default:
+ vdo_log_error_strerror(result, "index could not be loaded");
+ /* Only UDS_LOAD permits a rebuild; otherwise the load error stands. */
+ if (open_type == UDS_LOAD) {
+ result = rebuild_index(index);
+ if (result != UDS_SUCCESS) {
+ vdo_log_error_strerror(result,
+ "index could not be rebuilt");
+ }
+ }
+ break;
+ }
+ }
+
+ if (result != UDS_SUCCESS) {
+ uds_free_index(index);
+ return vdo_log_error_strerror(result, "fatal error in %s()", __func__);
+ }
+
+ /* Give every zone the chapter range established by the load or rebuild. */
+ for (z = 0; z < index->zone_count; z++) {
+ zone = index->zones[z];
+ zone->oldest_virtual_chapter = index->oldest_virtual_chapter;
+ zone->newest_virtual_chapter = index->newest_virtual_chapter;
+ }
+
+ if (index->load_context != NULL) {
+ mutex_lock(&index->load_context->mutex);
+ index->load_context->status = INDEX_READY;
+ /*
+ * If we get here, suspend is meaningless, but notify any thread trying to suspend
+ * us so it doesn't hang.
+ */
+ uds_broadcast_cond(&index->load_context->cond);
+ mutex_unlock(&index->load_context->mutex);
+ }
+
+ /* An index that was not loaded from a clean save (new or rebuilt) must be saved. */
+ index->has_saved_open_chapter = loaded;
+ index->need_to_save = !loaded;
+ *new_index = index;
+ return UDS_SUCCESS;
+}
+
+/*
+ * Tear down an index and free everything it owns. The request queues are finished first, then
+ * the chapter writer, the volume index, the zones, the volume and the layout are released. A
+ * NULL index is ignored. This is also called on partially constructed indexes from
+ * uds_make_index(); presumably each helper called here accepts a NULL argument (TODO confirm
+ * for the helpers defined outside this file).
+ */
+void uds_free_index(struct uds_index *index)
+{
+ unsigned int i;
+
+ if (index == NULL)
+ return;
+
+ uds_request_queue_finish(index->triage_queue);
+ for (i = 0; i < index->zone_count; i++)
+ uds_request_queue_finish(index->zone_queues[i]);
+
+ free_chapter_writer(index->chapter_writer);
+
+ uds_free_volume_index(index->volume_index);
+ /* The zones array itself may not have been allocated yet. */
+ if (index->zones != NULL) {
+ for (i = 0; i < index->zone_count; i++)
+ free_index_zone(index->zones[i]);
+ vdo_free(index->zones);
+ }
+
+ uds_free_volume(index->volume);
+ uds_free_index_layout(vdo_forget(index->layout));
+ vdo_free(index);
+}
+
+/*
+ * Block until the chapter writer has no zones waiting to be written, i.e. until any
+ * outstanding chapter write has completed.
+ */
+void uds_wait_for_idle_index(struct uds_index *index)
+{
+	struct chapter_writer *chapter_writer = index->chapter_writer;
+
+	mutex_lock(&chapter_writer->mutex);
+	while (chapter_writer->zones_to_write > 0) {
+		/* The writer broadcasts on this condition after each chapter is closed. */
+		uds_wait_cond(&chapter_writer->cond, &chapter_writer->mutex);
+	}
+	mutex_unlock(&chapter_writer->mutex);
+}
+
+/*
+ * Save the index state if anything has changed since the last save. This function assumes
+ * that all requests have been drained. If the save fails, last_save is restored to its
+ * previous value and the error is returned.
+ */
+int uds_save_index(struct uds_index *index)
+{
+	int result;
+
+	if (!index->need_to_save)
+		return UDS_SUCCESS;
+
+	uds_wait_for_idle_index(index);
+
+	/* Remember the previous save point so a failed save can be rolled back. */
+	index->prev_save = index->last_save;
+	if (index->newest_virtual_chapter == 0)
+		index->last_save = NO_LAST_SAVE;
+	else
+		index->last_save = index->newest_virtual_chapter - 1;
+
+	vdo_log_info("beginning save (vcn %llu)", (unsigned long long) index->last_save);
+
+	result = uds_save_index_state(index->layout, index);
+	if (result != UDS_SUCCESS) {
+		vdo_log_info("save index failed");
+		index->last_save = index->prev_save;
+		return result;
+	}
+
+	index->has_saved_open_chapter = true;
+	index->need_to_save = false;
+	vdo_log_info("finished save (vcn %llu)",
+		     (unsigned long long) index->last_save);
+	return result;
+}
+
+/* Switch the index's volume and layout to the given block device. */
+int uds_replace_index_storage(struct uds_index *index, struct block_device *bdev)
+{
+	int result;
+
+	result = uds_replace_volume_storage(index->volume, index->layout, bdev);
+	return result;
+}
+
+/*
+ * Fill in the index-level counters from the volume index statistics, and report memory use
+ * as the sum of the volume index, volume cache and chapter writer sizes. Accessing
+ * statistics should be safe from any thread.
+ */
+void uds_get_index_stats(struct uds_index *index, struct uds_index_stats *counters)
+{
+	struct volume_index_stats volume_stats;
+
+	uds_get_volume_index_stats(index->volume_index, &volume_stats);
+
+	counters->entries_indexed = volume_stats.record_count;
+	counters->collisions = volume_stats.collision_count;
+	counters->entries_discarded = volume_stats.discard_count;
+	counters->memory_used = (index->volume_index->memory_size +
+				 index->volume->cache_size +
+				 index->chapter_writer->memory_size);
+}
+
+/*
+ * Queue a request for the given processing stage. A triage request goes to the triage queue
+ * when one exists; otherwise it is handled like an index request. An index request is assigned
+ * a zone from its record name. A message request goes to the zone already named in the
+ * request. An unknown stage is logged and the request is not queued.
+ */
+void uds_enqueue_request(struct uds_request *request, enum request_stage stage)
+{
+	struct uds_index *index = request->index;
+
+	if ((stage != STAGE_TRIAGE) && (stage != STAGE_INDEX) && (stage != STAGE_MESSAGE)) {
+		VDO_ASSERT_LOG_ONLY(false, "invalid index stage: %d", stage);
+		return;
+	}
+
+	if ((stage == STAGE_TRIAGE) && (index->triage_queue != NULL)) {
+		uds_request_queue_enqueue(index->triage_queue, request);
+		return;
+	}
+
+	/* Messages already carry their zone number; everything else derives it from the name. */
+	if (stage != STAGE_MESSAGE) {
+		request->zone_number =
+			uds_get_volume_index_zone(index->volume_index, &request->record_name);
+	}
+
+	uds_request_queue_enqueue(index->zone_queues[request->zone_number], request);
+}
diff --git a/drivers/md/dm-vdo/indexer/index.h b/drivers/md/dm-vdo/indexer/index.h
new file mode 100644
index 000000000000..edabb239548e
--- /dev/null
+++ b/drivers/md/dm-vdo/indexer/index.h
@@ -0,0 +1,83 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#ifndef UDS_INDEX_H
+#define UDS_INDEX_H
+
+#include "index-layout.h"
+#include "index-session.h"
+#include "open-chapter.h"
+#include "volume.h"
+#include "volume-index.h"
+
+/*
+ * The index is a high-level structure which represents the totality of the UDS index. It manages
+ * the queues for incoming requests and dispatches them to the appropriate sub-components like the
+ * volume or the volume index. It also manages administrative tasks such as saving and loading the
+ * index.
+ *
+ * The index is divided into a number of independent zones and assigns each request to a zone based
+ * on its name. Most sub-components are similarly divided into zones as well so that requests in
+ * each zone usually operate without interference or coordination between zones.
+ */
+
+/* The function the index invokes to hand a request back once it has finished with it. */
+typedef void (*index_callback_fn)(struct uds_request *request);
+
+/* The per-zone state of an index. */
+struct index_zone {
+ /* The index this zone belongs to */
+ struct uds_index *index;
+ /* The zone's portion of the current open chapter */
+ struct open_chapter_zone *open_chapter;
+ /* A second open chapter structure; presumably holds the chapter being written (TODO confirm) */
+ struct open_chapter_zone *writing_chapter;
+ /* This zone's copy of the index's chapter range, set when the index is made */
+ u64 oldest_virtual_chapter;
+ u64 newest_virtual_chapter;
+ /* The zone number, also this zone's position in the index's zones array */
+ unsigned int id;
+};
+
+/* The top-level index structure, allocated with one zone queue pointer per zone. */
+struct uds_index {
+ /* True after a clean load or save, until the chapter writer next closes a chapter */
+ bool has_saved_open_chapter;
+ /* Set when a request is processed; cleared by a successful save */
+ bool need_to_save;
+ /* Optional context used to coordinate suspend and resume while loading; may be NULL */
+ struct index_load_context *load_context;
+ struct index_layout *layout;
+ struct volume_index *volume_index;
+ struct volume *volume;
+ /* The number of entries in zones and zone_queues */
+ unsigned int zone_count;
+ struct index_zone **zones;
+
+ /* The range of virtual chapters currently in the index */
+ u64 oldest_virtual_chapter;
+ u64 newest_virtual_chapter;
+
+ /* The chapter recorded by the most recent save, and the value it replaced */
+ u64 last_save;
+ u64 prev_save;
+ struct chapter_writer *chapter_writer;
+
+ /* Invoked to complete each request */
+ index_callback_fn callback;
+ /* Only created for sparse indexes with more than one zone; otherwise NULL */
+ struct uds_request_queue *triage_queue;
+ /* One request queue per zone */
+ struct uds_request_queue *zone_queues[];
+};
+
+/* The processing stage a request is being queued for; see uds_enqueue_request(). */
+enum request_stage {
+ /* Send to the triage queue if there is one, otherwise treat as STAGE_INDEX */
+ STAGE_TRIAGE,
+ /* Choose the zone from the record name, then send to that zone's queue */
+ STAGE_INDEX,
+ /* Send to the queue of the zone already set in the request */
+ STAGE_MESSAGE,
+};
+
+/* Create an index, loading or rebuilding its saved state as open_type allows. */
+int __must_check uds_make_index(struct uds_configuration *config,
+ enum uds_open_index_type open_type,
+ struct index_load_context *load_context,
+ index_callback_fn callback, struct uds_index **new_index);
+
+/* Save the index state if it has changed; all requests must already be drained. */
+int __must_check uds_save_index(struct uds_index *index);
+
+/* Stop the index's queues and chapter writer and free the index; NULL is ignored. */
+void uds_free_index(struct uds_index *index);
+
+/* Switch the index's volume and layout to a different block device. */
+int __must_check uds_replace_index_storage(struct uds_index *index,
+ struct block_device *bdev);
+
+/* Fill in the index-level statistics counters. */
+void uds_get_index_stats(struct uds_index *index, struct uds_index_stats *counters);
+
+/* Queue a request on the queue appropriate for the given stage. */
+void uds_enqueue_request(struct uds_request *request, enum request_stage stage);
+
+/* Wait for the chapter writer to complete any outstanding writes. */
+void uds_wait_for_idle_index(struct uds_index *index);
+
+#endif /* UDS_INDEX_H */
diff --git a/drivers/md/dm-vdo/indexer/indexer.h b/drivers/md/dm-vdo/indexer/indexer.h
new file mode 100644
index 000000000000..3744aaf625b0
--- /dev/null
+++ b/drivers/md/dm-vdo/indexer/indexer.h
@@ -0,0 +1,353 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#ifndef INDEXER_H
+#define INDEXER_H
+
+#include <linux/mutex.h>
+#include <linux/sched.h>
+#include <linux/types.h>
+#include <linux/wait.h>
+
+#include "funnel-queue.h"
+
+/*
+ * UDS public API
+ *
+ * The Universal Deduplication System (UDS) is an efficient name-value store. When used for
+ * deduplicating storage, the names are generally hashes of data blocks and the associated data is
+ * where that block is located on the underlying storage medium. The stored names are expected to
+ * be randomly distributed among the space of possible names. If this assumption is violated, the
+ * UDS index will store fewer names than normal but will otherwise continue to work. The data
+ * associated with each name can be any 16-byte value.
+ *
+ * A client must first create an index session to interact with an index. Once created, the session
+ * can be shared among multiple threads or users. When a session is destroyed, it will also close
+ * and save any associated index.
+ *
+ * To make a request, a client must allocate a uds_request structure and set the required fields
+ * before launching it. UDS will invoke the provided callback to complete the request. After the
+ * callback has been called, the uds_request structure can be freed or reused for a new request.
+ * There are five types of requests:
+ *
+ * A UDS_UPDATE request will associate the provided name with the provided data. Any previous data
+ * associated with that name will be discarded.
+ *
+ * A UDS_QUERY request will return the data associated with the provided name, if any. The entry
+ * for the name will also be marked as most recent, as if the data had been updated.
+ *
+ * A UDS_POST request is a combination of UDS_QUERY and UDS_UPDATE. If there is already data
+ * associated with the provided name, that data is returned. If there is no existing association,
+ * the name is associated with the newly provided data. This request is equivalent to a UDS_QUERY
+ * request followed by a UDS_UPDATE request if no data is found, but it is much more efficient.
+ *
+ * A UDS_QUERY_NO_UPDATE request will return the data associated with the provided name, but will
+ * not change the recency of the entry for the name. This request is primarily useful for testing,
+ * to determine whether an entry exists without changing the internal state of the index.
+ *
+ * A UDS_DELETE request removes any data associated with the provided name. This operation is
+ * generally not necessary, because the index will automatically discard its oldest entries once it
+ * becomes full.
+ */
+
+/* General UDS constants and structures */
+
+enum uds_request_type {
+ /*
+ * Create or update the mapping for a name, and make the name most recent. Any data
+ * previously associated with the name is discarded.
+ */
+ UDS_UPDATE,
+
+ /* Return any mapped data for a name, and make the name most recent. */
+ UDS_QUERY,
+
+ /*
+ * Return any mapped data for a name, or map the provided data to the name if there is no
+ * current data, and make the name most recent.
+ */
+ UDS_POST,
+
+ /*
+ * Return any mapped data for a name without updating its recency. This is primarily useful
+ * for testing whether an entry exists without changing the state of the index.
+ */
+ UDS_QUERY_NO_UPDATE,
+
+ /*
+ * Remove any mapping for a name. This is generally not necessary because the index discards
+ * its oldest entries automatically once it becomes full.
+ */
+ UDS_DELETE,
+
+};
+
+enum uds_open_index_type { /* the open_type argument of uds_open_index() */
+ /* Create a new index. */
+ UDS_CREATE,
+
+ /* Load an existing index and try to recover if necessary. */
+ UDS_LOAD,
+
+ /* Load an existing index, but only if it was saved cleanly. */
+ UDS_NO_REBUILD,
+};
+
+enum {
+ /* The record name size in bytes; this is the array length in struct uds_record_name */
+ UDS_RECORD_NAME_SIZE = 16,
+ /* The maximum record data size in bytes; the array length in struct uds_record_data */
+ UDS_RECORD_DATA_SIZE = 16,
+};
+
+/*
+ * A type representing a UDS memory configuration which is either a positive integer number of
+ * gigabytes or one of the six special constants for configurations smaller than one gigabyte
+ * (the UDS_MEMORY_CONFIG_*MB values, all of which are negative).
+ */
+typedef int uds_memory_config_size_t;
+
+enum {
+ /* The maximum configurable amount of memory */
+ UDS_MEMORY_CONFIG_MAX = 1024,
+ /* Flag indicating that the index has one less chapter than usual */
+ UDS_MEMORY_CONFIG_REDUCED = 0x1000,
+ UDS_MEMORY_CONFIG_REDUCED_MAX = 1024 + UDS_MEMORY_CONFIG_REDUCED, /* the maximum plus the flag */
+ /*
+ * Special values indicating sizes less than 1 GB. They are negative, so they are distinct
+ * from any positive gigabyte count. Each REDUCED value is 1024 below its plain counterpart.
+ */
+ UDS_MEMORY_CONFIG_256MB = -256,
+ UDS_MEMORY_CONFIG_512MB = -512,
+ UDS_MEMORY_CONFIG_768MB = -768,
+ UDS_MEMORY_CONFIG_REDUCED_256MB = -1280,
+ UDS_MEMORY_CONFIG_REDUCED_512MB = -1536,
+ UDS_MEMORY_CONFIG_REDUCED_768MB = -1792,
+};
+
+struct uds_record_name { /* a fixed-size record name of UDS_RECORD_NAME_SIZE bytes */
+ unsigned char name[UDS_RECORD_NAME_SIZE];
+};
+
+struct uds_record_data { /* the UDS_RECORD_DATA_SIZE bytes of data stored for a name */
+ unsigned char data[UDS_RECORD_DATA_SIZE];
+};
+
+struct uds_volume_record { /* a record name paired with its data */
+ struct uds_record_name name;
+ struct uds_record_data data;
+};
+
+struct uds_parameters {
+ /* The block_device used for storage */
+ struct block_device *bdev;
+ /* The maximum allowable size of the index on storage */
+ size_t size;
+ /* The offset where the index should start */
+ off_t offset;
+ /* The maximum memory allocation: a number of GB, or a UDS_MEMORY_CONFIG_* special value */
+ uds_memory_config_size_t memory_size;
+ /* Whether the index should include sparse chapters */
+ bool sparse;
+ /* A 64-bit nonce to validate the index */
+ u64 nonce;
+ /* The number of threads used to process index requests */
+ unsigned int zone_count;
+ /* The number of threads used to read volume pages */
+ unsigned int read_threads;
+};
+
+/*
+ * These statistics capture characteristics of the current index, including resource usage and
+ * requests processed since the index was opened.
+ */
+struct uds_index_stats {
+ /* The total number of records stored in the index */
+ u64 entries_indexed;
+ /* An estimate of the index's memory usage, in bytes */
+ u64 memory_used;
+ /* The number of collisions recorded in the volume index */
+ u64 collisions;
+ /* The number of entries discarded from the index since startup */
+ u64 entries_discarded;
+ /* The time at which these statistics were fetched */
+ s64 current_time;
+ /* The number of post calls that found an existing entry */
+ u64 posts_found;
+ /* The number of post calls that added an entry */
+ u64 posts_not_found;
+ /*
+ * The number of post calls that found an existing entry that is current enough to only
+ * exist in memory and not have been committed to disk yet
+ */
+ u64 in_memory_posts_found;
+ /*
+ * The number of post calls that found an existing entry in the dense portion of the index
+ */
+ u64 dense_posts_found;
+ /*
+ * The number of post calls that found an existing entry in the sparse portion of the index
+ */
+ u64 sparse_posts_found;
+ /* The number of update calls that updated an existing entry */
+ u64 updates_found;
+ /* The number of update calls that added a new entry */
+ u64 updates_not_found;
+ /* The number of delete requests that deleted an existing entry */
+ u64 deletions_found;
+ /* The number of delete requests that did nothing */
+ u64 deletions_not_found;
+ /* The number of query calls that found an existing entry */
+ u64 queries_found;
+ /* The number of query calls that did not find an entry */
+ u64 queries_not_found;
+ /* The total number of requests processed */
+ u64 requests;
+};
+
+enum uds_index_region { /* the type of uds_request.location */
+ /* No location information has been determined */
+ UDS_LOCATION_UNKNOWN = 0,
+ /* The index page entry has been found */
+ UDS_LOCATION_INDEX_PAGE_LOOKUP,
+ /* The record page entry has been found */
+ UDS_LOCATION_RECORD_PAGE_LOOKUP,
+ /* The record is not in the index */
+ UDS_LOCATION_UNAVAILABLE,
+ /* The record was found in the open chapter */
+ UDS_LOCATION_IN_OPEN_CHAPTER,
+ /* The record was found in the dense part of the index */
+ UDS_LOCATION_IN_DENSE,
+ /* The record was found in the sparse part of the index */
+ UDS_LOCATION_IN_SPARSE,
+} __packed; /* packed: the compiler uses the smallest integer type that holds every value */
+
+/*
+ * Zone message requests are used to communicate between index zones. The message type is
+ * stored in the type field of struct uds_zone_message.
+ */
+enum uds_zone_message_type {
+ /* A standard request with no message */
+ UDS_MESSAGE_NONE = 0,
+ /* Add a chapter to the sparse chapter index cache */
+ UDS_MESSAGE_SPARSE_CACHE_BARRIER,
+ /* Close a chapter to keep the zone from falling behind */
+ UDS_MESSAGE_ANNOUNCE_CHAPTER_CLOSED,
+} __packed;
+
+struct uds_zone_message { /* embedded in struct uds_request as zone_message */
+ /* The type of message, determining how it will be processed */
+ enum uds_zone_message_type type;
+ /* The virtual chapter number to which the message applies */
+ u64 virtual_chapter;
+};
+
+struct uds_index_session;
+struct uds_index;
+struct uds_request;
+
+/*
+ * The type of the callback field of struct uds_request. Once this callback has been invoked,
+ * the uds_request structure can be reused or freed.
+ */
+typedef void (*uds_request_callback_fn)(struct uds_request *request);
+
+struct uds_request {
+ /* These input fields must be set before launching a request. */
+
+ /* The name of the record to look up or create */
+ struct uds_record_name record_name;
+ /* New data to associate with the record name, if applicable */
+ struct uds_record_data new_metadata;
+ /* A callback to invoke when the request is complete */
+ uds_request_callback_fn callback;
+ /* The index session that will manage this request */
+ struct uds_index_session *session;
+ /* The type of operation to perform, as described above */
+ enum uds_request_type type;
+
+ /* These output fields are set when a request is complete. */
+
+ /* The existing data associated with the request name, if any */
+ struct uds_record_data old_metadata;
+ /* Either UDS_SUCCESS or an error code for the request */
+ int status;
+ /* True if the record name had an existing entry in the index */
+ bool found;
+
+ /*
+ * The remaining fields are used internally and should not be altered by clients. The index
+ * relies on zone_number being the first field in this section.
+ */
+
+ /* The number of the zone which will process this request */
+ unsigned int zone_number;
+ /* A link for adding a request to a lock-free queue */
+ struct funnel_queue_entry queue_link;
+ /* A link for adding a request to a standard linked list */
+ struct uds_request *next_request;
+ /* A pointer to the index processing this request */
+ struct uds_index *index;
+ /* Control message for coordinating between zones */
+ struct uds_zone_message zone_message;
+ /* If true, process request immediately by waking the worker thread */
+ bool unbatched;
+ /* If true, continue this request before processing newer requests */
+ bool requeued;
+ /* The virtual chapter containing the record name, if known */
+ u64 virtual_chapter;
+ /* The region of the index containing the record name */
+ enum uds_index_region location;
+};
+
+/*
+ * Compute the number of bytes needed to store an index.
+ * NOTE(review): the result is presumably returned through index_size -- confirm.
+ */
+int __must_check uds_compute_index_size(const struct uds_parameters *parameters,
+ u64 *index_size);
+
+/* A session is required for most index operations. */
+int __must_check uds_create_index_session(struct uds_index_session **session);
+
+/*
+ * Destroying an index session also closes and saves the associated index. Unlike the other
+ * functions declared here, this one is not marked __must_check.
+ */
+int uds_destroy_index_session(struct uds_index_session *session);
+
+/*
+ * Create or open an index with an existing session. This operation fails if the index session is
+ * suspended, or if there is already an open index.
+ */
+int __must_check uds_open_index(enum uds_open_index_type open_type,
+ const struct uds_parameters *parameters,
+ struct uds_index_session *session);
+
+/*
+ * Wait until all callbacks for index operations are complete, and prevent new index operations
+ * from starting. New index operations will fail with EBUSY until the session is resumed. Also
+ * optionally saves the index.
+ */
+int __must_check uds_suspend_index_session(struct uds_index_session *session, bool save);
+
+/*
+ * Allow new index operations for an index, whether it was suspended or not. If the index is
+ * suspended and the supplied block device differs from the current backing store, the index will
+ * start using the new backing store instead.
+ */
+int __must_check uds_resume_index_session(struct uds_index_session *session,
+ struct block_device *bdev);
+
+/* Wait until all outstanding index operations are complete. */
+int __must_check uds_flush_index_session(struct uds_index_session *session);
+
+/* Close an index. This operation fails if the index session is suspended. */
+int __must_check uds_close_index(struct uds_index_session *session);
+
+/* Get index statistics since the last time the index was opened. */
+int __must_check uds_get_index_session_stats(struct uds_index_session *session,
+ struct uds_index_stats *stats);
+
+/*
+ * Launch a request. This function will fail if any required field of the request is not set;
+ * the required input fields are listed at the top of struct uds_request.
+ */
+int __must_check uds_launch_request(struct uds_request *request);
+
+struct cond_var { /* a condition variable built from a kernel wait queue head */
+ wait_queue_head_t wait_queue;
+};
+
+/* Prepare a condition variable for use by initializing its wait queue head. */
+static inline void uds_init_cond(struct cond_var *cv)
+{
+	wait_queue_head_t *queue = &cv->wait_queue;
+
+	init_waitqueue_head(queue);
+}
+
+/* Signal a condition variable by calling wake_up() on its wait queue. */
+static inline void uds_signal_cond(struct cond_var *cv)
+{
+	wait_queue_head_t *queue = &cv->wait_queue;
+
+	wake_up(queue);
+}
+
+/* Broadcast on a condition variable by calling wake_up_all() on its wait queue. */
+static inline void uds_broadcast_cond(struct cond_var *cv)
+{
+	wait_queue_head_t *queue = &cv->wait_queue;
+
+	wake_up_all(queue);
+}
+
+void uds_wait_cond(struct cond_var *cv, struct mutex *mutex); /* NOTE(review): defined elsewhere; presumably waits on cv with mutex dropped meanwhile -- confirm */
+
+#endif /* INDEXER_H */
diff --git a/drivers/md/dm-vdo/indexer/io-factory.c b/drivers/md/dm-vdo/indexer/io-factory.c
new file mode 100644
index 000000000000..515765d35794
--- /dev/null
+++ b/drivers/md/dm-vdo/indexer/io-factory.c
@@ -0,0 +1,415 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#include "io-factory.h"
+
+#include <linux/atomic.h>
+#include <linux/blkdev.h>
+#include <linux/err.h>
+#include <linux/mount.h>
+
+#include "logger.h"
+#include "memory-alloc.h"
+#include "numeric.h"
+
+/*
+ * The I/O factory object manages access to index storage, which is a contiguous range of blocks on
+ * a block device.
+ *
+ * The factory records the block device handed to it by uds_make_io_factory() or
+ * uds_replace_storage(); nothing in this file opens or closes that device. The factory has methods
+ * to make helper structures that can be used to access sections of the index.
+ *
+ * The factory is reference counted. Each buffered reader or writer takes a reference when it is
+ * made and drops it when it is freed, and the factory is freed when the count reaches zero.
+ */
+struct io_factory {
+ struct block_device *bdev;
+ atomic_t ref_count;
+};
+
+/*
+ * The buffered reader allows efficient I/O by reading page-sized segments into a buffer. The
+ * reader holds at most one dm_bufio buffer at a time and consumes it sequentially.
+ */
+struct buffered_reader {
+ struct io_factory *factory; /* referenced while the reader exists */
+ struct dm_bufio_client *client;
+ struct dm_buffer *buffer; /* the buffer currently held, or NULL */
+ sector_t limit; /* block numbers at or beyond this are out of range */
+ sector_t block_number; /* the block the reader is positioned in */
+ u8 *start; /* the first byte of the current block's data */
+ u8 *end; /* the next byte to consume; NULL until the first block is read */
+};
+
+#define MAX_READ_AHEAD_BLOCKS 4 /* the most blocks read_ahead() will prefetch at once */
+
+/*
+ * The buffered writer allows efficient I/O by buffering writes and committing page-sized segments
+ * to storage. Once an error is recorded in the error field, it is returned by every later write or
+ * flush.
+ */
+struct buffered_writer {
+ struct io_factory *factory; /* referenced while the writer exists */
+ struct dm_bufio_client *client;
+ struct dm_buffer *buffer; /* the buffer being filled, or NULL between blocks */
+ sector_t limit; /* block numbers at or beyond this are out of range */
+ sector_t block_number; /* the block that is being, or will next be, filled */
+ u8 *start; /* the first byte of the current block's data */
+ u8 *end; /* where the next byte will be written */
+ int error; /* UDS_SUCCESS, or the first error encountered */
+};
+
+/* Take an additional reference to an I/O factory. */
+static void uds_get_io_factory(struct io_factory *factory)
+{
+	atomic_t *references = &factory->ref_count;
+
+	atomic_inc(references);
+}
+
+/*
+ * Allocate an I/O factory for a block device. The new factory starts with a reference count of
+ * one and is returned through factory_ptr. On allocation failure the error is returned and
+ * factory_ptr is left untouched.
+ */
+int uds_make_io_factory(struct block_device *bdev, struct io_factory **factory_ptr)
+{
+	struct io_factory *new_factory;
+	int status;
+
+	status = vdo_allocate(1, struct io_factory, __func__, &new_factory);
+	if (status != VDO_SUCCESS)
+		return status;
+
+	new_factory->bdev = bdev;
+	atomic_set_release(&new_factory->ref_count, 1);
+	*factory_ptr = new_factory;
+
+	return UDS_SUCCESS;
+}
+
+/* Point the factory at a different block device. This always returns UDS_SUCCESS. */
+int uds_replace_storage(struct io_factory *factory, struct block_device *bdev)
+{
+	factory->bdev = bdev;
+
+	return UDS_SUCCESS;
+}
+
+/* Drop one reference to an I/O factory, freeing the factory once no references remain. */
+void uds_put_io_factory(struct io_factory *factory)
+{
+	int remaining = atomic_add_return(-1, &factory->ref_count);
+
+	if (remaining > 0)
+		return;
+
+	vdo_free(factory);
+}
+
+/*
+ * Return the size of the factory's block device in bytes.
+ *
+ * The size is obtained with the block layer accessor bdev_nr_bytes() instead of reading it from
+ * bdev->bd_inode, so this code does not depend on the internal layout of struct block_device.
+ */
+size_t uds_get_writable_size(struct io_factory *factory)
+{
+	return bdev_nr_bytes(factory->bdev);
+}
+
+/*
+ * Create a struct dm_bufio_client for an index region. The region starts block_offset blocks into
+ * the factory's device; the offset is converted to sectors before being handed to dm-bufio. On
+ * failure, the error from dm-bufio is negated and returned, and client_ptr is left untouched.
+ */
+int uds_make_bufio(struct io_factory *factory, off_t block_offset, size_t block_size,
+		   unsigned int reserved_buffers, struct dm_bufio_client **client_ptr)
+{
+	struct dm_bufio_client *bufio;
+
+	bufio = dm_bufio_client_create(factory->bdev, block_size, reserved_buffers, 0,
+				       NULL, NULL, 0);
+	if (IS_ERR(bufio))
+		return -PTR_ERR(bufio);
+
+	dm_bufio_set_sector_offset(bufio, block_offset * SECTORS_PER_BLOCK);
+
+	*client_ptr = bufio;
+	return UDS_SUCCESS;
+}
+
+/*
+ * Prefetch up to MAX_READ_AHEAD_BLOCKS blocks starting at block_number, without going past the
+ * reader's limit. Nothing is done if block_number is already at or beyond the limit.
+ */
+static void read_ahead(struct buffered_reader *reader, sector_t block_number)
+{
+	sector_t blocks_left;
+	sector_t prefetch_count;
+
+	if (block_number >= reader->limit)
+		return;
+
+	blocks_left = reader->limit - block_number;
+	prefetch_count = min((sector_t) MAX_READ_AHEAD_BLOCKS, blocks_left);
+	dm_bufio_prefetch(reader->client, block_number, prefetch_count);
+}
+
+/*
+ * Free a buffered reader: release any buffer it still holds, destroy its dm-bufio client, and
+ * drop its reference to the I/O factory. A NULL reader is ignored.
+ */
+void uds_free_buffered_reader(struct buffered_reader *reader)
+{
+	struct dm_buffer *held;
+
+	if (reader == NULL)
+		return;
+
+	held = reader->buffer;
+	if (held != NULL)
+		dm_bufio_release(held);
+
+	dm_bufio_client_destroy(reader->client);
+	uds_put_io_factory(reader->factory);
+	vdo_free(reader);
+}
+
+/*
+ * Create a buffered reader for an index region starting at offset and spanning block_count
+ * blocks. The reader takes a reference to the factory and starts prefetching from block 0. If the
+ * reader cannot be allocated, the dm-bufio client made for it is destroyed before returning.
+ */
+int uds_make_buffered_reader(struct io_factory *factory, off_t offset, u64 block_count,
+			     struct buffered_reader **reader_ptr)
+{
+	struct buffered_reader *new_reader = NULL;
+	struct dm_bufio_client *bufio = NULL;
+	int status;
+
+	status = uds_make_bufio(factory, offset, UDS_BLOCK_SIZE, 1, &bufio);
+	if (status != UDS_SUCCESS)
+		return status;
+
+	status = vdo_allocate(1, struct buffered_reader, "buffered reader", &new_reader);
+	if (status != VDO_SUCCESS) {
+		dm_bufio_client_destroy(bufio);
+		return status;
+	}
+
+	/* No block has been read yet, so there is no buffer and no position within one. */
+	new_reader->factory = factory;
+	new_reader->client = bufio;
+	new_reader->buffer = NULL;
+	new_reader->limit = block_count;
+	new_reader->block_number = 0;
+	new_reader->start = NULL;
+	new_reader->end = NULL;
+
+	read_ahead(new_reader, 0);
+	uds_get_io_factory(factory);
+
+	*reader_ptr = new_reader;
+	return UDS_SUCCESS;
+}
+
+/*
+ * Position the reader at the given byte offset within the given block. A new block is read only
+ * if nothing has been read yet or the requested block differs from the current one; when the
+ * requested block directly follows the current one, the blocks after it are prefetched as well.
+ * Returns UDS_OUT_OF_RANGE if a needed block is at or beyond the limit, or the negated dm-bufio
+ * error if the read fails.
+ */
+static int position_reader(struct buffered_reader *reader, sector_t block_number,
+			   off_t offset)
+{
+	bool must_read = (reader->end == NULL) || (block_number != reader->block_number);
+
+	if (must_read) {
+		struct dm_buffer *new_buffer = NULL;
+		void *contents;
+
+		if (block_number >= reader->limit)
+			return UDS_OUT_OF_RANGE;
+
+		if (reader->buffer != NULL)
+			dm_bufio_release(vdo_forget(reader->buffer));
+
+		contents = dm_bufio_read(reader->client, block_number, &new_buffer);
+		if (IS_ERR(contents))
+			return -PTR_ERR(contents);
+
+		reader->buffer = new_buffer;
+		reader->start = contents;
+
+		/* reader->block_number still names the previous block at this point. */
+		if (block_number == reader->block_number + 1)
+			read_ahead(reader, block_number + 1);
+	}
+
+	reader->block_number = block_number;
+	reader->end = reader->start + offset;
+	return UDS_SUCCESS;
+}
+
+/* Return how many unread bytes are left in the current block; zero if no block has been read. */
+static size_t bytes_remaining_in_read_buffer(struct buffered_reader *reader)
+{
+	if (reader->end == NULL)
+		return 0;
+
+	return reader->start + UDS_BLOCK_SIZE - reader->end;
+}
+
+/*
+ * Make sure the reader has unread bytes available. If the current block is exhausted, move to the
+ * start of the next block; if nothing has been read yet, load the current block instead.
+ */
+static int reset_reader(struct buffered_reader *reader)
+{
+	sector_t target_block = reader->block_number;
+
+	if (bytes_remaining_in_read_buffer(reader) > 0)
+		return UDS_SUCCESS;
+
+	if (reader->end != NULL)
+		target_block++;
+
+	return position_reader(reader, target_block, 0);
+}
+
+/*
+ * Copy the next length bytes from the reader into data, crossing block boundaries as needed.
+ * If a block cannot be made available, the error is returned immediately; bytes copied before
+ * the failure remain consumed.
+ */
+int uds_read_from_buffered_reader(struct buffered_reader *reader, u8 *data,
+				  size_t length)
+{
+	u8 *destination = data;
+	size_t outstanding = length;
+
+	while (outstanding > 0) {
+		size_t chunk;
+		int status = reset_reader(reader);
+
+		if (status != UDS_SUCCESS)
+			return status;
+
+		chunk = min(outstanding, bytes_remaining_in_read_buffer(reader));
+		memcpy(destination, reader->end, chunk);
+
+		reader->end += chunk;
+		destination += chunk;
+		outstanding -= chunk;
+	}
+
+	return UDS_SUCCESS;
+}
+
+/*
+ * Verify that the next data on the reader matches the required value. If the value matches, the
+ * matching contents are consumed. If the value does not match, or a needed block cannot be read,
+ * the reader is moved back to where it started and UDS_CORRUPT_DATA is returned.
+ */
+int uds_verify_buffered_data(struct buffered_reader *reader, const u8 *value,
+			     size_t length)
+{
+	sector_t saved_block = reader->block_number;
+	int saved_offset = reader->end - reader->start;
+	const u8 *expected = value;
+	size_t unverified = length;
+
+	while (unverified > 0) {
+		size_t chunk;
+
+		if (reset_reader(reader) != UDS_SUCCESS)
+			goto mismatch;
+
+		chunk = min(unverified, bytes_remaining_in_read_buffer(reader));
+		if (memcmp(expected, reader->end, chunk) != 0)
+			goto mismatch;
+
+		reader->end += chunk;
+		expected += chunk;
+		unverified -= chunk;
+	}
+
+	return UDS_SUCCESS;
+
+mismatch:
+	/* Rewind to the starting position; the result of repositioning is not checked. */
+	position_reader(reader, saved_block, saved_offset);
+	return UDS_CORRUPT_DATA;
+}
+
+/*
+ * Create a buffered writer for an index region starting at offset and spanning block_count
+ * blocks. The writer takes a reference to the factory. If the writer cannot be allocated, the
+ * dm-bufio client made for it is destroyed before returning.
+ */
+int uds_make_buffered_writer(struct io_factory *factory, off_t offset, u64 block_count,
+			     struct buffered_writer **writer_ptr)
+{
+	struct buffered_writer *new_writer;
+	struct dm_bufio_client *bufio = NULL;
+	int status;
+
+	status = uds_make_bufio(factory, offset, UDS_BLOCK_SIZE, 1, &bufio);
+	if (status != UDS_SUCCESS)
+		return status;
+
+	status = vdo_allocate(1, struct buffered_writer, "buffered writer", &new_writer);
+	if (status != VDO_SUCCESS) {
+		dm_bufio_client_destroy(bufio);
+		return status;
+	}
+
+	/* The first buffer is not obtained until the first write. */
+	new_writer->factory = factory;
+	new_writer->client = bufio;
+	new_writer->buffer = NULL;
+	new_writer->limit = block_count;
+	new_writer->block_number = 0;
+	new_writer->start = NULL;
+	new_writer->end = NULL;
+	new_writer->error = UDS_SUCCESS;
+
+	uds_get_io_factory(factory);
+
+	*writer_ptr = new_writer;
+	return UDS_SUCCESS;
+}
+
+/* Return the number of bytes still unwritten in the writer's current block. */
+static size_t get_remaining_write_space(struct buffered_writer *writer)
+{
+	u8 *block_end = writer->start + UDS_BLOCK_SIZE;
+
+	return block_end - writer->end;
+}
+
+/*
+ * Obtain a new buffer for the writer's current block number and point the writer at its start.
+ * On failure the error is recorded in the writer as well as returned: UDS_OUT_OF_RANGE when the
+ * block number has reached the limit, otherwise the negated dm-bufio error.
+ */
+static int __must_check prepare_next_buffer(struct buffered_writer *writer)
+{
+	struct dm_buffer *new_buffer = NULL;
+	void *contents;
+
+	if (writer->block_number >= writer->limit) {
+		writer->error = UDS_OUT_OF_RANGE;
+		return UDS_OUT_OF_RANGE;
+	}
+
+	contents = dm_bufio_new(writer->client, writer->block_number, &new_buffer);
+	if (IS_ERR(contents)) {
+		writer->error = -PTR_ERR(contents);
+		return writer->error;
+	}
+
+	writer->buffer = new_buffer;
+	writer->start = contents;
+	writer->end = contents;
+	return UDS_SUCCESS;
+}
+
+/*
+ * Finish with the writer's current buffer, if it has one. When no error has been recorded, the
+ * unused tail of the block is zeroed and the buffer is marked dirty; either way the buffer is
+ * released and the writer advances to the next block. Returns the writer's recorded error.
+ */
+static int flush_previous_buffer(struct buffered_writer *writer)
+{
+	if (writer->buffer != NULL) {
+		if (writer->error == UDS_SUCCESS) {
+			size_t padding = get_remaining_write_space(writer);
+
+			if (padding > 0)
+				memset(writer->end, 0, padding);
+
+			dm_bufio_mark_buffer_dirty(writer->buffer);
+		}
+
+		dm_bufio_release(writer->buffer);
+		writer->buffer = NULL;
+		writer->start = NULL;
+		writer->end = NULL;
+		writer->block_number++;
+	}
+
+	return writer->error;
+}
+
+/*
+ * Free a buffered writer: flush its current buffer, write out dirty buffers (logging a warning if
+ * that fails), destroy its dm-bufio client, and drop its reference to the I/O factory. A NULL
+ * writer is ignored.
+ */
+void uds_free_buffered_writer(struct buffered_writer *writer)
+{
+	int sync_status;
+
+	if (writer == NULL)
+		return;
+
+	/* Any error here is already recorded in the writer, so the return value is not needed. */
+	flush_previous_buffer(writer);
+
+	sync_status = -dm_bufio_write_dirty_buffers(writer->client);
+	if (sync_status != UDS_SUCCESS)
+		vdo_log_warning_strerror(sync_status, "%s: failed to sync storage", __func__);
+
+	dm_bufio_client_destroy(writer->client);
+	uds_put_io_factory(writer->factory);
+	vdo_free(writer);
+}
+
+/*
+ * Append length bytes to the writer, obtaining and flushing block buffers as needed. If data is
+ * NULL, zeros are written instead. If the writer has already recorded an error, nothing is
+ * written and that error is returned; an error that occurs here stops the write and is returned.
+ */
+int uds_write_to_buffered_writer(struct buffered_writer *writer, const u8 *data,
+				 size_t length)
+{
+	const u8 *source = data;
+	size_t unwritten = length;
+	int status = writer->error;
+
+	while ((unwritten > 0) && (status == UDS_SUCCESS)) {
+		size_t chunk;
+
+		if (writer->buffer == NULL) {
+			status = prepare_next_buffer(writer);
+			if (status != UDS_SUCCESS)
+				break;
+		}
+
+		chunk = min(unwritten, get_remaining_write_space(writer));
+		if (source != NULL) {
+			memcpy(writer->end, source, chunk);
+			source += chunk;
+		} else {
+			memset(writer->end, 0, chunk);
+		}
+
+		writer->end += chunk;
+		unwritten -= chunk;
+
+		/* A full block is flushed right away so the next pass starts a new buffer. */
+		if (get_remaining_write_space(writer) == 0)
+			status = uds_flush_buffered_writer(writer);
+	}
+
+	return status;
+}
+
+/*
+ * Flush the writer's current buffer, unless an error has already been recorded, in which case
+ * that error is returned and the buffer is left alone.
+ */
+int uds_flush_buffered_writer(struct buffered_writer *writer)
+{
+	int recorded = writer->error;
+
+	if (recorded == UDS_SUCCESS)
+		return flush_previous_buffer(writer);
+
+	return recorded;
+}
diff --git a/drivers/md/dm-vdo/indexer/io-factory.h b/drivers/md/dm-vdo/indexer/io-factory.h
new file mode 100644
index 000000000000..7fb5a0616a79
--- /dev/null
+++ b/drivers/md/dm-vdo/indexer/io-factory.h
@@ -0,0 +1,64 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#ifndef UDS_IO_FACTORY_H
+#define UDS_IO_FACTORY_H
+
+#include <linux/dm-bufio.h>
+
+/*
+ * The I/O factory manages all low-level I/O operations to the underlying storage device. Its main
+ * clients are the index layout and the volume. The buffered reader and buffered writer interfaces
+ * are helpers for accessing data in a contiguous range of storage blocks.
+ */
+
+struct buffered_reader;
+struct buffered_writer;
+
+struct io_factory;
+
+enum {
+ UDS_BLOCK_SIZE = 4096, /* the size of one index block, in bytes */
+ SECTORS_PER_BLOCK = UDS_BLOCK_SIZE >> SECTOR_SHIFT, /* the same size, in sectors */
+};
+
+int __must_check uds_make_io_factory(struct block_device *bdev,
+ struct io_factory **factory_ptr); /* the new factory holds one reference */
+
+int __must_check uds_replace_storage(struct io_factory *factory,
+ struct block_device *bdev);
+
+void uds_put_io_factory(struct io_factory *factory); /* drops a reference; frees at zero */
+
+size_t __must_check uds_get_writable_size(struct io_factory *factory); /* device size in bytes */
+
+int __must_check uds_make_bufio(struct io_factory *factory, off_t block_offset,
+ size_t block_size, unsigned int reserved_buffers,
+ struct dm_bufio_client **client_ptr); /* block_offset is in units of UDS_BLOCK_SIZE */
+
+int __must_check uds_make_buffered_reader(struct io_factory *factory, off_t offset,
+ u64 block_count,
+ struct buffered_reader **reader_ptr);
+
+void uds_free_buffered_reader(struct buffered_reader *reader); /* a NULL reader is ignored */
+
+int __must_check uds_read_from_buffered_reader(struct buffered_reader *reader, u8 *data,
+ size_t length);
+
+int __must_check uds_verify_buffered_data(struct buffered_reader *reader, const u8 *value,
+ size_t length); /* UDS_CORRUPT_DATA on any mismatch or read failure */
+
+int __must_check uds_make_buffered_writer(struct io_factory *factory, off_t offset,
+ u64 block_count,
+ struct buffered_writer **writer_ptr);
+
+/* Flush any buffered data, then free the writer. A NULL writer is ignored. */
+void uds_free_buffered_writer(struct buffered_writer *writer);
+
+int __must_check uds_write_to_buffered_writer(struct buffered_writer *writer,
+ const u8 *data, size_t length); /* a NULL data pointer writes zeros */
+
+int __must_check uds_flush_buffered_writer(struct buffered_writer *writer); /* returns any recorded error */
+
+#endif /* UDS_IO_FACTORY_H */
diff --git a/drivers/md/dm-vdo/indexer/open-chapter.c b/drivers/md/dm-vdo/indexer/open-chapter.c
new file mode 100644
index 000000000000..4a67bcadaae0
--- /dev/null
+++ b/drivers/md/dm-vdo/indexer/open-chapter.c
@@ -0,0 +1,426 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#include "open-chapter.h"
+
+#include <linux/log2.h>
+
+#include "logger.h"
+#include "memory-alloc.h"
+#include "numeric.h"
+#include "permassert.h"
+
+#include "config.h"
+#include "hash-utils.h"
+
+/*
+ * Each index zone has a dedicated open chapter zone structure which gets an equal share of the
+ * open chapter space. Records are assigned to zones based on their record name. Within each zone,
+ * records are stored in an array in the order they arrive. Additionally, a reference to each
+ * record is stored in a hash table to help determine if a new record duplicates an existing one.
+ * If new metadata for an existing name arrives, the record is altered in place. The array of
+ * records is 1-based so that record number 0 can be used to indicate an unused hash slot.
+ *
+ * Deleted records are marked with a flag rather than actually removed to simplify hash table
+ * management. The array of deleted flags overlays the array of hash slots, but the flags are
+ * indexed by record number instead of by record name. The number of hash slots will always be a
+ * power of two that is greater than the number of records to be indexed, guaranteeing that hash
+ * insertion cannot fail, and that there are sufficient flags for all records.
+ *
+ * Once any open chapter zone fills its available space, the chapter is closed. The records from
+ * each zone are interleaved to attempt to preserve temporal locality and assigned to record pages.
+ * Empty or deleted records are replaced by copies of a valid record so that the record pages only
+ * contain valid records. The chapter then constructs a delta index which maps each record name to
+ * the record page on which that record can be found, which is split into index pages. These
+ * structures are then passed to the volume to be recorded on storage.
+ *
+ * When the index is saved, the open chapter records are saved in a single array, once again
+ * interleaved to attempt to preserve temporal locality. When the index is reloaded, there may be a
+ * different number of zones than previously, so the records must be parcelled out to their new
+ * zones. In addition, depending on the distribution of record names, a new zone may have more
+ * records than it has space. In this case, the latest records for that zone will be discarded.
+ */
+
+/*
+ * The header of a saved open chapter: the magic bytes followed by the format version.
+ * uds_save_open_chapter() writes both and uds_load_open_chapter() checks both.
+ */
+static const u8 OPEN_CHAPTER_MAGIC[] = "ALBOC";
+static const u8 OPEN_CHAPTER_VERSION[] = "02.00";
+
+/* The saved lengths exclude the NUL terminator of the string literals. */
+#define OPEN_CHAPTER_MAGIC_LENGTH (sizeof(OPEN_CHAPTER_MAGIC) - 1)
+#define OPEN_CHAPTER_VERSION_LENGTH (sizeof(OPEN_CHAPTER_VERSION) - 1)
+/* The zone capacity is multiplied by this factor before the hash slot count is computed. */
+#define LOAD_RATIO 2
+
+/* Compute the size in bytes of a zone's record array, including the unused record number 0. */
+static inline size_t records_size(const struct open_chapter_zone *open_chapter)
+{
+	/* The array is 1-based, so it holds one more entry than the zone's capacity. */
+	const size_t entry_count = 1 + open_chapter->capacity;
+
+	return sizeof(struct uds_volume_record) * entry_count;
+}
+
+/* Compute the size in bytes of a hash table holding the given number of slots. */
+static inline size_t slots_size(size_t slot_count)
+{
+	return slot_count * sizeof(struct open_chapter_zone_slot);
+}
+
+/*
+ * Allocate the open chapter structure for one zone. Each zone receives an equal share of the
+ * chapter's records, and its hash table gets a power-of-two number of slots computed from that
+ * share scaled by LOAD_RATIO.
+ *
+ * @geometry: the index geometry supplying records_per_chapter
+ * @zone_count: the number of zones sharing the chapter
+ * @open_chapter_ptr: set to the new zone on success
+ *
+ * Return: UDS_SUCCESS or the error from a failed allocation
+ */
+int uds_make_open_chapter(const struct index_geometry *geometry, unsigned int zone_count,
+			  struct open_chapter_zone **open_chapter_ptr)
+{
+	struct open_chapter_zone *zone;
+	size_t records_per_zone = geometry->records_per_chapter / zone_count;
+	size_t hash_slots = (1 << bits_per(records_per_zone * LOAD_RATIO));
+	int status;
+
+	/* The hash slots are allocated as the trailing array of the zone structure itself. */
+	status = vdo_allocate_extended(struct open_chapter_zone, hash_slots,
+				       struct open_chapter_zone_slot, "open chapter",
+				       &zone);
+	if (status != VDO_SUCCESS)
+		return status;
+
+	/* The capacity must be set before records_size() is used to size the record array. */
+	zone->capacity = records_per_zone;
+	zone->slot_count = hash_slots;
+
+	status = vdo_allocate_cache_aligned(records_size(zone), "record pages",
+					    &zone->records);
+	if (status == VDO_SUCCESS) {
+		*open_chapter_ptr = zone;
+		return UDS_SUCCESS;
+	}
+
+	uds_free_open_chapter(zone);
+	return status;
+}
+
+/* Return an open chapter zone to its empty state: no records, no deletions, no hash entries. */
+void uds_reset_open_chapter(struct open_chapter_zone *open_chapter)
+{
+	/* Zeroing the slots clears both the record references and the deleted flags. */
+	memset(open_chapter->slots, 0, slots_size(open_chapter->slot_count));
+	memset(open_chapter->records, 0, records_size(open_chapter));
+
+	open_chapter->deletions = 0;
+	open_chapter->size = 0;
+}
+
+/*
+ * Walk the probe sequence for a record name and return the hash slot where the search ends:
+ * either a slot referencing a record with that name which has not been deleted, or the first
+ * empty slot, which marks the end of the chain.
+ */
+static unsigned int probe_chapter_slots(struct open_chapter_zone *open_chapter,
+					const struct uds_record_name *name)
+{
+	const unsigned int slot_count = open_chapter->slot_count;
+	unsigned int slot = uds_name_to_hash_slot(name, slot_count);
+	unsigned int step;
+
+	/*
+	 * Quadratic probing: each miss advances the probe by 1, 2, 3, etc. This performs better
+	 * than linear probing and works best for 2^N slots.
+	 */
+	for (step = 1; ; slot = (slot + step++) % slot_count) {
+		unsigned int record_number = open_chapter->slots[slot].record_number;
+		const struct uds_volume_record *candidate;
+
+		/* An empty slot means the chain ended without finding the name. */
+		if (record_number == 0)
+			break;
+
+		/* The deleted flags are indexed by record number, not by hash slot. */
+		if (open_chapter->slots[record_number].deleted)
+			continue;
+
+		candidate = &open_chapter->records[record_number];
+		if (memcmp(&candidate->name, name, UDS_RECORD_NAME_SIZE) == 0)
+			break;
+	}
+
+	return slot;
+}
+
+/*
+ * Look up a record name in an open chapter zone.
+ *
+ * @open_chapter: the zone to search
+ * @name: the record name to look for
+ * @metadata: receives a copy of the record data; written only when the name is found
+ * @found: set to true if the name is present and not deleted, false otherwise
+ */
+void uds_search_open_chapter(struct open_chapter_zone *open_chapter,
+			     const struct uds_record_name *name,
+			     struct uds_record_data *metadata, bool *found)
+{
+	unsigned int slot = probe_chapter_slots(open_chapter, name);
+	unsigned int record_number = open_chapter->slots[slot].record_number;
+
+	/* The probe ended on an empty slot, so there is no live record with this name. */
+	if (record_number == 0) {
+		*found = false;
+		return;
+	}
+
+	*found = true;
+	*metadata = open_chapter->records[record_number].data;
+}
+
+/*
+ * Add a record to the open chapter zone, or replace the metadata of a live record that already
+ * has the same name.
+ *
+ * Return: the number of records the zone can still accept. A zone that is already full stores
+ *         nothing and returns 0.
+ */
+int uds_put_open_chapter(struct open_chapter_zone *open_chapter,
+			 const struct uds_record_name *name,
+			 const struct uds_record_data *metadata)
+{
+	struct uds_volume_record *target;
+	unsigned int slot;
+	unsigned int record_number;
+
+	if (open_chapter->size >= open_chapter->capacity)
+		return 0;
+
+	slot = probe_chapter_slots(open_chapter, name);
+	record_number = open_chapter->slots[slot].record_number;
+	if (record_number == 0) {
+		/* A new name: claim the next record number and point the empty slot at it. */
+		open_chapter->size += 1;
+		record_number = open_chapter->size;
+		open_chapter->slots[slot].record_number = record_number;
+	}
+
+	target = &open_chapter->records[record_number];
+	target->data = *metadata;
+	target->name = *name;
+
+	return open_chapter->capacity - open_chapter->size;
+}
+
+/*
+ * Mark the live record with the given name as deleted. The record and its hash slot stay in
+ * place; only the deleted flag and the deletion count change. Unknown names are ignored.
+ */
+void uds_remove_from_open_chapter(struct open_chapter_zone *open_chapter,
+				  const struct uds_record_name *name)
+{
+	unsigned int slot = probe_chapter_slots(open_chapter, name);
+	unsigned int record_number = open_chapter->slots[slot].record_number;
+
+	if (record_number == 0)
+		return;
+
+	/* The deleted flag lives at the record number's index, not at the hash slot's index. */
+	open_chapter->deletions++;
+	open_chapter->slots[record_number].deleted = true;
+}
+
+/* Free an open chapter zone and its record array. A NULL zone is ignored. */
+void uds_free_open_chapter(struct open_chapter_zone *open_chapter)
+{
+	if (open_chapter == NULL)
+		return;
+
+	vdo_free(open_chapter->records);
+	vdo_free(open_chapter);
+}
+
+/*
+ * Interleave the records of every zone into one chapter-ordered array and map each record name
+ * to its record page number in the delta chapter index.
+ *
+ * @chapter_zones: the open chapter zones to collate
+ * @zone_count: the number of zones
+ * @index: the open chapter index to fill
+ * @collated_records: the output array; records_per_chapter entries are written
+ *
+ * Return: UDS_SUCCESS, UDS_BAD_STATE if padding is required but no zone is full, or the error
+ *         from adding a record to the chapter index
+ */
+static int fill_delta_chapter_index(struct open_chapter_zone **chapter_zones,
+				    unsigned int zone_count,
+				    struct open_chapter_index *index,
+				    struct uds_volume_record *collated_records)
+{
+	int result;
+	unsigned int records_per_chapter;
+	unsigned int records_per_page;
+	unsigned int record_index;
+	unsigned int records;
+	u32 page_number;
+	unsigned int z;
+	int overflow_count = 0;
+	struct uds_volume_record *fill_record = NULL;
+
+	/*
+	 * The record pages should not have any empty space, so find a record with which to fill
+	 * the chapter zone if it was closed early, and also to replace any deleted records. The
+	 * last record in any filled zone is guaranteed to not have been deleted, so use one of
+	 * those.
+	 */
+	for (z = 0; z < zone_count; z++) {
+		struct open_chapter_zone *zone = chapter_zones[z];
+
+		if (zone->size == zone->capacity) {
+			fill_record = &zone->records[zone->size];
+			break;
+		}
+	}
+
+	records_per_chapter = index->geometry->records_per_chapter;
+	records_per_page = index->geometry->records_per_page;
+
+	for (records = 0; records < records_per_chapter; records++) {
+		struct uds_volume_record *record = &collated_records[records];
+		struct open_chapter_zone *open_chapter;
+
+		/* The record arrays in the zones are 1-based. */
+		record_index = 1 + (records / zone_count);
+		page_number = records / records_per_page;
+		open_chapter = chapter_zones[records % zone_count];
+
+		/* Use the fill record in place of an unused record. */
+		if (record_index > open_chapter->size ||
+		    open_chapter->slots[record_index].deleted) {
+			/*
+			 * If no zone was full there is nothing to pad with. Report the broken
+			 * invariant instead of dereferencing a NULL fill record.
+			 */
+			if (fill_record == NULL) {
+				vdo_log_error_strerror(UDS_BAD_STATE,
+						       "no full open chapter zone to supply a fill record");
+				return UDS_BAD_STATE;
+			}
+
+			*record = *fill_record;
+			continue;
+		}
+
+		*record = open_chapter->records[record_index];
+		result = uds_put_open_chapter_index_record(index, &record->name,
+							   page_number);
+		switch (result) {
+		case UDS_SUCCESS:
+			break;
+		case UDS_OVERFLOW:
+			/* An overflowing entry is counted and skipped rather than treated as fatal. */
+			overflow_count++;
+			break;
+		default:
+			vdo_log_error_strerror(result,
+					       "failed to build open chapter index");
+			return result;
+		}
+	}
+
+	if (overflow_count > 0)
+		vdo_log_warning("Failed to add %d entries to chapter index",
+				overflow_count);
+
+	return UDS_SUCCESS;
+}
+
+/*
+ * Close a chapter: empty the chapter index for the given virtual chapter, collate the records of
+ * all zones into it, and pass the index and the collated records to uds_write_chapter().
+ *
+ * Return: UDS_SUCCESS or the first error from filling the index or writing the chapter
+ */
+int uds_close_open_chapter(struct open_chapter_zone **chapter_zones,
+			   unsigned int zone_count, struct volume *volume,
+			   struct open_chapter_index *chapter_index,
+			   struct uds_volume_record *collated_records,
+			   u64 virtual_chapter_number)
+{
+	int status;
+
+	uds_empty_open_chapter_index(chapter_index, virtual_chapter_number);
+
+	status = fill_delta_chapter_index(chapter_zones, zone_count, chapter_index,
+					  collated_records);
+	if (status == UDS_SUCCESS)
+		status = uds_write_chapter(volume, chapter_index, collated_records);
+
+	return status;
+}
+
+/*
+ * Save the open chapter of every zone through the buffered writer. The saved form is the magic
+ * bytes, the version, a little-endian 32-bit count of live records, and then the live records
+ * themselves, taken round-robin from the zones by record number.
+ *
+ * Return: UDS_SUCCESS or the first error reported by the writer
+ */
+int uds_save_open_chapter(struct uds_index *index, struct buffered_writer *writer)
+{
+	u8 encoded_count[sizeof(u32)];
+	u32 remaining = 0;
+	unsigned int record_number;
+	unsigned int z;
+	int status;
+
+	status = uds_write_to_buffered_writer(writer, OPEN_CHAPTER_MAGIC,
+					      OPEN_CHAPTER_MAGIC_LENGTH);
+	if (status != UDS_SUCCESS)
+		return status;
+
+	status = uds_write_to_buffered_writer(writer, OPEN_CHAPTER_VERSION,
+					      OPEN_CHAPTER_VERSION_LENGTH);
+	if (status != UDS_SUCCESS)
+		return status;
+
+	/* Deleted records are not saved, so they are left out of the count. */
+	for (z = 0; z < index->zone_count; z++) {
+		const struct open_chapter_zone *zone = index->zones[z]->open_chapter;
+
+		remaining += zone->size - zone->deletions;
+	}
+
+	put_unaligned_le32(remaining, encoded_count);
+	status = uds_write_to_buffered_writer(writer, encoded_count,
+					      sizeof(encoded_count));
+	if (status != UDS_SUCCESS)
+		return status;
+
+	/* Record numbers are 1-based; visit the same record number in each zone in turn. */
+	for (record_number = 1; remaining > 0; record_number++) {
+		for (z = 0; z < index->zone_count; z++) {
+			struct open_chapter_zone *zone = index->zones[z]->open_chapter;
+
+			if ((record_number > zone->size) ||
+			    zone->slots[record_number].deleted)
+				continue;
+
+			status = uds_write_to_buffered_writer(writer,
+							      (u8 *) &zone->records[record_number],
+							      sizeof(struct uds_volume_record));
+			if (status != UDS_SUCCESS)
+				return status;
+
+			remaining--;
+		}
+	}
+
+	return uds_flush_buffered_writer(writer);
+}
+
+/*
+ * Compute the number of bytes needed to save an open chapter holding a full chapter's worth of
+ * records: the magic bytes, the version, the 32-bit record count, and the records.
+ */
+u64 uds_compute_saved_open_chapter_size(struct index_geometry *geometry)
+{
+	unsigned int record_count = geometry->records_per_chapter;
+	size_t header_size = OPEN_CHAPTER_MAGIC_LENGTH + OPEN_CHAPTER_VERSION_LENGTH +
+			     sizeof(u32);
+
+	return header_size + record_count * sizeof(struct uds_volume_record);
+}
+
+/*
+ * Read the body of a saved open chapter: a little-endian 32-bit record count followed by that
+ * many records. Each record is added to the open chapter of the zone its name maps to.
+ *
+ * Return: UDS_SUCCESS or the first error reported by the reader
+ */
+static int load_version20(struct uds_index *index, struct buffered_reader *reader)
+{
+	u8 encoded_count[sizeof(u32)];
+	struct uds_volume_record record;
+	u32 remaining_records;
+	int status;
+
+	/*
+	 * Track which zones cannot accept any more records. If the open chapter had a different
+	 * number of zones previously, some new zones may have more records than they have space
+	 * for. These overflow records will be discarded.
+	 */
+	bool zone_is_full[MAX_ZONES] = {
+		false,
+	};
+
+	status = uds_read_from_buffered_reader(reader, encoded_count,
+					       sizeof(encoded_count));
+	if (status != UDS_SUCCESS)
+		return status;
+
+	for (remaining_records = get_unaligned_le32(encoded_count);
+	     remaining_records > 0;
+	     remaining_records--) {
+		struct open_chapter_zone *open_chapter;
+		unsigned int space_left;
+		unsigned int zone = 0;
+
+		status = uds_read_from_buffered_reader(reader, (u8 *) &record,
+						       sizeof(record));
+		if (status != UDS_SUCCESS)
+			return status;
+
+		/* With a single zone there is no need to consult the volume index. */
+		if (index->zone_count > 1)
+			zone = uds_get_volume_index_zone(index->volume_index,
+							 &record.name);
+
+		/* The record is still read from storage, but a full zone discards it. */
+		if (zone_is_full[zone])
+			continue;
+
+		open_chapter = index->zones[zone]->open_chapter;
+		space_left = uds_put_open_chapter(open_chapter, &record.name,
+						  &record.data);
+		/* Do not allow any zone to fill completely. */
+		zone_is_full[zone] = (space_left <= 1);
+	}
+
+	return UDS_SUCCESS;
+}
+
+/*
+ * Load a saved open chapter: verify the magic bytes, check that the saved version matches
+ * OPEN_CHAPTER_VERSION, and then read the records.
+ *
+ * Return: UDS_SUCCESS, UDS_CORRUPT_DATA for an unrecognized version, or an error from the reader
+ */
+int uds_load_open_chapter(struct uds_index *index, struct buffered_reader *reader)
+{
+	u8 saved_version[OPEN_CHAPTER_VERSION_LENGTH];
+	int status;
+
+	status = uds_verify_buffered_data(reader, OPEN_CHAPTER_MAGIC,
+					  OPEN_CHAPTER_MAGIC_LENGTH);
+	if (status != UDS_SUCCESS)
+		return status;
+
+	status = uds_read_from_buffered_reader(reader, saved_version,
+					       sizeof(saved_version));
+	if (status != UDS_SUCCESS)
+		return status;
+
+	if (memcmp(OPEN_CHAPTER_VERSION, saved_version, sizeof(saved_version)) == 0)
+		return load_version20(index, reader);
+
+	/* The version bytes are not NUL-terminated, so the log format bounds their length. */
+	return vdo_log_error_strerror(UDS_CORRUPT_DATA,
+				      "Invalid open chapter version: %.*s",
+				      (int) sizeof(saved_version), saved_version);
+}
diff --git a/drivers/md/dm-vdo/indexer/open-chapter.h b/drivers/md/dm-vdo/indexer/open-chapter.h
new file mode 100644
index 000000000000..a4250bb19525
--- /dev/null
+++ b/drivers/md/dm-vdo/indexer/open-chapter.h
@@ -0,0 +1,79 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#ifndef UDS_OPEN_CHAPTER_H
+#define UDS_OPEN_CHAPTER_H
+
+#include "chapter-index.h"
+#include "geometry.h"
+#include "index.h"
+#include "volume.h"
+
+/*
+ * The open chapter tracks the newest records in memory. Like the index as a whole, each open
+ * chapter is divided into a number of independent zones which are interleaved when the chapter is
+ * committed to the volume.
+ */
+
+/* The width in bits of the record_number bit-field in struct open_chapter_zone_slot. */
+enum {
+ OPEN_CHAPTER_RECORD_NUMBER_BITS = 23,
+};
+
+/*
+ * One packed entry of the open chapter hash table. The two fields describe different things: the
+ * record number belongs to the hash slot at this index, while the deleted flag belongs to the
+ * record whose record number equals this index.
+ */
+struct open_chapter_zone_slot {
+ /* If non-zero, the record number addressed by this hash slot */
+ unsigned int record_number : OPEN_CHAPTER_RECORD_NUMBER_BITS;
+ /* If true, the record at the index of this hash slot was deleted */
+ bool deleted : 1;
+} __packed;
+
+/* The open chapter state for one zone: a 1-based array of records plus its hash table. */
+struct open_chapter_zone {
+ /* The maximum number of records that can be stored */
+ unsigned int capacity;
+ /* The number of records stored */
+ unsigned int size;
+ /* The number of deleted records */
+ unsigned int deletions;
+ /* Array of chunk records, 1-based */
+ struct uds_volume_record *records;
+ /* The number of slots in the hash table */
+ unsigned int slot_count;
+ /* The hash table slots, referencing virtual record numbers */
+ struct open_chapter_zone_slot slots[];
+};
+
+int __must_check uds_make_open_chapter(const struct index_geometry *geometry,
+ unsigned int zone_count,
+ struct open_chapter_zone **open_chapter_ptr);
+
+void uds_reset_open_chapter(struct open_chapter_zone *open_chapter);
+
+void uds_search_open_chapter(struct open_chapter_zone *open_chapter,
+ const struct uds_record_name *name,
+ struct uds_record_data *metadata, bool *found);
+
+int __must_check uds_put_open_chapter(struct open_chapter_zone *open_chapter,
+ const struct uds_record_name *name,
+ const struct uds_record_data *metadata);
+
+void uds_remove_from_open_chapter(struct open_chapter_zone *open_chapter,
+ const struct uds_record_name *name);
+
+void uds_free_open_chapter(struct open_chapter_zone *open_chapter);
+
+int __must_check uds_close_open_chapter(struct open_chapter_zone **chapter_zones,
+ unsigned int zone_count, struct volume *volume,
+ struct open_chapter_index *chapter_index,
+ struct uds_volume_record *collated_records,
+ u64 virtual_chapter_number);
+
+int __must_check uds_save_open_chapter(struct uds_index *index,
+ struct buffered_writer *writer);
+
+int __must_check uds_load_open_chapter(struct uds_index *index,
+ struct buffered_reader *reader);
+
+u64 uds_compute_saved_open_chapter_size(struct index_geometry *geometry);
+
+#endif /* UDS_OPEN_CHAPTER_H */
diff --git a/drivers/md/dm-vdo/indexer/radix-sort.c b/drivers/md/dm-vdo/indexer/radix-sort.c
new file mode 100644
index 000000000000..66b8c706a1ef
--- /dev/null
+++ b/drivers/md/dm-vdo/indexer/radix-sort.c
@@ -0,0 +1,330 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#include "radix-sort.h"
+
+#include <linux/limits.h>
+#include <linux/types.h>
+
+#include "memory-alloc.h"
+#include "string-utils.h"
+
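+/*
+ * This implementation allocates one large object to do the sorting, which can be reused as many
+ * times as desired. The task stack in that object has one entry for every
+ * INSERTION_SORT_THRESHOLD keys the sorter is created to handle, so the amount of memory required
+ * is linearly proportional to the maximum number of keys to be sorted.
+ */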
+
+/*
+ * Piles with no more keys than this are handled with a simple insertion sort. The same value
+ * divides the key count to size the task stack in uds_make_radix_sorter().
+ */
+#define INSERTION_SORT_THRESHOLD 12
+
+/* Sort keys are pointers to immutable fixed-length arrays of bytes. */
+typedef const u8 *sort_key_t;
+
+/*
+ * The keys are separated into piles based on the byte in each key at the current offset, so the
+ * number of keys with each byte value must be counted.
+ */
+struct histogram {
+ /* The number of non-empty bins */
+ u16 used;
+ /* The index (key byte) of the first non-empty bin */
+ u16 first;
+ /* The index (key byte) of the last non-empty bin */
+ u16 last;
+ /* The number of occurrences of each specific byte */
+ u32 size[256];
+};
+
+/*
+ * Sub-tasks are manually managed on a stack, both for performance and to put a logarithmic bound
+ * on the stack space needed. A task describes an inclusive range of keys, [first_key, last_key],
+ * and the portion of each key that still has to be compared.
+ */
+struct task {
+ /* Pointer to the first key to sort. */
+ sort_key_t *first_key;
+ /* Pointer to the last key to sort. */
+ sort_key_t *last_key;
+ /* The offset into the key at which to continue sorting. */
+ u16 offset;
+ /* The number of bytes remaining in the sort keys. */
+ u16 length;
+};
+
+/* The reusable working state of a radix sort, allocated once by uds_make_radix_sorter(). */
+struct radix_sorter {
+ /* The largest number of keys uds_radix_sort() will accept for a radix pass */
+ unsigned int count;
+ /* The histogram for the task being processed; left zeroed between tasks */
+ struct histogram bins;
+ /* For each key byte value, a pointer used to place keys into that byte's pile */
+ sort_key_t *pile[256];
+ /* One past the last usable entry of stack[] */
+ struct task *end_of_stack;
+ /* Small piles from the current task, to be finished with an insertion sort */
+ struct task insertion_list[256];
+ /* The stack of pending radix sort tasks */
+ struct task stack[];
+};
+
+/*
+ * Compare length bytes of two keys, beginning offset bytes into each key. The result has the
+ * same sign convention as memcmp().
+ */
+static inline int compare(sort_key_t key1, sort_key_t key2, u16 offset, u16 length)
+{
+	return memcmp(key1 + offset, key2 + offset, length);
+}
+
+/*
+ * Insert the next unsorted key into an array of sorted keys.
+ *
+ * @task: the task whose keys from first_key up to (but not including) next are already sorted
+ * @next: the key to insert; on return the keys from first_key through next are sorted
+ */
+static inline void insert_key(const struct task task, sort_key_t *next)
+{
+	/* Pull the unsorted key out, freeing up the array slot. */
+	sort_key_t unsorted = *next;
+	sort_key_t *slot = next;
+
+	/*
+	 * Shift larger sorted entries up into the free slot. The bound is tested before stepping
+	 * back, so no pointer before the start of the key array is ever formed.
+	 */
+	while ((slot > task.first_key) &&
+	       (compare(unsorted, slot[-1], task.offset, task.length) < 0)) {
+		slot[0] = slot[-1];
+		slot--;
+	}
+
+	/* Insert the key into the last slot that was cleared, sorting it. */
+	*slot = unsorted;
+}
+
+/*
+ * Sort the keys of a task with an insertion sort. For a small number of keys this simple sort is
+ * faster than the 256-way radix sort.
+ */
+static inline void insertion_sort(const struct task task)
+{
+	sort_key_t *unsorted = task.first_key;
+
+	/* The first key is trivially sorted by itself, so insertion starts with the second. */
+	while (++unsorted <= task.last_key)
+		insert_key(task, unsorted);
+}
+
+/*
+ * Push a sorting task covering count keys starting at first_key onto a task stack, advancing the
+ * stack pointer past the new entry. The caller must ensure there is room for the entry.
+ */
+static inline void push_task(struct task **stack_pointer, sort_key_t *first_key,
+			     u32 count, u16 offset, u16 length)
+{
+	struct task *top = *stack_pointer;
+
+	*top = (struct task) {
+		.first_key = first_key,
+		.last_key = first_key + (count - 1),
+		.offset = offset,
+		.length = length,
+	};
+	*stack_pointer = top + 1;
+}
+
+/* Exchange the two key pointers stored at a and b. */
+static inline void swap_keys(sort_key_t *a, sort_key_t *b)
+{
+	sort_key_t saved = *b;
+
+	*b = *a;
+	*a = saved;
+}
+
+/*
+ * Tally how often each byte value occurs at the task's current offset across all of its keys,
+ * and record how many bins are non-empty along with the lowest and highest non-empty bin.
+ */
+static inline void measure_bins(const struct task task, struct histogram *bins)
+{
+	sort_key_t *cursor = task.first_key;
+
+	/*
+	 * Subtle invariant: bins->used and bins->size[] are zero because the sorting code clears
+	 * it all out as it goes. Even though this structure is re-used, we don't need to pay to
+	 * zero it before starting a new tally.
+	 */
+	bins->first = U8_MAX;
+	bins->last = 0;
+
+	while (cursor <= task.last_key) {
+		/* The byte at the current offset selects the bin for this key. */
+		u8 bin = (*cursor++)[task.offset];
+
+		/* Only the first key to land in a bin changes the set of non-empty bins. */
+		if (bins->size[bin]++ != 0)
+			continue;
+
+		bins->used += 1;
+		if (bin < bins->first)
+			bins->first = bin;
+
+		if (bin > bins->last)
+			bins->last = bin;
+	}
+}
+
+/*
+ * Convert the bin sizes to pointers to where each pile goes.
+ *
+ * pile[0] = first_key + bin->size[0],
+ * pile[1] = pile[0] + bin->size[1], etc.
+ *
+ * After the keys are moved to the appropriate pile, we'll need to sort each of the piles by the
+ * next radix position. A new task is put on the stack for each pile containing lots of keys, or a
+ * new task is put on the list for each pile containing few keys. Piles with a single key need no
+ * further sorting and get no task.
+ *
+ * This consumes bins->used, counting it down to zero, but leaves bins->size[] for the caller.
+ *
+ * @stack: pointer to the top of the stack
+ * @end_of_stack: the end of the stack
+ * @list: pointer to the head of the list
+ * @pile: array for pointers to the end of each pile
+ * @bins: the histogram of the sizes of each pile
+ * @first_key: the first key of the task being divided into piles
+ * @offset: the next radix position to sort by
+ * @length: the number of bytes remaining in the sort keys
+ *
+ * Return: UDS_SUCCESS, or UDS_BAD_STATE if the stack has no room for another task
+ */
+static inline int push_bins(struct task **stack, struct task *end_of_stack,
+ struct task **list, sort_key_t *pile[],
+ struct histogram *bins, sort_key_t *first_key,
+ u16 offset, u16 length)
+{
+ sort_key_t *pile_start = first_key;
+ int bin;
+
+ /* The loop has no bound on bin; it ends once every non-empty bin has been visited. */
+ for (bin = bins->first; ; bin++) {
+ u32 size = bins->size[bin];
+
+ /* Skip empty piles. */
+ if (size == 0)
+ continue;
+
+ /* There's no need to sort empty keys. */
+ if (length > 0) {
+ if (size > INSERTION_SORT_THRESHOLD) {
+ if (*stack >= end_of_stack)
+ return UDS_BAD_STATE;
+
+ push_task(stack, pile_start, size, offset, length);
+ } else if (size > 1) {
+ push_task(list, pile_start, size, offset, length);
+ }
+ }
+
+ /* Each pile begins where the previous one ends; pile[bin] records this pile's end. */
+ pile_start += size;
+ pile[bin] = pile_start;
+ if (--bins->used == 0)
+ break;
+ }
+
+ return UDS_SUCCESS;
+}
+
+/*
+ * Allocate a radix sorter able to sort up to count keys. The task stack is allocated as the
+ * trailing array of the sorter, with one entry per INSERTION_SORT_THRESHOLD keys.
+ *
+ * @count: the maximum number of keys the sorter will be asked to sort
+ * @sorter: set to the new sorter on success
+ *
+ * Return: UDS_SUCCESS or the error from a failed allocation
+ */
+int uds_make_radix_sorter(unsigned int count, struct radix_sorter **sorter)
+{
+	struct radix_sorter *new_sorter;
+	unsigned int stack_entries = count / INSERTION_SORT_THRESHOLD;
+	int status;
+
+	status = vdo_allocate_extended(struct radix_sorter, stack_entries, struct task,
+				       __func__, &new_sorter);
+	if (status != VDO_SUCCESS)
+		return status;
+
+	new_sorter->end_of_stack = &new_sorter->stack[stack_entries];
+	new_sorter->count = count;
+	*sorter = new_sorter;
+	return UDS_SUCCESS;
+}
+
+/* Release a sorter allocated by uds_make_radix_sorter(). */
+void uds_free_radix_sorter(struct radix_sorter *sorter)
+{
+ vdo_free(sorter);
+}
+
+/*
+ * Sort pointers to fixed-length keys (arrays of bytes) using a radix sort. The sort implementation
+ * is unstable, so the relative ordering of equal keys is not preserved.
+ *
+ * @sorter: the working state, from uds_make_radix_sorter()
+ * @keys: the array of key pointers, which is reordered in place
+ * @count: the number of keys in the array
+ * @length: the number of bytes in each key
+ *
+ * Return: UDS_SUCCESS; UDS_INVALID_ARGUMENT if count is above INSERTION_SORT_THRESHOLD and also
+ *         larger than the count the sorter was made for; or UDS_BAD_STATE if the task stack
+ *         overflows
+ */
+int uds_radix_sort(struct radix_sorter *sorter, const unsigned char *keys[],
+ unsigned int count, unsigned short length)
+{
+ struct task start;
+ struct histogram *bins = &sorter->bins;
+ sort_key_t **pile = sorter->pile;
+ struct task *task_stack = sorter->stack;
+
+ /* All zero-length keys are identical and therefore already sorted. */
+ if ((count == 0) || (length == 0))
+ return UDS_SUCCESS;
+
+ /* The initial task is to sort the entire length of all the keys. */
+ start = (struct task) {
+ .first_key = keys,
+ .last_key = &keys[count - 1],
+ .offset = 0,
+ .length = length,
+ };
+
+ /* Small inputs never touch the task stack, so they are sorted before checking its size. */
+ if (count <= INSERTION_SORT_THRESHOLD) {
+ insertion_sort(start);
+ return UDS_SUCCESS;
+ }
+
+ if (count > sorter->count)
+ return UDS_INVALID_ARGUMENT;
+
+ /*
+ * Repeatedly consume a sorting task from the stack and process it, pushing new sub-tasks
+ * onto the stack for each radix-sorted pile. When all tasks and sub-tasks have been
+ * processed, the stack will be empty and all the keys in the starting task will be fully
+ * sorted.
+ */
+ for (*task_stack = start; task_stack >= sorter->stack; task_stack--) {
+ /* Copy the task out first: push_bins() reuses its stack slot for the sub-tasks. */
+ const struct task task = *task_stack;
+ struct task *insertion_task_list;
+ int result;
+ sort_key_t *fence;
+ sort_key_t *end;
+
+ measure_bins(task, bins);
+
+ /*
+ * Now that we know how large each bin is, generate pointers for each of the piles
+ * and push a new task to sort each pile by the next radix byte.
+ */
+ insertion_task_list = sorter->insertion_list;
+ result = push_bins(&task_stack, sorter->end_of_stack,
+ &insertion_task_list, pile, bins, task.first_key,
+ task.offset + 1, task.length - 1);
+ if (result != UDS_SUCCESS) {
+ /* Restore the all-zero histogram that measure_bins() relies on. */
+ memset(bins, 0, sizeof(*bins));
+ return result;
+ }
+
+ /* Now bins->used is zero again. */
+
+ /*
+ * Don't bother processing the last pile: when piles 0..N-1 are all in place, then
+ * pile N must also be in place.
+ */
+ end = task.last_key - bins->size[bins->last];
+ bins->size[bins->last] = 0;
+
+ for (fence = task.first_key; fence <= end; ) {
+ u8 bin;
+ sort_key_t key = *fence;
+
+ /*
+ * The radix byte of the key tells us which pile it belongs in. Swap it for
+ * an unprocessed item just below that pile, and repeat.
+ */
+ while (--pile[bin = key[task.offset]] > fence)
+ swap_keys(pile[bin], &key);
+
+ /*
+ * The pile reached the fence. Put the key at the bottom of that pile,
+ * completing it, and advance the fence to the next pile.
+ */
+ *fence = key;
+ fence += bins->size[bin];
+ bins->size[bin] = 0;
+ }
+
+ /* Now bins->size[] is all zero again. */
+
+ /*
+ * When the number of keys in a task gets small enough, it is faster to use an
+ * insertion sort than to keep subdividing into tiny piles.
+ */
+ while (--insertion_task_list >= sorter->insertion_list)
+ insertion_sort(*insertion_task_list);
+ }
+
+ return UDS_SUCCESS;
+}
diff --git a/drivers/md/dm-vdo/indexer/radix-sort.h b/drivers/md/dm-vdo/indexer/radix-sort.h
new file mode 100644
index 000000000000..812949bc2cee
--- /dev/null
+++ b/drivers/md/dm-vdo/indexer/radix-sort.h
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#ifndef UDS_RADIX_SORT_H
+#define UDS_RADIX_SORT_H
+
+/*
+ * Radix sort is implemented using an American Flag sort, an unstable, in-place 8-bit radix
+ * exchange sort. This is adapted from the algorithm in the paper by Peter M. McIlroy, Keith
+ * Bostic, and M. Douglas McIlroy, "Engineering Radix Sort".
+ *
+ * http://www.usenix.org/publications/compsystems/1993/win_mcilroy.pdf
+ */
+
+struct radix_sorter;
+
+int __must_check uds_make_radix_sorter(unsigned int count, struct radix_sorter **sorter);
+
+void uds_free_radix_sorter(struct radix_sorter *sorter);
+
+int __must_check uds_radix_sort(struct radix_sorter *sorter, const unsigned char *keys[],
+ unsigned int count, unsigned short length);
+
+#endif /* UDS_RADIX_SORT_H */
diff --git a/drivers/md/dm-vdo/indexer/sparse-cache.c b/drivers/md/dm-vdo/indexer/sparse-cache.c
new file mode 100644
index 000000000000..28920167827c
--- /dev/null
+++ b/drivers/md/dm-vdo/indexer/sparse-cache.c
@@ -0,0 +1,624 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#include "sparse-cache.h"
+
+#include <linux/cache.h>
+#include <linux/delay.h>
+#include <linux/dm-bufio.h>
+
+#include "logger.h"
+#include "memory-alloc.h"
+#include "permassert.h"
+
+#include "chapter-index.h"
+#include "config.h"
+#include "index.h"
+
+/*
+ * Since the cache is small, it is implemented as a simple array of cache entries. Searching for a
+ * specific virtual chapter is implemented as a linear search. The cache replacement policy is
+ * least-recently-used (LRU). Again, the small size of the cache allows the LRU order to be
+ * maintained by shifting entries in an array list.
+ *
+ * Changing the contents of the cache requires the coordinated participation of all zone threads
+ * via the careful use of barrier messages sent to all the index zones by the triage queue worker
+ * thread. The critical invariant for coordination is that the cache membership must not change
+ * between updates, so that all calls to uds_sparse_cache_contains() from the zone threads must all
+ * receive the same results for every virtual chapter number. To ensure that critical invariant,
+ * state changes such as "that virtual chapter is no longer in the volume" and "skip searching that
+ * chapter because it has had too many cache misses" are represented separately from the cache
+ * membership information (the virtual chapter number).
+ *
+ * As a result of this invariant, we have the guarantee that every zone thread will call
+ * uds_update_sparse_cache() once and exactly once to request a chapter that is not in the cache,
+ * and the serialization of the barrier requests from the triage queue ensures they will all
+ * request the same chapter number. This means the only synchronization we need can be provided by
+ * a pair of thread barriers used only in the uds_update_sparse_cache() call, providing a critical
+ * section where a single zone thread can drive the cache update while all the other zone threads
+ * are known to be blocked, waiting in the second barrier. Outside that critical section, all the
+ * zone threads implicitly hold a shared lock. Inside it, the thread for zone zero holds an
+ * exclusive lock. No other threads may access or modify the cache entries.
+ *
+ * Chapter statistics must only be modified by a single thread, which is also the zone zero thread.
+ * All fields that might be frequently updated by that thread are kept in separate cache-aligned
+ * structures so they will not cause cache contention via "false sharing" with the fields that are
+ * frequently accessed by all of the zone threads.
+ *
+ * The LRU order is managed independently by each zone thread, and each zone uses its own list for
+ * searching and cache membership queries. The zone zero list is used to decide which chapter to
+ * evict when the cache is updated, and its search list is copied to the other threads at that
+ * time.
+ *
+ * The virtual chapter number field of the cache entry is the single field indicating whether a
+ * chapter is a member of the cache or not. The value NO_CHAPTER is used to represent a null or
+ * undefined chapter number. When present in the virtual chapter number field of a
+ * cached_chapter_index, it indicates that the cache entry is dead, and all the other fields of
+ * that entry (other than immutable pointers to cache memory) are undefined and irrelevant. Any
+ * cache entry that is not marked as dead is fully defined and a member of the cache, and
+ * uds_sparse_cache_contains() will always return true for any virtual chapter number that appears
+ * in any of the cache entries.
+ *
+ * A chapter index that is a member of the cache may be excluded from searches between calls to
+ * uds_update_sparse_cache() in two different ways. First, when a chapter falls off the end of the
+ * volume, its virtual chapter number will be less than the oldest virtual chapter number. Since
+ * that chapter is no longer part of the volume, there's no point in continuing to search that
+ * chapter index. Once invalidated, that virtual chapter will still be considered a member of the
+ * cache, but it will no longer be searched for matching names.
+ *
+ * The second mechanism is a heuristic based on keeping track of the number of consecutive search
+ * misses in a given chapter index. Once that count exceeds a threshold, the skip_search flag will
+ * be set to true, causing the chapter to be skipped when searching the entire cache, but still
+ * allowing it to be found when searching for a hook in that specific chapter. Finding a hook will
+ * clear the skip_search flag, once again allowing the non-hook searches to use that cache entry.
+ * Again, regardless of the state of the skip_search flag, the virtual chapter must still be
+ * considered to be a member of the cache for uds_sparse_cache_contains().
+ */
+
+/*
+ * The base number of consecutive search misses after which a chapter is skipped when searching
+ * the entire cache. uds_make_sparse_cache() divides this by the zone count to get the threshold
+ * that is actually applied.
+ */
+#define SKIP_SEARCH_THRESHOLD 20000
+/* The id of the zone whose thread keeps the miss counters and performs cache updates. */
+#define ZONE_ZERO 0
+
+/*
+ * These counters are essentially fields of the struct cached_chapter_index, but are segregated
+ * into this structure because they are frequently modified. They are grouped and aligned to keep
+ * them on different cache lines from the chapter fields that are accessed far more often than they
+ * are updated.
+ */
+struct __aligned(L1_CACHE_BYTES) cached_index_counters {
+ /*
+ * The number of searches of this chapter that have missed since the last hit. It is
+ * incremented by score_search_miss() and reset to zero by score_search_hit().
+ */
+ u64 consecutive_misses;
+};
+
+/* One entry of the sparse cache: the index pages of a single cached chapter. */
+struct __aligned(L1_CACHE_BYTES) cached_chapter_index {
+ /*
+ * The virtual chapter number of the cached chapter index. NO_CHAPTER means this cache
+ * entry is unused. This field must only be modified in the critical section in
+ * uds_update_sparse_cache().
+ */
+ u64 virtual_chapter;
+
+ /*
+ * The number of elements in each of the two arrays below. It is set from the geometry's
+ * index_pages_per_chapter when the entry is initialized.
+ */
+ u32 index_pages_count;
+
+ /*
+ * These pointers are immutable during the life of the cache. The contents of the arrays
+ * change when the cache entry is replaced.
+ */
+ struct delta_index_page *index_pages;
+ struct dm_buffer **page_buffers;
+
+ /*
+ * If set, skip the chapter when searching the entire cache. This flag is just a
+ * performance optimization. This flag is mutable between cache updates, but it rarely
+ * changes and is frequently accessed, so it groups with the immutable fields.
+ */
+ bool skip_search;
+
+ /*
+ * The cache-aligned counters change often and are placed at the end of the structure to
+ * prevent false sharing with the more stable fields above.
+ */
+ struct cached_index_counters counters;
+};
+
+/*
+ * A search_list represents an ordering of the sparse chapter index cache entry array, from most
+ * recently accessed to least recently accessed, which is the order in which the indexes should be
+ * searched and the reverse order in which they should be evicted from the cache.
+ *
+ * Cache entries that are dead or empty are kept at the end of the list, avoiding the need to even
+ * iterate over them to search, and ensuring that dead entries are replaced before any live entries
+ * are evicted.
+ *
+ * The search list is instantiated for each zone thread, avoiding any need for synchronization. The
+ * structure is allocated on a cache boundary to avoid false sharing of memory cache lines between
+ * zone threads.
+ */
+struct search_list {
+ /* The number of pointers in entries[]; copied from the capacity of the sparse cache */
+ u8 capacity;
+ /* The index of the first dead entry; searches only visit the entries before it */
+ u8 first_dead_entry;
+ /* Pointers to the cache's chapter entries, in search order */
+ struct cached_chapter_index *entries[];
+};
+
+/*
+ * A barrier for a fixed number of threads, built from two semaphores. The arrival count is reset
+ * when the last thread arrives, so the same barrier can be entered again for the next update.
+ * See enter_threads_barrier().
+ */
+struct threads_barrier {
+ /* Lock for this barrier object */
+ struct semaphore lock;
+ /* Semaphore for threads waiting at this barrier */
+ struct semaphore wait;
+ /* Number of threads which have arrived */
+ int arrived;
+ /* Total number of threads using this barrier */
+ int thread_count;
+};
+
+/* The sparse cache itself: the cached chapter entries plus the per-zone search state. */
+struct sparse_cache {
+ /* The geometry supplied when the cache was made; passed along when searching a chapter */
+ const struct index_geometry *geometry;
+ /* The number of entries in chapters[] */
+ unsigned int capacity;
+ /* The number of zones, which is also the number of search lists and barrier participants */
+ unsigned int zone_count;
+
+ /* The consecutive miss count above which a chapter is flagged skip_search */
+ unsigned int skip_threshold;
+ /* One search list per zone; only the first zone_count slots are allocated */
+ struct search_list *search_lists[MAX_ZONES];
+ /* Temporary space for purge_search_list(): room for twice capacity entry pointers */
+ struct cached_chapter_index **scratch_entries;
+
+ /* The barriers bracketing the critical section in uds_update_sparse_cache() */
+ struct threads_barrier begin_update_barrier;
+ struct threads_barrier end_update_barrier;
+
+ /* The cache entries, allocated together with this structure */
+ struct cached_chapter_index chapters[];
+};
+
+/* Prepare a barrier for use by thread_count threads, none of which has arrived yet. */
+static void initialize_threads_barrier(struct threads_barrier *barrier,
+				       unsigned int thread_count)
+{
+	barrier->thread_count = thread_count;
+	barrier->arrived = 0;
+
+	/* The lock starts out available; the wait semaphore starts with nothing to take. */
+	sema_init(&barrier->lock, 1);
+	sema_init(&barrier->wait, 0);
+}
+
+/*
+ * Acquire a semaphore, retrying until it is obtained. down_interruptible() is used in place of
+ * down() so that a long wait does not produce 120 second stall messages in kern.log.
+ */
+static inline void __down(struct semaphore *semaphore)
+{
+	for (;;) {
+		if (down_interruptible(semaphore) == 0)
+			return;
+
+		/*
+		 * The wait was interrupted. This happens when the caller is a user-mode process
+		 * (e.g., "dmsetup remove") waiting on a slow operation (e.g., UDS index save) and
+		 * a signal (SIGINT, SIGUSR2) is sent: down_interruptible will then not block.
+		 * Sleep briefly before retrying so this loop does not keep the CPU locked up.
+		 * Calling cond_resched instead would still consume CPU time slices and swamp
+		 * other threads trying to do computational work.
+		 */
+		fsleep(1000);
+	}
+}
+
+/*
+ * Wait at the barrier until all of its threads have arrived. The last thread to arrive wakes the
+ * others and resets the arrival count instead of waiting.
+ */
+static void enter_threads_barrier(struct threads_barrier *barrier)
+{
+	int released;
+
+	__down(&barrier->lock);
+	barrier->arrived++;
+	if (barrier->arrived != barrier->thread_count) {
+		/* Not the last thread: drop the lock and wait to be released. */
+		up(&barrier->lock);
+		__down(&barrier->wait);
+		return;
+	}
+
+	/* Last thread: release every other thread, one up() for each. */
+	for (released = 1; released < barrier->thread_count; released++)
+		up(&barrier->wait);
+
+	barrier->arrived = 0;
+	up(&barrier->lock);
+}
+
+/*
+ * Mark a cache entry as unused and allocate its two per-page arrays, sized by the number of index
+ * pages per chapter in the geometry. Returns the result of the first allocation that fails, or
+ * of the second allocation otherwise.
+ */
+static int __must_check initialize_cached_chapter_index(struct cached_chapter_index *chapter,
+							const struct index_geometry *geometry)
+{
+	u32 page_count = geometry->index_pages_per_chapter;
+	int result;
+
+	chapter->virtual_chapter = NO_CHAPTER;
+	chapter->index_pages_count = page_count;
+
+	result = vdo_allocate(page_count, struct delta_index_page, __func__,
+			      &chapter->index_pages);
+	if (result == VDO_SUCCESS)
+		result = vdo_allocate(page_count, struct dm_buffer *,
+				      "sparse index volume pages", &chapter->page_buffers);
+
+	return result;
+}
+
+/*
+ * Allocate a search list for one zone and return it through *list_ptr. The new list refers to
+ * the cache entries in array order and has no live entries.
+ */
+static int __must_check make_search_list(struct sparse_cache *cache,
+					 struct search_list **list_ptr)
+{
+	struct search_list *list;
+	unsigned int bytes = (sizeof(struct search_list) +
+			      (cache->capacity * sizeof(struct cached_chapter_index *)));
+	u8 slot = 0;
+	int result;
+
+	result = vdo_allocate_cache_aligned(bytes, "search list", &list);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	/* Every entry starts out dead, so the dead region begins at index zero. */
+	list->first_dead_entry = 0;
+	list->capacity = cache->capacity;
+
+	while (slot < list->capacity) {
+		list->entries[slot] = &cache->chapters[slot];
+		slot++;
+	}
+
+	*list_ptr = list;
+	return UDS_SUCCESS;
+}
+
+/*
+ * Allocate and initialize a sparse cache with room for capacity chapter indexes and one search
+ * list for each of zone_count zones. On success the cache is returned through *cache_ptr. If any
+ * allocation after the first fails, everything allocated so far is freed before returning.
+ */
+int uds_make_sparse_cache(const struct index_geometry *geometry, unsigned int capacity,
+			  unsigned int zone_count, struct sparse_cache **cache_ptr)
+{
+	struct sparse_cache *cache;
+	unsigned int bytes = (sizeof(struct sparse_cache) +
+			      (capacity * sizeof(struct cached_chapter_index)));
+	unsigned int slot;
+	unsigned int zone;
+	int result;
+
+	result = vdo_allocate_cache_aligned(bytes, "sparse cache", &cache);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	cache->zone_count = zone_count;
+	cache->capacity = capacity;
+	cache->geometry = geometry;
+
+	/*
+	 * Only zone zero counts cache misses, while requests are spread over all of the zones,
+	 * so the skip threshold is scaled down by the number of zones.
+	 */
+	cache->skip_threshold = (SKIP_SEARCH_THRESHOLD / zone_count);
+
+	initialize_threads_barrier(&cache->end_update_barrier, zone_count);
+	initialize_threads_barrier(&cache->begin_update_barrier, zone_count);
+
+	for (slot = 0; slot < capacity; slot++) {
+		result = initialize_cached_chapter_index(&cache->chapters[slot], geometry);
+		if (result != UDS_SUCCESS)
+			goto fail;
+	}
+
+	for (zone = 0; zone < zone_count; zone++) {
+		result = make_search_list(cache, &cache->search_lists[zone]);
+		if (result != UDS_SUCCESS)
+			goto fail;
+	}
+
+	/* Scratch space for the temporary lists purge_search_list() sorts into. */
+	result = vdo_allocate(capacity * 2, struct cached_chapter_index *,
+			      "scratch entries", &cache->scratch_entries);
+	if (result != VDO_SUCCESS)
+		goto fail;
+
+	*cache_ptr = cache;
+	return UDS_SUCCESS;
+
+fail:
+	uds_free_sparse_cache(cache);
+	return result;
+}
+
+/* Set the skip_search flag of a chapter, writing it only when the value actually changes. */
+static inline void set_skip_search(struct cached_chapter_index *chapter,
+				   bool skip_search)
+{
+	/* Leaving an already-correct flag untouched reduces cache line contention. */
+	if (READ_ONCE(chapter->skip_search) == skip_search)
+		return;
+
+	WRITE_ONCE(chapter->skip_search, skip_search);
+}
+
+/* Record a search hit: clear the chapter's miss streak and make it searchable again. */
+static void score_search_hit(struct cached_chapter_index *chapter)
+{
+	struct cached_index_counters *counters = &chapter->counters;
+
+	counters->consecutive_misses = 0;
+	set_skip_search(chapter, false);
+}
+
+/*
+ * Record a search miss: extend the chapter's miss streak, and flag the chapter to be skipped
+ * once the streak exceeds the cache's skip threshold.
+ */
+static void score_search_miss(struct sparse_cache *cache,
+			      struct cached_chapter_index *chapter)
+{
+	struct cached_index_counters *counters = &chapter->counters;
+
+	if (++counters->consecutive_misses > cache->skip_threshold)
+		set_skip_search(chapter, true);
+}
+
+/*
+ * Mark a cache entry as unused and release every buffer it still holds. The arrays themselves
+ * are not freed here.
+ */
+static void release_cached_chapter_index(struct cached_chapter_index *chapter)
+{
+	struct dm_buffer **buffers;
+	unsigned int page;
+
+	chapter->virtual_chapter = NO_CHAPTER;
+
+	/* The buffer array is missing if the entry was never fully initialized. */
+	buffers = chapter->page_buffers;
+	if (buffers == NULL)
+		return;
+
+	for (page = 0; page < chapter->index_pages_count; page++) {
+		if (buffers[page] == NULL)
+			continue;
+
+		dm_bufio_release(vdo_forget(buffers[page]));
+	}
+}
+
+/*
+ * Free a sparse cache: its scratch space, its search lists, the buffers and arrays of every
+ * cache entry, and finally the cache itself. A NULL cache is ignored.
+ */
+void uds_free_sparse_cache(struct sparse_cache *cache)
+{
+	unsigned int zone;
+	unsigned int slot;
+
+	if (cache == NULL)
+		return;
+
+	vdo_free(cache->scratch_entries);
+
+	for (zone = 0; zone < cache->zone_count; zone++)
+		vdo_free(cache->search_lists[zone]);
+
+	for (slot = 0; slot < cache->capacity; slot++) {
+		struct cached_chapter_index *chapter = &cache->chapters[slot];
+
+		release_cached_chapter_index(chapter);
+		vdo_free(chapter->index_pages);
+		vdo_free(chapter->page_buffers);
+	}
+
+	vdo_free(cache);
+}
+
+/*
+ * Promote the entry at the given position to the head of the search list. The entries that were
+ * ahead of it each move back by one position.
+ */
+static inline void set_newest_entry(struct search_list *search_list, u8 index)
+{
+	struct cached_chapter_index **entries = search_list->entries;
+
+	if (index != 0) {
+		struct cached_chapter_index *promoted = entries[index];
+
+		memmove(entries + 1, entries, index * sizeof(*entries));
+		entries[0] = promoted;
+	}
+
+	/*
+	 * If the promoted entry came from the dead region of the list (to be reused), the dead
+	 * region is now one entry smaller.
+	 */
+	if (index >= search_list->first_dead_entry)
+		search_list->first_dead_entry++;
+}
+
+/*
+ * Check whether a virtual chapter is a member of the cache, using the search list of the given
+ * zone. A hit moves the chapter to the front of that list, and in zone zero it also resets the
+ * chapter's miss accounting.
+ */
+bool uds_sparse_cache_contains(struct sparse_cache *cache, u64 virtual_chapter,
+			       unsigned int zone_number)
+{
+	/*
+	 * The barriers are only correct if, between calls to uds_update_sparse_cache(), this
+	 * function gives every zone the same answer for a given chapter. That must hold even
+	 * for a chapter that has fallen off the end of the volume, or one whose searching has
+	 * been disabled because of too many search misses.
+	 */
+	struct search_list *search_list = cache->search_lists[zone_number];
+	u8 i;
+
+	for (i = 0; i < search_list->first_dead_entry; i++) {
+		struct cached_chapter_index *chapter = search_list->entries[i];
+
+		if (chapter->virtual_chapter != virtual_chapter)
+			continue;
+
+		if (zone_number == ZONE_ZERO)
+			score_search_hit(chapter);
+
+		set_newest_entry(search_list, i);
+		return true;
+	}
+
+	return false;
+}
+
+/*
+ * Partition the live portion of a search list into active, skippable, and dead entries, in that
+ * order, keeping the existing LRU order within each group. This must only be called during the
+ * critical section in uds_update_sparse_cache().
+ */
+static void purge_search_list(struct search_list *search_list,
+			      struct sparse_cache *cache, u64 oldest_virtual_chapter)
+{
+	struct cached_chapter_index **entries = search_list->entries;
+	/* The scratch space is split in two: skippable entries first, dead entries after. */
+	struct cached_chapter_index **skipped = cache->scratch_entries;
+	struct cached_chapter_index **dead = cache->scratch_entries + search_list->capacity;
+	unsigned int alive_count = 0;
+	unsigned int skipped_count = 0;
+	unsigned int dead_count = 0;
+	unsigned int i;
+
+	for (i = 0; i < search_list->first_dead_entry; i++) {
+		struct cached_chapter_index *chapter = entries[i];
+		bool is_dead = ((chapter->virtual_chapter == NO_CHAPTER) ||
+				(chapter->virtual_chapter < oldest_virtual_chapter));
+
+		if (is_dead) {
+			dead[dead_count++] = chapter;
+			continue;
+		}
+
+		if (chapter->skip_search) {
+			skipped[skipped_count++] = chapter;
+			continue;
+		}
+
+		/* Active entries are compacted in place at the front of the list. */
+		entries[alive_count++] = chapter;
+	}
+
+	memcpy(entries + alive_count, skipped, skipped_count * sizeof(*skipped));
+	memcpy(entries + alive_count + skipped_count, dead, dead_count * sizeof(*dead));
+	search_list->first_dead_entry = alive_count + skipped_count;
+}
+
+/*
+ * Replace the contents of a cache entry with the chapter index of the given virtual chapter,
+ * read from the volume. If the read fails, the entry is left marked as unused.
+ */
+static int __must_check cache_chapter_index(struct cached_chapter_index *chapter,
+					    u64 virtual_chapter,
+					    const struct volume *volume)
+{
+	int result;
+
+	/* Drop whatever the entry held before; this also marks it as NO_CHAPTER. */
+	release_cached_chapter_index(chapter);
+
+	result = uds_read_chapter_index_from_volume(volume, virtual_chapter,
+						    chapter->page_buffers,
+						    chapter->index_pages);
+	if (result == UDS_SUCCESS) {
+		chapter->counters.consecutive_misses = 0;
+		chapter->virtual_chapter = virtual_chapter;
+		chapter->skip_search = false;
+	}
+
+	return result;
+}
+
+/* Make the target search list match the source: the fixed fields, then the entry pointers. */
+static inline void copy_search_list(const struct search_list *source,
+				    struct search_list *target)
+{
+	size_t entry_bytes = source->capacity * sizeof(struct cached_chapter_index *);
+
+	/* A structure assignment does not copy the flexible array, so copy it separately. */
+	*target = *source;
+	memcpy(target->entries, source->entries, entry_bytes);
+}
+
+/*
+ * Do the actual cache update on behalf of all zones. This is called only by the zone zero thread,
+ * and only inside the critical section between the two update barriers.
+ */
+static int update_cache_as_captain(struct index_zone *zone, struct sparse_cache *cache,
+				   u64 virtual_chapter)
+{
+	const struct uds_index *index = zone->index;
+	struct search_list *list = cache->search_lists[ZONE_ZERO];
+	int result = UDS_SUCCESS;
+	unsigned int z;
+
+	purge_search_list(list, cache, zone->oldest_virtual_chapter);
+
+	/* Only load the chapter if it is not older than the oldest chapter of the index. */
+	if (virtual_chapter >= index->oldest_virtual_chapter) {
+		/* Reuse the last entry in the list by moving it to the front. */
+		set_newest_entry(list, list->capacity - 1);
+		result = cache_chapter_index(list->entries[0], virtual_chapter,
+					     index->volume);
+	}
+
+	/* Give every other zone a copy of the zone zero search list. */
+	for (z = 1; z < cache->zone_count; z++)
+		copy_search_list(list, cache->search_lists[z]);
+
+	return result;
+}
+
+/*
+ * Update the sparse cache to contain a chapter index. Every zone thread must call this with the
+ * same chapter number, so that they all enter the thread barriers which synchronize the cache
+ * update.
+ */
+int uds_update_sparse_cache(struct index_zone *zone, u64 virtual_chapter)
+{
+	struct sparse_cache *cache = zone->index->volume->sparse_cache;
+	int result = UDS_SUCCESS;
+
+	if (uds_sparse_cache_contains(cache, virtual_chapter, zone->id))
+		return UDS_SUCCESS;
+
+	/*
+	 * Do not touch the cache until every zone thread has reached its corresponding barrier
+	 * request and called this function.
+	 */
+	enter_threads_barrier(&cache->begin_update_barrier);
+
+	/*
+	 * The critical section starts here. The zone zero thread is captain and effectively
+	 * holds an exclusive lock on the sparse cache. Every other zone thread must do nothing
+	 * between the two barriers; it just waits at the end_update_barrier for the captain to
+	 * finish the update.
+	 */
+	if (zone->id == ZONE_ZERO)
+		result = update_cache_as_captain(zone, cache, virtual_chapter);
+
+	/*
+	 * The critical section ends here. All cache invariants must have been restored.
+	 */
+	enter_threads_barrier(&cache->end_update_barrier);
+	return result;
+}
+
+/* Mark every cache entry as unused and release the buffers the entries were holding. */
+void uds_invalidate_sparse_cache(struct sparse_cache *cache)
+{
+	unsigned int slot = 0;
+
+	while (slot < cache->capacity) {
+		release_cached_chapter_index(&cache->chapters[slot]);
+		slot++;
+	}
+}
+
+/*
+ * Decide whether a search should pass over a cached chapter. Unused entries and chapters older
+ * than oldest_chapter are always passed over. Otherwise, a search for one specific chapter
+ * passes over every other chapter, and a search of the whole cache obeys the skip_search flag.
+ */
+static inline bool should_skip_chapter(struct cached_chapter_index *chapter,
+				       u64 oldest_chapter, u64 requested_chapter)
+{
+	u64 cached_chapter = chapter->virtual_chapter;
+
+	if ((cached_chapter == NO_CHAPTER) || (cached_chapter < oldest_chapter))
+		return true;
+
+	if (requested_chapter == NO_CHAPTER)
+		return READ_ONCE(chapter->skip_search);
+
+	return requested_chapter != cached_chapter;
+}
+
+/*
+ * Search one cached chapter index for a name. The index page map selects which of the chapter's
+ * index pages to search, and the outcome of searching that page is reported through
+ * record_page_ptr.
+ */
+static int __must_check search_cached_chapter_index(struct cached_chapter_index *chapter,
+						    const struct index_geometry *geometry,
+						    const struct index_page_map *index_page_map,
+						    const struct uds_record_name *name,
+						    u16 *record_page_ptr)
+{
+	u32 physical_chapter;
+	u32 page_number;
+
+	physical_chapter = uds_map_to_physical_chapter(geometry, chapter->virtual_chapter);
+	page_number = uds_find_index_page_number(index_page_map, name, physical_chapter);
+
+	return uds_search_chapter_index_page(chapter->index_pages + page_number, geometry,
+					     name, record_page_ptr);
+}
+
+/*
+ * Search the cached chapter indexes for a name, in the order of the zone's search list. If
+ * *virtual_chapter_ptr is NO_CHAPTER on entry, every searchable chapter is tried; otherwise only
+ * that one chapter is. On a match, *virtual_chapter_ptr is set to the matching chapter and
+ * *record_page_ptr to the entry that was found. If the search completes without a match,
+ * *record_page_ptr is NO_CHAPTER_INDEX_ENTRY.
+ */
+int uds_search_sparse_cache(struct index_zone *zone, const struct uds_record_name *name,
+			    u64 *virtual_chapter_ptr, u16 *record_page_ptr)
+{
+	struct volume *volume = zone->index->volume;
+	struct sparse_cache *cache = volume->sparse_cache;
+	struct search_list *search_list = cache->search_lists[zone->id];
+	/* The whole cache is searched only when no specific chapter was requested. */
+	const bool search_all = (*virtual_chapter_ptr == NO_CHAPTER);
+	u8 i;
+
+	*record_page_ptr = NO_CHAPTER_INDEX_ENTRY;
+	for (i = 0; i < search_list->first_dead_entry; i++) {
+		struct cached_chapter_index *chapter = search_list->entries[i];
+		int result;
+
+		if (should_skip_chapter(chapter, zone->oldest_virtual_chapter,
+					*virtual_chapter_ptr))
+			continue;
+
+		result = search_cached_chapter_index(chapter, cache->geometry,
+						     volume->index_page_map, name,
+						     record_page_ptr);
+		if (result != UDS_SUCCESS)
+			return result;
+
+		if (*record_page_ptr == NO_CHAPTER_INDEX_ENTRY) {
+			/* Only zone zero keeps the miss statistics. */
+			if (zone->id == ZONE_ZERO)
+				score_search_miss(cache, chapter);
+
+			if (!search_all)
+				break;
+
+			continue;
+		}
+
+		/*
+		 * This could, in theory, be a false match while the true match sits in another
+		 * chapter. That is a very rare case and not worth the extra search complexity.
+		 */
+		set_newest_entry(search_list, i);
+		if (zone->id == ZONE_ZERO)
+			score_search_hit(chapter);
+
+		*virtual_chapter_ptr = chapter->virtual_chapter;
+		return UDS_SUCCESS;
+	}
+
+	return UDS_SUCCESS;
+}
diff --git a/drivers/md/dm-vdo/indexer/sparse-cache.h b/drivers/md/dm-vdo/indexer/sparse-cache.h
new file mode 100644
index 000000000000..45e2dcf165b5
--- /dev/null
+++ b/drivers/md/dm-vdo/indexer/sparse-cache.h
@@ -0,0 +1,46 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#ifndef UDS_SPARSE_CACHE_H
+#define UDS_SPARSE_CACHE_H
+
+#include "geometry.h"
+#include "indexer.h"
+
+/*
+ * The sparse cache is a cache of entire chapter indexes from sparse chapters used for searching
+ * for names after all other search paths have failed. It contains only complete chapter indexes;
+ * record pages from sparse chapters and single index pages used for resolving hooks are kept in
+ * the regular page cache in the volume.
+ *
+ * The most important property of this cache is the absence of synchronization for read operations.
+ * Safe concurrent access to the cache by the zone threads is controlled by the triage queue and
+ * the barrier requests it issues to the zone queues. The set of cached chapters does not and must
+ * not change between the carefully coordinated calls to uds_update_sparse_cache() from the zone
+ * threads. Outside of updates, every zone will get the same result when calling
+ * uds_sparse_cache_contains() as every other zone.
+ */
+
+struct index_zone;
+struct sparse_cache;
+
+/*
+ * Allocate a sparse cache with room for capacity chapter indexes and one search list for each of
+ * zone_count zones, returning it through *cache_ptr.
+ */
+int __must_check uds_make_sparse_cache(const struct index_geometry *geometry,
+ unsigned int capacity, unsigned int zone_count,
+ struct sparse_cache **cache_ptr);
+
+/* Free a sparse cache and everything it holds. A NULL cache is ignored. */
+void uds_free_sparse_cache(struct sparse_cache *cache);
+
+/*
+ * Check whether a virtual chapter is a member of the cache, using the search list of the given
+ * zone. A hit also moves the chapter to the front of that zone's search list.
+ */
+bool uds_sparse_cache_contains(struct sparse_cache *cache, u64 virtual_chapter,
+ unsigned int zone_number);
+
+/*
+ * Make sure a virtual chapter is in the cache. When the chapter is not already cached, this call
+ * waits at thread barriers, so every zone thread must make the call with the same chapter number.
+ */
+int __must_check uds_update_sparse_cache(struct index_zone *zone, u64 virtual_chapter);
+
+/* Mark every cache entry as unused and release the buffers the entries were holding. */
+void uds_invalidate_sparse_cache(struct sparse_cache *cache);
+
+/*
+ * Search the cached chapter indexes for a name. If *virtual_chapter_ptr is NO_CHAPTER on entry,
+ * every searchable chapter is tried; otherwise only that chapter is. On a match,
+ * *virtual_chapter_ptr and *record_page_ptr describe it. If the search completes without a
+ * match, *record_page_ptr is NO_CHAPTER_INDEX_ENTRY.
+ */
+int __must_check uds_search_sparse_cache(struct index_zone *zone,
+ const struct uds_record_name *name,
+ u64 *virtual_chapter_ptr, u16 *record_page_ptr);
+
+#endif /* UDS_SPARSE_CACHE_H */
diff --git a/drivers/md/dm-vdo/indexer/volume-index.c b/drivers/md/dm-vdo/indexer/volume-index.c
new file mode 100644
index 000000000000..12f954a0c532
--- /dev/null
+++ b/drivers/md/dm-vdo/indexer/volume-index.c
@@ -0,0 +1,1283 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2023 Red Hat
+ */
+#include "volume-index.h"
+
+#include <linux/bitops.h>
+#include <linux/bits.h>
+#include <linux/cache.h>
+#include <linux/compiler.h>
+#include <linux/log2.h>
+
+#include "errors.h"
+#include "logger.h"
+#include "memory-alloc.h"
+#include "numeric.h"
+#include "permassert.h"
+#include "thread-utils.h"
+
+#include "config.h"
+#include "geometry.h"
+#include "hash-utils.h"
+#include "indexer.h"
+
+/*
+ * The volume index is a combination of two separate subindexes, one containing sparse hook entries
+ * (retained for all chapters), and one containing the remaining entries (retained only for the
+ * dense chapters). If there are no sparse chapters, only the non-hook sub index is used, and it
+ * will contain all records for all chapters.
+ *
+ * The volume index is also divided into zones, with one thread operating on each zone. Each
+ * incoming request is dispatched to the appropriate thread, and then to the appropriate subindex.
+ * Each delta list is handled by a single zone. To ensure that the distribution of delta lists to
+ * zones doesn't underflow (leaving some zone with no delta lists), the minimum number of delta
+ * lists must be the square of the maximum zone count for both subindexes.
+ *
+ * Each subindex zone is a delta index where the payload is a chapter number. The volume index can
+ * compute the delta list number, address, and zone number from the record name in order to
+ * dispatch record handling to the correct structures.
+ *
+ * Most operations that use all the zones take place either before request processing is allowed,
+ * or after all requests have been flushed in order to shut down. The only multi-threaded operation
+ * supported during normal operation is the uds_lookup_volume_index_name() method, used to determine
+ * whether a new chapter should be loaded into the sparse index cache. This operation only uses the
+ * sparse hook subindex, and the zone mutexes are used to make this operation safe.
+ *
+ * There are three ways of expressing chapter numbers in the volume index: virtual, index, and
+ * rolling. The interface to the volume index uses virtual chapter numbers, which are 64 bits long.
+ * Internally the subindex stores only the minimal number of bits necessary by masking away the
+ * high-order bits. When the index needs to deal with ordering of index chapter numbers, as when
+ * flushing entries from older chapters, it rolls the index chapter number around so that the
+ * smallest one in use is mapped to 0. See convert_index_to_virtual() or flush_invalid_entries()
+ * for an example of this technique.
+ *
+ * For efficiency, when older chapter numbers become invalid, the index does not immediately remove
+ * the invalidated entries. Instead it lazily removes them from a given delta list the next time it
+ * walks that list during normal operation. Because of this, the index size must be increased
+ * somewhat to accommodate all the invalid entries that have not yet been removed. For the standard
+ * index sizes, this requires about 4 chapters of old entries per 1024 chapters of valid entries in
+ * the index.
+ */
+
+/*
+ * The sizing parameters of one subindex, as computed from a configuration by
+ * compute_volume_sub_index_parameters().
+ */
+struct sub_index_parameters {
+ /* The number of bits in address mask */
+ u8 address_bits;
+ /* The number of bits in chapter number */
+ u8 chapter_bits;
+ /* The mean delta */
+ u32 mean_delta;
+ /* The number of delta lists */
+ u64 list_count;
+ /* The number of chapters used */
+ u32 chapter_count;
+ /* The number of bits per chapter */
+ size_t chapter_size_in_bits;
+ /* The number of bytes of delta list memory */
+ size_t memory_size;
+ /* The number of bytes the index should keep free at all times */
+ size_t target_free_bytes;
+};
+
+/*
+ * A configuration and geometry for each of the two subindexes. NOTE(review): presumably produced
+ * by splitting a single index configuration; the code that fills this in is not visible here.
+ */
+struct split_config {
+ /* The hook subindex configuration */
+ struct uds_configuration hook_config;
+ struct index_geometry hook_geometry;
+
+ /* The non-hook subindex configuration */
+ struct uds_configuration non_hook_config;
+ struct index_geometry non_hook_geometry;
+};
+
+/*
+ * A range of chapters. NOTE(review): presumably described by its first chapter and its length;
+ * the users of this structure are not visible here, so confirm the exact semantics.
+ */
+struct chapter_range {
+ u32 chapter_start;
+ u32 chapter_count;
+};
+
+/* The size of the magic[] fields below; both magic strings are exactly this many characters. */
+#define MAGIC_SIZE 8
+
+/* The magic string for struct sub_index_data. */
+static const char MAGIC_START_5[] = "MI5-0005";
+
+/*
+ * NOTE(review): presumably the header written with the saved state of one subindex zone; the
+ * code that reads and writes it is not visible here.
+ */
+struct sub_index_data {
+ char magic[MAGIC_SIZE]; /* MAGIC_START_5 */
+ u64 volume_nonce;
+ u64 virtual_chapter_low;
+ u64 virtual_chapter_high;
+ u32 first_list;
+ u32 list_count;
+};
+
+/* The magic string for struct volume_index_data. */
+static const char MAGIC_START_6[] = "MI6-0001";
+
+/*
+ * NOTE(review): presumably the header written with the saved state of the volume index as a
+ * whole; the code that reads and writes it is not visible here.
+ */
+struct volume_index_data {
+ char magic[MAGIC_SIZE]; /* MAGIC_START_6 */
+ u32 sparse_sample_rate;
+};
+
+/* Get the delta list address of a name: its volume index bytes limited by the address mask. */
+static inline u32 extract_address(const struct volume_sub_index *sub_index,
+				  const struct uds_record_name *name)
+{
+	u64 name_bits = uds_extract_volume_index_bytes(name);
+
+	return name_bits & sub_index->address_mask;
+}
+
+/*
+ * Get the delta list number of a name: the volume index bytes above the address bits, reduced
+ * modulo the number of delta lists.
+ */
+static inline u32 extract_dlist_num(const struct volume_sub_index *sub_index,
+				    const struct uds_record_name *name)
+{
+	u64 name_bits = uds_extract_volume_index_bytes(name);
+	u64 list_bits = name_bits >> sub_index->address_bits;
+
+	return list_bits % sub_index->list_count;
+}
+
+/* Find the subindex zone a record belongs to, using the record's own zone number. */
+static inline const struct volume_sub_index_zone *
+get_zone_for_record(const struct volume_index_record *record)
+{
+	const struct volume_sub_index *sub_index = record->sub_index;
+
+	return &sub_index->zones[record->zone_number];
+}
+
+/*
+ * Convert an index chapter number to a virtual chapter number. The index chapter is rolled so
+ * that the lowest virtual chapter of the record's zone maps to zero, and that offset is then
+ * added back onto the lowest virtual chapter.
+ */
+static inline u64 convert_index_to_virtual(const struct volume_index_record *record,
+					   u32 index_chapter)
+{
+	const struct volume_sub_index_zone *zone = get_zone_for_record(record);
+	u32 rolling_chapter;
+
+	rolling_chapter = index_chapter - zone->virtual_chapter_low;
+	rolling_chapter &= record->sub_index->chapter_mask;
+
+	return zone->virtual_chapter_low + rolling_chapter;
+}
+
+/* Convert a virtual chapter number to an index chapter number by applying the chapter mask. */
+static inline u32 convert_virtual_to_index(const struct volume_sub_index *sub_index,
+					   u64 virtual_chapter)
+{
+	u32 index_chapter = virtual_chapter & sub_index->chapter_mask;
+
+	return index_chapter;
+}
+
+/*
+ * Check whether a virtual chapter lies within the inclusive range of chapters covered by the
+ * record's zone.
+ */
+static inline bool is_virtual_chapter_indexed(const struct volume_index_record *record,
+					      u64 virtual_chapter)
+{
+	const struct volume_sub_index_zone *zone = get_zone_for_record(record);
+
+	if (virtual_chapter < zone->virtual_chapter_low)
+		return false;
+
+	return virtual_chapter <= zone->virtual_chapter_high;
+}
+
+/* A volume index has a sparse portion when its sparse sample rate is greater than zero. */
+static inline bool has_sparse(const struct volume_index *volume_index)
+{
+	bool sparse = (volume_index->sparse_sample_rate > 0);
+
+	return sparse;
+}
+
+/*
+ * Check whether a name is a sample: the index must have a sparse portion, and the name's
+ * sampling bytes must be an exact multiple of the sparse sample rate.
+ */
+bool uds_is_volume_index_sample(const struct volume_index *volume_index,
+				const struct uds_record_name *name)
+{
+	/* The short-circuit keeps the modulus from being evaluated with a zero rate. */
+	return (has_sparse(volume_index) &&
+		((uds_extract_sampling_bytes(name) % volume_index->sparse_sample_rate) == 0));
+}
+
+/* Choose the subindex for a name: the hook subindex for samples, the non-hook one otherwise. */
+static inline const struct volume_sub_index *
+get_volume_sub_index(const struct volume_index *volume_index,
+		     const struct uds_record_name *name)
+{
+	if (uds_is_volume_index_sample(volume_index, name))
+		return &volume_index->vi_hook;
+
+	return &volume_index->vi_non_hook;
+}
+
+/* Get the zone of a name within a subindex: its delta list number over the lists per zone. */
+static unsigned int get_volume_sub_index_zone(const struct volume_sub_index *sub_index,
+					      const struct uds_record_name *name)
+{
+	u32 list_number = extract_dlist_num(sub_index, name);
+
+	return list_number / sub_index->delta_index.lists_per_zone;
+}
+
+/* Get the zone that handles a name, after selecting the subindex the name belongs to. */
+unsigned int uds_get_volume_index_zone(const struct volume_index *volume_index,
+				       const struct uds_record_name *name)
+{
+	const struct volume_sub_index *sub_index = get_volume_sub_index(volume_index, name);
+
+	return get_volume_sub_index_zone(sub_index, name);
+}
+
+/*
+ * The number of records intended for each delta list. The record count is divided by this to get
+ * the number of delta lists, and the mean delta is multiplied by it to get the address count.
+ */
+#define DELTA_LIST_SIZE 256
+
+/*
+ * Derive the sizing parameters of one sub-index from its configuration: the delta list count,
+ * the address and chapter field widths, the mean delta, the per-chapter size in bits, the memory
+ * budget and the target amount of free space.
+ *
+ * Returns UDS_SUCCESS on success. If the delta list count does not fit in a u32, or more than 31
+ * address bits would be needed, a warning is logged and the result of
+ * vdo_log_warning_strerror(UDS_INVALID_ARGUMENT, ...) is returned.
+ */
+static int compute_volume_sub_index_parameters(const struct uds_configuration *config,
+ struct sub_index_parameters *params)
+{
+ u64 entries_in_volume_index, address_span;
+ u32 chapters_in_volume_index, invalid_chapters;
+ u32 rounded_chapters;
+ u64 delta_list_records;
+ u32 address_count;
+ u64 index_size_in_bits;
+ size_t expected_index_size;
+ /* Lower bound on the number of delta lists, regardless of how small the index is. */
+ u64 min_delta_lists = MAX_ZONES * MAX_ZONES;
+ struct index_geometry *geometry = config->geometry;
+ u64 records_per_chapter = geometry->records_per_chapter;
+
+ params->chapter_count = geometry->chapters_per_volume;
+ /*
+ * Make sure that the number of delta list records in the volume index does not change when
+ * the volume is reduced by one chapter. This preserves the mapping from name to volume
+ * index delta list.
+ */
+ rounded_chapters = params->chapter_count;
+ if (uds_is_reduced_index_geometry(geometry))
+ rounded_chapters += 1;
+ delta_list_records = records_per_chapter * rounded_chapters;
+ address_count = config->volume_index_mean_delta * DELTA_LIST_SIZE;
+ params->list_count = max(delta_list_records / DELTA_LIST_SIZE, min_delta_lists);
+ params->address_bits = bits_per(address_count - 1);
+ params->chapter_bits = bits_per(rounded_chapters - 1);
+ /* The list count is computed in 64 bits; reject it if it would be truncated as a u32. */
+ if ((u32) params->list_count != params->list_count) {
+ return vdo_log_warning_strerror(UDS_INVALID_ARGUMENT,
+ "cannot initialize volume index with %llu delta lists",
+ (unsigned long long) params->list_count);
+ }
+
+ if (params->address_bits > 31) {
+ return vdo_log_warning_strerror(UDS_INVALID_ARGUMENT,
+ "cannot initialize volume index with %u address bits",
+ params->address_bits);
+ }
+
+ /*
+ * The probability that a given delta list is not touched during the writing of an entire
+ * chapter is:
+ *
+ * double p_not_touched = pow((double) (params->list_count - 1) / params->list_count,
+ * records_per_chapter);
+ *
+ * For the standard index sizes, about 78% of the delta lists are not touched, and
+ * therefore contain old index entries that have not been eliminated by the lazy LRU
+ * processing. Then the number of old index entries that accumulate over the entire index,
+ * in terms of full chapters worth of entries, is:
+ *
+ * double invalid_chapters = p_not_touched / (1.0 - p_not_touched);
+ *
+ * For the standard index sizes, the index needs about 3.5 chapters of space for the old
+ * entries in a 1024 chapter index, so round this up to use 4 chapters per 1024 chapters in
+ * the index.
+ */
+ /* One extra chapter per 256 chapters (4 per 1024), but never fewer than 2. */
+ invalid_chapters = max(rounded_chapters / 256, 2U);
+ chapters_in_volume_index = rounded_chapters + invalid_chapters;
+ entries_in_volume_index = records_per_chapter * chapters_in_volume_index;
+
+ /* The mean delta is the total address space divided evenly among the expected entries. */
+ address_span = params->list_count << params->address_bits;
+ params->mean_delta = address_span / entries_in_volume_index;
+
+ /*
+ * Compute the expected size of a full index, then set the total memory to be 6% larger
+ * than that expected size. This number should be large enough that there are not many
+ * rebalances when the index is full.
+ */
+ params->chapter_size_in_bits = uds_compute_delta_index_size(records_per_chapter,
+ params->mean_delta,
+ params->chapter_bits);
+ index_size_in_bits = params->chapter_size_in_bits * chapters_in_volume_index;
+ expected_index_size = index_size_in_bits / BITS_PER_BYTE;
+ params->memory_size = expected_index_size * 106 / 100;
+
+ /* Aim to keep one twentieth (5%) of the expected index size free. */
+ params->target_free_bytes = expected_index_size / 20;
+ return UDS_SUCCESS;
+}
+
+/*
+ * Release everything owned by a sub-index: its flush chapter array, its zone array and its
+ * delta index. The sub-index structure itself is embedded in the volume index and is not freed.
+ */
+static void uninitialize_volume_sub_index(struct volume_sub_index *sub_index)
+{
+ vdo_free(vdo_forget(sub_index->flush_chapters));
+ vdo_free(vdo_forget(sub_index->zones));
+ uds_uninitialize_delta_index(&sub_index->delta_index);
+}
+
+/*
+ * Free a volume index along with its zone array and both sub-indexes. Passing NULL is allowed
+ * and does nothing.
+ */
+void uds_free_volume_index(struct volume_index *volume_index)
+{
+	if (volume_index != NULL) {
+		if (volume_index->zones != NULL)
+			vdo_free(vdo_forget(volume_index->zones));
+
+		uninitialize_volume_sub_index(&volume_index->vi_non_hook);
+		uninitialize_volume_sub_index(&volume_index->vi_hook);
+		vdo_free(volume_index);
+	}
+}
+
+
+/*
+ * Compute the number of bytes needed to save one sub-index: its header, one u64 flush chapter
+ * per delta list, and the saved delta index itself.
+ */
+static int compute_volume_sub_index_save_bytes(const struct uds_configuration *config,
+					       size_t *bytes)
+{
+	struct sub_index_parameters params = { .address_bits = 0 };
+	int result = compute_volume_sub_index_parameters(config, &params);
+
+	if (result != UDS_SUCCESS)
+		return result;
+
+	*bytes = (sizeof(struct sub_index_data) +
+		  params.list_count * sizeof(u64) +
+		  uds_compute_delta_index_save_bytes(params.list_count, params.memory_size));
+	return UDS_SUCCESS;
+}
+
+/*
+ * Split a configuration into one for the hook (sampled) sub-index and one for the non-hook
+ * sub-index. Each half gets its own copy of the geometry so that it can be adjusted
+ * independently.
+ *
+ * This function is only useful if the configuration includes sparse chapters.
+ */
+static void split_configuration(const struct uds_configuration *config,
+				struct split_config *split)
+{
+	const struct index_geometry *base = config->geometry;
+	u64 rate = config->sparse_sample_rate;
+	u64 sparse_count = base->sparse_chapters_per_volume;
+	u64 dense_count = base->chapters_per_volume - sparse_count;
+	u64 hook_records = base->records_per_chapter / rate;
+
+	/* Both halves begin as exact copies of the base configuration and geometry. */
+	split->hook_geometry = *base;
+	split->hook_config = *config;
+	split->hook_config.geometry = &split->hook_geometry;
+
+	split->non_hook_geometry = *base;
+	split->non_hook_config = *config;
+	split->non_hook_config.geometry = &split->non_hook_geometry;
+
+	/* The hook half indexes only the sampled records, but keeps every chapter. */
+	split->hook_geometry.records_per_chapter = hook_records;
+	split->hook_geometry.sparse_chapters_per_volume = 0;
+
+	/* The non-hook half indexes the remaining records, and only the dense chapters. */
+	split->non_hook_geometry.records_per_chapter -= hook_records;
+	split->non_hook_geometry.sparse_chapters_per_volume = 0;
+	split->non_hook_geometry.chapters_per_volume = dense_count;
+}
+
+/*
+ * Compute the number of bytes needed to save the whole volume index. A dense index is a single
+ * sub-index; a sparse index is a volume index header followed by both sub-indexes.
+ */
+static int compute_volume_index_save_bytes(const struct uds_configuration *config,
+					   size_t *bytes)
+{
+	struct split_config split;
+	size_t hook_size, non_hook_size;
+	int result;
+
+	if (!uds_is_sparse_index_geometry(config->geometry))
+		return compute_volume_sub_index_save_bytes(config, bytes);
+
+	split_configuration(config, &split);
+
+	result = compute_volume_sub_index_save_bytes(&split.hook_config, &hook_size);
+	if (result != UDS_SUCCESS)
+		return result;
+
+	result = compute_volume_sub_index_save_bytes(&split.non_hook_config, &non_hook_size);
+	if (result != UDS_SUCCESS)
+		return result;
+
+	*bytes = sizeof(struct volume_index_data) + hook_size + non_hook_size;
+	return UDS_SUCCESS;
+}
+
+/*
+ * Compute the number of blocks of the given size needed to save the volume index. The byte
+ * count includes one struct delta_list_save_info, and MAX_ZONES extra blocks are added to the
+ * rounded-up block count.
+ */
+int uds_compute_volume_index_save_blocks(const struct uds_configuration *config,
+					 size_t block_size, u64 *block_count)
+{
+	size_t save_bytes;
+	int result = compute_volume_index_save_bytes(config, &save_bytes);
+
+	if (result != UDS_SUCCESS)
+		return result;
+
+	save_bytes += sizeof(struct delta_list_save_info);
+	*block_count = DIV_ROUND_UP(save_bytes, block_size) + MAX_ZONES;
+	return UDS_SUCCESS;
+}
+
+/*
+ * Flush invalid entries while walking the delta list.
+ *
+ * Advance the record's delta entry by one, then remove every consecutive entry whose chapter
+ * lies inside flush_range. The walk stops at the end of the list or at the first entry outside
+ * the range. For an entry that is kept, *next_chapter_to_invalidate is lowered to that entry's
+ * chapter (relative to flush_range->chapter_start) if it is smaller than the current value.
+ */
+static inline int flush_invalid_entries(struct volume_index_record *record,
+ struct chapter_range *flush_range,
+ u32 *next_chapter_to_invalidate)
+{
+ int result;
+
+ result = uds_next_delta_index_entry(&record->delta_entry);
+ if (result != UDS_SUCCESS)
+ return result;
+
+ while (!record->delta_entry.at_end) {
+ u32 index_chapter = uds_get_delta_entry_value(&record->delta_entry);
+ /* Distance of this entry's chapter from the start of the flush range, modulo the mask. */
+ u32 relative_chapter = ((index_chapter - flush_range->chapter_start) &
+ record->sub_index->chapter_mask);
+
+ if (likely(relative_chapter >= flush_range->chapter_count)) {
+ /* Outside the flush range: keep the entry and track the smallest distance seen. */
+ if (relative_chapter < *next_chapter_to_invalidate)
+ *next_chapter_to_invalidate = relative_chapter;
+ break;
+ }
+
+ /* NOTE(review): the loop relies on removal leaving the cursor on the following entry. */
+ result = uds_remove_delta_index_entry(&record->delta_entry);
+ if (result != UDS_SUCCESS)
+ return result;
+ }
+
+ return UDS_SUCCESS;
+}
+
+/*
+ * Find the matching record, or the list offset where the record would go.
+ *
+ * Unlike a plain lookup, this walks the entire delta list and removes every entry whose chapter
+ * falls within flush_range. On success, flush_range->chapter_start is replaced with the index
+ * chapter derived from the smallest relative chapter among the entries that were kept (or from
+ * the chapter mask if none were seen), and flush_range->chapter_count is set to 0.
+ */
+static int get_volume_index_entry(struct volume_index_record *record, u32 list_number,
+ u32 key, struct chapter_range *flush_range)
+{
+ struct volume_index_record other_record;
+ const struct volume_sub_index *sub_index = record->sub_index;
+ /* Start at the largest possible relative chapter; flush_invalid_entries() lowers it. */
+ u32 next_chapter_to_invalidate = sub_index->chapter_mask;
+ int result;
+
+ result = uds_start_delta_index_search(&sub_index->delta_index, list_number, 0,
+ &record->delta_entry);
+ if (result != UDS_SUCCESS)
+ return result;
+
+ /* Step forward, flushing as we go, until the list ends or an entry with key >= key is found. */
+ do {
+ result = flush_invalid_entries(record, flush_range,
+ &next_chapter_to_invalidate);
+ if (result != UDS_SUCCESS)
+ return result;
+ } while (!record->delta_entry.at_end && (key > record->delta_entry.key));
+
+ result = uds_remember_delta_index_offset(&record->delta_entry);
+ if (result != UDS_SUCCESS)
+ return result;
+
+ /* Check any collision records for a more precise match. */
+ other_record = *record;
+ if (!other_record.delta_entry.at_end && (key == other_record.delta_entry.key)) {
+ for (;;) {
+ u8 collision_name[UDS_RECORD_NAME_SIZE];
+
+ result = flush_invalid_entries(&other_record, flush_range,
+ &next_chapter_to_invalidate);
+ if (result != UDS_SUCCESS)
+ return result;
+
+ if (other_record.delta_entry.at_end ||
+ !other_record.delta_entry.is_collision)
+ break;
+
+ result = uds_get_delta_entry_collision(&other_record.delta_entry,
+ collision_name);
+ if (result != UDS_SUCCESS)
+ return result;
+
+ /* A collision entry carrying the full name is an exact match; report it instead. */
+ if (memcmp(collision_name, record->name, UDS_RECORD_NAME_SIZE) == 0) {
+ *record = other_record;
+ break;
+ }
+ }
+ }
+ /* Keep walking to the end of the list so that the rest of it is flushed as well. */
+ while (!other_record.delta_entry.at_end) {
+ result = flush_invalid_entries(&other_record, flush_range,
+ &next_chapter_to_invalidate);
+ if (result != UDS_SUCCESS)
+ return result;
+ }
+ /* Convert the smallest relative chapter back into an index chapter for the caller. */
+ next_chapter_to_invalidate += flush_range->chapter_start;
+ next_chapter_to_invalidate &= sub_index->chapter_mask;
+ flush_range->chapter_start = next_chapter_to_invalidate;
+ flush_range->chapter_count = 0;
+ return UDS_SUCCESS;
+}
+
+/*
+ * Look up a name in one sub-index and fill in the record describing the result.
+ *
+ * If the name's delta list has a recorded flush chapter below the zone's lowest valid chapter,
+ * the lookup also flushes the out-of-date entries from that list and records a new flush
+ * chapter; this is why the lookup is not a read-only operation. Otherwise a plain delta index
+ * search is done. On success, record->is_found tells whether an entry with the name's address
+ * exists, and record->virtual_chapter is set when it does.
+ */
+static int get_volume_sub_index_record(struct volume_sub_index *sub_index,
+ const struct uds_record_name *name,
+ struct volume_index_record *record)
+{
+ int result;
+ const struct volume_sub_index_zone *volume_index_zone;
+ u32 address = extract_address(sub_index, name);
+ u32 delta_list_number = extract_dlist_num(sub_index, name);
+ u64 flush_chapter = sub_index->flush_chapters[delta_list_number];
+
+ record->sub_index = sub_index;
+ record->mutex = NULL;
+ record->name = name;
+ record->zone_number = delta_list_number / sub_index->delta_index.lists_per_zone;
+ volume_index_zone = get_zone_for_record(record);
+
+ if (flush_chapter < volume_index_zone->virtual_chapter_low) {
+ struct chapter_range range;
+ u64 flush_count = volume_index_zone->virtual_chapter_low - flush_chapter;
+
+ range.chapter_start = convert_virtual_to_index(sub_index, flush_chapter);
+ /* Index chapters wrap at chapter_mask + 1, so the range never needs to be larger. */
+ range.chapter_count = (flush_count > sub_index->chapter_mask ?
+ sub_index->chapter_mask + 1 :
+ flush_count);
+ result = get_volume_index_entry(record, delta_list_number, address,
+ &range);
+ /*
+ * NOTE(review): the flush chapter below is updated before result is checked, so it is
+ * also rewritten when get_volume_index_entry() fails - confirm this is intended.
+ */
+ flush_chapter = convert_index_to_virtual(record, range.chapter_start);
+ if (flush_chapter > volume_index_zone->virtual_chapter_high)
+ flush_chapter = volume_index_zone->virtual_chapter_high;
+ sub_index->flush_chapters[delta_list_number] = flush_chapter;
+ } else {
+ result = uds_get_delta_index_entry(&sub_index->delta_index,
+ delta_list_number, address,
+ name->name, &record->delta_entry);
+ }
+
+ if (result != UDS_SUCCESS)
+ return result;
+
+ record->is_found =
+ (!record->delta_entry.at_end && (record->delta_entry.key == address));
+ if (record->is_found) {
+ u32 index_chapter = uds_get_delta_entry_value(&record->delta_entry);
+
+ record->virtual_chapter = convert_index_to_virtual(record, index_chapter);
+ }
+
+ record->is_collision = record->delta_entry.is_collision;
+ return UDS_SUCCESS;
+}
+
+/*
+ * Look up a name in the volume index and fill in the record describing the result. Sampled
+ * names are looked up in the hook sub-index under that zone's hook mutex, and the mutex is
+ * stored in the record for later operations; all other names use the non-hook sub-index with
+ * no locking.
+ */
+int uds_get_volume_index_record(struct volume_index *volume_index,
+				const struct uds_record_name *name,
+				struct volume_index_record *record)
+{
+	unsigned int zone_number;
+	struct mutex *hook_mutex;
+	int result;
+
+	if (!uds_is_volume_index_sample(volume_index, name)) {
+		return get_volume_sub_index_record(&volume_index->vi_non_hook, name,
+						   record);
+	}
+
+	/*
+	 * Other threads cannot be allowed to call uds_lookup_volume_index_name() while
+	 * this thread is finding the volume index record. Due to the lazy LRU flushing of
+	 * the volume index, uds_get_volume_index_record() is not a read-only operation.
+	 */
+	zone_number = get_volume_sub_index_zone(&volume_index->vi_hook, name);
+	hook_mutex = &volume_index->zones[zone_number].hook_mutex;
+
+	mutex_lock(hook_mutex);
+	result = get_volume_sub_index_record(&volume_index->vi_hook, name, record);
+	mutex_unlock(hook_mutex);
+
+	/* Remember the mutex so that other operations on the index record can use it. */
+	record->mutex = hook_mutex;
+	return result;
+}
+
+/*
+ * Add an entry for the record's name to the volume index, mapping it to the given virtual
+ * chapter. The chapter must be within the range currently indexed by the record's zone. If the
+ * record carries a mutex, the delta index update is done while holding it.
+ *
+ * On success the record is updated to describe the new entry. On UDS_OVERFLOW a rate-limited
+ * warning is logged and the delta index entry is dumped to the log. The result of the delta
+ * index update is returned in every case.
+ */
+int uds_put_volume_index_record(struct volume_index_record *record, u64 virtual_chapter)
+{
+	const struct volume_sub_index *sub_index = record->sub_index;
+	u32 key;
+	int result;
+
+	if (!is_virtual_chapter_indexed(record, virtual_chapter)) {
+		u64 lowest = get_zone_for_record(record)->virtual_chapter_low;
+		u64 highest = get_zone_for_record(record)->virtual_chapter_high;
+
+		return vdo_log_warning_strerror(UDS_INVALID_ARGUMENT,
+						"cannot put record into chapter number %llu that is out of the valid range %llu to %llu",
+						(unsigned long long) virtual_chapter,
+						(unsigned long long) lowest,
+						(unsigned long long) highest);
+	}
+
+	key = extract_address(sub_index, record->name);
+
+	if (unlikely(record->mutex != NULL))
+		mutex_lock(record->mutex);
+	/* The full name is only supplied when an entry with this address already exists. */
+	result = uds_put_delta_index_entry(&record->delta_entry, key,
+					   convert_virtual_to_index(sub_index,
+								    virtual_chapter),
+					   record->is_found ? record->name->name : NULL);
+	if (unlikely(record->mutex != NULL))
+		mutex_unlock(record->mutex);
+
+	if (result == UDS_SUCCESS) {
+		record->virtual_chapter = virtual_chapter;
+		record->is_collision = record->delta_entry.is_collision;
+		record->is_found = true;
+	} else if (result == UDS_OVERFLOW) {
+		vdo_log_ratelimit(vdo_log_warning_strerror, UDS_OVERFLOW,
+				  "Volume index entry dropped due to overflow condition");
+		uds_log_delta_index_entry(&record->delta_entry);
+	}
+
+	return result;
+}
+
+/*
+ * Remove the entry a record refers to from the volume index. The record must describe an
+ * existing entry (is_found set); it is marked as not found before the removal so that it cannot
+ * be reused. If the record carries a mutex, the removal is done while holding it.
+ */
+int uds_remove_volume_index_record(struct volume_index_record *record)
+{
+	int result;
+
+	if (!record->is_found) {
+		return vdo_log_warning_strerror(UDS_BAD_STATE,
+						"illegal operation on new record");
+	}
+
+	/* Mark the record so that it cannot be used again */
+	record->is_found = false;
+
+	if (unlikely(record->mutex != NULL))
+		mutex_lock(record->mutex);
+
+	result = uds_remove_delta_index_entry(&record->delta_entry);
+
+	if (unlikely(record->mutex != NULL))
+		mutex_unlock(record->mutex);
+
+	return result;
+}
+
+/*
+ * Set the open chapter for one zone of a sub-index. The zone's valid range becomes the
+ * chapter_count chapters ending at virtual_chapter (starting from 0 while fewer chapters than
+ * that exist). If the zone's delta lists then use more bits than max_zone_bits, the low end of
+ * the range is advanced to expire chapters early, and the number of chapters expired this way
+ * is added to the zone's early_flushes counter.
+ */
+static void set_volume_sub_index_zone_open_chapter(struct volume_sub_index *sub_index,
+ unsigned int zone_number,
+ u64 virtual_chapter)
+{
+ u64 used_bits = 0;
+ struct volume_sub_index_zone *zone = &sub_index->zones[zone_number];
+ struct delta_zone *delta_zone;
+ u32 i;
+
+ zone->virtual_chapter_low = (virtual_chapter >= sub_index->chapter_count ?
+ virtual_chapter - sub_index->chapter_count + 1 :
+ 0);
+ zone->virtual_chapter_high = virtual_chapter;
+
+ /* Check to see if the new zone data is too large. */
+ delta_zone = &sub_index->delta_index.delta_zones[zone_number];
+ /* NOTE(review): lists are summed from index 1 through list_count; presumably index 0 is a guard list - confirm. */
+ for (i = 1; i <= delta_zone->list_count; i++)
+ used_bits += delta_zone->delta_lists[i].size;
+
+ if (used_bits > sub_index->max_zone_bits) {
+ /* Expire enough chapters to free the desired space. */
+ u64 expire_count =
+ 1 + (used_bits - sub_index->max_zone_bits) / sub_index->chapter_zone_bits;
+
+ if (expire_count == 1) {
+ vdo_log_ratelimit(vdo_log_info,
+ "zone %u: At chapter %llu, expiring chapter %llu early",
+ zone_number,
+ (unsigned long long) virtual_chapter,
+ (unsigned long long) zone->virtual_chapter_low);
+ zone->early_flushes++;
+ zone->virtual_chapter_low++;
+ } else {
+ u64 first_expired = zone->virtual_chapter_low;
+
+ if (first_expired + expire_count < zone->virtual_chapter_high) {
+ zone->early_flushes += expire_count;
+ zone->virtual_chapter_low += expire_count;
+ } else {
+ /* Never advance the low chapter past the open chapter. */
+ zone->early_flushes +=
+ zone->virtual_chapter_high - zone->virtual_chapter_low;
+ zone->virtual_chapter_low = zone->virtual_chapter_high;
+ }
+ vdo_log_ratelimit(vdo_log_info,
+ "zone %u: At chapter %llu, expiring chapters %llu to %llu early",
+ zone_number,
+ (unsigned long long) virtual_chapter,
+ (unsigned long long) first_expired,
+ (unsigned long long) zone->virtual_chapter_low - 1);
+ }
+ }
+}
+
+/*
+ * Set the open chapter number for one zone of the volume index. The non-hook sub-index is
+ * always updated; the hook sub-index is updated, under the zone's hook mutex, only when the
+ * index is sparse.
+ */
+void uds_set_volume_index_zone_open_chapter(struct volume_index *volume_index,
+					    unsigned int zone_number,
+					    u64 virtual_chapter)
+{
+	struct mutex *mutex;
+
+	set_volume_sub_index_zone_open_chapter(&volume_index->vi_non_hook, zone_number,
+					       virtual_chapter);
+
+	if (!has_sparse(volume_index))
+		return;
+
+	/*
+	 * Other threads cannot be allowed to call uds_lookup_volume_index_name() while the open
+	 * chapter number is changing.
+	 *
+	 * The zones array holding the mutex is only allocated by uds_make_volume_index() on the
+	 * sparse path, so its address is not computed until the index is known to be sparse.
+	 */
+	mutex = &volume_index->zones[zone_number].hook_mutex;
+	mutex_lock(mutex);
+	set_volume_sub_index_zone_open_chapter(&volume_index->vi_hook, zone_number,
+					       virtual_chapter);
+	mutex_unlock(mutex);
+}
+
+/*
+ * Set the newest open chapter number for every zone of the index, while also advancing the
+ * oldest valid chapter number.
+ */
+void uds_set_volume_index_open_chapter(struct volume_index *volume_index,
+				       u64 virtual_chapter)
+{
+	unsigned int z = 0;
+
+	while (z < volume_index->zone_count) {
+		uds_set_volume_index_zone_open_chapter(volume_index, z, virtual_chapter);
+		z++;
+	}
+}
+
+/*
+ * Change the chapter that an existing index entry refers to. The record must describe an
+ * existing entry, and the new chapter must be within the range currently indexed by the
+ * record's zone. If the record carries a mutex, the update is done while holding it.
+ */
+int uds_set_volume_index_record_chapter(struct volume_index_record *record,
+					u64 virtual_chapter)
+{
+	const struct volume_sub_index *sub_index = record->sub_index;
+	int result;
+
+	if (!record->is_found) {
+		return vdo_log_warning_strerror(UDS_BAD_STATE,
+						"illegal operation on new record");
+	}
+
+	if (!is_virtual_chapter_indexed(record, virtual_chapter)) {
+		u64 lowest = get_zone_for_record(record)->virtual_chapter_low;
+		u64 highest = get_zone_for_record(record)->virtual_chapter_high;
+
+		return vdo_log_warning_strerror(UDS_INVALID_ARGUMENT,
+						"cannot set chapter number %llu that is out of the valid range %llu to %llu",
+						(unsigned long long) virtual_chapter,
+						(unsigned long long) lowest,
+						(unsigned long long) highest);
+	}
+
+	if (unlikely(record->mutex != NULL))
+		mutex_lock(record->mutex);
+
+	result = uds_set_delta_entry_value(&record->delta_entry,
+					   convert_virtual_to_index(sub_index,
+								    virtual_chapter));
+
+	if (unlikely(record->mutex != NULL))
+		mutex_unlock(record->mutex);
+
+	if (result == UDS_SUCCESS)
+		record->virtual_chapter = virtual_chapter;
+
+	return result;
+}
+
+/*
+ * Look up a name in a sub-index without modifying it. Returns the virtual chapter the matching
+ * entry refers to, or NO_CHAPTER if the search fails, no entry has the name's address, or the
+ * entry's chapter is beyond the zone's highest valid chapter.
+ */
+static u64 lookup_volume_sub_index_name(const struct volume_sub_index *sub_index,
+					const struct uds_record_name *name)
+{
+	struct delta_index_entry entry;
+	u32 key = extract_address(sub_index, name);
+	u32 list_number = extract_dlist_num(sub_index, name);
+	unsigned int zone_number = get_volume_sub_index_zone(sub_index, name);
+	const struct volume_sub_index_zone *zone = &sub_index->zones[zone_number];
+	u32 index_chapter;
+	u32 offset_from_low;
+	u64 chapter;
+	int result;
+
+	result = uds_get_delta_index_entry(&sub_index->delta_index, list_number, key,
+					   name->name, &entry);
+	if ((result != UDS_SUCCESS) || entry.at_end || (entry.key != key))
+		return NO_CHAPTER;
+
+	/* Index chapters wrap, so measure the entry's distance from the lowest valid chapter. */
+	index_chapter = uds_get_delta_entry_value(&entry);
+	offset_from_low = (index_chapter - zone->virtual_chapter_low) & sub_index->chapter_mask;
+	chapter = zone->virtual_chapter_low + offset_from_low;
+
+	return (chapter > zone->virtual_chapter_high) ? NO_CHAPTER : chapter;
+}
+
+/*
+ * Do a read-only lookup of the record name for sparse cache management.
+ *
+ * Only sampled names are looked up; any other name yields NO_CHAPTER immediately. For a sampled
+ * name, the hook sub-index is searched while holding the hook mutex of the name's zone, and the
+ * virtual chapter found (or NO_CHAPTER) is returned.
+ */
+u64 uds_lookup_volume_index_name(const struct volume_index *volume_index,
+				 const struct uds_record_name *name)
+{
+	unsigned int zone_number;
+	struct mutex *mutex;
+	u64 virtual_chapter;
+
+	if (!uds_is_volume_index_sample(volume_index, name))
+		return NO_CHAPTER;
+
+	/*
+	 * The zones array holding the mutex is only allocated by uds_make_volume_index() on the
+	 * sparse path, so the zone and mutex are not computed until the name is known to be a
+	 * sample. A sampled name always belongs to the hook sub-index.
+	 */
+	zone_number = get_volume_sub_index_zone(&volume_index->vi_hook, name);
+	mutex = &volume_index->zones[zone_number].hook_mutex;
+
+	mutex_lock(mutex);
+	virtual_chapter = lookup_volume_sub_index_name(&volume_index->vi_hook, name);
+	mutex_unlock(mutex);
+
+	return virtual_chapter;
+}
+
+/* Abandon a restore of one sub-index by resetting its delta index. */
+static void abort_restoring_volume_sub_index(struct volume_sub_index *sub_index)
+{
+ uds_reset_delta_index(&sub_index->delta_index);
+}
+
+/* Abandon a restore of the volume index, resetting the hook sub-index too when it is in use. */
+static void abort_restoring_volume_index(struct volume_index *volume_index)
+{
+	abort_restoring_volume_sub_index(&volume_index->vi_non_hook);
+	if (!has_sparse(volume_index))
+		return;
+
+	abort_restoring_volume_sub_index(&volume_index->vi_hook);
+}
+
+/*
+ * Begin restoring a sub-index from a set of saved zone streams.
+ *
+ * For each reader this decodes the sub-index header, checks the magic number and volume nonce,
+ * checks that every stream agrees on the highest virtual chapter, and reads the flush chapter
+ * of each delta list covered by that stream. The lowest virtual chapter used is the largest one
+ * found in any stream. Every zone is then reset to that chapter range and the restore of the
+ * underlying delta index is started.
+ *
+ * Returns UDS_SUCCESS, the error from a failed read or delta index restore, or UDS_CORRUPT_DATA
+ * if the saved data is inconsistent.
+ */
+static int start_restoring_volume_sub_index(struct volume_sub_index *sub_index,
+					    struct buffered_reader **readers,
+					    unsigned int reader_count)
+{
+	unsigned int z;
+	int result;
+	u64 virtual_chapter_low = 0, virtual_chapter_high = 0;
+	unsigned int i;
+
+	for (i = 0; i < reader_count; i++) {
+		struct sub_index_data header;
+		u8 buffer[sizeof(struct sub_index_data)];
+		size_t offset = 0;
+		u32 j;
+
+		result = uds_read_from_buffered_reader(readers[i], buffer,
+						       sizeof(buffer));
+		if (result != UDS_SUCCESS) {
+			return vdo_log_warning_strerror(result,
+							"failed to read volume index header");
+		}
+
+		memcpy(&header.magic, buffer, MAGIC_SIZE);
+		offset += MAGIC_SIZE;
+		decode_u64_le(buffer, &offset, &header.volume_nonce);
+		decode_u64_le(buffer, &offset, &header.virtual_chapter_low);
+		decode_u64_le(buffer, &offset, &header.virtual_chapter_high);
+		decode_u32_le(buffer, &offset, &header.first_list);
+		decode_u32_le(buffer, &offset, &header.list_count);
+
+		/* A length mismatch means the header was not decoded correctly, so stop here. */
+		result = VDO_ASSERT(offset == sizeof(buffer),
+				    "%zu bytes decoded of %zu expected", offset,
+				    sizeof(buffer));
+		if (result != VDO_SUCCESS)
+			return UDS_CORRUPT_DATA;
+
+		if (memcmp(header.magic, MAGIC_START_5, MAGIC_SIZE) != 0) {
+			return vdo_log_warning_strerror(UDS_CORRUPT_DATA,
+							"volume index file had bad magic number");
+		}
+
+		if (sub_index->volume_nonce == 0) {
+			sub_index->volume_nonce = header.volume_nonce;
+		} else if (header.volume_nonce != sub_index->volume_nonce) {
+			return vdo_log_warning_strerror(UDS_CORRUPT_DATA,
+							"volume index volume nonce incorrect");
+		}
+
+		if (i == 0) {
+			virtual_chapter_low = header.virtual_chapter_low;
+			virtual_chapter_high = header.virtual_chapter_high;
+		} else if (virtual_chapter_high != header.virtual_chapter_high) {
+			u64 low = header.virtual_chapter_low;
+			u64 high = header.virtual_chapter_high;
+
+			return vdo_log_warning_strerror(UDS_CORRUPT_DATA,
+							"Inconsistent volume index zone files: Chapter range is [%llu,%llu], chapter range %u is [%llu,%llu]",
+							(unsigned long long) virtual_chapter_low,
+							(unsigned long long) virtual_chapter_high,
+							i, (unsigned long long) low,
+							(unsigned long long) high);
+		} else if (virtual_chapter_low < header.virtual_chapter_low) {
+			virtual_chapter_low = header.virtual_chapter_low;
+		}
+
+		/*
+		 * The list range comes from the saved data. Make sure it lies entirely within the
+		 * flush_chapters array (one entry per delta list) before storing through it. The
+		 * sum is done in 64 bits so that it cannot wrap.
+		 */
+		if (((u64) header.first_list + header.list_count) > sub_index->list_count) {
+			return vdo_log_warning_strerror(UDS_CORRUPT_DATA,
+							"volume index flush ranges exceed the delta list count");
+		}
+
+		for (j = 0; j < header.list_count; j++) {
+			u8 decoded[sizeof(u64)];
+
+			result = uds_read_from_buffered_reader(readers[i], decoded,
+							       sizeof(u64));
+			if (result != UDS_SUCCESS) {
+				return vdo_log_warning_strerror(result,
+								"failed to read volume index flush ranges");
+			}
+
+			sub_index->flush_chapters[header.first_list + j] =
+				get_unaligned_le64(decoded);
+		}
+	}
+
+	for (z = 0; z < sub_index->zone_count; z++) {
+		memset(&sub_index->zones[z], 0, sizeof(struct volume_sub_index_zone));
+		sub_index->zones[z].virtual_chapter_low = virtual_chapter_low;
+		sub_index->zones[z].virtual_chapter_high = virtual_chapter_high;
+	}
+
+	result = uds_start_restoring_delta_index(&sub_index->delta_index, readers,
+						 reader_count);
+	if (result != UDS_SUCCESS)
+		return vdo_log_warning_strerror(result, "restoring delta index failed");
+
+	return UDS_SUCCESS;
+}
+
+/*
+ * Begin restoring the volume index from a set of saved zone streams.
+ *
+ * A dense index is just its non-hook sub-index. For a sparse index, each stream starts with a
+ * volume index header holding the sparse sample rate, which must be the same in every stream;
+ * the non-hook sub-index and then the hook sub-index follow.
+ *
+ * Returns UDS_SUCCESS, the error from a failed read or sub-index restore, or UDS_CORRUPT_DATA
+ * if the saved data is inconsistent.
+ */
+static int start_restoring_volume_index(struct volume_index *volume_index,
+					struct buffered_reader **buffered_readers,
+					unsigned int reader_count)
+{
+	unsigned int i;
+	int result;
+
+	if (!has_sparse(volume_index)) {
+		return start_restoring_volume_sub_index(&volume_index->vi_non_hook,
+							buffered_readers, reader_count);
+	}
+
+	for (i = 0; i < reader_count; i++) {
+		struct volume_index_data header;
+		u8 buffer[sizeof(struct volume_index_data)];
+		size_t offset = 0;
+
+		result = uds_read_from_buffered_reader(buffered_readers[i], buffer,
+						       sizeof(buffer));
+		if (result != UDS_SUCCESS) {
+			return vdo_log_warning_strerror(result,
+							"failed to read volume index header");
+		}
+
+		memcpy(&header.magic, buffer, MAGIC_SIZE);
+		offset += MAGIC_SIZE;
+		decode_u32_le(buffer, &offset, &header.sparse_sample_rate);
+
+		/* A length mismatch means the header was not decoded correctly, so stop here. */
+		result = VDO_ASSERT(offset == sizeof(buffer),
+				    "%zu bytes decoded of %zu expected", offset,
+				    sizeof(buffer));
+		if (result != VDO_SUCCESS)
+			return UDS_CORRUPT_DATA;
+
+		if (memcmp(header.magic, MAGIC_START_6, MAGIC_SIZE) != 0)
+			return vdo_log_warning_strerror(UDS_CORRUPT_DATA,
+							"volume index file had bad magic number");
+
+		/* The first stream supplies the sample rate; the rest must agree with it. */
+		if (i == 0) {
+			volume_index->sparse_sample_rate = header.sparse_sample_rate;
+		} else if (volume_index->sparse_sample_rate != header.sparse_sample_rate) {
+			vdo_log_warning_strerror(UDS_CORRUPT_DATA,
+						 "Inconsistent sparse sample rate in delta index zone files: %u vs. %u",
+						 volume_index->sparse_sample_rate,
+						 header.sparse_sample_rate);
+			return UDS_CORRUPT_DATA;
+		}
+	}
+
+	result = start_restoring_volume_sub_index(&volume_index->vi_non_hook,
+						  buffered_readers, reader_count);
+	if (result != UDS_SUCCESS)
+		return result;
+
+	return start_restoring_volume_sub_index(&volume_index->vi_hook, buffered_readers,
+						reader_count);
+}
+
+/* Finish restoring one sub-index by finishing the restore of its delta index. */
+static int finish_restoring_volume_sub_index(struct volume_sub_index *sub_index,
+ struct buffered_reader **buffered_readers,
+ unsigned int reader_count)
+{
+ return uds_finish_restoring_delta_index(&sub_index->delta_index,
+ buffered_readers, reader_count);
+}
+
+/*
+ * Finish restoring the volume index: the non-hook sub-index first and then, for a sparse index,
+ * the hook sub-index. The first failure is returned.
+ */
+static int finish_restoring_volume_index(struct volume_index *volume_index,
+					 struct buffered_reader **buffered_readers,
+					 unsigned int reader_count)
+{
+	int result = finish_restoring_volume_sub_index(&volume_index->vi_non_hook,
+						       buffered_readers, reader_count);
+
+	if (result != UDS_SUCCESS)
+		return result;
+
+	if (!has_sparse(volume_index))
+		return result;
+
+	return finish_restoring_volume_sub_index(&volume_index->vi_hook, buffered_readers,
+						 reader_count);
+}
+
+/*
+ * Load a saved volume index from a set of zone streams. If the load fails after the headers
+ * have been read, the partially restored index is reset before the error is returned.
+ */
+int uds_load_volume_index(struct volume_index *volume_index,
+			  struct buffered_reader **readers, unsigned int reader_count)
+{
+	int result;
+
+	/* Start by reading the header section of the stream. */
+	result = start_restoring_volume_index(volume_index, readers, reader_count);
+	if (result != UDS_SUCCESS)
+		return result;
+
+	result = finish_restoring_volume_index(volume_index, readers, reader_count);
+
+	/* Check the final guard lists to make sure there is no extra data. */
+	if (result == UDS_SUCCESS)
+		result = uds_check_guard_delta_lists(readers, reader_count);
+
+	if (result != UDS_SUCCESS)
+		abort_restoring_volume_index(volume_index);
+
+	return result;
+}
+
+/*
+ * Begin saving one zone of a sub-index: write the sub-index header, then the flush chapter of
+ * every delta list belonging to the zone, then start saving the zone's delta index.
+ */
+static int start_saving_volume_sub_index(const struct volume_sub_index *sub_index,
+					 unsigned int zone_number,
+					 struct buffered_writer *buffered_writer)
+{
+	const struct volume_sub_index_zone *zone = &sub_index->zones[zone_number];
+	u32 first_list = sub_index->delta_index.delta_zones[zone_number].first_list;
+	u32 list_count = sub_index->delta_index.delta_zones[zone_number].list_count;
+	u8 header[sizeof(struct sub_index_data)];
+	size_t offset = 0;
+	u32 list;
+	int result;
+
+	/* Encode the header fields in their saved order. */
+	memcpy(header, MAGIC_START_5, MAGIC_SIZE);
+	offset += MAGIC_SIZE;
+	encode_u64_le(header, &offset, sub_index->volume_nonce);
+	encode_u64_le(header, &offset, zone->virtual_chapter_low);
+	encode_u64_le(header, &offset, zone->virtual_chapter_high);
+	encode_u32_le(header, &offset, first_list);
+	encode_u32_le(header, &offset, list_count);
+
+	result = VDO_ASSERT(offset == sizeof(struct sub_index_data),
+			    "%zu bytes of config written, of %zu expected", offset,
+			    sizeof(struct sub_index_data));
+	if (result != VDO_SUCCESS)
+		return result;
+
+	result = uds_write_to_buffered_writer(buffered_writer, header, offset);
+	if (result != UDS_SUCCESS) {
+		return vdo_log_warning_strerror(result,
+						"failed to write volume index header");
+	}
+
+	for (list = 0; list < list_count; list++) {
+		u8 encoded[sizeof(u64)];
+
+		put_unaligned_le64(sub_index->flush_chapters[first_list + list], encoded);
+		result = uds_write_to_buffered_writer(buffered_writer, encoded,
+						      sizeof(u64));
+		if (result != UDS_SUCCESS) {
+			return vdo_log_warning_strerror(result,
+							"failed to write volume index flush ranges");
+		}
+	}
+
+	return uds_start_saving_delta_index(&sub_index->delta_index, zone_number,
+					    buffered_writer);
+}
+
+/*
+ * Begin saving one zone of the volume index. A dense index saves only its non-hook sub-index.
+ * A sparse index writes a volume index header holding the sparse sample rate, followed by the
+ * non-hook sub-index and then the hook sub-index.
+ */
+static int start_saving_volume_index(const struct volume_index *volume_index,
+				     unsigned int zone_number,
+				     struct buffered_writer *writer)
+{
+	u8 header[sizeof(struct volume_index_data)];
+	size_t offset = 0;
+	int result;
+
+	if (!has_sparse(volume_index)) {
+		return start_saving_volume_sub_index(&volume_index->vi_non_hook,
+						     zone_number, writer);
+	}
+
+	memcpy(header, MAGIC_START_6, MAGIC_SIZE);
+	offset += MAGIC_SIZE;
+	encode_u32_le(header, &offset, volume_index->sparse_sample_rate);
+
+	result = VDO_ASSERT(offset == sizeof(struct volume_index_data),
+			    "%zu bytes of header written, of %zu expected", offset,
+			    sizeof(struct volume_index_data));
+	if (result != VDO_SUCCESS)
+		return result;
+
+	result = uds_write_to_buffered_writer(writer, header, offset);
+	if (result != UDS_SUCCESS) {
+		vdo_log_warning_strerror(result, "failed to write volume index header");
+		return result;
+	}
+
+	result = start_saving_volume_sub_index(&volume_index->vi_non_hook, zone_number,
+					       writer);
+	if (result == UDS_SUCCESS) {
+		result = start_saving_volume_sub_index(&volume_index->vi_hook, zone_number,
+						       writer);
+	}
+
+	return result;
+}
+
+/* Finish saving one zone of a sub-index by finishing the save of its delta index zone. */
+static int finish_saving_volume_sub_index(const struct volume_sub_index *sub_index,
+ unsigned int zone_number)
+{
+ return uds_finish_saving_delta_index(&sub_index->delta_index, zone_number);
+}
+
+/*
+ * Finish saving one zone of the volume index: the non-hook sub-index first and then, for a
+ * sparse index, the hook sub-index. The first failure is returned.
+ */
+static int finish_saving_volume_index(const struct volume_index *volume_index,
+				      unsigned int zone_number)
+{
+	int result = finish_saving_volume_sub_index(&volume_index->vi_non_hook, zone_number);
+
+	if (result != UDS_SUCCESS)
+		return result;
+
+	if (!has_sparse(volume_index))
+		return result;
+
+	return finish_saving_volume_sub_index(&volume_index->vi_hook, zone_number);
+}
+
+/*
+ * Save the volume index, writing zone N to writer N. For each zone the index data is written,
+ * followed by a guard delta list, and the writer is flushed. Saving stops at the first failure,
+ * which is returned.
+ */
+int uds_save_volume_index(struct volume_index *volume_index,
+			  struct buffered_writer **writers, unsigned int writer_count)
+{
+	unsigned int zone;
+	int result;
+
+	for (zone = 0; zone < writer_count; zone++) {
+		struct buffered_writer *writer = writers[zone];
+
+		result = start_saving_volume_index(volume_index, zone, writer);
+		if (result != UDS_SUCCESS)
+			return result;
+
+		result = finish_saving_volume_index(volume_index, zone);
+		if (result != UDS_SUCCESS)
+			return result;
+
+		result = uds_write_guard_delta_list(writer);
+		if (result != UDS_SUCCESS)
+			return result;
+
+		result = uds_flush_buffered_writer(writer);
+		if (result != UDS_SUCCESS)
+			return result;
+	}
+
+	return UDS_SUCCESS;
+}
+
+/*
+ * Fill in the statistics for one sub-index: the counters reported by its delta index plus the
+ * total number of early flushes across all of its zones.
+ */
+static void get_volume_sub_index_stats(const struct volume_sub_index *sub_index,
+				       struct volume_index_stats *stats)
+{
+	struct delta_index_stats delta_stats;
+	unsigned int zone;
+
+	uds_get_delta_index_stats(&sub_index->delta_index, &delta_stats);
+
+	*stats = (struct volume_index_stats) {
+		.rebalance_time = delta_stats.rebalance_time,
+		.rebalance_count = delta_stats.rebalance_count,
+		.record_count = delta_stats.record_count,
+		.collision_count = delta_stats.collision_count,
+		.discard_count = delta_stats.discard_count,
+		.overflow_count = delta_stats.overflow_count,
+		.delta_lists = delta_stats.list_count,
+		.early_flushes = 0,
+	};
+
+	for (zone = 0; zone < sub_index->zone_count; zone++)
+		stats->early_flushes += sub_index->zones[zone].early_flushes;
+}
+
+/*
+ * Fill in the statistics for the whole volume index. For a sparse index, every field is the sum
+ * of the non-hook and hook sub-index values.
+ */
+void uds_get_volume_index_stats(const struct volume_index *volume_index,
+				struct volume_index_stats *stats)
+{
+	get_volume_sub_index_stats(&volume_index->vi_non_hook, stats);
+
+	if (has_sparse(volume_index)) {
+		struct volume_index_stats hook_stats;
+
+		get_volume_sub_index_stats(&volume_index->vi_hook, &hook_stats);
+		stats->rebalance_time += hook_stats.rebalance_time;
+		stats->rebalance_count += hook_stats.rebalance_count;
+		stats->record_count += hook_stats.record_count;
+		stats->collision_count += hook_stats.collision_count;
+		stats->discard_count += hook_stats.discard_count;
+		stats->overflow_count += hook_stats.overflow_count;
+		stats->delta_lists += hook_stats.delta_lists;
+		stats->early_flushes += hook_stats.early_flushes;
+	}
+}
+
+/*
+ * Initialize one sub-index from a configuration: compute its parameters, set up its delta
+ * index, derive its per-zone size limit, and allocate its flush chapter and zone arrays.
+ *
+ * On failure the sub-index may be partially initialized; this function does not release what
+ * it has already set up.
+ */
+static int initialize_volume_sub_index(const struct uds_configuration *config,
+ u64 volume_nonce, u8 tag,
+ struct volume_sub_index *sub_index)
+{
+ struct sub_index_parameters params = { .address_bits = 0 };
+ unsigned int zone_count = config->zone_count;
+ u64 available_bytes = 0;
+ unsigned int z;
+ int result;
+
+ result = compute_volume_sub_index_parameters(config, &params);
+ if (result != UDS_SUCCESS)
+ return result;
+
+ sub_index->address_bits = params.address_bits;
+ sub_index->address_mask = (1u << params.address_bits) - 1;
+ sub_index->chapter_bits = params.chapter_bits;
+ sub_index->chapter_mask = (1u << params.chapter_bits) - 1;
+ sub_index->chapter_count = params.chapter_count;
+ sub_index->list_count = params.list_count;
+ sub_index->zone_count = zone_count;
+ /* The bits one chapter is expected to use in each zone, given an even split across zones. */
+ sub_index->chapter_zone_bits = params.chapter_size_in_bits / zone_count;
+ sub_index->volume_nonce = volume_nonce;
+
+ result = uds_initialize_delta_index(&sub_index->delta_index, zone_count,
+ params.list_count, params.mean_delta,
+ params.chapter_bits, params.memory_size,
+ tag);
+ if (result != UDS_SUCCESS)
+ return result;
+
+ /* The per-zone limit is the total delta zone size, less the target free space, split evenly. */
+ for (z = 0; z < sub_index->delta_index.zone_count; z++)
+ available_bytes += sub_index->delta_index.delta_zones[z].size;
+ available_bytes -= params.target_free_bytes;
+ sub_index->max_zone_bits = (available_bytes * BITS_PER_BYTE) / zone_count;
+ sub_index->memory_size = (sub_index->delta_index.memory_size +
+ sizeof(struct volume_sub_index) +
+ (params.list_count * sizeof(u64)) +
+ (zone_count * sizeof(struct volume_sub_index_zone)));
+
+ /* The following arrays are initialized to all zeros. */
+ result = vdo_allocate(params.list_count, u64, "first chapter to flush",
+ &sub_index->flush_chapters);
+ if (result != VDO_SUCCESS)
+ return result;
+
+ return vdo_allocate(zone_count, struct volume_sub_index_zone,
+ "volume index zones", &sub_index->zones);
+}
+
+/*
+ * Allocate and initialize a volume index for the given configuration. A dense geometry gets a
+ * single sub-index. A sparse geometry gets a per-zone array of hook mutexes plus separate
+ * non-hook and hook sub-indexes built from a split configuration.
+ *
+ * On success the new index is stored in *volume_index_ptr. On failure everything allocated so
+ * far is freed and *volume_index_ptr is left untouched.
+ */
+int uds_make_volume_index(const struct uds_configuration *config, u64 volume_nonce,
+			  struct volume_index **volume_index_ptr)
+{
+	struct volume_index *index;
+	struct split_config split;
+	unsigned int z;
+	int result;
+
+	result = vdo_allocate(1, struct volume_index, "volume index", &index);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	index->zone_count = config->zone_count;
+
+	if (!uds_is_sparse_index_geometry(config->geometry)) {
+		/* Dense index: one sub-index holds every name. */
+		result = initialize_volume_sub_index(config, volume_nonce, 'm',
+						     &index->vi_non_hook);
+		if (result != UDS_SUCCESS) {
+			uds_free_volume_index(index);
+			return result;
+		}
+
+		index->memory_size = index->vi_non_hook.memory_size;
+		*volume_index_ptr = index;
+		return UDS_SUCCESS;
+	}
+
+	/* Sparse index: sampled names go to the hook sub-index, guarded by per-zone mutexes. */
+	index->sparse_sample_rate = config->sparse_sample_rate;
+
+	result = vdo_allocate(config->zone_count, struct volume_index_zone,
+			      "volume index zones", &index->zones);
+	if (result != VDO_SUCCESS) {
+		uds_free_volume_index(index);
+		return result;
+	}
+
+	for (z = 0; z < config->zone_count; z++)
+		mutex_init(&index->zones[z].hook_mutex);
+
+	split_configuration(config, &split);
+
+	result = initialize_volume_sub_index(&split.non_hook_config, volume_nonce, 'd',
+					     &index->vi_non_hook);
+	if (result != UDS_SUCCESS) {
+		uds_free_volume_index(index);
+		return vdo_log_error_strerror(result,
+					      "Error creating non hook volume index");
+	}
+
+	result = initialize_volume_sub_index(&split.hook_config, volume_nonce, 's',
+					     &index->vi_hook);
+	if (result != UDS_SUCCESS) {
+		uds_free_volume_index(index);
+		return vdo_log_error_strerror(result,
+					      "Error creating hook volume index");
+	}
+
+	index->memory_size = index->vi_non_hook.memory_size + index->vi_hook.memory_size;
+	*volume_index_ptr = index;
+	return UDS_SUCCESS;
+}
diff --git a/drivers/md/dm-vdo/indexer/volume-index.h b/drivers/md/dm-vdo/indexer/volume-index.h
new file mode 100644
index 000000000000..583998c547b7
--- /dev/null
+++ b/drivers/md/dm-vdo/indexer/volume-index.h
@@ -0,0 +1,193 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#ifndef UDS_VOLUME_INDEX_H
+#define UDS_VOLUME_INDEX_H
+
+#include <linux/limits.h>
+
+#include "thread-utils.h"
+
+#include "config.h"
+#include "delta-index.h"
+#include "indexer.h"
+
+/*
+ * The volume index is the primary top-level index for UDS. It contains records which map a record
+ * name to the chapter where a record with that name is stored. This mapping can definitively say
+ * when no record exists. However, because we only use a subset of the name for this index, it
+ * cannot definitively say that a record for the entry does exist. It can only say that if a record
+ * exists, it will be in a particular chapter. The request can then be dispatched to that chapter
+ * for further processing.
+ *
+ * If the volume_index_record does not actually match the record name, the index can store a more
+ * specific collision record to disambiguate the new entry from the existing one. Index entries are
+ * managed with volume_index_record structures.
+ */
+
+#define NO_CHAPTER U64_MAX
+
+struct volume_index_stats { /* Output type of uds_get_volume_index_stats() */
+ /* Nanoseconds spent rebalancing */
+ ktime_t rebalance_time;
+ /* Number of memory rebalances */
+ u32 rebalance_count;
+ /* The number of records in the index */
+ u64 record_count;
+ /* The number of collision records */
+ u64 collision_count;
+ /* The number of records removed */
+ u64 discard_count;
+ /* The number of UDS_OVERFLOWs detected */
+ u64 overflow_count;
+ /* The number of delta lists */
+ u32 delta_lists;
+ /* Number of early flushes (see also volume_sub_index_zone.early_flushes) */
+ u64 early_flushes;
+};
+
+struct volume_sub_index_zone { /* Per-zone state of a volume_sub_index (see its zones field) */
+ u64 virtual_chapter_low; /* NOTE(review): presumably the lowest virtual chapter indexed in this zone -- confirm */
+ u64 virtual_chapter_high; /* NOTE(review): presumably the highest virtual chapter indexed in this zone -- confirm */
+ u64 early_flushes; /* Early flush counter for this zone */
+} __aligned(L1_CACHE_BYTES); /* Each zone's entry is aligned to an L1 cache line */
+
+struct volume_sub_index { /* One sub-index of a volume_index (see vi_non_hook and vi_hook) */
+ /* The delta index holding this sub-index's entries */
+ struct delta_index delta_index;
+ /* The first chapter to be flushed in each zone */
+ u64 *flush_chapters;
+ /* The per-zone state (struct volume_sub_index_zone) */
+ struct volume_sub_index_zone *zones;
+ /* The volume nonce */
+ u64 volume_nonce;
+ /* Expected size of a chapter (per zone) */
+ u64 chapter_zone_bits;
+ /* Maximum size of the index (per zone) */
+ u64 max_zone_bits;
+ /* The number of bits in address mask */
+ u8 address_bits;
+ /* Mask to get address within delta list */
+ u32 address_mask;
+ /* The number of bits in chapter number */
+ u8 chapter_bits;
+ /* The largest storable chapter number */
+ u32 chapter_mask;
+ /* The number of chapters used */
+ u32 chapter_count;
+ /* The number of delta lists */
+ u32 list_count;
+ /* The number of zones */
+ unsigned int zone_count;
+ /* The amount of memory allocated; summed into volume_index.memory_size */
+ u64 memory_size;
+};
+
+struct volume_index_zone { /* Per-zone state of a volume_index (see its zones field) */
+ /* Protects the sampled index in this zone */
+ struct mutex hook_mutex;
+} __aligned(L1_CACHE_BYTES); /* Each zone's entry is aligned to an L1 cache line */
+
+struct volume_index { /* Top-level volume index, created by uds_make_volume_index() */
+ u32 sparse_sample_rate; /* Copied from the configuration, only for a sparse index geometry */
+ unsigned int zone_count; /* Number of zones, copied from the configuration */
+ u64 memory_size; /* Sum of the memory_size of every initialized sub-index */
+ struct volume_sub_index vi_non_hook; /* The only sub-index of a dense index; the non-hook half of a sparse one */
+ struct volume_sub_index vi_hook; /* The hook sub-index; only initialized for a sparse index geometry */
+ struct volume_index_zone *zones; /* zone_count entries; only allocated for a sparse index geometry */
+};
+
+/*
+ * The volume_index_record structure is used to facilitate processing of a record name. A client
+ * first calls uds_get_volume_index_record() to find the volume index record for a record name. The
+ * fields of the record can then be examined to determine the state of the record.
+ *
+ * If is_found is false, then the index did not find an entry for the record name. Calling
+ * uds_put_volume_index_record() will insert a new entry for that name at the proper place.
+ *
+ * If is_found is true, then we did find an entry for the record name, and the virtual_chapter and
+ * is_collision fields reflect the entry found. Subsequently, a call to
+ * uds_remove_volume_index_record() will remove the entry, a call to
+ * uds_set_volume_index_record_chapter() will update the existing entry, and a call to
+ * uds_put_volume_index_record() will insert a new collision record after the existing entry.
+ *
+ * Only the fields marked public below are meant to be examined by clients; the private fields
+ * carry the lookup state between the calls listed above.
+ */
+struct volume_index_record {
+ /* Public fields */
+
+ /* Chapter where the record info is found */
+ u64 virtual_chapter;
+ /* This record is a collision */
+ bool is_collision;
+ /* This record is the requested record */
+ bool is_found;
+
+ /* Private fields */
+
+ /* Zone that contains this name */
+ unsigned int zone_number;
+ /* The volume sub-index associated with this record */
+ struct volume_sub_index *sub_index;
+ /* Mutex for accessing this delta index entry in the hook index */
+ struct mutex *mutex;
+ /* The record name to which this record refers */
+ const struct uds_record_name *name;
+ /* The delta index entry for this record */
+ struct delta_index_entry delta_entry;
+};
+
+int __must_check uds_make_volume_index(const struct uds_configuration *config,
+ u64 volume_nonce,
+ struct volume_index **volume_index);
+
+void uds_free_volume_index(struct volume_index *volume_index);
+
+int __must_check uds_compute_volume_index_save_blocks(const struct uds_configuration *config,
+ size_t block_size,
+ u64 *block_count);
+
+unsigned int __must_check uds_get_volume_index_zone(const struct volume_index *volume_index,
+ const struct uds_record_name *name);
+
+bool __must_check uds_is_volume_index_sample(const struct volume_index *volume_index,
+ const struct uds_record_name *name);
+
+/*
+ * This function is only used to manage sparse cache membership. Most requests should use
+ * uds_get_volume_index_record() to look up index records instead.
+ */
+u64 __must_check uds_lookup_volume_index_name(const struct volume_index *volume_index,
+ const struct uds_record_name *name);
+
+int __must_check uds_get_volume_index_record(struct volume_index *volume_index,
+ const struct uds_record_name *name,
+ struct volume_index_record *record);
+
+int __must_check uds_put_volume_index_record(struct volume_index_record *record,
+ u64 virtual_chapter);
+
+int __must_check uds_remove_volume_index_record(struct volume_index_record *record);
+
+int __must_check uds_set_volume_index_record_chapter(struct volume_index_record *record,
+ u64 virtual_chapter);
+
+void uds_set_volume_index_open_chapter(struct volume_index *volume_index,
+ u64 virtual_chapter);
+
+void uds_set_volume_index_zone_open_chapter(struct volume_index *volume_index,
+ unsigned int zone_number,
+ u64 virtual_chapter);
+
+int __must_check uds_load_volume_index(struct volume_index *volume_index,
+ struct buffered_reader **readers,
+ unsigned int reader_count);
+
+int __must_check uds_save_volume_index(struct volume_index *volume_index,
+ struct buffered_writer **writers,
+ unsigned int writer_count);
+
+void uds_get_volume_index_stats(const struct volume_index *volume_index,
+ struct volume_index_stats *stats);
+
+#endif /* UDS_VOLUME_INDEX_H */
diff --git a/drivers/md/dm-vdo/indexer/volume.c b/drivers/md/dm-vdo/indexer/volume.c
new file mode 100644
index 000000000000..655453bb276b
--- /dev/null
+++ b/drivers/md/dm-vdo/indexer/volume.c
@@ -0,0 +1,1693 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#include "volume.h"
+
+#include <linux/atomic.h>
+#include <linux/dm-bufio.h>
+#include <linux/err.h>
+
+#include "errors.h"
+#include "logger.h"
+#include "memory-alloc.h"
+#include "permassert.h"
+#include "string-utils.h"
+#include "thread-utils.h"
+
+#include "chapter-index.h"
+#include "config.h"
+#include "geometry.h"
+#include "hash-utils.h"
+#include "index.h"
+#include "sparse-cache.h"
+
+/*
+ * The first block of the volume layout is reserved for the volume header, which is no longer used.
+ * The remainder of the volume is divided into chapters consisting of several pages of records, and
+ * several pages of static index to use to find those records. The index pages are recorded first,
+ * followed by the record pages. The chapters are written in order as they are filled, so the
+ * volume storage acts as a circular log of the most recent chapters, with each new chapter
+ * overwriting the oldest saved one.
+ *
+ * When a new chapter is filled and closed, the records from that chapter are sorted and
+ * interleaved in approximate temporal order, and assigned to record pages. Then a static delta
+ * index is generated to store which record page contains each record. The in-memory index page map
+ * is also updated to indicate which delta lists fall on each chapter index page. This means that
+ * when a record is read, the volume only has to load a single index page and a single record page,
+ * rather than search the entire chapter. These index and record pages are written to storage, and
+ * the index pages are transferred to the page cache under the theory that the most recently
+ * written chapter is likely to be accessed again soon.
+ *
+ * When reading a record, the volume index will indicate which chapter should contain it. The
+ * volume uses the index page map to determine which chapter index page needs to be loaded, and
+ * then reads the relevant record page number from the chapter index. Both index and record pages
+ * are stored in a page cache when read for the common case that subsequent records need the same
+ * pages. The page cache evicts the least recently accessed entries when caching new pages. In
+ * addition, the volume uses dm-bufio to manage access to the storage, which may allow for
+ * additional caching depending on available system resources.
+ *
+ * Record requests are handled from cached pages when possible. If a page needs to be read, it is
+ * placed on a queue along with the request that wants to read it. Any requests for the same page
+ * that arrive while the read is pending are added to the queue entry. A separate reader thread
+ * handles the queued reads, adding the page to the cache and updating any requests queued with it
+ * so they can continue processing. This allows the index zone threads to continue processing new
+ * requests rather than wait for the storage reads.
+ *
+ * When an index rebuild is necessary, the volume reads each stored chapter to determine which
+ * range of chapters contain valid records, so that those records can be used to reconstruct the
+ * in-memory volume index.
+ */
+
+/* The maximum allowable number of contiguous bad chapters */
+#define MAX_BAD_CHAPTERS 100
+#define VOLUME_CACHE_MAX_ENTRIES (U16_MAX >> 1) /* NOTE(review): presumably keeps cache slot numbers below VOLUME_CACHE_QUEUED_FLAG -- confirm */
+#define VOLUME_CACHE_QUEUED_FLAG (1 << 15) /* Set in a cache->index[] value whose low bits hold a read queue position */
+#define VOLUME_CACHE_MAX_QUEUED_READS 4096 /* Read queue positions wrap modulo this value */
+
+static const u64 BAD_CHAPTER = U64_MAX; /* NOTE(review): sentinel chapter value; its users are outside this excerpt */
+
+/*
+ * The invalidate counter is two 32-bit fields stored together atomically. The low order 32 bits
+ * are the physical page number of the cached page being read. The high order 32 bits are a
+ * sequence number. This value is written when the zone that owns it begins or completes a cache
+ * search. Any other thread will only read the counter in wait_for_pending_searches() while waiting
+ * to update the cache contents.
+ *
+ * The sequence number is incremented once when a search begins and once when it ends, so an odd
+ * sequence number means that a search is in progress (see search_pending()).
+ *
+ * NOTE(review): the low/high order description matches the field order below only on
+ * little-endian targets -- confirm whether any consumer of "value" depends on it.
+ */
+union invalidate_counter {
+ u64 value; /* Both fields, read and written as a single unit */
+ struct {
+ u32 page; /* Physical page number being searched */
+ u32 counter; /* Sequence number; odd while a search is pending */
+ };
+};
+
+/*
+ * Convert a physical page number into the page's offset within its chapter.
+ *
+ * The geometry is only read, so it is const-qualified to match map_to_physical_page(). The
+ * header pages are subtracted first; callers are expected to pass a page beyond the header.
+ */
+static inline u32 map_to_page_number(const struct index_geometry *geometry, u32 physical_page)
+{
+	return (physical_page - HEADER_PAGES_PER_VOLUME) % geometry->pages_per_chapter;
+}
+
+/*
+ * Convert a physical page number into the number of the physical chapter containing it.
+ *
+ * The geometry is only read, so it is const-qualified to match map_to_physical_page(). The
+ * header pages are subtracted first; callers are expected to pass a page beyond the header.
+ */
+static inline u32 map_to_chapter_number(const struct index_geometry *geometry, u32 physical_page)
+{
+	return (physical_page - HEADER_PAGES_PER_VOLUME) / geometry->pages_per_chapter;
+}
+
+/* Report whether a physical page is a record page rather than a chapter index page. */
+static inline bool is_record_page(struct index_geometry *geometry, u32 physical_page)
+{
+	u32 page_in_chapter = map_to_page_number(geometry, physical_page);
+
+	/* The index pages of a chapter come first, so everything after them is a record page. */
+	return page_in_chapter >= geometry->index_pages_per_chapter;
+}
+
+/* Compute the physical page number of a page within a physical chapter. */
+static u32 map_to_physical_page(const struct index_geometry *geometry, u32 chapter, u32 page)
+{
+	/* The number of pages occupied by all of the preceding chapters. */
+	u32 chapter_offset = geometry->pages_per_chapter * chapter;
+
+	/* Page zero is the header page, so the first chapter index page is page one. */
+	return HEADER_PAGES_PER_VOLUME + chapter_offset + page;
+}
+
+/* Take a snapshot of a zone's invalidate counter, reading both fields as one value. */
+static inline union invalidate_counter get_invalidate_counter(struct page_cache *cache,
+							      unsigned int zone_number)
+{
+	union invalidate_counter snapshot;
+
+	snapshot.value = READ_ONCE(cache->search_pending_counters[zone_number].atomic_value);
+	return snapshot;
+}
+
+/* Store a new invalidate counter for a zone, writing both fields as one value. */
+static inline void set_invalidate_counter(struct page_cache *cache,
+					  unsigned int zone_number,
+					  union invalidate_counter invalidate_counter)
+{
+	u64 new_value = invalidate_counter.value;
+
+	WRITE_ONCE(cache->search_pending_counters[zone_number].atomic_value, new_value);
+}
+
+/* A search is in progress exactly when the sequence number is odd. */
+static inline bool search_pending(union invalidate_counter invalidate_counter)
+{
+	return (invalidate_counter.counter % 2) == 1;
+}
+
+/*
+ * Lock the cache for a zone in order to search for a page. The zone's invalidate counter records
+ * the page about to be searched, and its sequence number is advanced, which should leave it odd.
+ */
+static void begin_pending_search(struct page_cache *cache, u32 physical_page,
+				 unsigned int zone_number)
+{
+	union invalidate_counter invalidate_counter;
+
+	invalidate_counter = get_invalidate_counter(cache, zone_number);
+	invalidate_counter.page = physical_page;
+	invalidate_counter.counter += 1;
+	set_invalidate_counter(cache, zone_number, invalidate_counter);
+	VDO_ASSERT_LOG_ONLY(search_pending(invalidate_counter),
+			    "Search is pending for zone %u", zone_number);
+
+	/*
+	 * Other threads must see the write to the invalidate counter before this thread
+	 * accesses the cached page, hence this memory barrier. The corresponding read memory
+	 * barrier is in wait_for_pending_searches().
+	 */
+	smp_mb();
+}
+
+/*
+ * Unlock the cache for a zone by advancing its sequence number again, which leaves the zone's
+ * invalidate counter in the "no search pending" state.
+ */
+static void end_pending_search(struct page_cache *cache, unsigned int zone_number)
+{
+	union invalidate_counter invalidate_counter;
+
+	/*
+	 * This thread must complete its reads of the cached page before other threads see the
+	 * write to the invalidate counter, hence this memory barrier.
+	 */
+	smp_mb();
+
+	invalidate_counter = get_invalidate_counter(cache, zone_number);
+	VDO_ASSERT_LOG_ONLY(search_pending(invalidate_counter),
+			    "Search is pending for zone %u", zone_number);
+	invalidate_counter.counter += 1;
+	set_invalidate_counter(cache, zone_number, invalidate_counter);
+}
+
+/*
+ * Wait until no zone is still in the middle of a search of the given physical page that was
+ * already pending when this function sampled the zone's invalidate counter.
+ */
+static void wait_for_pending_searches(struct page_cache *cache, u32 physical_page)
+{
+	union invalidate_counter snapshots[MAX_ZONES];
+	unsigned int zone;
+
+	/*
+	 * We hold the read_threads_mutex. We are waiting for threads that do not hold the
+	 * read_threads_mutex. Those threads have "locked" their targeted page by setting the
+	 * search_pending_counter. The corresponding write memory barrier is in
+	 * begin_pending_search().
+	 */
+	smp_mb();
+
+	/* Sample every zone first, then wait on the ones that matter. */
+	for (zone = 0; zone < cache->zone_count; zone++)
+		snapshots[zone] = get_invalidate_counter(cache, zone);
+
+	for (zone = 0; zone < cache->zone_count; zone++) {
+		if (!search_pending(snapshots[zone]))
+			continue;
+
+		if (snapshots[zone].page != physical_page)
+			continue;
+
+		/*
+		 * There is an active search using the physical page. We need to wait for the
+		 * search to finish, which is signalled by any change to the zone's counter.
+		 *
+		 * FIXME: Investigate using wait_event() to wait for the search to finish.
+		 */
+		while (get_invalidate_counter(cache, zone).value == snapshots[zone].value)
+			cond_resched();
+	}
+}
+
+/* Release the dm-bufio buffer attached to a cached page, if it has one. */
+static void release_page_buffer(struct cached_page *page)
+{
+	if (page->buffer == NULL)
+		return;
+
+	dm_bufio_release(vdo_forget(page->buffer));
+}
+
+/* Reset a cache slot so that it no longer holds any page. */
+static void clear_cache_page(struct page_cache *cache, struct cached_page *page)
+{
+	/* A physical page number of indexable_pages marks a slot that holds no page. */
+	const u32 no_page = cache->indexable_pages;
+
+	/* Do not clear read_pending because the read queue relies on it. */
+	release_page_buffer(page);
+	page->physical_page = no_page;
+	WRITE_ONCE(page->last_used, 0);
+}
+
+/* Stamp a page with a fresh clock value unless it already carries the current one. */
+static void make_page_most_recent(struct page_cache *cache, struct cached_page *page)
+{
+	/*
+	 * ASSERTION: We are either a zone thread holding a search_pending_counter, or we are any
+	 * thread holding the read_threads_mutex.
+	 */
+	if (READ_ONCE(page->last_used) == atomic64_read(&cache->clock))
+		return;
+
+	WRITE_ONCE(page->last_used, atomic64_inc_return(&cache->clock));
+}
+
+/*
+ * Select a page to remove from the cache to make space for a new entry. The chosen slot is
+ * returned cleared and with read_pending set.
+ */
+static struct cached_page *select_victim_in_cache(struct page_cache *cache)
+{
+	struct cached_page *victim;
+	int victim_slot = 0;
+	s64 victim_time = S64_MAX;
+	u16 slot;
+
+	/* Find the oldest unclaimed page. We hold the read_threads_mutex. */
+	for (slot = 0; slot < cache->cache_slots; slot++) {
+		s64 last_used;
+
+		/* A page with a pending read must not be replaced. */
+		if (cache->cache[slot].read_pending)
+			continue;
+
+		last_used = READ_ONCE(cache->cache[slot].last_used);
+		if (last_used > victim_time)
+			continue;
+
+		/* On a tie, the later slot wins. */
+		victim_time = last_used;
+		victim_slot = slot;
+	}
+
+	victim = &cache->cache[victim_slot];
+	if (victim->physical_page != cache->indexable_pages) {
+		/* Unmap the old page, then wait out any zone still searching it. */
+		WRITE_ONCE(cache->index[victim->physical_page], cache->cache_slots);
+		wait_for_pending_searches(cache, victim->physical_page);
+	}
+
+	victim->read_pending = true;
+	clear_cache_page(cache, victim);
+	return victim;
+}
+
+/*
+ * Make a newly filled cache entry available to other threads. Returns UDS_SUCCESS, or the
+ * assertion failure result if the page was not marked read_pending.
+ */
+static int put_page_in_cache(struct page_cache *cache, u32 physical_page,
+			     struct cached_page *page)
+{
+	/* We hold the read_threads_mutex. */
+	int result = VDO_ASSERT((page->read_pending), "page to install has a pending read");
+
+	if (result != VDO_SUCCESS)
+		return result;
+
+	page->physical_page = physical_page;
+	make_page_most_recent(cache, page);
+	page->read_pending = false;
+
+	/*
+	 * Even though we hold the read_threads_mutex, readers that do not hold it must not see
+	 * the index entry before the cached_page contents, so a write memory barrier is needed
+	 * here. The corresponding read memory barrier is in get_page_and_index().
+	 */
+	smp_wmb();
+
+	/* Storing the slot number also clears the queued flag. */
+	WRITE_ONCE(cache->index[physical_page], page - cache->cache);
+	return UDS_SUCCESS;
+}
+
+/*
+ * Abandon the installation of a page: clear the slot that was reserved for it and drop the
+ * index mapping for the physical page. Does nothing if the slot was not marked read_pending.
+ */
+static void cancel_page_in_cache(struct page_cache *cache, u32 physical_page,
+				 struct cached_page *page)
+{
+	/* We hold the read_threads_mutex. */
+	if (VDO_ASSERT((page->read_pending), "page to install has a pending read") != VDO_SUCCESS)
+		return;
+
+	clear_cache_page(cache, page);
+	page->read_pending = false;
+
+	/* Clear the mapping and the queued flag for the new page. */
+	WRITE_ONCE(cache->index[physical_page], cache->cache_slots);
+}
+
+/* Compute the read queue position following the given one, wrapping at the queue size. */
+static inline u16 next_queue_position(u16 position)
+{
+	unsigned int successor = position + 1;
+
+	return successor % VOLUME_CACHE_MAX_QUEUED_READS;
+}
+
+/* Move a read queue position forward by one entry, in place. */
+static inline void advance_queue_position(u16 *position)
+{
+	u16 current_position = *position;
+
+	*position = next_queue_position(current_position);
+}
+
+/* The queue is full when adding one more entry would make the last position reach the first. */
+static inline bool read_queue_is_full(struct page_cache *cache)
+{
+	u16 after_last = next_queue_position(cache->read_queue_last);
+
+	return after_last == cache->read_queue_first;
+}
+
+/*
+ * Attach a request to the read queue entry for a physical page, creating the entry if the page
+ * is not already queued. Returns false, without queuing anything, if a new entry is needed and
+ * the read queue is full.
+ */
+static bool enqueue_read(struct page_cache *cache, struct uds_request *request,
+			 u32 physical_page)
+{
+	struct queued_read *queue_entry;
+	u16 read_queue_index;
+
+	/* We hold the read_threads_mutex. */
+	if ((cache->index[physical_page] & VOLUME_CACHE_QUEUED_FLAG) != 0) {
+		/* It's already queued, so add this request to the existing entry. */
+		read_queue_index = cache->index[physical_page] & ~VOLUME_CACHE_QUEUED_FLAG;
+	} else {
+		/* This page has no existing entry in the queue. */
+		if (read_queue_is_full(cache))
+			return false;
+
+		/* Fill in the read queue entry at the end of the queue. */
+		read_queue_index = cache->read_queue_last;
+		queue_entry = &cache->read_queue[read_queue_index];
+		queue_entry->physical_page = physical_page;
+		queue_entry->invalid = false;
+		queue_entry->first_request = NULL;
+		queue_entry->last_request = NULL;
+
+		/* Point the cache index to the read queue entry. */
+		WRITE_ONCE(cache->index[physical_page],
+			   read_queue_index | VOLUME_CACHE_QUEUED_FLAG);
+
+		advance_queue_position(&cache->read_queue_last);
+	}
+
+	/* Append the request to the entry's list of waiting requests. */
+	queue_entry = &cache->read_queue[read_queue_index];
+	request->next_request = NULL;
+	if (queue_entry->first_request == NULL)
+		queue_entry->first_request = request;
+	else
+		queue_entry->last_request->next_request = request;
+	queue_entry->last_request = request;
+
+	return true;
+}
+
+/*
+ * Queue a request to have a page read, waiting for space in the read queue if necessary, and
+ * then signal the read threads condition.
+ */
+static void enqueue_page_read(struct volume *volume, struct uds_request *request,
+			      u32 physical_page)
+{
+	struct page_cache *cache = &volume->page_cache;
+
+	/* Mark the page as queued, so that chapter invalidation knows to cancel a read. */
+	for (;;) {
+		if (enqueue_read(cache, request, physical_page))
+			break;
+
+		vdo_log_debug("Read queue full, waiting for reads to finish");
+		uds_wait_cond(&volume->read_threads_read_done_cond,
+			      &volume->read_threads_mutex);
+	}
+
+	uds_signal_cond(&volume->read_threads_cond);
+}
+
+/*
+ * Reserve the next read queue entry for processing, but do not actually remove it from the queue.
+ * Must be followed by release_queued_requests(). Returns NULL if there is nothing left to read.
+ */
+static struct queued_read *reserve_read_queue_entry(struct page_cache *cache)
+{
+	/* We hold the read_threads_mutex. */
+	struct queued_read *entry;
+	u16 index_value;
+
+	/* No items to dequeue */
+	if (cache->read_queue_next_read == cache->read_queue_last)
+		return NULL;
+
+	entry = &cache->read_queue[cache->read_queue_next_read];
+	index_value = cache->index[entry->physical_page];
+	if ((index_value & VOLUME_CACHE_QUEUED_FLAG) != 0) {
+		/* It's still queued, so an invalidated entry must have its mapping reset. */
+		if (entry->invalid)
+			WRITE_ONCE(cache->index[entry->physical_page], cache->cache_slots);
+	} else {
+		/*
+		 * If a synchronous read has taken this page, set invalid to true so it doesn't
+		 * get overwritten. Requests will just be requeued.
+		 */
+		entry->invalid = true;
+	}
+
+	entry->reserved = true;
+	advance_queue_position(&cache->read_queue_next_read);
+	return entry;
+}
+
+/*
+ * Reserve the next read queue entry, waiting on the read threads condition until one is
+ * available. Returns NULL once the read threads have been told to exit.
+ */
+static inline struct queued_read *wait_to_reserve_read_queue_entry(struct volume *volume)
+{
+	while (!volume->read_threads_exiting) {
+		struct queued_read *queue_entry;
+
+		queue_entry = reserve_read_queue_entry(&volume->page_cache);
+		if (queue_entry != NULL)
+			return queue_entry;
+
+		uds_wait_cond(&volume->read_threads_cond, &volume->read_threads_mutex);
+	}
+
+	return NULL;
+}
+
+/*
+ * Initialize a chapter index page from raw page data and, except when rebuilding, verify that
+ * its chapter and delta list range agree with the index page map.
+ */
+static int init_chapter_index_page(const struct volume *volume, u8 *index_page,
+				   u32 chapter, u32 index_page_number,
+				   struct delta_index_page *chapter_index_page)
+{
+	struct index_geometry *geometry = volume->geometry;
+	u64 ci_virtual;
+	u32 ci_chapter;
+	u32 lowest_list;
+	u32 highest_list;
+	bool matches_page_map;
+	int result;
+
+	result = uds_initialize_chapter_index_page(chapter_index_page, geometry,
+						   index_page, volume->nonce);
+
+	/* A rebuild takes the result as it is, with no logging and no cross-checking. */
+	if (volume->lookup_mode == LOOKUP_FOR_REBUILD)
+		return result;
+
+	if (result != UDS_SUCCESS) {
+		return vdo_log_error_strerror(result,
+					      "Reading chapter index page for chapter %u page %u",
+					      chapter, index_page_number);
+	}
+
+	uds_get_list_number_bounds(volume->index_page_map, chapter, index_page_number,
+				   &lowest_list, &highest_list);
+	ci_virtual = chapter_index_page->virtual_chapter_number;
+	ci_chapter = uds_map_to_physical_chapter(geometry, ci_virtual);
+	matches_page_map = ((chapter == ci_chapter) &&
+			    (lowest_list == chapter_index_page->lowest_list_number) &&
+			    (highest_list == chapter_index_page->highest_list_number));
+	if (matches_page_map)
+		return UDS_SUCCESS;
+
+	/* The page disagrees with the index page map, so report both views of it. */
+	vdo_log_warning("Index page map updated to %llu",
+			(unsigned long long) volume->index_page_map->last_update);
+	vdo_log_warning("Page map expects that chapter %u page %u has range %u to %u, but chapter index page has chapter %llu with range %u to %u",
+			chapter, index_page_number, lowest_list, highest_list,
+			(unsigned long long) ci_virtual,
+			chapter_index_page->lowest_list_number,
+			chapter_index_page->highest_list_number);
+	return vdo_log_error_strerror(UDS_CORRUPT_DATA,
+				      "index page map mismatch with chapter index");
+}
+
+/* Initialize the chapter index page of a cached page from the data in its buffer. */
+static int initialize_index_page(const struct volume *volume, u32 physical_page,
+				 struct cached_page *page)
+{
+	struct index_geometry *geometry = volume->geometry;
+	u8 *page_data = dm_bufio_get_block_data(page->buffer);
+
+	return init_chapter_index_page(volume, page_data,
+				       map_to_chapter_number(geometry, physical_page),
+				       map_to_page_number(geometry, physical_page),
+				       &page->index_page);
+}
+
+/*
+ * Search a record page for a record name. Returns true if the name is found, in which case the
+ * record's data is also copied to metadata when metadata is not NULL.
+ */
+static bool search_record_page(const u8 record_page[],
+			       const struct uds_record_name *name,
+			       const struct index_geometry *geometry,
+			       struct uds_record_data *metadata)
+{
+	const struct uds_volume_record *records = (const struct uds_volume_record *) record_page;
+	u32 node;
+
+	/*
+	 * The array of records is sorted by name and stored as a binary tree in heap order, so the
+	 * root of the tree is the first array element, and the children of node N are at indexes
+	 * 2N+1 and 2N+2.
+	 */
+	for (node = 0; node < geometry->records_per_page; ) {
+		const struct uds_volume_record *candidate = &records[node];
+		int order = memcmp(name, &candidate->name, UDS_RECORD_NAME_SIZE);
+
+		if (order < 0) {
+			node = (2 * node) + 1;
+			continue;
+		}
+
+		if (order > 0) {
+			node = (2 * node) + 2;
+			continue;
+		}
+
+		if (metadata != NULL)
+			*metadata = candidate->data;
+		return true;
+	}
+
+	return false;
+}
+
+/*
+ * Search a freshly read page on behalf of a queued request. A record page is searched at once,
+ * which speeds up processing by avoiding get_record_from_zone() and ensures that requests make
+ * progress even when queued. For an index page, the record page number is saved so that the index
+ * page does not have to be resolved again. The location, virtual_chapter, and old_metadata fields
+ * in the request allow the index code to know where to begin processing the request again.
+ */
+static int search_page(struct cached_page *page, const struct volume *volume,
+		       struct uds_request *request, u32 physical_page)
+{
+	struct index_geometry *geometry = volume->geometry;
+	enum uds_index_region location;
+
+	if (!is_record_page(geometry, physical_page)) {
+		u16 record_page_number;
+		int result;
+
+		result = uds_search_chapter_index_page(&page->index_page, geometry,
+						       &request->record_name,
+						       &record_page_number);
+		if (result != UDS_SUCCESS)
+			return result;
+
+		if (record_page_number != NO_CHAPTER_INDEX_ENTRY) {
+			/* Stash the record page number in the request's metadata field. */
+			location = UDS_LOCATION_INDEX_PAGE_LOOKUP;
+			*((u16 *) &request->old_metadata) = record_page_number;
+		} else {
+			location = UDS_LOCATION_UNAVAILABLE;
+		}
+	} else if (search_record_page(dm_bufio_get_block_data(page->buffer),
+				      &request->record_name, geometry,
+				      &request->old_metadata)) {
+		location = UDS_LOCATION_RECORD_PAGE_LOOKUP;
+	} else {
+		location = UDS_LOCATION_UNAVAILABLE;
+	}
+
+	request->location = location;
+	request->found = false;
+	return UDS_SUCCESS;
+}
+
+/*
+ * Process a reserved read queue entry: read its page into a cache slot, install the page in the
+ * cache, and search it for each request waiting on the entry. The read_threads_mutex is held on
+ * entry and on return, but is dropped while the page is read from storage.
+ */
+static int process_entry(struct volume *volume, struct queued_read *entry)
+{
+	struct page_cache *cache = &volume->page_cache;
+	u32 page_number = entry->physical_page;
+	struct uds_request *request;
+	struct cached_page *page = NULL;
+	u8 *page_data;
+	int result;
+
+	if (entry->invalid) {
+		vdo_log_debug("Requeuing requests for invalid page");
+		return UDS_SUCCESS;
+	}
+
+	page = select_victim_in_cache(cache);
+
+	mutex_unlock(&volume->read_threads_mutex);
+	page_data = dm_bufio_read(volume->client, page_number, &page->buffer);
+	mutex_lock(&volume->read_threads_mutex);
+	if (IS_ERR(page_data)) {
+		result = -PTR_ERR(page_data);
+		vdo_log_warning_strerror(result,
+					 "error reading physical page %u from volume",
+					 page_number);
+		goto cancel;
+	}
+
+	/* The entry may have been invalidated while the mutex was not held. */
+	if (entry->invalid) {
+		vdo_log_warning("Page %u invalidated after read", page_number);
+		result = UDS_SUCCESS;
+		goto cancel;
+	}
+
+	if (!is_record_page(volume->geometry, page_number)) {
+		result = initialize_index_page(volume, page_number, page);
+		if (result != UDS_SUCCESS) {
+			vdo_log_warning("Error initializing chapter index page");
+			goto cancel;
+		}
+	}
+
+	result = put_page_in_cache(cache, page_number, page);
+	if (result != UDS_SUCCESS) {
+		vdo_log_warning("Error putting page %u in cache", page_number);
+		goto cancel;
+	}
+
+	/* Search the page for each waiting request, stopping at the first failure. */
+	for (request = entry->first_request; request != NULL; request = request->next_request) {
+		result = search_page(page, volume, request, page_number);
+		if (result != UDS_SUCCESS)
+			break;
+	}
+
+	return result;
+
+cancel:
+	cancel_page_in_cache(cache, page_number, page);
+	return result;
+}
+
+/*
+ * Hand every request waiting on a read queue entry back to the index stage with the given
+ * status, release the entry's reservation, and wake any threads waiting for queue space.
+ */
+static void release_queued_requests(struct volume *volume, struct queued_read *entry,
+				    int result)
+{
+	struct page_cache *cache = &volume->page_cache;
+	u16 next_read = cache->read_queue_next_read;
+	struct uds_request *request = entry->first_request;
+
+	while (request != NULL) {
+		/* Fetch the link first, since the request is handed off below. */
+		struct uds_request *next = request->next_request;
+
+		request->status = result;
+		request->requeued = true;
+		uds_enqueue_request(request, STAGE_INDEX);
+		request = next;
+	}
+
+	entry->reserved = false;
+
+	/* Move the read_queue_first pointer as far as we can. */
+	while (cache->read_queue_first != next_read) {
+		if (cache->read_queue[cache->read_queue_first].reserved)
+			break;
+
+		advance_queue_position(&cache->read_queue_first);
+	}
+
+	uds_broadcast_cond(&volume->read_threads_read_done_cond);
+}
+
+/*
+ * The body of a reader thread: repeatedly reserve a read queue entry, process it, and release
+ * its requests, until the read threads are told to exit. The read_threads_mutex is held
+ * throughout, except where the called functions drop it.
+ */
+static void read_thread_function(void *arg)
+{
+	struct volume *volume = arg;
+	struct queued_read *queue_entry;
+	int result;
+
+	vdo_log_debug("reader starting");
+	mutex_lock(&volume->read_threads_mutex);
+	for (;;) {
+		queue_entry = wait_to_reserve_read_queue_entry(volume);
+		if (volume->read_threads_exiting)
+			break;
+
+		result = process_entry(volume, queue_entry);
+		release_queued_requests(volume, queue_entry, result);
+	}
+	mutex_unlock(&volume->read_threads_mutex);
+	vdo_log_debug("reader done");
+}
+
+/*
+ * Look up a physical page in the cache index. On return, *page_ptr is the cached page, or NULL if
+ * the page is not in the cache, and *queue_index is the page's read queue position, or -1 if the
+ * page is not queued to be read.
+ */
+static void get_page_and_index(struct page_cache *cache, u32 physical_page,
+			       int *queue_index, struct cached_page **page_ptr)
+{
+	/*
+	 * ASSERTION: We are either a zone thread holding a search_pending_counter, or we are any
+	 * thread holding the read_threads_mutex.
+	 *
+	 * Holding only a search_pending_counter is the most frequent case.
+	 */
+	/*
+	 * The index entry is read exactly once. It would be unlikely for the compiler to turn the
+	 * usage of index_value into two reads of cache->index, but it would be possible and very
+	 * bad if those reads did not return the same bits.
+	 */
+	u16 index_value = READ_ONCE(cache->index[physical_page]);
+	u16 index = index_value & ~VOLUME_CACHE_QUEUED_FLAG;
+
+	if ((index_value & VOLUME_CACHE_QUEUED_FLAG) != 0) {
+		/* The entry holds a read queue position, not a cache slot. */
+		*page_ptr = NULL;
+		*queue_index = index;
+		return;
+	}
+
+	*queue_index = -1;
+	if (index >= cache->cache_slots) {
+		*page_ptr = NULL;
+		return;
+	}
+
+	*page_ptr = &cache->cache[index];
+	/*
+	 * We have acquired access to the cached page, but unless we hold the read_threads_mutex,
+	 * we need a read memory barrier now. The corresponding write memory barrier is in
+	 * put_page_in_cache().
+	 */
+	smp_rmb();
+}
+
+/* Look up a physical page in the cache, setting *page to NULL if the page is not cached. */
+static void get_page_from_cache(struct page_cache *cache, u32 physical_page,
+				struct cached_page **page)
+{
+	/*
+	 * ASSERTION: We are in a zone thread.
+	 * ASSERTION: We are holding a search_pending_counter or the read_threads_mutex.
+	 */
+	int unused_queue_index = -1;
+
+	/* Only the page is of interest here; the read queue position is discarded. */
+	get_page_and_index(cache, physical_page, &unused_queue_index, page);
+}
+
+/*
+ * Read a physical page into a cache slot and install it in the cache, without going through the
+ * read queue. On success, *page_ptr is set to the cached page. On failure, the cache slot is
+ * cancelled and *page_ptr is left untouched.
+ */
+static int read_page_locked(struct volume *volume, u32 physical_page,
+			    struct cached_page **page_ptr)
+{
+	struct page_cache *cache = &volume->page_cache;
+	struct cached_page *page = NULL;
+	u8 *page_data;
+	int result = UDS_SUCCESS;
+
+	page = select_victim_in_cache(cache);
+	page_data = dm_bufio_read(volume->client, physical_page, &page->buffer);
+	if (IS_ERR(page_data)) {
+		result = -PTR_ERR(page_data);
+		vdo_log_warning_strerror(result,
+					 "error reading physical page %u from volume",
+					 physical_page);
+		goto cancel;
+	}
+
+	if (!is_record_page(volume->geometry, physical_page)) {
+		result = initialize_index_page(volume, physical_page, page);
+		if (result != UDS_SUCCESS) {
+			/* A rebuild does not warn about index pages that fail to initialize. */
+			if (volume->lookup_mode != LOOKUP_FOR_REBUILD)
+				vdo_log_warning("Corrupt index page %u", physical_page);
+			goto cancel;
+		}
+	}
+
+	result = put_page_in_cache(cache, physical_page, page);
+	if (result != UDS_SUCCESS) {
+		vdo_log_warning("Error putting page %u in cache", physical_page);
+		goto cancel;
+	}
+
+	*page_ptr = page;
+	return UDS_SUCCESS;
+
+cancel:
+	cancel_page_in_cache(cache, physical_page, page);
+	return result;
+}
+
+/*
+ * Retrieve a page from the cache while holding the read threads mutex. A page that is not in the
+ * cache is read in immediately; a page that is already cached is marked as most recently used.
+ */
+static int get_volume_page_locked(struct volume *volume, u32 physical_page,
+				  struct cached_page **page_ptr)
+{
+	struct cached_page *page = NULL;
+	int result;
+
+	get_page_from_cache(&volume->page_cache, physical_page, &page);
+	if (page != NULL) {
+		make_page_most_recent(&volume->page_cache, page);
+		*page_ptr = page;
+		return UDS_SUCCESS;
+	}
+
+	result = read_page_locked(volume, physical_page, &page);
+	if (result != UDS_SUCCESS)
+		return result;
+
+	*page_ptr = page;
+	return UDS_SUCCESS;
+}
+
+/*
+ * Retrieve a page from the cache while holding a search_pending lock.
+ *
+ * Returns UDS_SUCCESS with *page_ptr set when the page is in the cache, or UDS_QUEUED after
+ * enqueuing a read for it (in which case *page_ptr is not written). On every return path the
+ * pending search for request->zone_number has been (re)started; the callers in this file end
+ * it themselves.
+ */
+static int get_volume_page_protected(struct volume *volume, struct uds_request *request,
+ u32 physical_page, struct cached_page **page_ptr)
+{
+ struct cached_page *page;
+
+ /* Fast path: the page is already cached and no mutex is needed. */
+ get_page_from_cache(&volume->page_cache, physical_page, &page);
+ if (page != NULL) {
+ if (request->zone_number == 0) {
+ /* Only one zone is allowed to update the LRU. */
+ make_page_most_recent(&volume->page_cache, page);
+ }
+
+ *page_ptr = page;
+ return UDS_SUCCESS;
+ }
+
+ /* Prepare to enqueue a read for the page. */
+ end_pending_search(&volume->page_cache, request->zone_number);
+ mutex_lock(&volume->read_threads_mutex);
+
+ /*
+ * Do the lookup again while holding the read mutex (no longer the fast case so this should
+ * be fine to repeat). We need to do this because a page may have been added to the cache
+ * by a reader thread between the time we searched above and the time we went to actually
+ * try to enqueue it below. This could result in us enqueuing another read for a page which
+ * is already in the cache, which would mean we end up with two entries in the cache for
+ * the same page.
+ */
+ get_page_from_cache(&volume->page_cache, physical_page, &page);
+ if (page == NULL) {
+ enqueue_page_read(volume, request, physical_page);
+ /*
+ * The performance gain from unlocking first, while "search pending" mode is off,
+ * turns out to be significant in some cases. The page is not available yet so
+ * the order does not matter for correctness as it does below.
+ */
+ mutex_unlock(&volume->read_threads_mutex);
+ begin_pending_search(&volume->page_cache, physical_page,
+ request->zone_number);
+ return UDS_QUEUED;
+ }
+
+ /*
+ * Now that the page is loaded, the volume needs to switch to "reader thread unlocked" and
+ * "search pending" state in careful order so no other thread can mess with the data before
+ * the caller gets to look at it.
+ */
+ begin_pending_search(&volume->page_cache, physical_page, request->zone_number);
+ mutex_unlock(&volume->read_threads_mutex);
+ *page_ptr = page;
+ return UDS_SUCCESS;
+}
+
+/*
+ * Translate (chapter, page_number) to a physical page and fetch it through the cache, taking
+ * and dropping the read threads mutex around the lookup.
+ */
+static int get_volume_page(struct volume *volume, u32 chapter, u32 page_number,
+			   struct cached_page **page_ptr)
+{
+	u32 physical_page;
+	int status;
+
+	physical_page = map_to_physical_page(volume->geometry, chapter, page_number);
+
+	mutex_lock(&volume->read_threads_mutex);
+	status = get_volume_page_locked(volume, physical_page, page_ptr);
+	mutex_unlock(&volume->read_threads_mutex);
+
+	return status;
+}
+
+/*
+ * Fetch a page of a chapter and hand back a pointer to its raw block data. *data_ptr is only
+ * written when the page was obtained successfully.
+ */
+int uds_get_volume_record_page(struct volume *volume, u32 chapter, u32 page_number,
+			       u8 **data_ptr)
+{
+	struct cached_page *cached = NULL;
+	int result = get_volume_page(volume, chapter, page_number, &cached);
+
+	if (result != UDS_SUCCESS)
+		return result;
+
+	*data_ptr = dm_bufio_get_block_data(cached->buffer);
+	return result;
+}
+
+/*
+ * Fetch a page of a chapter and hand back the delta index page embedded in its cache entry.
+ * *index_page_ptr is only written when the page was obtained successfully.
+ */
+int uds_get_volume_index_page(struct volume *volume, u32 chapter, u32 page_number,
+			      struct delta_index_page **index_page_ptr)
+{
+	struct cached_page *cached = NULL;
+	int result = get_volume_page(volume, chapter, page_number, &cached);
+
+	if (result != UDS_SUCCESS)
+		return result;
+
+	*index_page_ptr = &cached->index_page;
+	return result;
+}
+
+/*
+ * Find the record page associated with a name in a given index page. Any non-success result
+ * from get_volume_page_protected() (UDS_QUEUED when the page must be read from storage) is
+ * passed straight back to the caller.
+ */
+static int search_cached_index_page(struct volume *volume, struct uds_request *request,
+				    u32 chapter, u32 index_page_number,
+				    u16 *record_page_number)
+{
+	struct page_cache *cache = &volume->page_cache;
+	struct cached_page *index_page = NULL;
+	u32 physical_page;
+	int result;
+
+	physical_page = map_to_physical_page(volume->geometry, chapter, index_page_number);
+
+	/*
+	 * Make sure the invalidate counter is updated before we try and read the mapping. This
+	 * prevents this thread from reading a page in the cache which has already been marked
+	 * for invalidation by the reader thread, before the reader thread has noticed that the
+	 * invalidate_counter has been incremented.
+	 */
+	begin_pending_search(cache, physical_page, request->zone_number);
+
+	result = get_volume_page_protected(volume, request, physical_page, &index_page);
+	if (result == UDS_SUCCESS) {
+		result = uds_search_chapter_index_page(&index_page->index_page,
+						       volume->geometry,
+						       &request->record_name,
+						       record_page_number);
+	}
+
+	/* The pending search is ended on both the success and the failure path. */
+	end_pending_search(cache, request->zone_number);
+	return result;
+}
+
+/*
+ * Find the metadata associated with a name in a given record page. *found is set to false up
+ * front and becomes true only when search_record_page() reports a match. Any non-success
+ * result from get_volume_page_protected() (UDS_QUEUED when the page must be read from
+ * storage) is returned unchanged.
+ */
+int uds_search_cached_record_page(struct volume *volume, struct uds_request *request,
+				  u32 chapter, u16 record_page_number, bool *found)
+{
+	struct index_geometry *geometry = volume->geometry;
+	struct page_cache *cache = &volume->page_cache;
+	struct cached_page *record_page;
+	u32 page_number;
+	u32 physical_page;
+	int result;
+
+	*found = false;
+	if (record_page_number == NO_CHAPTER_INDEX_ENTRY)
+		return UDS_SUCCESS;
+
+	result = VDO_ASSERT(record_page_number < geometry->record_pages_per_chapter,
+			    "0 <= %d < %u", record_page_number,
+			    geometry->record_pages_per_chapter);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	/* Record pages follow the index pages within a chapter. */
+	page_number = geometry->index_pages_per_chapter + record_page_number;
+	physical_page = map_to_physical_page(volume->geometry, chapter, page_number);
+
+	/*
+	 * Make sure the invalidate counter is updated before we try and read the mapping. This
+	 * prevents this thread from reading a page in the cache which has already been marked
+	 * for invalidation by the reader thread, before the reader thread has noticed that the
+	 * invalidate_counter has been incremented.
+	 */
+	begin_pending_search(cache, physical_page, request->zone_number);
+
+	result = get_volume_page_protected(volume, request, physical_page, &record_page);
+	if (result == UDS_SUCCESS) {
+		if (search_record_page(dm_bufio_get_block_data(record_page->buffer),
+				       &request->record_name, geometry,
+				       &request->old_metadata))
+			*found = true;
+	}
+
+	end_pending_search(cache, request->zone_number);
+	return result;
+}
+
+/* Ask dm-bufio to prefetch every page of a chapter, starting at the chapter's page 0. */
+void uds_prefetch_volume_chapter(const struct volume *volume, u32 chapter)
+{
+	const struct index_geometry *geometry = volume->geometry;
+	u32 first_page;
+
+	first_page = map_to_physical_page(geometry, chapter, 0);
+	dm_bufio_prefetch(volume->client, first_page, geometry->pages_per_chapter);
+}
+
+/*
+ * Read all index pages of a virtual chapter straight from storage (not through the page cache)
+ * and initialize a delta index page for each one.
+ *
+ * volume_buffers[i] receives the dm_buffer for index page i and index_pages[i] its initialized
+ * delta index page; both arrays need index_pages_per_chapter entries. On failure the error is
+ * returned immediately and buffers obtained for earlier pages are NOT released here, so the
+ * caller presumably remains responsible for them -- verify against the caller.
+ */
+int uds_read_chapter_index_from_volume(const struct volume *volume, u64 virtual_chapter,
+				       struct dm_buffer *volume_buffers[],
+				       struct delta_index_page index_pages[])
+{
+	int result;
+	u32 i;
+	const struct index_geometry *geometry = volume->geometry;
+	u32 physical_chapter = uds_map_to_physical_chapter(geometry, virtual_chapter);
+	u32 physical_page = map_to_physical_page(geometry, physical_chapter, 0);
+
+	dm_bufio_prefetch(volume->client, physical_page, geometry->index_pages_per_chapter);
+	for (i = 0; i < geometry->index_pages_per_chapter; i++) {
+		u32 page_to_read = physical_page + i;
+		u8 *index_page;
+
+		index_page = dm_bufio_read(volume->client, page_to_read,
+					   &volume_buffers[i]);
+		if (IS_ERR(index_page)) {
+			result = -PTR_ERR(index_page);
+			/* Report the page that actually failed, not the chapter's first page. */
+			vdo_log_warning_strerror(result,
+						 "error reading physical page %u",
+						 page_to_read);
+			return result;
+		}
+
+		result = init_chapter_index_page(volume, index_page, physical_chapter, i,
+						 &index_pages[i]);
+		if (result != UDS_SUCCESS)
+			return result;
+	}
+
+	return UDS_SUCCESS;
+}
+
+/*
+ * Search the page cache for a request's record. Unless the request is already at the
+ * UDS_LOCATION_INDEX_PAGE_LOOKUP stage (in which case the record page number is taken from
+ * the start of request->old_metadata), the chapter index page is searched first; the record
+ * page is then searched via uds_search_cached_record_page().
+ */
+int uds_search_volume_page_cache(struct volume *volume, struct uds_request *request,
+				 bool *found)
+{
+	u32 physical_chapter;
+	u32 index_page_number;
+	u16 record_page_number;
+
+	physical_chapter = uds_map_to_physical_chapter(volume->geometry,
+						       request->virtual_chapter);
+	index_page_number = uds_find_index_page_number(volume->index_page_map,
+						       &request->record_name,
+						       physical_chapter);
+
+	if (request->location != UDS_LOCATION_INDEX_PAGE_LOOKUP) {
+		int result = search_cached_index_page(volume, request, physical_chapter,
+						      index_page_number,
+						      &record_page_number);
+
+		if (result != UDS_SUCCESS)
+			return result;
+	} else {
+		record_page_number = *((u16 *) &request->old_metadata);
+	}
+
+	return uds_search_cached_record_page(volume, request, physical_chapter,
+					     record_page_number, found);
+}
+
+/*
+ * Rebuild-time lookup of a record name in one virtual chapter. Both the index page and the
+ * record page are fetched with get_volume_page(), which takes the read threads mutex. *found
+ * starts out false and reports whether the name is present in the record page the chapter
+ * index points at.
+ */
+int uds_search_volume_page_cache_for_rebuild(struct volume *volume,
+					     const struct uds_record_name *name,
+					     u64 virtual_chapter, bool *found)
+{
+	struct index_geometry *geometry = volume->geometry;
+	u32 physical_chapter = uds_map_to_physical_chapter(geometry, virtual_chapter);
+	struct cached_page *page;
+	u16 record_page_number;
+	u32 index_page_number;
+	u32 page_number;
+	int result;
+
+	*found = false;
+
+	/* Step 1: consult the chapter index page covering this name. */
+	index_page_number = uds_find_index_page_number(volume->index_page_map, name,
+						       physical_chapter);
+	result = get_volume_page(volume, physical_chapter, index_page_number, &page);
+	if (result != UDS_SUCCESS)
+		return result;
+
+	result = uds_search_chapter_index_page(&page->index_page, geometry, name,
+					       &record_page_number);
+	if ((result != UDS_SUCCESS) || (record_page_number == NO_CHAPTER_INDEX_ENTRY))
+		return result;
+
+	/* Step 2: search the record page the index entry points at. */
+	page_number = geometry->index_pages_per_chapter + record_page_number;
+	result = get_volume_page(volume, physical_chapter, page_number, &page);
+	if (result != UDS_SUCCESS)
+		return result;
+
+	*found = search_record_page(dm_bufio_get_block_data(page->buffer), name,
+				    geometry, NULL);
+	return UDS_SUCCESS;
+}
+
+/*
+ * Drop a physical page from the cache, or flag its queued read as invalid if it is not cached
+ * but a read queue entry was found for it. The caller holds the read_threads_mutex.
+ */
+static void invalidate_page(struct page_cache *cache, u32 physical_page)
+{
+	struct cached_page *cached;
+	int read_index = -1;
+
+	get_page_and_index(cache, physical_page, &read_index, &cached);
+	if (cached != NULL) {
+		/* Unmap the page first, then wait for pending searches before clearing it. */
+		WRITE_ONCE(cache->index[cached->physical_page], cache->cache_slots);
+		wait_for_pending_searches(cache, cached->physical_page);
+		clear_cache_page(cache, cached);
+		return;
+	}
+
+	if (read_index > -1) {
+		vdo_log_debug("setting pending read to invalid");
+		cache->read_queue[read_index].invalid = true;
+	}
+}
+
+/* Invalidate every page of a virtual chapter in the page cache, under the read threads mutex. */
+void uds_forget_chapter(struct volume *volume, u64 virtual_chapter)
+{
+	u32 physical_chapter;
+	u32 first_page;
+	u32 offset = 0;
+
+	physical_chapter = uds_map_to_physical_chapter(volume->geometry, virtual_chapter);
+	first_page = map_to_physical_page(volume->geometry, physical_chapter, 0);
+
+	vdo_log_debug("forgetting chapter %llu", (unsigned long long) virtual_chapter);
+
+	mutex_lock(&volume->read_threads_mutex);
+	while (offset < volume->geometry->pages_per_chapter) {
+		invalidate_page(&volume->page_cache, first_page + offset);
+		offset++;
+	}
+	mutex_unlock(&volume->read_threads_mutex);
+}
+
+/*
+ * Donate an index page from a newly written chapter to the page cache since it is likely to be
+ * used again soon. The caller must already hold the reader thread mutex.
+ *
+ * On success the cache slot chosen by select_victim_in_cache() holds page_buffer and has been
+ * published with put_page_in_cache(). On failure the slot is handed to cancel_page_in_cache()
+ * and the error is returned.
+ *
+ * NOTE(review): page->buffer is set to page_buffer before either failure path runs, and the
+ * caller (write_index_pages) calls dm_bufio_release(page_buffer) when this returns an error.
+ * If cancel_page_in_cache() also releases page->buffer, the buffer would be released twice --
+ * confirm against cancel_page_in_cache(), which is not visible here.
+ */
+static int donate_index_page_locked(struct volume *volume, u32 physical_chapter,
+ u32 index_page_number, struct dm_buffer *page_buffer)
+{
+ int result;
+ struct cached_page *page = NULL;
+ u32 physical_page =
+ map_to_physical_page(volume->geometry, physical_chapter,
+ index_page_number);
+
+ page = select_victim_in_cache(&volume->page_cache);
+ /* The cache slot takes over the caller's buffer from here on. */
+ page->buffer = page_buffer;
+ result = init_chapter_index_page(volume, dm_bufio_get_block_data(page_buffer),
+ physical_chapter, index_page_number,
+ &page->index_page);
+ if (result != UDS_SUCCESS) {
+ vdo_log_warning("Error initialize chapter index page");
+ cancel_page_in_cache(&volume->page_cache, physical_page, page);
+ return result;
+ }
+
+ result = put_page_in_cache(&volume->page_cache, physical_page, page);
+ if (result != UDS_SUCCESS) {
+ vdo_log_warning("Error putting page %u in cache", physical_page);
+ cancel_page_in_cache(&volume->page_cache, physical_page, page);
+ return result;
+ }
+
+ return UDS_SUCCESS;
+}
+
+/*
+ * Pack the open chapter index into the chapter's index pages. For each index page this gets a
+ * new dm-bufio buffer, packs as many delta lists as fit, marks the buffer dirty, records the
+ * page's highest delta list in the index page map, and donates the buffer to the page cache.
+ *
+ * The buffer is released here only on the failure paths; after a successful donation it is
+ * left attached to the cache slot (see page->buffer in donate_index_page_locked()).
+ */
+static int write_index_pages(struct volume *volume, u32 physical_chapter_number,
+ struct open_chapter_index *chapter_index)
+{
+ struct index_geometry *geometry = volume->geometry;
+ struct dm_buffer *page_buffer;
+ u32 first_index_page = map_to_physical_page(geometry, physical_chapter_number, 0);
+ /* The first delta list not yet packed into a page. */
+ u32 delta_list_number = 0;
+ u32 index_page_number;
+
+ for (index_page_number = 0;
+ index_page_number < geometry->index_pages_per_chapter;
+ index_page_number++) {
+ u8 *page_data;
+ u32 physical_page = first_index_page + index_page_number;
+ u32 lists_packed;
+ bool last_page;
+ int result;
+
+ page_data = dm_bufio_new(volume->client, physical_page, &page_buffer);
+ if (IS_ERR(page_data)) {
+ return vdo_log_warning_strerror(-PTR_ERR(page_data),
+ "failed to prepare index page");
+ }
+
+ last_page = ((index_page_number + 1) == geometry->index_pages_per_chapter);
+ result = uds_pack_open_chapter_index_page(chapter_index, page_data,
+ delta_list_number, last_page,
+ &lists_packed);
+ if (result != UDS_SUCCESS) {
+ dm_bufio_release(page_buffer);
+ return vdo_log_warning_strerror(result,
+ "failed to pack index page");
+ }
+
+ dm_bufio_mark_buffer_dirty(page_buffer);
+
+ if (lists_packed == 0) {
+ vdo_log_debug("no delta lists packed on chapter %u page %u",
+ physical_chapter_number, index_page_number);
+ } else {
+ delta_list_number += lists_packed;
+ }
+
+ /*
+ * NOTE(review): if no delta list has been packed so far (delta_list_number is still
+ * 0), the unsigned expression delta_list_number - 1 wraps to the maximum u32 value --
+ * confirm that uds_update_index_page_map() expects or tolerates that.
+ */
+ uds_update_index_page_map(volume->index_page_map,
+ chapter_index->virtual_chapter_number,
+ physical_chapter_number, index_page_number,
+ delta_list_number - 1);
+
+ mutex_lock(&volume->read_threads_mutex);
+ result = donate_index_page_locked(volume, physical_chapter_number,
+ index_page_number, page_buffer);
+ mutex_unlock(&volume->read_threads_mutex);
+ if (result != UDS_SUCCESS) {
+ dm_bufio_release(page_buffer);
+ return result;
+ }
+ }
+
+ return UDS_SUCCESS;
+}
+
+/*
+ * Lay sorted records out as an implicit binary tree inside record_page: the children of node
+ * n live at 2n + 1 and 2n + 2. An in-order walk of that tree consumes sorted_pointers in
+ * order, starting at next_record. Returns the index of the first record not yet consumed.
+ */
+static u32 encode_tree(u8 record_page[],
+		       const struct uds_volume_record *sorted_pointers[],
+		       u32 next_record, u32 node, u32 node_count)
+{
+	u32 left_child;
+
+	/* Past the end of the implicit tree: nothing to emit. */
+	if (node >= node_count)
+		return next_record;
+
+	left_child = (2 * node) + 1;
+
+	/* Left subtree first ... */
+	next_record = encode_tree(record_page, sorted_pointers, next_record,
+				  left_child, node_count);
+
+	/* ... then this node takes the next record in sorted order ... */
+	memcpy(&record_page[node * BYTES_PER_RECORD], sorted_pointers[next_record],
+	       BYTES_PER_RECORD);
+	next_record++;
+
+	/* ... and finally the right subtree. */
+	return encode_tree(record_page, sorted_pointers, next_record,
+			   left_child + 1, node_count);
+}
+
+/*
+ * Encode one page worth of records (records_per_page of them, starting at records[0]) into
+ * record_page. The records are sorted by name through the volume's scratch pointer array and
+ * radix sorter, then written out in the tree layout produced by encode_tree().
+ */
+static int encode_record_page(const struct volume *volume,
+			      const struct uds_volume_record records[], u8 record_page[])
+{
+	const struct uds_volume_record **pointers = volume->record_pointers;
+	u32 record_count = volume->geometry->records_per_page;
+	u32 n;
+	int result;
+
+	for (n = 0; n < record_count; n++)
+		pointers[n] = records + n;
+
+	/*
+	 * Sorting pointers by the leading name bytes is cheaper than sorting whole records; it
+	 * relies on the name being the first member of the record.
+	 */
+	BUILD_BUG_ON(offsetof(struct uds_volume_record, name) != 0);
+	result = uds_radix_sort(volume->radix_sorter, (const u8 **) pointers,
+				record_count, UDS_RECORD_NAME_SIZE);
+	if (result != UDS_SUCCESS)
+		return result;
+
+	encode_tree(record_page, pointers, 0, 0, record_count);
+	return UDS_SUCCESS;
+}
+
+/*
+ * Encode and queue every record page of a chapter. Each page gets a new dm-bufio buffer that
+ * is filled by encode_record_page(), marked dirty and released; records is consumed
+ * records_per_page entries at a time.
+ */
+static int write_record_pages(struct volume *volume, u32 physical_chapter_number,
+			      const struct uds_volume_record *records)
+{
+	struct index_geometry *geometry = volume->geometry;
+	const struct uds_volume_record *page_records = records;
+	u32 first_record_page = map_to_physical_page(geometry, physical_chapter_number,
+						     geometry->index_pages_per_chapter);
+	u32 page;
+
+	for (page = 0; page < geometry->record_pages_per_chapter; page++) {
+		struct dm_buffer *buffer;
+		u8 *data;
+		int result;
+
+		data = dm_bufio_new(volume->client, first_record_page + page, &buffer);
+		if (IS_ERR(data)) {
+			return vdo_log_warning_strerror(-PTR_ERR(data),
+							"failed to prepare record page");
+		}
+
+		result = encode_record_page(volume, page_records, data);
+		if (result != UDS_SUCCESS) {
+			dm_bufio_release(buffer);
+			return vdo_log_warning_strerror(result,
+							"failed to encode record page %u",
+							page);
+		}
+
+		page_records += geometry->records_per_page;
+		dm_bufio_mark_buffer_dirty(buffer);
+		dm_bufio_release(buffer);
+	}
+
+	return UDS_SUCCESS;
+}
+
+/*
+ * Write a closed chapter: index pages first, then record pages, then flush the dirty dm-bufio
+ * buffers. The first failing step's error is returned; a failed flush is also logged.
+ */
+int uds_write_chapter(struct volume *volume, struct open_chapter_index *chapter_index,
+		      const struct uds_volume_record *records)
+{
+	u32 physical_chapter;
+	int result;
+
+	physical_chapter = uds_map_to_physical_chapter(volume->geometry,
+						       chapter_index->virtual_chapter_number);
+
+	result = write_index_pages(volume, physical_chapter, chapter_index);
+	if (result == UDS_SUCCESS)
+		result = write_record_pages(volume, physical_chapter, records);
+	if (result != UDS_SUCCESS)
+		return result;
+
+	result = -dm_bufio_write_dirty_buffers(volume->client);
+	if (result != UDS_SUCCESS)
+		vdo_log_error_strerror(result, "cannot sync chapter to volume");
+
+	return result;
+}
+
+/*
+ * Check whether a physical chapter holds a consistent chapter index and, if so, report its
+ * virtual chapter number. *virtual_chapter_number is BAD_CHAPTER unless every index page can
+ * be read, all pages agree on the virtual chapter number, their delta list ranges are
+ * contiguous starting at 0, each page validates, and the virtual chapter maps back to this
+ * physical chapter.
+ */
+static void probe_chapter(struct volume *volume, u32 chapter_number,
+			  u64 *virtual_chapter_number)
+{
+	const struct index_geometry *geometry = volume->geometry;
+	u32 next_list_number = 0;
+	u64 chapter_vcn = BAD_CHAPTER;
+	u32 page_index;
+
+	*virtual_chapter_number = BAD_CHAPTER;
+	dm_bufio_prefetch(volume->client,
+			  map_to_physical_page(geometry, chapter_number, 0),
+			  geometry->index_pages_per_chapter);
+
+	for (page_index = 0; page_index < geometry->index_pages_per_chapter; page_index++) {
+		struct delta_index_page *page;
+
+		if (uds_get_volume_index_page(volume, chapter_number, page_index,
+					      &page) != UDS_SUCCESS)
+			return;
+
+		if (page->virtual_chapter_number == BAD_CHAPTER) {
+			vdo_log_error("corrupt index page in chapter %u",
+				      chapter_number);
+			return;
+		}
+
+		/* The first good page defines the chapter's vcn; the rest must match it. */
+		if (chapter_vcn == BAD_CHAPTER)
+			chapter_vcn = page->virtual_chapter_number;
+
+		if (page->virtual_chapter_number != chapter_vcn) {
+			vdo_log_error("inconsistent chapter %u index page %u: expected vcn %llu, got vcn %llu",
+				      chapter_number, page_index,
+				      (unsigned long long) chapter_vcn,
+				      (unsigned long long) page->virtual_chapter_number);
+			return;
+		}
+
+		if (page->lowest_list_number != next_list_number) {
+			vdo_log_error("inconsistent chapter %u index page %u: expected list number %u, got list number %u",
+				      chapter_number, page_index, next_list_number,
+				      page->lowest_list_number);
+			return;
+		}
+		next_list_number = page->highest_list_number + 1;
+
+		if (uds_validate_chapter_index_page(page, geometry) != UDS_SUCCESS)
+			return;
+	}
+
+	if (uds_map_to_physical_chapter(geometry, chapter_vcn) != chapter_number) {
+		vdo_log_error("chapter %u vcn %llu is out of phase (%u)", chapter_number,
+			      (unsigned long long) chapter_vcn,
+			      geometry->chapters_per_volume);
+		return;
+	}
+
+	*virtual_chapter_number = chapter_vcn;
+}
+
+/*
+ * Find the last valid physical chapter in the volume. Starting from limit, chapters just
+ * below the current limit are probed: a bad chapter lowers the limit to it (and, from the
+ * second consecutive miss on, doubles the stride), while a good chapter halves the stride
+ * until a good chapter is found at stride 1. The resulting limit is stored in *limit_ptr.
+ */
+static void find_real_end_of_volume(struct volume *volume, u32 limit, u32 *limit_ptr)
+{
+	u32 stride = 1;
+	u32 misses = 0;
+
+	while (limit > 0) {
+		u64 vcn = 0;
+		u32 chapter;
+
+		if (stride > limit)
+			chapter = 0;
+		else
+			chapter = limit - stride;
+
+		probe_chapter(volume, chapter, &vcn);
+		if (vcn != BAD_CHAPTER) {
+			if (stride == 1)
+				break;
+
+			stride /= 2;
+			misses = 0;
+			continue;
+		}
+
+		limit = chapter;
+		misses++;
+		if (misses > 1)
+			stride *= 2;
+	}
+
+	*limit_ptr = limit;
+}
+
+/*
+ * Determine the lowest and highest virtual chapter numbers stored in physical chapters
+ * [0, chapter_limit). Returns UDS_SUCCESS with both outputs set, or UDS_CORRUPT_DATA when the
+ * backward scan for the highest chapter makes more than MAX_BAD_CHAPTERS probes.
+ */
+static int find_chapter_limits(struct volume *volume, u32 chapter_limit, u64 *lowest_vcn,
+ u64 *highest_vcn)
+{
+ struct index_geometry *geometry = volume->geometry;
+ u64 zero_vcn;
+ u64 lowest = BAD_CHAPTER;
+ u64 highest = BAD_CHAPTER;
+ u64 moved_chapter = BAD_CHAPTER;
+ u32 left_chapter = 0;
+ u32 right_chapter = 0;
+ u32 bad_chapters = 0;
+
+ /*
+ * This method assumes there is at most one run of contiguous bad chapters caused by
+ * unflushed writes. Either the bad spot is at the beginning and end, or somewhere in the
+ * middle. Wherever it is, the highest and lowest VCNs are adjacent to it. Otherwise the
+ * volume is cleanly saved and somewhere in the middle of it the highest VCN immediately
+ * precedes the lowest one.
+ */
+
+ /* It doesn't matter if this results in a bad spot (BAD_CHAPTER). */
+ probe_chapter(volume, 0, &zero_vcn);
+
+ /*
+ * Binary search for end of the discontinuity in the monotonically increasing virtual
+ * chapter numbers; bad spots are treated as a span of BAD_CHAPTER values. In effect we're
+ * searching for the index of the smallest value less than zero_vcn. In the case we go off
+ * the end it means that chapter 0 has the lowest vcn.
+ *
+ * If a virtual chapter is out-of-order, it will be the one moved by conversion. Always
+ * skip over the moved chapter when searching, adding it to the range at the end if
+ * necessary.
+ */
+ if (geometry->remapped_physical > 0) {
+ u64 remapped_vcn;
+
+ /* Only treat the chapter as moved if it really holds the remapped virtual chapter. */
+ probe_chapter(volume, geometry->remapped_physical, &remapped_vcn);
+ if (remapped_vcn == geometry->remapped_virtual)
+ moved_chapter = geometry->remapped_physical;
+ }
+
+ left_chapter = 0;
+ right_chapter = chapter_limit;
+
+ while (left_chapter < right_chapter) {
+ u64 probe_vcn;
+ u32 chapter = (left_chapter + right_chapter) / 2;
+
+ /* moved_chapter is only ever set from a remapped_physical greater than 0. */
+ if (chapter == moved_chapter)
+ chapter--;
+
+ probe_chapter(volume, chapter, &probe_vcn);
+ if (zero_vcn <= probe_vcn) {
+ left_chapter = chapter + 1;
+ if (left_chapter == moved_chapter)
+ left_chapter++;
+ } else {
+ right_chapter = chapter;
+ }
+ }
+
+ /* If left_chapter goes off the end, chapter 0 has the lowest virtual chapter number. */
+ if (left_chapter >= chapter_limit)
+ left_chapter = 0;
+
+ /* At this point, left_chapter is the chapter with the lowest virtual chapter number. */
+ probe_chapter(volume, left_chapter, &lowest);
+
+ /* The moved chapter might be the lowest in the range. */
+ if ((moved_chapter != BAD_CHAPTER) && (lowest == geometry->remapped_virtual + 1))
+ lowest = geometry->remapped_virtual;
+
+ /*
+ * Circularly scan backwards, moving over any bad chapters until encountering a good one,
+ * which is the chapter with the highest vcn.
+ */
+ while (highest == BAD_CHAPTER) {
+ right_chapter = (right_chapter + chapter_limit - 1) % chapter_limit;
+ if (right_chapter == moved_chapter)
+ continue;
+
+ probe_chapter(volume, right_chapter, &highest);
+ /* Every probe counts toward the limit, including the one that succeeds. */
+ if (bad_chapters++ >= MAX_BAD_CHAPTERS) {
+ vdo_log_error("too many bad chapters in volume: %u",
+ bad_chapters);
+ return UDS_CORRUPT_DATA;
+ }
+ }
+
+ *lowest_vcn = lowest;
+ *highest_vcn = highest;
+ return UDS_SUCCESS;
+}
+
+/*
+ * Find the highest and lowest contiguous chapters present in the volume and determine their
+ * virtual chapter numbers. This is used by rebuild. When no valid chapter exists, *is_empty is
+ * set and both virtual chapter numbers are reported as 0.
+ */
+int uds_find_volume_chapter_boundaries(struct volume *volume, u64 *lowest_vcn,
+				       u64 *highest_vcn, bool *is_empty)
+{
+	u32 chapter_limit = volume->geometry->chapters_per_volume;
+
+	/* Trim the limit down to the last chapter that actually probes as valid. */
+	find_real_end_of_volume(volume, chapter_limit, &chapter_limit);
+	if (chapter_limit > 0) {
+		*is_empty = false;
+		return find_chapter_limits(volume, chapter_limit, lowest_vcn, highest_vcn);
+	}
+
+	*lowest_vcn = 0;
+	*highest_vcn = 0;
+	*is_empty = true;
+	return UDS_SUCCESS;
+}
+
+/*
+ * Switch the volume to a new backing block device: repoint the index layout, drop every
+ * cached page (and the sparse cache contents, if any) so no dm-bufio buffers remain
+ * outstanding, destroy the old dm-bufio client and open a new one on the new storage.
+ */
+int __must_check uds_replace_volume_storage(struct volume *volume,
+					    struct index_layout *layout,
+					    struct block_device *bdev)
+{
+	struct page_cache *cache = &volume->page_cache;
+	u32 page;
+	u32 slot;
+	int result = uds_replace_index_layout_storage(layout, bdev);
+
+	if (result != UDS_SUCCESS)
+		return result;
+
+	/* Release all outstanding dm_bufio objects */
+	for (page = 0; page < cache->indexable_pages; page++)
+		cache->index[page] = cache->cache_slots;
+
+	for (slot = 0; slot < cache->cache_slots; slot++)
+		clear_cache_page(cache, &cache->cache[slot]);
+
+	if (volume->sparse_cache != NULL)
+		uds_invalidate_sparse_cache(volume->sparse_cache);
+
+	if (volume->client != NULL)
+		dm_bufio_client_destroy(vdo_forget(volume->client));
+
+	return uds_open_volume_bufio(layout, volume->geometry->bytes_per_page,
+				     volume->reserved_buffers, &volume->client);
+}
+
+/*
+ * Size and allocate the page cache: the read queue, the per-zone search pending counters, the
+ * physical-page-to-slot index and the cache slots themselves. Every index entry starts out as
+ * cache_slots (the "not cached" marker) and every slot is cleared.
+ *
+ * Returns UDS_SUCCESS, the VDO_ASSERT() failure code if the requested cache is larger than
+ * VOLUME_CACHE_MAX_ENTRIES, or an allocation error. Partially allocated members are not freed
+ * here; uds_make_volume() calls uds_free_volume() on failure, which frees them.
+ */
+static int __must_check initialize_page_cache(struct page_cache *cache,
+					      const struct index_geometry *geometry,
+					      u32 chapters_in_cache,
+					      unsigned int zone_count)
+{
+	int result;
+	u32 i;
+	u64 slot_count = (u64) chapters_in_cache * geometry->record_pages_per_chapter;
+
+	cache->indexable_pages = geometry->pages_per_volume + 1;
+	cache->zone_count = zone_count;
+	atomic64_set(&cache->clock, 1);
+
+	/*
+	 * Validate the slot count at full width. cache_slots is a u16, so checking it after the
+	 * assignment would only ever see the truncated value and could accept a request that
+	 * is actually far over the limit.
+	 */
+	result = VDO_ASSERT((slot_count <= VOLUME_CACHE_MAX_ENTRIES),
+			    "requested cache size, %llu, within limit %u",
+			    (unsigned long long) slot_count, VOLUME_CACHE_MAX_ENTRIES);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	cache->cache_slots = (u16) slot_count;
+
+	result = vdo_allocate(VOLUME_CACHE_MAX_QUEUED_READS, struct queued_read,
+			      "volume read queue", &cache->read_queue);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	result = vdo_allocate(cache->zone_count, struct search_pending_counter,
+			      "Volume Cache Zones", &cache->search_pending_counters);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	result = vdo_allocate(cache->indexable_pages, u16, "page cache index",
+			      &cache->index);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	result = vdo_allocate(cache->cache_slots, struct cached_page, "page cache cache",
+			      &cache->cache);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	/* Initialize index values to invalid values. */
+	for (i = 0; i < cache->indexable_pages; i++)
+		cache->index[i] = cache->cache_slots;
+
+	for (i = 0; i < cache->cache_slots; i++)
+		clear_cache_page(cache, &cache->cache[i]);
+
+	return UDS_SUCCESS;
+}
+
+/*
+ * Allocate and fully initialize a volume: geometry copy, dm-bufio client, radix sorter, record
+ * pointer scratch array, optional sparse cache, page cache, index page map, synchronization
+ * objects and reader threads.
+ *
+ * On success *new_volume receives the volume. On any failure after the initial allocation the
+ * partially built volume is torn down with uds_free_volume() and the error is returned;
+ * *new_volume is not written.
+ */
+int uds_make_volume(const struct uds_configuration *config, struct index_layout *layout,
+ struct volume **new_volume)
+{
+ unsigned int i;
+ struct volume *volume = NULL;
+ struct index_geometry *geometry;
+ unsigned int reserved_buffers;
+ int result;
+
+ result = vdo_allocate(1, struct volume, "volume", &volume);
+ if (result != VDO_SUCCESS)
+ return result;
+
+ volume->nonce = uds_get_volume_nonce(layout);
+
+ result = uds_copy_index_geometry(config->geometry, &volume->geometry);
+ if (result != UDS_SUCCESS) {
+ uds_free_volume(volume);
+ return vdo_log_warning_strerror(result,
+ "failed to allocate geometry: error");
+ }
+ geometry = volume->geometry;
+
+ /*
+ * Reserve a buffer for each entry in the page cache, one for the chapter writer, and one
+ * for each entry in the sparse cache.
+ */
+ reserved_buffers = config->cache_chapters * geometry->record_pages_per_chapter;
+ reserved_buffers += 1;
+ if (uds_is_sparse_index_geometry(geometry))
+ reserved_buffers += (config->cache_chapters * geometry->index_pages_per_chapter);
+ volume->reserved_buffers = reserved_buffers;
+ result = uds_open_volume_bufio(layout, geometry->bytes_per_page,
+ volume->reserved_buffers, &volume->client);
+ if (result != UDS_SUCCESS) {
+ uds_free_volume(volume);
+ return result;
+ }
+
+ result = uds_make_radix_sorter(geometry->records_per_page,
+ &volume->radix_sorter);
+ if (result != UDS_SUCCESS) {
+ uds_free_volume(volume);
+ return result;
+ }
+
+ /* Scratch array used by encode_record_page() to sort one page of records. */
+ result = vdo_allocate(geometry->records_per_page,
+ const struct uds_volume_record *, "record pointers",
+ &volume->record_pointers);
+ if (result != VDO_SUCCESS) {
+ uds_free_volume(volume);
+ return result;
+ }
+
+ if (uds_is_sparse_index_geometry(geometry)) {
+ size_t page_size = sizeof(struct delta_index_page) + geometry->bytes_per_page;
+
+ result = uds_make_sparse_cache(geometry, config->cache_chapters,
+ config->zone_count,
+ &volume->sparse_cache);
+ if (result != UDS_SUCCESS) {
+ uds_free_volume(volume);
+ return result;
+ }
+
+ volume->cache_size =
+ page_size * geometry->index_pages_per_chapter * config->cache_chapters;
+ }
+
+ result = initialize_page_cache(&volume->page_cache, geometry,
+ config->cache_chapters, config->zone_count);
+ if (result != UDS_SUCCESS) {
+ uds_free_volume(volume);
+ return result;
+ }
+
+ volume->cache_size += volume->page_cache.cache_slots * sizeof(struct delta_index_page);
+ result = uds_make_index_page_map(geometry, &volume->index_page_map);
+ if (result != UDS_SUCCESS) {
+ uds_free_volume(volume);
+ return result;
+ }
+
+ /*
+ * The mutex and condition variables are initialized before reader_threads is allocated;
+ * uds_free_volume() only takes the mutex when reader_threads is non-NULL.
+ */
+ mutex_init(&volume->read_threads_mutex);
+ uds_init_cond(&volume->read_threads_read_done_cond);
+ uds_init_cond(&volume->read_threads_cond);
+
+ result = vdo_allocate(config->read_threads, struct thread *, "reader threads",
+ &volume->reader_threads);
+ if (result != VDO_SUCCESS) {
+ uds_free_volume(volume);
+ return result;
+ }
+
+ for (i = 0; i < config->read_threads; i++) {
+ result = vdo_create_thread(read_thread_function, (void *) volume,
+ "reader", &volume->reader_threads[i]);
+ if (result != VDO_SUCCESS) {
+ uds_free_volume(volume);
+ return result;
+ }
+
+ /* Count only threads that started, so teardown joins exactly those. */
+ volume->read_thread_count = i + 1;
+ }
+
+ *new_volume = volume;
+ return UDS_SUCCESS;
+}
+
+/*
+ * Release the buffer held by every cache slot (if the slot array was allocated) and free all
+ * of the page cache's allocations.
+ */
+static void uninitialize_page_cache(struct page_cache *cache)
+{
+	if (cache->cache != NULL) {
+		u16 slot = 0;
+
+		while (slot < cache->cache_slots) {
+			release_page_buffer(&cache->cache[slot]);
+			slot++;
+		}
+	}
+
+	vdo_free(cache->index);
+	vdo_free(cache->cache);
+	vdo_free(cache->search_pending_counters);
+	vdo_free(cache->read_queue);
+}
+
+/*
+ * Tell the reader threads to exit, join the ones that were started and free the thread array.
+ * Only called when volume->reader_threads is non-NULL.
+ */
+static void stop_reader_threads(struct volume *volume)
+{
+	unsigned int i;
+
+	/* This works even if some threads weren't started. */
+	mutex_lock(&volume->read_threads_mutex);
+	volume->read_threads_exiting = true;
+	uds_broadcast_cond(&volume->read_threads_cond);
+	mutex_unlock(&volume->read_threads_mutex);
+
+	for (i = 0; i < volume->read_thread_count; i++)
+		vdo_join_threads(volume->reader_threads[i]);
+
+	vdo_free(volume->reader_threads);
+	volume->reader_threads = NULL;
+}
+
+/*
+ * Tear down a volume, including a partially constructed one. A NULL volume is a no-op. Reader
+ * threads are stopped first, then the caches are released, then the dm-bufio client is
+ * destroyed, and finally the remaining members and the volume itself are freed.
+ */
+void uds_free_volume(struct volume *volume)
+{
+	if (volume == NULL)
+		return;
+
+	if (volume->reader_threads != NULL)
+		stop_reader_threads(volume);
+
+	/* Must destroy the client AFTER freeing the cached pages. */
+	uninitialize_page_cache(&volume->page_cache);
+	uds_free_sparse_cache(volume->sparse_cache);
+	if (volume->client != NULL)
+		dm_bufio_client_destroy(vdo_forget(volume->client));
+
+	uds_free_index_page_map(volume->index_page_map);
+	uds_free_radix_sorter(volume->radix_sorter);
+	vdo_free(volume->geometry);
+	vdo_free(volume->record_pointers);
+	vdo_free(volume);
+}
diff --git a/drivers/md/dm-vdo/indexer/volume.h b/drivers/md/dm-vdo/indexer/volume.h
new file mode 100644
index 000000000000..8679a5e55347
--- /dev/null
+++ b/drivers/md/dm-vdo/indexer/volume.h
@@ -0,0 +1,172 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#ifndef UDS_VOLUME_H
+#define UDS_VOLUME_H
+
+#include <linux/atomic.h>
+#include <linux/cache.h>
+#include <linux/dm-bufio.h>
+#include <linux/limits.h>
+
+#include "permassert.h"
+#include "thread-utils.h"
+
+#include "chapter-index.h"
+#include "config.h"
+#include "geometry.h"
+#include "indexer.h"
+#include "index-layout.h"
+#include "index-page-map.h"
+#include "radix-sort.h"
+#include "sparse-cache.h"
+
+/*
+ * The volume manages deduplication records on permanent storage. The term "volume" can also refer
+ * to the region of permanent storage where the records (and the chapters containing them) are
+ * stored. The volume handles all I/O to this region by reading, caching, and writing chapter pages
+ * as necessary.
+ */
+
+enum index_lookup_mode { /* The mode in effect is kept in the lookup_mode field of struct volume */
+ /* Always do lookups in all chapters normally */
+ LOOKUP_NORMAL,
+ /* Only do a subset of lookups needed when rebuilding an index */
+ LOOKUP_FOR_REBUILD,
+};
+
+struct queued_read { /* One entry of the page cache's circular read queue (page_cache.read_queue) */
+ bool invalid; /* NOTE(review): semantics are not visible in this header - confirm in volume.c */
+ bool reserved; /* presumably set once a reader thread has claimed this entry - TODO confirm */
+ u32 physical_page; /* presumably the physical page this read is for - TODO confirm */
+ struct uds_request *first_request; /* presumably the head of the requests waiting on the read - TODO confirm */
+ struct uds_request *last_request; /* presumably the tail of that request list - TODO confirm */
+};
+
+struct __aligned(L1_CACHE_BYTES) search_pending_counter { /* aligned to L1_CACHE_BYTES; the page cache keeps one per zone */
+ u64 atomic_value; /* NOTE(review): declared as a plain u64; presumably accessed atomically - confirm */
+};
+
+struct cached_page { /* One slot of the page_cache.cache array */
+ /* Whether this page is currently being read asynchronously */
+ bool read_pending;
+ /* The physical page stored in this cache entry */
+ u32 physical_page;
+ /* The value of the volume clock when this page was last used */
+ s64 last_used;
+ /* The cached page buffer (a struct dm_buffer from dm-bufio) */
+ struct dm_buffer *buffer;
+ /* The chapter index page, meaningless for record pages */
+ struct delta_index_page index_page;
+};
+
+struct page_cache {
+ /* The number of zones */
+ unsigned int zone_count;
+ /* The number of volume pages that can be cached */
+ u32 indexable_pages;
+ /* The maximum number of simultaneously cached pages (the length of the cache array) */
+ u16 cache_slots;
+ /* An index for each physical page noting where it is in the cache */
+ u16 *index;
+ /* The array of cached pages */
+ struct cached_page *cache;
+ /* A counter for each zone tracking if a search is occurring there */
+ struct search_pending_counter *search_pending_counters;
+ /* The read queue entries as a circular array */
+ struct queued_read *read_queue;
+
+ /* All entries above this point are constant after initialization. */
+
+ /*
+ * These values are all indexes into the array of read queue entries. New entries in the
+ * read queue are enqueued at read_queue_last. To dequeue entries, a reader thread gets the
+ * lock and then claims the entry pointed to by read_queue_next_read and increments that
+ * value. After the read is completed, the reader thread calls release_read_queue_entry(),
+ * which increments read_queue_first until it points to a pending read, or is equal to
+ * read_queue_next_read. This means that if multiple reads are outstanding,
+ * read_queue_first might not advance until the last of the reads finishes.
+ */
+ u16 read_queue_first;
+ u16 read_queue_next_read;
+ u16 read_queue_last;
+
+ atomic64_t clock; /* NOTE(review): presumably the volume clock sampled into cached_page.last_used - confirm */
+};
+
+struct volume {
+ struct index_geometry *geometry; /* freed by uds_free_volume() */
+ struct dm_bufio_client *client; /* destroyed by uds_free_volume(), after the page cache is released */
+ u64 nonce;
+ size_t cache_size;
+
+ /* A single page worth of records, for sorting */
+ const struct uds_volume_record **record_pointers;
+ /* Sorter for sorting records within each page */
+ struct radix_sorter *radix_sorter;
+
+ struct sparse_cache *sparse_cache;
+ struct page_cache page_cache;
+ struct index_page_map *index_page_map;
+
+ struct mutex read_threads_mutex; /* held by uds_free_volume() while it sets read_threads_exiting */
+ struct cond_var read_threads_cond; /* broadcast by uds_free_volume() once read_threads_exiting is set */
+ struct cond_var read_threads_read_done_cond;
+ struct thread **reader_threads; /* allocated in uds_make_volume() with config->read_threads entries */
+ unsigned int read_thread_count; /* how many reader threads were created; only these are joined on free */
+ bool read_threads_exiting; /* set to true under read_threads_mutex during uds_free_volume() */
+
+ enum index_lookup_mode lookup_mode;
+ unsigned int reserved_buffers;
+};
+
+int __must_check uds_make_volume(const struct uds_configuration *config,
+ struct index_layout *layout,
+ struct volume **new_volume); /* on success *new_volume is set and UDS_SUCCESS is returned */
+
+void uds_free_volume(struct volume *volume); /* a NULL volume is ignored */
+
+int __must_check uds_replace_volume_storage(struct volume *volume,
+ struct index_layout *layout,
+ struct block_device *bdev);
+
+int __must_check uds_find_volume_chapter_boundaries(struct volume *volume,
+ u64 *lowest_vcn, u64 *highest_vcn,
+ bool *is_empty);
+
+int __must_check uds_search_volume_page_cache(struct volume *volume,
+ struct uds_request *request,
+ bool *found);
+
+int __must_check uds_search_volume_page_cache_for_rebuild(struct volume *volume,
+ const struct uds_record_name *name,
+ u64 virtual_chapter,
+ bool *found);
+
+int __must_check uds_search_cached_record_page(struct volume *volume,
+ struct uds_request *request, u32 chapter,
+ u16 record_page_number, bool *found);
+
+void uds_forget_chapter(struct volume *volume, u64 chapter);
+
+int __must_check uds_write_chapter(struct volume *volume,
+ struct open_chapter_index *chapter_index,
+ const struct uds_volume_record records[]);
+
+void uds_prefetch_volume_chapter(const struct volume *volume, u32 chapter);
+
+int __must_check uds_read_chapter_index_from_volume(const struct volume *volume,
+ u64 virtual_chapter,
+ struct dm_buffer *volume_buffers[],
+ struct delta_index_page index_pages[]);
+
+int __must_check uds_get_volume_record_page(struct volume *volume, u32 chapter,
+ u32 page_number, u8 **data_ptr);
+
+int __must_check uds_get_volume_index_page(struct volume *volume, u32 chapter,
+ u32 page_number,
+ struct delta_index_page **page_ptr);
+
+#endif /* UDS_VOLUME_H */
diff --git a/drivers/md/dm-vdo/int-map.c b/drivers/md/dm-vdo/int-map.c
new file mode 100644
index 000000000000..3aa438f84ea1
--- /dev/null
+++ b/drivers/md/dm-vdo/int-map.c
@@ -0,0 +1,707 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2023 Red Hat
+ */
+
+/**
+ * DOC:
+ *
+ * Hash table implementation of a map from integers to pointers, implemented using the Hopscotch
+ * Hashing algorithm by Herlihy, Shavit, and Tzafrir (see
+ * http://en.wikipedia.org/wiki/Hopscotch_hashing). This implementation does not contain any of the
+ * locking/concurrency features of the algorithm, just the collision resolution scheme.
+ *
+ * Hopscotch Hashing is based on hashing with open addressing and linear probing. All the entries
+ * are stored in a fixed array of buckets, with no dynamic allocation for collisions. Unlike linear
+ * probing, all the entries that hash to a given bucket are stored within a fixed neighborhood
+ * starting at that bucket. Chaining is effectively represented as a bit vector relative to each
+ * bucket instead of as pointers or explicit offsets.
+ *
+ * When an empty bucket cannot be found within a given neighborhood, subsequent neighborhoods are
+ * searched, and one or more entries will "hop" into those neighborhoods. When this process works,
+ * an empty bucket will move into the desired neighborhood, allowing the entry to be added. When
+ * that process fails (typically when the buckets are around 90% full), the table must be resized
+ * and all the entries rehashed and added to the expanded table.
+ *
+ * Unlike linear probing, the number of buckets that must be searched in the worst case has a fixed
+ * upper bound (the size of the neighborhood). Those entries occupy a small number of memory cache
+ * lines, leading to improved use of the cache (fewer misses on both successful and unsuccessful
+ * searches). Hopscotch hashing outperforms linear probing at much higher load factors, so even
+ * with the increased memory burden for maintaining the hop vectors, less memory is needed to
+ * achieve that performance. Hopscotch is also immune to "contamination" from deleting entries
+ * since entries are genuinely removed instead of being replaced by a placeholder.
+ *
+ * The published description of the algorithm used a bit vector, but the paper alludes to an offset
+ * scheme which is used by this implementation. Since the entries in the neighborhood are within N
+ * entries of the hash bucket at the start of the neighborhood, a pair of small offset fields each
+ * log2(N) bits wide is all that's needed to maintain the hops as a linked list. In order to encode
+ * "no next hop" (i.e. NULL) as the natural initial value of zero, the offsets are biased by one
+ * (i.e. 0 => NULL, 1 => offset=0, 2 => offset=1, etc.) We can represent neighborhoods of up to 255
+ * entries with just 8+8=16 bits per entry. The hop list is sorted by hop offset so the first entry
+ * in the list is always the bucket closest to the start of the neighborhood.
+ *
+ * While individual accesses tend to be very fast, the table resize operations are very, very
+ * expensive. If an upper bound on the latency of adding an entry to the table is needed, we either
+ * need to ensure the table is pre-sized to be large enough so no resize is ever needed, or we'll
+ * need to develop an approach to incrementally resize the table.
+ */
+
+#include "int-map.h"
+
+#include <linux/minmax.h>
+
+#include "errors.h"
+#include "logger.h"
+#include "memory-alloc.h"
+#include "numeric.h"
+#include "permassert.h"
+
+#define DEFAULT_CAPACITY 16 /* the number of entries a new table is sized for by default, before load scaling */
+#define NEIGHBORHOOD 255 /* the number of buckets in each neighborhood; the most a biased u8 hop offset can address */
+#define MAX_PROBES 1024 /* limit on the number of probes for a free bucket */
+#define NULL_HOP_OFFSET 0 /* the hop offset value terminating the hop list */
+#define DEFAULT_LOAD 75 /* target load percentage for a new table; a compromise between memory use and performance */
+
+/**
+ * struct bucket - hash bucket
+ *
+ * Buckets are packed together to reduce memory usage and improve cache efficiency. It would be
+ * tempting to encode the hop offsets separately and maintain alignment of key/value pairs, but
+ * it's crucial to keep the hop fields near the buckets that use them so they'll tend to share
+ * cache lines.
+ */
+struct __packed bucket {
+ /**
+ * @first_hop: The biased offset of the first entry in the hop list of the neighborhood
+ * that hashes to this bucket (NULL_HOP_OFFSET if that list is empty).
+ */
+ u8 first_hop;
+ /**
+ * @next_hop: The biased offset of the next bucket in the hop list (NULL_HOP_OFFSET at the
+ * end of the list).
+ */
+ u8 next_hop;
+ /** @key: The key stored in this bucket. */
+ u64 key;
+ /** @value: The value stored in this bucket (NULL if empty). */
+ void *value;
+};
+
+/**
+ * struct int_map - The concrete definition of the opaque int_map type.
+ *
+ * To avoid having to wrap the neighborhoods of the last entries back around to the start of the
+ * bucket array, we allocate NEIGHBORHOOD - 1 more buckets at the end of the array instead, which
+ * is why capacity and bucket_count are different.
+ */
+struct int_map {
+ /** @size: The number of entries stored in the map. */
+ size_t size;
+ /** @capacity: The number of neighborhoods in the map. */
+ size_t capacity;
+ /** @bucket_count: The number of buckets in the bucket array. */
+ size_t bucket_count;
+ /** @buckets: The array of hash buckets. */
+ struct bucket *buckets;
+};
+
+/**
+ * mix() - The Google CityHash 16-byte hash mixing function.
+ * @input1: The first input value.
+ * @input2: The second input value.
+ *
+ * Two rounds of multiply and xor-shift fold both inputs into one 64-bit value, followed by a
+ * final multiply.
+ *
+ * Return: A hash of the two inputs.
+ */
+static u64 mix(u64 input1, u64 input2)
+{
+	const u64 multiplier = 0x9ddfea08eb382d69ULL;
+	u64 first_round;
+	u64 second_round;
+
+	first_round = (input1 ^ input2) * multiplier;
+	first_round ^= (first_round >> 47);
+
+	second_round = (first_round ^ input2) * multiplier;
+	second_round ^= (second_round >> 47);
+
+	return second_round * multiplier;
+}
+
+/**
+ * hash_key() - Calculate a 64-bit non-cryptographic hash value for the provided 64-bit integer
+ *              key.
+ * @key: The mapping key.
+ *
+ * The implementation is based on Google's CityHash, only handling the specific case of an 8-byte
+ * input.
+ *
+ * Return: The hash of the mapping key.
+ */
+static u64 hash_key(u64 key)
+{
+	/*
+	 * Split the key into two 32-bit halves through a union, since aliasing restrictions rule
+	 * out doing it with a pointer cast.
+	 */
+	union {
+		u64 whole;
+		u32 halves[2];
+	} pun;
+	u64 seed;
+
+	pun.whole = key;
+	seed = sizeof(key) + (((u64) pun.halves[0]) << 3);
+	return mix(seed, pun.halves[1]);
+}
+
+/**
+ * allocate_buckets() - Reset an int_map to empty and allocate its bucket array.
+ * @map: The map to initialize.
+ * @capacity: The number of neighborhoods the map should have.
+ *
+ * Return: VDO_SUCCESS or an error code.
+ */
+static int allocate_buckets(struct int_map *map, size_t capacity)
+{
+	/*
+	 * NEIGHBORHOOD - 1 extra buckets follow the last hash bucket so that it has a full
+	 * neighborhood without having to wrap back around to element zero.
+	 */
+	size_t bucket_count = capacity + (NEIGHBORHOOD - 1);
+
+	map->size = 0;
+	map->capacity = capacity;
+	map->bucket_count = bucket_count;
+
+	return vdo_allocate(bucket_count, struct bucket,
+			    "struct int_map buckets", &map->buckets);
+}
+
+/**
+ * vdo_int_map_create() - Allocate and initialize an int_map.
+ * @initial_capacity: The number of entries the map should initially be capable of holding (zero
+ *                    tells the map to use its own small default).
+ * @map_ptr: Output, a pointer to hold the new int_map; only written on success.
+ *
+ * Return: VDO_SUCCESS or an error code.
+ */
+int vdo_int_map_create(size_t initial_capacity, struct int_map **map_ptr)
+{
+	struct int_map *new_map;
+	size_t entries = initial_capacity;
+	int result;
+
+	result = vdo_allocate(1, struct int_map, "struct int_map", &new_map);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	/* Fall back to the default when the caller did not ask for a specific capacity. */
+	if (entries == 0)
+		entries = DEFAULT_CAPACITY;
+
+	/*
+	 * Scale the entry count up by the default load factor to get the number of neighborhoods
+	 * (i.e. to hold 1000 entries at 75% load we need a capacity of 1333).
+	 */
+	result = allocate_buckets(new_map, entries * 100 / DEFAULT_LOAD);
+	if (result == VDO_SUCCESS) {
+		*map_ptr = new_map;
+		return VDO_SUCCESS;
+	}
+
+	vdo_int_map_free(vdo_forget(new_map));
+	return result;
+}
+
+/**
+ * vdo_int_map_free() - Free an int_map.
+ * @map: The int_map to free; NULL is ignored.
+ *
+ * NOTE: The map does not own the pointer values stored in the map and they are not freed by this
+ * call.
+ */
+void vdo_int_map_free(struct int_map *map)
+{
+	if (map != NULL) {
+		/* Release the bucket array first, then the map structure itself. */
+		vdo_free(vdo_forget(map->buckets));
+		vdo_free(vdo_forget(map));
+	}
+}
+
+/**
+ * vdo_int_map_size() - Get the number of entries stored in an int_map.
+ * @map: The int_map to query.
+ *
+ * Return: The number of entries in the map.
+ */
+size_t vdo_int_map_size(const struct int_map *map)
+{
+	const size_t entry_count = map->size;
+
+	return entry_count;
+}
+
+/**
+ * dereference_hop() - Convert a biased hop offset within a neighborhood to a pointer to the bucket
+ *                     it references.
+ * @neighborhood: The first bucket in the neighborhood.
+ * @hop_offset: The biased hop offset to the desired bucket.
+ *
+ * Return: NULL if hop_offset is zero, otherwise a pointer to the bucket in the neighborhood at
+ *         hop_offset - 1.
+ */
+static struct bucket *dereference_hop(struct bucket *neighborhood, unsigned int hop_offset)
+{
+	BUILD_BUG_ON(NULL_HOP_OFFSET != 0);
+
+	/* Strip the bias of one to recover the real array offset. */
+	return (hop_offset != NULL_HOP_OFFSET) ? (neighborhood + (hop_offset - 1)) : NULL;
+}
+
+/**
+ * insert_in_hop_list() - Add a bucket into the hop list for the neighborhood.
+ * @neighborhood: The first bucket in the neighborhood.
+ * @new_bucket: The bucket to add to the hop list.
+ *
+ * The bucket is inserted into the list so the hop list remains sorted by hop offset.
+ */
+static void insert_in_hop_list(struct bucket *neighborhood, struct bucket *new_bucket)
+{
+	/* Zero indicates a NULL hop offset, so bias the hop offset by one. */
+	int biased_offset = 1 + (new_bucket - neighborhood);
+
+	/*
+	 * Walk the chain by the address of each link (first_hop, then successive next_hop fields)
+	 * so that inserting at the head and inserting mid-list are the same operation.
+	 */
+	u8 *link = &neighborhood->first_hop;
+
+	/* Stop at the end of the list or at the first entry beyond the new bucket. */
+	while ((*link != NULL_HOP_OFFSET) && (*link <= biased_offset))
+		link = &dereference_hop(neighborhood, *link)->next_hop;
+
+	new_bucket->next_hop = *link;
+	*link = biased_offset;
+}
+
+/**
+ * select_bucket() - Select and return the hash bucket for a given search key.
+ * @map: The map to search.
+ * @key: The mapping key.
+ *
+ * Return: The bucket at the start of the neighborhood for the key.
+ */
+static struct bucket *select_bucket(const struct int_map *map, u64 key)
+{
+	/* Keep exactly 32 bits of a good hash of the key. */
+	u64 hash32 = hash_key(key) & 0xFFFFFFFF;
+
+	/*
+	 * Treat the 32-bit hash as a binary fraction and multiply it by the capacity: if the hash
+	 * is uniformly distributed over [0 .. 2^32-1], then (hash * capacity / 2^32) should be
+	 * uniformly distributed over [0 .. capacity-1]. The multiply and shift is much faster
+	 * than a divide (modulus) on X86 CPUs.
+	 */
+	u64 index = (hash32 * map->capacity) >> 32;
+
+	return map->buckets + index;
+}
+
+/**
+ * search_hop_list() - Search the hop list associated with given hash bucket for a given search
+ *                     key.
+ * @map: The map being searched (unused).
+ * @bucket: The map bucket to search for the key.
+ * @key: The mapping key.
+ * @previous_ptr: Output. If not NULL, a pointer in which to store the bucket in the list preceding
+ *                the one that had the matching key (NULL when the match heads the list). It is
+ *                left untouched when no match is found.
+ *
+ * Return: An entry that matches the key, or NULL if not found.
+ */
+static struct bucket *search_hop_list(struct int_map *map __always_unused,
+				      struct bucket *bucket,
+				      u64 key,
+				      struct bucket **previous_ptr)
+{
+	struct bucket *prior = NULL;
+	struct bucket *candidate;
+	unsigned int hop;
+
+	for (hop = bucket->first_hop; hop != NULL_HOP_OFFSET; hop = candidate->next_hop) {
+		/* Look at the neighboring bucket indexed by the offset. */
+		candidate = dereference_hop(bucket, hop);
+
+		if ((candidate->value == NULL) || (candidate->key != key)) {
+			/* Not the entry we want; remember it and follow the chain. */
+			prior = candidate;
+			continue;
+		}
+
+		if (previous_ptr != NULL)
+			*previous_ptr = prior;
+		return candidate;
+	}
+
+	return NULL;
+}
+
+/**
+ * vdo_int_map_get() - Retrieve the value associated with a given key from the int_map.
+ * @map: The int_map to query.
+ * @key: The key to look up.
+ *
+ * Return: The value associated with the given key, or NULL if the key is not mapped to any value.
+ */
+void *vdo_int_map_get(struct int_map *map, u64 key)
+{
+	struct bucket *neighborhood = select_bucket(map, key);
+	struct bucket *match = search_hop_list(map, neighborhood, key, NULL);
+
+	if (match == NULL)
+		return NULL;
+
+	return match->value;
+}
+
+/**
+ * resize_buckets() - Increase the number of hash buckets.
+ * @map: The map to resize.
+ *
+ * Allocates a larger bucket array and rehashes all the existing entries into it. If the
+ * allocation or any re-insertion fails, the new array is discarded and the map is restored to the
+ * state it had on entry.
+ *
+ * Return: VDO_SUCCESS or an error code.
+ */
+static int resize_buckets(struct int_map *map)
+{
+	int result;
+	size_t i;
+
+	/* Copy the top-level map data to the stack. */
+	struct int_map old_map = *map;
+
+	/* Re-initialize the map to be empty and (nominally) 50% larger. */
+	size_t new_capacity = map->capacity / 2 * 3;
+
+	/*
+	 * Integer rounding keeps very small tables from growing (1 / 2 * 3 == 0 and
+	 * 3 / 2 * 3 == 3), which would leave vdo_int_map_put() resizing over and over without ever
+	 * gaining room. Always add at least one neighborhood.
+	 */
+	if (new_capacity <= map->capacity)
+		new_capacity = map->capacity + 1;
+
+	vdo_log_info("%s: attempting resize from %zu to %zu, current size=%zu",
+		     __func__, map->capacity, new_capacity, map->size);
+	result = allocate_buckets(map, new_capacity);
+	if (result != VDO_SUCCESS) {
+		*map = old_map;
+		return result;
+	}
+
+	/* Populate the new hash table from the entries in the old bucket array. */
+	for (i = 0; i < old_map.bucket_count; i++) {
+		struct bucket *entry = &old_map.buckets[i];
+
+		/* Empty buckets have a NULL value and carry nothing to rehash. */
+		if (entry->value == NULL)
+			continue;
+
+		result = vdo_int_map_put(map, entry->key, entry->value, true, NULL);
+		if (result != VDO_SUCCESS) {
+			/* Destroy the new partial map and restore the map from the stack. */
+			vdo_free(vdo_forget(map->buckets));
+			*map = old_map;
+			return result;
+		}
+	}
+
+	/* Destroy the old bucket array. */
+	vdo_free(vdo_forget(old_map.buckets));
+	return VDO_SUCCESS;
+}
+
+/**
+ * find_empty_bucket() - Probe the bucket array starting at the given bucket for the next empty
+ *                       bucket, returning a pointer to it.
+ * @map: The map containing the buckets to search.
+ * @bucket: The bucket at which to start probing.
+ * @max_probes: The maximum number of buckets to search.
+ *
+ * NULL will be returned if the search reaches the end of the bucket array or if the number of
+ * linear probes exceeds a specified limit.
+ *
+ * Return: The next empty bucket, or NULL if the search failed.
+ */
+static struct bucket *
+find_empty_bucket(struct int_map *map, struct bucket *bucket, unsigned int max_probes)
+{
+	/* The probe count is capped by both the end of the bucket array and max_probes. */
+	ptrdiff_t to_end = (map->buckets + map->bucket_count) - bucket;
+	ptrdiff_t limit = min_t(ptrdiff_t, to_end, max_probes);
+	ptrdiff_t probe;
+
+	for (probe = 0; probe < limit; probe++) {
+		if (bucket[probe].value == NULL)
+			return &bucket[probe];
+	}
+
+	return NULL;
+}
+
+/**
+ * move_empty_bucket() - Move an empty bucket closer to the start of the bucket array.
+ * @map: The map containing the bucket (unused).
+ * @hole: The empty bucket to fill with an entry that precedes it in one of its enclosing
+ *        neighborhoods.
+ *
+ * This searches the neighborhoods that contain the empty bucket for a non-empty bucket closer to
+ * the start of the array. If such a bucket is found, this swaps the two buckets by moving the
+ * entry to the empty bucket.
+ *
+ * Return: The bucket that was vacated by moving its entry to the provided hole, or NULL if no
+ *         entry could be moved.
+ */
+static struct bucket *move_empty_bucket(struct int_map *map __always_unused,
+					struct bucket *hole)
+{
+	/*
+	 * Start with the neighborhood in which the hole is the last bucket and work forward
+	 * through every neighborhood that contains it. No boundary check is needed for stepping
+	 * backward since this function is only called when hole is at least NEIGHBORHOOD cells
+	 * deeper into the array than a valid bucket.
+	 */
+	struct bucket *home = hole - (NEIGHBORHOOD - 1);
+
+	for (; home < hole; home++) {
+		/*
+		 * The head of the hop list is the entry nearest to its home bucket, and so the
+		 * nearest to the hash bucket whose neighborhood is full.
+		 */
+		struct bucket *mover = dereference_hop(home, home->first_hop);
+
+		/*
+		 * Nothing to move if this neighborhood owns no entries (its buckets all belong to
+		 * overlapping neighborhoods), or if its first entry already lies beyond the hole.
+		 */
+		if ((mover == NULL) || (hole < mover))
+			continue;
+
+		/* The mover heads its list, so swinging first_hop past it unlinks it. */
+		home->first_hop = mover->next_hop;
+		mover->next_hop = NULL_HOP_OFFSET;
+
+		/* Hop the entry into the hole, leaving its old bucket empty. */
+		hole->key = mover->key;
+		hole->value = mover->value;
+		mover->value = NULL;
+
+		/* Link the filled hole back into the same neighborhood's hop list. */
+		insert_in_hop_list(home, hole);
+		return mover;
+	}
+
+	/* We couldn't find an entry to relocate to the hole. */
+	return NULL;
+}
+
+/**
+ * update_mapping() - Find and update any existing mapping for a given key, returning the value
+ *                    associated with the key in the provided pointer.
+ * @map: The int_map to attempt to modify.
+ * @neighborhood: The first bucket in the neighborhood that would contain the search key
+ * @key: The key with which to associate the new value.
+ * @new_value: The value to be associated with the key.
+ * @update: Whether to overwrite an existing value.
+ * @old_value_ptr: a pointer in which to store the old value (unmodified if no mapping was found)
+ *
+ * Return: true if the map contains a mapping for the key, false if it does not.
+ */
+static bool update_mapping(struct int_map *map, struct bucket *neighborhood,
+			   u64 key, void *new_value, bool update, void **old_value_ptr)
+{
+	struct bucket *existing = search_hop_list(map, neighborhood, key, NULL);
+
+	if (existing != NULL) {
+		/* Report the current value first, then overwrite it if asked to. */
+		if (old_value_ptr != NULL)
+			*old_value_ptr = existing->value;
+		if (update)
+			existing->value = new_value;
+	}
+
+	return (existing != NULL);
+}
+
+/**
+ * find_or_make_vacancy() - Find an empty bucket.
+ * @map: The int_map to search or modify.
+ * @neighborhood: The first bucket in the neighborhood in which an empty bucket is needed for a new
+ *                mapping.
+ *
+ * Find an empty bucket in a specified neighborhood for a new mapping or attempt to re-arrange
+ * mappings so there is such a bucket. This operation may fail (returning NULL) if an empty bucket
+ * is not available or could not be relocated to the neighborhood.
+ *
+ * Return: a pointer to an empty bucket in the desired neighborhood, or NULL if a vacancy could not
+ *         be found or arranged.
+ */
+static struct bucket *find_or_make_vacancy(struct int_map *map,
+					   struct bucket *neighborhood)
+{
+	struct bucket *hole;
+
+	/*
+	 * Probe within and beyond the neighborhood for the first empty bucket, then keep hopping
+	 * that hole toward the neighborhood until it lands inside or cannot be moved any closer.
+	 */
+	for (hole = find_empty_bucket(map, neighborhood, MAX_PROBES);
+	     hole != NULL;
+	     hole = move_empty_bucket(map, hole)) {
+		int distance = hole - neighborhood;
+
+		/* Close enough to be referenced by the hash bucket's hop offsets. */
+		if (distance < NEIGHBORHOOD)
+			return hole;
+	}
+
+	return NULL;
+}
+
+/**
+ * vdo_int_map_put() - Try to associate a value with an integer.
+ * @map: The int_map to attempt to modify.
+ * @key: The key with which to associate the new value.
+ * @new_value: The value to be associated with the key; must not be NULL.
+ * @update: Whether to overwrite an existing value.
+ * @old_value_ptr: A pointer in which to store either the old value (if the key was already mapped)
+ *                 or NULL if the map did not contain the key; NULL may be provided if the caller
+ *                 does not need to know the old value
+ *
+ * Try to associate a value (a pointer) with an integer in an int_map. If the map already contains
+ * a mapping for the provided key, the old value is only replaced with the specified value if
+ * update is true. In either case the old value is returned. If the map does not already contain a
+ * value for the specified key, the new value is added regardless of the value of update.
+ *
+ * Return: VDO_SUCCESS or an error code (-EINVAL if new_value is NULL).
+ */
+int vdo_int_map_put(struct int_map *map, u64 key, void *new_value, bool update,
+		    void **old_value_ptr)
+{
+	struct bucket *home;
+	struct bucket *vacancy;
+
+	if (unlikely(new_value == NULL))
+		return -EINVAL;
+
+	/* Any entry for the key must live in the neighborhood that starts at this bucket. */
+	home = select_bucket(map, key);
+
+	/* An existing mapping is handled in place, with no new bucket needed. */
+	if (update_mapping(map, home, key, new_value, update, old_value_ptr))
+		return VDO_SUCCESS;
+
+	/*
+	 * Look for (or arrange) an empty bucket in the neighborhood. This usually succeeds on the
+	 * first pass; otherwise the map has to be grown and rehashed (a very expensive operation
+	 * for large maps) before trying again.
+	 */
+	for (;;) {
+		int result;
+
+		vacancy = find_or_make_vacancy(map, home);
+		if (vacancy != NULL)
+			break;
+
+		result = resize_buckets(map);
+		if (result != VDO_SUCCESS)
+			return result;
+
+		/* Resizing invalidates all pointers to buckets, so find the neighborhood again. */
+		home = select_bucket(map, key);
+	}
+
+	/* Put the new entry in the empty bucket, adding it to the neighborhood. */
+	vacancy->key = key;
+	vacancy->value = new_value;
+	insert_in_hop_list(home, vacancy);
+	map->size += 1;
+
+	/* There was no existing entry, so there was no old value to be returned. */
+	if (old_value_ptr != NULL)
+		*old_value_ptr = NULL;
+	return VDO_SUCCESS;
+}
+
+/**
+ * vdo_int_map_remove() - Remove the mapping for a given key from the int_map.
+ * @map: The int_map from which to remove the mapping.
+ * @key: The key whose mapping is to be removed.
+ *
+ * Return: the value that was associated with the key, or NULL if it was not mapped.
+ */
+void *vdo_int_map_remove(struct int_map *map, u64 key)
+{
+	struct bucket *home = select_bucket(map, key);
+	struct bucket *before;
+	struct bucket *victim = search_hop_list(map, home, key, &before);
+	void *removed;
+
+	/* There is no matching entry to remove. */
+	if (victim == NULL)
+		return NULL;
+
+	/*
+	 * Splice the victim out of the hop list: either its predecessor skips over it or, when it
+	 * heads the list, the neighborhood's first_hop does.
+	 */
+	if (before != NULL)
+		before->next_hop = victim->next_hop;
+	else
+		home->first_hop = victim->next_hop;
+	victim->next_hop = NULL_HOP_OFFSET;
+
+	/* Save the mapped value to return and empty the bucket. */
+	removed = victim->value;
+	victim->value = NULL;
+	victim->key = 0;
+	map->size -= 1;
+
+	return removed;
+}
diff --git a/drivers/md/dm-vdo/int-map.h b/drivers/md/dm-vdo/int-map.h
new file mode 100644
index 000000000000..1858ad799887
--- /dev/null
+++ b/drivers/md/dm-vdo/int-map.h
@@ -0,0 +1,39 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#ifndef VDO_INT_MAP_H
+#define VDO_INT_MAP_H
+
+#include <linux/compiler.h>
+#include <linux/types.h>
+
+/**
+ * DOC: int_map
+ *
+ * An int_map associates pointers (void *) with integer keys (u64). NULL pointer values are
+ * not supported.
+ *
+ * The map is implemented as a hash table, which should provide constant-time insert, query, and
+ * remove operations, although the insert may occasionally grow the table, which is linear in the
+ * number of entries in the map. The table will grow as needed to hold new entries, but will not
+ * shrink as entries are removed.
+ *
+ * Summary of the interface declared below:
+ *
+ * vdo_int_map_create() allocates a map sized for initial_capacity entries (zero selects a small
+ * default) and stores it in *map_ptr on success.
+ *
+ * vdo_int_map_free() frees a map, ignoring NULL. The values stored in the map are not freed.
+ *
+ * vdo_int_map_size() returns the number of entries currently stored.
+ *
+ * vdo_int_map_get() returns the value mapped to key, or NULL if the key is not mapped.
+ *
+ * vdo_int_map_put() adds a mapping for key, or replaces an existing one only when update is true.
+ * If old_value_ptr is not NULL it receives the previous value, or NULL if there was none. A NULL
+ * new_value is rejected with -EINVAL.
+ *
+ * vdo_int_map_remove() removes the mapping for key and returns its value, or NULL if the key was
+ * not mapped.
+ */
+
+struct int_map;
+
+int __must_check vdo_int_map_create(size_t initial_capacity, struct int_map **map_ptr);
+
+void vdo_int_map_free(struct int_map *map);
+
+size_t vdo_int_map_size(const struct int_map *map);
+
+void *vdo_int_map_get(struct int_map *map, u64 key);
+
+int __must_check vdo_int_map_put(struct int_map *map, u64 key, void *new_value,
+ bool update, void **old_value_ptr);
+
+void *vdo_int_map_remove(struct int_map *map, u64 key);
+
+#endif /* VDO_INT_MAP_H */
diff --git a/drivers/md/dm-vdo/io-submitter.c b/drivers/md/dm-vdo/io-submitter.c
new file mode 100644
index 000000000000..9a3716bb3c05
--- /dev/null
+++ b/drivers/md/dm-vdo/io-submitter.c
@@ -0,0 +1,477 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#include "io-submitter.h"
+
+#include <linux/bio.h>
+#include <linux/kernel.h>
+#include <linux/mutex.h>
+
+#include "memory-alloc.h"
+#include "permassert.h"
+
+#include "data-vio.h"
+#include "logger.h"
+#include "types.h"
+#include "vdo.h"
+#include "vio.h"
+
+/*
+ * Submission of bio operations to the underlying storage device will go through a separate work
+ * queue thread (or more than one) to prevent blocking in other threads if the storage device has a
+ * full queue. The plug structure allows that thread to do better batching of requests to make the
+ * I/O more efficient.
+ *
+ * When multiple worker threads are used, a thread is chosen for an I/O operation submission based
+ * on the PBN, so a given PBN will consistently wind up on the same thread. Flush operations are
+ * assigned round-robin.
+ *
+ * The map (protected by the mutex) collects pending I/O operations so that the worker thread can
+ * reorder them to try to encourage I/O request merging in the request queue underneath.
+ *
+ * Fields:
+ *   queue: the work queue for this bio thread; it is taken from the vdo's thread table and is
+ *          not owned by this structure
+ *   plug: the block-layer plug started and finished by the work queue's start and finish hooks
+ *   map: pending vios, keyed by the sectors of the first and last bio of each merged list
+ *   lock: the mutex held while the map and the merged bio lists of its vios are modified
+ *   queue_number: the index of this entry in the io_submitter's bio_queue_data array
+ */
+struct bio_queue_data {
+ struct vdo_work_queue *queue;
+ struct blk_plug plug;
+ struct int_map *map;
+ struct mutex lock;
+ unsigned int queue_number;
+};
+
+struct io_submitter {
+ unsigned int num_bio_queues_used; /* Number of bio_queue_data entries fully set up so far */
+ unsigned int bio_queue_rotation_interval; /* The rotation_interval supplied at creation */
+ struct bio_queue_data bio_queue_data[]; /* One entry per bio-submission thread */
+};
+
+/* The .start hook of bio_queue_type: open the block-layer plug owned by this bio queue. */
+static void start_bio_queue(void *ptr)
+{
+	blk_start_plug(&((struct bio_queue_data *) ptr)->plug);
+}
+
+/* The .finish hook of bio_queue_type: close the block-layer plug owned by this bio queue. */
+static void finish_bio_queue(void *ptr)
+{
+	blk_finish_plug(&((struct bio_queue_data *) ptr)->plug);
+}
+
+/* The work queue type used for every bio-submission thread made by vdo_make_io_submitter(). */
+static const struct vdo_work_queue_type bio_queue_type = {
+	.default_priority = BIO_Q_DATA_PRIORITY,
+	.max_priority = BIO_Q_MAX_PRIORITY,
+	.start = start_bio_queue,
+	.finish = finish_bio_queue,
+};
+
+/**
+ * count_all_bios() - Record a bio in every statistics counter that applies to its vio.
+ * @vio: The vio associated with the bio.
+ * @bio: The bio to count.
+ *
+ * A data vio's bio is tallied in bios_out only. Any other bio is tallied in bios_meta and, for
+ * recovery journal and block map vios, in the matching per-type counter as well.
+ */
+static void count_all_bios(struct vio *vio, struct bio *bio)
+{
+	struct atomic_statistics *counters = &vio->completion.vdo->stats;
+
+	if (!is_data_vio(vio)) {
+		vdo_count_bios(&counters->bios_meta, bio);
+
+		switch (vio->type) {
+		case VIO_TYPE_RECOVERY_JOURNAL:
+			vdo_count_bios(&counters->bios_journal, bio);
+			break;
+		case VIO_TYPE_BLOCK_MAP:
+			vdo_count_bios(&counters->bios_page_cache, bio);
+			break;
+		default:
+			/* Other metadata types have no counter of their own. */
+			break;
+		}
+
+		return;
+	}
+
+	vdo_count_bios(&counters->bios_out, bio);
+}
+
+/**
+ * assert_in_bio_zone() - Assert that a vio is in the correct bio zone and not in interrupt
+ * context.
+ * @vio: The vio to check.
+ *
+ * The interrupt-context check is made here with in_interrupt(); the zone check is delegated to
+ * assert_vio_in_bio_zone().
+ */
+static void assert_in_bio_zone(struct vio *vio)
+{
+ VDO_ASSERT_LOG_ONLY(!in_interrupt(), "not in interrupt context");
+ assert_vio_in_bio_zone(vio);
+}
+
+/**
+ * send_bio_to_device() - Update stats, then submit the supplied bio to the OS for processing.
+ * @vio: The vio associated with the bio.
+ * @bio: The bio to submit to the OS.
+ *
+ * The bio is counted in the vdo's statistics, pointed at the vdo's backing device, and then
+ * handed to submit_bio_noacct(). The caller must be in the vio's bio zone and not in interrupt
+ * context; both conditions are checked by assert_in_bio_zone().
+ */
+static void send_bio_to_device(struct vio *vio, struct bio *bio)
+{
+ struct vdo *vdo = vio->completion.vdo;
+
+ assert_in_bio_zone(vio);
+ atomic64_inc(&vdo->stats.bios_submitted);
+ count_all_bios(vio, bio);
+ bio_set_dev(bio, vdo_get_backing_device(vdo));
+ submit_bio_noacct(bio);
+}
+
+/**
+ * vdo_submit_vio() - Submits a vio's bio to the underlying block device. May block if the device
+ * is busy. This callback should be used by vios which did not attempt to merge.
+ * @completion: The completion of the vio whose bio is to be submitted.
+ */
+void vdo_submit_vio(struct vdo_completion *completion)
+{
+ struct vio *vio = as_vio(completion);
+
+ send_bio_to_device(vio, vio->bio);
+}
+
+/**
+ * get_bio_list() - Extract the list of bios to submit from a vio.
+ * @vio: The vio submitting I/O.
+ *
+ * The list will always contain at least one entry (the bio for the vio on which it is called), but
+ * other bios may have been merged with it as well.
+ *
+ * While holding the bio zone's lock, the map entries keyed by the sectors of the head and tail of
+ * the vio's merged list are removed and the vio's merged list is reset to empty. The extracted
+ * bios remain chained through their bi_next pointers.
+ *
+ * Return: The head of the bio list to submit.
+ */
+static struct bio *get_bio_list(struct vio *vio)
+{
+ struct bio *bio;
+ struct io_submitter *submitter = vio->completion.vdo->io_submitter;
+ struct bio_queue_data *bio_queue_data = &(submitter->bio_queue_data[vio->bio_zone]);
+
+ assert_in_bio_zone(vio);
+
+ mutex_lock(&bio_queue_data->lock);
+ vdo_int_map_remove(bio_queue_data->map,
+ vio->bios_merged.head->bi_iter.bi_sector);
+ vdo_int_map_remove(bio_queue_data->map,
+ vio->bios_merged.tail->bi_iter.bi_sector);
+ bio = vio->bios_merged.head;
+ bio_list_init(&vio->bios_merged);
+ mutex_unlock(&bio_queue_data->lock);
+
+ return bio;
+}
+
+/**
+ * submit_data_vio() - Send a data_vio's bio, and every bio merged with it, to the storage below.
+ * @completion: The completion of the vio heading the list of bios to submit.
+ *
+ * Each bio is unlinked from the list and then submitted on behalf of the vio recorded in its
+ * bi_private field.
+ *
+ * Context: This call may block and so should only be called from a bio thread.
+ */
+static void submit_data_vio(struct vdo_completion *completion)
+{
+	struct vio *vio = as_vio(completion);
+	struct bio *pending;
+
+	assert_in_bio_zone(vio);
+
+	pending = get_bio_list(vio);
+	while (pending != NULL) {
+		struct bio *following = pending->bi_next;
+
+		/* Detach the bio before submission so it no longer points into the list. */
+		pending->bi_next = NULL;
+		send_bio_to_device((struct vio *) pending->bi_private, pending);
+		pending = following;
+	}
+}
+
+/**
+ * get_mergeable_locked() - Look for an already queued vio whose bio list the current bio can
+ *                          join.
+ * @map: The bio map to use for merging.
+ * @vio: The vio we want to merge.
+ * @back_merge: Set to true for a back merge, false for a front merge.
+ *
+ * For a back merge the candidate is looked up one block before this vio's bio and must have that
+ * sector at the tail of its merged list; for a front merge it is looked up one block after and
+ * must have that sector at the head of its list. A candidate is also rejected if its priority or
+ * data direction differs from this vio's, or if its merged list is empty.
+ *
+ * Return: the vio to merge to, NULL if no merging is possible.
+ */
+static struct vio *get_mergeable_locked(struct int_map *map, struct vio *vio,
+					bool back_merge)
+{
+	struct bio *bio = vio->bio;
+	sector_t target = bio->bi_iter.bi_sector;
+	struct vio *candidate;
+	struct bio *edge;
+
+	if (back_merge)
+		target -= VDO_SECTORS_PER_BLOCK;
+	else
+		target += VDO_SECTORS_PER_BLOCK;
+
+	candidate = vdo_int_map_get(map, target);
+	if ((candidate == NULL) ||
+	    (vio->completion.priority != candidate->completion.priority) ||
+	    (bio_data_dir(bio) != bio_data_dir(candidate->bio)) ||
+	    bio_list_empty(&candidate->bios_merged))
+		return NULL;
+
+	/* The adjacent sector must sit at the end of the candidate's list that we would join. */
+	edge = back_merge ? candidate->bios_merged.tail : candidate->bios_merged.head;
+	if (edge->bi_iter.bi_sector != target)
+		return NULL;
+
+	return candidate;
+}
+
+/*
+ * Enter a vio in the bio map under the sectors of both the head and the tail of its merged bio
+ * list, replacing any existing entries for those sectors. Returns VDO_SUCCESS or the error from
+ * the first vdo_int_map_put() that fails.
+ */
+static int map_merged_vio(struct int_map *bio_map, struct vio *vio)
+{
+	int result = vdo_int_map_put(bio_map, vio->bios_merged.head->bi_iter.bi_sector, vio,
+				     true, NULL);
+
+	if (result != VDO_SUCCESS)
+		return result;
+
+	return vdo_int_map_put(bio_map, vio->bios_merged.tail->bi_iter.bi_sector, vio, true,
+			       NULL);
+}
+
+/*
+ * Append a vio's bio list to the end of prev_vio's list. The map entry for prev_vio's old tail
+ * sector is removed first, then prev_vio is re-entered under its new head and tail sectors.
+ * Returns the result of map_merged_vio().
+ */
+static int merge_to_prev_tail(struct int_map *bio_map, struct vio *vio,
+			      struct vio *prev_vio)
+{
+	sector_t old_tail_sector = prev_vio->bios_merged.tail->bi_iter.bi_sector;
+
+	vdo_int_map_remove(bio_map, old_tail_sector);
+	bio_list_merge(&prev_vio->bios_merged, &vio->bios_merged);
+
+	return map_merged_vio(bio_map, prev_vio);
+}
+
+/*
+ * Prepend a vio's bio list to the front of next_vio's list. The map entry for next_vio's old head
+ * sector is removed first, then next_vio is re-entered under its new head and tail sectors.
+ * Returns the result of map_merged_vio().
+ */
+static int merge_to_next_head(struct int_map *bio_map, struct vio *vio,
+			      struct vio *next_vio)
+{
+	sector_t old_head_sector = next_vio->bios_merged.head->bi_iter.bi_sector;
+
+	/*
+	 * Handle "next merge" and "gap fill" cases the same way so as to reorder bios in a way
+	 * that's compatible with using funnel queues in work queues. This avoids removing an
+	 * existing completion.
+	 */
+	vdo_int_map_remove(bio_map, old_head_sector);
+	bio_list_merge_head(&next_vio->bios_merged, &vio->bios_merged);
+
+	return map_merged_vio(bio_map, next_vio);
+}
+
+/**
+ * try_bio_map_merge() - Attempt to merge a vio's bio with other pending I/Os.
+ * @vio: The vio to merge.
+ *
+ * Currently this is only used for data_vios, but is broken out for future use with metadata vios.
+ *
+ * The vio's merged list is first reset to hold just its own bio. Under the bio zone's lock, the
+ * map is then searched for a vio whose list ends at the block just before this bio and for one
+ * whose list starts at the block just after it. If neither exists, the vio is entered in the map
+ * by itself; otherwise its bio is spliced onto a neighbor's list. When both neighbors exist, only
+ * the following one is merged with.
+ *
+ * Return: whether or not the vio was merged.
+ */
+static bool try_bio_map_merge(struct vio *vio)
+{
+ int result;
+ bool merged = true;
+ struct bio *bio = vio->bio;
+ struct vio *prev_vio, *next_vio;
+ struct vdo *vdo = vio->completion.vdo;
+ struct bio_queue_data *bio_queue_data =
+ &vdo->io_submitter->bio_queue_data[vio->bio_zone];
+
+ bio->bi_next = NULL;
+ bio_list_init(&vio->bios_merged);
+ bio_list_add(&vio->bios_merged, bio);
+
+ mutex_lock(&bio_queue_data->lock);
+ prev_vio = get_mergeable_locked(bio_queue_data->map, vio, true);
+ next_vio = get_mergeable_locked(bio_queue_data->map, vio, false);
+ if (prev_vio == next_vio)
+ next_vio = NULL;
+
+ if ((prev_vio == NULL) && (next_vio == NULL)) {
+ /* no merge. just add to bio_queue */
+ merged = false;
+ result = vdo_int_map_put(bio_queue_data->map,
+ bio->bi_iter.bi_sector,
+ vio, true, NULL);
+ } else if (next_vio == NULL) {
+ /* Only prev. merge to prev's tail */
+ result = merge_to_prev_tail(bio_queue_data->map, vio, prev_vio);
+ } else {
+ /* Next only, or both prev and next (gap fill). merge to next's head */
+ result = merge_to_next_head(bio_queue_data->map, vio, next_vio);
+ }
+ mutex_unlock(&bio_queue_data->lock);
+
+ /* We don't care about failure of int_map_put in this case. */
+ VDO_ASSERT_LOG_ONLY(result == VDO_SUCCESS, "bio map insertion succeeds");
+ return merged;
+}
+
+/**
+ * vdo_submit_data_vio() - Submit I/O for a data_vio.
+ * @data_vio: the data_vio for which to issue I/O.
+ *
+ * If possible, this I/O will be merged with other pending I/Os, in which case nothing further is
+ * launched here. Otherwise, the data_vio is launched to its bio zone with submit_data_vio() as
+ * the callback.
+ */
+void vdo_submit_data_vio(struct data_vio *data_vio)
+{
+	bool merged = try_bio_map_merge(&data_vio->vio);
+
+	if (!merged)
+		launch_data_vio_bio_zone_callback(data_vio, submit_data_vio);
+}
+
+/**
+ * __submit_metadata_vio() - Submit I/O for a metadata vio.
+ * @vio: the vio for which to issue I/O
+ * @physical: the physical block number to read or write
+ * @callback: the bio endio function which will be called after the I/O completes
+ * @error_handler: the handler for submission or I/O errors (may be NULL)
+ * @operation: the type of I/O to perform
+ * @data: the buffer to read or write (may be NULL)
+ *
+ * The vio is enqueued on a vdo bio queue so that bio submission (which may block) does not block
+ * other vdo threads.
+ *
+ * That the error handler will run on the correct thread is only true so long as the thread calling
+ * this function, and the thread set in the endio callback are the same, as well as the fact that
+ * no error can occur on the bio queue. Currently this is true for all callers, but additional care
+ * will be needed if this ever changes.
+ *
+ * REQ_META is always added to the operation. If vio_reset_bio() fails, the vio is continued with
+ * that error instead of being enqueued. Otherwise the completion is launched on the vio's bio
+ * zone thread with vdo_submit_vio() as its callback, at the priority given by
+ * get_metadata_priority().
+ */
+void __submit_metadata_vio(struct vio *vio, physical_block_number_t physical,
+ bio_end_io_t callback, vdo_action_fn error_handler,
+ blk_opf_t operation, char *data)
+{
+ int result;
+ struct vdo_completion *completion = &vio->completion;
+ const struct admin_state_code *code = vdo_get_admin_state(completion->vdo);
+
+
+ VDO_ASSERT_LOG_ONLY(!code->quiescent, "I/O not allowed in state %s", code->name);
+ VDO_ASSERT_LOG_ONLY(vio->bio->bi_next == NULL, "metadata bio has no next bio");
+
+ vdo_reset_completion(completion);
+ completion->error_handler = error_handler;
+ result = vio_reset_bio(vio, data, callback, operation | REQ_META, physical);
+ if (result != VDO_SUCCESS) {
+ continue_vio(vio, result);
+ return;
+ }
+
+ vdo_set_completion_callback(completion, vdo_submit_vio,
+ get_vio_bio_zone_thread_id(vio));
+ vdo_launch_completion_with_priority(completion, get_metadata_priority(vio));
+}
+
+/**
+ * vdo_make_io_submitter() - Create an io_submitter structure.
+ * @thread_count: Number of bio-submission threads to set up.
+ * @rotation_interval: Interval to use when rotating between bio-submission threads when enqueuing
+ * completions.
+ * @max_requests_active: Number of bios for merge tracking. Each thread's map is created with
+ * room for twice this many entries.
+ * @vdo: The vdo which will use this submitter.
+ * @io_submitter_ptr: pointer to hold the new data structure.
+ *
+ * On failure, the bio queues already set up are finished, the partially constructed submitter is
+ * freed, and *io_submitter_ptr is left unmodified.
+ *
+ * Return: VDO_SUCCESS or an error.
+ */
+int vdo_make_io_submitter(unsigned int thread_count, unsigned int rotation_interval,
+ unsigned int max_requests_active, struct vdo *vdo,
+ struct io_submitter **io_submitter_ptr)
+{
+ unsigned int i;
+ struct io_submitter *io_submitter;
+ int result;
+
+ result = vdo_allocate_extended(struct io_submitter, thread_count,
+ struct bio_queue_data, "bio submission data",
+ &io_submitter);
+ if (result != VDO_SUCCESS)
+ return result;
+
+ io_submitter->bio_queue_rotation_interval = rotation_interval;
+
+ /* Setup for each bio-submission work queue */
+ for (i = 0; i < thread_count; i++) {
+ struct bio_queue_data *bio_queue_data = &io_submitter->bio_queue_data[i];
+
+ mutex_init(&bio_queue_data->lock);
+ /*
+ * One I/O operation per request, but both first & last sector numbers.
+ *
+ * If requests are assigned to threads round-robin, they should be distributed
+ * quite evenly. But if they're assigned based on PBN, things can sometimes be very
+ * uneven. So for now, we'll assume that all requests *may* wind up on one thread,
+ * and thus all in the same map.
+ */
+ result = vdo_int_map_create(max_requests_active * 2,
+ &bio_queue_data->map);
+ if (result != VDO_SUCCESS) {
+ /*
+ * Clean up the partially initialized bio-queue entirely and indicate that
+ * initialization failed.
+ */
+ vdo_log_error("bio map initialization failed %d", result);
+ vdo_cleanup_io_submitter(io_submitter);
+ vdo_free_io_submitter(io_submitter);
+ return result;
+ }
+
+ bio_queue_data->queue_number = i;
+ result = vdo_make_thread(vdo, vdo->thread_config.bio_threads[i],
+ &bio_queue_type, 1, (void **) &bio_queue_data);
+ if (result != VDO_SUCCESS) {
+ /*
+ * Clean up the partially initialized bio-queue entirely and indicate that
+ * initialization failed.
+ */
+ vdo_int_map_free(vdo_forget(bio_queue_data->map));
+ vdo_log_error("bio queue initialization failed %d", result);
+ vdo_cleanup_io_submitter(io_submitter);
+ vdo_free_io_submitter(io_submitter);
+ return result;
+ }
+
+ bio_queue_data->queue = vdo->threads[vdo->thread_config.bio_threads[i]].queue;
+ io_submitter->num_bio_queues_used++;
+ }
+
+ *io_submitter_ptr = io_submitter;
+
+ return VDO_SUCCESS;
+}
+
+/**
+ * vdo_cleanup_io_submitter() - Finish the work queue of every bio queue that was set up.
+ * @io_submitter: The I/O submitter data to tear down (may be NULL).
+ *
+ * The queues are finished in the reverse of the order in which they were set up. Nothing is
+ * freed here.
+ */
+void vdo_cleanup_io_submitter(struct io_submitter *io_submitter)
+{
+	unsigned int remaining;
+
+	if (io_submitter == NULL)
+		return;
+
+	for (remaining = io_submitter->num_bio_queues_used; remaining > 0; remaining--)
+		vdo_finish_work_queue(io_submitter->bio_queue_data[remaining - 1].queue);
+}
+
+/**
+ * vdo_free_io_submitter() - Free the io_submitter fields and structure as needed.
+ * @io_submitter: The I/O submitter data to destroy (may be NULL).
+ *
+ * This must be called after vdo_cleanup_io_submitter(). It is used to release resources late in
+ * the shutdown process to avoid or reduce the chance of race conditions.
+ */
+void vdo_free_io_submitter(struct io_submitter *io_submitter)
+{
+	if (io_submitter == NULL)
+		return;
+
+	/* Release the queues last-to-first, shrinking the used count as each one goes. */
+	while (io_submitter->num_bio_queues_used > 0) {
+		struct bio_queue_data *queue_data;
+
+		io_submitter->num_bio_queues_used--;
+		queue_data = &io_submitter->bio_queue_data[io_submitter->num_bio_queues_used];
+
+		/* vdo_destroy() will free the work queue, so just give up our reference to it. */
+		vdo_forget(queue_data->queue);
+		vdo_int_map_free(vdo_forget(queue_data->map));
+	}
+
+	vdo_free(io_submitter);
+}
diff --git a/drivers/md/dm-vdo/io-submitter.h b/drivers/md/dm-vdo/io-submitter.h
new file mode 100644
index 000000000000..80748699496f
--- /dev/null
+++ b/drivers/md/dm-vdo/io-submitter.h
@@ -0,0 +1,47 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#ifndef VDO_IO_SUBMITTER_H
+#define VDO_IO_SUBMITTER_H
+
+#include <linux/bio.h>
+
+#include "types.h"
+
+struct io_submitter;
+
+int vdo_make_io_submitter(unsigned int thread_count, unsigned int rotation_interval,
+ unsigned int max_requests_active, struct vdo *vdo,
+ struct io_submitter **io_submitter);
+
+void vdo_cleanup_io_submitter(struct io_submitter *io_submitter);
+
+void vdo_free_io_submitter(struct io_submitter *io_submitter);
+
+void vdo_submit_vio(struct vdo_completion *completion);
+
+void vdo_submit_data_vio(struct data_vio *data_vio);
+
+void __submit_metadata_vio(struct vio *vio, physical_block_number_t physical,
+ bio_end_io_t callback, vdo_action_fn error_handler,
+ blk_opf_t operation, char *data);
+
+/* Submit I/O for a metadata vio, reading into or writing from the vio's own data buffer. */
+static inline void vdo_submit_metadata_vio(struct vio *vio, physical_block_number_t physical,
+					   bio_end_io_t callback, vdo_action_fn error_handler,
+					   blk_opf_t operation)
+{
+	char *buffer = vio->data;
+
+	__submit_metadata_vio(vio, physical, callback, error_handler, operation, buffer);
+}
+
+/* Submit a flush as a write with REQ_PREFLUSH set, block number 0, and no data buffer. */
+static inline void vdo_submit_flush_vio(struct vio *vio, bio_end_io_t callback,
+					vdo_action_fn error_handler)
+{
+	/* FIXME: Can we just use REQ_OP_FLUSH? */
+	const blk_opf_t flush_operation = REQ_OP_WRITE | REQ_PREFLUSH;
+
+	__submit_metadata_vio(vio, 0, callback, error_handler, flush_operation, NULL);
+}
+
+#endif /* VDO_IO_SUBMITTER_H */
diff --git a/drivers/md/dm-vdo/logger.c b/drivers/md/dm-vdo/logger.c
new file mode 100644
index 000000000000..3f7dc2cb6b98
--- /dev/null
+++ b/drivers/md/dm-vdo/logger.c
@@ -0,0 +1,239 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#include "logger.h"
+
+#include <asm/current.h>
+#include <linux/delay.h>
+#include <linux/hardirq.h>
+#include <linux/module.h>
+#include <linux/printk.h>
+#include <linux/sched.h>
+
+#include "errors.h"
+#include "thread-device.h"
+#include "thread-utils.h"
+
+/* The current log level; messages with a numerically greater priority are not emitted. */
+int vdo_log_level = VDO_LOG_DEFAULT;
+
+/*
+ * Get the current log level. A stored level above VDO_LOG_MAX is replaced with VDO_LOG_DEFAULT
+ * before being returned.
+ */
+int vdo_get_log_level(void)
+{
+	int level = READ_ONCE(vdo_log_level);
+
+	if (likely(level <= VDO_LOG_MAX))
+		return level;
+
+	WRITE_ONCE(vdo_log_level, VDO_LOG_DEFAULT);
+	return VDO_LOG_DEFAULT;
+}
+
+/*
+ * Get a short tag for the kind of interrupt context the caller is in: "NMI", "HI" when in_irq()
+ * is true, "SI" when in_softirq() is true, or "INTR" when none of those checks match.
+ */
+static const char *get_current_interrupt_type(void)
+{
+	const char *type = "INTR";
+
+	if (in_nmi())
+		type = "NMI";
+	else if (in_irq())
+		type = "HI";
+	else if (in_softirq())
+		type = "SI";
+
+	return type;
+}
+
+/**
+ * emit_log_message_to_kernel() - Emit a log message to the kernel at the specified priority.
+ *
+ * @priority: The priority at which to log the message
+ * @fmt: The format string of the message
+ *
+ * Messages whose priority is numerically greater than the current log level are dropped. EMERG,
+ * ALERT, and CRIT messages are all emitted with pr_crit(), and NOTICE and INFO messages with
+ * pr_info(); any priority that is not one of the VDO_LOG_* levels is emitted at KERN_DEFAULT.
+ */
+static void emit_log_message_to_kernel(int priority, const char *fmt, ...)
+{
+ va_list args;
+ struct va_format vaf;
+
+ if (priority > vdo_get_log_level())
+ return;
+
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ switch (priority) {
+ case VDO_LOG_EMERG:
+ case VDO_LOG_ALERT:
+ case VDO_LOG_CRIT:
+ pr_crit("%pV", &vaf);
+ break;
+ case VDO_LOG_ERR:
+ pr_err("%pV", &vaf);
+ break;
+ case VDO_LOG_WARNING:
+ pr_warn("%pV", &vaf);
+ break;
+ case VDO_LOG_NOTICE:
+ case VDO_LOG_INFO:
+ pr_info("%pV", &vaf);
+ break;
+ case VDO_LOG_DEBUG:
+ pr_debug("%pV", &vaf);
+ break;
+ default:
+ printk(KERN_DEFAULT "%pV", &vaf);
+ break;
+ }
+
+ va_end(args);
+}
+
+/**
+ * emit_log_message() - Emit a log message to the kernel log in a format suited to the current
+ * thread context.
+ *
+ * Context info formats:
+ *
+ * interrupt: uds[NMI]: blah
+ * kvdo thread: kvdo12:foobarQ: blah
+ * thread w/device id: kvdo12:myprog: blah
+ * other thread: uds: myprog: blah
+ *
+ * Fields: module name, interrupt level, process name, device ID.
+ *
+ * The context is classified by checking, in this order: whether in_interrupt() is true, whether
+ * vdo_get_thread_device_id() returns a non-negative id, and whether the current task is a kernel
+ * thread whose name begins with the module name. The first check that matches selects the format;
+ * if none match, both the module name and the process name are included.
+ *
+ * @priority: the priority at which to log the message
+ * @module: The name of the module doing the logging
+ * @prefix: The prefix of the log message
+ * @vaf1: The first message format descriptor
+ * @vaf2: The second message format descriptor
+ */
+static void emit_log_message(int priority, const char *module, const char *prefix,
+ const struct va_format *vaf1, const struct va_format *vaf2)
+{
+ int device_instance;
+
+ /*
+ * In interrupt context, identify the interrupt type and module. Ignore the process/thread
+ * since it could be anything.
+ */
+ if (in_interrupt()) {
+ const char *type = get_current_interrupt_type();
+
+ emit_log_message_to_kernel(priority, "%s[%s]: %s%pV%pV\n", module, type,
+ prefix, vaf1, vaf2);
+ return;
+ }
+
+ /* Not at interrupt level; we have a process we can look at, and might have a device ID. */
+ device_instance = vdo_get_thread_device_id();
+ if (device_instance >= 0) {
+ emit_log_message_to_kernel(priority, "%s%u:%s: %s%pV%pV\n", module,
+ device_instance, current->comm, prefix, vaf1,
+ vaf2);
+ return;
+ }
+
+ /*
+ * If it's a kernel thread and the module name is a prefix of its name, assume it is ours
+ * and only identify the thread.
+ */
+ if (((current->flags & PF_KTHREAD) != 0) &&
+ (strncmp(module, current->comm, strlen(module)) == 0)) {
+ emit_log_message_to_kernel(priority, "%s: %s%pV%pV\n", current->comm,
+ prefix, vaf1, vaf2);
+ return;
+ }
+
+ /* Identify the module and the process. */
+ emit_log_message_to_kernel(priority, "%s: %s: %s%pV%pV\n", module, current->comm,
+ prefix, vaf1, vaf2);
+}
+
+/*
+ * vdo_log_embedded_message() - Log a message embedded within another message.
+ * @priority: the priority at which to log the message
+ * @module: the name of the module doing the logging, or NULL to use VDO_LOGGING_MODULE_NAME
+ * @prefix: optional string prefix to message, may be NULL
+ * @fmt1: format of message first part (required)
+ * @args1: arguments for message first part (required)
+ * @fmt2: format of message second part
+ * @...: arguments for message second part
+ *
+ * The second part is emitted immediately after the first part, on the same log line.
+ */
+void vdo_log_embedded_message(int priority, const char *module, const char *prefix,
+ const char *fmt1, va_list args1, const char *fmt2, ...)
+{
+ va_list args1_copy;
+ va_list args2;
+ struct va_format vaf1, vaf2;
+
+ va_start(args2, fmt2);
+
+ if (module == NULL)
+ module = VDO_LOGGING_MODULE_NAME;
+
+ if (prefix == NULL)
+ prefix = "";
+
+ /*
+ * It is implementation dependent whether va_list is defined as an array type that decays
+ * to a pointer when passed as an argument. Copy args1 with va_copy so that vaf1 gets a
+ * proper va_list pointer irrespective of how va_list is defined. args2 is a local va_list,
+ * so its address can be taken directly.
+ */
+ va_copy(args1_copy, args1);
+ vaf1.fmt = fmt1;
+ vaf1.va = &args1_copy;
+
+ vaf2.fmt = fmt2;
+ vaf2.va = &args2;
+
+ emit_log_message(priority, module, prefix, &vaf1, &vaf2);
+
+ va_end(args1_copy);
+ va_end(args2);
+}
+
+/*
+ * Log a formatted message followed by ": <error string> (<errnum>)", where the error string is
+ * obtained from uds_string_error(). Returns errnum so callers can log and return in one step.
+ */
+int vdo_vlog_strerror(int priority, int errnum, const char *module, const char *format,
+		      va_list args)
+{
+	char errbuf[VDO_MAX_ERROR_MESSAGE_SIZE];
+
+	vdo_log_embedded_message(priority, module, NULL, format, args, ": %s (%d)",
+				 uds_string_error(errnum, errbuf, sizeof(errbuf)), errnum);
+	return errnum;
+}
+
+/* The variadic form of vdo_vlog_strerror(); returns errnum. */
+int __vdo_log_strerror(int priority, int errnum, const char *module, const char *format, ...)
+{
+	va_list format_args;
+
+	va_start(format_args, format);
+	(void) vdo_vlog_strerror(priority, errnum, module, format, format_args);
+	va_end(format_args);
+
+	return errnum;
+}
+
+/* Dump the current stack to the kernel log, unless the priority exceeds the log level. */
+void vdo_log_backtrace(int priority)
+{
+	if (priority <= vdo_get_log_level())
+		dump_stack();
+}
+
+/* Log a formatted message with no prefix and an empty second part. */
+void __vdo_log_message(int priority, const char *module, const char *format, ...)
+{
+	va_list format_args;
+
+	va_start(format_args, format);
+	vdo_log_embedded_message(priority, module, NULL, format, format_args, "%s", "");
+	va_end(format_args);
+}
+
+/*
+ * Give the log buffers a chance to be flushed, lest they be overrun, by sleeping or delaying for
+ * a few milliseconds.
+ */
+void vdo_pause_for_logger(void)
+{
+	/* fsleep() takes its duration in microseconds. */
+	const unsigned long pause_usecs = 4000;
+
+	fsleep(pause_usecs);
+}
diff --git a/drivers/md/dm-vdo/logger.h b/drivers/md/dm-vdo/logger.h
new file mode 100644
index 000000000000..ae6ad691c027
--- /dev/null
+++ b/drivers/md/dm-vdo/logger.h
@@ -0,0 +1,100 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#ifndef VDO_LOGGER_H
+#define VDO_LOGGER_H
+
+#include <linux/kern_levels.h>
+#include <linux/module.h>
+#include <linux/ratelimit.h>
+#include <linux/device-mapper.h>
+
+/* Custom logging utilities for UDS */
+
+enum {
+ VDO_LOG_EMERG = LOGLEVEL_EMERG,
+ VDO_LOG_ALERT = LOGLEVEL_ALERT,
+ VDO_LOG_CRIT = LOGLEVEL_CRIT,
+ VDO_LOG_ERR = LOGLEVEL_ERR,
+ VDO_LOG_WARNING = LOGLEVEL_WARNING,
+ VDO_LOG_NOTICE = LOGLEVEL_NOTICE,
+ VDO_LOG_INFO = LOGLEVEL_INFO,
+ VDO_LOG_DEBUG = LOGLEVEL_DEBUG,
+
+ VDO_LOG_MAX = VDO_LOG_DEBUG, /* Levels above this are reset by vdo_get_log_level() */
+ VDO_LOG_DEFAULT = VDO_LOG_INFO, /* Initial value of vdo_log_level */
+};
+
+extern int vdo_log_level;
+
+#define DM_MSG_PREFIX "vdo"
+#define VDO_LOGGING_MODULE_NAME DM_NAME ": " DM_MSG_PREFIX
+
+/*
+ * Apply a rate limiter to a log method call. Each expansion of the macro declares its own static
+ * ratelimit state, set up with DEFAULT_RATELIMIT_INTERVAL and DEFAULT_RATELIMIT_BURST, and only
+ * invokes log_fn when __ratelimit() allows it.
+ */
+#define vdo_log_ratelimit(log_fn, ...) \
+ do { \
+ static DEFINE_RATELIMIT_STATE(_rs, \
+ DEFAULT_RATELIMIT_INTERVAL, \
+ DEFAULT_RATELIMIT_BURST); \
+ if (__ratelimit(&_rs)) { \
+ log_fn(__VA_ARGS__); \
+ } \
+ } while (0)
+
+int vdo_get_log_level(void);
+
+void vdo_log_embedded_message(int priority, const char *module, const char *prefix,
+ const char *fmt1, va_list args1, const char *fmt2, ...)
+ __printf(4, 0) __printf(6, 7);
+
+void vdo_log_backtrace(int priority);
+
+/* All log functions will preserve the caller's value of errno. */
+
+#define vdo_log_strerror(priority, errnum, ...) \
+ __vdo_log_strerror(priority, errnum, VDO_LOGGING_MODULE_NAME, __VA_ARGS__)
+
+int __vdo_log_strerror(int priority, int errnum, const char *module,
+ const char *format, ...)
+ __printf(4, 5);
+
+int vdo_vlog_strerror(int priority, int errnum, const char *module, const char *format,
+ va_list args)
+ __printf(4, 0);
+
+/*
+ * Log a message followed by the string associated with the errnum and the errnum itself. Each
+ * macro below fixes the priority and evaluates to the errnum.
+ */
+#define vdo_log_error_strerror(errnum, ...) \
+ vdo_log_strerror(VDO_LOG_ERR, errnum, __VA_ARGS__)
+
+#define vdo_log_debug_strerror(errnum, ...) \
+ vdo_log_strerror(VDO_LOG_DEBUG, errnum, __VA_ARGS__)
+
+#define vdo_log_info_strerror(errnum, ...) \
+ vdo_log_strerror(VDO_LOG_INFO, errnum, __VA_ARGS__)
+
+#define vdo_log_warning_strerror(errnum, ...) \
+ vdo_log_strerror(VDO_LOG_WARNING, errnum, __VA_ARGS__)
+
+#define vdo_log_fatal_strerror(errnum, ...) \
+ vdo_log_strerror(VDO_LOG_CRIT, errnum, __VA_ARGS__)
+
+#define vdo_log_message(priority, ...) \
+ __vdo_log_message(priority, VDO_LOGGING_MODULE_NAME, __VA_ARGS__)
+
+void __vdo_log_message(int priority, const char *module, const char *format, ...)
+ __printf(3, 4);
+
+#define vdo_log_debug(...) vdo_log_message(VDO_LOG_DEBUG, __VA_ARGS__)
+
+#define vdo_log_info(...) vdo_log_message(VDO_LOG_INFO, __VA_ARGS__)
+
+#define vdo_log_warning(...) vdo_log_message(VDO_LOG_WARNING, __VA_ARGS__)
+
+#define vdo_log_error(...) vdo_log_message(VDO_LOG_ERR, __VA_ARGS__)
+
+#define vdo_log_fatal(...) vdo_log_message(VDO_LOG_CRIT, __VA_ARGS__)
+
+void vdo_pause_for_logger(void);
+#endif /* VDO_LOGGER_H */
diff --git a/drivers/md/dm-vdo/logical-zone.c b/drivers/md/dm-vdo/logical-zone.c
new file mode 100644
index 000000000000..026f031ffc9e
--- /dev/null
+++ b/drivers/md/dm-vdo/logical-zone.c
@@ -0,0 +1,373 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#include "logical-zone.h"
+
+#include "logger.h"
+#include "memory-alloc.h"
+#include "permassert.h"
+#include "string-utils.h"
+
+#include "action-manager.h"
+#include "admin-state.h"
+#include "block-map.h"
+#include "completion.h"
+#include "constants.h"
+#include "data-vio.h"
+#include "flush.h"
+#include "int-map.h"
+#include "physical-zone.h"
+#include "vdo.h"
+
+#define ALLOCATIONS_PER_ZONE 128
+
+/**
+ * as_logical_zone() - Convert a generic vdo_completion to a logical_zone.
+ * @completion: The completion to convert.
+ *
+ * The completion's type is asserted to be VDO_GENERATION_FLUSHED_COMPLETION, and the completion
+ * must be the one embedded in a logical_zone since the zone is recovered with container_of().
+ *
+ * Return: The completion as a logical_zone.
+ */
+static struct logical_zone *as_logical_zone(struct vdo_completion *completion)
+{
+ vdo_assert_completion_type(completion, VDO_GENERATION_FLUSHED_COMPLETION);
+ return container_of(completion, struct logical_zone, completion);
+}
+
+/*
+ * get_thread_id_for_zone() - Implements vdo_zone_thread_getter_fn.
+ *
+ * The context is the logical_zones; the result is the thread id of the numbered zone.
+ */
+static thread_id_t get_thread_id_for_zone(void *context, zone_count_t zone_number)
+{
+	const struct logical_zones *zones = context;
+	const struct logical_zone *zone = &zones->zones[zone_number];
+
+	return zone->thread_id;
+}
+
+/**
+ * initialize_zone() - Initialize a logical zone.
+ * @zones: The logical_zones to which this zone belongs.
+ * @zone_number: The logical_zone's index.
+ *
+ * Creates the zone's lbn_operations map, links the zone to its successor (if it is not the last
+ * zone), associates it with its block map zone and an allocation zone, and finally makes the
+ * default thread for the zone's thread id.
+ *
+ * Return: VDO_SUCCESS or an error from creating the map or making the thread.
+ */
+static int initialize_zone(struct logical_zones *zones, zone_count_t zone_number)
+{
+	struct vdo *vdo = zones->vdo;
+	struct logical_zone *new_zone = &zones->zones[zone_number];
+	zone_count_t allocation_zone_number;
+	int status;
+
+	status = vdo_int_map_create(VDO_LOCK_MAP_CAPACITY, &new_zone->lbn_operations);
+	if (status != VDO_SUCCESS)
+		return status;
+
+	/* Identity and ownership. */
+	new_zone->zones = zones;
+	new_zone->zone_number = zone_number;
+	new_zone->thread_id = vdo->thread_config.logical_threads[zone_number];
+
+	/* Every zone but the last points at the zone after it. */
+	if (zone_number < vdo->thread_config.logical_zone_count - 1)
+		new_zone->next = &zones->zones[zone_number + 1];
+
+	/* Associated block map and physical (allocation) zones. */
+	new_zone->block_map_zone = &vdo->block_map->zones[zone_number];
+	allocation_zone_number =
+		new_zone->thread_id % vdo->thread_config.physical_zone_count;
+	new_zone->allocation_zone = &vdo->physical_zones->zones[allocation_zone_number];
+
+	/* Embedded state. */
+	INIT_LIST_HEAD(&new_zone->write_vios);
+	vdo_initialize_completion(&new_zone->completion, vdo,
+				  VDO_GENERATION_FLUSHED_COMPLETION);
+	vdo_set_admin_state_code(&new_zone->state, VDO_ADMIN_STATE_NORMAL_OPERATION);
+
+	return vdo_make_default_thread(vdo, new_zone->thread_id);
+}
+
+/**
+ * vdo_make_logical_zones() - Create a set of logical zones.
+ * @vdo: The vdo to which the zones will belong.
+ * @zones_ptr: A pointer to hold the new zones.
+ *
+ * If the vdo's thread configuration has no logical zones, this succeeds without allocating
+ * anything and without storing to zones_ptr. If initializing any zone or making the action
+ * manager fails, everything allocated so far is released with vdo_free_logical_zones() and
+ * zones_ptr is left unmodified.
+ *
+ * Return: VDO_SUCCESS or an error code.
+ */
+int vdo_make_logical_zones(struct vdo *vdo, struct logical_zones **zones_ptr)
+{
+ struct logical_zones *zones;
+ int result;
+ zone_count_t zone;
+ zone_count_t zone_count = vdo->thread_config.logical_zone_count;
+
+ if (zone_count == 0)
+ return VDO_SUCCESS;
+
+ result = vdo_allocate_extended(struct logical_zones, zone_count,
+ struct logical_zone, __func__, &zones);
+ if (result != VDO_SUCCESS)
+ return result;
+
+ zones->vdo = vdo;
+ zones->zone_count = zone_count;
+ for (zone = 0; zone < zone_count; zone++) {
+ result = initialize_zone(zones, zone);
+ if (result != VDO_SUCCESS) {
+ vdo_free_logical_zones(zones);
+ return result;
+ }
+ }
+
+ result = vdo_make_action_manager(zones->zone_count, get_thread_id_for_zone,
+ vdo->thread_config.admin_thread, zones, NULL,
+ vdo, &zones->manager);
+ if (result != VDO_SUCCESS) {
+ vdo_free_logical_zones(zones);
+ return result;
+ }
+
+ *zones_ptr = zones;
+ return VDO_SUCCESS;
+}
+
+/**
+ * vdo_free_logical_zones() - Free a set of logical zones.
+ * @zones: The set of zones to free (may be NULL).
+ *
+ * Releases the action manager, then each zone's lbn_operations map, then the zones themselves.
+ */
+void vdo_free_logical_zones(struct logical_zones *zones)
+{
+	zone_count_t zone_number;
+
+	if (zones == NULL)
+		return;
+
+	vdo_free(vdo_forget(zones->manager));
+
+	for (zone_number = 0; zone_number < zones->zone_count; zone_number++) {
+		struct logical_zone *zone = &zones->zones[zone_number];
+
+		vdo_int_map_free(vdo_forget(zone->lbn_operations));
+	}
+
+	vdo_free(zones);
+}
+
+static inline void assert_on_zone_thread(struct logical_zone *zone, const char *what)
+{ /* Assert that the caller is running on the zone's thread; what names the calling function. */
+ VDO_ASSERT_LOG_ONLY((vdo_get_callback_thread_id() == zone->thread_id),
+ "%s() called on correct thread", what);
+}
+
+/**
+ * check_for_drain_complete() - Finish draining a zone if it has nothing left outstanding.
+ * @zone: The zone to check.
+ *
+ * The drain is finished only when the zone's state is draining, the zone is not notifying, and
+ * its list of write vios is empty.
+ */
+static void check_for_drain_complete(struct logical_zone *zone)
+{
+	if (vdo_is_state_draining(&zone->state) && !zone->notifying &&
+	    list_empty(&zone->write_vios))
+		vdo_finish_draining(&zone->state);
+}
+
+/**
+ * initiate_drain() - Initiate a drain.
+ * @state: The admin state embedded in the logical zone to drain.
+ *
+ * Implements vdo_admin_initiator_fn.
+ */
+static void initiate_drain(struct admin_state *state)
+{
+	struct logical_zone *zone = container_of(state, struct logical_zone, state);
+
+	check_for_drain_complete(zone);
+}
+
+/**
+ * drain_logical_zone() - Drain a logical zone.
+ * @context: The logical_zones containing the zone.
+ * @zone_number: The index of the zone to drain.
+ * @parent: The completion handed to vdo_start_draining() as the drain's parent.
+ *
+ * Implements vdo_zone_action_fn.
+ */
+static void drain_logical_zone(void *context, zone_count_t zone_number,
+			       struct vdo_completion *parent)
+{
+	struct logical_zones *zones = context;
+	struct logical_zone *zone = &zones->zones[zone_number];
+
+	vdo_start_draining(&zone->state, vdo_get_current_manager_operation(zones->manager),
+			   parent, initiate_drain);
+}
+
+/**
+ * vdo_drain_logical_zones() - Drain a set of logical zones.
+ * @zones: The logical zones to drain.
+ * @operation: The admin state code of the drain operation to schedule.
+ * @parent: The completion handed to the action manager as the operation's parent.
+ *
+ * Schedules drain_logical_zone() as the per-zone action on the zones' action manager.
+ */
+void vdo_drain_logical_zones(struct logical_zones *zones,
+			     const struct admin_state_code *operation,
+			     struct vdo_completion *parent)
+{
+	vdo_schedule_operation(zones->manager,
+			       operation,
+			       NULL,
+			       drain_logical_zone,
+			       NULL,
+			       parent);
+}
+
+/**
+ * resume_logical_zone() - Resume a logical zone.
+ * @context: The logical_zones containing the zone.
+ * @zone_number: The index of the zone to resume.
+ * @parent: The completion which receives the result of the resume attempt.
+ *
+ * Implements vdo_zone_action_fn.
+ */
+static void resume_logical_zone(void *context, zone_count_t zone_number,
+				struct vdo_completion *parent)
+{
+	struct logical_zones *zones = context;
+	int result = vdo_resume_if_quiescent(&zones->zones[zone_number].state);
+
+	vdo_fail_completion(parent, result);
+}
+
+/**
+ * vdo_resume_logical_zones() - Resume a set of logical zones.
+ * @zones: The logical zones to resume.
+ * @parent: The object to notify when the zones have resumed.
+ *
+ * Schedules a VDO_ADMIN_STATE_RESUMING operation on the zones' action manager, with
+ * resume_logical_zone() as the per-zone action.
+ */
+void vdo_resume_logical_zones(struct logical_zones *zones, struct vdo_completion *parent)
+{
+ vdo_schedule_operation(zones->manager, VDO_ADMIN_STATE_RESUMING, NULL,
+ resume_logical_zone, NULL, parent);
+}
+
+/**
+ * update_oldest_active_generation() - Update the oldest active generation.
+ * @zone: The zone.
+ *
+ * The oldest active generation is that of the first data_vio on the zone's write list, or the
+ * zone's current flush generation if the list is empty.
+ *
+ * Return: true if the oldest active generation has changed.
+ */
+static bool update_oldest_active_generation(struct logical_zone *zone)
+{
+	sequence_number_t oldest = zone->flush_generation;
+	struct data_vio *first;
+
+	first = list_first_entry_or_null(&zone->write_vios, struct data_vio, write_entry);
+	if (first != NULL)
+		oldest = first->flush_generation;
+
+	if (oldest != zone->oldest_active_generation) {
+		WRITE_ONCE(zone->oldest_active_generation, oldest);
+		return true;
+	}
+
+	return false;
+}
+
+/**
+ * vdo_increment_logical_zone_flush_generation() - Increment the flush generation in a logical
+ * zone.
+ * @zone: The logical zone.
+ * @expected_generation: The expected value of the flush generation before the increment.
+ *
+ * Also resets the count of IOs in the current generation and refreshes the zone's oldest active
+ * generation. The caller is asserted to be on the zone's thread.
+ */
+void vdo_increment_logical_zone_flush_generation(struct logical_zone *zone,
+ sequence_number_t expected_generation)
+{
+ assert_on_zone_thread(zone, __func__);
+ VDO_ASSERT_LOG_ONLY((zone->flush_generation == expected_generation),
+ "logical zone %u flush generation %llu should be %llu before increment",
+ zone->zone_number, (unsigned long long) zone->flush_generation,
+ (unsigned long long) expected_generation);
+
+ zone->flush_generation++;
+ /* No IOs have joined the new generation yet. */
+ zone->ios_in_flush_generation = 0;
+ /* With no active writes, the oldest active generation follows the flush generation. */
+ update_oldest_active_generation(zone);
+}
+
+/**
+ * vdo_acquire_flush_generation_lock() - Acquire the shared lock on a flush generation by a write
+ * data_vio.
+ * @data_vio: The data_vio.
+ *
+ * The data_vio records the zone's current flush generation and is appended to the zone's list
+ * of active writes. The caller is asserted to be on the data_vio's logical zone thread.
+ */
+void vdo_acquire_flush_generation_lock(struct data_vio *data_vio)
+{
+ struct logical_zone *zone = data_vio->logical.zone;
+
+ assert_on_zone_thread(zone, __func__);
+ VDO_ASSERT_LOG_ONLY(vdo_is_state_normal(&zone->state), "vdo state is normal");
+
+ data_vio->flush_generation = zone->flush_generation;
+ /* Appending keeps the list ordered from oldest to newest acquisition. */
+ list_add_tail(&data_vio->write_entry, &zone->write_vios);
+ zone->ios_in_flush_generation++;
+}
+
+static void attempt_generation_complete_notification(struct vdo_completion *completion);
+
+/**
+ * notify_flusher() - Notify the flusher that at least one generation no longer has active VIOs.
+ * @completion: The zone completion.
+ *
+ * This callback is registered in attempt_generation_complete_notification(), which launches it
+ * on the flusher's thread. After completing flushes, it relaunches
+ * attempt_generation_complete_notification() on the zone's own thread.
+ */
+static void notify_flusher(struct vdo_completion *completion)
+{
+ struct logical_zone *zone = as_logical_zone(completion);
+
+ vdo_complete_flushes(zone->zones->vdo->flusher);
+ /* Return to the zone thread to check whether another notification is needed. */
+ vdo_launch_completion_callback(completion,
+ attempt_generation_complete_notification,
+ zone->thread_id);
+}
+
+/**
+ * attempt_generation_complete_notification() - Notify the flusher if some generation no
+ *                                              longer has active VIOs.
+ * @completion: The zone completion.
+ *
+ * If the oldest active generation has advanced past the last generation reported, record it and
+ * launch notify_flusher() on the flusher's thread. Otherwise clear the notifying flag and check
+ * whether a drain can now finish.
+ */
+static void attempt_generation_complete_notification(struct vdo_completion *completion)
+{
+	struct logical_zone *zone = as_logical_zone(completion);
+
+	assert_on_zone_thread(zone, __func__);
+	if (zone->oldest_active_generation > zone->notification_generation) {
+		/* Mark the notification in progress before handing off to the flusher thread. */
+		zone->notifying = true;
+		zone->notification_generation = zone->oldest_active_generation;
+		vdo_launch_completion_callback(&zone->completion, notify_flusher,
+					       vdo_get_flusher_thread_id(zone->zones->vdo->flusher));
+		return;
+	}
+
+	/* Nothing new to report; a drain may have been waiting on this notification. */
+	zone->notifying = false;
+	check_for_drain_complete(zone);
+}
+
+/**
+ * vdo_release_flush_generation_lock() - Release the shared lock on a flush generation held by a
+ *                                       write data_vio.
+ * @data_vio: The data_vio whose lock is to be released.
+ *
+ * Does nothing if the data_vio holds no flush generation lock. If releasing the lock changes the
+ * oldest generation active in this zone and no notification is already in progress, an attempt
+ * is made to notify the flusher so that any flushes which may now be complete can finish.
+ */
+void vdo_release_flush_generation_lock(struct data_vio *data_vio)
+{
+	struct logical_zone *zone = data_vio->logical.zone;
+
+	assert_on_zone_thread(zone, __func__);
+
+	if (!data_vio_has_flush_generation_lock(data_vio))
+		return;
+
+	list_del_init(&data_vio->write_entry);
+	VDO_ASSERT_LOG_ONLY((zone->oldest_active_generation <= data_vio->flush_generation),
+			    "data_vio releasing lock on generation %llu is not older than oldest active generation %llu",
+			    (unsigned long long) data_vio->flush_generation,
+			    (unsigned long long) zone->oldest_active_generation);
+
+	/* Only a change in the oldest active generation is worth reporting. */
+	if (!update_oldest_active_generation(zone))
+		return;
+
+	/* A notification already in flight re-checks the generations when it returns here. */
+	if (zone->notifying)
+		return;
+
+	attempt_generation_complete_notification(&zone->completion);
+}
+
+/**
+ * vdo_get_next_allocation_zone() - Get the physical zone from which to allocate next.
+ * @zone: The logical zone doing the allocation.
+ *
+ * After ALLOCATIONS_PER_ZONE allocations from one physical zone, the logical zone moves on to
+ * that physical zone's successor.
+ *
+ * Return: The physical zone from which to allocate.
+ */
+struct physical_zone *vdo_get_next_allocation_zone(struct logical_zone *zone)
+{
+	if (zone->allocation_count != ALLOCATIONS_PER_ZONE) {
+		zone->allocation_count++;
+		return zone->allocation_zone;
+	}
+
+	/* The current zone has had its share; this call is the first from the next one. */
+	zone->allocation_zone = zone->allocation_zone->next;
+	zone->allocation_count = 1;
+	return zone->allocation_zone;
+}
+
+/**
+ * vdo_dump_logical_zone() - Dump information about a logical zone to the log for debugging.
+ * @zone: The zone to dump
+ *
+ * Context: the information is dumped in a thread-unsafe fashion.
+ */
+void vdo_dump_logical_zone(const struct logical_zone *zone)
+{
+	/* Take one unlocked snapshot of each field; the values may not be mutually consistent. */
+	sequence_number_t flush_generation = READ_ONCE(zone->flush_generation);
+	sequence_number_t oldest_active = READ_ONCE(zone->oldest_active_generation);
+	sequence_number_t notification = READ_ONCE(zone->notification_generation);
+	bool notifying = READ_ONCE(zone->notifying);
+	block_count_t ios = READ_ONCE(zone->ios_in_flush_generation);
+
+	vdo_log_info("logical_zone %u", zone->zone_number);
+	vdo_log_info(" flush_generation=%llu oldest_active_generation=%llu notification_generation=%llu notifying=%s ios_in_flush_generation=%llu",
+		     (unsigned long long) flush_generation,
+		     (unsigned long long) oldest_active,
+		     (unsigned long long) notification,
+		     vdo_bool_to_string(notifying),
+		     (unsigned long long) ios);
+}
diff --git a/drivers/md/dm-vdo/logical-zone.h b/drivers/md/dm-vdo/logical-zone.h
new file mode 100644
index 000000000000..1b666c84a193
--- /dev/null
+++ b/drivers/md/dm-vdo/logical-zone.h
@@ -0,0 +1,89 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#ifndef VDO_LOGICAL_ZONE_H
+#define VDO_LOGICAL_ZONE_H
+
+#include <linux/list.h>
+
+#include "admin-state.h"
+#include "int-map.h"
+#include "types.h"
+
+struct physical_zone;
+
+/*
+ * Per-zone state for a logical zone, including the flush generation bookkeeping for the write
+ * data_vios active in the zone.
+ */
+struct logical_zone {
+ /* The completion for flush notifications */
+ struct vdo_completion completion;
+ /* The owner of this zone */
+ struct logical_zones *zones;
+ /* Which logical zone this is */
+ zone_count_t zone_number;
+ /* The thread id for this zone */
+ thread_id_t thread_id;
+ /* In progress operations keyed by LBN */
+ struct int_map *lbn_operations;
+ /* The logical to physical map */
+ struct block_map_zone *block_map_zone;
+ /* The current flush generation */
+ sequence_number_t flush_generation;
+ /*
+ * The oldest active generation in this zone. This is mutated only on the logical zone
+ * thread but is queried from the flusher thread.
+ */
+ sequence_number_t oldest_active_generation;
+ /* The number of IOs in the current flush generation; reset when the generation advances */
+ block_count_t ios_in_flush_generation;
+ /* The youngest generation of the current notification */
+ sequence_number_t notification_generation;
+ /* Whether a notification is in progress; while set, a drain of the zone cannot finish */
+ bool notifying;
+ /* The queue of active data write VIOs, appended as they acquire a flush generation lock */
+ struct list_head write_vios;
+ /* The administrative state of the zone */
+ struct admin_state state;
+ /* The physical zone from which to allocate */
+ struct physical_zone *allocation_zone;
+ /* The number of allocations done from the current allocation_zone */
+ block_count_t allocation_count;
+ /* The next zone */
+ struct logical_zone *next;
+};
+
+/* The set of logical zones of a vdo, allocated with its zones in a trailing flexible array. */
+struct logical_zones {
+ /* The vdo whose zones these are */
+ struct vdo *vdo;
+ /* The manager for administrative actions */
+ struct action_manager *manager;
+ /* The number of zones */
+ zone_count_t zone_count;
+ /* The logical zones themselves (zone_count entries) */
+ struct logical_zone zones[];
+};
+
+int __must_check vdo_make_logical_zones(struct vdo *vdo,
+					struct logical_zones **zones_ptr);
+
+/* Free a set of logical zones; a NULL set is ignored. */
+void vdo_free_logical_zones(struct logical_zones *zones);
+
+/* The last parameter is named parent, matching the definition and the resume prototype. */
+void vdo_drain_logical_zones(struct logical_zones *zones,
+			     const struct admin_state_code *operation,
+			     struct vdo_completion *parent);
+
+void vdo_resume_logical_zones(struct logical_zones *zones,
+			      struct vdo_completion *parent);
+
+void vdo_increment_logical_zone_flush_generation(struct logical_zone *zone,
+						 sequence_number_t expected_generation);
+
+void vdo_acquire_flush_generation_lock(struct data_vio *data_vio);
+
+void vdo_release_flush_generation_lock(struct data_vio *data_vio);
+
+struct physical_zone * __must_check vdo_get_next_allocation_zone(struct logical_zone *zone);
+
+void vdo_dump_logical_zone(const struct logical_zone *zone);
+
+#endif /* VDO_LOGICAL_ZONE_H */
diff --git a/drivers/md/dm-vdo/memory-alloc.c b/drivers/md/dm-vdo/memory-alloc.c
new file mode 100644
index 000000000000..185f259c7245
--- /dev/null
+++ b/drivers/md/dm-vdo/memory-alloc.c
@@ -0,0 +1,438 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#include <linux/delay.h>
+#include <linux/mm.h>
+#include <linux/sched/mm.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+
+#include "logger.h"
+#include "memory-alloc.h"
+#include "permassert.h"
+
+/*
+ * UDS and VDO keep track of which threads are allowed to allocate memory freely, and which threads
+ * must be careful to not do a memory allocation that does an I/O request. The 'allocating_threads'
+ * thread_registry and its associated methods implement this tracking.
+ */
+static struct thread_registry allocating_threads;
+
+/* Report whether the current thread is found in the allocating_threads registry. */
+static inline bool allocations_allowed(void)
+{
+	if (vdo_lookup_thread(&allocating_threads) == NULL)
+		return false;
+
+	return true;
+}
+
+/*
+ * Register the current thread as an allocating thread.
+ *
+ * The caller may supply the location of a flag indicating whether, at any given point in time,
+ * the threads associated with that flag should be allocating storage. If the flag is false, a
+ * message will be logged.
+ *
+ * When no flag is supplied, a constant "always allowed" flag is registered instead, so the thread
+ * may always allocate storage without complaint.
+ *
+ * @new_thread: registered_thread structure to use for the current thread
+ * @flag_ptr: Location of the allocation-allowed flag, or NULL
+ */
+void vdo_register_allocating_thread(struct registered_thread *new_thread,
+				    const bool *flag_ptr)
+{
+	static const bool allocation_always_allowed = true;
+	const bool *effective_flag = (flag_ptr != NULL) ? flag_ptr : &allocation_always_allowed;
+
+	vdo_register_thread(&allocating_threads, new_thread, effective_flag);
+}
+
+/*
+ * Unregister the current thread as an allocating thread, removing it from the
+ * allocating_threads registry.
+ */
+void vdo_unregister_allocating_thread(void)
+{
+ vdo_unregister_thread(&allocating_threads);
+}
+
+/*
+ * We track how much memory has been allocated and freed. When we unload the module, we log an
+ * error if we have not freed all the memory that we allocated. Nearly all memory allocation and
+ * freeing is done using this module.
+ *
+ * We do not use kernel functions like the kvasprintf() method, which allocate memory indirectly
+ * using kmalloc.
+ *
+ * These data structures and methods are used to track the amount of memory used.
+ */
+
+/*
+ * We allocate very few large objects, and allocation/deallocation isn't done in a
+ * performance-critical stage for us, so a linked list should be fine.
+ */
+struct vmalloc_block_info {
+ /* The address returned by __vmalloc; used as the lookup key when the block is freed */
+ void *ptr;
+ /* The page-aligned size charged to the vmalloc statistics for this block */
+ size_t size;
+ /* The next tracked block, or NULL at the end of the list */
+ struct vmalloc_block_info *next;
+};
+
+/* Module-wide allocation statistics. Updates are made with the lock held. */
+static struct {
+ /* Protects the counters and the vmalloc list */
+ spinlock_t lock;
+ /* The number of tracked kmalloc blocks */
+ size_t kmalloc_blocks;
+ /* The total size of tracked kmalloc blocks, as reported by ksize() */
+ size_t kmalloc_bytes;
+ /* The number of tracked vmalloc blocks */
+ size_t vmalloc_blocks;
+ /* The total page-aligned size of tracked vmalloc blocks */
+ size_t vmalloc_bytes;
+ /* The largest value that kmalloc_bytes + vmalloc_bytes has reached */
+ size_t peak_bytes;
+ /* The head of the singly linked list of tracked vmalloc blocks */
+ struct vmalloc_block_info *vmalloc_list;
+} memory_stats __cacheline_aligned;
+
+/* Raise the recorded peak if current usage exceeds it. Both callers hold memory_stats.lock. */
+static void update_peak_usage(void)
+{
+	const size_t in_use = memory_stats.kmalloc_bytes + memory_stats.vmalloc_bytes;
+
+	if (memory_stats.peak_bytes < in_use)
+		memory_stats.peak_bytes = in_use;
+}
+
+/* Account for a newly allocated kmalloc block of the given size. */
+static void add_kmalloc_block(size_t size)
+{
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&memory_stats.lock, irq_flags);
+	memory_stats.kmalloc_bytes += size;
+	memory_stats.kmalloc_blocks++;
+	update_peak_usage();
+	spin_unlock_irqrestore(&memory_stats.lock, irq_flags);
+}
+
+/* Remove a kmalloc block of the given size from the statistics. */
+static void remove_kmalloc_block(size_t size)
+{
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&memory_stats.lock, irq_flags);
+	memory_stats.kmalloc_bytes -= size;
+	memory_stats.kmalloc_blocks--;
+	spin_unlock_irqrestore(&memory_stats.lock, irq_flags);
+}
+
+/* Push a vmalloc block record onto the tracking list and account for its size. */
+static void add_vmalloc_block(struct vmalloc_block_info *block)
+{
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&memory_stats.lock, irq_flags);
+	memory_stats.vmalloc_bytes += block->size;
+	memory_stats.vmalloc_blocks++;
+	update_peak_usage();
+
+	/* New records go at the head of the list. */
+	block->next = memory_stats.vmalloc_list;
+	memory_stats.vmalloc_list = block;
+	spin_unlock_irqrestore(&memory_stats.lock, irq_flags);
+}
+
+/*
+ * Stop tracking the vmalloc block at the given address: unlink its record, adjust the
+ * statistics, and free the record. If no record matches, a message is logged instead.
+ */
+static void remove_vmalloc_block(void *ptr)
+{
+	struct vmalloc_block_info **link = &memory_stats.vmalloc_list;
+	struct vmalloc_block_info *found;
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&memory_stats.lock, irq_flags);
+	/* Walk the links until one points at the matching record or the list ends. */
+	while (((found = *link) != NULL) && (found->ptr != ptr))
+		link = &found->next;
+
+	if (found != NULL) {
+		*link = found->next;
+		memory_stats.vmalloc_blocks--;
+		memory_stats.vmalloc_bytes -= found->size;
+	}
+
+	spin_unlock_irqrestore(&memory_stats.lock, irq_flags);
+
+	/* The record itself is freed only after the lock has been dropped. */
+	if (found == NULL) {
+		vdo_log_info("attempting to remove ptr %px not found in vmalloc list", ptr);
+		return;
+	}
+
+	vdo_free(found);
+}
+
+/*
+ * Determine whether allocating a memory block should use kmalloc or __vmalloc.
+ *
+ * vmalloc can allocate any integral number of pages.
+ *
+ * kmalloc can allocate any number of bytes up to a configured limit, which defaults to 8 megabytes
+ * on some systems. kmalloc is especially good when memory is being both allocated and freed, and
+ * it does this efficiently in a multi CPU environment.
+ *
+ * kmalloc usually rounds the size of the block up to the next power of two, so when the requested
+ * block is bigger than PAGE_SIZE / 2 bytes, kmalloc will never give you less space than the
+ * corresponding vmalloc allocation. Sometimes vmalloc will use less overhead than kmalloc.
+ *
+ * The advantages of kmalloc do not help out UDS or VDO, because we allocate all our memory up
+ * front and do not free and reallocate it. Sometimes we have problems using kmalloc, because the
+ * Linux memory page map can become so fragmented that kmalloc will not give us a 32KB chunk. We
+ * have used vmalloc as a backup to kmalloc in the past, and a follow-up vmalloc of 32KB will work.
+ * But there is no strong case to be made for using kmalloc over vmalloc for these size chunks.
+ *
+ * The kmalloc/vmalloc boundary is set at 4KB, and kmalloc gets the 4KB requests. There is no
+ * strong reason for favoring either kmalloc or vmalloc for 4KB requests, except that tracking
+ * vmalloc statistics uses a linked list implementation. Using a simple test, this choice of
+ * boundary results in 132 vmalloc calls. Using vmalloc for requests of exactly 4KB results in an
+ * additional 6374 vmalloc calls, which is much less efficient for tracking.
+ *
+ * @size: How many bytes to allocate
+ */
+static inline bool use_kmalloc(size_t size)
+{
+ /* Requests of up to one page (inclusive) qualify for kmalloc; larger ones use __vmalloc. */
+ return size <= PAGE_SIZE;
+}
+
+/*
+ * Allocate memory with kmalloc, retrying once, and record the block in the memory statistics.
+ *
+ * @size: How many bytes to allocate
+ * @gfp_flags: The allocation flags to use
+ *
+ * Return: the allocated memory, or NULL if both attempts failed
+ */
+static void *allocate_with_kmalloc(size_t size, gfp_t gfp_flags)
+{
+	void *p = kmalloc(size, gfp_flags | __GFP_NOWARN);
+
+	if (p == NULL) {
+		/*
+		 * It is possible for kmalloc to fail to allocate memory because there is
+		 * no page available. A short sleep may allow the page reclaimer to
+		 * free a page.
+		 */
+		fsleep(1000);
+		p = kmalloc(size, gfp_flags);
+	}
+
+	if (p != NULL)
+		add_kmalloc_block(ksize(p));
+
+	return p;
+}
+
+/*
+ * Allocate memory with __vmalloc, retrying for a limited time, and record the block in the
+ * memory statistics.
+ *
+ * @size: How many bytes to allocate
+ * @gfp_flags: The allocation flags to use
+ * @start_time: The jiffies value at which the overall allocation attempt began
+ * @caller: The name to report if the tracking record cannot be allocated
+ *
+ * Return: the allocated memory, or NULL on failure
+ */
+static void *allocate_with_vmalloc(size_t size, gfp_t gfp_flags, unsigned long start_time,
+				   const char *caller)
+{
+	struct vmalloc_block_info *block;
+	void *p;
+
+	/* Without a tracking record, the block could not be accounted for; give up. */
+	if (vdo_allocate(1, struct vmalloc_block_info, caller, &block) != VDO_SUCCESS)
+		return NULL;
+
+	/*
+	 * It is possible for __vmalloc to fail to allocate memory because there
+	 * are no pages available. A short sleep may allow the page reclaimer
+	 * to free enough pages for a small allocation.
+	 *
+	 * For larger allocations, the page_alloc code is racing against the page
+	 * reclaimer. If the page reclaimer can stay ahead of page_alloc, the
+	 * __vmalloc will succeed. But if page_alloc overtakes the page reclaimer,
+	 * the allocation fails. It is possible that more retries will succeed.
+	 */
+	for (;;) {
+		p = __vmalloc(size, gfp_flags | __GFP_NOWARN);
+		if (p != NULL)
+			break;
+
+		if (jiffies_to_msecs(jiffies - start_time) > 1000) {
+			/* Try one more time, logging a failure for this call. */
+			p = __vmalloc(size, gfp_flags);
+			break;
+		}
+
+		fsleep(1000);
+	}
+
+	if (p == NULL) {
+		vdo_free(block);
+		return NULL;
+	}
+
+	block->ptr = p;
+	block->size = PAGE_ALIGN(size);
+	add_vmalloc_block(block);
+	return p;
+}
+
+/*
+ * Allocate storage based on memory size and alignment, logging an error if the allocation fails.
+ * The memory will be zeroed.
+ *
+ * A request for zero bytes succeeds and stores NULL through ptr.
+ *
+ * @size: The size of an object
+ * @align: The required alignment
+ * @what: What is being allocated (for error logging)
+ * @ptr: A pointer to hold the allocated memory
+ *
+ * Return: VDO_SUCCESS or an error code
+ */
+int vdo_allocate_memory(size_t size, size_t align, const char *what, void *ptr)
+{
+	/*
+	 * The __GFP_RETRY_MAYFAIL flag means the VM implementation will retry memory reclaim
+	 * procedures that have previously failed if there is some indication that progress has
+	 * been made elsewhere. It can wait for other tasks to attempt high level approaches to
+	 * freeing memory such as compaction (which removes fragmentation) and page-out. There is
+	 * still a definite limit to the number of retries, but it is a larger limit than with
+	 * __GFP_NORETRY. Allocations with this flag may fail, but only when there is genuinely
+	 * little unused memory. While these allocations do not directly trigger the OOM killer,
+	 * their failure indicates that the system is likely to need to use the OOM killer soon.
+	 * The caller must handle failure, but can reasonably do so by failing a higher-level
+	 * request, or completing it only in a much less efficient manner.
+	 */
+	const gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_RETRY_MAYFAIL;
+	unsigned int noio_flags;
+	bool allocations_restricted = !allocations_allowed();
+	unsigned long start_time;
+	void *p;
+
+	if (unlikely(ptr == NULL))
+		return -EINVAL;
+
+	if (size == 0) {
+		*((void **) ptr) = NULL;
+		return VDO_SUCCESS;
+	}
+
+	/* Threads not registered as allocating threads allocate in a NOIO scope. */
+	if (allocations_restricted)
+		noio_flags = memalloc_noio_save();
+
+	start_time = jiffies;
+	if (use_kmalloc(size) && (align < PAGE_SIZE))
+		p = allocate_with_kmalloc(size, gfp_flags);
+	else
+		p = allocate_with_vmalloc(size, gfp_flags, start_time, __func__);
+
+	if (allocations_restricted)
+		memalloc_noio_restore(noio_flags);
+
+	if (unlikely(p == NULL)) {
+		vdo_log_error("Could not allocate %zu bytes for %s in %u msecs",
+			      size, what, jiffies_to_msecs(jiffies - start_time));
+		return -ENOMEM;
+	}
+
+	*((void **) ptr) = p;
+	return VDO_SUCCESS;
+}
+
+/*
+ * Allocate storage based on memory size, failing immediately if the required memory is not
+ * available. The memory will be zeroed, and a successful allocation is added to the kmalloc
+ * statistics.
+ *
+ * @size: The size of an object.
+ * @what: What is being allocated (for error logging); currently unused
+ *
+ * Return: pointer to the allocated memory, or NULL if the required space is not available.
+ */
+void *vdo_allocate_memory_nowait(size_t size, const char *what __maybe_unused)
+{
+	void *p = kmalloc(size, GFP_NOWAIT | __GFP_ZERO);
+
+	if (p == NULL)
+		return NULL;
+
+	add_kmalloc_block(ksize(p));
+	return p;
+}
+
+/*
+ * Free memory obtained from this module's allocation functions, updating the memory
+ * statistics. A NULL pointer is ignored.
+ */
+void vdo_free(void *ptr)
+{
+	if (ptr == NULL)
+		return;
+
+	if (!is_vmalloc_addr(ptr)) {
+		remove_kmalloc_block(ksize(ptr));
+		kfree(ptr);
+		return;
+	}
+
+	remove_vmalloc_block(ptr);
+	vfree(ptr);
+}
+
+/*
+ * Reallocate dynamically allocated memory. There are no alignment guarantees for the reallocated
+ * memory. If the new memory is larger than the old memory, the new space will be zeroed.
+ *
+ * A new size of zero frees the old memory and stores NULL through new_ptr. Otherwise a new
+ * block is allocated, the smaller of old_size and size bytes are copied into it, and the old
+ * memory is freed. If the allocation fails, the old memory is left untouched.
+ *
+ * @ptr: The memory to reallocate.
+ * @old_size: The old size of the memory
+ * @size: The new size to allocate
+ * @what: What is being allocated (for error logging)
+ * @new_ptr: A pointer to hold the reallocated pointer
+ *
+ * Return: VDO_SUCCESS or an error code; -EINVAL if new_ptr is NULL
+ */
+int vdo_reallocate_memory(void *ptr, size_t old_size, size_t size, const char *what,
+			  void *new_ptr)
+{
+	int result;
+	size_t copy_size;
+
+	/*
+	 * Reject a missing result location before anything is freed. The allocation path
+	 * already returns -EINVAL for this; the zero-size path would otherwise free ptr and then
+	 * dereference NULL.
+	 */
+	if (unlikely(new_ptr == NULL))
+		return -EINVAL;
+
+	if (size == 0) {
+		vdo_free(ptr);
+		*(void **) new_ptr = NULL;
+		return VDO_SUCCESS;
+	}
+
+	result = vdo_allocate(size, char, what, new_ptr);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	if (ptr != NULL) {
+		/* Copy only what fits: the whole old block when growing, a prefix when shrinking. */
+		copy_size = (old_size < size) ? old_size : size;
+		memcpy(*((void **) new_ptr), ptr, copy_size);
+		vdo_free(ptr);
+	}
+
+	return VDO_SUCCESS;
+}
+
+/*
+ * Duplicate a NUL-terminated string into newly allocated memory.
+ *
+ * @string: The string to duplicate
+ * @what: What is being allocated (for error logging)
+ * @new_string: A pointer to hold the duplicate, which the caller frees with vdo_free()
+ *
+ * Return: VDO_SUCCESS or an error code
+ */
+int vdo_duplicate_string(const char *string, const char *what, char **new_string)
+{
+	int result;
+	char *dup;
+	/* Measure once; the size includes the terminating NUL. */
+	size_t size = strlen(string) + 1;
+
+	result = vdo_allocate(size, char, what, &dup);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	memcpy(dup, string, size);
+	*new_string = dup;
+	return VDO_SUCCESS;
+}
+
+/*
+ * Initialize the memory tracking state: the statistics lock and the registry of allocating
+ * threads.
+ */
+void vdo_memory_init(void)
+{
+ spin_lock_init(&memory_stats.lock);
+ vdo_initialize_thread_registry(&allocating_threads);
+}
+
+/*
+ * Check that all tracked kmalloc and vmalloc memory has been freed, via log-only assertions,
+ * and log the peak usage at debug level.
+ */
+void vdo_memory_exit(void)
+{
+	/* The counters are size_t, so they are printed with %zu rather than the signed %zd. */
+	VDO_ASSERT_LOG_ONLY(memory_stats.kmalloc_bytes == 0,
+			    "kmalloc memory used (%zu bytes in %zu blocks) is returned to the kernel",
+			    memory_stats.kmalloc_bytes, memory_stats.kmalloc_blocks);
+	VDO_ASSERT_LOG_ONLY(memory_stats.vmalloc_bytes == 0,
+			    "vmalloc memory used (%zu bytes in %zu blocks) is returned to the kernel",
+			    memory_stats.vmalloc_bytes, memory_stats.vmalloc_blocks);
+	vdo_log_debug("peak usage %zu bytes", memory_stats.peak_bytes);
+}
+
+/*
+ * Report the number of bytes currently tracked and the peak number of bytes tracked.
+ *
+ * @bytes_used: Receives the sum of the tracked kmalloc and vmalloc bytes
+ * @peak_bytes_used: Receives the recorded peak usage
+ */
+void vdo_get_memory_stats(u64 *bytes_used, u64 *peak_bytes_used)
+{
+	unsigned long irq_flags;
+	size_t in_use;
+	size_t peak;
+
+	/* Snapshot both values under the lock so they are mutually consistent. */
+	spin_lock_irqsave(&memory_stats.lock, irq_flags);
+	in_use = memory_stats.kmalloc_bytes + memory_stats.vmalloc_bytes;
+	peak = memory_stats.peak_bytes;
+	spin_unlock_irqrestore(&memory_stats.lock, irq_flags);
+
+	*bytes_used = in_use;
+	*peak_bytes_used = peak;
+}
+
+/*
+ * Log the current memory statistics: tracked kmalloc and vmalloc usage, their total, and the
+ * peak usage. Not all allocation types are guaranteed to be tracked in bytes (e.g., bios).
+ */
+void vdo_report_memory_usage(void)
+{
+	unsigned long irq_flags;
+	u64 k_blocks;
+	u64 k_bytes;
+	u64 v_blocks;
+	u64 v_bytes;
+	u64 peak;
+	u64 total;
+
+	/* Copy the counters under the lock; all logging happens after it is released. */
+	spin_lock_irqsave(&memory_stats.lock, irq_flags);
+	k_blocks = memory_stats.kmalloc_blocks;
+	k_bytes = memory_stats.kmalloc_bytes;
+	v_blocks = memory_stats.vmalloc_blocks;
+	v_bytes = memory_stats.vmalloc_bytes;
+	peak = memory_stats.peak_bytes;
+	spin_unlock_irqrestore(&memory_stats.lock, irq_flags);
+
+	total = k_bytes + v_bytes;
+	vdo_log_info("current module memory tracking (actual allocation sizes, not requested):");
+	vdo_log_info(" %llu bytes in %llu kmalloc blocks",
+		     (unsigned long long) k_bytes, (unsigned long long) k_blocks);
+	vdo_log_info(" %llu bytes in %llu vmalloc blocks",
+		     (unsigned long long) v_bytes, (unsigned long long) v_blocks);
+	vdo_log_info(" total %llu bytes, peak usage %llu bytes",
+		     (unsigned long long) total, (unsigned long long) peak);
+}
diff --git a/drivers/md/dm-vdo/memory-alloc.h b/drivers/md/dm-vdo/memory-alloc.h
new file mode 100644
index 000000000000..0093d9f940d9
--- /dev/null
+++ b/drivers/md/dm-vdo/memory-alloc.h
@@ -0,0 +1,162 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#ifndef VDO_MEMORY_ALLOC_H
+#define VDO_MEMORY_ALLOC_H
+
+#include <linux/cache.h>
+#include <linux/io.h> /* for PAGE_SIZE */
+
+#include "permassert.h"
+#include "thread-registry.h"
+
+/* Custom memory allocation function that tracks memory usage */
+int __must_check vdo_allocate_memory(size_t size, size_t align, const char *what, void *ptr);
+
+/*
+ * Allocate storage based on element counts, sizes, and alignment.
+ *
+ * This is the general form behind the allocation macros: it allocates an array of objects,
+ * optionally preceded by one object of another type (i.e., a struct with trailing variable-length
+ * array), with the alignment indicated. The total requested is count * size + extra bytes.
+ *
+ * This is inline because the sizes and alignment are always constant when invoked through the
+ * macros below, and often the count is a compile-time constant 1 or the number of extra bytes is
+ * a compile-time constant 0. So at least some of the arithmetic can usually be optimized away,
+ * and the run-time selection between allocation functions always can. In many cases, it'll boil
+ * down to just a function call with a constant size.
+ *
+ * @count: The number of objects to allocate
+ * @size: The size of an object
+ * @extra: The number of additional bytes to allocate
+ * @align: The required alignment
+ * @what: What is being allocated (for error logging)
+ * @ptr: A pointer to hold the allocated memory
+ *
+ * Return: VDO_SUCCESS or an error code
+ */
+static inline int __vdo_do_allocation(size_t count, size_t size, size_t extra,
+				      size_t align, const char *what, void *ptr)
+{
+	size_t total_size;
+
+	if ((size > 0) && (count > ((SIZE_MAX - extra) / size))) {
+		/*
+		 * The request would overflow. This is kind of a hack: We rely on the fact that
+		 * SIZE_MAX would cover the entire address space (minus one byte) and thus the
+		 * system can never allocate that much and the call will always fail. So we can
+		 * report an overflow as "out of memory" by asking for "merely" SIZE_MAX bytes.
+		 */
+		total_size = SIZE_MAX;
+	} else {
+		total_size = count * size + extra;
+	}
+
+	return vdo_allocate_memory(total_size, align, what, ptr);
+}
+
+/*
+ * Allocate one or more elements of the indicated type, logging an error if the allocation fails.
+ * The memory will be zeroed.
+ *
+ * Expands to a call to __vdo_do_allocation() with no extra bytes.
+ *
+ * @COUNT: The number of objects to allocate
+ * @TYPE: The type of objects to allocate. This type determines the alignment of the allocation.
+ * @WHAT: What is being allocated (for error logging)
+ * @PTR: A pointer to hold the allocated memory (the address of the pointer variable to set)
+ *
+ * Return: VDO_SUCCESS or an error code
+ */
+#define vdo_allocate(COUNT, TYPE, WHAT, PTR) \
+ __vdo_do_allocation(COUNT, sizeof(TYPE), 0, __alignof__(TYPE), WHAT, PTR)
+
+/*
+ * Allocate one object of an indicated type, followed by one or more elements of a second type,
+ * logging an error if the allocation fails. The memory will be zeroed.
+ *
+ * The build fails if TYPE1 is less strictly aligned than TYPE2. PTR is first assigned to a
+ * local TYPE1 ** variable.
+ *
+ * @TYPE1: The type of the primary object to allocate. This type determines the alignment of the
+ * allocated memory.
+ * @COUNT: The number of TYPE2 objects to allocate after the primary object
+ * @TYPE2: The type of array objects to allocate
+ * @WHAT: What is being allocated (for error logging)
+ * @PTR: A pointer to hold the allocated memory
+ *
+ * Return: VDO_SUCCESS or an error code
+ */
+#define vdo_allocate_extended(TYPE1, COUNT, TYPE2, WHAT, PTR) \
+ __extension__({ \
+ int _result; \
+ TYPE1 **_ptr = (PTR); \
+ BUILD_BUG_ON(__alignof__(TYPE1) < __alignof__(TYPE2)); \
+ _result = __vdo_do_allocation(COUNT, \
+ sizeof(TYPE2), \
+ sizeof(TYPE1), \
+ __alignof__(TYPE1), \
+ WHAT, \
+ _ptr); \
+ _result; \
+ })
+
+/*
+ * Allocate memory starting on a cache line boundary, logging an error if the allocation fails. The
+ * memory will be zeroed.
+ *
+ * The alignment requested from vdo_allocate_memory() is L1_CACHE_BYTES.
+ *
+ * @size: The number of bytes to allocate
+ * @what: What is being allocated (for error logging)
+ * @ptr: A pointer to hold the allocated memory
+ *
+ * Return: VDO_SUCCESS or an error code
+ */
+static inline int __must_check vdo_allocate_cache_aligned(size_t size, const char *what, void *ptr)
+{
+ return vdo_allocate_memory(size, L1_CACHE_BYTES, what, ptr);
+}
+
+/*
+ * Allocate a block of memory of the given size immediately, failing if the required memory is
+ * not immediately available. The memory will be zeroed.
+ *
+ * @size: The number of bytes to allocate
+ * @what: What is being allocated (for error logging)
+ *
+ * Return: pointer to the memory, or NULL if the memory is not available.
+ */
+void *__must_check vdo_allocate_memory_nowait(size_t size, const char *what);
+
+int __must_check vdo_reallocate_memory(void *ptr, size_t old_size, size_t size,
+ const char *what, void *new_ptr);
+
+int __must_check vdo_duplicate_string(const char *string, const char *what,
+ char **new_string);
+
+/* Free memory allocated with vdo_allocate(). */
+void vdo_free(void *ptr);
+
+/* Set the pointer at ptr_ptr to NULL and return the value it held. */
+static inline void *__vdo_forget(void **ptr_ptr)
+{
+	void *forgotten = *ptr_ptr;
+
+	*ptr_ptr = NULL;
+	return forgotten;
+}
+
+/*
+ * Null out a pointer and return a copy of it. This macro should be used when passing a pointer to
+ * a function for which it is not safe to access the pointer once the function returns.
+ */
+#define vdo_forget(ptr) __vdo_forget((void **) &(ptr))
+
+void vdo_memory_init(void);
+
+void vdo_memory_exit(void);
+
+void vdo_register_allocating_thread(struct registered_thread *new_thread,
+ const bool *flag_ptr);
+
+void vdo_unregister_allocating_thread(void);
+
+void vdo_get_memory_stats(u64 *bytes_used, u64 *peak_bytes_used);
+
+void vdo_report_memory_usage(void);
+
+#endif /* VDO_MEMORY_ALLOC_H */
diff --git a/drivers/md/dm-vdo/message-stats.c b/drivers/md/dm-vdo/message-stats.c
new file mode 100644
index 000000000000..2802cf92922b
--- /dev/null
+++ b/drivers/md/dm-vdo/message-stats.c
@@ -0,0 +1,432 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#include "dedupe.h"
+#include "logger.h"
+#include "memory-alloc.h"
+#include "message-stats.h"
+#include "statistics.h"
+#include "thread-device.h"
+#include "vdo.h"
+
+/*
+ * Append "<prefix><value><suffix>" to the output, where a NULL prefix or suffix is treated as an
+ * empty string. *buf is advanced and *maxlen reduced by the count scnprintf() reports.
+ */
+static void write_u64(char *prefix, u64 value, char *suffix, char **buf,
+		      unsigned int *maxlen)
+{
+	const char *head = (prefix != NULL) ? prefix : "";
+	const char *tail = (suffix != NULL) ? suffix : "";
+	int written = scnprintf(*buf, *maxlen, "%s%llu%s", head, value, tail);
+
+	*buf += written;
+	*maxlen -= written;
+}
+
+/*
+ * Append "<prefix><value><suffix>" for a u32 value; a NULL prefix or suffix is treated as an
+ * empty string. *buf is advanced and *maxlen reduced by the count scnprintf() reports.
+ */
+static void write_u32(char *prefix, u32 value, char *suffix, char **buf,
+		      unsigned int *maxlen)
+{
+	const char *head = (prefix != NULL) ? prefix : "";
+	const char *tail = (suffix != NULL) ? suffix : "";
+	int written = scnprintf(*buf, *maxlen, "%s%u%s", head, value, tail);
+
+	*buf += written;
+	*maxlen -= written;
+}
+
+/*
+ * Append "<prefix><value><suffix>" for a block count; a NULL prefix or suffix is treated as an
+ * empty string. *buf is advanced and *maxlen reduced by the count scnprintf() reports.
+ */
+static void write_block_count_t(char *prefix, block_count_t value, char *suffix,
+				char **buf, unsigned int *maxlen)
+{
+	const char *head = (prefix != NULL) ? prefix : "";
+	const char *tail = (suffix != NULL) ? suffix : "";
+	int written = scnprintf(*buf, *maxlen, "%s%llu%s", head, value, tail);
+
+	*buf += written;
+	*maxlen -= written;
+}
+
+/*
+ * Append "<prefix><value><suffix>" for a string value; a NULL prefix or suffix is treated as an
+ * empty string (the value itself is passed through unchanged). *buf is advanced and *maxlen
+ * reduced by the count scnprintf() reports.
+ */
+static void write_string(char *prefix, char *value, char *suffix, char **buf,
+			 unsigned int *maxlen)
+{
+	const char *head = (prefix != NULL) ? prefix : "";
+	const char *tail = (suffix != NULL) ? suffix : "";
+	int written = scnprintf(*buf, *maxlen, "%s%s%s", head, value, tail);
+
+	*buf += written;
+	*maxlen -= written;
+}
+
+/*
+ * Append "<prefix><value><suffix>" for a boolean, which is formatted with %d and so appears as
+ * 0 or 1. A NULL prefix or suffix is treated as an empty string. *buf is advanced and *maxlen
+ * reduced by the count scnprintf() reports.
+ */
+static void write_bool(char *prefix, bool value, char *suffix, char **buf,
+		       unsigned int *maxlen)
+{
+	const char *head = (prefix != NULL) ? prefix : "";
+	const char *tail = (suffix != NULL) ? suffix : "";
+	int written = scnprintf(*buf, *maxlen, "%s%d%s", head, value, tail);
+
+	*buf += written;
+	*maxlen -= written;
+}
+
+/*
+ * Append "<prefix><value><suffix>" for a u8 value, printed as a decimal number. A NULL prefix or
+ * suffix is treated as an empty string. *buf is advanced and *maxlen reduced by the count
+ * scnprintf() reports.
+ */
+static void write_u8(char *prefix, u8 value, char *suffix, char **buf,
+		     unsigned int *maxlen)
+{
+	const char *head = (prefix != NULL) ? prefix : "";
+	const char *tail = (suffix != NULL) ? suffix : "";
+	int written = scnprintf(*buf, *maxlen, "%s%u%s", head, value, tail);
+
+	*buf += written;
+	*maxlen -= written;
+}
+
+/* Emit the block allocator counters as a "{ name : value, ... }" group. */
+static void write_block_allocator_statistics(char *prefix,
+					     struct block_allocator_statistics *stats,
+					     char *suffix, char **buf,
+					     unsigned int *maxlen)
+{
+	write_string(prefix, "{ ", NULL, buf, maxlen);
+
+	/* Slabs from which blocks may be allocated */
+	write_u64("slabCount : ", stats->slab_count, ", ", buf, maxlen);
+	/* Slabs from which blocks have ever been allocated */
+	write_u64("slabsOpened : ", stats->slabs_opened, ", ", buf, maxlen);
+	/* Slab re-opens since loading */
+	write_u64("slabsReopened : ", stats->slabs_reopened, ", ", buf, maxlen);
+
+	write_string(NULL, "}", suffix, buf, maxlen);
+}
+
+/* Emit a set of started/written/committed totals as a "{ name : value, ... }" group. */
+static void write_commit_statistics(char *prefix, struct commit_statistics *stats,
+				    char *suffix, char **buf, unsigned int *maxlen)
+{
+	write_string(prefix, "{ ", NULL, buf, maxlen);
+
+	/* Items on which processing has started */
+	write_u64("started : ", stats->started, ", ", buf, maxlen);
+	/* Items for which a write operation has been issued */
+	write_u64("written : ", stats->written, ", ", buf, maxlen);
+	/* Items for which a write operation has completed */
+	write_u64("committed : ", stats->committed, ", ", buf, maxlen);
+
+	write_string(NULL, "}", suffix, buf, maxlen);
+}
+
+/* Emit the recovery journal counters, including nested entry and block commit totals. */
+static void write_recovery_journal_statistics(char *prefix,
+					      struct recovery_journal_statistics *stats,
+					      char *suffix, char **buf,
+					      unsigned int *maxlen)
+{
+	write_string(prefix, "{ ", NULL, buf, maxlen);
+
+	/* Times the on-disk journal was full */
+	write_u64("diskFull : ", stats->disk_full, ", ", buf, maxlen);
+	/* Times the recovery journal requested slab journal commits */
+	write_u64("slabJournalCommitsRequested : ",
+		  stats->slab_journal_commits_requested, ", ", buf, maxlen);
+	/* Write/commit totals for individual journal entries */
+	write_commit_statistics("entries : ", &stats->entries, ", ", buf, maxlen);
+	/* Write/commit totals for journal blocks */
+	write_commit_statistics("blocks : ", &stats->blocks, ", ", buf, maxlen);
+
+	write_string(NULL, "}", suffix, buf, maxlen);
+}
+
+/* Emit the compressed block packer counters as a "{ name : value, ... }" group. */
+static void write_packer_statistics(char *prefix, struct packer_statistics *stats,
+				    char *suffix, char **buf, unsigned int *maxlen)
+{
+	write_string(prefix, "{ ", NULL, buf, maxlen);
+
+	/* Compressed data items written since startup */
+	write_u64("compressedFragmentsWritten : ",
+		  stats->compressed_fragments_written, ", ", buf, maxlen);
+	/* Blocks containing compressed items written since startup */
+	write_u64("compressedBlocksWritten : ",
+		  stats->compressed_blocks_written, ", ", buf, maxlen);
+	/* VIOs currently pending in the packer */
+	write_u64("compressedFragmentsInPacker : ",
+		  stats->compressed_fragments_in_packer, ", ", buf, maxlen);
+
+	write_string(NULL, "}", suffix, buf, maxlen);
+}
+
+/* Emit the slab journal counters as a "{ name : value, ... }" group. */
+static void write_slab_journal_statistics(char *prefix,
+					  struct slab_journal_statistics *stats,
+					  char *suffix, char **buf, unsigned int *maxlen)
+{
+	write_string(prefix, "{ ", NULL, buf, maxlen);
+
+	/* Times the on-disk journal was full */
+	write_u64("diskFullCount : ", stats->disk_full_count, ", ", buf, maxlen);
+	/* Times an entry was added over the flush threshold */
+	write_u64("flushCount : ", stats->flush_count, ", ", buf, maxlen);
+	/* Times an entry was added over the block threshold */
+	write_u64("blockedCount : ", stats->blocked_count, ", ", buf, maxlen);
+	/* Times a tail block was written */
+	write_u64("blocksWritten : ", stats->blocks_written, ", ", buf, maxlen);
+	/* Times we had to wait for the tail to write */
+	write_u64("tailBusyCount : ", stats->tail_busy_count, ", ", buf, maxlen);
+
+	write_string(NULL, "}", suffix, buf, maxlen);
+}
+
+/* Emit the slab summary counters as a "{ name : value, ... }" group. */
+static void write_slab_summary_statistics(char *prefix,
+					  struct slab_summary_statistics *stats,
+					  char *suffix, char **buf, unsigned int *maxlen)
+{
+	write_string(prefix, "{ ", NULL, buf, maxlen);
+
+	/* Blocks written */
+	write_u64("blocksWritten : ", stats->blocks_written, ", ", buf, maxlen);
+
+	write_string(NULL, "}", suffix, buf, maxlen);
+}
+
+/* Emit the reference count counters as a "{ name : value, ... }" group. */
+static void write_ref_counts_statistics(char *prefix, struct ref_counts_statistics *stats,
+					char *suffix, char **buf, unsigned int *maxlen)
+{
+	write_string(prefix, "{ ", NULL, buf, maxlen);
+
+	/* Reference blocks written */
+	write_u64("blocksWritten : ", stats->blocks_written, ", ", buf, maxlen);
+
+	write_string(NULL, "}", suffix, buf, maxlen);
+}
+
+/* Emit the block map (page cache) counters as a "{ name : value, ... }" group. */
+static void write_block_map_statistics(char *prefix, struct block_map_statistics *stats,
+				       char *suffix, char **buf, unsigned int *maxlen)
+{
+	write_string(prefix, "{ ", NULL, buf, maxlen);
+
+	/* Page counts by state (32-bit values) */
+
+	/* dirty (resident) pages */
+	write_u32("dirtyPages : ", stats->dirty_pages, ", ", buf, maxlen);
+	/* clean (resident) pages */
+	write_u32("cleanPages : ", stats->clean_pages, ", ", buf, maxlen);
+	/* free pages */
+	write_u32("freePages : ", stats->free_pages, ", ", buf, maxlen);
+	/* pages in failed state */
+	write_u32("failedPages : ", stats->failed_pages, ", ", buf, maxlen);
+	/* pages incoming */
+	write_u32("incomingPages : ", stats->incoming_pages, ", ", buf, maxlen);
+	/* pages outgoing */
+	write_u32("outgoingPages : ", stats->outgoing_pages, ", ", buf, maxlen);
+	/* times a free page was not available */
+	write_u32("cachePressure : ", stats->cache_pressure, ", ", buf, maxlen);
+
+	/* Event counters (64-bit values) */
+
+	/* get_vdo_page() calls for read */
+	write_u64("readCount : ", stats->read_count, ", ", buf, maxlen);
+	/* get_vdo_page() calls for write */
+	write_u64("writeCount : ", stats->write_count, ", ", buf, maxlen);
+	/* times pages failed to read */
+	write_u64("failedReads : ", stats->failed_reads, ", ", buf, maxlen);
+	/* times pages failed to write */
+	write_u64("failedWrites : ", stats->failed_writes, ", ", buf, maxlen);
+	/* gets that are reclaimed */
+	write_u64("reclaimed : ", stats->reclaimed, ", ", buf, maxlen);
+	/* gets for outgoing pages */
+	write_u64("readOutgoing : ", stats->read_outgoing, ", ", buf, maxlen);
+	/* gets that were already there */
+	write_u64("foundInCache : ", stats->found_in_cache, ", ", buf, maxlen);
+	/* gets requiring discard */
+	write_u64("discardRequired : ", stats->discard_required, ", ", buf, maxlen);
+	/* gets enqueued for their page */
+	write_u64("waitForPage : ", stats->wait_for_page, ", ", buf, maxlen);
+	/* gets that have to fetch */
+	write_u64("fetchRequired : ", stats->fetch_required, ", ", buf, maxlen);
+	/* page fetches */
+	write_u64("pagesLoaded : ", stats->pages_loaded, ", ", buf, maxlen);
+	/* page saves */
+	write_u64("pagesSaved : ", stats->pages_saved, ", ", buf, maxlen);
+	/* flushes issued */
+	write_u64("flushCount : ", stats->flush_count, ", ", buf, maxlen);
+
+	write_string(NULL, "}", suffix, buf, maxlen);
+}
+
+/* Emit the hash lock dedupe counters as a "{ name : value, ... }" group. */
+static void write_hash_lock_statistics(char *prefix, struct hash_lock_statistics *stats,
+				       char *suffix, char **buf, unsigned int *maxlen)
+{
+	write_string(prefix, "{ ", NULL, buf, maxlen);
+
+	/* Times the UDS advice proved correct */
+	write_u64("dedupeAdviceValid : ", stats->dedupe_advice_valid, ", ", buf, maxlen);
+	/* Times the UDS advice proved incorrect */
+	write_u64("dedupeAdviceStale : ", stats->dedupe_advice_stale, ", ", buf, maxlen);
+	/* Writes with the same data as another in-flight write */
+	write_u64("concurrentDataMatches : ", stats->concurrent_data_matches,
+		  ", ", buf, maxlen);
+	/* Writes whose hash collided with an in-flight write */
+	write_u64("concurrentHashCollisions : ",
+		  stats->concurrent_hash_collisions, ", ", buf, maxlen);
+	/* Dedupe queries currently in flight */
+	write_u32("currDedupeQueries : ", stats->curr_dedupe_queries, ", ", buf, maxlen);
+
+	write_string(NULL, "}", suffix, buf, maxlen);
+}
+
+static void write_error_statistics(char *prefix, struct error_statistics *stats,
+ char *suffix, char **buf, unsigned int *maxlen)
+{
+ write_string(prefix, "{ ", NULL, buf, maxlen);
+ /* number of times VDO got an invalid dedupe advice PBN from UDS */
+ write_u64("invalidAdvicePBNCount : ", stats->invalid_advice_pbn_count,
+ ", ", buf, maxlen);
+ /* number of times a VIO completed with a VDO_NO_SPACE error */
+ write_u64("noSpaceErrorCount : ", stats->no_space_error_count, ", ",
+ buf, maxlen);
+ /* number of times a VIO completed with a VDO_READ_ONLY error */
+ write_u64("readOnlyErrorCount : ", stats->read_only_error_count, ", ",
+ buf, maxlen);
+ write_string(NULL, "}", suffix, buf, maxlen);
+}
+
+/* Emit one set of per-type bio counters as a "{ name : value, ... }" group. */
+static void write_bio_stats(char *prefix, struct bio_stats *stats, char *suffix,
+			    char **buf, unsigned int *maxlen)
+{
+	write_string(prefix, "{ ", NULL, buf, maxlen);
+
+	/* REQ_OP_READ bios */
+	write_u64("read : ", stats->read, ", ", buf, maxlen);
+	/* REQ_OP_WRITE bios with data */
+	write_u64("write : ", stats->write, ", ", buf, maxlen);
+	/* Bios tagged with REQ_PREFLUSH and containing no data */
+	write_u64("emptyFlush : ", stats->empty_flush, ", ", buf, maxlen);
+	/* REQ_OP_DISCARD bios */
+	write_u64("discard : ", stats->discard, ", ", buf, maxlen);
+	/* Bios tagged with REQ_PREFLUSH */
+	write_u64("flush : ", stats->flush, ", ", buf, maxlen);
+	/* Bios tagged with REQ_FUA */
+	write_u64("fua : ", stats->fua, ", ", buf, maxlen);
+
+	write_string(NULL, "}", suffix, buf, maxlen);
+}
+
+/* Emit the tracked memory usage figures as a "{ name : value, ... }" group. */
+static void write_memory_usage(char *prefix, struct memory_usage *stats, char *suffix,
+			       char **buf, unsigned int *maxlen)
+{
+	write_string(prefix, "{ ", NULL, buf, maxlen);
+
+	/* Tracked bytes currently allocated */
+	write_u64("bytesUsed : ", stats->bytes_used, ", ", buf, maxlen);
+	/* Maximum tracked bytes allocated */
+	write_u64("peakBytesUsed : ", stats->peak_bytes_used, ", ", buf, maxlen);
+
+	write_string(NULL, "}", suffix, buf, maxlen);
+}
+
+/* Emit the UDS index counters as a "{ name : value, ... }" group. */
+static void write_index_statistics(char *prefix, struct index_statistics *stats,
+				   char *suffix, char **buf, unsigned int *maxlen)
+{
+	write_string(prefix, "{ ", NULL, buf, maxlen);
+
+	/* Records stored in the index */
+	write_u64("entriesIndexed : ", stats->entries_indexed, ", ", buf, maxlen);
+	/* Post calls that found an existing entry */
+	write_u64("postsFound : ", stats->posts_found, ", ", buf, maxlen);
+	/* Post calls that did not find an entry (and added a new one) */
+	write_u64("postsNotFound : ", stats->posts_not_found, ", ", buf, maxlen);
+	/* Query calls that found an existing entry */
+	write_u64("queriesFound : ", stats->queries_found, ", ", buf, maxlen);
+	/* Query calls that did not find an existing entry */
+	write_u64("queriesNotFound : ", stats->queries_not_found, ", ", buf, maxlen);
+	/* Update calls that found an existing entry */
+	write_u64("updatesFound : ", stats->updates_found, ", ", buf, maxlen);
+	/* Update calls that did not find an existing entry */
+	write_u64("updatesNotFound : ", stats->updates_not_found, ", ", buf, maxlen);
+	/* Entries discarded */
+	write_u64("entriesDiscarded : ", stats->entries_discarded, ", ", buf, maxlen);
+
+	write_string(NULL, "}", suffix, buf, maxlen);
+}
+
+/*
+ * Emit the complete statistics structure as one "{ name : value, ... }" group, with each
+ * sub-structure rendered as a nested group by its own writer.
+ */
+static void write_vdo_statistics(char *prefix, struct vdo_statistics *stats, char *suffix,
+				 char **buf, unsigned int *maxlen)
+{
+	write_string(prefix, "{ ", NULL, buf, maxlen);
+	write_u32("version : ", stats->version, ", ", buf, maxlen);
+
+	/* Blocks used for data */
+	write_u64("dataBlocksUsed : ", stats->data_blocks_used, ", ", buf, maxlen);
+	/* Blocks used for VDO metadata */
+	write_u64("overheadBlocksUsed : ", stats->overhead_blocks_used, ", ",
+		  buf, maxlen);
+	/* Logical blocks currently mapped to physical blocks */
+	write_u64("logicalBlocksUsed : ", stats->logical_blocks_used, ", ", buf, maxlen);
+	/* Physical block count */
+	write_block_count_t("physicalBlocks : ", stats->physical_blocks, ", ",
+			    buf, maxlen);
+	/* Logical block count */
+	write_block_count_t("logicalBlocks : ", stats->logical_blocks, ", ",
+			    buf, maxlen);
+	/* Size of the block map page cache, in bytes */
+	write_u64("blockMapCacheSize : ", stats->block_map_cache_size, ", ",
+		  buf, maxlen);
+	/* Physical block size */
+	write_u64("blockSize : ", stats->block_size, ", ", buf, maxlen);
+	/* Times the VDO has successfully recovered */
+	write_u64("completeRecoveries : ", stats->complete_recoveries, ", ",
+		  buf, maxlen);
+	/* Times the VDO has recovered from read-only mode */
+	write_u64("readOnlyRecoveries : ", stats->read_only_recoveries, ", ",
+		  buf, maxlen);
+	/* String describing the operating mode of the VDO */
+	write_string("mode : ", stats->mode, ", ", buf, maxlen);
+	/* Whether the VDO is in recovery mode */
+	write_bool("inRecoveryMode : ", stats->in_recovery_mode, ", ", buf, maxlen);
+	/* Percentage of recovery mode work that has been completed */
+	write_u8("recoveryPercentage : ", stats->recovery_percentage, ", ", buf, maxlen);
+
+	/* Compressed block packer */
+	write_packer_statistics("packer : ", &stats->packer, ", ", buf, maxlen);
+	/* Block allocator events */
+	write_block_allocator_statistics("allocator : ", &stats->allocator,
+					 ", ", buf, maxlen);
+	/* Recovery journal events */
+	write_recovery_journal_statistics("journal : ", &stats->journal, ", ",
+					  buf, maxlen);
+	/* Slab journals */
+	write_slab_journal_statistics("slabJournal : ", &stats->slab_journal,
+				      ", ", buf, maxlen);
+	/* Slab summary */
+	write_slab_summary_statistics("slabSummary : ", &stats->slab_summary,
+				      ", ", buf, maxlen);
+	/* Reference counts */
+	write_ref_counts_statistics("refCounts : ", &stats->ref_counts, ", ",
+				    buf, maxlen);
+	/* Block map */
+	write_block_map_statistics("blockMap : ", &stats->block_map, ", ", buf, maxlen);
+	/* Dedupe statistics from hash locks */
+	write_hash_lock_statistics("hashLock : ", &stats->hash_lock, ", ", buf, maxlen);
+	/* Error condition counts */
+	write_error_statistics("errors : ", &stats->errors, ", ", buf, maxlen);
+
+	/* The VDO instance */
+	write_u32("instance : ", stats->instance, ", ", buf, maxlen);
+	/* Current number of active VIOs */
+	write_u32("currentVIOsInProgress : ", stats->current_vios_in_progress,
+		  ", ", buf, maxlen);
+	/* Maximum number of active VIOs */
+	write_u32("maxVIOs : ", stats->max_vios, ", ", buf, maxlen);
+	/* Times the UDS index was too slow in responding */
+	write_u64("dedupeAdviceTimeouts : ", stats->dedupe_advice_timeouts,
+		  ", ", buf, maxlen);
+	/* Flush requests submitted to the storage device */
+	write_u64("flushOut : ", stats->flush_out, ", ", buf, maxlen);
+	/* Logical block size */
+	write_u64("logicalBlockSize : ", stats->logical_block_size, ", ", buf, maxlen);
+
+	/* Bios submitted into VDO from above */
+	write_bio_stats("biosIn : ", &stats->bios_in, ", ", buf, maxlen);
+	write_bio_stats("biosInPartial : ", &stats->bios_in_partial, ", ", buf, maxlen);
+	/* Bios submitted onward for user data */
+	write_bio_stats("biosOut : ", &stats->bios_out, ", ", buf, maxlen);
+	/* Bios submitted onward for metadata */
+	write_bio_stats("biosMeta : ", &stats->bios_meta, ", ", buf, maxlen);
+	write_bio_stats("biosJournal : ", &stats->bios_journal, ", ", buf, maxlen);
+	write_bio_stats("biosPageCache : ", &stats->bios_page_cache, ", ", buf, maxlen);
+	write_bio_stats("biosOutCompleted : ", &stats->bios_out_completed, ", ",
+			buf, maxlen);
+	write_bio_stats("biosMetaCompleted : ", &stats->bios_meta_completed,
+			", ", buf, maxlen);
+	write_bio_stats("biosJournalCompleted : ",
+			&stats->bios_journal_completed, ", ", buf, maxlen);
+	write_bio_stats("biosPageCacheCompleted : ",
+			&stats->bios_page_cache_completed, ", ", buf, maxlen);
+	write_bio_stats("biosAcknowledged : ", &stats->bios_acknowledged, ", ",
+			buf, maxlen);
+	write_bio_stats("biosAcknowledgedPartial : ",
+			&stats->bios_acknowledged_partial, ", ", buf, maxlen);
+	/* Current number of bios in progress */
+	write_bio_stats("biosInProgress : ", &stats->bios_in_progress, ", ",
+			buf, maxlen);
+
+	/* Memory usage */
+	write_memory_usage("memoryUsage : ", &stats->memory_usage, ", ", buf, maxlen);
+	/* UDS index */
+	write_index_statistics("index : ", &stats->index, ", ", buf, maxlen);
+
+	write_string(NULL, "}", suffix, buf, maxlen);
+}
+
+/*
+ * Fetch the current statistics for a vdo and format them as text into a caller-supplied buffer.
+ *
+ * @vdo: The vdo whose statistics are wanted
+ * @buf: The buffer to receive the formatted statistics
+ * @maxlen: The size of the buffer, in bytes
+ *
+ * Return: VDO_SUCCESS, or the allocation error if the temporary statistics structure could not be
+ *         allocated
+ */
+int vdo_write_stats(struct vdo *vdo, char *buf, unsigned int maxlen)
+{
+	struct vdo_statistics *snapshot;
+	int result = vdo_allocate(1, struct vdo_statistics, __func__, &snapshot);
+
+	if (result != VDO_SUCCESS) {
+		vdo_log_error("Cannot allocate memory to write VDO statistics");
+		return result;
+	}
+
+	/* The snapshot only lives long enough to be rendered into the caller's buffer. */
+	vdo_fetch_statistics(vdo, snapshot);
+	write_vdo_statistics(NULL, snapshot, NULL, &buf, &maxlen);
+	vdo_free(snapshot);
+
+	return VDO_SUCCESS;
+}
diff --git a/drivers/md/dm-vdo/message-stats.h b/drivers/md/dm-vdo/message-stats.h
new file mode 100644
index 000000000000..f7fceca9acab
--- /dev/null
+++ b/drivers/md/dm-vdo/message-stats.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#ifndef VDO_MESSAGE_STATS_H
+#define VDO_MESSAGE_STATS_H
+
+#include "types.h"
+
+int vdo_write_stats(struct vdo *vdo, char *buf, unsigned int maxlen);
+
+#endif /* VDO_MESSAGE_STATS_H */
diff --git a/drivers/md/dm-vdo/murmurhash3.c b/drivers/md/dm-vdo/murmurhash3.c
new file mode 100644
index 000000000000..00c9b9c05001
--- /dev/null
+++ b/drivers/md/dm-vdo/murmurhash3.c
@@ -0,0 +1,175 @@
+// SPDX-License-Identifier: LGPL-2.1+
+/*
+ * MurmurHash3 was written by Austin Appleby, and is placed in the public
+ * domain. The author hereby disclaims copyright to this source code.
+ *
+ * Adapted by John Wiele (jwiele@redhat.com).
+ */
+
+#include "murmurhash3.h"
+
+#include <asm/unaligned.h>
+
+/*
+ * Rotate a 64-bit value left by r bits.
+ *
+ * Both shift counts are reduced modulo 64, so a rotation by 0 (or by a multiple of 64) never
+ * shifts by the full width of the type, which would be undefined behavior. For counts from 1 to
+ * 63 the result is the same as (x << r) | (x >> (64 - r)).
+ */
+static inline u64 rotl64(u64 x, s8 r)
+{
+	return (x << (r & 63)) | (x >> (-r & 63));
+}
+
+#define ROTL64(x, y) rotl64(x, y)
+/*
+ * Fetch the i'th 64-bit word starting at p, interpreting the stored bytes as little-endian.
+ *
+ * The word is read with get_unaligned_le64() so that the data being hashed does not have to be
+ * 8-byte aligned, and so that no explicit byte order checks are needed here.
+ */
+static __always_inline u64 getblock64(const u64 *p, int i)
+{
+	return get_unaligned_le64(&p[i]);
+}
+
+/*
+ * Store a 64-bit value as the i'th word starting at p, in little-endian byte order.
+ *
+ * The word is written with put_unaligned_le64() so that the output buffer does not have to be
+ * 8-byte aligned, and so that no explicit byte order checks are needed here.
+ */
+static __always_inline void putblock64(u64 *p, int i, u64 value)
+{
+	put_unaligned_le64(value, &p[i]);
+}
+
+/* Finalization mix: force all bits of a hash word to avalanche. */
+static __always_inline u64 fmix64(u64 k)
+{
+	const u64 first_multiplier = 0xff51afd7ed558ccdLLU;
+	const u64 second_multiplier = 0xc4ceb9fe1a85ec53LLU;
+
+	k = (k ^ (k >> 33)) * first_multiplier;
+	k = (k ^ (k >> 33)) * second_multiplier;
+
+	return k ^ (k >> 33);
+}
+
+/*
+ * Compute the 128-bit MurmurHash3 of a buffer.
+ *
+ * @key: The bytes to hash
+ * @len: The number of bytes in key
+ * @seed: The value used to initialize both halves of the hash state
+ * @out: Where the two 64-bit words of the hash are stored
+ */
+void murmurhash3_128(const void *key, const int len, const u32 seed, void *out)
+{
+	const u8 *data = key;
+	const int nblocks = len / 16;
+	const int tail_len = len & 15;
+	const u8 *tail = (const u8 *)(data + nblocks * 16);
+	const u64 *blocks = (const u64 *)(data);
+
+	const u64 c1 = 0x87c37b91114253d5LLU;
+	const u64 c2 = 0x4cf5ad432745937fLLU;
+
+	u64 h1 = seed;
+	u64 h2 = seed;
+	int i;
+
+	/* body: mix in the input sixteen bytes (two words) at a time */
+	for (i = 0; i < nblocks; i++) {
+		u64 k1 = getblock64(blocks, i * 2 + 0);
+		u64 k2 = getblock64(blocks, i * 2 + 1);
+
+		k1 *= c1;
+		k1 = ROTL64(k1, 31);
+		k1 *= c2;
+		h1 ^= k1;
+
+		h1 = ROTL64(h1, 27);
+		h1 += h2;
+		h1 = h1 * 5 + 0x52dce729;
+
+		k2 *= c2;
+		k2 = ROTL64(k2, 33);
+		k2 *= c1;
+		h2 ^= k2;
+
+		h2 = ROTL64(h2, 31);
+		h2 += h1;
+		h2 = h2 * 5 + 0x38495ab5;
+	}
+
+	/*
+	 * tail: the remaining (len & 15) bytes. Bytes 8 and up are gathered, low byte first, into
+	 * a word mixed into h2; bytes 0 through 7 are gathered into a word mixed into h1.
+	 */
+	if (tail_len > 8) {
+		u64 k2 = 0;
+
+		for (i = 8; i < tail_len; i++)
+			k2 ^= ((u64)tail[i]) << (8 * (i - 8));
+
+		k2 *= c2;
+		k2 = ROTL64(k2, 33);
+		k2 *= c1;
+		h2 ^= k2;
+	}
+
+	if (tail_len > 0) {
+		const int low_bytes = (tail_len < 8) ? tail_len : 8;
+		u64 k1 = 0;
+
+		for (i = 0; i < low_bytes; i++)
+			k1 ^= ((u64)tail[i]) << (8 * i);
+
+		k1 *= c1;
+		k1 = ROTL64(k1, 31);
+		k1 *= c2;
+		h1 ^= k1;
+	}
+
+	/* finalization */
+	h1 ^= len;
+	h2 ^= len;
+
+	h1 += h2;
+	h2 += h1;
+
+	h1 = fmix64(h1);
+	h2 = fmix64(h2);
+
+	h1 += h2;
+	h2 += h1;
+
+	putblock64((u64 *)out, 0, h1);
+	putblock64((u64 *)out, 1, h2);
+}
diff --git a/drivers/md/dm-vdo/murmurhash3.h b/drivers/md/dm-vdo/murmurhash3.h
new file mode 100644
index 000000000000..d84711ddb659
--- /dev/null
+++ b/drivers/md/dm-vdo/murmurhash3.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: LGPL-2.1+ */
+/*
+ * MurmurHash3 was written by Austin Appleby, and is placed in the public
+ * domain. The author hereby disclaims copyright to this source code.
+ */
+
+#ifndef _MURMURHASH3_H_
+#define _MURMURHASH3_H_
+
+#include <linux/compiler.h>
+#include <linux/types.h>
+
+void murmurhash3_128(const void *key, int len, u32 seed, void *out);
+
+#endif /* _MURMURHASH3_H_ */
diff --git a/drivers/md/dm-vdo/numeric.h b/drivers/md/dm-vdo/numeric.h
new file mode 100644
index 000000000000..dc8c400b21d2
--- /dev/null
+++ b/drivers/md/dm-vdo/numeric.h
@@ -0,0 +1,78 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#ifndef UDS_NUMERIC_H
+#define UDS_NUMERIC_H
+
+#include <asm/unaligned.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+
+/*
+ * These utilities encode or decode a number from an offset in a larger data buffer and then
+ * advance the offset pointer to the next field in the buffer.
+ */
+
+/* Read a little-endian 64-bit value at buffer[*offset] into *decoded, then advance *offset. */
+static inline void decode_s64_le(const u8 *buffer, size_t *offset, s64 *decoded)
+{
+	const u8 *field = buffer + *offset;
+
+	*decoded = get_unaligned_le64(field);
+	*offset += sizeof(*decoded);
+}
+
+/* Write to_encode as a little-endian 64-bit value at data[*offset], then advance *offset. */
+static inline void encode_s64_le(u8 *data, size_t *offset, s64 to_encode)
+{
+	u8 *field = data + *offset;
+
+	put_unaligned_le64(to_encode, field);
+	*offset += sizeof(to_encode);
+}
+
+/* Read a little-endian 64-bit value at buffer[*offset] into *decoded, then advance *offset. */
+static inline void decode_u64_le(const u8 *buffer, size_t *offset, u64 *decoded)
+{
+	const u8 *field = buffer + *offset;
+
+	*decoded = get_unaligned_le64(field);
+	*offset += sizeof(*decoded);
+}
+
+/* Write to_encode as a little-endian 64-bit value at data[*offset], then advance *offset. */
+static inline void encode_u64_le(u8 *data, size_t *offset, u64 to_encode)
+{
+	u8 *field = data + *offset;
+
+	put_unaligned_le64(to_encode, field);
+	*offset += sizeof(to_encode);
+}
+
+/* Read a little-endian 32-bit value at buffer[*offset] into *decoded, then advance *offset. */
+static inline void decode_s32_le(const u8 *buffer, size_t *offset, s32 *decoded)
+{
+	const u8 *field = buffer + *offset;
+
+	*decoded = get_unaligned_le32(field);
+	*offset += sizeof(*decoded);
+}
+
+/* Write to_encode as a little-endian 32-bit value at data[*offset], then advance *offset. */
+static inline void encode_s32_le(u8 *data, size_t *offset, s32 to_encode)
+{
+	u8 *field = data + *offset;
+
+	put_unaligned_le32(to_encode, field);
+	*offset += sizeof(to_encode);
+}
+
+/* Read a little-endian 32-bit value at buffer[*offset] into *decoded, then advance *offset. */
+static inline void decode_u32_le(const u8 *buffer, size_t *offset, u32 *decoded)
+{
+	const u8 *field = buffer + *offset;
+
+	*decoded = get_unaligned_le32(field);
+	*offset += sizeof(*decoded);
+}
+
+/* Write to_encode as a little-endian 32-bit value at data[*offset], then advance *offset. */
+static inline void encode_u32_le(u8 *data, size_t *offset, u32 to_encode)
+{
+	u8 *field = data + *offset;
+
+	put_unaligned_le32(to_encode, field);
+	*offset += sizeof(to_encode);
+}
+
+/* Read a little-endian 16-bit value at buffer[*offset] into *decoded, then advance *offset. */
+static inline void decode_u16_le(const u8 *buffer, size_t *offset, u16 *decoded)
+{
+	const u8 *field = buffer + *offset;
+
+	*decoded = get_unaligned_le16(field);
+	*offset += sizeof(*decoded);
+}
+
+/* Write to_encode as a little-endian 16-bit value at data[*offset], then advance *offset. */
+static inline void encode_u16_le(u8 *data, size_t *offset, u16 to_encode)
+{
+	u8 *field = data + *offset;
+
+	put_unaligned_le16(to_encode, field);
+	*offset += sizeof(to_encode);
+}
+
+#endif /* UDS_NUMERIC_H */
diff --git a/drivers/md/dm-vdo/packer.c b/drivers/md/dm-vdo/packer.c
new file mode 100644
index 000000000000..16cf29b4c90a
--- /dev/null
+++ b/drivers/md/dm-vdo/packer.c
@@ -0,0 +1,780 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#include "packer.h"
+
+#include <linux/atomic.h>
+#include <linux/blkdev.h>
+
+#include "logger.h"
+#include "memory-alloc.h"
+#include "permassert.h"
+#include "string-utils.h"
+
+#include "admin-state.h"
+#include "completion.h"
+#include "constants.h"
+#include "data-vio.h"
+#include "dedupe.h"
+#include "encodings.h"
+#include "io-submitter.h"
+#include "physical-zone.h"
+#include "status-codes.h"
+#include "vdo.h"
+#include "vio.h"
+
+/* The version that vdo_get_compressed_block_fragment() requires in a compressed block header. */
+static const struct version_number COMPRESSED_BLOCK_1_0 = {
+ .major_version = 1,
+ .minor_version = 0,
+};
+
+/*
+ * NOTE(review): presumably two 4-byte version fields plus one 2-byte size per compression slot;
+ * confirm against the compressed block header definition.
+ */
+#define COMPRESSED_BLOCK_1_0_SIZE (4 + 4 + (2 * VDO_MAX_COMPRESSION_SLOTS))
+
+/**
+ * vdo_get_compressed_block_fragment() - Get a reference to a compressed fragment from a compressed
+ *                                       block.
+ * @mapping_state: [in] The mapping state for the look up.
+ * @block: [in] The compressed block that was read from disk.
+ * @fragment_offset: [out] The offset of the fragment within the compressed block.
+ * @fragment_size: [out] The size of the fragment.
+ *
+ * The output parameters are only written when VDO_SUCCESS is returned.
+ *
+ * Return: If a valid compressed fragment is found, VDO_SUCCESS; otherwise, VDO_INVALID_FRAGMENT if
+ *         the fragment is invalid.
+ */
+int vdo_get_compressed_block_fragment(enum block_mapping_state mapping_state,
+				      struct compressed_block *block,
+				      u16 *fragment_offset, u16 *fragment_size)
+{
+	u16 compressed_size;
+	/*
+	 * The running offset is kept in a type wider than the 16-bit on-disk sizes. With a u16
+	 * accumulator, a corrupt size table could wrap the sum back into the valid range and slip
+	 * past the bounds checks below.
+	 */
+	unsigned int offset = 0;
+	unsigned int i;
+	u8 slot;
+	struct version_number version;
+
+	if (!vdo_is_state_compressed(mapping_state))
+		return VDO_INVALID_FRAGMENT;
+
+	version = vdo_unpack_version_number(block->header.version);
+	if (!vdo_are_same_version(version, COMPRESSED_BLOCK_1_0))
+		return VDO_INVALID_FRAGMENT;
+
+	slot = mapping_state - VDO_MAPPING_STATE_COMPRESSED_BASE;
+	if (slot >= VDO_MAX_COMPRESSION_SLOTS)
+		return VDO_INVALID_FRAGMENT;
+
+	compressed_size = __le16_to_cpu(block->header.sizes[slot]);
+
+	/* The fragment starts after all of the fragments in the preceding slots. */
+	for (i = 0; i < slot; i++) {
+		offset += __le16_to_cpu(block->header.sizes[i]);
+		if (offset >= VDO_COMPRESSED_BLOCK_DATA_SIZE)
+			return VDO_INVALID_FRAGMENT;
+	}
+
+	if ((offset + compressed_size) > VDO_COMPRESSED_BLOCK_DATA_SIZE)
+		return VDO_INVALID_FRAGMENT;
+
+	/* The checks above bound offset by the data size, so it fits the u16 output. */
+	*fragment_offset = offset;
+	*fragment_size = compressed_size;
+	return VDO_SUCCESS;
+}
+
+/**
+ * assert_on_packer_thread() - Check that we are on the packer thread.
+ * @packer: The packer.
+ * @caller: The function which is asserting.
+ *
+ * Compares the current callback thread id with the packer's thread id and passes the result to
+ * VDO_ASSERT_LOG_ONLY(), with the caller's name as an argument for the assertion message.
+ */
+static inline void assert_on_packer_thread(struct packer *packer, const char *caller)
+{
+ VDO_ASSERT_LOG_ONLY((vdo_get_callback_thread_id() == packer->thread_id),
+ "%s() called from packer thread", caller);
+}
+
+/**
+ * insert_in_sorted_list() - Move a bin to its sorted position in the packer's bin list.
+ * @packer: The packer.
+ * @bin: The bin to reposition.
+ *
+ * The list is kept in ascending order of free space. Every bin is already a member of the list,
+ * so this relocates the bin to just before the first bin with more free space, or to the end of
+ * the list if there is no such bin.
+ */
+static void insert_in_sorted_list(struct packer *packer, struct packer_bin *bin)
+{
+	struct packer_bin *candidate;
+
+	list_for_each_entry(candidate, &packer->bins, list) {
+		if (candidate->free_space <= bin->free_space)
+			continue;
+
+		list_move_tail(&bin->list, &candidate->list);
+		return;
+	}
+
+	list_move_tail(&bin->list, &packer->bins);
+}
+
+/**
+ * make_bin() - Allocate an empty bin and append it to the packer's list.
+ * @packer: The packer.
+ *
+ * Return: VDO_SUCCESS or the error from the allocation.
+ */
+static int __must_check make_bin(struct packer *packer)
+{
+	struct packer_bin *new_bin;
+	int result = vdo_allocate_extended(struct packer_bin, VDO_MAX_COMPRESSION_SLOTS,
+					   struct vio *, __func__, &new_bin);
+
+	if (result != VDO_SUCCESS)
+		return result;
+
+	/* A new bin starts with the whole data area of a compressed block available. */
+	new_bin->free_space = VDO_COMPRESSED_BLOCK_DATA_SIZE;
+	INIT_LIST_HEAD(&new_bin->list);
+	list_add_tail(&new_bin->list, &packer->bins);
+
+	return VDO_SUCCESS;
+}
+
+/**
+ * vdo_make_packer() - Make a new block packer.
+ * @vdo: The vdo to which this packer belongs.
+ * @bin_count: The number of partial bins to keep in memory.
+ * @packer_ptr: A pointer to hold the new packer.
+ *
+ * On any failure after the packer itself has been allocated, the partially constructed packer is
+ * freed and *packer_ptr is left untouched.
+ *
+ * Return: VDO_SUCCESS or an error
+ */
+int vdo_make_packer(struct vdo *vdo, block_count_t bin_count, struct packer **packer_ptr)
+{
+	struct packer *packer;
+	block_count_t bins_made;
+	int result;
+
+	result = vdo_allocate(1, struct packer, __func__, &packer);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	packer->thread_id = vdo->thread_config.packer_thread;
+	packer->size = bin_count;
+	INIT_LIST_HEAD(&packer->bins);
+	vdo_set_admin_state_code(&packer->state, VDO_ADMIN_STATE_NORMAL_OPERATION);
+
+	for (bins_made = 0; bins_made < bin_count; bins_made++) {
+		result = make_bin(packer);
+		if (result != VDO_SUCCESS)
+			goto fail;
+	}
+
+	/*
+	 * The canceled bin can hold up to half the number of user vios. Every canceled vio in the
+	 * bin must have a canceler for which it is waiting, and any canceler will only have
+	 * canceled one lock holder at a time.
+	 */
+	result = vdo_allocate_extended(struct packer_bin, MAXIMUM_VDO_USER_VIOS / 2,
+				       struct vio *, __func__, &packer->canceled_bin);
+	if (result != VDO_SUCCESS)
+		goto fail;
+
+	result = vdo_make_default_thread(vdo, packer->thread_id);
+	if (result != VDO_SUCCESS)
+		goto fail;
+
+	*packer_ptr = packer;
+	return VDO_SUCCESS;
+
+fail:
+	vdo_free_packer(packer);
+	return result;
+}
+
+/**
+ * vdo_free_packer() - Free a block packer.
+ * @packer: The packer to free; may be NULL, in which case nothing is done.
+ *
+ * Frees every bin on the packer's list, the canceled bin, and the packer itself.
+ */
+void vdo_free_packer(struct packer *packer)
+{
+	if (packer == NULL)
+		return;
+
+	/* Unlink and free bins from the front of the list until none remain. */
+	while (!list_empty(&packer->bins)) {
+		struct packer_bin *bin = list_first_entry(&packer->bins,
+							  struct packer_bin, list);
+
+		list_del_init(&bin->list);
+		vdo_free(bin);
+	}
+
+	vdo_free(vdo_forget(packer->canceled_bin));
+	vdo_free(packer);
+}
+
+/**
+ * get_packer_from_data_vio() - Get the packer from a data_vio.
+ * @data_vio: The data_vio.
+ *
+ * Return: The packer of the vdo returned by vdo_from_data_vio() for this data_vio.
+ */
+static inline struct packer *get_packer_from_data_vio(struct data_vio *data_vio)
+{
+	struct vdo *vdo = vdo_from_data_vio(data_vio);
+
+	return vdo->packer;
+}
+
+/**
+ * vdo_get_packer_statistics() - Get the current statistics from the packer.
+ * @packer: The packer to query.
+ *
+ * Each counter is read individually with READ_ONCE().
+ *
+ * Return: a copy of the current statistics for the packer.
+ */
+struct packer_statistics vdo_get_packer_statistics(const struct packer *packer)
+{
+	const struct packer_statistics *current_stats = &packer->statistics;
+	struct packer_statistics snapshot = {
+		.compressed_fragments_written =
+			READ_ONCE(current_stats->compressed_fragments_written),
+		.compressed_blocks_written =
+			READ_ONCE(current_stats->compressed_blocks_written),
+		.compressed_fragments_in_packer =
+			READ_ONCE(current_stats->compressed_fragments_in_packer),
+	};
+
+	return snapshot;
+}
+
+/**
+ * abort_packing() - Give up on packing a data_vio and let it continue with its write.
+ * @data_vio: The data_vio to abort.
+ *
+ * Takes the data_vio's fragment out of the in-packer count before handing the data_vio to
+ * write_data_vio().
+ */
+static void abort_packing(struct data_vio *data_vio)
+{
+	struct packer_statistics *stats = &get_packer_from_data_vio(data_vio)->statistics;
+
+	WRITE_ONCE(stats->compressed_fragments_in_packer,
+		   stats->compressed_fragments_in_packer - 1);
+
+	write_data_vio(data_vio);
+}
+
+/**
+ * release_compressed_write_waiter() - Update a data_vio for which a successful compressed write
+ * has completed and send it on its way.
+ * @data_vio: The data_vio to release.
+ * @allocation: The allocation to which the compressed block was written.
+ *
+ * The data_vio's new mapping is set to the allocation's pbn and zone, with a mapping state
+ * derived from the data_vio's slot in the compressed block.
+ */
+static void release_compressed_write_waiter(struct data_vio *data_vio,
+ struct allocation *allocation)
+{
+ /* The mapping state records which slot of the compressed block holds this fragment. */
+ data_vio->new_mapped = (struct zoned_pbn) {
+ .pbn = allocation->pbn,
+ .zone = allocation->zone,
+ .state = data_vio->compression.slot + VDO_MAPPING_STATE_COMPRESSED_BASE,
+ };
+
+ vdo_share_compressed_write_lock(data_vio, allocation->lock);
+ update_metadata_for_data_vio_write(data_vio, allocation->lock);
+}
+
+/**
+ * finish_compressed_write() - Finish a compressed block write.
+ * @completion: The compressed write completion.
+ *
+ * This callback is registered in compressed_write_end_io(). It releases every data_vio in the
+ * agent's batch, and then the agent itself, using the agent's allocation.
+ */
+static void finish_compressed_write(struct vdo_completion *completion)
+{
+ struct data_vio *agent = as_data_vio(completion);
+ struct data_vio *client, *next;
+
+ assert_data_vio_in_allocated_zone(agent);
+
+ /*
+ * Process all the non-agent waiters first to ensure that the pbn lock can not be released
+ * until all of them have had a chance to journal their increfs.
+ */
+ for (client = agent->compression.next_in_batch; client != NULL; client = next) {
+ /* Fetch the successor before the client is released and sent on its way. */
+ next = client->compression.next_in_batch;
+ release_compressed_write_waiter(client, &agent->allocation);
+ }
+
+ /* Put back the ordinary error handler which write_bin() replaced for the compressed write. */
+ completion->error_handler = handle_data_vio_error;
+ release_compressed_write_waiter(agent, &agent->allocation);
+}
+
+/**
+ * handle_compressed_write_error() - Handle a failure to write a compressed block.
+ * @completion: The completion of the agent data_vio whose compressed write failed.
+ *
+ * Installed by write_bin() as the agent's error handler. Every data_vio in the agent's batch, and
+ * then the agent itself, is handed to write_data_vio() instead of sharing the compressed block.
+ */
+static void handle_compressed_write_error(struct vdo_completion *completion)
+{
+ struct data_vio *agent = as_data_vio(completion);
+ struct allocation *allocation = &agent->allocation;
+ struct data_vio *client, *next;
+
+ /*
+ * NOTE(review): presumably this requeues the completion onto the allocation zone's thread
+ * and returns true when it is not already running there -- confirm against completion.c.
+ */
+ if (vdo_requeue_completion_if_needed(completion, allocation->zone->thread_id))
+ return;
+
+ update_vio_error_stats(as_vio(completion),
+ "Completing compressed write vio for physical block %llu with error",
+ (unsigned long long) allocation->pbn);
+
+ for (client = agent->compression.next_in_batch; client != NULL; client = next) {
+ /* Fetch the successor before the client is sent on its way. */
+ next = client->compression.next_in_batch;
+ write_data_vio(client);
+ }
+
+ /* Now that we've released the batch from the packer, forget the error and continue on. */
+ vdo_reset_completion(completion);
+ completion->error_handler = handle_data_vio_error;
+ write_data_vio(agent);
+}
+
+/**
+ * add_to_bin() - Append a data_vio to a packer_bin which is known to have room for it.
+ * @bin: The bin in which to put the data_vio.
+ * @data_vio: The data_vio to add.
+ *
+ * The data_vio takes the next unused slot of the bin and remembers both the bin and the slot.
+ */
+static void add_to_bin(struct packer_bin *bin, struct data_vio *data_vio)
+{
+	slot_number_t slot = bin->slots_used++;
+
+	data_vio->compression.bin = bin;
+	data_vio->compression.slot = slot;
+	bin->incoming[slot] = data_vio;
+}
+
+/**
+ * remove_from_bin() - Take the next data_vio whose compression has not been canceled from a bin.
+ * @packer: The packer.
+ * @bin: The bin from which to get a data_vio.
+ *
+ * Data_vios are taken from the highest used slot downward. Any canceled data_vio encountered on
+ * the way is moved to the packer's canceled bin. Once the bin has been emptied, its free space is
+ * reset to a whole compressed block's worth of data.
+ *
+ * Return: An uncanceled data_vio from the bin or NULL if there are none.
+ */
+static struct data_vio *remove_from_bin(struct packer *packer, struct packer_bin *bin)
+{
+	while (bin->slots_used > 0) {
+		struct data_vio *candidate = bin->incoming[--bin->slots_used];
+
+		if (advance_data_vio_compression_stage(candidate).may_not_compress) {
+			/* Canceled: park it in the canceled bin and keep looking. */
+			add_to_bin(packer->canceled_bin, candidate);
+			continue;
+		}
+
+		candidate->compression.bin = NULL;
+		return candidate;
+	}
+
+	/* Nothing is left in the bin, so all of its space is available again. */
+	bin->free_space = VDO_COMPRESSED_BLOCK_DATA_SIZE;
+	return NULL;
+}
+
+/**
+ * initialize_compressed_block() - Prepare the header of the agent's compressed block.
+ * @block: The compressed block to initialize.
+ * @size: The size of the agent's fragment.
+ *
+ * The compressor has already left the agent's fragment at the start of the block's data field, so
+ * no data is copied here. Only the header version and the size of the first fragment are set.
+ */
+static void initialize_compressed_block(struct compressed_block *block, u16 size)
+{
+	/* Changing the header length would change the on-disk block layout; refuse to build. */
+	BUILD_BUG_ON(sizeof(struct compressed_block_header) != COMPRESSED_BLOCK_1_0_SIZE);
+
+	/* Slot 0 always belongs to the agent. */
+	block->header.sizes[0] = __cpu_to_le16(size);
+	block->header.version = vdo_pack_version_number(COMPRESSED_BLOCK_1_0);
+}
+
+/**
+ * pack_fragment() - Pack a data_vio's fragment into the compressed block in which it is already
+ * known to fit.
+ * @compression: The agent's compression_state to pack in to.
+ * @data_vio: The data_vio to pack.
+ * @offset: The offset into the compressed block at which to pack the fragment.
+ * @slot: The slot of the compressed block which the fragment will occupy.
+ * @block: The compressed block which will be written out when the batch is fully packed.
+ *
+ * Return: The new amount of space used.
+ */
+static block_size_t __must_check pack_fragment(struct compression_state *compression,
+						struct data_vio *data_vio,
+						block_size_t offset, slot_number_t slot,
+						struct compressed_block *block)
+{
+	struct compression_state *client = &data_vio->compression;
+	char *source = client->block->data;
+
+	/* Copy the fragment into place and record its length in the header. */
+	memcpy(&block->data[offset], source, client->size);
+	block->header.sizes[slot] = __cpu_to_le16(client->size);
+	client->slot = slot;
+
+	/* Push the data_vio onto the front of the agent's batch list. */
+	client->next_in_batch = compression->next_in_batch;
+	compression->next_in_batch = data_vio;
+
+	return (offset + client->size);
+}
+
+/**
+ * compressed_write_end_io() - The bio_end_io for a compressed block write.
+ * @bio: The bio for the compressed write.
+ *
+ * Registers finish_compressed_write() as the agent's callback and continues the agent with the
+ * bio's status converted to an errno.
+ */
+static void compressed_write_end_io(struct bio *bio)
+{
+	struct data_vio *agent = vio_as_data_vio(bio->bi_private);
+	int error;
+
+	vdo_count_completed_bios(bio);
+	set_data_vio_allocated_zone_callback(agent, finish_compressed_write);
+
+	error = blk_status_to_errno(bio->bi_status);
+	continue_data_vio_with_error(agent, error);
+}
+
+/**
+ * write_bin() - Write out a bin.
+ * @packer: The packer.
+ * @bin: The bin to write.
+ *
+ * The first uncanceled data_vio removed from the bin becomes the agent. The fragment of every
+ * other uncanceled data_vio in the bin is packed into the agent's compressed block, which the
+ * agent then submits. If the agent turns out to be alone, it leaves the packer via
+ * abort_packing() and nothing is written.
+ */
+static void write_bin(struct packer *packer, struct packer_bin *bin)
+{
+ int result;
+ block_size_t offset;
+ slot_number_t slot = 1;
+ struct compression_state *compression;
+ struct compressed_block *block;
+ struct data_vio *agent = remove_from_bin(packer, bin);
+ struct data_vio *client;
+ struct packer_statistics *stats;
+
+ /* The bin held no uncanceled data_vio, so there is nothing to write. */
+ if (agent == NULL)
+ return;
+
+ /* The agent's own fragment occupies slot 0; the others are packed in after it. */
+ compression = &agent->compression;
+ compression->slot = 0;
+ block = compression->block;
+ initialize_compressed_block(block, compression->size);
+ offset = compression->size;
+
+ while ((client = remove_from_bin(packer, bin)) != NULL)
+ offset = pack_fragment(compression, client, offset, slot++, block);
+
+ /*
+ * If the batch contains only a single vio, then we save nothing by saving the compressed
+ * form. Continue processing the single vio in the batch.
+ */
+ if (slot == 1) {
+ abort_packing(agent);
+ return;
+ }
+
+ if (slot < VDO_MAX_COMPRESSION_SLOTS) {
+ /* Clear out the sizes of the unused slots */
+ memset(&block->header.sizes[slot], 0,
+ (VDO_MAX_COMPRESSION_SLOTS - slot) * sizeof(__le16));
+ }
+
+ /* From here on, an error must release the whole batch rather than just the agent. */
+ agent->vio.completion.error_handler = handle_compressed_write_error;
+ /*
+ * NOTE(review): both failure paths below return before the statistics update at the end
+ * of this function, and handle_compressed_write_error() calls write_data_vio() rather
+ * than abort_packing(), so compressed_fragments_in_packer does not appear to be
+ * decremented for these fragments -- confirm whether that is intended.
+ */
+ if (vdo_is_read_only(vdo_from_data_vio(agent))) {
+ continue_data_vio_with_error(agent, VDO_READ_ONLY);
+ return;
+ }
+
+ result = vio_reset_bio(&agent->vio, (char *) block, compressed_write_end_io,
+ REQ_OP_WRITE, agent->allocation.pbn);
+ if (result != VDO_SUCCESS) {
+ continue_data_vio_with_error(agent, result);
+ return;
+ }
+
+ /*
+ * Once the compressed write is submitted, the fragments are no longer in the packer, so
+ * update stats now. At this point slot is the number of fragments in the batch, including
+ * the agent's.
+ */
+ stats = &packer->statistics;
+ WRITE_ONCE(stats->compressed_fragments_in_packer,
+ (stats->compressed_fragments_in_packer - slot));
+ WRITE_ONCE(stats->compressed_fragments_written,
+ (stats->compressed_fragments_written + slot));
+ WRITE_ONCE(stats->compressed_blocks_written,
+ stats->compressed_blocks_written + 1);
+
+ vdo_submit_data_vio(agent);
+}
+
+/**
+ * add_data_vio_to_packer_bin() - Add a data_vio to a bin's incoming queue
+ * @packer: The packer.
+ * @bin: The bin to which to add the data_vio.
+ * @data_vio: The data_vio to add to the bin's queue.
+ *
+ * Writes the bin out first if the data_vio does not fit, adds the data_vio, writes the bin out
+ * again if that filled it, and finally restores the bin's position in the sorted list.
+ */
+static void add_data_vio_to_packer_bin(struct packer *packer, struct packer_bin *bin,
+ struct data_vio *data_vio)
+{
+ /* If the selected bin doesn't have room, start a new batch to make room. */
+ if (bin->free_space < data_vio->compression.size)
+ write_bin(packer, bin);
+
+ add_to_bin(bin, data_vio);
+ bin->free_space -= data_vio->compression.size;
+
+ /* If we happen to exactly fill the bin, start a new batch. */
+ if ((bin->slots_used == VDO_MAX_COMPRESSION_SLOTS) ||
+ (bin->free_space == 0))
+ write_bin(packer, bin);
+
+ /* Now that we've finished changing the free space, restore the sort order. */
+ insert_in_sorted_list(packer, bin);
+}
+
+/**
+ * select_bin() - Select the bin that should be used to pack the compressed data in a data_vio with
+ * other data_vios.
+ * @packer: The packer.
+ * @data_vio: The data_vio.
+ *
+ * Return: The bin to use, which may first have to be written out to make room, or NULL if the
+ * data_vio should not be packed at all.
+ */
+static struct packer_bin * __must_check select_bin(struct packer *packer,
+ struct data_vio *data_vio)
+{
+ /*
+ * First best fit: select the bin with the least free space that has enough room for the
+ * compressed data in the data_vio. The first fitting bin found is that bin because the
+ * list is kept sorted by free space.
+ */
+ struct packer_bin *bin, *fullest_bin;
+
+ list_for_each_entry(bin, &packer->bins, list) {
+ if (bin->free_space >= data_vio->compression.size)
+ return bin;
+ }
+
+ /*
+ * None of the bins have enough space for the data_vio. We're not allowed to create new
+ * bins, so we have to overflow one of the existing bins. It's pretty intuitive to select
+ * the fullest bin, since that "wastes" the least amount of free space in the compressed
+ * block. But if the space currently used in the fullest bin is smaller than the compressed
+ * size of the incoming block, it seems wrong to force that bin to write when giving up on
+ * compressing the incoming data_vio would likewise "waste" the least amount of free space.
+ */
+ fullest_bin = list_first_entry(&packer->bins, struct packer_bin, list);
+ if (data_vio->compression.size >=
+ (VDO_COMPRESSED_BLOCK_DATA_SIZE - fullest_bin->free_space))
+ return NULL;
+
+ /*
+ * The fullest bin doesn't have room, but writing it out and starting a new batch with the
+ * incoming data_vio will increase the packer's free space.
+ */
+ return fullest_bin;
+}
+
+/**
+ * vdo_attempt_packing() - Attempt to rewrite the data in this data_vio as part of a compressed
+ * block.
+ * @data_vio: The data_vio to pack.
+ *
+ * The data_vio either ends up in a packer bin or leaves the packer again via abort_packing().
+ */
+void vdo_attempt_packing(struct data_vio *data_vio)
+{
+ int result;
+ struct packer_bin *bin;
+ struct data_vio_compression_status status = get_data_vio_compression_status(data_vio);
+ struct packer *packer = get_packer_from_data_vio(data_vio);
+
+ assert_on_packer_thread(packer, __func__);
+
+ /*
+ * NOTE(review): when this assertion fails the function returns without continuing the
+ * data_vio in any way -- confirm that this is the intended handling.
+ */
+ result = VDO_ASSERT((status.stage == DATA_VIO_COMPRESSING),
+ "attempt to pack data_vio not ready for packing, stage: %u",
+ status.stage);
+ if (result != VDO_SUCCESS)
+ return;
+
+ /*
+ * Increment the counter whether or not this data_vio ends up being packed, since
+ * abort_packing() always decrements it.
+ */
+ WRITE_ONCE(packer->statistics.compressed_fragments_in_packer,
+ packer->statistics.compressed_fragments_in_packer + 1);
+
+ /*
+ * If packing of this data_vio is disallowed for administrative reasons, give up before
+ * making any state changes.
+ */
+ if (!vdo_is_state_normal(&packer->state) ||
+ (data_vio->flush_generation < packer->flush_generation)) {
+ abort_packing(data_vio);
+ return;
+ }
+
+ /*
+ * The advance_data_vio_compression_stage() check here verifies that the data_vio is
+ * allowed to be compressed (if it has already been canceled, we'll fall out here). Once
+ * the data_vio is in the DATA_VIO_PACKING state, it must be guaranteed to be put in a bin
+ * before any more requests can be processed by the packer thread. Otherwise, a canceling
+ * data_vio could attempt to remove the canceled data_vio from the packer and fail to
+ * rendezvous with it. Thus, we must call select_bin() first to ensure that we will
+ * actually add the data_vio to a bin before advancing to the DATA_VIO_PACKING stage.
+ */
+ bin = select_bin(packer, data_vio);
+ if ((bin == NULL) ||
+ (advance_data_vio_compression_stage(data_vio).stage != DATA_VIO_PACKING)) {
+ abort_packing(data_vio);
+ return;
+ }
+
+ add_data_vio_to_packer_bin(packer, bin, data_vio);
+}
+
+/**
+ * check_for_drain_complete() - Finish a drain of the packer if nothing remains to wait for.
+ * @packer: The packer.
+ *
+ * A draining packer is finished once its canceled bin no longer holds any data_vios.
+ */
+static void check_for_drain_complete(struct packer *packer)
+{
+	if (!vdo_is_state_draining(&packer->state))
+		return;
+
+	/* Canceled data_vios are still waiting to rendezvous with their cancelers. */
+	if (packer->canceled_bin->slots_used != 0)
+		return;
+
+	vdo_finish_draining(&packer->state);
+}
+
+/**
+ * write_all_non_empty_bins() - Write out all non-empty bins on behalf of a flush or suspend.
+ * @packer: The packer being flushed.
+ *
+ * After every bin has been written, the packer is checked for a completed drain.
+ */
+static void write_all_non_empty_bins(struct packer *packer)
+{
+	struct packer_bin *current_bin;
+
+	/*
+	 * There is no need to re-sort the bins while walking the list: this loop leaves every
+	 * bin with the same amount of free space, so any ordering is a sorted one.
+	 */
+	list_for_each_entry(current_bin, &packer->bins, list) {
+		write_bin(packer, current_bin);
+	}
+
+	check_for_drain_complete(packer);
+}
+
+/**
+ * vdo_flush_packer() - Request that the packer flush asynchronously.
+ * @packer: The packer to flush.
+ *
+ * All bins with at least two compressed data blocks will be written out, and any solitary pending
+ * VIOs will be released from the packer. While flushing is in progress, any VIOs submitted to
+ * vdo_attempt_packing() will be continued immediately without attempting to pack them. Nothing is
+ * done unless the packer is in a normal state.
+ */
+void vdo_flush_packer(struct packer *packer)
+{
+	assert_on_packer_thread(packer, __func__);
+
+	if (!vdo_is_state_normal(&packer->state))
+		return;
+
+	write_all_non_empty_bins(packer);
+}
+
+/**
+ * vdo_remove_lock_holder_from_packer() - Remove a lock holder from the packer.
+ * @completion: The data_vio which needs a lock held by a data_vio in the packer. The data_vio's
+ * compression.lock_holder field will point to the data_vio to remove.
+ *
+ * The lock holder is taken out of whichever bin holds it (a regular bin or the canceled bin) and
+ * leaves the packer via abort_packing().
+ */
+void vdo_remove_lock_holder_from_packer(struct vdo_completion *completion)
+{
+ struct data_vio *data_vio = as_data_vio(completion);
+ struct packer *packer = get_packer_from_data_vio(data_vio);
+ struct data_vio *lock_holder;
+ struct packer_bin *bin;
+ slot_number_t slot;
+
+ assert_data_vio_in_packer_zone(data_vio);
+
+ lock_holder = vdo_forget(data_vio->compression.lock_holder);
+ bin = lock_holder->compression.bin;
+ /*
+ * NOTE(review): this assertion only logs; a NULL bin would still be dereferenced by the
+ * code below -- confirm that a lock holder can never arrive here without a bin.
+ */
+ VDO_ASSERT_LOG_ONLY((bin != NULL), "data_vio in packer has a bin");
+
+ /*
+ * Fill the vacated slot with the data_vio from the last used slot so that the used slots
+ * stay contiguous, and tell the moved data_vio about its new slot.
+ */
+ slot = lock_holder->compression.slot;
+ bin->slots_used--;
+ if (slot < bin->slots_used) {
+ bin->incoming[slot] = bin->incoming[bin->slots_used];
+ bin->incoming[slot]->compression.slot = slot;
+ }
+
+ lock_holder->compression.bin = NULL;
+ lock_holder->compression.slot = 0;
+
+ /* Only a bin other than the canceled bin gets its space back and is re-sorted. */
+ if (bin != packer->canceled_bin) {
+ bin->free_space += lock_holder->compression.size;
+ insert_in_sorted_list(packer, bin);
+ }
+
+ abort_packing(lock_holder);
+ /* Removing a data_vio from the canceled bin may have been the last thing a drain awaited. */
+ check_for_drain_complete(packer);
+}
+
+/**
+ * vdo_increment_packer_flush_generation() - Advance the packer to the next flush generation.
+ * @packer: The packer.
+ *
+ * The packer is also flushed, so that any VIOs from previous generations will exit the packer.
+ */
+void vdo_increment_packer_flush_generation(struct packer *packer)
+{
+	assert_on_packer_thread(packer, __func__);
+
+	packer->flush_generation += 1;
+	vdo_flush_packer(packer);
+}
+
+/**
+ * initiate_drain() - Initiate a drain.
+ * @state: The admin state embedded in the packer which is to be drained.
+ *
+ * Implements vdo_admin_initiator_fn.
+ */
+static void initiate_drain(struct admin_state *state)
+{
+	/* The admin state lives inside its packer, so recover the packer from it. */
+	write_all_non_empty_bins(container_of(state, struct packer, state));
+}
+
+/**
+ * vdo_drain_packer() - Drain the packer by preventing any more VIOs from entering the packer and
+ * then flushing.
+ * @packer: The packer to drain.
+ * @completion: The completion to finish when the packer has drained.
+ *
+ * The drain is started as a suspend, with initiate_drain() as its initiator.
+ */
+void vdo_drain_packer(struct packer *packer, struct vdo_completion *completion)
+{
+	struct admin_state *state = &packer->state;
+
+	assert_on_packer_thread(packer, __func__);
+	vdo_start_draining(state, VDO_ADMIN_STATE_SUSPENDING, completion, initiate_drain);
+}
+
+/**
+ * vdo_resume_packer() - Resume a packer which has been suspended.
+ * @packer: The packer to resume.
+ * @parent: The completion to finish when the packer has resumed.
+ *
+ * The parent is continued with the result of vdo_resume_if_quiescent().
+ */
+void vdo_resume_packer(struct packer *packer, struct vdo_completion *parent)
+{
+	int result;
+
+	assert_on_packer_thread(packer, __func__);
+
+	result = vdo_resume_if_quiescent(&packer->state);
+	vdo_continue_completion(parent, result);
+}
+
+/**
+ * dump_packer_bin() - Log the occupancy of one bin, unless the bin is empty.
+ * @bin: The bin to dump.
+ * @canceled: Whether the bin is the packer's canceled bin; only affects the label logged.
+ */
+static void dump_packer_bin(const struct packer_bin *bin, bool canceled)
+{
+	const char *label = canceled ? "Canceled" : "";
+
+	/* Empty bins are not worth a log line. */
+	if (bin->slots_used == 0)
+		return;
+
+	vdo_log_info(" %sBin slots_used=%u free_space=%zu",
+		     label, bin->slots_used, bin->free_space);
+
+	/*
+	 * FIXME: dump vios in bin->incoming? The vios should have been dumped from the vio pool.
+	 * Maybe just dump their addresses so it's clear they're here?
+	 */
+}
+
+/**
+ * vdo_dump_packer() - Log the state of the packer and of each of its non-empty bins.
+ * @packer: The packer.
+ *
+ * Context: dumps in a thread-unsafe fashion.
+ */
+void vdo_dump_packer(const struct packer *packer)
+{
+	struct packer_bin *current_bin;
+	unsigned long long generation = (unsigned long long) packer->flush_generation;
+	unsigned long long bin_count = (unsigned long long) packer->size;
+
+	vdo_log_info("packer");
+	vdo_log_info(" flushGeneration=%llu state %s packer_bin_count=%llu",
+		     generation, vdo_get_admin_state_code(&packer->state)->name, bin_count);
+
+	/* The regular bins come first, in list order, followed by the canceled bin. */
+	list_for_each_entry(current_bin, &packer->bins, list) {
+		dump_packer_bin(current_bin, false);
+	}
+
+	dump_packer_bin(packer->canceled_bin, true);
+}
diff --git a/drivers/md/dm-vdo/packer.h b/drivers/md/dm-vdo/packer.h
new file mode 100644
index 000000000000..0f3be44710b5
--- /dev/null
+++ b/drivers/md/dm-vdo/packer.h
@@ -0,0 +1,122 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#ifndef VDO_PACKER_H
+#define VDO_PACKER_H
+
+#include <linux/list.h>
+
+#include "admin-state.h"
+#include "constants.h"
+#include "encodings.h"
+#include "statistics.h"
+#include "types.h"
+#include "wait-queue.h"
+
+/*
+ * NOTE(review): presumably the bin_count handed to vdo_make_packer() by default -- confirm at the
+ * caller.
+ */
+enum {
+ DEFAULT_PACKER_BINS = 16,
+};
+
+/*
+ * The header of a compressed block. initialize_compressed_block() checks at build time that the
+ * size of this structure is COMPRESSED_BLOCK_1_0_SIZE.
+ */
+struct compressed_block_header {
+ /* Unsigned 32-bit major and minor versions, little-endian */
+ struct packed_version_number version;
+
+ /*
+ * List of unsigned 16-bit compressed block sizes, little-endian, one per slot. The sizes
+ * of unused slots are zeroed by write_bin().
+ */
+ __le16 sizes[VDO_MAX_COMPRESSION_SLOTS];
+} __packed;
+
+enum {
+ /* The number of bytes of a block which remain for fragment data after the header. */
+ VDO_COMPRESSED_BLOCK_DATA_SIZE = VDO_BLOCK_SIZE - sizeof(struct compressed_block_header),
+
+ /*
+ * A compressed block is only written if we can pack at least two fragments into it, so a
+ * fragment which fills the entire data portion of a compressed block is too big.
+ */
+ VDO_MAX_COMPRESSED_FRAGMENT_SIZE = VDO_COMPRESSED_BLOCK_DATA_SIZE - 1,
+};
+
+/*
+ * The compressed block overlay: the header followed by the fragment data, together exactly
+ * VDO_BLOCK_SIZE bytes.
+ */
+struct compressed_block {
+ struct compressed_block_header header;
+ char data[VDO_COMPRESSED_BLOCK_DATA_SIZE];
+} __packed;
+
+/*
+ * Each packer_bin holds an incomplete batch of data_vios that only partially fill a compressed
+ * block. The bins are kept in a ring sorted by the amount of unused space so the first bin with
+ * enough space to hold a newly-compressed data_vio can easily be found. When the bin fills up or
+ * is flushed, the first uncanceled data_vio in the bin is selected to be the agent for that bin.
+ * Upon entering the packer, each data_vio already has its compressed data in the first slot of the
+ * data_vio's compressed_block (overlaid on the data_vio's scratch_block). So the agent's fragment
+ * is already in place. The fragments for the other uncanceled data_vios in the bin are packed into
+ * the agent's compressed block. The agent then writes out the compressed block. If the write is
+ * successful, the agent shares its pbn lock with each of the other data_vios in its compressed
+ * block and sends each on its way. Finally the agent itself continues on the write path as before.
+ *
+ * There is one special bin which is used to hold data_vios which have been canceled and removed
+ * from their bin by the packer. These data_vios need to wait for the canceller to rendezvous with
+ * them and so they sit in this special bin.
+ */
+struct packer_bin {
+ /* List links for packer.bins */
+ struct list_head list;
+ /* The number of items in the bin */
+ slot_number_t slots_used;
+ /* The number of compressed block bytes remaining in the current batch */
+ size_t free_space;
+ /*
+ * The current partial batch of data_vios, waiting for more. This is a flexible array, so
+ * its capacity is fixed by whoever allocates the bin.
+ */
+ struct data_vio *incoming[];
+};
+
+/* The packer itself: a set of bins plus the state needed to flush, drain, and report on them. */
+struct packer {
+ /* The ID of the packer's callback thread */
+ thread_id_t thread_id;
+ /* The number of bins, as requested when the packer was made */
+ block_count_t size;
+ /*
+ * A list of all packer_bins, kept sorted by free_space with the fullest bin first (the
+ * canceled bin is held separately, below)
+ */
+ struct list_head bins;
+ /*
+ * A bin to hold data_vios which were canceled out of the packer and are waiting to
+ * rendezvous with the canceling data_vio.
+ */
+ struct packer_bin *canceled_bin;
+
+ /* The current flush generation */
+ sequence_number_t flush_generation;
+
+ /* The administrative state of the packer */
+ struct admin_state state;
+
+ /* Statistics are only updated on the packer thread, but are accessed from other threads */
+ struct packer_statistics statistics;
+};
+
+int vdo_get_compressed_block_fragment(enum block_mapping_state mapping_state,
+ struct compressed_block *block,
+ u16 *fragment_offset, u16 *fragment_size);
+
+int __must_check vdo_make_packer(struct vdo *vdo, block_count_t bin_count,
+ struct packer **packer_ptr);
+
+void vdo_free_packer(struct packer *packer);
+
+struct packer_statistics __must_check vdo_get_packer_statistics(const struct packer *packer);
+
+void vdo_attempt_packing(struct data_vio *data_vio);
+
+void vdo_flush_packer(struct packer *packer);
+
+void vdo_remove_lock_holder_from_packer(struct vdo_completion *completion);
+
+void vdo_increment_packer_flush_generation(struct packer *packer);
+
+void vdo_drain_packer(struct packer *packer, struct vdo_completion *completion);
+
+void vdo_resume_packer(struct packer *packer, struct vdo_completion *parent);
+
+void vdo_dump_packer(const struct packer *packer);
+
+#endif /* VDO_PACKER_H */
diff --git a/drivers/md/dm-vdo/permassert.c b/drivers/md/dm-vdo/permassert.c
new file mode 100644
index 000000000000..bf9eccea1cb3
--- /dev/null
+++ b/drivers/md/dm-vdo/permassert.c
@@ -0,0 +1,26 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#include "permassert.h"
+
+#include "errors.h"
+#include "logger.h"
+
+/**
+ * vdo_assertion_failed() - Log a failed assertion along with a backtrace.
+ * @expression_string: The text of the expression which was asserted.
+ * @file_name: The name of the source file containing the assertion.
+ * @line_number: The line of the source file containing the assertion.
+ * @format: A printf-style format for the assertion's message, followed by its arguments.
+ *
+ * Return: UDS_ASSERTION_FAILED, always.
+ */
+int vdo_assertion_failed(const char *expression_string, const char *file_name,
+			 int line_number, const char *format, ...)
+{
+	va_list message_args;
+
+	va_start(message_args, format);
+	vdo_log_embedded_message(VDO_LOG_ERR, VDO_LOGGING_MODULE_NAME, "assertion \"",
+				 format, message_args, "\" (%s) failed at %s:%d",
+				 expression_string, file_name, line_number);
+	vdo_log_backtrace(VDO_LOG_ERR);
+	va_end(message_args);
+
+	return UDS_ASSERTION_FAILED;
+}
diff --git a/drivers/md/dm-vdo/permassert.h b/drivers/md/dm-vdo/permassert.h
new file mode 100644
index 000000000000..c34f2ba650e1
--- /dev/null
+++ b/drivers/md/dm-vdo/permassert.h
@@ -0,0 +1,45 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#ifndef PERMASSERT_H
+#define PERMASSERT_H
+
+#include <linux/compiler.h>
+
+#include "errors.h"
+
+/* Utilities for asserting that certain conditions are met */
+
+/* Turn the tokens of X into a string literal. X is not macro-expanded before being stringified. */
+#define STRINGIFY(X) #X
+
+/*
+ * A hack to apply the "warn if unused" attribute to an integral expression.
+ *
+ * Since GCC doesn't propagate the warn_unused_result attribute to conditional expressions
+ * incorporating calls to functions with that attribute, this function can be used to wrap such an
+ * expression. With optimization enabled, this function contributes no additional instructions, but
+ * the warn_unused_result attribute still applies to the code calling it.
+ *
+ * Returns the value it was given, unchanged.
+ */
+static inline int __must_check vdo_must_use(int value)
+{
+ return value;
+}
+
+/*
+ * Assert that an expression is true and return an error if it is not. The result is wrapped in
+ * vdo_must_use(), so callers are expected to check it.
+ */
+#define VDO_ASSERT(expr, ...) vdo_must_use(__VDO_ASSERT(expr, __VA_ARGS__))
+
+/* Log a message if the expression is not true. The result may be ignored. */
+#define VDO_ASSERT_LOG_ONLY(expr, ...) __VDO_ASSERT(expr, __VA_ARGS__)
+
+/*
+ * Evaluates to VDO_SUCCESS when expr is true; otherwise logs the failure and evaluates to the
+ * value returned by vdo_assertion_failed(). The arguments after expr are a printf-style format and
+ * its arguments.
+ */
+#define __VDO_ASSERT(expr, ...) \
+ (likely(expr) ? VDO_SUCCESS \
+ : vdo_assertion_failed(STRINGIFY(expr), __FILE__, __LINE__, __VA_ARGS__))
+
+/* Log an assertion failure message. */
+int vdo_assertion_failed(const char *expression_string, const char *file_name,
+ int line_number, const char *format, ...)
+ __printf(4, 5);
+
+#endif /* PERMASSERT_H */
diff --git a/drivers/md/dm-vdo/physical-zone.c b/drivers/md/dm-vdo/physical-zone.c
new file mode 100644
index 000000000000..2fee3a7c1191
--- /dev/null
+++ b/drivers/md/dm-vdo/physical-zone.c
@@ -0,0 +1,644 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#include "physical-zone.h"
+
+#include <linux/list.h>
+
+#include "logger.h"
+#include "memory-alloc.h"
+#include "permassert.h"
+
+#include "block-map.h"
+#include "completion.h"
+#include "constants.h"
+#include "data-vio.h"
+#include "dedupe.h"
+#include "encodings.h"
+#include "flush.h"
+#include "int-map.h"
+#include "slab-depot.h"
+#include "status-codes.h"
+#include "vdo.h"
+
+/*
+ * Each user data_vio needs a PBN read lock and write lock. This is the number of locks that
+ * initialize_zone() places in each physical zone's lock pool.
+ */
+#define LOCK_POOL_CAPACITY (2 * MAXIMUM_VDO_USER_VIOS)
+
+struct pbn_lock_implementation {
+ enum pbn_lock_type type; /* The lock type this entry describes */
+ const char *name; /* A short name for the lock type */
+ const char *release_reason; /* Describes the block in the release-failure log message */
+};
+
+/*
+ * This array must have an entry for every pbn_lock_type value. Entries are indexed by the lock
+ * type itself, and a lock records its current type as a pointer to one of these entries (see
+ * has_lock_type() and set_pbn_lock_type()).
+ */
+static const struct pbn_lock_implementation LOCK_IMPLEMENTATIONS[] = {
+ [VIO_READ_LOCK] = {
+ .type = VIO_READ_LOCK,
+ .name = "read",
+ .release_reason = "candidate duplicate",
+ },
+ [VIO_WRITE_LOCK] = {
+ .type = VIO_WRITE_LOCK,
+ .name = "write",
+ .release_reason = "newly allocated",
+ },
+ [VIO_BLOCK_MAP_WRITE_LOCK] = {
+ .type = VIO_BLOCK_MAP_WRITE_LOCK,
+ .name = "block map write",
+ .release_reason = "block map write",
+ },
+};
+
+/* Report whether @lock currently points at the implementation table entry for @type. */
+static inline bool has_lock_type(const struct pbn_lock *lock, enum pbn_lock_type type)
+{
+	const struct pbn_lock_implementation *expected = &LOCK_IMPLEMENTATIONS[type];
+
+	return lock->implementation == expected;
+}
+
+/**
+ * vdo_is_pbn_read_lock() - Check whether a pbn_lock is a read lock.
+ * @lock: The lock to examine.
+ *
+ * Return: true if the lock's implementation is the VIO_READ_LOCK entry.
+ */
+bool vdo_is_pbn_read_lock(const struct pbn_lock *lock)
+{
+	return lock->implementation == &LOCK_IMPLEMENTATIONS[VIO_READ_LOCK];
+}
+
+/* Make @lock a lock of the given @type by pointing it at the matching table entry. */
+static inline void set_pbn_lock_type(struct pbn_lock *lock, enum pbn_lock_type type)
+{
+	const struct pbn_lock_implementation *implementation = &LOCK_IMPLEMENTATIONS[type];
+
+	lock->implementation = implementation;
+}
+
+/**
+ * vdo_downgrade_pbn_write_lock() - Downgrade a PBN write lock to a PBN read lock.
+ * @lock: The PBN write lock to downgrade.
+ * @compressed_write: Whether the lock was held for a compressed write.
+ *
+ * The downgrade is done in place: the holder count is not modified (exactly one holder is
+ * expected), and only the increment limit and the lock type change. Violated expectations are
+ * logged but do not prevent the downgrade.
+ */
+void vdo_downgrade_pbn_write_lock(struct pbn_lock *lock, bool compressed_write)
+{
+	VDO_ASSERT_LOG_ONLY(!vdo_is_pbn_read_lock(lock),
+			    "PBN lock must not already have been downgraded");
+	VDO_ASSERT_LOG_ONLY(!has_lock_type(lock, VIO_BLOCK_MAP_WRITE_LOCK),
+			    "must not downgrade block map write locks");
+	VDO_ASSERT_LOG_ONLY(lock->holder_count == 1,
+			    "PBN write lock should have one holder but has %u",
+			    lock->holder_count);
+
+	/*
+	 * The writer keeps its hold on the lock across the downgrade. A compressed writer has not
+	 * yet journaled its own reference increment, so the whole range is still available; any
+	 * other writer has already used one of the increments.
+	 */
+	if (compressed_write)
+		lock->increment_limit = MAXIMUM_REFERENCE_COUNT;
+	else
+		lock->increment_limit = MAXIMUM_REFERENCE_COUNT - 1;
+
+	set_pbn_lock_type(lock, VIO_READ_LOCK);
+}
+
+/**
+ * vdo_claim_pbn_lock_increment() - Try to claim one of the available reference count increments on
+ *                                  a read lock.
+ * @lock: The PBN read lock from which to claim an increment.
+ *
+ * Claims may be attempted from any thread. A claim is only valid until the PBN lock is released.
+ *
+ * Return: true if the claim succeeded, guaranteeing one increment can be made without overflowing
+ *         the PBN's reference count.
+ */
+bool vdo_claim_pbn_lock_increment(struct pbn_lock *lock)
+{
+	u32 claim_number;
+
+	/*
+	 * Hash locks from several hash zone threads may be deduplicating against one PBN lock on
+	 * a compressed block at the same time, so the next free reference is claimed atomically.
+	 * Provided that reaching the increment limit gets the PBN lock released in a sane
+	 * time-frame, a 32-bit claim counter cannot overflow, which is why a plain atomic add is
+	 * used rather than a compare-and-swap.
+	 */
+	claim_number = (u32) atomic_add_return(1, &lock->increments_claimed);
+	if (claim_number > lock->increment_limit)
+		return false;
+
+	return true;
+}
+
+/**
+ * vdo_assign_pbn_lock_provisional_reference() - Inform a PBN lock that it is responsible for a
+ * provisional reference.
+ * @lock: The PBN lock.
+ *
+ * The lock is expected not to be responsible for a provisional reference already. If it is, the
+ * violation is logged and the flag is set regardless.
+ */
+void vdo_assign_pbn_lock_provisional_reference(struct pbn_lock *lock)
+{
+ VDO_ASSERT_LOG_ONLY(!lock->has_provisional_reference,
+ "lock does not have a provisional reference");
+ lock->has_provisional_reference = true;
+}
+
+/**
+ * vdo_unassign_pbn_lock_provisional_reference() - Inform a PBN lock that it is no longer
+ * responsible for a provisional reference.
+ * @lock: The PBN lock.
+ *
+ * This only clears the lock's flag; it does not release the reference itself.
+ */
+void vdo_unassign_pbn_lock_provisional_reference(struct pbn_lock *lock)
+{
+ lock->has_provisional_reference = false;
+}
+
+/**
+ * release_pbn_lock_provisional_reference() - Release the provisional reference a lock is
+ *                                            responsible for, if it has one.
+ * @lock: The lock.
+ * @locked_pbn: The PBN covered by the lock.
+ * @allocator: The block allocator from which to release the reference.
+ *
+ * This is called when the lock is released. A failure to release the reference is logged, and the
+ * lock's flag is cleared in either case.
+ */
+static void release_pbn_lock_provisional_reference(struct pbn_lock *lock,
+						   physical_block_number_t locked_pbn,
+						   struct block_allocator *allocator)
+{
+	int result;
+
+	/* Nothing to do for a missing lock or one without a provisional reference. */
+	if ((lock == NULL) || !lock->has_provisional_reference)
+		return;
+
+	result = vdo_release_block_reference(allocator, locked_pbn);
+	if (result != VDO_SUCCESS) {
+		const char *reason = lock->implementation->release_reason;
+
+		vdo_log_error_strerror(result,
+				       "Failed to release reference to %s physical block %llu",
+				       reason, (unsigned long long) locked_pbn);
+	}
+
+	lock->has_provisional_reference = false;
+}
+
+/**
+ * union idle_pbn_lock - PBN lock list entries.
+ *
+ * Unused (idle) PBN locks are kept in a list. Just like in a malloc implementation, the lock
+ * structure is unused memory, so we can save a bit of space (and not pollute the lock structure
+ * proper) by using a union to overlay the lock structure with the free list.
+ *
+ * Because the two members share storage, the pool code converts between a lock and its list entry
+ * with container_of() and list_entry(), and only one of the two views is valid at any time.
+ */
+typedef union {
+ /** @entry: Only used while locks are in the pool. */
+ struct list_head entry;
+ /** @lock: Only used while locks are not in the pool. */
+ struct pbn_lock lock;
+} idle_pbn_lock;
+
+/**
+ * struct pbn_lock_pool - list of PBN locks.
+ *
+ * The lock pool is little more than the memory allocated for the locks. All of the locks are
+ * allocated together with the pool by make_pbn_lock_pool(); the pool never grows.
+ */
+struct pbn_lock_pool {
+ /** @capacity: The number of locks allocated for the pool. */
+ size_t capacity;
+ /** @borrowed: The number of locks currently borrowed from the pool (not on the idle list). */
+ size_t borrowed;
+ /** @idle_list: A list containing all idle PBN lock instances. */
+ struct list_head idle_list;
+ /** @locks: The memory for all the locks allocated by this pool; holds @capacity elements. */
+ idle_pbn_lock locks[];
+};
+
+/**
+ * return_pbn_lock_to_pool() - Return a pbn lock to its pool.
+ * @pool: The pool from which the lock was borrowed.
+ * @lock: The last reference to the lock being returned.
+ *
+ * The caller must hold the last live reference, exactly as if the memory were being freed: the
+ * lock memory is zeroed and then reused as an idle list entry.
+ */
+static void return_pbn_lock_to_pool(struct pbn_lock_pool *pool, struct pbn_lock *lock)
+{
+	idle_pbn_lock *idle = container_of(lock, idle_pbn_lock, lock);
+
+	/* Zeroing costs a little, but promptly exposes some use-after-free errors. */
+	memset(lock, 0, sizeof(*lock));
+
+	/* The same memory now serves as the list entry. */
+	INIT_LIST_HEAD(&idle->entry);
+	list_add_tail(&idle->entry, &pool->idle_list);
+
+	VDO_ASSERT_LOG_ONLY(pool->borrowed > 0, "shouldn't return more than borrowed");
+	pool->borrowed--;
+}
+
+/**
+ * make_pbn_lock_pool() - Create a new PBN lock pool and all the lock instances it can loan out.
+ * @capacity: The number of PBN locks to allocate for the pool.
+ * @pool_ptr: A pointer to receive the new pool.
+ *
+ * Return: VDO_SUCCESS or an error code; @pool_ptr is only written on success.
+ */
+static int make_pbn_lock_pool(size_t capacity, struct pbn_lock_pool **pool_ptr)
+{
+	struct pbn_lock_pool *pool;
+	size_t index;
+	int result;
+
+	result = vdo_allocate_extended(struct pbn_lock_pool, capacity, idle_pbn_lock,
+				       __func__, &pool);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	INIT_LIST_HEAD(&pool->idle_list);
+	pool->capacity = capacity;
+
+	/*
+	 * Treat every lock as borrowed to begin with, then "return" each one so that the idle
+	 * list is populated and the borrowed count ends up at zero.
+	 */
+	pool->borrowed = capacity;
+	for (index = 0; index < capacity; index++) {
+		idle_pbn_lock *slot = &pool->locks[index];
+
+		return_pbn_lock_to_pool(pool, &slot->lock);
+	}
+
+	*pool_ptr = pool;
+	return VDO_SUCCESS;
+}
+
+/**
+ * free_pbn_lock_pool() - Free a PBN lock pool.
+ * @pool: The lock pool to free; NULL is allowed and does nothing.
+ *
+ * The locks live inside the pool's allocation and are freed with it, so the caller must ensure
+ * that every lock has been returned first. Outstanding loans are logged, and the pool is freed
+ * anyway.
+ */
+static void free_pbn_lock_pool(struct pbn_lock_pool *pool)
+{
+	if (pool != NULL) {
+		VDO_ASSERT_LOG_ONLY(pool->borrowed == 0,
+				    "All PBN locks must be returned to the pool before it is freed, but %zu locks are still on loan",
+				    pool->borrowed);
+		vdo_free(pool);
+	}
+}
+
+/**
+ * borrow_pbn_lock_from_pool() - Borrow a PBN lock from the pool and initialize it with the
+ *                               provided type.
+ * @pool: The pool from which to borrow.
+ * @type: The type with which to initialize the lock.
+ * @lock_ptr: A pointer to receive the borrowed lock.
+ *
+ * Pools do not grow on demand or allocate memory, so this will fail if the pool is empty. Borrowed
+ * locks are still associated with this pool and must be returned to only this pool.
+ *
+ * The borrowed count is only incremented once a lock has actually been taken off the idle list, so
+ * a failed consistency check leaves the pool's accounting unchanged.
+ *
+ * Return: VDO_SUCCESS, VDO_LOCK_ERROR if the pool is empty, or the assertion error if the idle
+ *         list is unexpectedly empty. @lock_ptr is only written on success.
+ */
+static int __must_check borrow_pbn_lock_from_pool(struct pbn_lock_pool *pool,
+						  enum pbn_lock_type type,
+						  struct pbn_lock **lock_ptr)
+{
+	int result;
+	idle_pbn_lock *idle;
+
+	if (pool->borrowed >= pool->capacity)
+		return vdo_log_error_strerror(VDO_LOCK_ERROR,
+					      "no free PBN locks left to borrow");
+
+	result = VDO_ASSERT(!list_empty(&pool->idle_list),
+			    "idle list should not be empty if pool not at capacity");
+	if (result != VDO_SUCCESS)
+		return result;
+
+	/* Take the most recently returned lock from the tail of the idle list. */
+	idle = list_last_entry(&pool->idle_list, idle_pbn_lock, entry);
+	list_del(&idle->entry);
+	pool->borrowed += 1;
+
+	/*
+	 * The list entry overlays the start of the lock, so wipe it before the memory is used as
+	 * a lock again. The rest of the lock was zeroed when it was returned to the pool.
+	 */
+	memset(&idle->entry, 0, sizeof(idle->entry));
+	idle->lock.holder_count = 0;
+	set_pbn_lock_type(&idle->lock, type);
+
+	*lock_ptr = &idle->lock;
+	return VDO_SUCCESS;
+}
+
+/**
+ * initialize_zone() - Initialize a physical zone.
+ * @vdo: The vdo to which the zone will belong.
+ * @zones: The physical_zones to which the zone being initialized belongs
+ *
+ * The zone to initialize is the one at index zones->zone_count, i.e. the first zone which has not
+ * yet been initialized. On failure, everything this function allocated is freed and the zone's
+ * pointers to it are cleared, so the zone holds no dangling pointers.
+ *
+ * Return: VDO_SUCCESS or an error code.
+ */
+static int initialize_zone(struct vdo *vdo, struct physical_zones *zones)
+{
+	int result;
+	zone_count_t zone_number = zones->zone_count;
+	struct physical_zone *zone = &zones->zones[zone_number];
+
+	result = vdo_int_map_create(VDO_LOCK_MAP_CAPACITY, &zone->pbn_operations);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	result = make_pbn_lock_pool(LOCK_POOL_CAPACITY, &zone->lock_pool);
+	if (result != VDO_SUCCESS)
+		goto free_map;
+
+	zone->zone_number = zone_number;
+	zone->thread_id = vdo->thread_config.physical_threads[zone_number];
+	zone->allocator = &vdo->depot->allocators[zone_number];
+	/* The zones are linked into a ring; the last zone points back to the first. */
+	zone->next = &zones->zones[(zone_number + 1) % vdo->thread_config.physical_zone_count];
+
+	result = vdo_make_default_thread(vdo, zone->thread_id);
+	if (result != VDO_SUCCESS)
+		goto free_pool;
+
+	return VDO_SUCCESS;
+
+free_pool:
+	free_pbn_lock_pool(vdo_forget(zone->lock_pool));
+free_map:
+	vdo_int_map_free(vdo_forget(zone->pbn_operations));
+	return result;
+}
+
+/**
+ * vdo_make_physical_zones() - Make the physical zones for a vdo.
+ * @vdo: The vdo being constructed
+ * @zones_ptr: A pointer to hold the zones
+ *
+ * If the vdo is configured with no physical zones, this succeeds without touching @zones_ptr.
+ *
+ * Return: VDO_SUCCESS or an error code.
+ */
+int vdo_make_physical_zones(struct vdo *vdo, struct physical_zones **zones_ptr)
+{
+	zone_count_t zone_count = vdo->thread_config.physical_zone_count;
+	struct physical_zones *zones;
+	int result;
+
+	if (zone_count == 0)
+		return VDO_SUCCESS;
+
+	result = vdo_allocate_extended(struct physical_zones, zone_count,
+				       struct physical_zone, __func__, &zones);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	/*
+	 * zones->zone_count doubles as the index of the zone being initialized, so if a zone
+	 * fails, only the zones that were fully initialized get torn down.
+	 */
+	zones->zone_count = 0;
+	while (zones->zone_count < zone_count) {
+		result = initialize_zone(vdo, zones);
+		if (result != VDO_SUCCESS) {
+			vdo_free_physical_zones(zones);
+			return result;
+		}
+
+		zones->zone_count++;
+	}
+
+	*zones_ptr = zones;
+	return VDO_SUCCESS;
+}
+
+/**
+ * vdo_free_physical_zones() - Destroy the physical zones.
+ * @zones: The zones to free; NULL is allowed and does nothing.
+ *
+ * Frees the lock pool and the lock map of each of the first zones->zone_count zones, then the
+ * zones structure itself.
+ */
+void vdo_free_physical_zones(struct physical_zones *zones)
+{
+	zone_count_t i;
+
+	if (zones == NULL)
+		return;
+
+	for (i = 0; i < zones->zone_count; i++) {
+		free_pbn_lock_pool(vdo_forget(zones->zones[i].lock_pool));
+		vdo_int_map_free(vdo_forget(zones->zones[i].pbn_operations));
+	}
+
+	vdo_free(zones);
+}
+
+/**
+ * vdo_get_physical_zone_pbn_lock() - Get the lock on a PBN if one exists.
+ * @zone: The physical zone responsible for the PBN; may be NULL.
+ * @pbn: The physical block number whose lock is desired.
+ *
+ * Return: The lock, or NULL if @zone is NULL or the PBN is not locked.
+ */
+struct pbn_lock *vdo_get_physical_zone_pbn_lock(struct physical_zone *zone,
+						physical_block_number_t pbn)
+{
+	if (zone == NULL)
+		return NULL;
+
+	return vdo_int_map_get(zone->pbn_operations, pbn);
+}
+
+/**
+ * vdo_attempt_physical_zone_pbn_lock() - Attempt to lock a physical block in the zone responsible
+ *                                        for it.
+ * @zone: The physical zone responsible for the PBN.
+ * @pbn: The physical block number to lock.
+ * @type: The type with which to initialize a new lock.
+ * @lock_ptr: A pointer to receive the lock, existing or new.
+ *
+ * If the PBN is already locked, the existing lock is returned; its holder count is expected to be
+ * non-zero. Otherwise a new lock is borrowed from the pool, initialized, and returned with a
+ * holder count of zero, which the caller is responsible for updating promptly.
+ *
+ * Return: VDO_SUCCESS or an error.
+ */
+int vdo_attempt_physical_zone_pbn_lock(struct physical_zone *zone,
+				       physical_block_number_t pbn,
+				       enum pbn_lock_type type,
+				       struct pbn_lock **lock_ptr)
+{
+	struct pbn_lock *borrowed = NULL;
+	struct pbn_lock *lock;
+	int result;
+
+	/*
+	 * Prepare a lock up front so that the common, uncontended case needs only a single
+	 * int_map access.
+	 */
+	result = borrow_pbn_lock_from_pool(zone->lock_pool, type, &borrowed);
+	if (result != VDO_SUCCESS) {
+		VDO_ASSERT_LOG_ONLY(false, "must always be able to borrow a PBN lock");
+		return result;
+	}
+
+	result = vdo_int_map_put(zone->pbn_operations, pbn, borrowed, false,
+				 (void **) &lock);
+	if (result != VDO_SUCCESS) {
+		return_pbn_lock_to_pool(zone->lock_pool, borrowed);
+		return result;
+	}
+
+	if (lock == NULL) {
+		/* Nothing was mapped for this PBN before, so the borrowed lock is now the lock. */
+		*lock_ptr = borrowed;
+		return VDO_SUCCESS;
+	}
+
+	/* The PBN is already locked, so the borrowed lock is not needed. */
+	return_pbn_lock_to_pool(zone->lock_pool, vdo_forget(borrowed));
+	result = VDO_ASSERT(lock->holder_count > 0, "physical block %llu lock held",
+			    (unsigned long long) pbn);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	*lock_ptr = lock;
+	return VDO_SUCCESS;
+}
+
+/**
+ * allocate_and_lock_block() - Attempt to allocate a block from this zone.
+ * @allocation: The struct allocation of the data_vio attempting to allocate.
+ *
+ * If a block is allocated, the recipient will also hold a lock on it, and that lock is marked as
+ * responsible for the block's provisional reference.
+ *
+ * NOTE(review): if locking fails after vdo_allocate_block() has succeeded, nothing here releases
+ * the newly allocated block -- confirm that this is handled elsewhere.
+ *
+ * Return: VDO_SUCCESS if a block was allocated, or an error code.
+ */
+static int allocate_and_lock_block(struct allocation *allocation)
+{
+	struct physical_zone *zone;
+	struct pbn_lock *lock;
+	int result;
+
+	VDO_ASSERT_LOG_ONLY(allocation->lock == NULL,
+			    "must not allocate a block while already holding a lock on one");
+
+	zone = allocation->zone;
+	result = vdo_allocate_block(zone->allocator, &allocation->pbn);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	result = vdo_attempt_physical_zone_pbn_lock(zone, allocation->pbn,
+						    allocation->write_lock_type, &lock);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	if (lock->holder_count > 0) {
+		/* A freshly allocated block should never already be locked. */
+		return vdo_log_error_strerror(VDO_LOCK_ERROR,
+					      "Newly allocated block %llu was spuriously locked (holder_count=%u)",
+					      (unsigned long long) allocation->pbn,
+					      lock->holder_count);
+	}
+
+	/* The lock is new, so claim it for this allocation. */
+	lock->holder_count++;
+	allocation->lock = lock;
+	vdo_assign_pbn_lock_provisional_reference(lock);
+	return VDO_SUCCESS;
+}
+
+/**
+ * retry_allocation() - Retry allocating a block now that we're done waiting for scrubbing.
+ * @waiter: The waiter of the data_vio that was waiting to allocate.
+ * @context: The context (unused).
+ */
+static void retry_allocation(struct vdo_waiter *waiter, void *context __always_unused)
+{
+	struct data_vio *data_vio = vdo_waiter_as_data_vio(waiter);
+	struct allocation *allocation = &data_vio->allocation;
+
+	/* A slab has been scrubbed, so start the search over beginning with the current zone. */
+	allocation->first_allocation_zone = allocation->zone->zone_number;
+	allocation->wait_for_clean_slab = false;
+	continue_data_vio(data_vio);
+}
+
+/**
+ * continue_allocating() - Continue searching for an allocation by enqueuing to wait for scrubbing
+ * or switching to the next zone.
+ * @data_vio: The data_vio attempting to get an allocation.
+ *
+ * This method should only be called from the error handler set in data_vio_allocate_data_block.
+ *
+ * The first pass moves from zone to zone without waiting. Once every zone has been tried, a second
+ * pass enqueues the data_vio as a clean slab waiter in each zone in turn; a VDO_NO_SPACE result
+ * from the enqueue moves the search on to the next zone until every zone has been tried again.
+ *
+ * Return: true if the data_vio was enqueued to wait for scrubbing or was relaunched in the next
+ * zone; false if the search has failed, in which case the error has been set on the completion.
+ */
+static bool continue_allocating(struct data_vio *data_vio)
+{
+ struct allocation *allocation = &data_vio->allocation;
+ struct physical_zone *zone = allocation->zone;
+ struct vdo_completion *completion = &data_vio->vio.completion;
+ int result = VDO_SUCCESS;
+ bool was_waiting = allocation->wait_for_clean_slab;
+ bool tried_all = (allocation->first_allocation_zone == zone->next->zone_number);
+
+ vdo_reset_completion(completion);
+
+ if (tried_all && !was_waiting) {
+ /*
+ * We've already looked in all the zones, and found nothing. So go through the
+ * zones again, and wait for each to scrub before trying to allocate.
+ */
+ allocation->wait_for_clean_slab = true;
+ allocation->first_allocation_zone = zone->zone_number;
+ }
+
+ if (allocation->wait_for_clean_slab) {
+ data_vio->waiter.callback = retry_allocation;
+ result = vdo_enqueue_clean_slab_waiter(zone->allocator,
+ &data_vio->waiter);
+ if (result == VDO_SUCCESS) {
+ /* We've enqueued to wait for a slab to be scrubbed. */
+ return true;
+ }
+
+ if ((result != VDO_NO_SPACE) || (was_waiting && tried_all)) {
+ vdo_set_completion_result(completion, result); /* Give up and record the error. */
+ return false;
+ }
+ }
+
+ allocation->zone = zone->next; /* Move on to the next zone in the ring. */
+ completion->callback_thread_id = allocation->zone->thread_id;
+ vdo_launch_completion(completion);
+ return true;
+}
+
+/**
+ * vdo_allocate_block_in_zone() - Attempt to allocate a block in the current physical zone, and if
+ *                                that fails try the next if possible.
+ * @data_vio: The data_vio needing an allocation.
+ *
+ * Return: true if a block was allocated; otherwise false, in which case the data_vio has been
+ *         dispatched and the caller must not touch it.
+ */
+bool vdo_allocate_block_in_zone(struct data_vio *data_vio)
+{
+	int result = allocate_and_lock_block(&data_vio->allocation);
+
+	if (result == VDO_SUCCESS)
+		return true;
+
+	/* Only running out of space is worth retrying, either elsewhere or after scrubbing. */
+	if ((result == VDO_NO_SPACE) && continue_allocating(data_vio))
+		return false;
+
+	continue_data_vio_with_error(data_vio, result);
+	return false;
+}
+
+/**
+ * vdo_release_physical_zone_pbn_lock() - Release a physical block lock if it is held and return it
+ *                                        to the lock pool.
+ * @zone: The physical zone in which the lock was obtained.
+ * @locked_pbn: The physical block number to unlock.
+ * @lock: The lock being released; NULL is allowed and does nothing.
+ *
+ * The caller's reference to the lock must not be used afterwards, as if the memory were being
+ * freed: once the last holder releases the lock, its memory is zeroed and returned to the pool.
+ */
+void vdo_release_physical_zone_pbn_lock(struct physical_zone *zone,
+					physical_block_number_t locked_pbn,
+					struct pbn_lock *lock)
+{
+	struct pbn_lock *holder;
+
+	if (lock == NULL)
+		return;
+
+	VDO_ASSERT_LOG_ONLY(lock->holder_count > 0,
+			    "should not be releasing a lock that is not held");
+
+	lock->holder_count--;
+	if (lock->holder_count > 0) {
+		/* Other holders still share the lock, so it must stay in place. */
+		return;
+	}
+
+	/* That was the last holder: unmap the lock, drop its reference, and recycle it. */
+	holder = vdo_int_map_remove(zone->pbn_operations, locked_pbn);
+	VDO_ASSERT_LOG_ONLY((lock == holder), "physical block lock mismatch for block %llu",
+			    (unsigned long long) locked_pbn);
+
+	release_pbn_lock_provisional_reference(lock, locked_pbn, zone->allocator);
+	return_pbn_lock_to_pool(zone->lock_pool, lock);
+}
+
+/**
+ * vdo_dump_physical_zone() - Dump information about a physical zone to the log for debugging.
+ * @zone: The zone to dump.
+ *
+ * The only thing dumped is the zone's block allocator.
+ */
+void vdo_dump_physical_zone(const struct physical_zone *zone)
+{
+ vdo_dump_block_allocator(zone->allocator);
+}
diff --git a/drivers/md/dm-vdo/physical-zone.h b/drivers/md/dm-vdo/physical-zone.h
new file mode 100644
index 000000000000..47d874fd5a0b
--- /dev/null
+++ b/drivers/md/dm-vdo/physical-zone.h
@@ -0,0 +1,115 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#ifndef VDO_PHYSICAL_ZONE_H
+#define VDO_PHYSICAL_ZONE_H
+
+#include <linux/atomic.h>
+
+#include "types.h"
+
+/*
+ * The type of a PBN lock. These values are used as indexes into the LOCK_IMPLEMENTATIONS array in
+ * physical-zone.c, which must have an entry for each of them.
+ */
+enum pbn_lock_type {
+ VIO_READ_LOCK,
+ VIO_WRITE_LOCK,
+ VIO_BLOCK_MAP_WRITE_LOCK,
+};
+
+struct pbn_lock_implementation;
+
+/*
+ * A PBN lock. Instances come from a per-zone pool and are zeroed whenever they are returned to
+ * it.
+ */
+struct pbn_lock {
+ /* The implementation of the lock; this pointer also identifies the lock's current type */
+ const struct pbn_lock_implementation *implementation;
+
+ /* The number of VIOs holding or sharing this lock */
+ data_vio_count_t holder_count;
+ /*
+ * The number of compressed block writers holding a share of this lock while they are
+ * acquiring a reference to the PBN.
+ */
+ u8 fragment_locks;
+
+ /*
+ * Whether the locked PBN has been provisionally referenced on behalf of the lock holder.
+ * If this is still set when the lock is released, the reference is released as well.
+ */
+ bool has_provisional_reference;
+
+ /*
+ * For read locks, the number of references that were known to be available on the locked
+ * block at the time the lock was acquired. This is also set when a write lock is
+ * downgraded to a read lock.
+ */
+ u8 increment_limit;
+
+ /*
+ * For read locks, the number of data_vios that have tried to claim one of the available
+ * increments during the lifetime of the lock. Each claim will first increment this
+ * counter, so it can exceed the increment limit.
+ */
+ atomic_t increments_claimed;
+};
+
+struct physical_zone {
+ /* Which physical zone this is */
+ zone_count_t zone_number;
+ /* The thread ID for this zone */
+ thread_id_t thread_id;
+ /* In progress operations keyed by PBN; the values are the PBN locks */
+ struct int_map *pbn_operations;
+ /* Pool of unused pbn_lock instances */
+ struct pbn_lock_pool *lock_pool;
+ /* The block allocator for this zone */
+ struct block_allocator *allocator;
+ /* The next zone from which to attempt an allocation; the zones are linked in a ring */
+ struct physical_zone *next;
+};
+
+struct physical_zones {
+ /* The number of zones; while the zones are being made, only this many are initialized */
+ zone_count_t zone_count;
+ /* The physical zones themselves */
+ struct physical_zone zones[];
+};
+
+/* Queries and transitions of a PBN lock's type and reference increments. */
+bool __must_check vdo_is_pbn_read_lock(const struct pbn_lock *lock);
+void vdo_downgrade_pbn_write_lock(struct pbn_lock *lock, bool compressed_write);
+bool __must_check vdo_claim_pbn_lock_increment(struct pbn_lock *lock);
+
+/**
+ * vdo_pbn_lock_has_provisional_reference() - Check whether a PBN lock has a provisional reference.
+ * @lock: The PBN lock; may be NULL.
+ *
+ * Return: true if @lock is not NULL and is responsible for a provisional reference.
+ */
+static inline bool vdo_pbn_lock_has_provisional_reference(struct pbn_lock *lock)
+{
+	if (lock == NULL)
+		return false;
+
+	return lock->has_provisional_reference;
+}
+
+/* Set or clear a lock's responsibility for a provisional reference. */
+void vdo_assign_pbn_lock_provisional_reference(struct pbn_lock *lock);
+void vdo_unassign_pbn_lock_provisional_reference(struct pbn_lock *lock);
+
+/* Construction and destruction of the set of physical zones. */
+int __must_check vdo_make_physical_zones(struct vdo *vdo,
+					 struct physical_zones **zones_ptr);
+
+void vdo_free_physical_zones(struct physical_zones *zones);
+
+/* Looking up, acquiring, and releasing the lock on a physical block. */
+struct pbn_lock * __must_check vdo_get_physical_zone_pbn_lock(struct physical_zone *zone,
+							       physical_block_number_t pbn);
+
+int __must_check vdo_attempt_physical_zone_pbn_lock(struct physical_zone *zone,
+						    physical_block_number_t pbn,
+						    enum pbn_lock_type type,
+						    struct pbn_lock **lock_ptr);
+
+bool __must_check vdo_allocate_block_in_zone(struct data_vio *data_vio);
+
+void vdo_release_physical_zone_pbn_lock(struct physical_zone *zone,
+					physical_block_number_t locked_pbn,
+					struct pbn_lock *lock);
+
+void vdo_dump_physical_zone(const struct physical_zone *zone);
+
+#endif /* VDO_PHYSICAL_ZONE_H */
diff --git a/drivers/md/dm-vdo/priority-table.c b/drivers/md/dm-vdo/priority-table.c
new file mode 100644
index 000000000000..42d3d8d0e4b5
--- /dev/null
+++ b/drivers/md/dm-vdo/priority-table.c
@@ -0,0 +1,224 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#include "priority-table.h"
+
+#include <linux/log2.h>
+
+#include "errors.h"
+#include "memory-alloc.h"
+#include "permassert.h"
+
+#include "status-codes.h"
+
+/*
+ * We use a single 64-bit search vector, so the maximum priority is 63. Bit N of the vector
+ * corresponds to the bucket for priority N.
+ */
+#define MAX_PRIORITY 63
+
+/*
+ * All the entries with the same priority are queued in a circular list in a bucket for that
+ * priority. The table is essentially an array of buckets.
+ */
+struct bucket {
+ /*
+ * The head of a queue of table entries, all having the same priority
+ */
+ struct list_head queue;
+ /* The priority of all the entries in this bucket; also the bucket's bit in the search vector */
+ unsigned int priority;
+};
+
+/*
+ * A priority table is an array of buckets, indexed by priority. New entries are added to the end
+ * of the queue in the appropriate bucket. The dequeue operation finds the highest-priority
+ * non-empty bucket by searching a bit vector represented as a single 8-byte word, which is very
+ * fast with compiler and CPU support.
+ */
+struct priority_table {
+ /* The maximum priority of entries that may be stored in this table */
+ unsigned int max_priority;
+ /* A bit vector flagging all buckets that are currently non-empty */
+ u64 search_vector;
+ /* The array of all buckets, indexed by priority; it has max_priority + 1 elements */
+ struct bucket buckets[];
+};
+
+/**
+ * vdo_make_priority_table() - Allocate and initialize a new priority_table.
+ * @max_priority: The maximum priority value for table entries; may not exceed MAX_PRIORITY.
+ * @table_ptr: A pointer to hold the new table.
+ *
+ * Return: VDO_SUCCESS, UDS_INVALID_ARGUMENT if @max_priority is too large, or an allocation error
+ *         code.
+ */
+int vdo_make_priority_table(unsigned int max_priority, struct priority_table **table_ptr)
+{
+	struct priority_table *table;
+	struct bucket *bucket;
+	unsigned int priority;
+	int result;
+
+	if (max_priority > MAX_PRIORITY)
+		return UDS_INVALID_ARGUMENT;
+
+	/* Priorities run from 0 to max_priority inclusive, hence one extra bucket. */
+	result = vdo_allocate_extended(struct priority_table, max_priority + 1,
+				       struct bucket, __func__, &table);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	table->max_priority = max_priority;
+	table->search_vector = 0;
+
+	bucket = table->buckets;
+	for (priority = 0; priority <= max_priority; priority++, bucket++) {
+		INIT_LIST_HEAD(&bucket->queue);
+		bucket->priority = priority;
+	}
+
+	*table_ptr = table;
+	return VDO_SUCCESS;
+}
+
+/**
+ * vdo_free_priority_table() - Free a priority_table.
+ * @table: The table to free; NULL is allowed and does nothing.
+ *
+ * The table does not own the entries stored in it and they are not freed by this call.
+ */
+void vdo_free_priority_table(struct priority_table *table)
+{
+	if (table != NULL) {
+		/*
+		 * Detach the bucket list heads first so that any entries still queued are not
+		 * left pointing into freed memory.
+		 */
+		vdo_reset_priority_table(table);
+		vdo_free(table);
+	}
+}
+
+/**
+ * vdo_reset_priority_table() - Reset a priority table, leaving it in the same empty state as when
+ *                              newly constructed.
+ * @table: The table to reset.
+ *
+ * The table does not own the entries stored in it and they are not freed (or even unlinked from
+ * each other) by this call; only the bucket list heads are removed from their rings.
+ */
+void vdo_reset_priority_table(struct priority_table *table)
+{
+	struct bucket *bucket = table->buckets;
+	struct bucket *end = bucket + table->max_priority + 1;
+
+	table->search_vector = 0;
+	for (; bucket < end; bucket++)
+		list_del_init(&bucket->queue);
+}
+
+/**
+ * vdo_priority_table_enqueue() - Add a new entry to the priority table, appending it to the queue
+ *                                for entries with the specified priority.
+ * @table: The table in which to store the entry.
+ * @priority: The priority of the entry; expected to be no greater than the table's maximum.
+ * @entry: The list_head embedded in the entry to store in the table (the caller must have
+ *         initialized it).
+ *
+ * An out-of-range priority is only logged; it is not otherwise rejected.
+ */
+void vdo_priority_table_enqueue(struct priority_table *table, unsigned int priority,
+				struct list_head *entry)
+{
+	struct bucket *bucket;
+
+	VDO_ASSERT_LOG_ONLY((priority <= table->max_priority),
+			    "entry priority must be valid for the table");
+
+	/* Queue the entry at the tail of the bucket for its priority. */
+	bucket = &table->buckets[priority];
+	list_move_tail(entry, &bucket->queue);
+
+	/* That bucket is now certainly non-empty, so flag it in the search vector. */
+	table->search_vector |= (1ULL << priority);
+}
+
+/* Clear the search vector bit for @bucket, recording that it no longer holds any entries. */
+static inline void mark_bucket_empty(struct priority_table *table, struct bucket *bucket)
+{
+	u64 bucket_bit = (1ULL << bucket->priority);
+
+	table->search_vector &= ~bucket_bit;
+}
+
+/**
+ * vdo_priority_table_dequeue() - Find the highest-priority entry in the table, remove it from the
+ *                                table, and return it.
+ * @table: The priority table from which to remove an entry.
+ *
+ * If there are multiple entries with the same priority, the one that has been in the table with
+ * that priority the longest will be returned.
+ *
+ * Return: The dequeued entry, or NULL if the table is currently empty.
+ */
+struct list_head *vdo_priority_table_dequeue(struct priority_table *table)
+{
+	u64 occupied = table->search_vector;
+	struct bucket *bucket;
+	struct list_head *entry;
+
+	/* No bits set means that every bucket is empty. */
+	if (occupied == 0)
+		return NULL;
+
+	/* The highest set bit identifies the highest-priority bucket that has entries. */
+	bucket = &table->buckets[ilog2(occupied)];
+
+	/* Take the entry at the head of that bucket's queue. */
+	entry = bucket->queue.next;
+	list_del_init(entry);
+
+	/* If that was the last entry, the bucket's bit has to be cleared. */
+	if (list_empty(&bucket->queue))
+		mark_bucket_empty(table, bucket);
+
+	return entry;
+}
+
+/**
+ * vdo_priority_table_remove() - Remove a specified entry from its priority table.
+ * @table: The table from which to remove the entry.
+ * @entry: The entry to remove from the table.
+ *
+ * An entry which is not on any list is ignored. An entry queued in a different table cannot be
+ * detected.
+ */
+void vdo_priority_table_remove(struct priority_table *table, struct list_head *entry)
+{
+	struct list_head *neighbor;
+
+	/* An entry linked only to itself is not in any table, so there is nothing to remove. */
+	if (list_empty(entry))
+		return;
+
+	/* Keep hold of another node in the ring before unlinking the entry from it. */
+	neighbor = entry->next;
+	list_del_init(entry);
+
+	/* If anything else is left in the ring, the bucket is still non-empty. */
+	if (!list_empty(neighbor))
+		return;
+
+	/*
+	 * The only node left in the ring must be the bucket's own list head, which identifies the
+	 * bucket whose search vector bit needs clearing.
+	 */
+	mark_bucket_empty(table, list_entry(neighbor, struct bucket, queue));
+}
+
+/**
+ * vdo_is_priority_table_empty() - Return whether the priority table is empty.
+ * @table: The table to check.
+ *
+ * Return: true if no bucket is flagged as non-empty in the search vector.
+ */
+bool vdo_is_priority_table_empty(struct priority_table *table)
+{
+	u64 occupied = table->search_vector;
+
+	return (occupied == 0);
+}
diff --git a/drivers/md/dm-vdo/priority-table.h b/drivers/md/dm-vdo/priority-table.h
new file mode 100644
index 000000000000..8b060462e3e4
--- /dev/null
+++ b/drivers/md/dm-vdo/priority-table.h
@@ -0,0 +1,47 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#ifndef VDO_PRIORITY_TABLE_H
+#define VDO_PRIORITY_TABLE_H
+
+#include <linux/list.h>
+
+/*
+ * A priority_table is a simple implementation of a priority queue for entries with priorities that
+ * are small non-negative integer values. It implements the obvious priority queue operations of
+ * enqueuing an entry and dequeuing an entry with the maximum priority. It also supports removing
+ * an arbitrary entry. The priority of an entry already in the table can be changed by removing it
+ * and re-enqueuing it with a different priority. All operations have O(1) complexity.
+ *
+ * The links for the table entries must be embedded in the entries themselves. Lists are used to
+ * link entries in the table and no wrapper type is declared, so an existing list entry in an
+ * object can also be used to queue it in a priority_table, assuming the field is not used for
+ * anything else while so queued.
+ *
+ * The table is implemented as an array of queues (circular lists) indexed by priority, along with
+ * a hint for which queues are non-empty. Steven Skiena calls a very similar structure a "bounded
+ * height priority queue", but given the resemblance to a hash table, "priority table" seems both
+ * shorter and more apt, if somewhat novel.
+ */
+
+/* Only declared here; this interface deals exclusively in pointers to a priority_table. */
+struct priority_table;
+
+int __must_check vdo_make_priority_table(unsigned int max_priority,
+ struct priority_table **table_ptr);
+
+void vdo_free_priority_table(struct priority_table *table);
+
+void vdo_priority_table_enqueue(struct priority_table *table, unsigned int priority,
+ struct list_head *entry);
+
+void vdo_reset_priority_table(struct priority_table *table);
+
+/* Returns NULL when every bucket in the table is empty. */
+struct list_head * __must_check vdo_priority_table_dequeue(struct priority_table *table);
+
+/* An entry which is not on any list is ignored. */
+void vdo_priority_table_remove(struct priority_table *table, struct list_head *entry);
+
+bool __must_check vdo_is_priority_table_empty(struct priority_table *table);
+
+#endif /* VDO_PRIORITY_TABLE_H */
diff --git a/drivers/md/dm-vdo/recovery-journal.c b/drivers/md/dm-vdo/recovery-journal.c
new file mode 100644
index 000000000000..ee6321a3e523
--- /dev/null
+++ b/drivers/md/dm-vdo/recovery-journal.c
@@ -0,0 +1,1762 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#include "recovery-journal.h"
+
+#include <linux/atomic.h>
+#include <linux/bio.h>
+
+#include "logger.h"
+#include "memory-alloc.h"
+#include "permassert.h"
+
+#include "block-map.h"
+#include "completion.h"
+#include "constants.h"
+#include "data-vio.h"
+#include "encodings.h"
+#include "io-submitter.h"
+#include "slab-depot.h"
+#include "types.h"
+#include "vdo.h"
+#include "vio.h"
+#include "wait-queue.h"
+
+/* Selects the low-order byte of a recovery count; applied by compute_recovery_count_byte(). */
+static const u64 RECOVERY_COUNT_MASK = 0xff;
+
+/*
+ * The number of reserved blocks must be large enough to prevent a new recovery journal
+ * block write from overwriting a block which appears to still be a valid head block of the
+ * journal. Currently, that means reserving enough space for all 2048 data_vios.
+ *
+ * The same value is also used as the number of in-memory journal blocks allocated by
+ * vdo_decode_recovery_journal().
+ */
+#define RECOVERY_JOURNAL_RESERVED_BLOCKS \
+ ((MAXIMUM_VDO_USER_VIOS / RECOVERY_JOURNAL_ENTRIES_PER_BLOCK) + 2)
+
+/**
+ * DOC: Lock Counters.
+ *
+ * A lock_counter is intended to keep all of the locks for the blocks in the recovery journal. The
+ * per-zone counters are all kept in a single array which is arranged by zone (i.e. zone 0's lock 0
+ * is at index 0, zone 0's lock 1 is at index 1, and zone 1's lock 0 is at index 'locks'). This
+ * arrangement is intended to minimize cache-line contention for counters from different zones.
+ *
+ * The locks are implemented as a single object instead of as a lock counter per lock both to
+ * afford this opportunity to reduce cache line contention and also to eliminate the need to have a
+ * completion per lock.
+ *
+ * Lock sets are laid out with the set for recovery journal first, followed by the logical zones,
+ * and then the physical zones.
+ */
+
+/* The notification states of a lock counter, kept in the counter's atomic state field. */
+enum lock_counter_state {
+ /* No notification is in flight; a release may claim the right to launch one. */
+ LOCK_COUNTER_STATE_NOT_NOTIFYING,
+ /* The lock counter completion has been launched and has not yet been acknowledged. */
+ LOCK_COUNTER_STATE_NOTIFYING,
+ /* Set by suspend_lock_counter(); releases will not launch a notification. */
+ LOCK_COUNTER_STATE_SUSPENDED,
+};
+
+/**
+ * get_zone_count_ptr() - Locate the per-lock zone count for one zone type.
+ * @journal: The recovery journal.
+ * @lock_number: The lock to get.
+ * @zone_type: The zone type whose count is desired.
+ *
+ * Any zone type other than VDO_ZONE_TYPE_LOGICAL selects the physical zone counts.
+ *
+ * Return: A pointer to the zone count for the given lock and zone type.
+ */
+static inline atomic_t *get_zone_count_ptr(struct recovery_journal *journal,
+					   block_count_t lock_number,
+					   enum vdo_zone_type zone_type)
+{
+	struct lock_counter *counter = &journal->lock_counter;
+
+	if (zone_type == VDO_ZONE_TYPE_LOGICAL)
+		return &counter->logical_zone_counts[lock_number];
+
+	return &counter->physical_zone_counts[lock_number];
+}
+
+/**
+ * get_counter() - Locate the counter for one lock in one zone.
+ * @journal: The recovery journal.
+ * @lock_number: The lock to get.
+ * @zone_type: The zone type whose count is desired.
+ * @zone_id: The zone index whose count is desired.
+ *
+ * Each counter array is laid out zone by zone, so the slot for a lock is found by skipping
+ * 'locks' entries for every preceding zone. Any zone type other than journal or logical selects
+ * the physical counters.
+ *
+ * Return: The counter for the given lock and zone.
+ */
+static inline u16 *get_counter(struct recovery_journal *journal,
+			       block_count_t lock_number, enum vdo_zone_type zone_type,
+			       zone_count_t zone_id)
+{
+	struct lock_counter *lock_counter = &journal->lock_counter;
+	block_count_t slot = (lock_counter->locks * zone_id) + lock_number;
+
+	switch (zone_type) {
+	case VDO_ZONE_TYPE_JOURNAL:
+		return &lock_counter->journal_counters[slot];
+	case VDO_ZONE_TYPE_LOGICAL:
+		return &lock_counter->logical_counters[slot];
+	default:
+		return &lock_counter->physical_counters[slot];
+	}
+}
+
+/**
+ * get_decrement_counter() - Locate the journal zone's decrement count for a given lock.
+ * @journal: The recovery journal.
+ * @lock_number: The lock whose decrement count is desired.
+ *
+ * Return: A pointer to the atomic decrement count for the lock.
+ */
+static atomic_t *get_decrement_counter(struct recovery_journal *journal,
+				       block_count_t lock_number)
+{
+	struct lock_counter *counter = &journal->lock_counter;
+
+	return &counter->journal_decrement_counts[lock_number];
+}
+
+/**
+ * is_journal_zone_locked() - Check whether the journal zone is locked for a given lock.
+ * @journal: The recovery journal.
+ * @lock_number: The lock to check.
+ *
+ * The journal zone's hold on a lock is represented by two values: the journal zone counter and a
+ * separate atomic count of decrements. The lock is held for as long as the two differ.
+ *
+ * Return: true if the journal zone is locked.
+ */
+static bool is_journal_zone_locked(struct recovery_journal *journal,
+ block_count_t lock_number)
+{
+ /* The journal zone always uses zone index 0 of the journal counters. */
+ u16 journal_value = *get_counter(journal, lock_number, VDO_ZONE_TYPE_JOURNAL, 0);
+ u32 decrements = atomic_read(get_decrement_counter(journal, lock_number));
+
+ /* Pairs with barrier in vdo_release_journal_entry_lock() */
+ smp_rmb();
+ VDO_ASSERT_LOG_ONLY((decrements <= journal_value),
+ "journal zone lock counter must not underflow");
+ /* Equal values mean every journal zone reference has been matched by a decrement. */
+ return (journal_value != decrements);
+}
+
+/**
+ * vdo_release_recovery_journal_block_reference() - Release a reference to a recovery journal
+ * block.
+ * @journal: The recovery journal.
+ * @sequence_number: The journal sequence number of the referenced block.
+ * @zone_type: The type of the zone making the adjustment.
+ * @zone_id: The ID of the zone making the adjustment.
+ *
+ * If this is the last reference for a given zone type, an attempt will be made to reap the
+ * journal.
+ *
+ * A sequence number of 0 is ignored.
+ */
+void vdo_release_recovery_journal_block_reference(struct recovery_journal *journal,
+ sequence_number_t sequence_number,
+ enum vdo_zone_type zone_type,
+ zone_count_t zone_id)
+{
+ u16 *current_value;
+ block_count_t lock_number;
+ int prior_state;
+
+ /* There is nothing to release for sequence number 0. */
+ if (sequence_number == 0)
+ return;
+
+ lock_number = vdo_get_recovery_journal_block_number(journal, sequence_number);
+ current_value = get_counter(journal, lock_number, zone_type, zone_id);
+
+ VDO_ASSERT_LOG_ONLY((*current_value >= 1),
+ "decrement of lock counter must not underflow");
+ *current_value -= 1;
+
+ if (zone_type == VDO_ZONE_TYPE_JOURNAL) {
+ /* The journal zone only notifies once it no longer holds the lock at all. */
+ if (is_journal_zone_locked(journal, lock_number))
+ return;
+ } else {
+ atomic_t *zone_count;
+
+ /* This zone still holds other references on the lock. */
+ if (*current_value != 0)
+ return;
+
+ zone_count = get_zone_count_ptr(journal, lock_number, zone_type);
+
+ /* Other zones of this type still hold the lock; only the last one notifies. */
+ if (atomic_add_return(-1, zone_count) > 0)
+ return;
+ }
+
+ /*
+ * Extra barriers because this was originally developed using a CAS operation that
+ * implicitly had them.
+ */
+ smp_mb__before_atomic();
+ prior_state = atomic_cmpxchg(&journal->lock_counter.state,
+ LOCK_COUNTER_STATE_NOT_NOTIFYING,
+ LOCK_COUNTER_STATE_NOTIFYING);
+ /* same as before_atomic */
+ smp_mb__after_atomic();
+
+ /* A notification is already in flight, or the lock counter is suspended. */
+ if (prior_state != LOCK_COUNTER_STATE_NOT_NOTIFYING)
+ return;
+
+ /* This caller won the state transition, so it is the one to launch the notification. */
+ vdo_launch_completion(&journal->lock_counter.completion);
+}
+
+/**
+ * get_journal_block() - Get the first block on a list of journal blocks.
+ * @list: The list head to examine.
+ *
+ * Return: The block at the front of the list, or NULL if the list is empty.
+ */
+static inline struct recovery_journal_block * __must_check get_journal_block(struct list_head *list)
+{
+	struct recovery_journal_block *first;
+
+	first = list_first_entry_or_null(list, struct recovery_journal_block, list_node);
+	return first;
+}
+
+/**
+ * pop_free_list() - Detach and return the last block on the journal's free list.
+ * @journal: The journal.
+ *
+ * Return: The block or NULL if the list is empty.
+ */
+static struct recovery_journal_block * __must_check pop_free_list(struct recovery_journal *journal)
+{
+	struct list_head *free_list = &journal->free_tail_blocks;
+	struct recovery_journal_block *last;
+
+	if (list_empty(free_list))
+		return NULL;
+
+	last = list_last_entry(free_list, struct recovery_journal_block, list_node);
+	/* Leave the block's node self-linked so that it can be safely re-queued later. */
+	list_del_init(&last->list_node);
+	return last;
+}
+
+/**
+ * is_block_dirty() - Check whether a recovery block has uncommitted entries.
+ * @block: The block to check.
+ *
+ * Uncommitted entries include both those which have not been written and those which have been
+ * written but not yet acknowledged.
+ *
+ * Return: true if the block has any uncommitted entries.
+ */
+static inline bool __must_check is_block_dirty(const struct recovery_journal_block *block)
+{
+	if (block->uncommitted_entry_count > 0)
+		return true;
+
+	return false;
+}
+
+/**
+ * is_block_empty() - Check whether a journal block holds no entries at all.
+ * @block: The block to check.
+ *
+ * Return: true if the block's entry count is zero.
+ */
+static inline bool __must_check is_block_empty(const struct recovery_journal_block *block)
+{
+	if (block->entry_count == 0)
+		return true;
+
+	return false;
+}
+
+/**
+ * is_block_full() - Check whether a journal block has no room for further entries.
+ * @block: The block to check; may be NULL.
+ *
+ * A NULL block is reported as full.
+ *
+ * Return: true if the block is full.
+ */
+static inline bool __must_check is_block_full(const struct recovery_journal_block *block)
+{
+	if (block == NULL)
+		return true;
+
+	return (block->journal->entries_per_block == block->entry_count);
+}
+
+/**
+ * assert_on_journal_thread() - Assert that we are running on the journal thread.
+ * @journal: The journal.
+ * @function_name: The function doing the check (for logging).
+ *
+ * The check compares the current callback thread ID against the journal's thread ID.
+ */
+static void assert_on_journal_thread(struct recovery_journal *journal,
+ const char *function_name)
+{
+ /* NOTE(review): per the macro's name, a failure is presumably only logged - confirm. */
+ VDO_ASSERT_LOG_ONLY((vdo_get_callback_thread_id() == journal->thread_id),
+ "%s() called on journal thread", function_name);
+}
+
+/**
+ * continue_waiter() - Release a data_vio from the journal.
+ * @waiter: The waiter embedded in the data_vio to release.
+ * @context: A pointer to the int result with which to continue the data_vio.
+ *
+ * Invoked whenever a data_vio is to be released from the journal, either because its entry was
+ * committed to disk, or because there was an error. Implements waiter_callback_fn.
+ */
+static void continue_waiter(struct vdo_waiter *waiter, void *context)
+{
+	int *result_ptr = context;
+
+	continue_data_vio_with_error(vdo_waiter_as_data_vio(waiter), *result_ptr);
+}
+
+/**
+ * has_block_waiters() - Check whether any journal block has waiters.
+ * @journal: The journal in question.
+ *
+ * Only the first active tail block needs to be examined: either it has waiters, or no active
+ * tail block does.
+ *
+ * Return: true if any block has a waiter.
+ */
+static inline bool has_block_waiters(struct recovery_journal *journal)
+{
+	struct recovery_journal_block *first = get_journal_block(&journal->active_tail_blocks);
+
+	/* With no active tail blocks there is nothing to be waiting on. */
+	if (first == NULL)
+		return false;
+
+	if (vdo_waitq_has_waiters(&first->entry_waiters))
+		return true;
+
+	return vdo_waitq_has_waiters(&first->commit_waiters);
+}
+
+static void recycle_journal_blocks(struct recovery_journal *journal);
+static void recycle_journal_block(struct recovery_journal_block *block);
+static void notify_commit_waiters(struct recovery_journal *journal);
+
+/**
+ * suspend_lock_counter() - Prevent the lock counter from notifying.
+ * @counter: The counter.
+ *
+ * The state is only changed to suspended if it was not notifying; a counter which is in the
+ * middle of notifying is left as it is.
+ *
+ * Return: true if the lock counter was not notifying and hence the suspend was efficacious.
+ */
+static bool suspend_lock_counter(struct lock_counter *counter)
+{
+ int prior_state;
+
+ /*
+ * Extra barriers because this was originally developed using a CAS operation that
+ * implicitly had them.
+ */
+ smp_mb__before_atomic();
+ prior_state = atomic_cmpxchg(&counter->state, LOCK_COUNTER_STATE_NOT_NOTIFYING,
+ LOCK_COUNTER_STATE_SUSPENDED);
+ /* same as before_atomic */
+ smp_mb__after_atomic();
+
+ /* A counter which was already suspended counts as success, just like a fresh suspend. */
+ return ((prior_state == LOCK_COUNTER_STATE_SUSPENDED) ||
+ (prior_state == LOCK_COUNTER_STATE_NOT_NOTIFYING));
+}
+
+/**
+ * is_read_only() - Check whether the vdo owning a journal is in read-only mode.
+ * @journal: The journal; its flush vio supplies the vdo to query.
+ *
+ * Return: true if the vdo is read-only.
+ */
+static inline bool is_read_only(struct recovery_journal *journal)
+{
+	struct vdo *vdo = journal->flush_vio->completion.vdo;
+
+	return vdo_is_read_only(vdo);
+}
+
+/**
+ * check_for_drain_complete() - Check whether the journal has drained.
+ * @journal: The journal which may have just drained.
+ *
+ * If the vdo is read-only, all waiters are first released with VDO_READ_ONLY. The drain is then
+ * finished only if the journal is draining, is not reaping, has no waiters, and its lock counter
+ * could be suspended.
+ */
+static void check_for_drain_complete(struct recovery_journal *journal)
+{
+ int result = VDO_SUCCESS;
+
+ if (is_read_only(journal)) {
+ result = VDO_READ_ONLY;
+ /*
+ * Clean up any full active blocks which were not written due to read-only mode.
+ *
+ * FIXME: This would probably be better as a short-circuit in write_block().
+ */
+ notify_commit_waiters(journal);
+ recycle_journal_blocks(journal);
+
+ /* Release any data_vios waiting to be assigned entries. */
+ vdo_waitq_notify_all_waiters(&journal->entry_waiters,
+ continue_waiter, &result);
+ }
+
+ /*
+ * The order of these tests matters: suspend_lock_counter() may change the lock counter's
+ * state, so it is evaluated last, only once every other condition for being drained holds.
+ */
+ if (!vdo_is_state_draining(&journal->state) ||
+ journal->reaping ||
+ has_block_waiters(journal) ||
+ vdo_waitq_has_waiters(&journal->entry_waiters) ||
+ !suspend_lock_counter(&journal->lock_counter))
+ return;
+
+ if (vdo_is_state_saving(&journal->state)) {
+ if (journal->active_block != NULL) {
+ /* A dirty active block is only tolerated when the vdo is read-only. */
+ VDO_ASSERT_LOG_ONLY(((result == VDO_READ_ONLY) ||
+ !is_block_dirty(journal->active_block)),
+ "journal being saved has clean active block");
+ recycle_journal_block(journal->active_block);
+ }
+
+ VDO_ASSERT_LOG_ONLY(list_empty(&journal->active_tail_blocks),
+ "all blocks in a journal being saved must be inactive");
+ }
+
+ vdo_finish_draining_with_result(&journal->state, result);
+}
+
+/**
+ * notify_recovery_journal_of_read_only_mode() - Tell a recovery journal that the VDO has gone
+ *                                               read-only.
+ * @listener: The journal.
+ * @parent: The completion to notify in order to acknowledge the notification.
+ *
+ * Implements vdo_read_only_notification_fn.
+ */
+static void notify_recovery_journal_of_read_only_mode(void *listener,
+						      struct vdo_completion *parent)
+{
+	struct recovery_journal *journal = listener;
+
+	/* Going read-only may be the last thing an in-progress drain was waiting for. */
+	check_for_drain_complete(journal);
+	vdo_finish_completion(parent);
+}
+
+/**
+ * enter_journal_read_only_mode() - Put the journal's vdo into read-only mode.
+ * @journal: The journal which has failed.
+ * @error_code: The error result triggering this call.
+ *
+ * All attempts to add entries after this function is called will fail. All VIOs waiting for
+ * commits will be awakened with an error.
+ */
+static void enter_journal_read_only_mode(struct recovery_journal *journal,
+					 int error_code)
+{
+	struct vdo *vdo = journal->flush_vio->completion.vdo;
+
+	vdo_enter_read_only_mode(vdo, error_code);
+	check_for_drain_complete(journal);
+}
+
+/**
+ * vdo_get_recovery_journal_current_sequence_number() - Read the recovery journal's current tail
+ *                                                      sequence number.
+ * @journal: The journal in question.
+ *
+ * Exposed only so the block map can be initialized therefrom.
+ *
+ * Return: The sequence number of the tail block.
+ */
+sequence_number_t vdo_get_recovery_journal_current_sequence_number(struct recovery_journal *journal)
+{
+	sequence_number_t current_tail = journal->tail;
+
+	return current_tail;
+}
+
+/**
+ * get_recovery_journal_head() - Compute the head of the recovery journal.
+ * @journal: The journal.
+ *
+ * The journal's head is whichever of the block map head and the slab journal head has the lower
+ * sequence number.
+ *
+ * Return: the head of the journal.
+ */
+static inline sequence_number_t get_recovery_journal_head(const struct recovery_journal *journal)
+{
+	sequence_number_t head = min(journal->slab_journal_head, journal->block_map_head);
+
+	return head;
+}
+
+/**
+ * compute_recovery_count_byte() - Reduce a recovery count to the single byte stored in the
+ *                                 journal.
+ * @recovery_count: The recovery count.
+ *
+ * Return: The low-order byte of the recovery count.
+ */
+static inline u8 __must_check compute_recovery_count_byte(u64 recovery_count)
+{
+	u64 masked = recovery_count & RECOVERY_COUNT_MASK;
+
+	return (u8) masked;
+}
+
+/**
+ * check_slab_journal_commit_threshold() - Force the oldest slab journal tail blocks to commit if
+ *                                         the journal has grown past its commit threshold.
+ * @journal: The journal.
+ *
+ * The length measured is the distance from the slab journal head to the journal tail.
+ */
+static void check_slab_journal_commit_threshold(struct recovery_journal *journal)
+{
+	block_count_t backlog = journal->tail - journal->slab_journal_head;
+
+	if (backlog <= journal->slab_journal_commit_threshold)
+		return;
+
+	journal->events.slab_journal_commits_requested++;
+	vdo_commit_oldest_slab_journal_tail_blocks(journal->depot, journal->slab_journal_head);
+}
+
+static void reap_recovery_journal(struct recovery_journal *journal);
+static void assign_entries(struct recovery_journal *journal);
+
+/**
+ * finish_reaping() - Complete a reap by advancing the heads and reclaiming journal space.
+ * @journal: The journal being reaped.
+ */
+static void finish_reaping(struct recovery_journal *journal)
+{
+	sequence_number_t previous_head = get_recovery_journal_head(journal);
+	block_count_t reclaimed;
+
+	/* Promote the reap heads to be the journal's actual heads. */
+	journal->block_map_head = journal->block_map_reap_head;
+	journal->slab_journal_head = journal->slab_journal_reap_head;
+
+	/* Every block the overall head moved past can now hold new entries. */
+	reclaimed = get_recovery_journal_head(journal) - previous_head;
+	journal->available_space += reclaimed * journal->entries_per_block;
+	journal->reaping = false;
+
+	check_slab_journal_commit_threshold(journal);
+	assign_entries(journal);
+	check_for_drain_complete(journal);
+}
+
+/**
+ * complete_reaping() - Finish reaping the journal once the lower layer has been flushed.
+ * @completion: The journal's flush VIO.
+ *
+ * This is the callback registered in reap_recovery_journal().
+ */
+static void complete_reaping(struct vdo_completion *completion)
+{
+	struct recovery_journal *reaped = completion->parent;
+
+	finish_reaping(reaped);
+
+	/* More locks may have been released while the flush was out, so try to reap again. */
+	reap_recovery_journal(reaped);
+}
+
+/**
+ * handle_flush_error() - Deal with a failure of the lower-layer flush issued for reaping.
+ * @completion: The journal's flush VIO.
+ *
+ * Records the I/O error, abandons the reap, and puts the journal into read-only mode.
+ */
+static void handle_flush_error(struct vdo_completion *completion)
+{
+	struct recovery_journal *failed = completion->parent;
+
+	vio_record_metadata_io_error(as_vio(completion));
+	failed->reaping = false;
+	enter_journal_read_only_mode(failed, completion->result);
+}
+
+/**
+ * flush_endio() - Bio completion handler for the journal's flush vio.
+ * @bio: The completed bio; its private data is the flush vio.
+ *
+ * Continues the vio with complete_reaping() on the journal's thread.
+ */
+static void flush_endio(struct bio *bio)
+{
+	struct vio *flush_vio = bio->bi_private;
+	struct recovery_journal *owner = flush_vio->completion.parent;
+
+	continue_vio_after_io(flush_vio, complete_reaping, owner->thread_id);
+}
+
+/**
+ * initialize_journal_state() - Reset the journal's bookkeeping so that journaling starts at the
+ *                              current tail.
+ * @journal: The journal to be reset based on its active block.
+ *
+ * Every head and reap head is set to the tail, and the available space is set to that of a
+ * completely empty journal.
+ */
+static void initialize_journal_state(struct recovery_journal *journal)
+{
+	const sequence_number_t tail = journal->tail;
+
+	journal->append_point.sequence_number = tail;
+	journal->last_write_acknowledged = tail;
+
+	journal->block_map_head = tail;
+	journal->block_map_reap_head = tail;
+	journal->slab_journal_head = tail;
+	journal->slab_journal_reap_head = tail;
+
+	journal->block_map_head_block_number =
+		vdo_get_recovery_journal_block_number(journal, journal->block_map_head);
+	journal->slab_journal_head_block_number =
+		vdo_get_recovery_journal_block_number(journal, journal->slab_journal_head);
+
+	journal->available_space =
+		(journal->entries_per_block * vdo_get_recovery_journal_length(journal->size));
+}
+
+/**
+ * vdo_get_recovery_journal_length() - Compute how many journal blocks may hold entries.
+ * @journal_size: The size of the recovery journal in blocks.
+ *
+ * A quarter of the journal is held in reserve, capped at RECOVERY_JOURNAL_RESERVED_BLOCKS.
+ *
+ * Return: the number of recovery journal blocks usable for entries.
+ */
+block_count_t vdo_get_recovery_journal_length(block_count_t journal_size)
+{
+	const block_count_t quarter = journal_size / 4;
+	const block_count_t reserved = ((quarter > RECOVERY_JOURNAL_RESERVED_BLOCKS)
+					? RECOVERY_JOURNAL_RESERVED_BLOCKS
+					: quarter);
+
+	return (journal_size - reserved);
+}
+
+/**
+ * reap_recovery_journal_callback() - Attempt to reap the journal.
+ * @completion: The lock counter completion.
+ *
+ * Attempts to reap the journal now that all the locks on some journal block have been released.
+ * This is the callback registered with the lock counter.
+ */
+static void reap_recovery_journal_callback(struct vdo_completion *completion)
+{
+ struct recovery_journal *journal = (struct recovery_journal *) completion->parent;
+ /*
+ * The acknowledgment must be done before reaping so that there is no race between
+ * acknowledging the notification and unlocks wishing to notify.
+ */
+ smp_wmb();
+ atomic_set(&journal->lock_counter.state, LOCK_COUNTER_STATE_NOT_NOTIFYING);
+
+ if (vdo_is_state_quiescing(&journal->state)) {
+ /*
+ * Don't start reaping when the journal is trying to quiesce. Do check if this
+ * notification is the last thing the drain is waiting on.
+ */
+ check_for_drain_complete(journal);
+ return;
+ }
+
+ reap_recovery_journal(journal);
+ check_slab_journal_commit_threshold(journal);
+}
+
+/**
+ * initialize_lock_counter() - Initialize a lock counter.
+ *
+ * @journal: The recovery journal.
+ * @vdo: The vdo.
+ *
+ * Allocates one counter array per zone type (sized by journal size times the number of zones of
+ * that type) plus one atomic array per zone type (sized by journal size), then prepares the
+ * counter's completion to run reap_recovery_journal_callback() on the journal thread.
+ *
+ * On failure, arrays which were already allocated are not freed here; the caller,
+ * vdo_decode_recovery_journal(), releases them through vdo_free_recovery_journal().
+ *
+ * Return: VDO_SUCCESS or an error.
+ */
+static int __must_check initialize_lock_counter(struct recovery_journal *journal,
+ struct vdo *vdo)
+{
+ int result;
+ struct thread_config *config = &vdo->thread_config;
+ struct lock_counter *counter = &journal->lock_counter;
+
+ /* The journal zone: a single zone's worth of counters and their decrement counts. */
+ result = vdo_allocate(journal->size, u16, __func__, &counter->journal_counters);
+ if (result != VDO_SUCCESS)
+ return result;
+
+ result = vdo_allocate(journal->size, atomic_t, __func__,
+ &counter->journal_decrement_counts);
+ if (result != VDO_SUCCESS)
+ return result;
+
+ /* The logical zones: per-zone counters plus one shared zone count per lock. */
+ result = vdo_allocate(journal->size * config->logical_zone_count, u16, __func__,
+ &counter->logical_counters);
+ if (result != VDO_SUCCESS)
+ return result;
+
+ result = vdo_allocate(journal->size, atomic_t, __func__,
+ &counter->logical_zone_counts);
+ if (result != VDO_SUCCESS)
+ return result;
+
+ /* The physical zones: per-zone counters plus one shared zone count per lock. */
+ result = vdo_allocate(journal->size * config->physical_zone_count, u16, __func__,
+ &counter->physical_counters);
+ if (result != VDO_SUCCESS)
+ return result;
+
+ result = vdo_allocate(journal->size, atomic_t, __func__,
+ &counter->physical_zone_counts);
+ if (result != VDO_SUCCESS)
+ return result;
+
+ /* The same callback is registered for both the normal and the error path. */
+ vdo_initialize_completion(&counter->completion, vdo,
+ VDO_LOCK_COUNTER_COMPLETION);
+ vdo_prepare_completion(&counter->completion, reap_recovery_journal_callback,
+ reap_recovery_journal_callback, config->journal_thread,
+ journal);
+ counter->logical_zones = config->logical_zone_count;
+ counter->physical_zones = config->physical_zone_count;
+ counter->locks = journal->size;
+ return VDO_SUCCESS;
+}
+
+/**
+ * set_journal_tail() - Record a new tail sequence number for the journal.
+ * @journal: The journal whose tail is to be set.
+ * @tail: The new tail value.
+ *
+ * A tail of 1 << 48 or more puts the journal into read-only mode with VDO_JOURNAL_OVERFLOW. The
+ * tail is recorded in either case.
+ */
+static void set_journal_tail(struct recovery_journal *journal, sequence_number_t tail)
+{
+	/* VDO does not support sequence numbers above 1 << 48 in the slab journal. */
+	const bool overflowed = (tail >= (1ULL << 48));
+
+	if (overflowed)
+		enter_journal_read_only_mode(journal, VDO_JOURNAL_OVERFLOW);
+
+	journal->tail = tail;
+}
+
+/**
+ * initialize_recovery_block() - Set up one in-memory journal block and put it on the free list.
+ * @vdo: The vdo from which to construct vios.
+ * @journal: The journal to which the block will belong.
+ * @block: The block to initialize.
+ *
+ * Return: VDO_SUCCESS or an error.
+ */
+static int initialize_recovery_block(struct vdo *vdo, struct recovery_journal *journal,
+				     struct recovery_journal_block *block)
+{
+	char *buffer;
+	int result;
+
+	/*
+	 * Ensure that a block is large enough to store RECOVERY_JOURNAL_ENTRIES_PER_BLOCK entries.
+	 */
+	BUILD_BUG_ON(RECOVERY_JOURNAL_ENTRIES_PER_BLOCK >
+		     ((VDO_BLOCK_SIZE - sizeof(struct packed_journal_header)) /
+		      sizeof(struct packed_recovery_journal_entry)));
+
+	/*
+	 * The buffer is a whole block even though the entries do not fill it, because the VIO
+	 * always writes a full disk block.
+	 */
+	result = vdo_allocate(VDO_BLOCK_SIZE, char, __func__, &buffer);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	result = allocate_vio_components(vdo, VIO_TYPE_RECOVERY_JOURNAL,
+					 VIO_PRIORITY_HIGH, block, 1, buffer, &block->vio);
+	if (result == VDO_SUCCESS) {
+		block->journal = journal;
+		list_add_tail(&block->list_node, &journal->free_tail_blocks);
+		return VDO_SUCCESS;
+	}
+
+	/* The vio never took the buffer, so it must be released here. */
+	vdo_free(buffer);
+	return result;
+}
+
+/**
+ * vdo_decode_recovery_journal() - Make a recovery journal and initialize it with the state that
+ *                                 was decoded from the super block.
+ *
+ * @state: The decoded state of the journal.
+ * @nonce: The nonce of the VDO.
+ * @vdo: The VDO.
+ * @partition: The partition for the journal.
+ * @recovery_count: The VDO's number of completed recoveries.
+ * @journal_size: The number of blocks in the journal on disk.
+ * @journal_ptr: The pointer to hold the new recovery journal.
+ *
+ * The journal is constructed in the suspended state. On any failure, everything allocated here
+ * is released and @journal_ptr is left untouched.
+ *
+ * Return: A success or error code. VDO_JOURNAL_OVERFLOW is returned if the decoded journal start
+ *         is at or beyond the largest supported sequence number (1 << 48).
+ */
+int vdo_decode_recovery_journal(struct recovery_journal_state_7_0 state, nonce_t nonce,
+				struct vdo *vdo, struct partition *partition,
+				u64 recovery_count, block_count_t journal_size,
+				struct recovery_journal **journal_ptr)
+{
+	block_count_t i;
+	struct recovery_journal *journal;
+	int result;
+
+	/*
+	 * set_journal_tail() responds to an out-of-range tail by entering read-only mode, which
+	 * dereferences journal->flush_vio. That vio does not exist until later in this function,
+	 * so an out-of-range start value from the super block must be rejected up front rather
+	 * than handed to set_journal_tail(). The limit here mirrors the one in set_journal_tail().
+	 */
+	if (state.journal_start >= (1ULL << 48))
+		return VDO_JOURNAL_OVERFLOW;
+
+	result = vdo_allocate_extended(struct recovery_journal,
+				       RECOVERY_JOURNAL_RESERVED_BLOCKS,
+				       struct recovery_journal_block, __func__,
+				       &journal);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	INIT_LIST_HEAD(&journal->free_tail_blocks);
+	INIT_LIST_HEAD(&journal->active_tail_blocks);
+	vdo_waitq_init(&journal->pending_writes);
+
+	journal->thread_id = vdo->thread_config.journal_thread;
+	journal->origin = partition->offset;
+	journal->nonce = nonce;
+	journal->recovery_count = compute_recovery_count_byte(recovery_count);
+	journal->size = journal_size;
+	journal->slab_journal_commit_threshold = (journal_size * 2) / 3;
+	journal->logical_blocks_used = state.logical_blocks_used;
+	journal->block_map_data_blocks = state.block_map_data_blocks;
+	journal->entries_per_block = RECOVERY_JOURNAL_ENTRIES_PER_BLOCK;
+	set_journal_tail(journal, state.journal_start);
+	initialize_journal_state(journal);
+	/* TODO: this will have to change if we make initial resume of a VDO a real resume */
+	vdo_set_admin_state_code(&journal->state, VDO_ADMIN_STATE_SUSPENDED);
+
+	for (i = 0; i < RECOVERY_JOURNAL_RESERVED_BLOCKS; i++) {
+		struct recovery_journal_block *block = &journal->blocks[i];
+
+		result = initialize_recovery_block(vdo, journal, block);
+		if (result != VDO_SUCCESS) {
+			vdo_free_recovery_journal(journal);
+			return result;
+		}
+	}
+
+	result = initialize_lock_counter(journal, vdo);
+	if (result != VDO_SUCCESS) {
+		vdo_free_recovery_journal(journal);
+		return result;
+	}
+
+	result = create_metadata_vio(vdo, VIO_TYPE_RECOVERY_JOURNAL, VIO_PRIORITY_HIGH,
+				     journal, NULL, &journal->flush_vio);
+	if (result != VDO_SUCCESS) {
+		vdo_free_recovery_journal(journal);
+		return result;
+	}
+
+	result = vdo_register_read_only_listener(vdo, journal,
+						 notify_recovery_journal_of_read_only_mode,
+						 journal->thread_id);
+	if (result != VDO_SUCCESS) {
+		vdo_free_recovery_journal(journal);
+		return result;
+	}
+
+	result = vdo_make_default_thread(vdo, journal->thread_id);
+	if (result != VDO_SUCCESS) {
+		vdo_free_recovery_journal(journal);
+		return result;
+	}
+
+	journal->flush_vio->completion.callback_thread_id = journal->thread_id;
+	*journal_ptr = journal;
+	return VDO_SUCCESS;
+}
+
+/**
+ * vdo_free_recovery_journal() - Free a recovery journal.
+ * @journal: The recovery journal to free; may be NULL, in which case nothing is done.
+ *
+ * Releases the lock counter arrays, the flush vio, and the data buffer and vio components of
+ * every journal block before freeing the journal itself. This is also the cleanup path for a
+ * journal whose construction in vdo_decode_recovery_journal() failed part-way.
+ */
+void vdo_free_recovery_journal(struct recovery_journal *journal)
+{
+ block_count_t i;
+
+ if (journal == NULL)
+ return;
+
+ vdo_free(vdo_forget(journal->lock_counter.logical_zone_counts));
+ vdo_free(vdo_forget(journal->lock_counter.physical_zone_counts));
+ vdo_free(vdo_forget(journal->lock_counter.journal_counters));
+ vdo_free(vdo_forget(journal->lock_counter.journal_decrement_counts));
+ vdo_free(vdo_forget(journal->lock_counter.logical_counters));
+ vdo_free(vdo_forget(journal->lock_counter.physical_counters));
+ free_vio(vdo_forget(journal->flush_vio));
+
+ /*
+ * FIXME: eventually, the journal should be constructed in a quiescent state which
+ * requires opening before use.
+ */
+ if (!vdo_is_state_quiescent(&journal->state)) {
+ /* A journal which is not quiescent is expected to have no active tail blocks. */
+ VDO_ASSERT_LOG_ONLY(list_empty(&journal->active_tail_blocks),
+ "journal being freed has no active tail blocks");
+ } else if (!vdo_is_state_saved(&journal->state) &&
+ !list_empty(&journal->active_tail_blocks)) {
+ /* Quiescent but not saved: remaining active blocks are only worth a warning. */
+ vdo_log_warning("journal being freed has uncommitted entries");
+ }
+
+ /* The blocks themselves are part of the journal allocation; only their contents are freed. */
+ for (i = 0; i < RECOVERY_JOURNAL_RESERVED_BLOCKS; i++) {
+ struct recovery_journal_block *block = &journal->blocks[i];
+
+ vdo_free(vdo_forget(block->vio.data));
+ free_vio_components(&block->vio);
+ }
+
+ vdo_free(journal);
+}
+
+/**
+ * vdo_initialize_recovery_journal_post_repair() - Reinitialize the journal once a repair is done.
+ * @journal: The journal in question.
+ * @recovery_count: The number of completed recoveries.
+ * @tail: The new tail block sequence number.
+ * @logical_blocks_used: The new number of logical blocks used.
+ * @block_map_data_blocks: The new number of block map data blocks.
+ *
+ * Journaling resumes at the block following @tail.
+ */
+void vdo_initialize_recovery_journal_post_repair(struct recovery_journal *journal,
+						 u64 recovery_count,
+						 sequence_number_t tail,
+						 block_count_t logical_blocks_used,
+						 block_count_t block_map_data_blocks)
+{
+	const u8 count_byte = compute_recovery_count_byte(recovery_count);
+
+	set_journal_tail(journal, tail + 1);
+	journal->recovery_count = count_byte;
+	initialize_journal_state(journal);
+
+	journal->block_map_data_blocks = block_map_data_blocks;
+	journal->logical_blocks_used = logical_blocks_used;
+}
+
+/**
+ * vdo_get_journal_block_map_data_blocks_used() - Read the journal's count of block map pages,
+ *                                                allocated from data blocks, currently in use.
+ * @journal: The journal in question.
+ *
+ * Return: The number of block map pages allocated from slabs.
+ */
+block_count_t vdo_get_journal_block_map_data_blocks_used(struct recovery_journal *journal)
+{
+	block_count_t pages_in_use = journal->block_map_data_blocks;
+
+	return pages_in_use;
+}
+
+/**
+ * vdo_get_recovery_journal_thread_id() - Read the ID of the thread on which a journal runs.
+ * @journal: The journal to query.
+ *
+ * Return: The ID of the journal's thread.
+ */
+thread_id_t vdo_get_recovery_journal_thread_id(struct recovery_journal *journal)
+{
+	thread_id_t journal_thread = journal->thread_id;
+
+	return journal_thread;
+}
+
+/**
+ * vdo_open_recovery_journal() - Attach the journal's collaborators and make it ready for new
+ *                               entries.
+ * @journal: The journal in question.
+ * @depot: The slab depot for this VDO.
+ * @block_map: The block map for this VDO.
+ */
+void vdo_open_recovery_journal(struct recovery_journal *journal,
+			       struct slab_depot *depot, struct block_map *block_map)
+{
+	journal->block_map = block_map;
+	journal->depot = depot;
+
+	/* Only once both collaborators are attached is the journal put into normal operation. */
+	WRITE_ONCE(journal->state.current_state, VDO_ADMIN_STATE_NORMAL_OPERATION);
+}
+
+/**
+ * vdo_record_recovery_journal() - Capture the state of a recovery journal for encoding in the
+ *                                 super block.
+ * @journal: the recovery journal.
+ *
+ * Return: the state of the journal.
+ */
+struct recovery_journal_state_7_0
+vdo_record_recovery_journal(const struct recovery_journal *journal)
+{
+	/*
+	 * A saved journal starts again from its tail, since the active block is not guaranteed to
+	 * be empty. A journal which is merely suspended or has gone read-only must instead record
+	 * the first block that might have entries that need to be applied.
+	 */
+	const bool saved = vdo_is_state_saved(&journal->state);
+	struct recovery_journal_state_7_0 state = {
+		.journal_start = (saved ? journal->tail : get_recovery_journal_head(journal)),
+		.logical_blocks_used = journal->logical_blocks_used,
+		.block_map_data_blocks = journal->block_map_data_blocks,
+	};
+
+	return state;
+}
+
+/**
+ * get_block_header() - View the start of a block's data buffer as its packed journal header.
+ * @block: The recovery block.
+ *
+ * Return: The block's header.
+ */
+static inline struct packed_journal_header *
+get_block_header(const struct recovery_journal_block *block)
+{
+	struct packed_journal_header *header = (struct packed_journal_header *) block->vio.data;
+
+	return header;
+}
+
+/**
+ * set_active_sector() - Set the current sector of the current block and initialize it.
+ * @block: The block to update.
+ * @sector: A pointer to the first byte of the new sector.
+ *
+ * The sector's check byte is copied from the block's packed header and its recovery count from
+ * the block's journal; its entry count is reset to zero. The check byte is read from the header
+ * as it stands at the time of the call.
+ */
+static void set_active_sector(struct recovery_journal_block *block, void *sector)
+{
+ block->sector = sector;
+ block->sector->check_byte = get_block_header(block)->check_byte;
+ block->sector->recovery_count = block->journal->recovery_count;
+ block->sector->entry_count = 0;
+}
+
+/**
+ * advance_tail() - Advance the tail of the journal.
+ * @journal: The journal whose tail should be advanced.
+ *
+ * The block returned by pop_free_list() becomes the active block and is moved to the end of the
+ * active tail block list. It is then initialized for the current tail sequence number: its
+ * buffer is zeroed, a fresh header is packed into it, its entry counts are reset, and sector 1
+ * becomes its active sector. Finally the journal tail is incremented and the block map era is
+ * advanced to the new tail.
+ *
+ * Return: true if the tail was advanced; false if pop_free_list() returned NULL, in which case
+ *         journal->active_block has been set to NULL.
+ */
+static bool advance_tail(struct recovery_journal *journal)
+{
+ struct recovery_block_header unpacked;
+ struct packed_journal_header *header;
+ struct recovery_journal_block *block;
+
+ block = journal->active_block = pop_free_list(journal);
+ if (block == NULL)
+ return false;
+
+ list_move_tail(&block->list_node, &journal->active_tail_blocks);
+
+ unpacked = (struct recovery_block_header) {
+ .metadata_type = VDO_METADATA_RECOVERY_JOURNAL_2,
+ .block_map_data_blocks = journal->block_map_data_blocks,
+ .logical_blocks_used = journal->logical_blocks_used,
+ .nonce = journal->nonce,
+ .recovery_count = journal->recovery_count,
+ .sequence_number = journal->tail,
+ .check_byte = vdo_compute_recovery_journal_check_byte(journal,
+ journal->tail),
+ };
+
+ header = get_block_header(block);
+ memset(block->vio.data, 0x0, VDO_BLOCK_SIZE);
+ block->sequence_number = journal->tail;
+ block->entry_count = 0;
+ block->uncommitted_entry_count = 0;
+ block->block_number = vdo_get_recovery_journal_block_number(journal,
+ journal->tail);
+
+ vdo_pack_recovery_block_header(&unpacked, header);
+ set_active_sector(block, vdo_get_journal_block_sector(header, 1));
+ set_journal_tail(journal, journal->tail + 1);
+ vdo_advance_block_map_era(journal->block_map, journal->tail);
+ return true;
+}
+
+/**
+ * initialize_lock_count() - Initialize the value of the journal zone's counter for the lock of
+ * the active block.
+ * @journal: The recovery journal.
+ *
+ * The lock is selected by the active block's block number. After asserting (log only) that the
+ * journal zone's counter equals the lock's decrement counter, the journal zone's counter is set
+ * to entries_per_block + 1 and the decrement counter is reset to zero. The extra one is
+ * presumably the block's own reference, released in recycle_journal_block().
+ *
+ * Context: This must be called from the journal zone.
+ */
+static void initialize_lock_count(struct recovery_journal *journal)
+{
+ u16 *journal_value;
+ block_count_t lock_number = journal->active_block->block_number;
+ atomic_t *decrement_counter = get_decrement_counter(journal, lock_number);
+
+ journal_value = get_counter(journal, lock_number, VDO_ZONE_TYPE_JOURNAL, 0);
+ VDO_ASSERT_LOG_ONLY((*journal_value == atomic_read(decrement_counter)),
+ "count to be initialized not in use");
+ *journal_value = journal->entries_per_block + 1;
+ atomic_set(decrement_counter, 0);
+}
+
+/**
+ * prepare_to_assign_entry() - Ready the active block to receive one more entry, if possible.
+ * @journal: The journal receiving an entry.
+ *
+ * A full active block is replaced by advancing the tail. When the active block is empty, its
+ * lock count is initialized before the first entry is assigned to it.
+ *
+ * Return: true if there is space in the journal to store an entry now.
+ */
+static bool prepare_to_assign_entry(struct recovery_journal *journal)
+{
+	if (journal->available_space == 0)
+		return false;
+
+	/* A full active block must be replaced before anything more can be added. */
+	if (is_block_full(journal->active_block)) {
+		if (!advance_tail(journal))
+			return false;
+	}
+
+	if (is_block_empty(journal->active_block)) {
+		if ((journal->tail - get_recovery_journal_head(journal)) > journal->size) {
+			/* Cannot use this block since the journal is full. */
+			journal->events.disk_full++;
+			return false;
+		}
+
+		/*
+		 * Don't allow the new block to be reaped until all of its entries have been
+		 * committed to the block map and until the journal block has been fully
+		 * committed as well. Because the block map update is done only after any slab
+		 * journal entries have been made, the per-entry lock for the block map entry
+		 * serves to protect those as well.
+		 */
+		initialize_lock_count(journal);
+	}
+
+	return true;
+}
+
+static void write_blocks(struct recovery_journal *journal);
+
+/**
+ * schedule_block_write() - Put a block on the journal's queue of pending writes.
+ * @journal: The journal in question.
+ * @block: The block which is now ready to write.
+ *
+ * The block is expected to be full and must not already be queued for writing. A block which is
+ * currently committing is not queued here, since it will be queued for writing when its write
+ * finishes.
+ */
+static void schedule_block_write(struct recovery_journal *journal,
+				 struct recovery_journal_block *block)
+{
+	if (block->committing)
+		return;
+
+	/*
+	 * Queueing does not issue the write. At the end of adding entries, or on discovering
+	 * that this partial block is now full and ready to rewrite, write_blocks() is called
+	 * and writes a whole batch.
+	 */
+	vdo_waitq_enqueue_waiter(&journal->pending_writes, &block->write_waiter);
+}
+
+/**
+ * release_journal_block_reference() - Drop one journal-zone reference on a journal block.
+ * @block: The journal block from which to release a reference.
+ */
+static void release_journal_block_reference(struct recovery_journal_block *block)
+{
+	struct recovery_journal *journal = block->journal;
+
+	/* The reference is released as zone 0 of the journal zone type. */
+	vdo_release_recovery_journal_block_reference(journal, block->sequence_number,
+						     VDO_ZONE_TYPE_JOURNAL, 0);
+}
+
+/*
+ * Adjust the journal's usage counters for the entry being made on behalf of a data_vio. A block
+ * map remapping adds one block map data block and nothing else. Any other operation adds a
+ * logical block if the new mapping is not unmapped, and removes one if the old mapping was not
+ * unmapped.
+ */
+static void update_usages(struct recovery_journal *journal, struct data_vio *data_vio)
+{
+	bool now_mapped;
+	bool was_mapped;
+
+	if (data_vio->increment_updater.operation == VDO_JOURNAL_BLOCK_MAP_REMAPPING) {
+		journal->block_map_data_blocks++;
+		return;
+	}
+
+	now_mapped = (data_vio->new_mapped.state != VDO_MAPPING_STATE_UNMAPPED);
+	was_mapped = (data_vio->mapped.state != VDO_MAPPING_STATE_UNMAPPED);
+
+	if (now_mapped)
+		journal->logical_blocks_used++;
+
+	if (was_mapped)
+		journal->logical_blocks_used--;
+}
+
+/**
+ * assign_entry() - Assign an entry waiter to the active block.
+ * @waiter: The waiter of the data_vio to which an entry is being assigned.
+ * @context: The recovery journal block which will receive the entry.
+ *
+ * Records in the data_vio the journal point at which its entry will be made, updates the
+ * journal's usage counts and available space, and queues the data_vio on the block's entry
+ * waiters. If this fills the block, the block is scheduled for writing.
+ *
+ * Implements waiter_callback_fn.
+ */
+static void assign_entry(struct vdo_waiter *waiter, void *context)
+{
+ struct data_vio *data_vio = vdo_waiter_as_data_vio(waiter);
+ struct recovery_journal_block *block = context;
+ struct recovery_journal *journal = block->journal;
+
+ /* Record the point at which we will make the journal entry. */
+ data_vio->recovery_journal_point = (struct journal_point) {
+ .sequence_number = block->sequence_number,
+ .entry_count = block->entry_count,
+ };
+
+ update_usages(journal, data_vio);
+ journal->available_space--;
+
+ if (!vdo_waitq_has_waiters(&block->entry_waiters))
+ journal->events.blocks.started++;
+
+ vdo_waitq_enqueue_waiter(&block->entry_waiters, &data_vio->waiter);
+ block->entry_count++;
+ block->uncommitted_entry_count++;
+ journal->events.entries.started++;
+
+ if (is_block_full(block)) {
+ /*
+ * The block is full, so we can write it anytime henceforth. If it is already
+ * committing, we'll queue it for writing when it comes back.
+ */
+ schedule_block_write(journal, block);
+ }
+
+ /* Force out slab journal tail blocks when threshold is reached. */
+ check_slab_journal_commit_threshold(journal);
+}
+
+/*
+ * Hand out journal entries to waiting data_vios for as long as the active block can accept
+ * them, then see whether any blocks can be written.
+ */
+static void assign_entries(struct recovery_journal *journal)
+{
+	/* Protect against re-entrancy. */
+	if (journal->adding_entries)
+		return;
+
+	journal->adding_entries = true;
+	for (;;) {
+		if (!vdo_waitq_has_waiters(&journal->entry_waiters))
+			break;
+
+		if (!prepare_to_assign_entry(journal))
+			break;
+
+		vdo_waitq_notify_next_waiter(&journal->entry_waiters, assign_entry,
+					     journal->active_block);
+	}
+
+	/* Now that we've finished with entries, see if we have a batch of blocks to write. */
+	write_blocks(journal);
+	journal->adding_entries = false;
+}
+
+/**
+ * recycle_journal_block() - Return a fully committed in-memory journal block to the free list.
+ * @block: The block to be recycled.
+ *
+ * Besides moving the block to the free list, this releases the journal's references for any
+ * entries the block never received, plus the block's own reference if it held any entries.
+ */
+static void recycle_journal_block(struct recovery_journal_block *block)
+{
+	struct recovery_journal *journal = block->journal;
+	block_count_t slot;
+
+	list_move_tail(&block->list_node, &journal->free_tail_blocks);
+
+	/* Release any unused entry locks. */
+	slot = block->entry_count;
+	while (slot < journal->entries_per_block) {
+		release_journal_block_reference(block);
+		slot++;
+	}
+
+	/*
+	 * Release our own lock against reaping now that the block is completely committed, or
+	 * we're giving up because we're in read-only mode.
+	 */
+	if (block->entry_count > 0)
+		release_journal_block_reference(block);
+
+	if (journal->active_block == block)
+		journal->active_block = NULL;
+}
+
+/**
+ * continue_committed_waiter() - invoked whenever a VIO is to be released from the journal because
+ * its entry was committed to disk.
+ * @waiter: The waiter of the data_vio being released.
+ * @context: The recovery journal.
+ *
+ * Advances the journal's commit point to the data_vio's journal point, after asserting (log
+ * only) that waiters are released in journal order. If the journal is read-only, the data_vio
+ * is continued with VDO_READ_ONLY. Otherwise the data_vio is continued and/or its decrement
+ * completion is launched, depending on which of its updaters have a non-zero pbn.
+ *
+ * Implements waiter_callback_fn.
+ */
+static void continue_committed_waiter(struct vdo_waiter *waiter, void *context)
+{
+ struct data_vio *data_vio = vdo_waiter_as_data_vio(waiter);
+ struct recovery_journal *journal = context;
+ int result = (is_read_only(journal) ? VDO_READ_ONLY : VDO_SUCCESS);
+ bool has_decrement;
+
+ VDO_ASSERT_LOG_ONLY(vdo_before_journal_point(&journal->commit_point,
+ &data_vio->recovery_journal_point),
+ "DataVIOs released from recovery journal in order. Recovery journal point is (%llu, %u), but commit waiter point is (%llu, %u)",
+ (unsigned long long) journal->commit_point.sequence_number,
+ journal->commit_point.entry_count,
+ (unsigned long long) data_vio->recovery_journal_point.sequence_number,
+ data_vio->recovery_journal_point.entry_count);
+
+ journal->commit_point = data_vio->recovery_journal_point;
+ data_vio->last_async_operation = VIO_ASYNC_OP_UPDATE_REFERENCE_COUNTS;
+ if (result != VDO_SUCCESS) {
+ continue_data_vio_with_error(data_vio, result);
+ return;
+ }
+
+ /*
+ * The increment must be launched first since it must come before the
+ * decrement if they are in the same slab. If there is neither an increment
+ * nor a decrement, the data_vio is still continued.
+ */
+ has_decrement = (data_vio->decrement_updater.zpbn.pbn != VDO_ZERO_BLOCK);
+ if ((data_vio->increment_updater.zpbn.pbn != VDO_ZERO_BLOCK) || !has_decrement)
+ continue_data_vio(data_vio);
+
+ if (has_decrement)
+ vdo_launch_completion(&data_vio->decrement_completion);
+}
+
+/**
+ * notify_commit_waiters() - Notify any VIOs whose entries have now committed.
+ * @journal: The recovery journal to update.
+ *
+ * Walks the active tail blocks in list order, releasing each block's commit waiters. The walk
+ * stops at the first block which is still committing, or (unless the journal is read-only) at
+ * the first block which is dirty or not yet full. In read-only mode each block's entry waiters
+ * are released as well; continue_committed_waiter() continues them with VDO_READ_ONLY.
+ */
+static void notify_commit_waiters(struct recovery_journal *journal)
+{
+ struct recovery_journal_block *block;
+
+ list_for_each_entry(block, &journal->active_tail_blocks, list_node) {
+ if (block->committing)
+ return;
+
+ vdo_waitq_notify_all_waiters(&block->commit_waiters,
+ continue_committed_waiter, journal);
+ if (is_read_only(journal)) {
+ vdo_waitq_notify_all_waiters(&block->entry_waiters,
+ continue_committed_waiter,
+ journal);
+ } else if (is_block_dirty(block) || !is_block_full(block)) {
+ /* Stop at partially-committed or partially-filled blocks. */
+ return;
+ }
+ }
+}
+
+/**
+ * recycle_journal_blocks() - Recycle, in list order, the active blocks which are fully committed.
+ * @journal: The recovery journal to update.
+ *
+ * Recycling stops at the first block which cannot be recycled.
+ */
+static void recycle_journal_blocks(struct recovery_journal *journal)
+{
+	struct recovery_journal_block *block, *next;
+
+	list_for_each_entry_safe(block, next, &journal->active_tail_blocks, list_node) {
+		/* Don't recycle committing blocks. */
+		if (block->committing)
+			break;
+
+		/*
+		 * Don't recycle partially written or partially full blocks, except in
+		 * read-only mode.
+		 */
+		if (!is_read_only(journal)) {
+			if (is_block_dirty(block))
+				break;
+
+			if (!is_block_full(block))
+				break;
+		}
+
+		recycle_journal_block(block);
+	}
+}
+
+/**
+ * complete_write() - Handle post-commit processing.
+ * @completion: The completion of the VIO writing this block.
+ *
+ * This is the callback registered by write_block(). If more entries accumulated in the block being
+ * committed while the commit was in progress, another commit will be initiated.
+ *
+ * In order, this updates the commit statistics and the block's commit state, records the highest
+ * acknowledged sequence number, releases waiters whose entries are now committed, requeues the
+ * block if it is dirty and full, recycles fully committed blocks, issues further writes, and
+ * checks whether a drain can complete. It asserts that it is running on the journal thread.
+ */
+static void complete_write(struct vdo_completion *completion)
+{
+ struct recovery_journal_block *block = completion->parent;
+ struct recovery_journal *journal = block->journal;
+ struct recovery_journal_block *last_active_block;
+
+ assert_on_journal_thread(journal, __func__);
+
+ journal->pending_write_count -= 1;
+ journal->events.blocks.committed += 1;
+ journal->events.entries.committed += block->entries_in_commit;
+ block->uncommitted_entry_count -= block->entries_in_commit;
+ block->entries_in_commit = 0;
+ block->committing = false;
+
+ /* If this block is the latest block to be acknowledged, record that fact. */
+ if (block->sequence_number > journal->last_write_acknowledged)
+ journal->last_write_acknowledged = block->sequence_number;
+
+ last_active_block = get_journal_block(&journal->active_tail_blocks);
+ VDO_ASSERT_LOG_ONLY((block->sequence_number >= last_active_block->sequence_number),
+ "completed journal write is still active");
+
+ notify_commit_waiters(journal);
+
+ /*
+ * Is this block now full? Reaping, and adding entries, might have already sent it off for
+ * rewriting; else, queue it for rewrite.
+ */
+ if (is_block_dirty(block) && is_block_full(block))
+ schedule_block_write(journal, block);
+
+ recycle_journal_blocks(journal);
+ write_blocks(journal);
+
+ check_for_drain_complete(journal);
+}
+
+/*
+ * Error handler for a journal block write: record and log the failure, put the journal into
+ * read-only mode with the completion's result, and then run the usual write completion
+ * processing.
+ */
+static void handle_write_error(struct vdo_completion *completion)
+{
+	struct recovery_journal_block *block = completion->parent;
+
+	vio_record_metadata_io_error(as_vio(completion));
+	vdo_log_error_strerror(completion->result,
+			       "cannot write recovery journal block %llu",
+			       (unsigned long long) block->sequence_number);
+	enter_journal_read_only_mode(block->journal, completion->result);
+	complete_write(completion);
+}
+
+/*
+ * Bio endio for a journal block write: hand the vio to complete_write(), passing the thread ID
+ * of the journal which owns the block.
+ */
+static void complete_write_endio(struct bio *bio)
+{
+	struct vio *vio = bio->bi_private;
+	struct recovery_journal_block *block = vio->completion.parent;
+
+	continue_vio_after_io(vio, complete_write, block->journal->thread_id);
+}
+
+/**
+ * add_queued_recovery_entries() - Actually add entries from the queue to the given block.
+ * @block: The journal block.
+ *
+ * Each data_vio on the block's entry waiter queue is dequeued, its entry is packed into the next
+ * slot of the block's active sector (moving on to the following sector when the current one is
+ * full), its recovery_sequence_number is set to the block's sequence number, and it is then
+ * queued on the block's commit waiters.
+ */
+static void add_queued_recovery_entries(struct recovery_journal_block *block)
+{
+ while (vdo_waitq_has_waiters(&block->entry_waiters)) {
+ struct data_vio *data_vio =
+ vdo_waiter_as_data_vio(vdo_waitq_dequeue_waiter(&block->entry_waiters));
+ struct tree_lock *lock = &data_vio->tree_lock;
+ struct packed_recovery_journal_entry *packed_entry;
+ struct recovery_journal_entry new_entry;
+
+ if (block->sector->entry_count == RECOVERY_JOURNAL_ENTRIES_PER_SECTOR)
+ set_active_sector(block,
+ (char *) block->sector + VDO_SECTOR_SIZE);
+
+ /* Compose and encode the entry. */
+ packed_entry = &block->sector->entries[block->sector->entry_count++];
+ new_entry = (struct recovery_journal_entry) {
+ .mapping = {
+ .pbn = data_vio->increment_updater.zpbn.pbn,
+ .state = data_vio->increment_updater.zpbn.state,
+ },
+ .unmapping = {
+ .pbn = data_vio->decrement_updater.zpbn.pbn,
+ .state = data_vio->decrement_updater.zpbn.state,
+ },
+ .operation = data_vio->increment_updater.operation,
+ .slot = lock->tree_slots[lock->height].block_map_slot,
+ };
+ *packed_entry = vdo_pack_recovery_journal_entry(&new_entry);
+ data_vio->recovery_sequence_number = block->sequence_number;
+
+ /* Enqueue the data_vio to wait for its entry to commit. */
+ vdo_waitq_enqueue_waiter(&block->commit_waiters, &data_vio->waiter);
+ }
+}
+
+/**
+ * write_block() - Issue a block for writing.
+ * @waiter: The write waiter embedded in the block to write.
+ * @context: Unused.
+ *
+ * Does nothing if the block is already committing, has no entries waiting to be written, or the
+ * journal is read-only. Otherwise the waiting entries are packed into the block, the block
+ * header's heads and entry count are refreshed, and the block's vio is submitted.
+ *
+ * Implements waiter_callback_fn.
+ */
+static void write_block(struct vdo_waiter *waiter, void *context __always_unused)
+{
+ struct recovery_journal_block *block =
+ container_of(waiter, struct recovery_journal_block, write_waiter);
+ struct recovery_journal *journal = block->journal;
+ struct packed_journal_header *header = get_block_header(block);
+
+ if (block->committing || !vdo_waitq_has_waiters(&block->entry_waiters) ||
+ is_read_only(journal))
+ return;
+
+ block->entries_in_commit = vdo_waitq_num_waiters(&block->entry_waiters);
+ add_queued_recovery_entries(block);
+
+ journal->pending_write_count += 1;
+ journal->events.blocks.written += 1;
+ journal->events.entries.written += block->entries_in_commit;
+
+ header->block_map_head = __cpu_to_le64(journal->block_map_head);
+ header->slab_journal_head = __cpu_to_le64(journal->slab_journal_head);
+ header->entry_count = __cpu_to_le16(block->entry_count);
+
+ block->committing = true;
+
+ /*
+ * We must issue a flush and a FUA for every commit. The flush is necessary to ensure that
+ * the data being referenced is stable. The FUA is necessary to ensure that the journal
+ * block itself is stable before allowing overwrites of the lbn's previous data.
+ */
+ vdo_submit_metadata_vio(&block->vio, journal->origin + block->block_number,
+ complete_write_endio, handle_write_error,
+ REQ_OP_WRITE | REQ_PRIO | REQ_PREFLUSH | REQ_SYNC | REQ_FUA);
+}
+
+
+/**
+ * write_blocks() - Attempt to commit blocks, according to write policy.
+ * @journal: The recovery journal.
+ *
+ * Asserts that it is running on the journal thread. Nothing is issued while any journal write is
+ * still outstanding.
+ */
+static void write_blocks(struct recovery_journal *journal)
+{
+ assert_on_journal_thread(journal, __func__);
+ /*
+ * We call this function after adding entries to the journal and after finishing a block
+ * write. Thus, when this function terminates we must either have no VIOs waiting in the
+ * journal or have some outstanding IO to provide a future wakeup.
+ *
+ * We want to only issue full blocks if there are no pending writes. However, if there are
+ * no outstanding writes and some unwritten entries, we must issue a block, even if it's
+ * the active block and it isn't full.
+ */
+ if (journal->pending_write_count > 0)
+ return;
+
+ /* Write all the full blocks. */
+ vdo_waitq_notify_all_waiters(&journal->pending_writes, write_block, NULL);
+
+ /*
+ * Do we need to write the active block? Only if we have no outstanding writes, even after
+ * issuing all of the full writes.
+ */
+ if ((journal->pending_write_count == 0) && (journal->active_block != NULL))
+ write_block(&journal->active_block->write_waiter, NULL);
+}
+
+/**
+ * vdo_add_recovery_journal_entry() - Add an entry to a recovery journal.
+ * @journal: The journal in which to make an entry.
+ * @data_vio: The data_vio for which to add the entry. The entry is composed from the
+ *            data_vio's increment and decrement updaters and its tree lock (see
+ *            add_queued_recovery_entries()). The data_vio's recovery_sequence_number
+ *            field will be set to the sequence number of the journal block in which
+ *            the entry was made.
+ *
+ * This method is asynchronous. The data_vio will not be called back until the entry is committed
+ * to the on-disk journal. If the journal is not in normal operation, or is read-only, the data_vio
+ * is instead continued immediately with VDO_INVALID_ADMIN_STATE or VDO_READ_ONLY respectively.
+ */
+void vdo_add_recovery_journal_entry(struct recovery_journal *journal,
+ struct data_vio *data_vio)
+{
+ assert_on_journal_thread(journal, __func__);
+ if (!vdo_is_state_normal(&journal->state)) {
+ continue_data_vio_with_error(data_vio, VDO_INVALID_ADMIN_STATE);
+ return;
+ }
+
+ if (is_read_only(journal)) {
+ continue_data_vio_with_error(data_vio, VDO_READ_ONLY);
+ return;
+ }
+
+ VDO_ASSERT_LOG_ONLY(data_vio->recovery_sequence_number == 0,
+ "journal lock not held for new entry");
+
+ vdo_advance_journal_point(&journal->append_point, journal->entries_per_block);
+ vdo_waitq_enqueue_waiter(&journal->entry_waiters, &data_vio->waiter);
+ assign_entries(journal);
+}
+
+/**
+ * is_lock_locked() - Check whether a lock is locked for a zone type.
+ * @journal: The recovery journal.
+ * @lock_number: The lock to check.
+ * @zone_type: The type of the zone.
+ *
+ * If the recovery journal has a lock on the lock number, both logical and physical zones are
+ * considered locked. Otherwise the lock is locked if the count for the given zone type is
+ * non-zero; that count is read before a read barrier is issued.
+ *
+ * Return: true if the specified lock has references (is locked).
+ */
+static bool is_lock_locked(struct recovery_journal *journal, block_count_t lock_number,
+ enum vdo_zone_type zone_type)
+{
+ atomic_t *zone_count;
+ bool locked;
+
+ if (is_journal_zone_locked(journal, lock_number))
+ return true;
+
+ zone_count = get_zone_count_ptr(journal, lock_number, zone_type);
+ locked = (atomic_read(zone_count) != 0);
+ /* Pairs with implicit barrier in vdo_release_recovery_journal_block_reference() */
+ smp_rmb();
+ return locked;
+}
+
+/**
+ * reap_recovery_journal() - Conduct a sweep on a recovery journal to reclaim unreferenced blocks.
+ * @journal: The recovery journal.
+ *
+ * Does nothing if a reap is already in progress or the journal is quiescent. Otherwise the
+ * block map and slab journal reap heads are each advanced past every acknowledged block which
+ * is not locked for the logical or physical zone type respectively. If either reap head now
+ * differs from its corresponding head, journal->reaping is set and a flush is submitted; the
+ * heads themselves are not changed here (presumably the flush completion does that; the
+ * handlers are defined elsewhere).
+ */
+static void reap_recovery_journal(struct recovery_journal *journal)
+{
+ if (journal->reaping) {
+ /*
+ * We already have an outstanding reap in progress. We need to wait for it to
+ * finish.
+ */
+ return;
+ }
+
+ if (vdo_is_state_quiescent(&journal->state)) {
+ /* We are supposed to not do IO. Don't botch it by reaping. */
+ return;
+ }
+
+ /*
+ * Start reclaiming blocks only when the journal head has no references. Then stop when a
+ * block is referenced. The head block numbers wrap to zero at the journal size.
+ */
+ while ((journal->block_map_reap_head < journal->last_write_acknowledged) &&
+ !is_lock_locked(journal, journal->block_map_head_block_number,
+ VDO_ZONE_TYPE_LOGICAL)) {
+ journal->block_map_reap_head++;
+ if (++journal->block_map_head_block_number == journal->size)
+ journal->block_map_head_block_number = 0;
+ }
+
+ while ((journal->slab_journal_reap_head < journal->last_write_acknowledged) &&
+ !is_lock_locked(journal, journal->slab_journal_head_block_number,
+ VDO_ZONE_TYPE_PHYSICAL)) {
+ journal->slab_journal_reap_head++;
+ if (++journal->slab_journal_head_block_number == journal->size)
+ journal->slab_journal_head_block_number = 0;
+ }
+
+ if ((journal->block_map_reap_head == journal->block_map_head) &&
+ (journal->slab_journal_reap_head == journal->slab_journal_head)) {
+ /* Nothing happened. */
+ return;
+ }
+
+ /*
+ * If the block map head will advance, we must flush any block map page modified by the
+ * entries we are reaping. If the slab journal head will advance, we must flush the slab
+ * summary update covering the slab journal that just released some lock.
+ */
+ journal->reaping = true;
+ vdo_submit_flush_vio(journal->flush_vio, flush_endio, handle_flush_error);
+}
+
+/**
+ * vdo_acquire_recovery_journal_block_reference() - Acquire a reference to a recovery journal block
+ * from somewhere other than the journal itself.
+ * @journal: The recovery journal.
+ * @sequence_number: The journal sequence number of the referenced block.
+ * @zone_type: The type of the zone making the adjustment.
+ * @zone_id: The ID of the zone making the adjustment.
+ *
+ * Does nothing if @sequence_number is 0. The zone's own counter for the lock is incremented; the
+ * shared count for the zone type is incremented only when the zone's counter goes from zero.
+ */
+void vdo_acquire_recovery_journal_block_reference(struct recovery_journal *journal,
+ sequence_number_t sequence_number,
+ enum vdo_zone_type zone_type,
+ zone_count_t zone_id)
+{
+ block_count_t lock_number;
+ u16 *current_value;
+
+ if (sequence_number == 0)
+ return;
+
+ VDO_ASSERT_LOG_ONLY((zone_type != VDO_ZONE_TYPE_JOURNAL),
+ "invalid lock count increment from journal zone");
+
+ lock_number = vdo_get_recovery_journal_block_number(journal, sequence_number);
+ current_value = get_counter(journal, lock_number, zone_type, zone_id);
+ VDO_ASSERT_LOG_ONLY(*current_value < U16_MAX,
+ "increment of lock counter must not overflow");
+
+ if (*current_value == 0) {
+ /*
+ * This zone is acquiring this lock for the first time. Extra barriers because this
+ * was originally developed using an atomic add operation that implicitly had them.
+ */
+ smp_mb__before_atomic();
+ atomic_inc(get_zone_count_ptr(journal, lock_number, zone_type));
+ /* same as before_atomic */
+ smp_mb__after_atomic();
+ }
+
+ *current_value += 1;
+}
+
+/**
+ * vdo_release_journal_entry_lock() - Release a single per-entry reference count for a recovery
+ * journal block.
+ * @journal: The recovery journal.
+ * @sequence_number: The journal sequence number of the referenced block.
+ *
+ * Does nothing if @sequence_number is 0. The release is recorded by incrementing the decrement
+ * counter of the lock for the block.
+ */
+void vdo_release_journal_entry_lock(struct recovery_journal *journal,
+ sequence_number_t sequence_number)
+{
+ block_count_t lock_number;
+
+ if (sequence_number == 0)
+ return;
+
+ lock_number = vdo_get_recovery_journal_block_number(journal, sequence_number);
+ /*
+ * Extra barriers because this was originally developed using an atomic add operation that
+ * implicitly had them.
+ */
+ smp_mb__before_atomic();
+ atomic_inc(get_decrement_counter(journal, lock_number));
+ /* same as before_atomic */
+ smp_mb__after_atomic();
+}
+
+/**
+ * initiate_drain() - Begin draining a recovery journal.
+ * @state: The admin state embedded in the journal to drain.
+ *
+ * Implements vdo_admin_initiator_fn.
+ */
+static void initiate_drain(struct admin_state *state)
+{
+	struct recovery_journal *journal = container_of(state, struct recovery_journal, state);
+
+	check_for_drain_complete(journal);
+}
+
+/**
+ * vdo_drain_recovery_journal() - Drain recovery journal I/O.
+ * @journal: The journal to drain.
+ * @operation: The drain operation (suspend or save).
+ * @parent: The completion to notify once the journal is drained.
+ *
+ * All uncommitted entries will be written out. The drain is started on the journal's admin state
+ * with initiate_drain() as its initiator. Asserts that it is running on the journal thread.
+ */
+void vdo_drain_recovery_journal(struct recovery_journal *journal,
+ const struct admin_state_code *operation,
+ struct vdo_completion *parent)
+{
+ assert_on_journal_thread(journal, __func__);
+ vdo_start_draining(&journal->state, operation, parent, initiate_drain);
+}
+
+/**
+ * resume_lock_counter() - Re-allow notifications from a suspended lock counter.
+ * @counter: The counter.
+ *
+ * Atomically changes the counter's state from LOCK_COUNTER_STATE_SUSPENDED to
+ * LOCK_COUNTER_STATE_NOT_NOTIFYING; any other state is left unchanged.
+ *
+ * Return: true if the lock counter was suspended.
+ */
+static bool resume_lock_counter(struct lock_counter *counter)
+{
+ int prior_state;
+
+ /*
+ * Extra barriers because this was originally developed using a CAS operation that
+ * implicitly had them.
+ */
+ smp_mb__before_atomic();
+ prior_state = atomic_cmpxchg(&counter->state, LOCK_COUNTER_STATE_SUSPENDED,
+ LOCK_COUNTER_STATE_NOT_NOTIFYING);
+ /* same as before_atomic */
+ smp_mb__after_atomic();
+
+ return (prior_state == LOCK_COUNTER_STATE_SUSPENDED);
+}
+
+/**
+ * vdo_resume_recovery_journal() - Resume a recovery journal which has been drained.
+ * @journal: The journal to resume.
+ * @parent: The completion to finish once the journal is resumed.
+ *
+ * The result of resuming the journal's admin state is recorded in @parent. If the journal is
+ * read-only, @parent is continued with VDO_READ_ONLY and nothing further is done. Otherwise, if
+ * the journal was in a saved state before resuming, initialize_journal_state() is called; and if
+ * the lock counter had been suspended, a reap is attempted. @parent is then launched.
+ */
+void vdo_resume_recovery_journal(struct recovery_journal *journal,
+ struct vdo_completion *parent)
+{
+ bool saved;
+
+ assert_on_journal_thread(journal, __func__);
+ saved = vdo_is_state_saved(&journal->state);
+ vdo_set_completion_result(parent, vdo_resume_if_quiescent(&journal->state));
+ if (is_read_only(journal)) {
+ vdo_continue_completion(parent, VDO_READ_ONLY);
+ return;
+ }
+
+ if (saved)
+ initialize_journal_state(journal);
+
+ if (resume_lock_counter(&journal->lock_counter)) {
+ /* We might have missed a notification. */
+ reap_recovery_journal(journal);
+ }
+
+ vdo_launch_completion(parent);
+}
+
+/**
+ * vdo_get_recovery_journal_logical_blocks_used() - Report how many logical blocks the VDO is
+ * using, as counted by the journal.
+ * @journal: The journal.
+ *
+ * Return: The number of logical blocks in use by the VDO.
+ */
+block_count_t vdo_get_recovery_journal_logical_blocks_used(const struct recovery_journal *journal)
+{
+	const block_count_t in_use = journal->logical_blocks_used;
+
+	return in_use;
+}
+
+/**
+ * vdo_get_recovery_journal_statistics() - Take a copy of the recovery journal's statistics.
+ * @journal: The recovery journal to query.
+ *
+ * Return: A copy of the current statistics for the journal.
+ */
+struct recovery_journal_statistics
+vdo_get_recovery_journal_statistics(const struct recovery_journal *journal)
+{
+	struct recovery_journal_statistics snapshot = journal->events;
+
+	return snapshot;
+}
+
+/**
+ * dump_recovery_block() - Log a one-line summary of a recovery journal block.
+ * @block: The block to dump.
+ *
+ * The line gives the block's sequence number, entry count, whether it is committing, and the
+ * number of waiters on its entry and commit queues.
+ */
+static void dump_recovery_block(const struct recovery_journal_block *block)
+{
+	const char *status = "waiting";
+
+	if (block->committing)
+		status = "committing";
+
+	vdo_log_info(" sequence number %llu; entries %u; %s; %zu entry waiters; %zu commit waiters",
+		     (unsigned long long) block->sequence_number, block->entry_count, status,
+		     vdo_waitq_num_waiters(&block->entry_waiters),
+		     vdo_waitq_num_waiters(&block->commit_waiters));
+}
+
+/**
+ * vdo_dump_recovery_journal_statistics() - Dump some current statistics and other debug info from
+ * the recovery journal.
+ * @journal: The recovery journal to dump.
+ *
+ * Logs the journal's head, tail and reap positions together with a few event counters, then the
+ * started/written/committed counts for entries and for blocks, and finally one line for each
+ * block on the active tail block list. The event counters come from a copy of the statistics
+ * taken at the start of the call.
+ */
+void vdo_dump_recovery_journal_statistics(const struct recovery_journal *journal)
+{
+ const struct recovery_journal_block *block;
+ struct recovery_journal_statistics stats = vdo_get_recovery_journal_statistics(journal);
+
+ vdo_log_info("Recovery Journal");
+ vdo_log_info(" block_map_head=%llu slab_journal_head=%llu last_write_acknowledged=%llu tail=%llu block_map_reap_head=%llu slab_journal_reap_head=%llu disk_full=%llu slab_journal_commits_requested=%llu entry_waiters=%zu",
+ (unsigned long long) journal->block_map_head,
+ (unsigned long long) journal->slab_journal_head,
+ (unsigned long long) journal->last_write_acknowledged,
+ (unsigned long long) journal->tail,
+ (unsigned long long) journal->block_map_reap_head,
+ (unsigned long long) journal->slab_journal_reap_head,
+ (unsigned long long) stats.disk_full,
+ (unsigned long long) stats.slab_journal_commits_requested,
+ vdo_waitq_num_waiters(&journal->entry_waiters));
+ vdo_log_info(" entries: started=%llu written=%llu committed=%llu",
+ (unsigned long long) stats.entries.started,
+ (unsigned long long) stats.entries.written,
+ (unsigned long long) stats.entries.committed);
+ vdo_log_info(" blocks: started=%llu written=%llu committed=%llu",
+ (unsigned long long) stats.blocks.started,
+ (unsigned long long) stats.blocks.written,
+ (unsigned long long) stats.blocks.committed);
+
+ vdo_log_info(" active blocks:");
+ list_for_each_entry(block, &journal->active_tail_blocks, list_node)
+ dump_recovery_block(block);
+}
diff --git a/drivers/md/dm-vdo/recovery-journal.h b/drivers/md/dm-vdo/recovery-journal.h
new file mode 100644
index 000000000000..899071173015
--- /dev/null
+++ b/drivers/md/dm-vdo/recovery-journal.h
@@ -0,0 +1,316 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#ifndef VDO_RECOVERY_JOURNAL_H
+#define VDO_RECOVERY_JOURNAL_H
+
+#include <linux/list.h>
+
+#include "numeric.h"
+
+#include "admin-state.h"
+#include "constants.h"
+#include "encodings.h"
+#include "flush.h"
+#include "statistics.h"
+#include "types.h"
+#include "wait-queue.h"
+
+/**
+ * DOC: recovery journal.
+ *
+ * The recovery_journal provides a log of all block mapping and reference count changes which have
+ * not yet been stably written to the block map or slab journals. This log helps to reduce the
+ * write amplification of writes by providing amortization of slab journal and block map page
+ * updates.
+ *
+ * The recovery journal has a single dedicated queue and thread for performing all journal updates.
+ * The concurrency guarantees of this single-threaded model allow the code to omit more
+ * fine-grained locking for recovery journal structures.
+ *
+ * The journal consists of a set of on-disk blocks arranged as a circular log with monotonically
+ * increasing sequence numbers. Three sequence numbers serve to define the active extent of the
+ * journal. The 'head' is the oldest active block in the journal. The 'tail' is the end of the
+ * half-open interval containing the active blocks. 'active' is the number of the block actively
+ * receiving entries. In an empty journal, head == active == tail. Once any entries are added, tail
+ * = active + 1, and head may be any value in the interval [tail - size, active].
+ *
+ * The journal also contains a set of in-memory blocks which are used to buffer up entries until
+ * they can be committed. In general the number of in-memory blocks ('tail_buffer_count') will be
+ * less than the on-disk size. Each in-memory block is also a vdo_completion. Each in-memory block
+ * has a vio which is used to commit that block to disk. The vio's data is the on-disk
+ * representation of the journal block. In addition each in-memory block has a buffer which is used
+ * to accumulate entries while a partial commit of the block is in progress. In-memory blocks are
+ * kept on two rings. Free blocks live on the 'free_tail_blocks' ring. When a block becomes active
+ * (see below) it is moved to the 'active_tail_blocks' ring. When a block is fully committed, it is
+ * moved back to the 'free_tail_blocks' ring.
+ *
+ * When entries are added to the journal, they are added to the active in-memory block, as
+ * indicated by the 'active_block' field. If the caller wishes to wait for the entry to be
+ * committed, the requesting VIO will be attached to the in-memory block to which the caller's
+ * entry was added. If the caller does wish to wait, or if the entry filled the active block, an
+ * attempt will be made to commit that block to disk. If there is already another commit in
+ * progress, the attempt will be ignored and then automatically retried when the in-progress commit
+ * completes. If there is no commit in progress, any data_vios waiting on the block are transferred
+ * to the block's vio which is then written, automatically waking all of the waiters when it
+ * completes. When the write completes, any entries which accumulated in the block are copied to
+ * the vio's data buffer.
+ *
+ * Finally, the journal maintains a set of counters, one for each on-disk journal block. These
+ * counters are used as locks to prevent premature reaping of journal blocks. Each time a new
+ * sequence number is used, the counter for the corresponding block is incremented. The counter is
+ * subsequently decremented when that block is filled and then committed for the last time. This
+ * prevents blocks from being reaped while they are still being updated. The counter is also
+ * incremented once for each entry added to a block, and decremented once each time the block map
+ * is updated in memory for that request. This prevents blocks from being reaped while their VIOs
+ * are still active. Finally, each in-memory block map page tracks the oldest journal block that
+ * contains entries corresponding to uncommitted updates to that block map page. Each time an
+ * in-memory block map page is updated, it checks if the journal block for the VIO is earlier than
+ * the one it references, in which case it increments the count on the earlier journal block and
+ * decrements the count on the later journal block, maintaining a lock on the oldest journal block
+ * containing entries for that page. When a block map page has been flushed from the cache, the
+ * counter for the journal block it references is decremented. Whenever the counter for the head
+ * block goes to 0, the head is advanced until it comes to a block whose counter is not 0 or until
+ * it reaches the active block. This is the mechanism for reclaiming journal space on disk.
+ *
+ * If there is no in-memory space when a VIO attempts to add an entry, the VIO will be attached to
+ * the 'commit_completion' and will be woken the next time a full block has committed. If there is
+ * no on-disk space when a VIO attempts to add an entry, the VIO will be attached to the
+ * 'reap_completion', and will be woken the next time a journal block is reaped.
+ */
+
+enum vdo_zone_type {
+ VDO_ZONE_TYPE_ADMIN,
+ VDO_ZONE_TYPE_JOURNAL,
+ VDO_ZONE_TYPE_LOGICAL,
+ VDO_ZONE_TYPE_PHYSICAL,
+};
+
+struct lock_counter {
+ /* The completion for notifying the owner of a lock release */
+ struct vdo_completion completion;
+ /* The number of logical zones which may hold locks */
+ zone_count_t logical_zones;
+ /* The number of physical zones which may hold locks */
+ zone_count_t physical_zones;
+ /* The number of locks */
+ block_count_t locks;
+ /* Whether the lock release notification is in flight */
+ atomic_t state;
+ /* The number of logical zones which hold each lock */
+ atomic_t *logical_zone_counts;
+ /* The number of physical zones which hold each lock */
+ atomic_t *physical_zone_counts;
+ /* The per-lock counts for the journal zone */
+ u16 *journal_counters;
+ /* The per-lock decrement counts for the journal zone */
+ atomic_t *journal_decrement_counts;
+ /* The per-zone, per-lock reference counts for logical zones */
+ u16 *logical_counters;
+ /* The per-zone, per-lock reference counts for physical zones */
+ u16 *physical_counters;
+};
+
+struct recovery_journal_block {
+ /* The doubly linked pointers for the free or active lists */
+ struct list_head list_node;
+ /* The waiter for the pending full block list */
+ struct vdo_waiter write_waiter;
+ /* The journal to which this block belongs */
+ struct recovery_journal *journal;
+ /* A pointer to the current sector in the packed block buffer */
+ struct packed_journal_sector *sector;
+ /* The vio for writing this block */
+ struct vio vio;
+ /* The sequence number for this block */
+ sequence_number_t sequence_number;
+ /* The location of this block in the on-disk journal */
+ physical_block_number_t block_number;
+ /* Whether this block is being committed */
+ bool committing;
+ /* The total number of entries in this block */
+ journal_entry_count_t entry_count;
+ /* The total number of uncommitted entries (queued or committing) */
+ journal_entry_count_t uncommitted_entry_count;
+ /* The number of new entries in the current commit */
+ journal_entry_count_t entries_in_commit;
+ /* The queue of vios which will make entries for the next commit */
+ struct vdo_wait_queue entry_waiters;
+ /* The queue of vios waiting for the current commit */
+ struct vdo_wait_queue commit_waiters;
+};
+
+struct recovery_journal {
+ /* The thread ID of the journal zone */
+ thread_id_t thread_id;
+ /* The slab depot which can hold locks on this journal */
+ struct slab_depot *depot;
+ /* The block map which can hold locks on this journal */
+ struct block_map *block_map;
+ /* The queue of vios waiting to make entries */
+ struct vdo_wait_queue entry_waiters;
+ /* The number of free entries in the journal */
+ u64 available_space;
+ /* The number of decrement entries which need to be made */
+ data_vio_count_t pending_decrement_count;
+ /* Whether the journal is adding entries from the increment or decrement waiters queues */
+ bool adding_entries;
+ /* The administrative state of the journal */
+ struct admin_state state;
+ /* Whether a reap is in progress */
+ bool reaping;
+ /* The location of the first journal block */
+ physical_block_number_t origin;
+ /* The oldest active block in the journal on disk for block map rebuild */
+ sequence_number_t block_map_head;
+ /* The oldest active block in the journal on disk for slab journal replay */
+ sequence_number_t slab_journal_head;
+ /* The newest block in the journal on disk to which a write has finished */
+ sequence_number_t last_write_acknowledged;
+ /* The end of the half-open interval of the active journal */
+ sequence_number_t tail;
+ /* The point at which the last entry will have been added */
+ struct journal_point append_point;
+ /* The journal point of the vio most recently released from the journal */
+ struct journal_point commit_point;
+ /* The nonce of the VDO */
+ nonce_t nonce;
+ /* The number of recoveries completed by the VDO */
+ u8 recovery_count;
+ /* The number of entries which fit in a single block */
+ journal_entry_count_t entries_per_block;
+ /* Unused in-memory journal blocks */
+ struct list_head free_tail_blocks;
+ /* In-memory journal blocks with records */
+ struct list_head active_tail_blocks;
+ /* A pointer to the active block (the one we are adding entries to now) */
+ struct recovery_journal_block *active_block;
+ /* Journal blocks that need writing */
+ struct vdo_wait_queue pending_writes;
+ /* The new block map reap head after reaping */
+ sequence_number_t block_map_reap_head;
+ /* The head block number for the block map rebuild range */
+ block_count_t block_map_head_block_number;
+ /* The new slab journal reap head after reaping */
+ sequence_number_t slab_journal_reap_head;
+ /* The head block number for the slab journal replay range */
+ block_count_t slab_journal_head_block_number;
+ /* The data-less vio, usable only for flushing */
+ struct vio *flush_vio;
+ /* The number of blocks in the on-disk journal */
+ block_count_t size;
+ /* The number of logical blocks that are in-use */
+ block_count_t logical_blocks_used;
+ /* The number of block map pages that are allocated */
+ block_count_t block_map_data_blocks;
+ /* The number of journal blocks written but not yet acknowledged */
+ block_count_t pending_write_count;
+ /* The threshold at which slab journal tail blocks will be written out */
+ block_count_t slab_journal_commit_threshold;
+ /* Counters for events in the journal that are reported as statistics */
+ struct recovery_journal_statistics events;
+ /* The locks for each on-disk block */
+ struct lock_counter lock_counter;
+ /* The tail blocks */
+ struct recovery_journal_block blocks[];
+};
+
+/**
+ * vdo_get_recovery_journal_block_number() - Get the physical block number for a given sequence
+ * number.
+ * @journal: The journal.
+ * @sequence: The sequence number of the desired block.
+ *
+ * Return: The block number corresponding to the sequence number.
+ */
+static inline physical_block_number_t __must_check
+vdo_get_recovery_journal_block_number(const struct recovery_journal *journal,
+ sequence_number_t sequence)
+{
+ /*
+ * Since journal size is a power of two, the block number modulus can just be extracted
+ * from the low-order bits of the sequence.
+ */
+ return vdo_compute_recovery_journal_block_number(journal->size, sequence);
+}
+
+/**
+ * vdo_compute_recovery_journal_check_byte() - Compute the check byte for a given sequence number.
+ * @journal: The journal.
+ * @sequence: The sequence number.
+ *
+ * Return: The check byte corresponding to the sequence number.
+ */
+static inline u8 __must_check
+vdo_compute_recovery_journal_check_byte(const struct recovery_journal *journal,
+ sequence_number_t sequence)
+{
+ /* The check byte must change with each trip around the journal. */
+ return (((sequence / journal->size) & 0x7F) | 0x80);
+}
+
+int __must_check vdo_decode_recovery_journal(struct recovery_journal_state_7_0 state,
+ nonce_t nonce, struct vdo *vdo,
+ struct partition *partition,
+ u64 recovery_count,
+ block_count_t journal_size,
+ struct recovery_journal **journal_ptr);
+
+void vdo_free_recovery_journal(struct recovery_journal *journal);
+
+void vdo_initialize_recovery_journal_post_repair(struct recovery_journal *journal,
+ u64 recovery_count,
+ sequence_number_t tail,
+ block_count_t logical_blocks_used,
+ block_count_t block_map_data_blocks);
+
+block_count_t __must_check
+vdo_get_journal_block_map_data_blocks_used(struct recovery_journal *journal);
+
+thread_id_t __must_check vdo_get_recovery_journal_thread_id(struct recovery_journal *journal);
+
+void vdo_open_recovery_journal(struct recovery_journal *journal,
+ struct slab_depot *depot, struct block_map *block_map);
+
+sequence_number_t
+vdo_get_recovery_journal_current_sequence_number(struct recovery_journal *journal);
+
+block_count_t __must_check vdo_get_recovery_journal_length(block_count_t journal_size);
+
+struct recovery_journal_state_7_0 __must_check
+vdo_record_recovery_journal(const struct recovery_journal *journal);
+
+void vdo_add_recovery_journal_entry(struct recovery_journal *journal,
+ struct data_vio *data_vio);
+
+void vdo_acquire_recovery_journal_block_reference(struct recovery_journal *journal,
+ sequence_number_t sequence_number,
+ enum vdo_zone_type zone_type,
+ zone_count_t zone_id);
+
+void vdo_release_recovery_journal_block_reference(struct recovery_journal *journal,
+ sequence_number_t sequence_number,
+ enum vdo_zone_type zone_type,
+ zone_count_t zone_id);
+
+void vdo_release_journal_entry_lock(struct recovery_journal *journal,
+ sequence_number_t sequence_number);
+
+void vdo_drain_recovery_journal(struct recovery_journal *journal,
+ const struct admin_state_code *operation,
+ struct vdo_completion *parent);
+
+void vdo_resume_recovery_journal(struct recovery_journal *journal,
+ struct vdo_completion *parent);
+
+block_count_t __must_check
+vdo_get_recovery_journal_logical_blocks_used(const struct recovery_journal *journal);
+
+struct recovery_journal_statistics __must_check
+vdo_get_recovery_journal_statistics(const struct recovery_journal *journal);
+
+void vdo_dump_recovery_journal_statistics(const struct recovery_journal *journal);
+
+#endif /* VDO_RECOVERY_JOURNAL_H */
diff --git a/drivers/md/dm-vdo/repair.c b/drivers/md/dm-vdo/repair.c
new file mode 100644
index 000000000000..defc9359f10e
--- /dev/null
+++ b/drivers/md/dm-vdo/repair.c
@@ -0,0 +1,1756 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#include "repair.h"
+
+#include <linux/min_heap.h>
+#include <linux/minmax.h>
+
+#include "logger.h"
+#include "memory-alloc.h"
+#include "permassert.h"
+
+#include "block-map.h"
+#include "completion.h"
+#include "constants.h"
+#include "encodings.h"
+#include "int-map.h"
+#include "io-submitter.h"
+#include "recovery-journal.h"
+#include "slab-depot.h"
+#include "types.h"
+#include "vdo.h"
+#include "wait-queue.h"
+
+/*
+ * An explicitly numbered block mapping. Numbering the mappings allows them to be sorted by logical
+ * block number during repair while still preserving the relative order of journal entries with
+ * the same logical block number.
+ */
+struct numbered_block_mapping {
+ struct block_map_slot block_map_slot;
+ struct block_map_entry block_map_entry;
+ /* A serial number to use during replay */
+ u32 number;
+} __packed;
+
+/*
+ * The absolute position of an entry in the recovery journal, including the sector number and the
+ * entry number within the sector.
+ */
+struct recovery_point {
+ /* Block sequence number */
+ sequence_number_t sequence_number;
+ /* Sector number */
+ u8 sector_count;
+ /* Entry number */
+ journal_entry_count_t entry_count;
+ /* Whether or not the increment portion of the current entry has been applied */
+ bool increment_applied;
+};
+
+struct repair_completion {
+ /* The completion header */
+ struct vdo_completion completion;
+
+ /* A buffer to hold the data read off disk */
+ char *journal_data;
+
+ /* For loading the journal */
+ data_vio_count_t vio_count;
+ data_vio_count_t vios_complete;
+ struct vio *vios;
+
+ /* The number of entries to be applied to the block map */
+ size_t block_map_entry_count;
+ /* The sequence number of the first valid block for block map recovery */
+ sequence_number_t block_map_head;
+ /* The sequence number of the first valid block for slab journal replay */
+ sequence_number_t slab_journal_head;
+ /* The sequence number of the last valid block of the journal (if known) */
+ sequence_number_t tail;
+ /*
+ * The highest sequence number of the journal. During recovery (vs read-only rebuild), not
+ * the same as the tail, since the tail ignores blocks after the first hole.
+ */
+ sequence_number_t highest_tail;
+
+ /* The number of logical blocks currently known to be in use */
+ block_count_t logical_blocks_used;
+ /* The number of block map data blocks known to be allocated */
+ block_count_t block_map_data_blocks;
+
+ /* These fields are for playing the journal into the block map */
+ /* The entry data for the block map recovery */
+ struct numbered_block_mapping *entries;
+ /* The number of entries in the entry array */
+ size_t entry_count;
+ /* number of pending (non-ready) requests */
+ page_count_t outstanding;
+ /* number of page completions */
+ page_count_t page_count;
+ bool launching;
+ /*
+ * A heap wrapping the 'entries' array. It re-orders and sorts journal entries in ascending
+ * LBN order, then original journal order. This permits efficient iteration over the
+ * journal entries in order.
+ */
+ struct min_heap replay_heap;
+ /* Fields tracking progress through the journal entries. */
+ struct numbered_block_mapping *current_entry;
+ struct numbered_block_mapping *current_unfetched_entry;
+ /* Current requested page's PBN */
+ physical_block_number_t pbn;
+
+ /* These fields are only used during recovery. */
+ /* A location just beyond the last valid entry of the journal */
+ struct recovery_point tail_recovery_point;
+ /* The location of the next recovery journal entry to apply */
+ struct recovery_point next_recovery_point;
+ /* The journal point to give to the next synthesized decref */
+ struct journal_point next_journal_point;
+ /* The number of entries played into slab journals */
+ size_t entries_added_to_slab_journals;
+
+ /* These fields are only used during read-only rebuild */
+ page_count_t page_to_fetch;
+ /* the number of leaf pages in the block map */
+ page_count_t leaf_pages;
+ /* the last slot of the block map */
+ struct block_map_slot last_slot;
+
+ /*
+ * The page completions used for playing the journal into the block map, and, during
+ * read-only rebuild, for rebuilding the reference counts from the block map.
+ */
+ struct vdo_page_completion page_completions[];
+};
+
+/*
+ * This is a min_heap callback function that orders numbered_block_mappings using the
+ * 'block_map_slot' field as the primary key and the mapping 'number' field as the secondary key.
+ * Using the mapping number preserves the journal order of entries for the same slot, allowing us
+ * to sort by slot while still ensuring we replay all entries with the same slot in the exact order
+ * as they appeared in the journal. Returns true only if item1 sorts strictly before item2.
+ */
+static bool mapping_is_less_than(const void *item1, const void *item2)
+{
+ const struct numbered_block_mapping *mapping1 =
+ (const struct numbered_block_mapping *) item1;
+ const struct numbered_block_mapping *mapping2 =
+ (const struct numbered_block_mapping *) item2;
+
+ if (mapping1->block_map_slot.pbn != mapping2->block_map_slot.pbn)
+ return mapping1->block_map_slot.pbn < mapping2->block_map_slot.pbn;
+
+ if (mapping1->block_map_slot.slot != mapping2->block_map_slot.slot)
+ return mapping1->block_map_slot.slot < mapping2->block_map_slot.slot;
+
+ if (mapping1->number != mapping2->number)
+ return mapping1->number < mapping2->number;
+
+ return false;
+}
+
+static void swap_mappings(void *item1, void *item2)
+{
+ struct numbered_block_mapping *mapping1 = item1;
+ struct numbered_block_mapping *mapping2 = item2;
+
+ swap(*mapping1, *mapping2);
+}
+
+static const struct min_heap_callbacks repair_min_heap = {
+ .elem_size = sizeof(struct numbered_block_mapping),
+ .less = mapping_is_less_than,
+ .swp = swap_mappings,
+};
+
+static struct numbered_block_mapping *sort_next_heap_element(struct repair_completion *repair)
+{
+ struct min_heap *heap = &repair->replay_heap;
+ struct numbered_block_mapping *last;
+
+ if (heap->nr == 0)
+ return NULL;
+
+ /*
+ * Swap the next heap element with the last one on the heap, popping it off the heap,
+ * restore the heap invariant, and return a pointer to the popped element.
+ */
+ last = &repair->entries[--heap->nr];
+ swap_mappings(heap->data, last);
+ min_heapify(heap, 0, &repair_min_heap);
+ return last;
+}
+
+/**
+ * as_repair_completion() - Convert a generic completion to a repair_completion.
+ * @completion: The completion to convert.
+ *
+ * Return: The repair_completion.
+ */
+static inline struct repair_completion * __must_check
+as_repair_completion(struct vdo_completion *completion)
+{
+ vdo_assert_completion_type(completion, VDO_REPAIR_COMPLETION);
+ return container_of(completion, struct repair_completion, completion);
+}
+
+static void prepare_repair_completion(struct repair_completion *repair,
+ vdo_action_fn callback, enum vdo_zone_type zone_type)
+{
+ struct vdo_completion *completion = &repair->completion;
+ const struct thread_config *thread_config = &completion->vdo->thread_config;
+ thread_id_t thread_id;
+
+ /* All blockmap access is done on a single thread, so use logical zone 0. */
+ thread_id = ((zone_type == VDO_ZONE_TYPE_LOGICAL) ?
+ thread_config->logical_threads[0] :
+ thread_config->admin_thread);
+ vdo_reset_completion(completion);
+ vdo_set_completion_callback(completion, callback, thread_id);
+}
+
+static void launch_repair_completion(struct repair_completion *repair,
+ vdo_action_fn callback, enum vdo_zone_type zone_type)
+{
+ prepare_repair_completion(repair, callback, zone_type);
+ vdo_launch_completion(&repair->completion);
+}
+
+static void uninitialize_vios(struct repair_completion *repair)
+{
+ while (repair->vio_count > 0)
+ free_vio_components(&repair->vios[--repair->vio_count]);
+
+ vdo_free(vdo_forget(repair->vios));
+}
+
+static void free_repair_completion(struct repair_completion *repair)
+{
+ if (repair == NULL)
+ return;
+
+ /*
+ * We do this here because this function is the only common bottleneck for all clean up
+ * paths.
+ */
+ repair->completion.vdo->block_map->zones[0].page_cache.rebuilding = false;
+
+ uninitialize_vios(repair);
+ vdo_free(vdo_forget(repair->journal_data));
+ vdo_free(vdo_forget(repair->entries));
+ vdo_free(repair);
+}
+
+static void finish_repair(struct vdo_completion *completion)
+{
+ struct vdo_completion *parent = completion->parent;
+ struct vdo *vdo = completion->vdo;
+ struct repair_completion *repair = as_repair_completion(completion);
+
+ vdo_assert_on_admin_thread(vdo, __func__);
+
+ if (vdo->load_state != VDO_REBUILD_FOR_UPGRADE)
+ vdo->states.vdo.complete_recoveries++;
+
+ vdo_initialize_recovery_journal_post_repair(vdo->recovery_journal,
+ vdo->states.vdo.complete_recoveries,
+ repair->highest_tail,
+ repair->logical_blocks_used,
+ repair->block_map_data_blocks);
+ free_repair_completion(vdo_forget(repair));
+
+ if (vdo_state_requires_read_only_rebuild(vdo->load_state)) {
+ vdo_log_info("Read-only rebuild complete");
+ vdo_launch_completion(parent);
+ return;
+ }
+
+ /* FIXME: shouldn't this say either "recovery" or "repair"? */
+ vdo_log_info("Rebuild complete");
+
+ /*
+ * Now that we've freed the repair completion and its vast array of journal entries, we
+ * can allocate refcounts.
+ */
+ vdo_continue_completion(parent, vdo_allocate_reference_counters(vdo->depot));
+}
+
+/**
+ * abort_repair() - Handle a repair error.
+ * @completion: The repair completion.
+ */
+static void abort_repair(struct vdo_completion *completion)
+{
+ struct vdo_completion *parent = completion->parent;
+ int result = completion->result;
+ struct repair_completion *repair = as_repair_completion(completion);
+
+ if (vdo_state_requires_read_only_rebuild(completion->vdo->load_state))
+ vdo_log_info("Read-only rebuild aborted");
+ else
+ vdo_log_warning("Recovery aborted");
+
+ free_repair_completion(vdo_forget(repair));
+ vdo_continue_completion(parent, result);
+}
+
+/**
+ * abort_on_error() - Abort a repair if there is an error.
+ * @result: The result to check.
+ * @repair: The repair completion.
+ *
+ * Return: true if the result was an error.
+ */
+static bool __must_check abort_on_error(int result, struct repair_completion *repair)
+{
+ if (result == VDO_SUCCESS)
+ return false;
+
+ vdo_fail_completion(&repair->completion, result);
+ return true;
+}
+
+/**
+ * drain_slab_depot() - Flush out all dirty refcounts blocks once rebuilt or recovered.
+ * @completion: The repair completion.
+ */
+static void drain_slab_depot(struct vdo_completion *completion)
+{
+ struct vdo *vdo = completion->vdo;
+ struct repair_completion *repair = as_repair_completion(completion);
+ const struct admin_state_code *operation;
+
+ vdo_assert_on_admin_thread(vdo, __func__);
+
+ prepare_repair_completion(repair, finish_repair, VDO_ZONE_TYPE_ADMIN);
+ if (vdo_state_requires_read_only_rebuild(vdo->load_state)) {
+ vdo_log_info("Saving rebuilt state");
+ operation = VDO_ADMIN_STATE_REBUILDING;
+ } else {
+ vdo_log_info("Replayed %zu journal entries into slab journals",
+ repair->entries_added_to_slab_journals);
+ operation = VDO_ADMIN_STATE_RECOVERING;
+ }
+
+ vdo_drain_slab_depot(vdo->depot, operation, completion);
+}
+
+/**
+ * flush_block_map_updates() - Flush the block map now that all the reference counts are rebuilt.
+ * @completion: The repair completion.
+ *
+ * This callback is registered in finish_if_done().
+ */
+static void flush_block_map_updates(struct vdo_completion *completion)
+{
+ vdo_assert_on_admin_thread(completion->vdo, __func__);
+
+ vdo_log_info("Flushing block map changes");
+ prepare_repair_completion(as_repair_completion(completion), drain_slab_depot,
+ VDO_ZONE_TYPE_ADMIN);
+ vdo_drain_block_map(completion->vdo->block_map, VDO_ADMIN_STATE_RECOVERING,
+ completion);
+}
+
+static bool fetch_page(struct repair_completion *repair,
+ struct vdo_completion *completion);
+
+/**
+ * handle_page_load_error() - Handle an error loading a page.
+ * @completion: The vdo_page_completion.
+ */
+static void handle_page_load_error(struct vdo_completion *completion)
+{
+ struct repair_completion *repair = completion->parent;
+
+ repair->outstanding--;
+ vdo_set_completion_result(&repair->completion, completion->result);
+ vdo_release_page_completion(completion);
+ fetch_page(repair, completion);
+}
+
+/**
+ * unmap_entry() - Unmap an invalid entry and indicate that its page must be written out.
+ * @page: The page containing the entries
+ * @completion: The page_completion for writing the page
+ * @slot: The slot to unmap
+ */
+static void unmap_entry(struct block_map_page *page, struct vdo_completion *completion,
+ slot_number_t slot)
+{
+ page->entries[slot] = UNMAPPED_BLOCK_MAP_ENTRY;
+ vdo_request_page_write(completion);
+}
+
+/**
+ * remove_out_of_bounds_entries() - Unmap entries which are outside the logical space.
+ * @page: The page containing the entries
+ * @completion: The page_completion for writing the page
+ * @start: The first slot to check
+ */
+static void remove_out_of_bounds_entries(struct block_map_page *page,
+ struct vdo_completion *completion,
+ slot_number_t start)
+{
+ slot_number_t slot;
+
+ for (slot = start; slot < VDO_BLOCK_MAP_ENTRIES_PER_PAGE; slot++) {
+ struct data_location mapping = vdo_unpack_block_map_entry(&page->entries[slot]);
+
+ if (vdo_is_mapped_location(&mapping))
+ unmap_entry(page, completion, slot);
+ }
+}
+
+/**
+ * process_slot() - Update the reference counts for a single entry.
+ * @page: The page containing the entries
+ * @completion: The page_completion for writing the page
+ * @slot: The slot to check
+ *
+ * Return: true if the entry was a valid mapping
+ */
+static bool process_slot(struct block_map_page *page, struct vdo_completion *completion,
+ slot_number_t slot)
+{
+ struct slab_depot *depot = completion->vdo->depot;
+ int result;
+ struct data_location mapping = vdo_unpack_block_map_entry(&page->entries[slot]);
+
+ if (!vdo_is_valid_location(&mapping)) {
+ /* This entry is invalid, so remove it from the page. */
+ unmap_entry(page, completion, slot);
+ return false;
+ }
+
+ if (!vdo_is_mapped_location(&mapping))
+ return false;
+
+
+ if (mapping.pbn == VDO_ZERO_BLOCK)
+ return true;
+
+ if (!vdo_is_physical_data_block(depot, mapping.pbn)) {
+ /*
+ * This is a nonsense mapping. Remove it from the map so we're at least consistent
+ * and mark the page dirty.
+ */
+ unmap_entry(page, completion, slot);
+ return false;
+ }
+
+ result = vdo_adjust_reference_count_for_rebuild(depot, mapping.pbn,
+ VDO_JOURNAL_DATA_REMAPPING);
+ if (result == VDO_SUCCESS)
+ return true;
+
+ vdo_log_error_strerror(result,
+ "Could not adjust reference count for PBN %llu, slot %u mapped to PBN %llu",
+ (unsigned long long) vdo_get_block_map_page_pbn(page),
+ slot, (unsigned long long) mapping.pbn);
+ unmap_entry(page, completion, slot);
+ return false;
+}
+
+/**
+ * rebuild_reference_counts_from_page() - Rebuild reference counts from a block map page.
+ * @repair: The repair completion.
+ * @completion: The page completion holding the page.
+ */
+static void rebuild_reference_counts_from_page(struct repair_completion *repair,
+ struct vdo_completion *completion)
+{
+ slot_number_t slot, last_slot;
+ struct block_map_page *page;
+ int result;
+
+ result = vdo_get_cached_page(completion, &page);
+ if (result != VDO_SUCCESS) {
+ vdo_set_completion_result(&repair->completion, result);
+ return;
+ }
+
+ if (!page->header.initialized)
+ return;
+
+ /* Remove any bogus entries which exist beyond the end of the logical space. */
+ if (vdo_get_block_map_page_pbn(page) == repair->last_slot.pbn) {
+ last_slot = repair->last_slot.slot;
+ remove_out_of_bounds_entries(page, completion, last_slot);
+ } else {
+ last_slot = VDO_BLOCK_MAP_ENTRIES_PER_PAGE;
+ }
+
+ /* Inform the slab depot of all entries on this page. */
+ for (slot = 0; slot < last_slot; slot++) {
+ if (process_slot(page, completion, slot))
+ repair->logical_blocks_used++;
+ }
+}
+
+/**
+ * page_loaded() - Process a page which has just been loaded.
+ * @completion: The vdo_page_completion for the fetched page.
+ *
+ * This callback is registered by fetch_page().
+ */
+static void page_loaded(struct vdo_completion *completion)
+{
+ struct repair_completion *repair = completion->parent;
+
+ repair->outstanding--;
+ rebuild_reference_counts_from_page(repair, completion);
+ vdo_release_page_completion(completion);
+
+ /* Advance progress to the next page, and fetch the next page we haven't yet requested. */
+ fetch_page(repair, completion);
+}
+
+static physical_block_number_t get_pbn_to_fetch(struct repair_completion *repair,
+ struct block_map *block_map)
+{
+ physical_block_number_t pbn = VDO_ZERO_BLOCK;
+
+ if (repair->completion.result != VDO_SUCCESS)
+ return VDO_ZERO_BLOCK;
+
+ while ((pbn == VDO_ZERO_BLOCK) && (repair->page_to_fetch < repair->leaf_pages))
+ pbn = vdo_find_block_map_page_pbn(block_map, repair->page_to_fetch++);
+
+ if (vdo_is_physical_data_block(repair->completion.vdo->depot, pbn))
+ return pbn;
+
+ vdo_set_completion_result(&repair->completion, VDO_BAD_MAPPING);
+ return VDO_ZERO_BLOCK;
+}
+
+/**
+ * fetch_page() - Fetch a page from the block map.
+ * @repair: The repair_completion.
+ * @completion: The page completion to use.
+ *
+ * Return: true if the rebuild is complete.
+ */
+static bool fetch_page(struct repair_completion *repair,
+ struct vdo_completion *completion)
+{
+ struct vdo_page_completion *page_completion = (struct vdo_page_completion *) completion;
+ struct block_map *block_map = repair->completion.vdo->block_map;
+ physical_block_number_t pbn = get_pbn_to_fetch(repair, block_map);
+
+ if (pbn != VDO_ZERO_BLOCK) {
+ repair->outstanding++;
+ /*
+ * We must set the requeue flag here to ensure that we don't blow the stack if all
+ * the requested pages are already in the cache or get load errors.
+ */
+ vdo_get_page(page_completion, &block_map->zones[0], pbn, true, repair,
+ page_loaded, handle_page_load_error, true);
+ }
+
+ if (repair->outstanding > 0)
+ return false;
+
+ launch_repair_completion(repair, flush_block_map_updates, VDO_ZONE_TYPE_ADMIN);
+ return true;
+}
+
+/**
+ * rebuild_from_leaves() - Rebuild reference counts from the leaf block map pages.
+ * @completion: The repair completion.
+ *
+ * Rebuilds reference counts from the leaf block map pages now that reference counts have been
+ * rebuilt from the interior tree pages (which have been loaded in the process). This callback is
+ * registered in rebuild_reference_counts().
+ *
+ * Resets repair->logical_blocks_used, computes the number of leaf pages and the location of the
+ * last valid slot from the block map's entry_count, and then calls fetch_page() once for each of
+ * the repair's page completions. A computed last slot of 0 means entry_count is an exact multiple
+ * of VDO_BLOCK_MAP_ENTRIES_PER_PAGE, so the last page is treated as full.
+ */
+static void rebuild_from_leaves(struct vdo_completion *completion)
+{
+ page_count_t i;
+ struct repair_completion *repair = as_repair_completion(completion);
+ struct block_map *map = completion->vdo->block_map;
+
+ repair->logical_blocks_used = 0;
+
+ /*
+ * The PBN calculation doesn't work until the tree pages have been loaded, so we can't set
+ * this value at the start of repair.
+ */
+ repair->leaf_pages = vdo_compute_block_map_page_count(map->entry_count);
+ repair->last_slot = (struct block_map_slot) {
+ .slot = map->entry_count % VDO_BLOCK_MAP_ENTRIES_PER_PAGE,
+ .pbn = vdo_find_block_map_page_pbn(map, repair->leaf_pages - 1),
+ };
+ if (repair->last_slot.slot == 0)
+ repair->last_slot.slot = VDO_BLOCK_MAP_ENTRIES_PER_PAGE;
+
+ for (i = 0; i < repair->page_count; i++) {
+ if (fetch_page(repair, &repair->page_completions[i].completion)) {
+ /*
+ * The rebuild has already moved on, so it isn't safe nor is there a need
+ * to launch any more fetches.
+ */
+ return;
+ }
+ }
+}
+
+/**
+ * process_entry() - Account for one block map tree page found while traversing the forest.
+ * @pbn: A pbn which holds a block map tree page.
+ * @completion: The parent completion of the traversal.
+ *
+ * Implements vdo_entry_callback_fn. The PBN must be non-zero and a physical data block; its
+ * reference count is adjusted as a VDO_JOURNAL_BLOCK_MAP_REMAPPING and, on success, the repair's
+ * count of block map data blocks is incremented.
+ *
+ * Return: VDO_SUCCESS or an error.
+ */
+static int process_entry(physical_block_number_t pbn, struct vdo_completion *completion)
+{
+	struct repair_completion *repair = as_repair_completion(completion);
+	struct slab_depot *depot = completion->vdo->depot;
+	bool in_range = ((pbn != VDO_ZERO_BLOCK) && vdo_is_physical_data_block(depot, pbn));
+	int result;
+
+	if (!in_range) {
+		return vdo_log_error_strerror(VDO_BAD_CONFIGURATION,
+					      "PBN %llu out of range",
+					      (unsigned long long) pbn);
+	}
+
+	result = vdo_adjust_reference_count_for_rebuild(depot, pbn,
+							VDO_JOURNAL_BLOCK_MAP_REMAPPING);
+	if (result == VDO_SUCCESS) {
+		repair->block_map_data_blocks++;
+		return VDO_SUCCESS;
+	}
+
+	return vdo_log_error_strerror(result,
+				      "Could not adjust reference count for block map tree PBN %llu",
+				      (unsigned long long) pbn);
+}
+
+/*
+ * Start rebuilding the reference counts: allocate the reference counters, invalidate the page
+ * cache of block map zone 0, register rebuild_from_leaves() as the repair completion's next
+ * callback, and traverse the block map forest with process_entry(). Either of the first two
+ * steps failing aborts the repair via abort_on_error().
+ */
+static void rebuild_reference_counts(struct vdo_completion *completion)
+{
+	struct repair_completion *repair = as_repair_completion(completion);
+	struct vdo *vdo = completion->vdo;
+	struct block_map *map = vdo->block_map;
+
+	/* We must allocate ref_counts before we can rebuild them. */
+	if (abort_on_error(vdo_allocate_reference_counters(vdo->depot), repair))
+		return;
+
+	/*
+	 * Completion chaining from page cache hits can lead to stack overflow during the rebuild,
+	 * so clear out the cache before this rebuild phase.
+	 */
+	if (abort_on_error(vdo_invalidate_page_cache(&map->zones[0].page_cache), repair))
+		return;
+
+	prepare_repair_completion(repair, rebuild_from_leaves, VDO_ZONE_TYPE_LOGICAL);
+	vdo_traverse_forest(map, process_entry, completion);
+}
+
+/**
+ * increment_recovery_point() - Move the given recovery point forward by one entry.
+ * @point: The recovery point to advance.
+ *
+ * Entries roll over into the next sector after RECOVERY_JOURNAL_ENTRIES_PER_SECTOR entries, and
+ * sectors roll over into sector 1 of the next sequence number after the last sector of a block.
+ */
+static void increment_recovery_point(struct recovery_point *point)
+{
+	point->entry_count++;
+	if (point->entry_count < RECOVERY_JOURNAL_ENTRIES_PER_SECTOR)
+		return;
+
+	/* This sector is exhausted, so restart the entry count in the next one. */
+	point->entry_count = 0;
+	if (point->sector_count >= (VDO_SECTORS_PER_BLOCK - 1)) {
+		/* That was the last sector of the block; continue at sector 1 of the next block. */
+		point->sequence_number++;
+		point->sector_count = 1;
+	} else {
+		point->sector_count++;
+	}
+}
+
+/**
+ * advance_points() - Advance the current recovery and journal points.
+ * @repair: The repair_completion whose points are to be advanced.
+ * @entries_per_block: The number of entries in a recovery journal block.
+ *
+ * Each entry is visited twice. The first call for an entry only records that its increment has
+ * been applied; the second call moves both points to the next entry and clears that flag.
+ */
+static void advance_points(struct repair_completion *repair,
+			   journal_entry_count_t entries_per_block)
+{
+	struct recovery_point *point = &repair->next_recovery_point;
+
+	if (point->increment_applied) {
+		increment_recovery_point(point);
+		vdo_advance_journal_point(&repair->next_journal_point, entries_per_block);
+		point->increment_applied = false;
+	} else {
+		point->increment_applied = true;
+	}
+}
+
+/**
+ * before_recovery_point() - Check whether the first point precedes the second point.
+ * @first: The first recovery point.
+ * @second: The second recovery point.
+ *
+ * Points are ordered by sequence number, then by sector count, then by entry count.
+ *
+ * Return: true if the first point precedes the second point.
+ */
+static bool __must_check before_recovery_point(const struct recovery_point *first,
+					       const struct recovery_point *second)
+{
+	if (first->sequence_number != second->sequence_number)
+		return (first->sequence_number < second->sequence_number);
+
+	if (first->sector_count != second->sector_count)
+		return (first->sector_count < second->sector_count);
+
+	return (first->entry_count < second->entry_count);
+}
+
+/*
+ * Locate a sector within the in-memory copy of the recovery journal: the block for the given
+ * sequence number is found with vdo_get_recovery_journal_block_number(), and the sector lies
+ * sector_number sectors into that block.
+ */
+static struct packed_journal_sector * __must_check get_sector(struct recovery_journal *journal,
+							      char *journal_data,
+							      sequence_number_t sequence,
+							      u8 sector_number)
+{
+	physical_block_number_t block =
+		vdo_get_recovery_journal_block_number(journal, sequence);
+	off_t offset = ((block * VDO_BLOCK_SIZE) + (VDO_SECTOR_SIZE * sector_number));
+
+	return (struct packed_journal_sector *) &journal_data[offset];
+}
+
+/**
+ * get_entry() - Unpack the recovery journal entry associated with the given recovery point.
+ * @repair: The repair completion.
+ * @point: The recovery point.
+ *
+ * The point's sequence number and sector count select the sector within repair->journal_data,
+ * and its entry count indexes into that sector's entries.
+ *
+ * Return: The unpacked contents of the matching recovery journal entry.
+ */
+static struct recovery_journal_entry get_entry(const struct repair_completion *repair,
+					       const struct recovery_point *point)
+{
+	struct recovery_journal *journal = repair->completion.vdo->recovery_journal;
+	struct packed_journal_sector *sector = get_sector(journal, repair->journal_data,
+							  point->sequence_number,
+							  point->sector_count);
+
+	return vdo_unpack_recovery_journal_entry(&sector->entries[point->entry_count]);
+}
+
+/**
+ * validate_recovery_journal_entry() - Validate a recovery journal entry.
+ * @vdo: The vdo.
+ * @entry: The entry to validate.
+ *
+ * Checks that the entry's slot and both of its locations are within bounds and, for a
+ * VDO_JOURNAL_BLOCK_MAP_REMAPPING, that the entry has the shape of a tree page mapping. Each
+ * failure is logged.
+ *
+ * Return: VDO_SUCCESS or an error.
+ */
+static int validate_recovery_journal_entry(const struct vdo *vdo,
+					   const struct recovery_journal_entry *entry)
+{
+	bool in_bounds = ((entry->slot.pbn < vdo->states.vdo.config.physical_blocks) &&
+			  (entry->slot.slot < VDO_BLOCK_MAP_ENTRIES_PER_PAGE) &&
+			  vdo_is_valid_location(&entry->mapping) &&
+			  vdo_is_valid_location(&entry->unmapping) &&
+			  vdo_is_physical_data_block(vdo->depot, entry->mapping.pbn) &&
+			  vdo_is_physical_data_block(vdo->depot, entry->unmapping.pbn));
+
+	if (!in_bounds) {
+		return vdo_log_error_strerror(VDO_CORRUPT_JOURNAL,
+					      "Invalid entry: %s (%llu, %u) from %llu to %llu is not within bounds",
+					      vdo_get_journal_operation_name(entry->operation),
+					      (unsigned long long) entry->slot.pbn,
+					      entry->slot.slot,
+					      (unsigned long long) entry->unmapping.pbn,
+					      (unsigned long long) entry->mapping.pbn);
+	}
+
+	/* Only block map remappings have further constraints. */
+	if (entry->operation != VDO_JOURNAL_BLOCK_MAP_REMAPPING)
+		return VDO_SUCCESS;
+
+	/* A tree mapping is an uncompressed, non-zero block which replaces an unmapped zero block. */
+	if (!vdo_is_state_compressed(entry->mapping.state) &&
+	    (entry->mapping.pbn != VDO_ZERO_BLOCK) &&
+	    (entry->unmapping.state == VDO_MAPPING_STATE_UNMAPPED) &&
+	    (entry->unmapping.pbn == VDO_ZERO_BLOCK))
+		return VDO_SUCCESS;
+
+	return vdo_log_error_strerror(VDO_CORRUPT_JOURNAL,
+				      "Invalid entry: %s (%llu, %u) from %llu to %llu is not a valid tree mapping",
+				      vdo_get_journal_operation_name(entry->operation),
+				      (unsigned long long) entry->slot.pbn,
+				      entry->slot.slot,
+				      (unsigned long long) entry->unmapping.pbn,
+				      (unsigned long long) entry->mapping.pbn);
+}
+
+/**
+ * add_slab_journal_entries() - Replay recovery journal entries into the slab journals of the
+ * allocator currently being recovered.
+ * @completion: The allocator completion.
+ *
+ * Waits for slab journal tailblock space when necessary. This method is its own callback.
+ *
+ * Because advance_points() is the loop step, each recovery journal entry is visited twice: first
+ * for its increment (using entry.mapping.pbn) and then for its decrement (using
+ * entry.unmapping.pbn). The entry is validated only on the increment visit; a validation failure
+ * puts the vdo into read-only mode and fails the completion. Visits whose PBN is VDO_ZERO_BLOCK,
+ * or whose slab belongs to a different allocator, are skipped.
+ *
+ * If vdo_attempt_replay_into_slab() returns false, this function returns without advancing the
+ * points, relying on the completion (already prepared to call back into this function) to
+ * resume from the same visit. Once every visit before repair->tail_recovery_point has been
+ * handled, vdo_notify_slab_journals_are_recovered() is called.
+ */
+static void add_slab_journal_entries(struct vdo_completion *completion)
+{
+ struct recovery_point *recovery_point;
+ struct repair_completion *repair = completion->parent;
+ struct vdo *vdo = completion->vdo;
+ struct recovery_journal *journal = vdo->recovery_journal;
+ struct block_allocator *allocator = vdo_as_block_allocator(completion);
+
+ /* Get ready in case we need to enqueue again. */
+ vdo_prepare_completion(completion, add_slab_journal_entries,
+ vdo_notify_slab_journals_are_recovered,
+ completion->callback_thread_id, repair);
+ for (recovery_point = &repair->next_recovery_point;
+ before_recovery_point(recovery_point, &repair->tail_recovery_point);
+ advance_points(repair, journal->entries_per_block)) {
+ int result;
+ physical_block_number_t pbn;
+ struct vdo_slab *slab;
+ struct recovery_journal_entry entry = get_entry(repair, recovery_point);
+ bool increment = !repair->next_recovery_point.increment_applied;
+
+ if (increment) {
+ result = validate_recovery_journal_entry(vdo, &entry);
+ if (result != VDO_SUCCESS) {
+ vdo_enter_read_only_mode(vdo, result);
+ vdo_fail_completion(completion, result);
+ return;
+ }
+
+ pbn = entry.mapping.pbn;
+ } else {
+ pbn = entry.unmapping.pbn;
+ }
+
+ if (pbn == VDO_ZERO_BLOCK)
+ continue;
+
+ slab = vdo_get_slab(vdo->depot, pbn);
+ if (slab->allocator != allocator)
+ continue;
+
+ if (!vdo_attempt_replay_into_slab(slab, pbn, entry.operation, increment,
+ &repair->next_journal_point,
+ completion))
+ return;
+
+ repair->entries_added_to_slab_journals++;
+ }
+
+ vdo_notify_slab_journals_are_recovered(completion);
+}
+
+/**
+ * vdo_replay_into_slab_journals() - Replay recovery journal entries in the slab journals of slabs
+ * owned by a given block_allocator.
+ * @allocator: The allocator whose slab journals are to be recovered.
+ * @context: The slab depot load context supplied by a recovery when it loads the depot.
+ *
+ * Asserts that it is running on the physical zone thread for the allocator's zone. If the repair
+ * has no entries, the used-block counts are copied from the recovery journal and the allocator's
+ * completion is notified immediately. Otherwise the next recovery point and journal point are
+ * reset to the start of the block at repair->slab_journal_head (sector 1, entry 0), the repair
+ * becomes the parent of the allocator's completion, and replay starts in
+ * add_slab_journal_entries().
+ */
+void vdo_replay_into_slab_journals(struct block_allocator *allocator, void *context)
+{
+ struct vdo_completion *completion = &allocator->completion;
+ struct repair_completion *repair = context;
+ struct vdo *vdo = completion->vdo;
+
+ vdo_assert_on_physical_zone_thread(vdo, allocator->zone_number, __func__);
+ if (repair->entry_count == 0) {
+ /* there's nothing to replay */
+ repair->logical_blocks_used = vdo->recovery_journal->logical_blocks_used;
+ repair->block_map_data_blocks = vdo->recovery_journal->block_map_data_blocks;
+ vdo_notify_slab_journals_are_recovered(completion);
+ return;
+ }
+
+ repair->next_recovery_point = (struct recovery_point) {
+ .sequence_number = repair->slab_journal_head,
+ .sector_count = 1,
+ .entry_count = 0,
+ };
+
+ repair->next_journal_point = (struct journal_point) {
+ .sequence_number = repair->slab_journal_head,
+ .entry_count = 0,
+ };
+
+ vdo_log_info("Replaying entries into slab journals for zone %u",
+ allocator->zone_number);
+ completion->parent = repair;
+ add_slab_journal_entries(completion);
+}
+
+/*
+ * Load the slab depot, on the admin thread. For a normal recovery the depot is loaded for
+ * recovery and the repair continues in drain_slab_depot(); when the load state requires a
+ * read-only rebuild, the depot is loaded for rebuild and the repair continues in
+ * rebuild_reference_counts().
+ */
+static void load_slab_depot(struct vdo_completion *completion)
+{
+	struct repair_completion *repair = as_repair_completion(completion);
+	const struct admin_state_code *operation;
+
+	vdo_assert_on_admin_thread(completion->vdo, __func__);
+
+	if (!vdo_state_requires_read_only_rebuild(completion->vdo->load_state)) {
+		prepare_repair_completion(repair, drain_slab_depot, VDO_ZONE_TYPE_ADMIN);
+		operation = VDO_ADMIN_STATE_LOADING_FOR_RECOVERY;
+	} else {
+		prepare_repair_completion(repair, rebuild_reference_counts,
+					  VDO_ZONE_TYPE_LOGICAL);
+		operation = VDO_ADMIN_STATE_LOADING_FOR_REBUILD;
+	}
+
+	vdo_load_slab_depot(completion->vdo->depot, operation, completion, repair);
+}
+
+/*
+ * Drain the block map, on the admin thread, so that the changes made to it are flushed. The
+ * drain operation depends on whether the load state requires a read-only rebuild, and the repair
+ * continues in load_slab_depot().
+ */
+static void flush_block_map(struct vdo_completion *completion)
+{
+	struct repair_completion *repair = as_repair_completion(completion);
+	struct vdo *vdo = completion->vdo;
+	const struct admin_state_code *operation;
+
+	vdo_assert_on_admin_thread(vdo, __func__);
+
+	vdo_log_info("Flushing block map changes");
+	prepare_repair_completion(repair, load_slab_depot, VDO_ZONE_TYPE_ADMIN);
+	if (vdo_state_requires_read_only_rebuild(vdo->load_state))
+		operation = VDO_ADMIN_STATE_REBUILDING;
+	else
+		operation = VDO_ADMIN_STATE_RECOVERING;
+
+	vdo_drain_block_map(vdo->block_map, operation, completion);
+}
+
+/*
+ * Move the repair onward if the block map recovery can make no further progress.
+ *
+ * Nothing happens while pages are still being launched or fetches are outstanding. If the repair
+ * has recorded an error, every ready page completion is released and the repair completion is
+ * launched. Otherwise, once no entries remain to be applied, flush_block_map() is launched on the
+ * admin zone.
+ *
+ * Return: true if the repair completion was launched, false if recovery is still in progress.
+ */
+static bool finish_if_done(struct repair_completion *repair)
+{
+	page_count_t i;
+
+	/* Pages are still being launched or there is still work to do */
+	if (repair->launching || (repair->outstanding > 0))
+		return false;
+
+	if (repair->completion.result == VDO_SUCCESS) {
+		/* The current entry is still within the array, so entries remain to be applied. */
+		if (repair->current_entry >= repair->entries)
+			return false;
+
+		launch_repair_completion(repair, flush_block_map, VDO_ZONE_TYPE_ADMIN);
+		return true;
+	}
+
+	for (i = 0; i < repair->page_count; i++) {
+		struct vdo_page_completion *page_completion = &repair->page_completions[i];
+
+		if (page_completion->ready)
+			vdo_release_page_completion(&page_completion->completion);
+	}
+
+	vdo_launch_completion(&repair->completion);
+	return true;
+}
+
+/*
+ * Record an error on the repair completion and, if nothing is still launching or outstanding,
+ * let finish_if_done() clean up and launch the completion.
+ */
+static void abort_block_map_recovery(struct repair_completion *repair, int result)
+{
+	vdo_set_completion_result(&repair->completion, result);
+	/* Whether or not the repair finished now makes no difference to the caller. */
+	(void) finish_if_done(repair);
+}
+
+/**
+ * find_entry_starting_next_page() - Find the first journal entry after a given entry which is not
+ * on the same block map page.
+ * @repair: The repair completion which owns the array of entries.
+ * @current_entry: The entry to search from.
+ * @needs_sort: Whether sorting is needed to proceed.
+ *
+ * The entries are walked from high addresses toward repair->entries, so "after" here means at a
+ * lower address. When @needs_sort is set, sort_next_heap_element() is called once for each entry
+ * stepped over.
+ *
+ * Return: Pointer to the first later journal entry on a different block map page, or a pointer to
+ * just before the journal entries if no subsequent entry is on a different block map page.
+ */
+static struct numbered_block_mapping *
+find_entry_starting_next_page(struct repair_completion *repair,
+			      struct numbered_block_mapping *current_entry, bool needs_sort)
+{
+	/* Hold the page in the PBN type itself so the comparison below can never truncate. */
+	physical_block_number_t current_page;
+
+	/* If current_entry is invalid, return immediately. */
+	if (current_entry < repair->entries)
+		return current_entry;
+
+	current_page = current_entry->block_map_slot.pbn;
+
+	/* Decrement current_entry until it's out of bounds or on a different page. */
+	while ((current_entry >= repair->entries) &&
+	       (current_entry->block_map_slot.pbn == current_page)) {
+		if (needs_sort) {
+			struct numbered_block_mapping *just_sorted_entry =
+				sort_next_heap_element(repair);
+			VDO_ASSERT_LOG_ONLY(just_sorted_entry < current_entry,
+					    "heap is returning elements in an unexpected order");
+		}
+
+		current_entry--;
+	}
+
+	return current_entry;
+}
+
+/*
+ * Apply the journal entries in the range [starting_entry, ending_entry) to a block map page. The
+ * range is walked downward in memory, from starting_entry toward ending_entry, and each entry
+ * overwrites the page slot it names.
+ */
+static void apply_journal_entries_to_page(struct block_map_page *page,
+					  struct numbered_block_mapping *starting_entry,
+					  struct numbered_block_mapping *ending_entry)
+{
+	struct numbered_block_mapping *entry;
+
+	for (entry = starting_entry; entry != ending_entry; entry--)
+		page->entries[entry->block_map_slot.slot] = entry->block_map_entry;
+}
+
+static void recover_ready_pages(struct repair_completion *repair,
+ struct vdo_completion *completion);
+
+/*
+ * Page-load callback registered by fetch_block_map_page(). Counts the fetch as finished and,
+ * unless pages are still being launched, processes whichever pages are ready.
+ */
+static void block_map_page_loaded(struct vdo_completion *completion)
+{
+	struct repair_completion *repair = as_repair_completion(completion->parent);
+
+	repair->outstanding--;
+	if (repair->launching)
+		return;
+
+	recover_ready_pages(repair, completion);
+}
+
+/*
+ * Page-load error handler registered by fetch_block_map_page(). Counts the fetch as finished and
+ * aborts the block map recovery with the page completion's result.
+ */
+static void handle_block_map_page_load_error(struct vdo_completion *completion)
+{
+	struct repair_completion *repair = as_repair_completion(completion->parent);
+	int result = completion->result;
+
+	repair->outstanding--;
+	abort_block_map_recovery(repair, result);
+}
+
+/*
+ * Request the block map page for the next unfetched journal entry, using the given page
+ * completion, and advance repair->current_unfetched_entry past every entry on that page (sorting
+ * entries out of the heap along the way). Does nothing if no unfetched entries remain.
+ */
+static void fetch_block_map_page(struct repair_completion *repair,
+				 struct vdo_completion *completion)
+{
+	struct vdo_page_completion *page_completion = (struct vdo_page_completion *) completion;
+	struct numbered_block_mapping *unfetched = repair->current_unfetched_entry;
+	physical_block_number_t pbn;
+
+	/* Nothing left to fetch. */
+	if (unfetched < repair->entries)
+		return;
+
+	/* Fetch the next page we haven't yet requested. */
+	pbn = unfetched->block_map_slot.pbn;
+	repair->current_unfetched_entry = find_entry_starting_next_page(repair, unfetched, true);
+	repair->outstanding++;
+	vdo_get_page(page_completion, &repair->completion.vdo->block_map->zones[0], pbn, true,
+		     &repair->completion, block_map_page_loaded,
+		     handle_block_map_page_load_error, false);
+}
+
+/*
+ * Get the page completion which follows the given one in repair->page_completions, treating the
+ * array as a ring so that the last completion is followed by the first.
+ */
+static struct vdo_page_completion *get_next_page_completion(struct repair_completion *repair,
+							    struct vdo_page_completion *completion)
+{
+	struct vdo_page_completion *successor = completion + 1;
+	struct vdo_page_completion *end = &repair->page_completions[repair->page_count];
+
+	return ((successor == end) ? &repair->page_completions[0] : successor);
+}
+
+/*
+ * Apply journal entries to each page completion which is ready, starting with the given one and
+ * walking repair->page_completions as a ring.
+ *
+ * Nothing is applied if finish_if_done() launches the repair onward, or if the given completion
+ * is not for the page (repair->pbn) whose entries are next in line. For each ready page, the
+ * entries for that page are written into it, a write of the page is requested, and the page
+ * completion is released and then reused to fetch another page. Processing stops at the first
+ * completion which is not ready, on any error, or when finish_if_done() reports completion.
+ */
+static void recover_ready_pages(struct repair_completion *repair,
+ struct vdo_completion *completion)
+{
+ struct vdo_page_completion *page_completion = (struct vdo_page_completion *) completion;
+
+ if (finish_if_done(repair))
+ return;
+
+ if (repair->pbn != page_completion->pbn)
+ return;
+
+ while (page_completion->ready) {
+ struct numbered_block_mapping *start_of_next_page;
+ struct block_map_page *page;
+ int result;
+
+ result = vdo_get_cached_page(completion, &page);
+ if (result != VDO_SUCCESS) {
+ abort_block_map_recovery(repair, result);
+ return;
+ }
+
+ start_of_next_page =
+ find_entry_starting_next_page(repair, repair->current_entry,
+ false);
+ apply_journal_entries_to_page(page, repair->current_entry,
+ start_of_next_page);
+ repair->current_entry = start_of_next_page;
+ vdo_request_page_write(completion);
+ vdo_release_page_completion(completion);
+
+ if (finish_if_done(repair))
+ return;
+
+ repair->pbn = repair->current_entry->block_map_slot.pbn;
+ fetch_block_map_page(repair, completion);
+ page_completion = get_next_page_completion(repair, page_completion);
+ completion = &page_completion->completion;
+ }
+}
+
+/*
+ * Replay the block map entries extracted from the recovery journal into the block map. Asserts
+ * that it is running on the thread of logical zone 0.
+ *
+ * If there are no entries, the journal data is freed and load_slab_depot() is launched on the
+ * admin zone. Otherwise the entries are arranged as a heap, the first sorted entry is pulled into
+ * the last array position, and up to repair->page_count page fetches are issued while
+ * repair->launching holds off page processing. Any pages which are already ready are then
+ * processed by recover_ready_pages().
+ */
+static void recover_block_map(struct vdo_completion *completion)
+{
+ struct repair_completion *repair = as_repair_completion(completion);
+ struct vdo *vdo = completion->vdo;
+ struct numbered_block_mapping *first_sorted_entry;
+ page_count_t i;
+
+ vdo_assert_on_logical_zone_thread(vdo, 0, __func__);
+
+ /* Suppress block map errors. */
+ vdo->block_map->zones[0].page_cache.rebuilding =
+ vdo_state_requires_read_only_rebuild(vdo->load_state);
+
+ if (repair->block_map_entry_count == 0) {
+ vdo_log_info("Replaying 0 recovery entries into block map");
+ vdo_free(vdo_forget(repair->journal_data));
+ launch_repair_completion(repair, load_slab_depot, VDO_ZONE_TYPE_ADMIN);
+ return;
+ }
+
+ /*
+ * Organize the journal entries into a binary heap so we can iterate over them in sorted
+ * order incrementally, avoiding an expensive sort call.
+ */
+ repair->replay_heap = (struct min_heap) {
+ .data = repair->entries,
+ .nr = repair->block_map_entry_count,
+ .size = repair->block_map_entry_count,
+ };
+ min_heapify_all(&repair->replay_heap, &repair_min_heap);
+
+ vdo_log_info("Replaying %zu recovery entries into block map",
+ repair->block_map_entry_count);
+
+ repair->current_entry = &repair->entries[repair->block_map_entry_count - 1];
+ first_sorted_entry = sort_next_heap_element(repair);
+ VDO_ASSERT_LOG_ONLY(first_sorted_entry == repair->current_entry,
+ "heap is returning elements in an unexpected order");
+
+ /* Prevent any page from being processed until all pages have been launched. */
+ repair->launching = true;
+ repair->pbn = repair->current_entry->block_map_slot.pbn;
+ repair->current_unfetched_entry = repair->current_entry;
+ for (i = 0; i < repair->page_count; i++) {
+ if (repair->current_unfetched_entry < repair->entries)
+ break;
+
+ fetch_block_map_page(repair, &repair->page_completions[i].completion);
+ }
+ repair->launching = false;
+
+ /* Process any ready pages. */
+ recover_ready_pages(repair, &repair->page_completions[0].completion);
+}
+
+/**
+ * get_recovery_journal_block_header() - Get the block header for a block at a position in the
+ * journal data and unpack it.
+ * @journal: The recovery journal.
+ * @data: The recovery journal data.
+ * @sequence: The sequence number.
+ *
+ * The header is read from the start of the journal block which
+ * vdo_get_recovery_journal_block_number() assigns to the sequence number.
+ *
+ * Return: The unpacked header.
+ */
+static struct recovery_block_header __must_check
+get_recovery_journal_block_header(struct recovery_journal *journal, char *data,
+				  sequence_number_t sequence)
+{
+	physical_block_number_t block =
+		vdo_get_recovery_journal_block_number(journal, sequence);
+	struct packed_journal_header *packed =
+		(struct packed_journal_header *) (data + (block * VDO_BLOCK_SIZE));
+
+	return vdo_unpack_recovery_block_header(packed);
+}
+
+/**
+ * is_valid_recovery_journal_block() - Determine whether the given header describes a valid block
+ * for the given journal.
+ * @journal: The journal to use.
+ * @header: The unpacked block header to check.
+ * @old_ok: Whether an old format header is valid.
+ *
+ * A block is not valid if it is unformatted, or if it is older than the last successful recovery
+ * or reformat. The header's nonce and recovery count must match the journal's, and its entry
+ * count must not exceed the per-block limit for its format.
+ *
+ * Return: True if the header is valid.
+ */
+static bool __must_check is_valid_recovery_journal_block(const struct recovery_journal *journal,
+							 const struct recovery_block_header *header,
+							 bool old_ok)
+{
+	bool same_journal = ((header->nonce == journal->nonce) &&
+			     (header->recovery_count == journal->recovery_count));
+
+	if (!same_journal)
+		return false;
+
+	if (header->metadata_type == VDO_METADATA_RECOVERY_JOURNAL_2)
+		return (header->entry_count <= journal->entries_per_block);
+
+	/* Anything else is acceptable only as an old format block, and only when allowed. */
+	if (!old_ok)
+		return false;
+
+	if (header->metadata_type != VDO_METADATA_RECOVERY_JOURNAL)
+		return false;
+
+	return (header->entry_count <= RECOVERY_JOURNAL_1_ENTRIES_PER_BLOCK);
+}
+
+/**
+ * is_exact_recovery_journal_block() - Determine whether the given header describes the exact block
+ * indicated.
+ * @journal: The journal to use.
+ * @header: The unpacked block header to check.
+ * @sequence: The expected sequence number.
+ * @type: The expected metadata type.
+ *
+ * The header must have the expected type and sequence number, and must also pass
+ * is_valid_recovery_journal_block() with old format headers permitted.
+ *
+ * Return: True if the block matches.
+ */
+static bool __must_check is_exact_recovery_journal_block(const struct recovery_journal *journal,
+							 const struct recovery_block_header *header,
+							 sequence_number_t sequence,
+							 enum vdo_metadata_type type)
+{
+	if (header->metadata_type != type)
+		return false;
+
+	if (header->sequence_number != sequence)
+		return false;
+
+	return is_valid_recovery_journal_block(journal, header, true);
+}
+
+/**
+ * find_recovery_journal_head_and_tail() - Find the tail and head of the journal.
+ * @repair: The repair completion; its highest_tail, block_map_head, and slab_journal_head are
+ *          updated.
+ *
+ * Examines the header of every block in the journal. Blocks which fail
+ * is_valid_recovery_journal_block(), or whose sequence number does not map to the position at
+ * which they were found, are ignored. repair->highest_tail starts at journal->tail and is raised
+ * to each sequence number found at or beyond it. From the first such block onward, each valid
+ * block's recorded heads raise repair->block_map_head and repair->slab_journal_head if larger.
+ *
+ * Return: True if there were valid journal blocks.
+ */
+static bool find_recovery_journal_head_and_tail(struct repair_completion *repair)
+{
+ struct recovery_journal *journal = repair->completion.vdo->recovery_journal;
+ bool found_entries = false;
+ physical_block_number_t i;
+
+ /*
+ * Ensure that we don't replay old entries since we know the tail recorded in the super
+ * block must be a lower bound. Not doing so can result in extra data loss by setting the
+ * tail too early.
+ */
+ repair->highest_tail = journal->tail;
+ for (i = 0; i < journal->size; i++) {
+ struct recovery_block_header header =
+ get_recovery_journal_block_header(journal, repair->journal_data, i);
+
+ if (!is_valid_recovery_journal_block(journal, &header, true)) {
+ /* This block is old or incorrectly formatted */
+ continue;
+ }
+
+ if (vdo_get_recovery_journal_block_number(journal, header.sequence_number) != i) {
+ /* This block is in the wrong location */
+ continue;
+ }
+
+ if (header.sequence_number >= repair->highest_tail) {
+ found_entries = true;
+ repair->highest_tail = header.sequence_number;
+ }
+
+ if (!found_entries)
+ continue;
+
+ if (header.block_map_head > repair->block_map_head)
+ repair->block_map_head = header.block_map_head;
+
+ if (header.slab_journal_head > repair->slab_journal_head)
+ repair->slab_journal_head = header.slab_journal_head;
+ }
+
+ return found_entries;
+}
+
+/**
+ * unpack_entry() - Unpack a recovery journal entry in either format.
+ * @vdo: The vdo.
+ * @packed: The entry to unpack.
+ * @format: The expected format of the entry.
+ * @entry: The unpacked entry.
+ *
+ * An old format entry is converted to the current representation: its operation is translated to
+ * the corresponding remapping operation and its unmapping is set to the unmapped zero block. Old
+ * format entries with any other operation are rejected.
+ *
+ * Return: true if the entry should be applied.
+ */
+static bool unpack_entry(struct vdo *vdo, char *packed, enum vdo_metadata_type format,
+			 struct recovery_journal_entry *entry)
+{
+	struct packed_recovery_journal_entry_1 *old_entry;
+	physical_block_number_t low32, high4;
+
+	if (format == VDO_METADATA_RECOVERY_JOURNAL_2) {
+		*entry = vdo_unpack_recovery_journal_entry((struct packed_recovery_journal_entry *) packed);
+		return (validate_recovery_journal_entry(vdo, entry) == VDO_SUCCESS);
+	}
+
+	old_entry = (struct packed_recovery_journal_entry_1 *) packed;
+	if (old_entry->operation == VDO_JOURNAL_DATA_INCREMENT)
+		entry->operation = VDO_JOURNAL_DATA_REMAPPING;
+	else if (old_entry->operation == VDO_JOURNAL_BLOCK_MAP_INCREMENT)
+		entry->operation = VDO_JOURNAL_BLOCK_MAP_REMAPPING;
+	else
+		return false;
+
+	/* The old format splits the slot's PBN into a 32-bit low word and a high nibble. */
+	low32 = __le32_to_cpu(old_entry->pbn_low_word);
+	high4 = old_entry->pbn_high_nibble;
+	entry->slot = (struct block_map_slot) {
+		.pbn = ((high4 << 32) | low32),
+		.slot = (old_entry->slot_low | (old_entry->slot_high << 6)),
+	};
+	entry->mapping = vdo_unpack_block_map_entry(&old_entry->block_map_entry);
+	entry->unmapping = (struct data_location) {
+		.pbn = VDO_ZERO_BLOCK,
+		.state = VDO_MAPPING_STATE_UNMAPPED,
+	};
+
+	return (validate_recovery_journal_entry(vdo, entry) == VDO_SUCCESS);
+}
+
+/**
+ * append_sector_entries() - Append an array of recovery journal entries from a journal block
+ *                           sector to the array of numbered mappings in the repair completion,
+ *                           numbering each entry in the order they are appended.
+ * @repair: The repair completion.
+ * @entries: The entries in the sector.
+ * @format: The format of the sector.
+ * @entry_count: The number of entries to append.
+ *
+ * Entries which unpack_entry() rejects are skipped and do not consume a number.
+ */
+static void append_sector_entries(struct repair_completion *repair, char *entries,
+				  enum vdo_metadata_type format,
+				  journal_entry_count_t entry_count)
+{
+	struct vdo *vdo = repair->completion.vdo;
+	journal_entry_count_t i;
+	off_t stride;
+
+	/* The packed size of an entry depends on the journal format. */
+	if (format == VDO_METADATA_RECOVERY_JOURNAL_2)
+		stride = sizeof(struct packed_recovery_journal_entry);
+	else
+		stride = sizeof(struct packed_recovery_journal_entry_1);
+
+	for (i = 0; i < entry_count; i++) {
+		struct recovery_journal_entry entry;
+		char *packed = entries + (i * stride);
+
+		/* When recovering from read-only mode, ignore damaged entries. */
+		if (unpack_entry(vdo, packed, format, &entry)) {
+			repair->entries[repair->block_map_entry_count] =
+				(struct numbered_block_mapping) {
+					.block_map_slot = entry.slot,
+					.block_map_entry = vdo_pack_block_map_entry(entry.mapping.pbn,
+										    entry.mapping.state),
+					.number = repair->block_map_entry_count,
+				};
+			repair->block_map_entry_count++;
+		}
+	}
+}
+
+/*
+ * Get the number of entries which a sector of a journal block can hold. In the current format
+ * every sector holds the same number; in the old format the last sector of a block holds a
+ * different number from the others.
+ */
+static journal_entry_count_t entries_per_sector(enum vdo_metadata_type format,
+						u8 sector_number)
+{
+	if (format != VDO_METADATA_RECOVERY_JOURNAL_2) {
+		if (sector_number == (VDO_SECTORS_PER_BLOCK - 1))
+			return RECOVERY_JOURNAL_1_ENTRIES_IN_LAST_SECTOR;
+
+		return RECOVERY_JOURNAL_1_ENTRIES_PER_SECTOR;
+	}
+
+	return RECOVERY_JOURNAL_ENTRIES_PER_SECTOR;
+}
+
+/*
+ * Append the entries of one recovery journal block to the repair's array of mappings. The block
+ * is skipped entirely unless its header matches the expected sequence number and format. At most
+ * the smaller of @entries and the header's entry count are extracted, sector by sector, starting
+ * from sector 1; sectors which fail vdo_is_valid_recovery_journal_sector() contribute nothing.
+ */
+static void extract_entries_from_block(struct repair_completion *repair,
+				       struct recovery_journal *journal,
+				       sequence_number_t sequence,
+				       enum vdo_metadata_type format,
+				       journal_entry_count_t entries)
+{
+	sector_count_t sector_number;
+	journal_entry_count_t remaining;
+	struct recovery_block_header header =
+		get_recovery_journal_block_header(journal, repair->journal_data,
+						  sequence);
+
+	/* This block is invalid, so skip it. */
+	if (!is_exact_recovery_journal_block(journal, &header, sequence, format))
+		return;
+
+	remaining = min(entries, header.entry_count);
+	for (sector_number = 1; sector_number < VDO_SECTORS_PER_BLOCK; sector_number++) {
+		journal_entry_count_t expected =
+			min(remaining, entries_per_sector(format, sector_number));
+		struct packed_journal_sector *sector =
+			get_sector(journal, repair->journal_data, sequence, sector_number);
+
+		if (vdo_is_valid_recovery_journal_sector(&header, sector, sector_number)) {
+			/* Only extract as many as the block header calls for. */
+			journal_entry_count_t to_append = min_t(journal_entry_count_t,
+								sector->entry_count,
+								expected);
+
+			append_sector_entries(repair, (char *) sector->entries, format,
+					      to_append);
+		}
+
+		/*
+		 * Even if the sector wasn't full, count it as full when counting up to the
+		 * entry count the block header claims.
+		 */
+		remaining -= expected;
+	}
+}
+
+/*
+ * Extract the block map mappings from every journal block between repair->block_map_head and
+ * repair->highest_tail, inclusive, for a read-only rebuild. The format of the block at the
+ * highest tail determines the format expected of every block and the number of entries per
+ * block used to size the array.
+ *
+ * Return: VDO_SUCCESS or an allocation error.
+ */
+static int parse_journal_for_rebuild(struct repair_completion *repair)
+{
+	struct recovery_journal *journal = repair->completion.vdo->recovery_journal;
+	journal_entry_count_t entries_per_block = journal->entries_per_block;
+	struct recovery_block_header tail_header =
+		get_recovery_journal_block_header(journal, repair->journal_data,
+						  repair->highest_tail);
+	enum vdo_metadata_type format = tail_header.metadata_type;
+	sequence_number_t sequence;
+	block_count_t count;
+	int result;
+
+	if (format == VDO_METADATA_RECOVERY_JOURNAL)
+		entries_per_block = RECOVERY_JOURNAL_1_ENTRIES_PER_BLOCK;
+
+	/*
+	 * Allocate an array of numbered_block_mapping structures large enough to transcribe every
+	 * packed_recovery_journal_entry from every valid journal block.
+	 */
+	count = ((repair->highest_tail - repair->block_map_head + 1) * entries_per_block);
+	result = vdo_allocate(count, struct numbered_block_mapping, __func__,
+			      &repair->entries);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	for (sequence = repair->block_map_head; sequence <= repair->highest_tail; sequence++) {
+		extract_entries_from_block(repair, journal, sequence, format,
+					   entries_per_block);
+	}
+
+	return VDO_SUCCESS;
+}
+
+/*
+ * Check that neither the block map head nor the slab journal head lies beyond the tail.
+ *
+ * Return: VDO_SUCCESS, or VDO_CORRUPT_JOURNAL (after logging) if either head is past the tail.
+ */
+static int validate_heads(struct repair_completion *repair)
+{
+	/* Both reap heads must be behind the tail. */
+	bool heads_behind_tail = ((repair->block_map_head <= repair->tail) &&
+				  (repair->slab_journal_head <= repair->tail));
+
+	if (!heads_behind_tail) {
+		return vdo_log_error_strerror(VDO_CORRUPT_JOURNAL,
+					      "Journal tail too early. block map head: %llu, slab journal head: %llu, tail: %llu",
+					      (unsigned long long) repair->block_map_head,
+					      (unsigned long long) repair->slab_journal_head,
+					      (unsigned long long) repair->tail);
+	}
+
+	return VDO_SUCCESS;
+}
+
+/**
+ * extract_new_mappings() - Find all valid new mappings to be applied to the block map.
+ * @repair: The repair completion.
+ *
+ * The mappings are extracted from the journal and stored in a sortable array so that all of the
+ * mappings to be applied to a given block map page can be done in a single page fetch. Entries
+ * are read from the start of the block at repair->block_map_head up to, but not including,
+ * repair->tail_recovery_point. An invalid entry puts the vdo into read-only mode.
+ *
+ * Return: VDO_SUCCESS or an error.
+ */
+static int extract_new_mappings(struct repair_completion *repair)
+{
+	struct vdo *vdo = repair->completion.vdo;
+	struct recovery_point point = {
+		.sequence_number = repair->block_map_head,
+		.sector_count = 1,
+		.entry_count = 0,
+	};
+	int result;
+
+	/*
+	 * Allocate an array of numbered_block_mapping structs just large enough to transcribe
+	 * every packed_recovery_journal_entry from every valid journal block.
+	 */
+	result = vdo_allocate(repair->entry_count, struct numbered_block_mapping,
+			      __func__, &repair->entries);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	while (before_recovery_point(&point, &repair->tail_recovery_point)) {
+		struct recovery_journal_entry entry = get_entry(repair, &point);
+
+		result = validate_recovery_journal_entry(vdo, &entry);
+		if (result != VDO_SUCCESS) {
+			vdo_enter_read_only_mode(vdo, result);
+			return result;
+		}
+
+		repair->entries[repair->block_map_entry_count] =
+			(struct numbered_block_mapping) {
+				.block_map_slot = entry.slot,
+				.block_map_entry = vdo_pack_block_map_entry(entry.mapping.pbn,
+									    entry.mapping.state),
+				.number = repair->block_map_entry_count,
+			};
+		repair->block_map_entry_count++;
+
+		increment_recovery_point(&point);
+	}
+
+	result = VDO_ASSERT((repair->block_map_entry_count <= repair->entry_count),
+			    "approximate entry count is an upper bound");
+	if (result != VDO_SUCCESS)
+		vdo_enter_read_only_mode(vdo, result);
+
+	return result;
+}
+
+/**
+ * compute_usages() - Compute the lbns in use and block map data blocks counts from the tail of
+ *                    the journal.
+ * @repair: The repair completion whose usage counters are to be set.
+ *
+ * The counters start from the values recorded in the tail block's header and are then adjusted
+ * by every entry in the tail block up to the tail recovery point.
+ *
+ * Return: VDO_SUCCESS, or the validation error for a bad entry (after entering read-only mode).
+ */
+static noinline int compute_usages(struct repair_completion *repair)
+{
+	/*
+	 * This function is declared noinline to avoid a spurious valgrind error regarding the
+	 * following structure being uninitialized.
+	 */
+	struct recovery_point point = {
+		.sequence_number = repair->tail,
+		.sector_count = 1,
+		.entry_count = 0,
+	};
+
+	struct vdo *vdo = repair->completion.vdo;
+	struct recovery_block_header tail_header =
+		get_recovery_journal_block_header(vdo->recovery_journal,
+						  repair->journal_data, repair->tail);
+	int result;
+
+	repair->logical_blocks_used = tail_header.logical_blocks_used;
+	repair->block_map_data_blocks = tail_header.block_map_data_blocks;
+
+	while (before_recovery_point(&point, &repair->tail_recovery_point)) {
+		struct recovery_journal_entry entry = get_entry(repair, &point);
+
+		result = validate_recovery_journal_entry(vdo, &entry);
+		if (result != VDO_SUCCESS) {
+			vdo_enter_read_only_mode(vdo, result);
+			return result;
+		}
+
+		if (entry.operation == VDO_JOURNAL_BLOCK_MAP_REMAPPING) {
+			/* A block map remapping only adds a block map data block. */
+			repair->block_map_data_blocks++;
+		} else {
+			if (vdo_is_mapped_location(&entry.mapping))
+				repair->logical_blocks_used++;
+
+			if (vdo_is_mapped_location(&entry.unmapping))
+				repair->logical_blocks_used--;
+		}
+
+		increment_recovery_point(&point);
+	}
+
+	return VDO_SUCCESS;
+}
+
+/**
+ * parse_journal_for_recovery() - Find the usable tail of the loaded journal and prepare recovery.
+ * @repair: The repair completion holding the journal data.
+ *
+ * Walks the journal blocks from the lower of the block map head and the slab journal head up to
+ * highest_tail, counting entries and tracking the tail recovery point. The walk stops at the
+ * first block whose header does not match, which was not filled, or which tore. The heads are
+ * then validated, the new block map mappings extracted, and the usage counts computed.
+ *
+ * Return: VDO_SUCCESS or an error code; VDO_UNSUPPORTED_VERSION (after entering read-only mode)
+ *         if an old-format journal block is found.
+ */
+static int parse_journal_for_recovery(struct repair_completion *repair)
+{
+ int result;
+ sequence_number_t i, head;
+ bool found_entries = false;
+ struct recovery_journal *journal = repair->completion.vdo->recovery_journal;
+
+ head = min(repair->block_map_head, repair->slab_journal_head);
+ for (i = head; i <= repair->highest_tail; i++) {
+ struct recovery_block_header header;
+ journal_entry_count_t block_entries;
+ u8 j;
+
+ /* Tentatively treat this block as the tail, with no valid sectors seen yet. */
+ repair->tail = i;
+ repair->tail_recovery_point = (struct recovery_point) {
+ .sequence_number = i,
+ .sector_count = 0,
+ .entry_count = 0,
+ };
+
+ header = get_recovery_journal_block_header(journal, repair->journal_data, i);
+ if (header.metadata_type == VDO_METADATA_RECOVERY_JOURNAL) {
+ /* This is an old format block, so we need to upgrade */
+ vdo_log_error_strerror(VDO_UNSUPPORTED_VERSION,
+ "Recovery journal is in the old format, a read-only rebuild is required.");
+ vdo_enter_read_only_mode(repair->completion.vdo,
+ VDO_UNSUPPORTED_VERSION);
+ return VDO_UNSUPPORTED_VERSION;
+ }
+
+ if (!is_exact_recovery_journal_block(journal, &header, i,
+ VDO_METADATA_RECOVERY_JOURNAL_2)) {
+ /* A bad block header was found so this must be the end of the journal. */
+ break;
+ }
+
+ /* The number of entries the header claims which have not yet been found in a sector. */
+ block_entries = header.entry_count;
+
+ /* Examine each sector in turn to determine the last valid sector. */
+ for (j = 1; j < VDO_SECTORS_PER_BLOCK; j++) {
+ struct packed_journal_sector *sector =
+ get_sector(journal, repair->journal_data, i, j);
+ /* Never count more entries in a sector than the block header still accounts for. */
+ journal_entry_count_t sector_entries =
+ min_t(journal_entry_count_t, sector->entry_count,
+ block_entries);
+
+ /* A bad sector means that this block was torn. */
+ if (!vdo_is_valid_recovery_journal_sector(&header, sector, j))
+ break;
+
+ if (sector_entries > 0) {
+ found_entries = true;
+ repair->tail_recovery_point.sector_count++;
+ repair->tail_recovery_point.entry_count = sector_entries;
+ block_entries -= sector_entries;
+ repair->entry_count += sector_entries;
+ }
+
+ /* If this sector is short, the later sectors can't matter. */
+ if ((sector_entries < RECOVERY_JOURNAL_ENTRIES_PER_SECTOR) ||
+ (block_entries == 0))
+ break;
+ }
+
+ /* If this block was not filled, or if it tore, no later block can matter. */
+ if ((header.entry_count != journal->entries_per_block) || (block_entries > 0))
+ break;
+ }
+
+ /* With no entries at all there is nothing to extract; only the heads need checking. */
+ if (!found_entries)
+ return validate_heads(repair);
+
+ /* Set the tail to the last valid tail block, if there is one. */
+ if (repair->tail_recovery_point.sector_count == 0)
+ repair->tail--;
+
+ result = validate_heads(repair);
+ if (result != VDO_SUCCESS)
+ return result;
+
+ vdo_log_info("Highest-numbered recovery journal block has sequence number %llu, and the highest-numbered usable block is %llu",
+ (unsigned long long) repair->highest_tail,
+ (unsigned long long) repair->tail);
+
+ result = extract_new_mappings(repair);
+ if (result != VDO_SUCCESS)
+ return result;
+
+ return compute_usages(repair);
+}
+
+/**
+ * parse_journal() - Parse the loaded journal for either a rebuild or a recovery.
+ * @repair: The repair completion.
+ *
+ * Return: VDO_SUCCESS if find_recovery_journal_head_and_tail() reports nothing to parse,
+ *         otherwise the result of the rebuild or recovery parse selected by the load state.
+ */
+static int parse_journal(struct repair_completion *repair)
+{
+	if (!find_recovery_journal_head_and_tail(repair))
+		return VDO_SUCCESS;
+
+	if (vdo_state_requires_read_only_rebuild(repair->completion.vdo->load_state))
+		return parse_journal_for_rebuild(repair);
+
+	return parse_journal_for_recovery(repair);
+}
+
+/**
+ * finish_journal_load() - Note that one journal read vio is done; parse once all are done.
+ * @completion: The completion of a journal read vio; its parent is the repair completion.
+ *
+ * Only the call which brings vios_complete up to vio_count does any work: it releases the vios,
+ * sets recover_block_map as the next step, and continues the repair completion with the result
+ * of parsing the journal.
+ */
+static void finish_journal_load(struct vdo_completion *completion)
+{
+	struct repair_completion *repair = completion->parent;
+	int result;
+
+	repair->vios_complete++;
+	if (repair->vios_complete != repair->vio_count)
+		return;
+
+	vdo_log_info("Finished reading recovery journal");
+	uninitialize_vios(repair);
+	prepare_repair_completion(repair, recover_block_map, VDO_ZONE_TYPE_LOGICAL);
+	result = parse_journal(repair);
+	vdo_continue_completion(&repair->completion, result);
+}
+
+/**
+ * handle_journal_load_error() - Handle an error reading part of the recovery journal.
+ * @completion: The completion of the failed read vio; its parent is the repair completion.
+ *
+ * The error is saved on the repair completion and recorded against the vio, then the vio's
+ * normal callback is invoked so that the read is still counted as finished.
+ */
+static void handle_journal_load_error(struct vdo_completion *completion)
+{
+	struct repair_completion *repair = completion->parent;
+	struct vio *vio = as_vio(completion);
+
+	/* Preserve the error */
+	vdo_set_completion_result(&repair->completion, completion->result);
+	vio_record_metadata_io_error(vio);
+	completion->callback(completion);
+}
+
+/**
+ * read_journal_endio() - The bio end_io function for a recovery journal read.
+ * @bio: The completed bio; its bi_private is the vio which issued it.
+ *
+ * Continues the vio with finish_journal_load() and the vdo's admin thread id.
+ */
+static void read_journal_endio(struct bio *bio)
+{
+	struct vio *vio = bio->bi_private;
+
+	continue_vio_after_io(vio, finish_journal_load,
+			      vio->completion.vdo->thread_config.admin_thread);
+}
+
+/**
+ * vdo_repair() - Load the recovery journal and then recover or rebuild a vdo.
+ * @parent: The completion to notify when the operation is complete
+ *
+ * Allocates the repair completion, a buffer for the whole journal, and enough vios to read the
+ * journal in pieces of at most MAX_BLOCKS_PER_VIO blocks, then submits all of the reads. Must be
+ * called on the admin thread. If the repair completion cannot be allocated, the parent is failed
+ * directly; later allocation failures go through abort_on_error().
+ */
+void vdo_repair(struct vdo_completion *parent)
+{
+ int result;
+ char *ptr;
+ struct repair_completion *repair;
+ struct vdo *vdo = parent->vdo;
+ struct recovery_journal *journal = vdo->recovery_journal;
+ physical_block_number_t pbn = journal->origin;
+ block_count_t remaining = journal->size;
+ block_count_t vio_count = DIV_ROUND_UP(remaining, MAX_BLOCKS_PER_VIO);
+ /* Half of the configured cache size, capped at the simultaneous-read maximum. */
+ page_count_t page_count = min_t(page_count_t,
+ vdo->device_config->cache_size >> 1,
+ MAXIMUM_SIMULTANEOUS_VDO_BLOCK_MAP_RESTORATION_READS);
+
+ vdo_assert_on_admin_thread(vdo, __func__);
+
+ if (vdo->load_state == VDO_FORCE_REBUILD) {
+ vdo_log_warning("Rebuilding reference counts to clear read-only mode");
+ vdo->states.vdo.read_only_recoveries++;
+ } else if (vdo->load_state == VDO_REBUILD_FOR_UPGRADE) {
+ vdo_log_warning("Rebuilding reference counts for upgrade");
+ } else {
+ vdo_log_warning("Device was dirty, rebuilding reference counts");
+ }
+
+ /* The repair completion is allocated with page_count trailing vdo_page_completions. */
+ result = vdo_allocate_extended(struct repair_completion, page_count,
+ struct vdo_page_completion, __func__,
+ &repair);
+ if (result != VDO_SUCCESS) {
+ vdo_fail_completion(parent, result);
+ return;
+ }
+
+ vdo_initialize_completion(&repair->completion, vdo, VDO_REPAIR_COMPLETION);
+ repair->completion.error_handler = abort_repair;
+ repair->completion.parent = parent;
+ prepare_repair_completion(repair, finish_repair, VDO_ZONE_TYPE_ADMIN);
+ repair->page_count = page_count;
+
+ /* One contiguous buffer large enough for every block of the journal. */
+ result = vdo_allocate(remaining * VDO_BLOCK_SIZE, char, __func__,
+ &repair->journal_data);
+ if (abort_on_error(result, repair))
+ return;
+
+ result = vdo_allocate(vio_count, struct vio, __func__, &repair->vios);
+ if (abort_on_error(result, repair))
+ return;
+
+ /*
+ * Set up each vio over its own slice of the buffer. repair->vio_count is the loop counter,
+ * so it always equals the number of vios set up so far.
+ */
+ ptr = repair->journal_data;
+ for (repair->vio_count = 0; repair->vio_count < vio_count; repair->vio_count++) {
+ block_count_t blocks = min_t(block_count_t, remaining,
+ MAX_BLOCKS_PER_VIO);
+
+ result = allocate_vio_components(vdo, VIO_TYPE_RECOVERY_JOURNAL,
+ VIO_PRIORITY_METADATA,
+ repair, blocks, ptr,
+ &repair->vios[repair->vio_count]);
+ if (abort_on_error(result, repair))
+ return;
+
+ ptr += (blocks * VDO_BLOCK_SIZE);
+ remaining -= blocks;
+ }
+
+ /* Issue every read; vio_count is reused here as the index of the vio being submitted. */
+ for (vio_count = 0; vio_count < repair->vio_count;
+ vio_count++, pbn += MAX_BLOCKS_PER_VIO) {
+ vdo_submit_metadata_vio(&repair->vios[vio_count], pbn, read_journal_endio,
+ handle_journal_load_error, REQ_OP_READ);
+ }
+}
diff --git a/drivers/md/dm-vdo/repair.h b/drivers/md/dm-vdo/repair.h
new file mode 100644
index 000000000000..ff255cf41486
--- /dev/null
+++ b/drivers/md/dm-vdo/repair.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#ifndef VDO_REPAIR_H
+#define VDO_REPAIR_H
+
+#include "types.h"
+
+void vdo_replay_into_slab_journals(struct block_allocator *allocator, void *context);
+void vdo_repair(struct vdo_completion *parent);
+
+#endif /* VDO_REPAIR_H */
diff --git a/drivers/md/dm-vdo/slab-depot.c b/drivers/md/dm-vdo/slab-depot.c
new file mode 100644
index 000000000000..46e4721e5b4f
--- /dev/null
+++ b/drivers/md/dm-vdo/slab-depot.c
@@ -0,0 +1,5101 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#include "slab-depot.h"
+
+#include <linux/atomic.h>
+#include <linux/bio.h>
+#include <linux/err.h>
+#include <linux/log2.h>
+#include <linux/min_heap.h>
+#include <linux/minmax.h>
+
+#include "logger.h"
+#include "memory-alloc.h"
+#include "numeric.h"
+#include "permassert.h"
+#include "string-utils.h"
+
+#include "action-manager.h"
+#include "admin-state.h"
+#include "completion.h"
+#include "constants.h"
+#include "data-vio.h"
+#include "encodings.h"
+#include "io-submitter.h"
+#include "physical-zone.h"
+#include "priority-table.h"
+#include "recovery-journal.h"
+#include "repair.h"
+#include "status-codes.h"
+#include "types.h"
+#include "vdo.h"
+#include "vio.h"
+#include "wait-queue.h"
+
+/* The size, in bytes, of a u64 word. */
+static const u64 BYTES_PER_WORD = sizeof(u64);
+/* NOTE(review): the users of this flag are outside this chunk; confirm its meaning there. */
+static const bool NORMAL_OPERATION = true;
+
+/**
+ * get_lock() - Get the lock object for a slab journal block by sequence number.
+ * @journal: vdo_slab journal to retrieve from.
+ * @sequence_number: Sequence number of the block.
+ *
+ * The locks are indexed by the sequence number modulo the journal size.
+ *
+ * Return: The lock object for the given sequence number.
+ */
+static inline struct journal_lock * __must_check get_lock(struct slab_journal *journal,
+							  sequence_number_t sequence_number)
+{
+	return journal->locks + (sequence_number % journal->size);
+}
+
+/**
+ * is_slab_open() - Check whether a slab is open.
+ * @slab: The slab to check.
+ *
+ * Return: true if the slab's admin state is neither quiescing nor quiescent.
+ */
+static bool is_slab_open(struct vdo_slab *slab)
+{
+	return !(vdo_is_state_quiescing(&slab->state) ||
+		 vdo_is_state_quiescent(&slab->state));
+}
+
+/**
+ * must_make_entries_to_flush() - Check whether there are entry waiters which should delay a flush.
+ * @journal: The journal to check.
+ *
+ * Return: true if the slab is not rebuilding and the journal has entry waiters.
+ */
+static inline bool __must_check must_make_entries_to_flush(struct slab_journal *journal)
+{
+	if (journal->slab->status == VDO_SLAB_REBUILDING)
+		return false;
+
+	return vdo_waitq_has_waiters(&journal->entry_waiters);
+}
+
+/**
+ * is_reaping() - Check whether a reap is currently in progress.
+ * @journal: The journal which may be reaping.
+ *
+ * A reap is in progress while the journal's head differs from its unreapable sequence number.
+ *
+ * Return: true if the journal is reaping.
+ */
+static inline bool __must_check is_reaping(struct slab_journal *journal)
+{
+	return !(journal->unreapable == journal->head);
+}
+
+/**
+ * initialize_tail_block() - Initialize tail block as a new block.
+ * @journal: The journal whose tail block is being initialized.
+ *
+ * Resets the tail header to an empty block, with no block map increments, numbered with the
+ * journal's current tail sequence number.
+ */
+static void initialize_tail_block(struct slab_journal *journal)
+{
+	journal->tail_header.entry_count = 0;
+	journal->tail_header.has_block_map_increments = false;
+	journal->tail_header.sequence_number = journal->tail;
+}
+
+/**
+ * initialize_journal_state() - Set all journal fields appropriately to start journaling.
+ * @journal: The journal to be reset, based on its tail sequence number.
+ *
+ * The reap position is set from the head; the commit and summary positions are set from the
+ * tail; and the tail block header is reset.
+ */
+static void initialize_journal_state(struct slab_journal *journal)
+{
+	/* Nothing has been reaped beyond the head. */
+	journal->unreapable = journal->head;
+	journal->reap_lock = get_lock(journal, journal->unreapable);
+
+	/* Nothing is waiting to be committed or summarized. */
+	journal->next_commit = journal->tail;
+	journal->last_summarized = journal->tail;
+	journal->summarized = journal->last_summarized;
+
+	initialize_tail_block(journal);
+}
+
+/**
+ * block_is_full() - Check whether a journal block is full.
+ * @journal: The slab journal for the block.
+ *
+ * The capacity compared against depends on whether the tail block has block map increments.
+ *
+ * Return: true if the tail block is full.
+ */
+static bool __must_check block_is_full(struct slab_journal *journal)
+{
+	journal_entry_count_t count = journal->tail_header.entry_count;
+
+	if (journal->tail_header.has_block_map_increments)
+		return (journal->full_entries_per_block == count);
+
+	return (journal->entries_per_block == count);
+}
+
+static void add_entries(struct slab_journal *journal);
+static void update_tail_block_location(struct slab_journal *journal);
+static void release_journal_locks(struct vdo_waiter *waiter, void *context);
+
+/**
+ * is_slab_journal_blank() - Check whether a slab's journal is blank.
+ * @slab: The slab whose journal is to be checked.
+ *
+ * A slab journal is blank if it has never had any entries recorded in it, which is detected as
+ * a tail of 1 with an empty tail block.
+ *
+ * Return: true if the slab's journal has never been modified.
+ */
+static bool is_slab_journal_blank(const struct vdo_slab *slab)
+{
+	if (slab->journal.tail != 1)
+		return false;
+
+	return (slab->journal.tail_header.entry_count == 0);
+}
+
+/**
+ * mark_slab_journal_dirty() - Put a slab journal on the dirty ring of its allocator in the correct
+ * order.
+ * @journal: The journal to be marked dirty.
+ * @lock: The recovery journal lock held by the slab journal.
+ *
+ * The dirty list is kept in ascending order of recovery_lock; the journal is placed after the
+ * last journal whose lock is less than or equal to its own.
+ */
+static void mark_slab_journal_dirty(struct slab_journal *journal, sequence_number_t lock)
+{
+ struct slab_journal *dirty_journal;
+ struct list_head *dirty_list = &journal->slab->allocator->dirty_slab_journals;
+
+ VDO_ASSERT_LOG_ONLY(journal->recovery_lock == 0, "slab journal was clean");
+
+ journal->recovery_lock = lock;
+ /* Search backward from the end of the list for the journal to insert after. */
+ list_for_each_entry_reverse(dirty_journal, dirty_list, dirty_entry) {
+ if (dirty_journal->recovery_lock <= journal->recovery_lock)
+ break;
+ }
+
+ /*
+ * Insert immediately after dirty_journal. If the loop ran to completion without breaking,
+ * the cursor's dirty_entry is the list head itself (per the list iteration macros), so the
+ * journal goes to the front of the list.
+ */
+ list_move_tail(&journal->dirty_entry, dirty_journal->dirty_entry.next);
+}
+
+/**
+ * mark_slab_journal_clean() - Take a slab journal off the dirty list and clear its recovery lock.
+ * @journal: The journal to be marked clean.
+ */
+static void mark_slab_journal_clean(struct slab_journal *journal)
+{
+	list_del_init(&journal->dirty_entry);
+	journal->recovery_lock = 0;
+}
+
+/**
+ * check_if_slab_drained() - Finish draining a slab if it has no outstanding work.
+ * @slab: The slab to check.
+ *
+ * Does nothing unless the slab is draining and its journal has no entries to make, no reap in
+ * progress, no pending or uncommitted writes, no slab summary update in flight, and no active
+ * reference block operations. The drain finishes with VDO_READ_ONLY if the vdo is read-only.
+ */
+static void check_if_slab_drained(struct vdo_slab *slab)
+{
+	struct slab_journal *journal = &slab->journal;
+	const struct admin_state_code *code;
+	bool read_only;
+
+	if (!vdo_is_state_draining(&slab->state))
+		return;
+
+	if (must_make_entries_to_flush(journal) || is_reaping(journal))
+		return;
+
+	if (journal->waiting_to_commit || !list_empty(&journal->uncommitted_blocks))
+		return;
+
+	if (journal->updating_slab_summary || (slab->active_count > 0))
+		return;
+
+	/* When not suspending or recovering, the slab must be clean. */
+	code = vdo_get_admin_state_code(&slab->state);
+	read_only = vdo_is_read_only(slab->allocator->depot->vdo);
+	if (!read_only &&
+	    vdo_waitq_has_waiters(&slab->dirty_blocks) &&
+	    (code != VDO_ADMIN_STATE_SUSPENDING) &&
+	    (code != VDO_ADMIN_STATE_RECOVERING))
+		return;
+
+	vdo_finish_draining_with_result(&slab->state,
+					(read_only ? VDO_READ_ONLY : VDO_SUCCESS));
+}
+
+/* FULLNESS HINT COMPUTATION */
+
+/**
+ * compute_fullness_hint() - Translate a slab's free block count into a 'fullness hint' that can be
+ *                           stored in a slab_summary_entry's 7 bits that are dedicated to its free
+ *                           count.
+ * @depot: The depot whose summary being updated.
+ * @free_blocks: The number of free blocks.
+ *
+ * Note: the number of free blocks must be strictly less than 2^23 blocks, even though
+ * theoretically slabs could contain precisely 2^23 blocks; there is an assumption that at least
+ * one block is used by metadata. This assumption is necessary; otherwise, the fullness hint might
+ * overflow. The fullness hint formula is roughly (fullness >> 16) & 0x7f, but (2^23 >> 16) & 0x7f
+ * is 0, which would make it impossible to distinguish completely full from completely empty.
+ *
+ * A hint of 0 is returned only when there are no free blocks at all; any non-zero free count
+ * which would shift down to 0 is reported as 1 instead.
+ *
+ * Return: A fullness hint, which can be stored in 7 bits.
+ */
+static u8 __must_check compute_fullness_hint(struct slab_depot *depot,
+					     block_count_t free_blocks)
+{
+	block_count_t hint;
+
+	VDO_ASSERT_LOG_ONLY((free_blocks < (1 << 23)), "free blocks must be less than 2^23");
+
+	if (free_blocks == 0)
+		return 0;
+
+	hint = free_blocks >> depot->hint_shift;
+	if (hint == 0)
+		return 1;
+
+	return hint;
+}
+
+/**
+ * check_summary_drain_complete() - Check whether an allocator's summary has finished draining.
+ * @allocator: The allocator whose summary state is to be checked.
+ *
+ * If the summary is draining and has no writes outstanding, the operation is finished, with
+ * VDO_READ_ONLY if the vdo is read-only and VDO_SUCCESS otherwise.
+ */
+static void check_summary_drain_complete(struct block_allocator *allocator)
+{
+	int result;
+
+	if (!vdo_is_state_draining(&allocator->summary_state))
+		return;
+
+	if (allocator->summary_write_count > 0)
+		return;
+
+	result = (vdo_is_read_only(allocator->depot->vdo) ? VDO_READ_ONLY : VDO_SUCCESS);
+	vdo_finish_operation(&allocator->summary_state, result);
+}
+
+/**
+ * notify_summary_waiters() - Wake all the waiters in a given queue.
+ * @allocator: The block allocator summary which owns the queue.
+ * @queue: The queue to notify.
+ *
+ * Each waiter is passed a pointer to the result: VDO_READ_ONLY if the vdo is read-only,
+ * otherwise VDO_SUCCESS.
+ */
+static void notify_summary_waiters(struct block_allocator *allocator,
+				   struct vdo_wait_queue *queue)
+{
+	int result = VDO_SUCCESS;
+
+	if (vdo_is_read_only(allocator->depot->vdo))
+		result = VDO_READ_ONLY;
+
+	vdo_waitq_notify_all_waiters(queue, NULL, &result);
+}
+
+static void launch_write(struct slab_summary_block *summary_block);
+
+/**
+ * finish_updating_slab_summary_block() - Finish processing a block which attempted to write,
+ *                                        whether or not the attempt succeeded.
+ * @block: The block.
+ *
+ * Notifies the waiters on the write which just finished. If more waiters queued up in the
+ * meantime, another write is launched; otherwise the summary is checked for drain completion.
+ */
+static void finish_updating_slab_summary_block(struct slab_summary_block *block)
+{
+	struct block_allocator *allocator = block->allocator;
+
+	notify_summary_waiters(allocator, &block->current_update_waiters);
+	block->writing = false;
+	allocator->summary_write_count--;
+
+	if (!vdo_waitq_has_waiters(&block->next_update_waiters)) {
+		check_summary_drain_complete(allocator);
+		return;
+	}
+
+	launch_write(block);
+}
+
+/**
+ * finish_update() - This is the callback for a successful summary block write.
+ * @completion: The write vio.
+ *
+ * Counts the written block in the depot's summary statistics and finishes the block update.
+ */
+static void finish_update(struct vdo_completion *completion)
+{
+	struct vio *vio = as_vio(completion);
+	struct slab_summary_block *block =
+		container_of(vio, struct slab_summary_block, vio);
+
+	atomic64_inc(&block->allocator->depot->summary_statistics.blocks_written);
+	finish_updating_slab_summary_block(block);
+}
+
+/**
+ * handle_write_error() - Handle an error writing a slab summary block.
+ * @completion: The write VIO.
+ *
+ * Records the I/O error, puts the vdo into read-only mode, and finishes the block update.
+ */
+static void handle_write_error(struct vdo_completion *completion)
+{
+	struct vio *vio = as_vio(completion);
+	struct slab_summary_block *block =
+		container_of(vio, struct slab_summary_block, vio);
+
+	vio_record_metadata_io_error(vio);
+	vdo_enter_read_only_mode(completion->vdo, completion->result);
+	finish_updating_slab_summary_block(block);
+}
+
+/**
+ * write_slab_summary_endio() - The bio end_io function for a slab summary block write.
+ * @bio: The completed bio; its bi_private is the summary block's vio.
+ *
+ * Continues the vio with finish_update() and the owning allocator's thread id.
+ */
+static void write_slab_summary_endio(struct bio *bio)
+{
+	struct vio *vio = bio->bi_private;
+	struct block_allocator *allocator =
+		container_of(vio, struct slab_summary_block, vio)->allocator;
+
+	continue_vio_after_io(vio, finish_update, allocator->thread_id);
+}
+
+/**
+ * launch_write() - Write a slab summary block unless it is currently out for writing.
+ * @block: The block that needs to be committed.
+ *
+ * The waiters queued for the next update become the waiters of this write. In read-only mode
+ * nothing is written and the update is finished immediately.
+ */
+static void launch_write(struct slab_summary_block *block)
+{
+	struct block_allocator *allocator = block->allocator;
+	physical_block_number_t pbn;
+
+	/* When the write in flight finishes, it relaunches if more waiters have queued. */
+	if (block->writing)
+		return;
+
+	allocator->summary_write_count++;
+	vdo_waitq_transfer_all_waiters(&block->next_update_waiters,
+				       &block->current_update_waiters);
+	block->writing = true;
+
+	if (vdo_is_read_only(allocator->depot->vdo)) {
+		finish_updating_slab_summary_block(block);
+		return;
+	}
+
+	/* Write from a copy so that the live entries may change while the write is out. */
+	memcpy(block->outgoing_entries, block->entries, VDO_BLOCK_SIZE);
+
+	/* This zone's region of the summary, then this block within that region. */
+	pbn = allocator->depot->summary_origin;
+	pbn += (VDO_SLAB_SUMMARY_BLOCKS_PER_ZONE * allocator->zone_number);
+	pbn += block->index;
+
+	/*
+	 * Flush before writing to ensure that the slab journal tail blocks and reference updates
+	 * covered by this summary update are stable. Otherwise, a subsequent recovery could
+	 * encounter a slab summary update that refers to a slab journal tail block that has not
+	 * actually been written. In such cases, the slab journal referenced will be treated as
+	 * empty, causing any data within the slab which predates the existing recovery journal
+	 * entries to be lost.
+	 */
+	vdo_submit_metadata_vio(&block->vio, pbn, write_slab_summary_endio,
+				handle_write_error, REQ_OP_WRITE | REQ_PREFLUSH);
+}
+
+/**
+ * update_slab_summary_entry() - Update the entry for a slab.
+ * @slab: The slab whose entry is to be updated
+ * @waiter: The waiter that is updating the summary.
+ * @tail_block_offset: The offset of the slab journal's tail block.
+ * @load_ref_counts: Whether the reference counts must be loaded from disk on the vdo load.
+ * @is_clean: Whether the slab is clean.
+ * @free_blocks: The number of free blocks.
+ *
+ * If the vdo is read-only, or the summary is draining or quiescent, the waiter is called back
+ * at once with VDO_READ_ONLY or VDO_INVALID_ADMIN_STATE respectively and nothing is changed.
+ * Once set, an entry's load_ref_counts flag is never cleared by this function.
+ */
+static void update_slab_summary_entry(struct vdo_slab *slab, struct vdo_waiter *waiter,
+				      tail_block_offset_t tail_block_offset,
+				      bool load_ref_counts, bool is_clean,
+				      block_count_t free_blocks)
+{
+	u8 index = slab->slab_number / VDO_SLAB_SUMMARY_ENTRIES_PER_BLOCK;
+	struct block_allocator *allocator = slab->allocator;
+	struct slab_summary_block *block = &allocator->summary_blocks[index];
+	struct slab_summary_entry *entry;
+	int result = VDO_SUCCESS;
+
+	if (vdo_is_read_only(block->vio.completion.vdo))
+		result = VDO_READ_ONLY;
+	else if (vdo_is_state_draining(&allocator->summary_state) ||
+		 vdo_is_state_quiescent(&allocator->summary_state))
+		result = VDO_INVALID_ADMIN_STATE;
+
+	if (result != VDO_SUCCESS) {
+		waiter->callback(waiter, &result);
+		return;
+	}
+
+	entry = &allocator->summary_entries[slab->slab_number];
+	*entry = (struct slab_summary_entry) {
+		.tail_block_offset = tail_block_offset,
+		.load_ref_counts = (entry->load_ref_counts || load_ref_counts),
+		.is_dirty = !is_clean,
+		.fullness_hint = compute_fullness_hint(allocator->depot, free_blocks),
+	};
+
+	vdo_waitq_enqueue_waiter(&block->next_update_waiters, waiter);
+	launch_write(block);
+}
+
+/**
+ * finish_reaping() - Actually advance the head of the journal now that any necessary flushes are
+ *                    complete.
+ * @journal: The journal to be reaped.
+ *
+ * After the head moves up to the unreapable point, any waiting entries are added and the slab
+ * is checked for drain completion.
+ */
+static void finish_reaping(struct slab_journal *journal)
+{
+	struct vdo_slab *slab = journal->slab;
+
+	journal->head = journal->unreapable;
+	add_entries(journal);
+	check_if_slab_drained(slab);
+}
+
+static void reap_slab_journal(struct slab_journal *journal);
+
+/**
+ * complete_reaping() - Finish reaping now that we have flushed the lower layer and then try
+ *                      reaping again in case we deferred reaping due to an outstanding vio.
+ * @completion: The flush vio.
+ *
+ * The flush vio is returned to the allocator's vio pool before the reap is finished.
+ */
+static void complete_reaping(struct vdo_completion *completion)
+{
+	struct slab_journal *journal = completion->parent;
+	struct pooled_vio *pooled = vio_as_pooled_vio(as_vio(completion));
+
+	return_vio_to_pool(journal->slab->allocator->vio_pool, pooled);
+	finish_reaping(journal);
+	reap_slab_journal(journal);
+}
+
+/**
+ * handle_flush_error() - Handle an error flushing the lower layer.
+ * @completion: The flush vio.
+ *
+ * Records the error and enters read-only mode, then completes the reap as usual.
+ */
+static void handle_flush_error(struct vdo_completion *completion)
+{
+	struct vio *vio = as_vio(completion);
+
+	vio_record_metadata_io_error(vio);
+	vdo_enter_read_only_mode(completion->vdo, completion->result);
+	complete_reaping(completion);
+}
+
+/**
+ * flush_endio() - The bio end_io function for a flush issued prior to reaping.
+ * @bio: The completed bio; its bi_private is the flush vio, whose parent is the slab journal.
+ *
+ * Continues the vio with complete_reaping() and the thread id of the slab's allocator.
+ */
+static void flush_endio(struct bio *bio)
+{
+	struct vio *vio = bio->bi_private;
+	struct slab_journal *journal = vio->completion.parent;
+	struct block_allocator *allocator = journal->slab->allocator;
+
+	continue_vio_after_io(vio, complete_reaping, allocator->thread_id);
+}
+
+/**
+ * flush_for_reaping() - A waiter callback for getting a vio with which to flush the lower layer
+ *                       prior to reaping.
+ * @waiter: The journal as a flush waiter.
+ * @context: The newly acquired flush vio.
+ *
+ * The journal is made the parent of the vio's completion before the flush is submitted.
+ */
+static void flush_for_reaping(struct vdo_waiter *waiter, void *context)
+{
+	struct pooled_vio *pooled = context;
+
+	pooled->vio.completion.parent =
+		container_of(waiter, struct slab_journal, flush_waiter);
+	vdo_submit_flush_vio(&pooled->vio, flush_endio, handle_flush_error);
+}
+
+/**
+ * reap_slab_journal() - Conduct a reap on a slab journal to reclaim unreferenced blocks.
+ * @journal: The slab journal.
+ *
+ * This only advances the unreapable point past blocks whose locks have no references, and then
+ * requests a vio with which to flush. The head itself is not moved here; that happens in
+ * finish_reaping() once the flush has completed.
+ */
+static void reap_slab_journal(struct slab_journal *journal)
+{
+ bool reaped = false;
+
+ if (is_reaping(journal)) {
+ /* We already have a reap in progress so wait for it to finish. */
+ return;
+ }
+
+ if ((journal->slab->status != VDO_SLAB_REBUILT) ||
+ !vdo_is_state_normal(&journal->slab->state) ||
+ vdo_is_read_only(journal->slab->allocator->depot->vdo)) {
+ /*
+ * We must not reap in the first two cases, and there's no point in read-only mode.
+ */
+ return;
+ }
+
+ /*
+ * Start reclaiming blocks only when the journal head has no references. Then stop when a
+ * block is referenced or reap reaches the most recently written block, referenced by the
+ * slab summary, which has the sequence number just before the tail.
+ */
+ while ((journal->unreapable < journal->tail) && (journal->reap_lock->count == 0)) {
+ reaped = true;
+ journal->unreapable++;
+ /* Step the lock cursor along with unreapable, wrapping at the end of the lock array. */
+ journal->reap_lock++;
+ if (journal->reap_lock == &journal->locks[journal->size])
+ journal->reap_lock = &journal->locks[0];
+ }
+
+ if (!reaped)
+ return;
+
+ /*
+ * It is never safe to reap a slab journal block without first issuing a flush, regardless
+ * of whether a user flush has been received or not. In the absence of the flush, the
+ * reference block write which released the locks allowing the slab journal to reap may not
+ * be persisted. Although slab summary writes will eventually issue flushes, multiple slab
+ * journal block writes can be issued while previous slab summary updates have not yet been
+ * made. Even though those slab journal block writes will be ignored if the slab summary
+ * update is not persisted, they may still overwrite the to-be-reaped slab journal block
+ * resulting in a loss of reference count updates.
+ */
+ journal->flush_waiter.callback = flush_for_reaping;
+ acquire_vio_from_pool(journal->slab->allocator->vio_pool,
+ &journal->flush_waiter);
+}
+
+/**
+ * adjust_slab_journal_block_reference() - Adjust the reference count for a slab journal block.
+ * @journal: The slab journal.
+ * @sequence_number: The journal sequence number of the referenced block.
+ * @adjustment: Amount to adjust the reference counter.
+ *
+ * Nothing is done for sequence number 0 or while the slab is replaying. A reap is attempted
+ * whenever the adjusted lock count is zero.
+ */
+static void adjust_slab_journal_block_reference(struct slab_journal *journal,
+						sequence_number_t sequence_number,
+						int adjustment)
+{
+	struct journal_lock *lock;
+
+	/* Sequence number 0 is ignored, and locks should not be used during offline replay. */
+	if ((sequence_number == 0) || (journal->slab->status == VDO_SLAB_REPLAYING))
+		return;
+
+	VDO_ASSERT_LOG_ONLY((adjustment != 0), "adjustment must be non-zero");
+	lock = get_lock(journal, sequence_number);
+	if (adjustment < 0) {
+		VDO_ASSERT_LOG_ONLY((-adjustment <= lock->count),
+				    "adjustment %d of lock count %u for slab journal block %llu must not underflow",
+				    adjustment, lock->count,
+				    (unsigned long long) sequence_number);
+	}
+
+	lock->count += adjustment;
+	if (lock->count != 0)
+		return;
+
+	reap_slab_journal(journal);
+}
+
+/**
+ * release_journal_locks() - Callback invoked after a slab summary update completes.
+ * @waiter: The slab summary waiter that has just been notified.
+ * @context: The result code of the update.
+ *
+ * Registered in the constructor on behalf of update_tail_block_location().
+ *
+ * On failure, the vdo enters read-only mode and the slab is checked for drain completion. On
+ * success, the locks held by every block from last_summarized up to (but not including)
+ * summarized are released, newest first, and then a reap and a further summary update are
+ * attempted.
+ *
+ * Implements waiter_callback_fn.
+ */
+static void release_journal_locks(struct vdo_waiter *waiter, void *context)
+{
+	sequence_number_t first, i;
+	struct slab_journal *journal =
+		container_of(waiter, struct slab_journal, slab_summary_waiter);
+	int result = *((int *) context);
+
+	if (result != VDO_SUCCESS) {
+		if (result != VDO_READ_ONLY) {
+			/*
+			 * Don't bother logging what might be lots of errors if we are already in
+			 * read-only mode.
+			 */
+			vdo_log_error_strerror(result, "failed slab summary update %llu",
+					       (unsigned long long) journal->summarized);
+		}
+
+		journal->updating_slab_summary = false;
+		vdo_enter_read_only_mode(journal->slab->allocator->depot->vdo, result);
+		check_if_slab_drained(journal->slab);
+		return;
+	}
+
+	if (journal->partial_write_in_progress && (journal->summarized == journal->tail)) {
+		journal->partial_write_in_progress = false;
+		add_entries(journal);
+	}
+
+	first = journal->last_summarized;
+	journal->last_summarized = journal->summarized;
+
+	/*
+	 * Count down with i one above the block being released, so that the loop still
+	 * terminates if first is 0 (sequence numbers are unsigned and cannot go below 0).
+	 */
+	for (i = journal->summarized; i > first; i--) {
+		sequence_number_t block = i - 1;
+
+		/*
+		 * Release the lock the summarized block held on the recovery journal. (During
+		 * replay, recovery_start will always be 0.)
+		 */
+		if (journal->recovery_journal != NULL) {
+			zone_count_t zone_number = journal->slab->allocator->zone_number;
+			struct journal_lock *lock = get_lock(journal, block);
+
+			vdo_release_recovery_journal_block_reference(journal->recovery_journal,
+								     lock->recovery_start,
+								     VDO_ZONE_TYPE_PHYSICAL,
+								     zone_number);
+		}
+
+		/*
+		 * Release our own lock against reaping for blocks that are committed. (This
+		 * function will not change locks during replay.)
+		 */
+		adjust_slab_journal_block_reference(journal, block, -1);
+	}
+
+	journal->updating_slab_summary = false;
+
+	reap_slab_journal(journal);
+
+	/* Check if the slab summary needs to be updated again. */
+	update_tail_block_location(journal);
+}
+
+/**
+ * update_tail_block_location() - Update the tail block location in the slab summary, if necessary.
+ * @journal: The slab journal that is updating its tail block location.
+ *
+ * No update is started if one is already in flight, the vdo is read-only, or everything
+ * committed has already been summarized; in those cases the slab is only checked for drain
+ * completion. For a slab which is not rebuilt, the free block count is reconstructed from the
+ * fullness hint already in the summary instead of being read from the slab.
+ */
+static void update_tail_block_location(struct slab_journal *journal)
+{
+	struct vdo_slab *slab = journal->slab;
+	struct block_allocator *allocator = slab->allocator;
+	block_count_t free_block_count = slab->free_blocks;
+
+	if (journal->updating_slab_summary ||
+	    vdo_is_read_only(allocator->depot->vdo) ||
+	    (journal->last_summarized >= journal->next_commit)) {
+		check_if_slab_drained(slab);
+		return;
+	}
+
+	if (slab->status != VDO_SLAB_REBUILT) {
+		u8 hint = allocator->summary_entries[slab->slab_number].fullness_hint;
+
+		free_block_count = ((block_count_t) hint) << allocator->depot->hint_shift;
+	}
+
+	journal->summarized = journal->next_commit;
+	journal->updating_slab_summary = true;
+
+	/*
+	 * Update slab summary as dirty.
+	 * vdo_slab journal can only reap past sequence number 1 when all the ref counts for this
+	 * slab have been written to the layer. Therefore, indicate that the ref counts must be
+	 * loaded when the journal head has reaped past sequence number 1.
+	 */
+	update_slab_summary_entry(slab, &journal->slab_summary_waiter,
+				  journal->summarized % journal->size,
+				  (journal->head > 1), false, free_block_count);
+}
+
+/**
+ * reopen_slab_journal() - Reopen a slab's journal by emptying it and then adding pending entries.
+ * @slab: The slab whose journal is to be reopened.
+ *
+ * The head is moved up to the tail and the journal state reset before any waiting entries are
+ * added. Every lock is expected to be unheld at that point.
+ */
+static void reopen_slab_journal(struct vdo_slab *slab)
+{
+	struct slab_journal *journal = &slab->journal;
+	sequence_number_t block = 1;
+
+	VDO_ASSERT_LOG_ONLY(journal->tail_header.entry_count == 0,
+			    "vdo_slab journal's active block empty before reopening");
+	journal->head = journal->tail;
+	initialize_journal_state(journal);
+
+	/* Ensure no locks are spuriously held on an empty journal. */
+	while (block <= journal->size) {
+		VDO_ASSERT_LOG_ONLY((get_lock(journal, block)->count == 0),
+				    "Scrubbed journal's block %llu is not locked",
+				    (unsigned long long) block);
+		block++;
+	}
+
+	add_entries(journal);
+}
+
+/**
+ * get_committing_sequence_number() - Get the sequence number of the block a vio is writing.
+ * @vio: The pooled vio whose data buffer holds a packed slab journal block.
+ *
+ * Return: The sequence number from the packed block's header, converted from little-endian.
+ */
+static sequence_number_t get_committing_sequence_number(const struct pooled_vio *vio)
+{
+	const void *data = vio->vio.data;
+	const struct packed_slab_journal_block *packed = data;
+
+	return __le64_to_cpu(packed->header.sequence_number);
+}
+
+/**
+ * complete_write() - Handle post-commit processing.
+ * @completion: The write vio as a completion.
+ *
+ * This is the callback registered by write_slab_journal_block().
+ *
+ * The vio is removed from the journal's uncommitted list and returned to the pool. On a write
+ * error the vdo enters read-only mode; otherwise the commit point is advanced and the slab
+ * summary's tail block location is updated if needed.
+ */
+static void complete_write(struct vdo_completion *completion)
+{
+	int result = completion->result;
+	struct pooled_vio *pooled = vio_as_pooled_vio(as_vio(completion));
+	struct slab_journal *journal = completion->parent;
+	sequence_number_t committed = get_committing_sequence_number(pooled);
+
+	/*
+	 * Record any I/O error while the vio is still ours to examine; once it goes back to the
+	 * pool it must not be touched again here.
+	 */
+	if (result != VDO_SUCCESS)
+		vio_record_metadata_io_error(as_vio(completion));
+
+	list_del_init(&pooled->list_entry);
+	return_vio_to_pool(journal->slab->allocator->vio_pool, vdo_forget(pooled));
+
+	if (result != VDO_SUCCESS) {
+		vdo_log_error_strerror(result, "cannot write slab journal block %llu",
+				       (unsigned long long) committed);
+		vdo_enter_read_only_mode(journal->slab->allocator->depot->vdo, result);
+		check_if_slab_drained(journal->slab);
+		return;
+	}
+
+	WRITE_ONCE(journal->events->blocks_written, journal->events->blocks_written + 1);
+
+	if (list_empty(&journal->uncommitted_blocks)) {
+		/* If no blocks are outstanding, then the commit point is at the tail. */
+		journal->next_commit = journal->tail;
+	} else {
+		/* The commit point is always the beginning of the oldest incomplete block. */
+		pooled = container_of(journal->uncommitted_blocks.next,
+				      struct pooled_vio, list_entry);
+		journal->next_commit = get_committing_sequence_number(pooled);
+	}
+
+	update_tail_block_location(journal);
+}
+
+/*
+ * Bio completion handler for slab journal block writes. Continues the vio in complete_write()
+ * on the thread identified by the owning allocator's thread_id.
+ */
+static void write_slab_journal_endio(struct bio *bio)
+{
+	struct vio *vio = bio->bi_private;
+	struct slab_journal *journal = vio->completion.parent;
+	thread_id_t thread_id = journal->slab->allocator->thread_id;
+
+	continue_vio_after_io(vio, complete_write, thread_id);
+}
+
+/**
+ * write_slab_journal_block() - Write a slab journal block.
+ * @waiter: The vio pool waiter which was just notified.
+ * @context: The vio pool entry for the write.
+ *
+ * Callback from acquire_vio_from_pool() registered in commit_tail().
+ *
+ * Packs the tail header into the in-memory tail block, copies that block into the vio's data
+ * buffer, and submits the write. After submission the tail is advanced and the tail block is
+ * reinitialized. If the slab is in the VDO_ADMIN_STATE_WAITING_FOR_RECOVERY state, that
+ * operation is finished instead of adding more entries.
+ */
+static void write_slab_journal_block(struct vdo_waiter *waiter, void *context)
+{
+ struct pooled_vio *pooled = context;
+ struct vio *vio = &pooled->vio;
+ struct slab_journal *journal =
+ container_of(waiter, struct slab_journal, resource_waiter);
+ struct slab_journal_block_header *header = &journal->tail_header;
+ int unused_entries = journal->entries_per_block - header->entry_count;
+ physical_block_number_t block_number;
+ const struct admin_state_code *operation;
+
+ header->head = journal->head;
+ list_add_tail(&pooled->list_entry, &journal->uncommitted_blocks);
+ vdo_pack_slab_journal_block_header(header, &journal->block->header);
+
+ /* Copy the tail block into the vio. */
+ memcpy(pooled->vio.data, journal->block, VDO_BLOCK_SIZE);
+
+ VDO_ASSERT_LOG_ONLY(unused_entries >= 0, "vdo_slab journal block is not overfull");
+ if (unused_entries > 0) {
+ /*
+ * Release the per-entry locks for any unused entries in the block we are about to
+ * write.
+ */
+ adjust_slab_journal_block_reference(journal, header->sequence_number,
+ -unused_entries);
+ journal->partial_write_in_progress = !block_is_full(journal);
+ }
+
+ block_number = journal->slab->journal_origin +
+ (header->sequence_number % journal->size);
+ vio->completion.parent = journal;
+
+ /*
+ * This block won't be read in recovery until the slab summary is updated to refer to it.
+ * The slab summary update does a flush which is sufficient to protect us from corruption
+ * due to out of order slab journal, reference block, or block map writes.
+ */
+ vdo_submit_metadata_vio(vdo_forget(vio), block_number, write_slab_journal_endio,
+ complete_write, REQ_OP_WRITE);
+
+ /*
+ * Since the write is submitted (from the copy made in the vio's buffer above), the tail
+ * block structure can be reused.
+ */
+ journal->tail++;
+ initialize_tail_block(journal);
+ journal->waiting_to_commit = false;
+
+ operation = vdo_get_admin_state_code(&journal->slab->state);
+ if (operation == VDO_ADMIN_STATE_WAITING_FOR_RECOVERY) {
+ vdo_finish_operation(&journal->slab->state,
+ (vdo_is_read_only(journal->slab->allocator->depot->vdo) ?
+ VDO_READ_ONLY : VDO_SUCCESS));
+ return;
+ }
+
+ add_entries(journal);
+}
+
+/**
+ * commit_tail() - Commit the tail block of the slab journal.
+ * @journal: The journal whose tail block should be committed.
+ *
+ * Nothing is committed when the tail block is empty, when a commit is already waiting for a vio,
+ * or when the vdo is in read-only mode. Otherwise a vio is requested from the allocator's pool
+ * and write_slab_journal_block() is registered as the callback to run once one is available.
+ */
+static void commit_tail(struct slab_journal *journal)
+{
+	struct block_allocator *allocator;
+
+	/*
+	 * The tail has no entries yet, but some are waiting to be made: put off the flush until
+	 * those entries are ready to be written.
+	 */
+	if ((journal->tail_header.entry_count == 0) && must_make_entries_to_flush(journal))
+		return;
+
+	allocator = journal->slab->allocator;
+
+	/* The journal is read-only, already committing, or has an empty tail: nothing to do. */
+	if (vdo_is_read_only(allocator->depot->vdo))
+		return;
+
+	if (journal->waiting_to_commit)
+		return;
+
+	if (journal->tail_header.entry_count == 0)
+		return;
+
+	/*
+	 * The tail block is about to be committed, so the recovery journal no longer needs to be
+	 * able to ask this journal to commit; take it off that ring.
+	 */
+	mark_slab_journal_clean(journal);
+
+	journal->waiting_to_commit = true;
+	journal->resource_waiter.callback = write_slab_journal_block;
+	acquire_vio_from_pool(allocator->vio_pool, &journal->resource_waiter);
+}
+
+/**
+ * encode_slab_journal_entry() - Encode a slab journal entry.
+ * @tail_header: The unpacked header for the block.
+ * @payload: The journal block payload to hold the entry.
+ * @sbn: The slab block number of the entry to encode.
+ * @operation: The type of the entry.
+ * @increment: True if this is an increment.
+ *
+ * The entry is stored at the index given by the header's current entry count, which is then
+ * incremented. Block map remapping entries additionally set their bit in the entry type bitmap;
+ * the bitmap is zeroed the first time such an entry is added to the block.
+ *
+ * Exposed for unit tests.
+ */
+static void encode_slab_journal_entry(struct slab_journal_block_header *tail_header,
+				      slab_journal_payload *payload,
+				      slab_block_number sbn,
+				      enum journal_operation operation,
+				      bool increment)
+{
+	journal_entry_count_t slot = tail_header->entry_count;
+
+	tail_header->entry_count = slot + 1;
+
+	if (operation == VDO_JOURNAL_BLOCK_MAP_REMAPPING) {
+		u8 type_bit = ((u8)1 << (slot % 8));
+
+		if (!tail_header->has_block_map_increments) {
+			memset(payload->full_entries.entry_types, 0,
+			       VDO_SLAB_JOURNAL_ENTRY_TYPES_SIZE);
+			tail_header->has_block_map_increments = true;
+		}
+
+		payload->full_entries.entry_types[slot / 8] |= type_bit;
+	}
+
+	vdo_pack_slab_journal_entry(&payload->entries[slot], sbn, increment);
+}
+
+/**
+ * expand_journal_point() - Convert a recovery journal journal_point which refers to both an
+ *                          increment and a decrement to a single point which refers to one or the
+ *                          other.
+ * @recovery_point: The journal point to convert.
+ * @increment: Whether the current entry is an increment.
+ *
+ * Each data_vio has only one recovery journal point, yet it may need to make both an increment
+ * and a decrement entry in the same slab journal. To tell the two apart, the expanded entry
+ * count is twice the recovery journal entry count for an increment, and one more than that for
+ * a decrement.
+ *
+ * Return: The expanded journal point
+ */
+static struct journal_point expand_journal_point(struct journal_point recovery_point,
+						 bool increment)
+{
+	struct journal_point expanded = recovery_point;
+
+	expanded.entry_count = (recovery_point.entry_count * 2) + (increment ? 0 : 1);
+	return expanded;
+}
+
+/**
+ * add_entry() - Actually add an entry to the slab journal, potentially firing off a write if a
+ *               block becomes full.
+ * @journal: The slab journal to append to.
+ * @pbn: The pbn being adjusted.
+ * @operation: The type of entry to make.
+ * @increment: True if this is an increment.
+ * @recovery_point: The expanded recovery point.
+ *
+ * If either sanity check fails, the vdo is put into read-only mode and no entry is added.
+ *
+ * This function is synchronous.
+ */
+static void add_entry(struct slab_journal *journal, physical_block_number_t pbn,
+		      enum journal_operation operation, bool increment,
+		      struct journal_point recovery_point)
+{
+	struct packed_slab_journal_block *block = journal->block;
+	slab_block_number sbn;
+	int result;
+
+	result = VDO_ASSERT(vdo_before_journal_point(&journal->tail_header.recovery_point,
+						     &recovery_point),
+			    "recovery journal point is monotonically increasing, recovery point: %llu.%u, block recovery point: %llu.%u",
+			    (unsigned long long) recovery_point.sequence_number,
+			    recovery_point.entry_count,
+			    (unsigned long long) journal->tail_header.recovery_point.sequence_number,
+			    journal->tail_header.recovery_point.entry_count);
+	if (result != VDO_SUCCESS)
+		goto read_only;
+
+	if (operation == VDO_JOURNAL_BLOCK_MAP_REMAPPING) {
+		result = VDO_ASSERT((journal->tail_header.entry_count <
+				     journal->full_entries_per_block),
+				    "block has room for full entries");
+		if (result != VDO_SUCCESS)
+			goto read_only;
+	}
+
+	/* Entries are recorded relative to the start of the slab. */
+	sbn = pbn - journal->slab->start;
+	encode_slab_journal_entry(&journal->tail_header, &block->payload, sbn, operation,
+				  increment);
+	journal->tail_header.recovery_point = recovery_point;
+	if (block_is_full(journal))
+		commit_tail(journal);
+
+	return;
+
+read_only:
+	vdo_enter_read_only_mode(journal->slab->allocator->depot->vdo, result);
+}
+
+/* The distance, in blocks, from the journal's head to its tail. */
+static inline block_count_t journal_length(const struct slab_journal *journal)
+{
+	block_count_t length = journal->tail - journal->head;
+
+	return length;
+}
+
+/**
+ * vdo_attempt_replay_into_slab() - Replay a recovery journal entry into a slab's journal.
+ * @slab: The slab to play into.
+ * @pbn: The PBN for the entry.
+ * @operation: The type of entry to add.
+ * @increment: True if this entry is an increment.
+ * @recovery_point: The recovery journal point corresponding to this entry.
+ * @parent: The completion to notify when there is space to add the entry if the entry could not be
+ * added immediately.
+ *
+ * Return: true if the entry was added immediately, or if it was skipped because its expanded
+ * journal point is not after the tail block's current recovery point; false if the tail
+ * block is waiting to commit, in which case the slab starts a
+ * VDO_ADMIN_STATE_WAITING_FOR_RECOVERY operation with @parent as its waiter.
+ */
+bool vdo_attempt_replay_into_slab(struct vdo_slab *slab, physical_block_number_t pbn,
+ enum journal_operation operation, bool increment,
+ struct journal_point *recovery_point,
+ struct vdo_completion *parent)
+{
+ struct slab_journal *journal = &slab->journal;
+ struct slab_journal_block_header *header = &journal->tail_header;
+ struct journal_point expanded = expand_journal_point(*recovery_point, increment);
+
+ /* Only accept entries after the current recovery point. */
+ if (!vdo_before_journal_point(&journal->tail_header.recovery_point, &expanded))
+ return true;
+
+ if ((header->entry_count >= journal->full_entries_per_block) &&
+ (header->has_block_map_increments || (operation == VDO_JOURNAL_BLOCK_MAP_REMAPPING))) {
+ /*
+ * The tail block does not have room for the entry we are attempting to add so
+ * commit the tail block now.
+ */
+ commit_tail(journal);
+ }
+
+ if (journal->waiting_to_commit) {
+ vdo_start_operation_with_waiter(&journal->slab->state,
+ VDO_ADMIN_STATE_WAITING_FOR_RECOVERY,
+ parent, NULL);
+ return false;
+ }
+
+ if (journal_length(journal) >= journal->size) {
+ /*
+ * We must have reaped the current head before the crash, since the blocked
+ * threshold keeps us from having more entries than fit in a slab journal; hence we
+ * can just advance the head (and unreapable block), as needed.
+ */
+ journal->head++;
+ journal->unreapable++;
+ }
+
+ if (journal->slab->status == VDO_SLAB_REBUILT)
+ journal->slab->status = VDO_SLAB_REPLAYING;
+
+ add_entry(journal, pbn, operation, increment, expanded);
+ return true;
+}
+
+/**
+ * requires_reaping() - Check whether the journal must be reaped before adding new entries.
+ * @journal: The journal to check.
+ *
+ * Return: true if the journal's length has reached its blocking threshold.
+ */
+static bool requires_reaping(const struct slab_journal *journal)
+{
+	block_count_t length = journal_length(journal);
+
+	return (length >= journal->blocking_threshold);
+}
+
+/**
+ * finish_summary_update() - A waiter callback that resets the writing state of a slab.
+ * @waiter: The summary waiter embedded in the slab.
+ * @context: A pointer to the int result of the summary update.
+ *
+ * Any result other than VDO_SUCCESS or VDO_READ_ONLY is logged and puts the vdo into read-only
+ * mode. In every case the slab is then checked for having drained.
+ */
+static void finish_summary_update(struct vdo_waiter *waiter, void *context)
+{
+	int result = *((int *) context);
+	struct vdo_slab *slab = container_of(waiter, struct vdo_slab, summary_waiter);
+	bool benign = ((result == VDO_SUCCESS) || (result == VDO_READ_ONLY));
+
+	slab->active_count--;
+
+	if (!benign) {
+		vdo_log_error_strerror(result, "failed to update slab summary");
+		vdo_enter_read_only_mode(slab->allocator->depot->vdo, result);
+	}
+
+	check_if_slab_drained(slab);
+}
+
+static void write_reference_block(struct vdo_waiter *waiter, void *context);
+
+/**
+ * launch_reference_block_write() - Launch the write of a dirty reference block by first acquiring
+ *                                  a VIO for it from the pool.
+ * @waiter: The waiter of the block which is starting to write.
+ * @context: The parent slab of the block.
+ *
+ * Nothing is launched if the vdo is in read-only mode. Otherwise the block is flagged as writing,
+ * the slab's active count is incremented, and a VIO is requested from the pool. This can be
+ * asynchronous since the writer will have to wait if all VIOs in the pool are currently in use.
+ */
+static void launch_reference_block_write(struct vdo_waiter *waiter, void *context)
+{
+	struct vdo_slab *slab = context;
+	struct reference_block *block;
+
+	if (vdo_is_read_only(slab->allocator->depot->vdo))
+		return;
+
+	block = container_of(waiter, struct reference_block, waiter);
+	slab->active_count++;
+	block->is_writing = true;
+	waiter->callback = write_reference_block;
+	acquire_vio_from_pool(slab->allocator->vio_pool, waiter);
+}
+
+/*
+ * Launch a write for every reference block waiting on the slab's dirty queue, then check
+ * whether the slab has drained.
+ */
+static void save_dirty_reference_blocks(struct vdo_slab *slab)
+{
+	vdo_waitq_notify_all_waiters(&slab->dirty_blocks, launch_reference_block_write,
+				     slab);
+	check_if_slab_drained(slab);
+}
+
+/**
+ * finish_reference_block_write() - After a reference block has written, clean it, release its
+ * locks, and return its VIO to the pool.
+ * @completion: The VIO that just finished writing.
+ *
+ * If the block was dirtied again while it was being written, it is put back on the slab's dirty
+ * queue. If instead the slab has no active operations and no dirty blocks left, a slab summary
+ * update is started which marks the slab as clean; finish_summary_update() is its callback.
+ */
+static void finish_reference_block_write(struct vdo_completion *completion)
+{
+ struct vio *vio = as_vio(completion);
+ struct pooled_vio *pooled = vio_as_pooled_vio(vio);
+ struct reference_block *block = completion->parent;
+ struct vdo_slab *slab = block->slab;
+ tail_block_offset_t offset;
+
+ slab->active_count--;
+
+ /* Release the slab journal lock. */
+ adjust_slab_journal_block_reference(&slab->journal,
+ block->slab_journal_lock_to_release, -1);
+ return_vio_to_pool(slab->allocator->vio_pool, pooled);
+
+ /*
+ * We can't clear the is_writing flag earlier as releasing the slab journal lock may cause
+ * us to be dirtied again, but we don't want to double enqueue.
+ */
+ block->is_writing = false;
+
+ if (vdo_is_read_only(completion->vdo)) {
+ check_if_slab_drained(slab);
+ return;
+ }
+
+ /* Re-queue the block if it was re-dirtied while it was writing. */
+ if (block->is_dirty) {
+ vdo_waitq_enqueue_waiter(&block->slab->dirty_blocks, &block->waiter);
+ if (vdo_is_state_draining(&slab->state)) {
+ /* We must be saving, and this block will otherwise not be relaunched. */
+ save_dirty_reference_blocks(slab);
+ }
+
+ return;
+ }
+
+ /*
+ * Mark the slab as clean in the slab summary if there are no dirty or writing blocks
+ * and no summary update in progress.
+ */
+ if ((slab->active_count > 0) || vdo_waitq_has_waiters(&slab->dirty_blocks)) {
+ check_if_slab_drained(slab);
+ return;
+ }
+
+ offset = slab->allocator->summary_entries[slab->slab_number].tail_block_offset;
+ slab->active_count++;
+ slab->summary_waiter.callback = finish_summary_update;
+ update_slab_summary_entry(slab, &slab->summary_waiter, offset,
+ true, true, slab->free_blocks);
+}
+
+/**
+ * get_reference_counters_for_block() - Find the reference counters for a given block.
+ * @block: The reference_block in question.
+ *
+ * The block's position in its slab's reference_blocks array selects the matching run of
+ * COUNTS_PER_BLOCK entries in the slab's counters array.
+ *
+ * Return: A pointer to the reference counters for this block.
+ */
+static vdo_refcount_t * __must_check get_reference_counters_for_block(struct reference_block *block)
+{
+	struct vdo_slab *slab = block->slab;
+	size_t block_index = block - slab->reference_blocks;
+
+	return slab->counters + (block_index * COUNTS_PER_BLOCK);
+}
+
+/**
+ * pack_reference_block() - Copy data from a reference block to a buffer ready to be written out.
+ * @block: The block to copy.
+ * @buffer: The char buffer to fill with the packed block.
+ *
+ * Every sector of the packed block receives the same commit point, taken from the slab's
+ * current slab journal point, followed by that sector's share of the reference counters.
+ */
+static void pack_reference_block(struct reference_block *block, void *buffer)
+{
+	struct packed_reference_block *packed = buffer;
+	vdo_refcount_t *sector_counts = get_reference_counters_for_block(block);
+	struct packed_journal_point commit_point;
+	sector_count_t sector;
+
+	vdo_pack_journal_point(&block->slab->slab_journal_point, &commit_point);
+
+	for (sector = 0; sector < VDO_SECTORS_PER_BLOCK; sector++) {
+		packed->sectors[sector].commit_point = commit_point;
+		memcpy(packed->sectors[sector].counts, sector_counts,
+		       (sizeof(vdo_refcount_t) * COUNTS_PER_SECTOR));
+		sector_counts += COUNTS_PER_SECTOR;
+	}
+}
+
+/*
+ * Bio completion handler for reference block writes. Continues the vio in
+ * finish_reference_block_write() on the thread identified by the owning allocator's thread_id.
+ */
+static void write_reference_block_endio(struct bio *bio)
+{
+	struct vio *vio = bio->bi_private;
+	struct reference_block *block = vio->completion.parent;
+
+	continue_vio_after_io(vio, finish_reference_block_write,
+			      block->slab->allocator->thread_id);
+}
+
+/**
+ * handle_io_error() - Handle an I/O error reading or writing a reference count block.
+ * @completion: The VIO doing the I/O as a completion.
+ *
+ * The error is recorded against the VIO before the VIO is returned to the pool. The vdo is then
+ * put into read-only mode and the slab is checked for having drained.
+ */
+static void handle_io_error(struct vdo_completion *completion)
+{
+	struct vio *vio = as_vio(completion);
+	struct reference_block *block = completion->parent;
+	struct vdo_slab *slab = block->slab;
+	int result = completion->result;
+
+	vio_record_metadata_io_error(vio);
+	return_vio_to_pool(slab->allocator->vio_pool, vio_as_pooled_vio(vio));
+	slab->active_count--;
+	vdo_enter_read_only_mode(slab->allocator->depot->vdo, result);
+	check_if_slab_drained(slab);
+}
+
+/**
+ * write_reference_block() - After a dirty block waiter has gotten a VIO from the VIO pool, copy
+ * its counters and associated data into the VIO, and launch the write.
+ * @waiter: The waiter of the dirty block.
+ * @context: The VIO returned by the pool.
+ *
+ * The block's current slab journal lock is saved as the lock to release when the write
+ * finishes, and the block is marked clean before the write is submitted.
+ */
+static void write_reference_block(struct vdo_waiter *waiter, void *context)
+{
+ size_t block_offset;
+ physical_block_number_t pbn;
+ struct pooled_vio *pooled = context;
+ struct vdo_completion *completion = &pooled->vio.completion;
+ struct reference_block *block = container_of(waiter, struct reference_block,
+ waiter);
+
+ pack_reference_block(block, pooled->vio.data);
+ block_offset = (block - block->slab->reference_blocks);
+ pbn = (block->slab->ref_counts_origin + block_offset);
+ block->slab_journal_lock_to_release = block->slab_journal_lock;
+ completion->parent = block;
+
+ /*
+ * Mark the block as clean, since we won't be committing any updates that happen after this
+ * moment. As long as VIO order is preserved, two VIOs updating this block at once will not
+ * cause complications.
+ */
+ block->is_dirty = false;
+
+ /*
+ * Count this write in the allocator's statistics. The write submitted below carries
+ * REQ_PREFLUSH: flush before writing to ensure that the recovery journal and slab journal
+ * entries which cover this reference update are stable. This prevents data corruption that
+ * can be caused by out of order writes.
+ */
+ WRITE_ONCE(block->slab->allocator->ref_counts_statistics.blocks_written,
+ block->slab->allocator->ref_counts_statistics.blocks_written + 1);
+
+ completion->callback_thread_id = ((struct block_allocator *) pooled->context)->thread_id;
+ vdo_submit_metadata_vio(&pooled->vio, pbn, write_reference_block_endio,
+ handle_io_error, REQ_OP_WRITE | REQ_PREFLUSH);
+}
+
+/*
+ * Once the slab journal has grown past its flushing threshold, launch writes of dirty reference
+ * blocks. Below the flushing deadline only a fraction of the dirty blocks (at least one) is
+ * written; at or past the deadline all of them are.
+ */
+static void reclaim_journal_space(struct slab_journal *journal)
+{
+	struct vdo_slab *slab = journal->slab;
+	block_count_t length = journal_length(journal);
+	block_count_t write_count = vdo_waitq_num_waiters(&slab->dirty_blocks);
+	block_count_t remaining;
+
+	if (length < journal->flushing_threshold)
+		return;
+
+	if (write_count == 0)
+		return;
+
+	/* The slab journal is over the first threshold, schedule some reference block writes. */
+	WRITE_ONCE(journal->events->flush_count, journal->events->flush_count + 1);
+	if (length < journal->flushing_deadline) {
+		/* Schedule more writes the closer to the deadline we get. */
+		write_count /= journal->flushing_deadline - length + 1;
+		write_count = max_t(block_count_t, write_count, 1);
+	}
+
+	for (remaining = write_count; remaining > 0; remaining--) {
+		vdo_waitq_notify_next_waiter(&slab->dirty_blocks,
+					     launch_reference_block_write, slab);
+	}
+}
+
+/**
+ * reference_count_to_status() - Convert a reference count to a reference status.
+ * @count: The count to convert.
+ *
+ * Return: RS_FREE for EMPTY_REFERENCE_COUNT, RS_SINGLE for a count of 1, RS_PROVISIONAL for
+ *         PROVISIONAL_REFERENCE_COUNT, and RS_SHARED for any other count.
+ */
+static enum reference_status __must_check reference_count_to_status(vdo_refcount_t count)
+{
+	if (count == EMPTY_REFERENCE_COUNT)
+		return RS_FREE;
+
+	if (count == 1)
+		return RS_SINGLE;
+
+	if (count == PROVISIONAL_REFERENCE_COUNT)
+		return RS_PROVISIONAL;
+
+	return RS_SHARED;
+}
+
+/**
+ * dirty_block() - Mark a reference count block as dirty, potentially adding it to the dirty queue
+ *                 if it wasn't already dirty.
+ * @block: The reference block to mark as dirty.
+ *
+ * A block which is currently being written is marked dirty but is not enqueued here.
+ */
+static void dirty_block(struct reference_block *block)
+{
+	if (block->is_dirty)
+		return;
+
+	block->is_dirty = true;
+	if (block->is_writing)
+		return;
+
+	vdo_waitq_enqueue_waiter(&block->slab->dirty_blocks, &block->waiter);
+}
+
+/**
+ * get_reference_block() - Get the reference block that covers the given block index.
+ * @slab: The slab whose reference blocks are being indexed.
+ * @index: The slab block number to look up.
+ *
+ * Return: The reference block at position index / COUNTS_PER_BLOCK in the slab's array.
+ */
+static struct reference_block * __must_check get_reference_block(struct vdo_slab *slab,
+								 slab_block_number index)
+{
+	size_t block_index = index / COUNTS_PER_BLOCK;
+
+	return slab->reference_blocks + block_index;
+}
+
+/**
+ * slab_block_number_from_pbn() - Determine the index within the slab of a particular physical
+ *                                block number.
+ * @slab: The slab.
+ * @pbn: The physical block number.
+ * @slab_block_number_ptr: A pointer to the slab block number; only written on success.
+ *
+ * Return: VDO_SUCCESS, or VDO_OUT_OF_RANGE if the pbn is before the start of the slab or at or
+ *         beyond the slab's configured number of data blocks.
+ */
+static int __must_check slab_block_number_from_pbn(struct vdo_slab *slab,
+						   physical_block_number_t pbn,
+						   slab_block_number *slab_block_number_ptr)
+{
+	u64 offset;
+
+	if (pbn < slab->start)
+		return VDO_OUT_OF_RANGE;
+
+	offset = pbn - slab->start;
+	if (offset < slab->allocator->depot->slab_config.data_blocks) {
+		*slab_block_number_ptr = offset;
+		return VDO_SUCCESS;
+	}
+
+	return VDO_OUT_OF_RANGE;
+}
+
+/**
+ * get_reference_counter() - Get the reference counter that covers the given physical block number.
+ * @slab: The slab to query.
+ * @pbn: The physical block number.
+ * @counter_ptr: A pointer to the reference counter; only written on success.
+ *
+ * Return: VDO_SUCCESS, or the error from slab_block_number_from_pbn().
+ */
+static int __must_check get_reference_counter(struct vdo_slab *slab,
+					      physical_block_number_t pbn,
+					      vdo_refcount_t **counter_ptr)
+{
+	slab_block_number index;
+	int result;
+
+	result = slab_block_number_from_pbn(slab, pbn, &index);
+	if (result == VDO_SUCCESS)
+		*counter_ptr = slab->counters + index;
+
+	return result;
+}
+
+/*
+ * Compute the allocation priority of a slab from its free block count.
+ *
+ * Priority 0 is reserved for slabs with no free blocks at all.
+ *
+ * A slab which has never been opened (its journal is blank) gets the allocator's reserved
+ * unopened_slab_priority. That places it below previously opened slabs which still have a
+ * significant number of free blocks, so VDO avoids writing physical blocks for the first time
+ * unless few previously written blocks are free. Since VDO doesn't discard blocks currently,
+ * reusing previously written blocks makes VDO a better client of any underlying storage that is
+ * thinly-provisioned (though discarding would be better).
+ *
+ * Every other slab gets a priority derived from the logarithm of its free block count, so slabs
+ * with the same order of magnitude of free blocks share a priority. With 2^23 blocks, the
+ * priority will range from 1 to 25. The reserved unopened_slab_priority divides that range and
+ * is skipped by the logarithmic mapping.
+ */
+static unsigned int calculate_slab_priority(struct vdo_slab *slab)
+{
+	block_count_t free_blocks = slab->free_blocks;
+	unsigned int reserved_priority = slab->allocator->unopened_slab_priority;
+	unsigned int priority;
+
+	if (free_blocks == 0)
+		return 0;
+
+	if (is_slab_journal_blank(slab))
+		return reserved_priority;
+
+	priority = (1 + ilog2(free_blocks));
+	if (priority >= reserved_priority)
+		priority++;
+
+	return priority;
+}
+
+/*
+ * Compute a slab's priority and enqueue it in its allocator's priority table. The priority is
+ * essentially an approximation of the number of free blocks in the slab, so slabs with lots of
+ * free blocks will be opened for allocation before slabs that have few free blocks.
+ */
+static void prioritize_slab(struct vdo_slab *slab)
+{
+	struct block_allocator *allocator = slab->allocator;
+
+	VDO_ASSERT_LOG_ONLY(list_empty(&slab->allocq_entry),
+			    "a slab must not already be on a ring when prioritizing");
+	slab->priority = calculate_slab_priority(slab);
+	vdo_priority_table_enqueue(allocator->prioritized_slabs, slab->priority,
+				   &slab->allocq_entry);
+}
+
+/**
+ * adjust_free_block_count() - Adjust the free block count and (if needed) reprioritize the slab.
+ * @slab: The slab whose free block count changed.
+ * @incremented: true if the free block count went up.
+ *
+ * The allocator's allocated block count moves in the opposite direction from the free count.
+ */
+static void adjust_free_block_count(struct vdo_slab *slab, bool incremented)
+{
+	struct block_allocator *allocator = slab->allocator;
+
+	if (incremented)
+		WRITE_ONCE(allocator->allocated_blocks, allocator->allocated_blocks - 1);
+	else
+		WRITE_ONCE(allocator->allocated_blocks, allocator->allocated_blocks + 1);
+
+	/*
+	 * The open slab doesn't need to be reprioritized until it is closed, and there is no
+	 * point touching the priority table when the priority has not changed.
+	 */
+	if ((slab == allocator->open_slab) ||
+	    (slab->priority == calculate_slab_priority(slab)))
+		return;
+
+	/* Take the slab out of the table and enqueue it again under its new priority. */
+	vdo_priority_table_remove(allocator->prioritized_slabs, &slab->allocq_entry);
+	prioritize_slab(slab);
+}
+
+/**
+ * increment_for_data() - Increment the reference count for a data block.
+ * @slab: The slab which owns the block.
+ * @block: The reference block which contains the block being updated.
+ * @block_number: The block to update.
+ * @old_status: The reference status of the data block before this increment.
+ * @lock: The pbn_lock associated with this increment (may be NULL).
+ * @counter_ptr: A pointer to the count for the data block (in, out).
+ * @adjust_block_count: Whether to update the allocator's free block count.
+ *
+ * A free or provisionally referenced block ends up with a count of 1; a free block is also
+ * accounted as newly allocated. Any other block has its count raised by one unless it has
+ * already reached MAXIMUM_REFERENCE_COUNT.
+ *
+ * Return: VDO_SUCCESS or an error.
+ */
+static int increment_for_data(struct vdo_slab *slab, struct reference_block *block,
+			      slab_block_number block_number,
+			      enum reference_status old_status,
+			      struct pbn_lock *lock, vdo_refcount_t *counter_ptr,
+			      bool adjust_block_count)
+{
+	if (old_status == RS_FREE) {
+		*counter_ptr = 1;
+		block->allocated_count++;
+		slab->free_blocks--;
+		if (adjust_block_count)
+			adjust_free_block_count(slab, false);
+	} else if (old_status == RS_PROVISIONAL) {
+		*counter_ptr = 1;
+	} else {
+		/* Single or shared */
+		if (*counter_ptr >= MAXIMUM_REFERENCE_COUNT) {
+			return vdo_log_error_strerror(VDO_REF_COUNT_INVALID,
+						      "Incrementing a block already having 254 references (slab %u, offset %u)",
+						      slab->slab_number, block_number);
+		}
+
+		(*counter_ptr)++;
+	}
+
+	if (lock != NULL)
+		vdo_unassign_pbn_lock_provisional_reference(lock);
+
+	return VDO_SUCCESS;
+}
+
+/**
+ * decrement_for_data() - Decrement the reference count for a data block.
+ * @slab: The slab which owns the block.
+ * @block: The reference block which contains the block being updated.
+ * @block_number: The block to update.
+ * @old_status: The reference status of the data block before this decrement.
+ * @updater: The reference updater doing this operation in case we need to look up the pbn lock.
+ * @counter_ptr: A pointer to the count for the data block (in, out).
+ * @adjust_block_count: Whether to update the allocator's free block count.
+ *
+ * A shared block simply has its count lowered by one. A provisional or singly referenced block
+ * becomes free, unless the updater's physical zone holds a pbn lock on it, in which case the
+ * count becomes a provisional reference assigned to that lock.
+ *
+ * Return: VDO_SUCCESS or an error.
+ */
+static int decrement_for_data(struct vdo_slab *slab, struct reference_block *block,
+			      slab_block_number block_number,
+			      enum reference_status old_status,
+			      struct reference_updater *updater,
+			      vdo_refcount_t *counter_ptr, bool adjust_block_count)
+{
+	struct pbn_lock *lock;
+
+	if (old_status == RS_FREE) {
+		return vdo_log_error_strerror(VDO_REF_COUNT_INVALID,
+					      "Decrementing free block at offset %u in slab %u",
+					      block_number, slab->slab_number);
+	}
+
+	if ((old_status != RS_PROVISIONAL) && (old_status != RS_SINGLE)) {
+		/* Shared */
+		(*counter_ptr)--;
+		return VDO_SUCCESS;
+	}
+
+	if (updater->zpbn.zone != NULL) {
+		lock = vdo_get_physical_zone_pbn_lock(updater->zpbn.zone, updater->zpbn.pbn);
+		if (lock != NULL) {
+			/*
+			 * There is a read lock on this block, so the block must not become
+			 * unreferenced.
+			 */
+			*counter_ptr = PROVISIONAL_REFERENCE_COUNT;
+			vdo_assign_pbn_lock_provisional_reference(lock);
+			return VDO_SUCCESS;
+		}
+	}
+
+	*counter_ptr = EMPTY_REFERENCE_COUNT;
+	block->allocated_count--;
+	slab->free_blocks++;
+	if (adjust_block_count)
+		adjust_free_block_count(slab, true);
+
+	return VDO_SUCCESS;
+}
+
+/**
+ * increment_for_block_map() - Increment the reference count for a block map page.
+ * @slab: The slab which owns the block.
+ * @block: The reference block which contains the block being updated.
+ * @block_number: The block to update.
+ * @old_status: The reference status of the block before this increment.
+ * @lock: The pbn_lock associated with this increment (may be NULL).
+ * @normal_operation: Whether we are in normal operation vs. recovery or rebuild.
+ * @counter_ptr: A pointer to the count for the block (in, out).
+ * @adjust_block_count: Whether to update the allocator's free block count.
+ *
+ * All block map increments should be from provisional to MAXIMUM_REFERENCE_COUNT. Since block map
+ * blocks never dedupe they should never be adjusted from any other state. The adjustment always
+ * results in MAXIMUM_REFERENCE_COUNT as this value is used to prevent dedupe against block map
+ * blocks.
+ *
+ * Outside of normal operation the expected starting state is free rather than provisional.
+ *
+ * Return: VDO_SUCCESS or an error.
+ */
+static int increment_for_block_map(struct vdo_slab *slab, struct reference_block *block,
+				   slab_block_number block_number,
+				   enum reference_status old_status,
+				   struct pbn_lock *lock, bool normal_operation,
+				   vdo_refcount_t *counter_ptr, bool adjust_block_count)
+{
+	if (old_status == RS_FREE) {
+		if (normal_operation) {
+			return vdo_log_error_strerror(VDO_REF_COUNT_INVALID,
+						      "Incrementing unallocated block map block (slab %u, offset %u)",
+						      slab->slab_number, block_number);
+		}
+
+		*counter_ptr = MAXIMUM_REFERENCE_COUNT;
+		block->allocated_count++;
+		slab->free_blocks--;
+		if (adjust_block_count)
+			adjust_free_block_count(slab, false);
+
+		return VDO_SUCCESS;
+	}
+
+	if (old_status == RS_PROVISIONAL) {
+		if (!normal_operation) {
+			return vdo_log_error_strerror(VDO_REF_COUNT_INVALID,
+						      "Block map block had provisional reference during replay (slab %u, offset %u)",
+						      slab->slab_number, block_number);
+		}
+
+		*counter_ptr = MAXIMUM_REFERENCE_COUNT;
+		if (lock != NULL)
+			vdo_unassign_pbn_lock_provisional_reference(lock);
+
+		return VDO_SUCCESS;
+	}
+
+	return vdo_log_error_strerror(VDO_REF_COUNT_INVALID,
+				      "Incrementing a block map block which is already referenced %u times (slab %u, offset %u)",
+				      *counter_ptr, slab->slab_number,
+				      block_number);
+}
+
+/* A journal point is valid when it is non-NULL and has a sequence number greater than zero. */
+static bool __must_check is_valid_journal_point(const struct journal_point *point)
+{
+	if (point == NULL)
+		return false;
+
+	return (point->sequence_number > 0);
+}
+
+/**
+ * update_reference_count() - Update the reference count of a block.
+ * @slab: The slab which owns the block.
+ * @block: The reference block which contains the block being updated.
+ * @block_number: The block to update.
+ * @slab_journal_point: The slab journal point at which this update is journaled.
+ * @updater: The reference updater.
+ * @normal_operation: Whether we are in normal operation vs. recovery or rebuild.
+ * @adjust_block_count: Whether to update the slab's free block count.
+ * @provisional_decrement_ptr: A pointer which will be set to true if this update was a decrement
+ *                             of a provisional reference (may be NULL).
+ *
+ * A successful decrement of a provisional reference returns without recording the slab journal
+ * point. Every other successful update records the point in the slab if the point is valid.
+ *
+ * Return: VDO_SUCCESS or an error.
+ */
+static int update_reference_count(struct vdo_slab *slab, struct reference_block *block,
+				  slab_block_number block_number,
+				  const struct journal_point *slab_journal_point,
+				  struct reference_updater *updater,
+				  bool normal_operation, bool adjust_block_count,
+				  bool *provisional_decrement_ptr)
+{
+	vdo_refcount_t *counter_ptr = &slab->counters[block_number];
+	enum reference_status old_status = reference_count_to_status(*counter_ptr);
+	int result;
+
+	if (updater->increment) {
+		if (updater->operation == VDO_JOURNAL_DATA_REMAPPING) {
+			result = increment_for_data(slab, block, block_number, old_status,
+						    updater->lock, counter_ptr,
+						    adjust_block_count);
+		} else {
+			result = increment_for_block_map(slab, block, block_number,
+							 old_status, updater->lock,
+							 normal_operation, counter_ptr,
+							 adjust_block_count);
+		}
+	} else {
+		result = decrement_for_data(slab, block, block_number, old_status,
+					    updater, counter_ptr, adjust_block_count);
+		if ((result == VDO_SUCCESS) && (old_status == RS_PROVISIONAL)) {
+			if (provisional_decrement_ptr != NULL)
+				*provisional_decrement_ptr = true;
+
+			return VDO_SUCCESS;
+		}
+	}
+
+	if (result != VDO_SUCCESS)
+		return result;
+
+	if (is_valid_journal_point(slab_journal_point))
+		slab->slab_journal_point = *slab_journal_point;
+
+	return VDO_SUCCESS;
+}
+
+/**
+ * adjust_reference_count() - Apply a reference update to an open slab and manage the slab
+ *                            journal lock associated with the update.
+ * @slab: The slab which owns the block being updated.
+ * @updater: The reference updater describing the change.
+ * @slab_journal_point: The slab journal point of the entry for this update.
+ *
+ * Return: VDO_SUCCESS, VDO_INVALID_ADMIN_STATE if the slab is not open, or another error.
+ */
+static int __must_check adjust_reference_count(struct vdo_slab *slab,
+					       struct reference_updater *updater,
+					       const struct journal_point *slab_journal_point)
+{
+	slab_block_number block_number;
+	int result;
+	struct reference_block *block;
+	bool provisional_decrement = false;
+
+	if (!is_slab_open(slab))
+		return VDO_INVALID_ADMIN_STATE;
+
+	result = slab_block_number_from_pbn(slab, updater->zpbn.pbn, &block_number);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	block = get_reference_block(slab, block_number);
+	result = update_reference_count(slab, block, block_number, slab_journal_point,
+					updater, NORMAL_OPERATION, true,
+					&provisional_decrement);
+	/* Nothing more to do on failure or when a provisional reference was decremented. */
+	if ((result != VDO_SUCCESS) || provisional_decrement)
+		return result;
+
+	if (block->is_dirty && (block->slab_journal_lock > 0)) {
+		/*
+		 * This block is already dirty and a slab journal entry has been made for it since
+		 * the last time it was clean. We must release the per-entry slab journal lock for
+		 * the entry associated with the update we are now doing.
+		 */
+		result = VDO_ASSERT(is_valid_journal_point(slab_journal_point),
+				    "Reference count adjustments need slab journal points.");
+		if (result != VDO_SUCCESS)
+			return result;
+
+		/* Only read the journal point once the assertion has shown it to be valid. */
+		adjust_slab_journal_block_reference(&slab->journal,
+						    slab_journal_point->sequence_number, -1);
+		return VDO_SUCCESS;
+	}
+
+	/*
+	 * This may be the first time we are applying an update for which there is a slab journal
+	 * entry to this block since the block was cleaned. Therefore, we convert the per-entry
+	 * slab journal lock to an uncommitted reference block lock, if there is a per-entry lock.
+	 */
+	if (is_valid_journal_point(slab_journal_point))
+		block->slab_journal_lock = slab_journal_point->sequence_number;
+	else
+		block->slab_journal_lock = 0;
+
+	dirty_block(block);
+	return VDO_SUCCESS;
+}
+
+/**
+ * add_entry_from_waiter() - Make a slab journal entry on behalf of a waiting reference updater.
+ * @waiter: The waiter embedded in the reference updater which should make an entry now.
+ * @context: The slab journal to make an entry in.
+ *
+ * This callback is invoked by add_entries() once it has determined that we are ready to make
+ * another entry in the slab journal. Implements waiter_callback_fn.
+ */
+static void add_entry_from_waiter(struct vdo_waiter *waiter, void *context)
+{
+	struct slab_journal *journal = context;
+	struct slab_journal_block_header *tail = &journal->tail_header;
+	struct reference_updater *updater =
+		container_of(waiter, struct reference_updater, waiter);
+	struct data_vio *data_vio = data_vio_from_reference_updater(updater);
+	sequence_number_t recovery_block = data_vio->recovery_journal_point.sequence_number;
+	/* Capture the current tail position now, ahead of the add_entry() call below. */
+	struct journal_point entry_point = {
+		.sequence_number = tail->sequence_number,
+		.entry_count = tail->entry_count,
+	};
+	int result;
+
+	if (tail->entry_count == 0) {
+		/*
+		 * This is the first entry in the current tail block, so get a lock on the recovery
+		 * journal which we will hold until this tail block is committed.
+		 */
+		struct journal_lock *lock = get_lock(journal, tail->sequence_number);
+
+		lock->recovery_start = recovery_block;
+		if (journal->recovery_journal != NULL) {
+			vdo_acquire_recovery_journal_block_reference(journal->recovery_journal,
+								     recovery_block,
+								     VDO_ZONE_TYPE_PHYSICAL,
+								     journal->slab->allocator->zone_number);
+		}
+
+		mark_slab_journal_dirty(journal, recovery_block);
+		reclaim_journal_space(journal);
+	}
+
+	add_entry(journal, updater->zpbn.pbn, updater->operation, updater->increment,
+		  expand_journal_point(data_vio->recovery_journal_point,
+				       updater->increment));
+
+	if (journal->slab->status == VDO_SLAB_REBUILT) {
+		/* Now that an entry has been made in the slab journal, update the counter. */
+		result = adjust_reference_count(journal->slab, updater, &entry_point);
+	} else {
+		/*
+		 * If the slab is unrecovered, scrubbing will take care of the count since the
+		 * update is now recorded in the journal.
+		 */
+		adjust_slab_journal_block_reference(journal, entry_point.sequence_number, -1);
+		result = VDO_SUCCESS;
+	}
+
+	if (updater->increment) {
+		continue_data_vio_with_error(data_vio, result);
+		return;
+	}
+
+	vdo_continue_completion(&data_vio->decrement_completion, result);
+}
+
+/**
+ * is_next_entry_a_block_map_increment() - Check whether the first waiting entry is for a block
+ *                                         map remapping.
+ * @journal: The journal whose entry waiters are examined.
+ *
+ * Return: true if the first entry waiter's operation is VDO_JOURNAL_BLOCK_MAP_REMAPPING.
+ */
+static inline bool is_next_entry_a_block_map_increment(struct slab_journal *journal)
+{
+	struct vdo_waiter *first = vdo_waitq_get_first_waiter(&journal->entry_waiters);
+
+	return (container_of(first, struct reference_updater, waiter)->operation ==
+		VDO_JOURNAL_BLOCK_MAP_REMAPPING);
+}
+
+/**
+ * add_entries() - Add as many entries as possible from the queue of vios waiting to make entries.
+ * @journal: The journal to which entries may be added.
+ *
+ * By processing the queue in order, we ensure that slab journal entries are made in the same order
+ * as recovery journal entries for the same increment or decrement.
+ *
+ * The adding_entries flag makes a nested call return immediately. The loop stops early when a
+ * partial write is in progress, the slab is rebuilding, the journal is waiting to commit, the
+ * journal requires reaping, or the lock for a new tail block still has a non-zero count. Once the
+ * loop ends, the tail block is committed if the slab is draining (but not suspending) and no
+ * waiters remain.
+ */
+static void add_entries(struct slab_journal *journal)
+{
+ if (journal->adding_entries) {
+ /* Protect against re-entrancy. */
+ return;
+ }
+
+ journal->adding_entries = true;
+ while (vdo_waitq_has_waiters(&journal->entry_waiters)) {
+ struct slab_journal_block_header *header = &journal->tail_header;
+
+ if (journal->partial_write_in_progress ||
+ (journal->slab->status == VDO_SLAB_REBUILDING)) {
+ /*
+ * Don't add entries while rebuilding or while a partial write is
+ * outstanding, as it could result in reference count corruption.
+ */
+ break;
+ }
+
+ if (journal->waiting_to_commit) {
+ /*
+ * If we are waiting for resources to write the tail block, and the tail
+ * block is full, we can't make another entry.
+ */
+ WRITE_ONCE(journal->events->tail_busy_count,
+ journal->events->tail_busy_count + 1);
+ break;
+ } else if (is_next_entry_a_block_map_increment(journal) &&
+ (header->entry_count >= journal->full_entries_per_block)) {
+ /*
+ * The tail block does not have room for a block map increment, so commit
+ * it now.
+ */
+ commit_tail(journal);
+ if (journal->waiting_to_commit) {
+ WRITE_ONCE(journal->events->tail_busy_count,
+ journal->events->tail_busy_count + 1);
+ break;
+ }
+ }
+
+ /* If the slab is over the blocking threshold, make the vio wait. */
+ if (requires_reaping(journal)) {
+ WRITE_ONCE(journal->events->blocked_count,
+ journal->events->blocked_count + 1);
+ save_dirty_reference_blocks(journal->slab);
+ break;
+ }
+
+ if (header->entry_count == 0) {
+ struct journal_lock *lock =
+ get_lock(journal, header->sequence_number);
+
+ /*
+ * Check if the on disk slab journal is full. Because of the blocking and
+ * scrubbing thresholds, this should never happen.
+ */
+ if (lock->count > 0) {
+ VDO_ASSERT_LOG_ONLY((journal->head + journal->size) == journal->tail,
+ "New block has locks, but journal is not full");
+
+ /*
+ * The blocking threshold must let the journal fill up if the new
+ * block has locks; if the blocking threshold is smaller than the
+ * journal size, the new block cannot possibly have locks already.
+ */
+ VDO_ASSERT_LOG_ONLY((journal->blocking_threshold >= journal->size),
+ "New block can have locks already iff blocking threshold is at the end of the journal");
+
+ WRITE_ONCE(journal->events->disk_full_count,
+ journal->events->disk_full_count + 1);
+ save_dirty_reference_blocks(journal->slab);
+ break;
+ }
+
+ /*
+ * Don't allow the new block to be reaped until all of the reference count
+ * blocks are written and the journal block has been fully committed as
+ * well.
+ */
+ lock->count = journal->entries_per_block + 1;
+
+ if (header->sequence_number == 1) {
+ struct vdo_slab *slab = journal->slab;
+ block_count_t i;
+
+ /*
+ * This is the first entry in this slab journal, ever. Dirty all of
+ * the reference count blocks, recording sequence number 1 as each
+ * block's slab journal lock. Then adjust the reference on journal
+ * block 1 by the number of reference blocks so that the journal
+ * won't be reaped until the reference counts are initialized.
+ */
+ for (i = 0; i < slab->reference_block_count; i++) {
+ slab->reference_blocks[i].slab_journal_lock = 1;
+ dirty_block(&slab->reference_blocks[i]);
+ }
+
+ adjust_slab_journal_block_reference(journal, 1,
+ slab->reference_block_count);
+ }
+ }
+
+ vdo_waitq_notify_next_waiter(&journal->entry_waiters,
+ add_entry_from_waiter, journal);
+ }
+
+ journal->adding_entries = false;
+
+ /* If there are no waiters, and we are flushing or saving, commit the tail block. */
+ if (vdo_is_state_draining(&journal->slab->state) &&
+ !vdo_is_state_suspending(&journal->slab->state) &&
+ !vdo_waitq_has_waiters(&journal->entry_waiters))
+ commit_tail(journal);
+}
+
+/**
+ * reset_search_cursor() - Point the free block search at the first reference counter of the
+ *                         first reference block of a slab.
+ * @slab: The slab whose search cursor is to be reset.
+ */
+static void reset_search_cursor(struct vdo_slab *slab)
+{
+	struct search_cursor *cursor = &slab->search_cursor;
+
+	cursor->index = 0;
+	cursor->block = cursor->first_block;
+	/* Unit tests have slabs with only one reference block (and it's a runt). */
+	cursor->end_index = min_t(u32, COUNTS_PER_BLOCK, slab->block_count);
+}
+
+/**
+ * advance_search_cursor() - Advance the search cursor to the start of the next reference block in
+ *                           a slab.
+ * @slab: The slab whose search cursor is to be advanced.
+ *
+ * Wraps around to the first reference block if the current block is the last reference block.
+ *
+ * Return: true unless the cursor was at the last reference block.
+ */
+static bool advance_search_cursor(struct vdo_slab *slab)
+{
+	struct search_cursor *cursor = &slab->search_cursor;
+	const bool was_at_last_block = (cursor->block == cursor->last_block);
+
+	if (was_at_last_block) {
+		/* The last reference block was just searched, so wrap back to the start. */
+		reset_search_cursor(slab);
+		return false;
+	}
+
+	/* Not at the end yet: the next block's range begins where the current one stopped. */
+	cursor->index = cursor->end_index;
+	cursor->block++;
+
+	if (cursor->block != cursor->last_block) {
+		cursor->end_index += COUNTS_PER_BLOCK;
+	} else {
+		/* The last reference block will usually be a runt. */
+		cursor->end_index = slab->block_count;
+	}
+
+	return true;
+}
+
+/**
+ * vdo_adjust_reference_count_for_rebuild() - Adjust the reference count of a block during rebuild.
+ * @depot: The slab depot containing the block.
+ * @pbn: The physical block number of the block to adjust.
+ * @operation: The journal operation to apply; it is always applied as an increment.
+ *
+ * The update is made without a slab journal point and without adjusting the slab's free block
+ * count. On success the containing reference block is marked dirty.
+ *
+ * Return: VDO_SUCCESS or an error.
+ */
+int vdo_adjust_reference_count_for_rebuild(struct slab_depot *depot,
+					   physical_block_number_t pbn,
+					   enum journal_operation operation)
+{
+	struct reference_updater updater = {
+		.increment = true,
+		.operation = operation,
+	};
+	struct vdo_slab *slab = vdo_get_slab(depot, pbn);
+	struct reference_block *block;
+	slab_block_number block_number;
+	int result;
+
+	result = slab_block_number_from_pbn(slab, pbn, &block_number);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	block = get_reference_block(slab, block_number);
+	result = update_reference_count(slab, block, block_number, NULL,
+					&updater, !NORMAL_OPERATION, false, NULL);
+	if (result == VDO_SUCCESS)
+		dirty_block(block);
+
+	return result;
+}
+
+/**
+ * replay_reference_count_change() - Replay the reference count adjustment from a slab journal
+ *                                   entry into the reference count for a block.
+ * @slab: The slab.
+ * @entry_point: The slab journal point for the entry.
+ * @entry: The slab journal entry being replayed.
+ *
+ * The adjustment is skipped unless the commit point of the sector holding the counter is before
+ * @entry_point, since otherwise the entry was already recorded in the reference count.
+ *
+ * Return: VDO_SUCCESS or an error code.
+ */
+static int replay_reference_count_change(struct vdo_slab *slab,
+					 const struct journal_point *entry_point,
+					 struct slab_journal_entry entry)
+{
+	struct reference_block *block = get_reference_block(slab, entry.sbn);
+	sector_count_t sector = (entry.sbn % COUNTS_PER_BLOCK) / COUNTS_PER_SECTOR;
+	struct reference_updater updater = {
+		.increment = entry.increment,
+		.operation = entry.operation,
+	};
+	int result;
+
+	/* This entry is already reflected in the existing counts, so do nothing. */
+	if (!vdo_before_journal_point(&block->commit_points[sector], entry_point))
+		return VDO_SUCCESS;
+
+	/* This entry is not yet counted in the reference counts. */
+	result = update_reference_count(slab, block, entry.sbn, entry_point,
+					&updater, !NORMAL_OPERATION, false, NULL);
+	if (result == VDO_SUCCESS)
+		dirty_block(block);
+
+	return result;
+}
+
+/**
+ * find_zero_byte_in_word() - Find the array index of the first zero byte in a word-sized range of
+ *                            reference counters.
+ * @word_ptr: A pointer to the counter bytes to check.
+ * @start_index: The array index corresponding to word_ptr[0].
+ * @fail_index: The array index to return if no zero byte is found.
+ *
+ * The search does no bounds checking; the function relies on the array being sufficiently padded.
+ *
+ * Return: The array index of the first zero byte in the word, or the value passed as fail_index if
+ *         no zero byte was found.
+ */
+static inline slab_block_number find_zero_byte_in_word(const u8 *word_ptr,
+							slab_block_number start_index,
+							slab_block_number fail_index)
+{
+	/*
+	 * The word is read with get_unaligned_le64(), so the low-order byte of the value is
+	 * word_ptr[0] and each shift by eight bits moves on to the next counter.
+	 */
+	u64 remaining = get_unaligned_le64(word_ptr);
+	unsigned int byte;
+
+	/* This looks like a loop, but GCC will unroll the iterations for us. */
+	for (byte = 0; byte < BYTES_PER_WORD; byte++, remaining >>= 8) {
+		if ((remaining & 0xFF) == 0)
+			return (start_index + byte);
+	}
+
+	return fail_index;
+}
+
+/**
+ * find_free_block() - Find the first block with a reference count of zero in the range of
+ *                     reference counter indexes described by the slab's search cursor.
+ * @slab: The slab counters to scan.
+ * @index_ptr: A pointer to hold the array index of the free block.
+ *
+ * The scan covers the counters from the cursor's index up to, but not including, its end_index.
+ *
+ * Return: true if a free block was found in the specified range.
+ */
+static bool find_free_block(const struct vdo_slab *slab, slab_block_number *index_ptr)
+{
+	const slab_block_number limit = slab->search_cursor.end_index;
+	slab_block_number position = slab->search_cursor.index;
+	slab_block_number found;
+	u8 *cursor = &slab->counters[position];
+	u8 *stop = &slab->counters[limit];
+
+	/*
+	 * Always examine the first word, starting at the cursor index. (Array is padded so
+	 * reading past end is safe.)
+	 */
+	found = find_zero_byte_in_word(cursor, position, limit);
+	if (found < limit) {
+		*index_ptr = found;
+		return true;
+	}
+
+	/*
+	 * Check a word at a time until we find a word containing a zero. The first check is kept
+	 * outside this loop on purpose: merging the two into a do loop runs slower because a jump
+	 * instruction gets added at the start of the iteration. On architectures where unaligned
+	 * word access is expensive, this would be a good place to advance to an alignment
+	 * boundary first.
+	 */
+	for (position += BYTES_PER_WORD, cursor += BYTES_PER_WORD; cursor < stop;
+	     position += BYTES_PER_WORD, cursor += BYTES_PER_WORD) {
+		found = find_zero_byte_in_word(cursor, position, limit);
+		if (found < limit) {
+			*index_ptr = found;
+			return true;
+		}
+	}
+
+	return false;
+}
+
+/**
+ * search_current_reference_block() - Search the reference block currently saved in the search
+ *                                    cursor for a reference count of zero, starting at the saved
+ *                                    counter index.
+ * @slab: The slab to search.
+ * @free_index_ptr: A pointer to receive the array index of the zero reference count.
+ *
+ * Return: true if an unreferenced counter was found.
+ */
+static bool search_current_reference_block(const struct vdo_slab *slab,
+					   slab_block_number *free_index_ptr)
+{
+	/* Don't bother searching if the current block is known to be full. */
+	if (slab->search_cursor.block->allocated_count >= COUNTS_PER_BLOCK)
+		return false;
+
+	return find_free_block(slab, free_index_ptr);
+}
+
+/**
+ * search_reference_blocks() - Search each reference block for a reference count of zero.
+ * @slab: The slab to search.
+ * @free_index_ptr: A pointer to receive the array index of the zero reference count.
+ *
+ * Searches each reference block for a reference count of zero, starting at the reference block and
+ * counter index saved in the search cursor and searching up to the end of the last reference
+ * block. The search does not wrap.
+ *
+ * Return: true if an unreferenced counter was found.
+ */
+static bool search_reference_blocks(struct vdo_slab *slab,
+				    slab_block_number *free_index_ptr)
+{
+	/*
+	 * Begin at the saved search position in the current block, then move through each
+	 * following reference block until the cursor reports that it was on the last one.
+	 */
+	do {
+		if (search_current_reference_block(slab, free_index_ptr))
+			return true;
+	} while (advance_search_cursor(slab));
+
+	return false;
+}
+
+/**
+ * make_provisional_reference() - Do the bookkeeping for making a provisional reference.
+ * @slab: The slab which owns the block.
+ * @block_number: The block being provisionally referenced.
+ */
+static void make_provisional_reference(struct vdo_slab *slab,
+				       slab_block_number block_number)
+{
+	struct reference_block *owner = get_reference_block(slab, block_number);
+
+	/*
+	 * Make the initial transition from an unreferenced block to a
+	 * provisionally allocated block.
+	 */
+	slab->counters[block_number] = PROVISIONAL_REFERENCE_COUNT;
+
+	/* Account for the allocation in both the slab and the reference block. */
+	slab->free_blocks--;
+	owner->allocated_count++;
+}
+
+/**
+ * dirty_all_reference_blocks() - Mark all reference count blocks in a slab as dirty.
+ * @slab: The slab whose reference blocks are to be dirtied.
+ */
+static void dirty_all_reference_blocks(struct vdo_slab *slab)
+{
+	block_count_t which = 0;
+
+	while (which < slab->reference_block_count) {
+		dirty_block(&slab->reference_blocks[which]);
+		which++;
+	}
+}
+
+/**
+ * clear_provisional_references() - Clear the provisional reference counts from a reference block.
+ * @block: The block to clear.
+ *
+ * Each provisional counter becomes empty and the block's allocated count drops by one for it.
+ */
+static void clear_provisional_references(struct reference_block *block)
+{
+	vdo_refcount_t *counters = get_reference_counters_for_block(block);
+	block_count_t slot;
+
+	for (slot = 0; slot < COUNTS_PER_BLOCK; slot++) {
+		if (counters[slot] != PROVISIONAL_REFERENCE_COUNT)
+			continue;
+
+		counters[slot] = EMPTY_REFERENCE_COUNT;
+		block->allocated_count--;
+	}
+}
+
+/* Two journal points are equal when both the sequence number and the entry count match. */
+static inline bool journal_points_equal(struct journal_point first,
+					struct journal_point second)
+{
+	if (first.sequence_number != second.sequence_number)
+		return false;
+
+	return (first.entry_count == second.entry_count);
+}
+
+/**
+ * unpack_reference_block() - Unpack a packed reference counts block into the internal memory
+ *                            structure.
+ * @packed: The written reference block to be unpacked.
+ * @block: The internal reference block to be loaded.
+ *
+ * Copies each sector's commit point and counters, advances the slab's journal point to the latest
+ * commit point seen, warns when a sector's commit point differs from that of sector 0, and then
+ * recomputes the block's allocated count.
+ */
+static void unpack_reference_block(struct packed_reference_block *packed,
+				   struct reference_block *block)
+{
+	struct vdo_slab *slab = block->slab;
+	vdo_refcount_t *counters = get_reference_counters_for_block(block);
+	sector_count_t i;
+	block_count_t index;
+
+	for (i = 0; i < VDO_SECTORS_PER_BLOCK; i++) {
+		struct packed_reference_sector *sector = &packed->sectors[i];
+		struct journal_point *commit_point = &block->commit_points[i];
+
+		vdo_unpack_journal_point(&sector->commit_point, commit_point);
+		memcpy(counters + (i * COUNTS_PER_SECTOR), sector->counts,
+		       (sizeof(vdo_refcount_t) * COUNTS_PER_SECTOR));
+
+		/* The slab_journal_point must be the latest point found in any sector. */
+		if (vdo_before_journal_point(&slab->slab_journal_point, commit_point))
+			slab->slab_journal_point = *commit_point;
+
+		if (i == 0)
+			continue;
+
+		if (!journal_points_equal(block->commit_points[0], *commit_point)) {
+			size_t block_index = block - slab->reference_blocks;
+
+			vdo_log_warning("Torn write detected in sector %u of reference block %zu of slab %u",
+					i, block_index, slab->slab_number);
+		}
+	}
+
+	/* Count every counter which is not empty as allocated. */
+	block->allocated_count = 0;
+	for (index = 0; index < COUNTS_PER_BLOCK; index++) {
+		if (counters[index] == EMPTY_REFERENCE_COUNT)
+			continue;
+
+		block->allocated_count++;
+	}
+}
+
+/**
+ * finish_reference_block_load() - After a reference block has been read, unpack it.
+ * @completion: The VIO that just finished reading.
+ *
+ * Also returns the vio to the pool, drops provisional references from the loaded block, deducts
+ * the block's allocated count from the slab's free blocks, and checks whether the slab has
+ * drained.
+ */
+static void finish_reference_block_load(struct vdo_completion *completion)
+{
+	struct reference_block *block = completion->parent;
+	struct vdo_slab *slab = block->slab;
+	struct vio *vio = as_vio(completion);
+	struct pooled_vio *pooled = vio_as_pooled_vio(vio);
+	struct packed_reference_block *packed = (struct packed_reference_block *) vio->data;
+
+	unpack_reference_block(packed, block);
+	return_vio_to_pool(slab->allocator->vio_pool, pooled);
+	slab->active_count--;
+	clear_provisional_references(block);
+
+	slab->free_blocks -= block->allocated_count;
+	check_if_slab_drained(slab);
+}
+
+/* Bio completion for a reference block read: continue on the owning allocator's thread. */
+static void load_reference_block_endio(struct bio *bio)
+{
+	struct vio *vio = bio->bi_private;
+	struct reference_block *block = vio->completion.parent;
+	struct vdo_slab *slab = block->slab;
+
+	continue_vio_after_io(vio, finish_reference_block_load, slab->allocator->thread_id);
+}
+
+/**
+ * load_reference_block() - After a block waiter has gotten a VIO from the VIO pool, load the
+ *                          block.
+ * @waiter: The waiter of the block to load.
+ * @context: The VIO returned by the pool.
+ *
+ * The block is read from the slab's ref_counts_origin plus the block's index within the slab's
+ * reference block array.
+ */
+static void load_reference_block(struct vdo_waiter *waiter, void *context)
+{
+	struct reference_block *block =
+		container_of(waiter, struct reference_block, waiter);
+	struct vdo_slab *slab = block->slab;
+	struct pooled_vio *pooled = context;
+	struct vio *vio = &pooled->vio;
+	size_t block_offset = (block - slab->reference_blocks);
+
+	vio->completion.parent = block;
+	vdo_submit_metadata_vio(vio, slab->ref_counts_origin + block_offset,
+				load_reference_block_endio, handle_io_error,
+				REQ_OP_READ);
+}
+
+/**
+ * load_reference_blocks() - Load a slab's reference blocks from the underlying storage into a
+ *                           pre-allocated reference counter.
+ * @slab: The slab whose reference blocks are to be loaded.
+ *
+ * The free block count starts at the slab's block count, and every reference block is counted as
+ * active before any vio is requested for it.
+ */
+static void load_reference_blocks(struct vdo_slab *slab)
+{
+	block_count_t which;
+
+	slab->free_blocks = slab->block_count;
+	slab->active_count = slab->reference_block_count;
+
+	for (which = 0; which < slab->reference_block_count; which++) {
+		struct reference_block *block = &slab->reference_blocks[which];
+
+		block->waiter.callback = load_reference_block;
+		acquire_vio_from_pool(slab->allocator->vio_pool, &block->waiter);
+	}
+}
+
+/**
+ * drain_slab() - Drain all reference count I/O.
+ * @slab: The slab to drain.
+ *
+ * Depending upon the type of drain being performed (as recorded in the slab's admin state), the
+ * reference blocks may be loaded from disk or dirty reference blocks may be written out.
+ */
+static void drain_slab(struct vdo_slab *slab)
+{
+	const struct admin_state_code *state = vdo_get_admin_state_code(&slab->state);
+	bool load;
+
+	if (state == VDO_ADMIN_STATE_SUSPENDING)
+		return;
+
+	if (!((state == VDO_ADMIN_STATE_REBUILDING) ||
+	      (state == VDO_ADMIN_STATE_SAVE_FOR_SCRUBBING)))
+		commit_tail(&slab->journal);
+
+	if ((state == VDO_ADMIN_STATE_RECOVERING) || (slab->counters == NULL))
+		return;
+
+	load = slab->allocator->summary_entries[slab->slab_number].load_ref_counts;
+
+	if (state == VDO_ADMIN_STATE_SCRUBBING) {
+		if (load)
+			load_reference_blocks(slab);
+		return;
+	}
+
+	if (state == VDO_ADMIN_STATE_SAVE_FOR_SCRUBBING) {
+		/* These reference counts were never written, so mark them all dirty. */
+		if (!load)
+			dirty_all_reference_blocks(slab);
+
+		save_dirty_reference_blocks(slab);
+		return;
+	}
+
+	if (state == VDO_ADMIN_STATE_REBUILDING) {
+		/*
+		 * Write out the counters if the slab has written them before, or it has any
+		 * non-zero reference counts, or there are any slab journal blocks.
+		 */
+		block_count_t data_blocks = slab->allocator->depot->slab_config.data_blocks;
+
+		if (load || (slab->free_blocks != data_blocks) ||
+		    !is_slab_journal_blank(slab)) {
+			dirty_all_reference_blocks(slab);
+			save_dirty_reference_blocks(slab);
+		}
+		return;
+	}
+
+	if (state == VDO_ADMIN_STATE_SAVING) {
+		if (slab->status == VDO_SLAB_REBUILT)
+			save_dirty_reference_blocks(slab);
+		return;
+	}
+
+	/* Any other drain type has no reference count I/O to do. */
+	vdo_finish_draining_with_result(&slab->state, VDO_SUCCESS);
+}
+
+/*
+ * Allocate the reference block array and the reference counter array for a slab, set up the
+ * slab's search cursor, and point each reference block back at the slab. Returns VDO_SUCCESS or
+ * an error; the reference block array is released again if the counter allocation fails.
+ */
+static int allocate_slab_counters(struct vdo_slab *slab)
+{
+	size_t counter_bytes;
+	size_t which;
+	int result;
+
+	result = VDO_ASSERT(slab->reference_blocks == NULL,
+			    "vdo_slab %u doesn't allocate refcounts twice",
+			    slab->slab_number);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	result = vdo_allocate(slab->reference_block_count, struct reference_block,
+			      __func__, &slab->reference_blocks);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	/*
+	 * Allocate such that the runt slab has a full-length memory array, plus a little padding
+	 * so we can word-search even at the very end.
+	 */
+	counter_bytes = (slab->reference_block_count * COUNTS_PER_BLOCK) + (2 * BYTES_PER_WORD);
+	result = vdo_allocate(counter_bytes, vdo_refcount_t, "ref counts array",
+			      &slab->counters);
+	if (result != VDO_SUCCESS) {
+		vdo_free(vdo_forget(slab->reference_blocks));
+		return result;
+	}
+
+	for (which = 0; which < slab->reference_block_count; which++) {
+		slab->reference_blocks[which] = (struct reference_block) {
+			.slab = slab,
+		};
+	}
+
+	slab->search_cursor.first_block = slab->reference_blocks;
+	slab->search_cursor.last_block = &slab->reference_blocks[slab->reference_block_count - 1];
+	reset_search_cursor(slab);
+
+	return VDO_SUCCESS;
+}
+
+/* Allocate the slab's counters only when the slab's state is a clean load. */
+static int allocate_counters_if_clean(struct vdo_slab *slab)
+{
+	if (!vdo_is_state_clean_load(&slab->state))
+		return VDO_SUCCESS;
+
+	return allocate_slab_counters(slab);
+}
+
+/*
+ * Completion callback for the slab journal tail read: adopt the tail block header if it belongs
+ * to this slab journal, return the vio to the pool, and finish loading the slab.
+ */
+static void finish_loading_journal(struct vdo_completion *completion)
+{
+	struct vio *vio = as_vio(completion);
+	struct slab_journal *journal = completion->parent;
+	struct vdo_slab *slab = journal->slab;
+	struct packed_slab_journal_block *packed = (struct packed_slab_journal_block *) vio->data;
+	struct slab_journal_block_header header;
+	bool recognized;
+	int result;
+
+	vdo_unpack_slab_journal_block_header(&packed->header, &header);
+
+	/* FIXME: should it be an error if the following conditional fails? */
+	recognized = ((header.metadata_type == VDO_METADATA_SLAB_JOURNAL) &&
+		      (header.nonce == slab->allocator->nonce));
+	if (recognized) {
+		journal->tail = header.sequence_number + 1;
+
+		/*
+		 * If the slab is clean, this implies the slab journal is empty, so advance the
+		 * head appropriately.
+		 */
+		if (slab->allocator->summary_entries[slab->slab_number].is_dirty)
+			journal->head = header.head;
+		else
+			journal->head = journal->tail;
+
+		journal->tail_header = header;
+		initialize_journal_state(journal);
+	}
+
+	return_vio_to_pool(slab->allocator->vio_pool, vio_as_pooled_vio(vio));
+	result = allocate_counters_if_clean(slab);
+	vdo_finish_loading_with_result(&slab->state, result);
+}
+
+/* Bio completion for the slab journal tail read: continue on the owning allocator's thread. */
+static void read_slab_journal_tail_endio(struct bio *bio)
+{
+	struct vio *vio = bio->bi_private;
+	struct slab_journal *journal = vio->completion.parent;
+	struct vdo_slab *slab = journal->slab;
+
+	continue_vio_after_io(vio, finish_loading_journal, slab->allocator->thread_id);
+}
+
+/*
+ * Error handler for the slab journal tail read: record the I/O error, return the vio to the
+ * pool, and finish loading the slab with the completion's result.
+ */
+static void handle_load_error(struct vdo_completion *completion)
+{
+	struct slab_journal *journal = completion->parent;
+	struct vdo_slab *slab = journal->slab;
+	struct vio *vio = as_vio(completion);
+	int result = completion->result;
+
+	vio_record_metadata_io_error(vio);
+	return_vio_to_pool(slab->allocator->vio_pool, vio_as_pooled_vio(vio));
+	vdo_finish_loading_with_result(&slab->state, result);
+}
+
+/**
+ * read_slab_journal_tail() - Read the slab journal tail block by using a vio acquired from the vio
+ *                            pool.
+ * @waiter: The vio pool waiter which has just been notified.
+ * @context: The vio pool entry given to the waiter.
+ *
+ * This is the success callback from acquire_vio_from_pool() when loading a slab journal.
+ */
+static void read_slab_journal_tail(struct vdo_waiter *waiter, void *context)
+{
+	struct slab_journal *journal =
+		container_of(waiter, struct slab_journal, resource_waiter);
+	struct vdo_slab *slab = journal->slab;
+	struct pooled_vio *pooled = context;
+	struct vio *vio = &pooled->vio;
+	tail_block_offset_t last_commit_point =
+		slab->allocator->summary_entries[slab->slab_number].tail_block_offset;
+	tail_block_offset_t tail_block;
+
+	/*
+	 * Slab summary keeps the commit point offset, so the tail block is the block before that.
+	 * Calculation supports small journals in unit tests.
+	 */
+	if (last_commit_point == 0)
+		tail_block = (tail_block_offset_t)(journal->size - 1);
+	else
+		tail_block = last_commit_point - 1;
+
+	vio->completion.parent = journal;
+	vio->completion.callback_thread_id = slab->allocator->thread_id;
+	vdo_submit_metadata_vio(vio, slab->journal_origin + tail_block,
+				read_slab_journal_tail_endio, handle_load_error,
+				REQ_OP_READ);
+}
+
+/**
+ * load_slab_journal() - Load a slab's journal by reading the journal's tail.
+ * @slab: The slab whose journal is to be loaded.
+ *
+ * The read is skipped, and loading finishes immediately, when the slab summary records a tail
+ * block offset of zero and does not ask for the reference counts to be loaded.
+ */
+static void load_slab_journal(struct vdo_slab *slab)
+{
+	struct slab_journal *journal = &slab->journal;
+	tail_block_offset_t last_commit_point =
+		slab->allocator->summary_entries[slab->slab_number].tail_block_offset;
+
+	if ((last_commit_point != 0) ||
+	    slab->allocator->summary_entries[slab->slab_number].load_ref_counts) {
+		journal->resource_waiter.callback = read_slab_journal_tail;
+		acquire_vio_from_pool(slab->allocator->vio_pool, &journal->resource_waiter);
+		return;
+	}
+
+	/*
+	 * This slab claims that it has a tail block at (journal->size - 1), but a head of
+	 * 1. This is impossible, due to the scrubbing threshold, on a real system, so
+	 * don't bother reading the (bogus) data off disk.
+	 */
+	VDO_ASSERT_LOG_ONLY(((journal->size < 16) ||
+			     (journal->scrubbing_threshold < (journal->size - 1))),
+			    "Scrubbing threshold protects against reads of unwritten slab journal blocks");
+	vdo_finish_loading_with_result(&slab->state,
+				       allocate_counters_if_clean(slab));
+}
+
+/*
+ * Put a slab which requires scrubbing on one of its allocator's scrubber lists: the high
+ * priority list when high_priority is set, otherwise the ordinary list. A slab whose status is
+ * not VDO_SLAB_REQUIRES_SCRUBBING is left alone. The scrubber's slab count is bumped only the
+ * first time a given slab is queued.
+ */
+static void register_slab_for_scrubbing(struct vdo_slab *slab, bool high_priority)
+{
+	struct slab_scrubber *scrubber = &slab->allocator->scrubber;
+
+	VDO_ASSERT_LOG_ONLY((slab->status != VDO_SLAB_REBUILT),
+			    "slab to be scrubbed is unrecovered");
+
+	if (slab->status != VDO_SLAB_REQUIRES_SCRUBBING)
+		return;
+
+	list_del_init(&slab->allocq_entry);
+	if (!slab->was_queued_for_scrubbing) {
+		WRITE_ONCE(scrubber->slab_count, scrubber->slab_count + 1);
+		slab->was_queued_for_scrubbing = true;
+	}
+
+	if (!high_priority) {
+		list_add_tail(&slab->allocq_entry, &scrubber->slabs);
+		return;
+	}
+
+	slab->status = VDO_SLAB_REQUIRES_HIGH_PRIORITY_SCRUBBING;
+	list_add_tail(&slab->allocq_entry, &scrubber->high_priority_slabs);
+}
+
+/*
+ * Queue a slab for allocation or scrubbing. Nothing is queued when the vdo is read-only; an
+ * out-of-range free block count puts the vdo into read-only mode instead of queueing the slab.
+ */
+static void queue_slab(struct vdo_slab *slab)
+{
+	struct block_allocator *allocator = slab->allocator;
+	struct slab_depot *depot = allocator->depot;
+	block_count_t free_blocks;
+	int result;
+
+	VDO_ASSERT_LOG_ONLY(list_empty(&slab->allocq_entry),
+			    "a requeued slab must not already be on a ring");
+
+	if (vdo_is_read_only(depot->vdo))
+		return;
+
+	free_blocks = slab->free_blocks;
+	result = VDO_ASSERT((free_blocks <= depot->slab_config.data_blocks),
+			    "rebuilt slab %u must have a valid free block count (has %llu, expected maximum %llu)",
+			    slab->slab_number, (unsigned long long) free_blocks,
+			    (unsigned long long) depot->slab_config.data_blocks);
+	if (result != VDO_SUCCESS) {
+		vdo_enter_read_only_mode(depot->vdo, result);
+		return;
+	}
+
+	/* A slab which has not been rebuilt goes to the scrubber rather than the allocator. */
+	if (slab->status != VDO_SLAB_REBUILT) {
+		register_slab_for_scrubbing(slab, false);
+		return;
+	}
+
+	if (!vdo_is_state_resuming(&slab->state)) {
+		/*
+		 * If the slab is resuming, we've already accounted for it here, so don't do it
+		 * again.
+		 * FIXME: under what situation would the slab be resuming here?
+		 */
+		WRITE_ONCE(allocator->allocated_blocks,
+			   allocator->allocated_blocks - free_blocks);
+		if (!is_slab_journal_blank(slab)) {
+			WRITE_ONCE(allocator->statistics.slabs_opened,
+				   allocator->statistics.slabs_opened + 1);
+		}
+	}
+
+	if (depot->vdo->suspend_type == VDO_ADMIN_STATE_SAVING)
+		reopen_slab_journal(slab);
+
+	prioritize_slab(slab);
+}
+
+/**
+ * initiate_slab_action() - Initiate a slab action.
+ * @state: The admin state embedded in the slab on which to act.
+ *
+ * Dispatches on the state: a draining state drains the slab, a loading state loads the slab
+ * journal, and a resuming state queues the slab. Any other state finishes the operation with
+ * VDO_INVALID_ADMIN_STATE.
+ *
+ * Implements vdo_admin_initiator_fn.
+ */
+static void initiate_slab_action(struct admin_state *state)
+{
+	struct vdo_slab *slab = container_of(state, struct vdo_slab, state);
+
+	if (vdo_is_state_draining(state)) {
+		/* A scrubbing drain marks the slab as rebuilding before any I/O is started. */
+		if (vdo_get_admin_state_code(state) == VDO_ADMIN_STATE_SCRUBBING)
+			slab->status = VDO_SLAB_REBUILDING;
+
+		drain_slab(slab);
+		check_if_slab_drained(slab);
+	} else if (vdo_is_state_loading(state)) {
+		load_slab_journal(slab);
+	} else if (vdo_is_state_resuming(state)) {
+		queue_slab(slab);
+		vdo_finish_resuming(state);
+	} else {
+		vdo_finish_operation(state, VDO_INVALID_ADMIN_STATE);
+	}
+}
+
+/**
+ * get_next_slab() - Peek at the slab which should be scrubbed next.
+ * @scrubber: The slab scrubber.
+ *
+ * The high priority list is always consulted before the normal list. The slab is not removed
+ * from its list.
+ *
+ * Return: The next slab to scrub or NULL if there are none.
+ */
+static struct vdo_slab *get_next_slab(struct slab_scrubber *scrubber)
+{
+	struct vdo_slab *urgent =
+		list_first_entry_or_null(&scrubber->high_priority_slabs,
+					 struct vdo_slab, allocq_entry);
+
+	return (urgent != NULL) ?
+		urgent :
+		list_first_entry_or_null(&scrubber->slabs, struct vdo_slab,
+					 allocq_entry);
+}
+
+/**
+ * has_slabs_to_scrub() - Report whether either of a scrubber's lists holds a slab.
+ * @scrubber: The scrubber to check.
+ *
+ * Return: true if the scrubber has slabs to scrub.
+ */
+static inline bool __must_check has_slabs_to_scrub(struct slab_scrubber *scrubber)
+{
+	const struct vdo_slab *candidate = get_next_slab(scrubber);
+
+	return (candidate != NULL);
+}
+
+/**
+ * uninitialize_scrubber_vio() - Release the data buffer and components of the scrubber's vio.
+ * @scrubber: The scrubber.
+ */
+static void uninitialize_scrubber_vio(struct slab_scrubber *scrubber)
+{
+	/* Detach the buffer from the vio before freeing it. */
+	void *buffer = vdo_forget(scrubber->vio.data);
+
+	vdo_free(buffer);
+	free_vio_components(&scrubber->vio);
+}
+
+/**
+ * finish_scrubbing() - Stop scrubbing, either because there are no more slabs to scrub or because
+ * there's been an error.
+ * @scrubber: The scrubber.
+ * @result: The result to report to the parent completion when this was a high-priority-only
+ *          scrub; it is not used otherwise.
+ */
+static void finish_scrubbing(struct slab_scrubber *scrubber, int result)
+{
+ /* Sample the waiter state before anything below can change it. */
+ bool notify = vdo_waitq_has_waiters(&scrubber->waiters);
+ bool done = !has_slabs_to_scrub(scrubber);
+ struct block_allocator *allocator =
+ container_of(scrubber, struct block_allocator, scrubber);
+
+ /* With nothing left to scrub, the vio's buffer and components are released. */
+ if (done)
+ uninitialize_scrubber_vio(scrubber);
+
+ if (scrubber->high_priority_only) {
+ /* Report the outcome to the parent stored on the vio's completion, and forget it. */
+ scrubber->high_priority_only = false;
+ vdo_fail_completion(vdo_forget(scrubber->vio.completion.parent), result);
+ } else if (done && (atomic_add_return(-1, &allocator->depot->zones_to_scrub) == 0)) {
+ /* All of our slabs were scrubbed, and we're the last allocator to finish. */
+ enum vdo_state prior_state =
+ atomic_cmpxchg(&allocator->depot->vdo->state, VDO_RECOVERING,
+ VDO_DIRTY);
+
+ /*
+ * To be safe, even if the CAS failed, ensure anything that follows is ordered with
+ * respect to whatever state change did happen.
+ */
+ smp_mb__after_atomic();
+
+ /*
+ * We must check the VDO state here and not the depot's read_only_notifier since
+ * the compare-swap-above could have failed due to a read-only entry which our own
+ * thread does not yet know about.
+ */
+ if (prior_state == VDO_DIRTY)
+ vdo_log_info("VDO commencing normal operation");
+ else if (prior_state == VDO_RECOVERING)
+ vdo_log_info("Exiting recovery mode");
+ }
+
+ /*
+ * Note that the scrubber has stopped, and inform anyone who might be waiting for that to
+ * happen.
+ */
+ if (!vdo_finish_draining(&scrubber->admin_state))
+ WRITE_ONCE(scrubber->admin_state.current_state,
+ VDO_ADMIN_STATE_SUSPENDED);
+
+ /*
+ * We can't notify waiters until after we've finished draining or they'll just requeue.
+ * Fortunately if there were waiters, we can't have been freed yet.
+ */
+ if (notify)
+ vdo_waitq_notify_all_waiters(&scrubber->waiters, NULL, NULL);
+}
+
+static void scrub_next_slab(struct slab_scrubber *scrubber);
+
+/**
+ * slab_scrubbed() - Record that the scrubber's current slab is rebuilt and move on to the next.
+ * @completion: The slab rebuild completion.
+ *
+ * This callback is registered in apply_journal_entries(), and is also called directly from
+ * start_scrubbing() when the slab is not dirty.
+ */
+static void slab_scrubbed(struct vdo_completion *completion)
+{
+	struct vio *vio = as_vio(completion);
+	struct slab_scrubber *scrubber = container_of(vio, struct slab_scrubber, vio);
+	struct vdo_slab *scrubbed = scrubber->slab;
+
+	/* The slab must be marked rebuilt before it is queued for allocation. */
+	scrubbed->status = VDO_SLAB_REBUILT;
+	queue_slab(scrubbed);
+	reopen_slab_journal(scrubbed);
+
+	/* One fewer slab remains to be scrubbed. */
+	WRITE_ONCE(scrubber->slab_count, scrubber->slab_count - 1);
+	scrub_next_slab(scrubber);
+}
+
+/**
+ * abort_scrubbing() - Give up on scrubbing because of an error.
+ * @scrubber: The slab scrubber.
+ * @result: The error.
+ *
+ * The VDO is put into read-only mode before the scrubber is stopped.
+ */
+static void abort_scrubbing(struct slab_scrubber *scrubber, int result)
+{
+	struct vdo_completion *completion = &scrubber->vio.completion;
+
+	vdo_enter_read_only_mode(completion->vdo, result);
+	finish_scrubbing(scrubber, result);
+}
+
+/**
+ * handle_scrubber_error() - Error handler for the scrubber's vio.
+ * @completion: The slab rebuild completion.
+ *
+ * Records the metadata I/O error against the vio and then aborts scrubbing with the
+ * completion's result.
+ */
+static void handle_scrubber_error(struct vdo_completion *completion)
+{
+	struct vio *vio = as_vio(completion);
+	struct slab_scrubber *scrubber = container_of(vio, struct slab_scrubber, vio);
+
+	vio_record_metadata_io_error(vio);
+	abort_scrubbing(scrubber, completion->result);
+}
+
+/**
+ * apply_block_entries() - Apply all the entries in a block to the reference counts.
+ * @block: A block with entries to apply.
+ * @entry_count: The number of entries to apply.
+ * @block_number: The sequence number of the block.
+ * @slab: The slab to apply the entries to.
+ *
+ * Entries are applied in order. Processing stops at the first entry whose offset lies outside
+ * the slab or which cannot be replayed; entries applied before that point are not undone.
+ *
+ * Return: VDO_SUCCESS, VDO_CORRUPT_JOURNAL if an entry refers to a block outside the slab, or the
+ *         error from replaying an entry.
+ */
+static int apply_block_entries(struct packed_slab_journal_block *block,
+			       journal_entry_count_t entry_count,
+			       sequence_number_t block_number, struct vdo_slab *slab)
+{
+	struct journal_point entry_point = {
+		.sequence_number = block_number,
+		.entry_count = 0,
+	};
+	int result;
+	/* The number of blocks spanned by the slab; valid offsets are 0 to slab_size - 1. */
+	slab_block_number slab_size = slab->end - slab->start;
+
+	while (entry_point.entry_count < entry_count) {
+		struct slab_journal_entry entry =
+			vdo_decode_slab_journal_entry(block, entry_point.entry_count);
+
+		/*
+		 * An offset equal to the slab size is one past the last block, so it must be
+		 * rejected along with anything larger.
+		 */
+		if (entry.sbn >= slab_size) {
+			/* This entry is out of bounds. */
+			return vdo_log_error_strerror(VDO_CORRUPT_JOURNAL,
+						      "vdo_slab journal entry (%llu, %u) had invalid offset %u in slab (size %u blocks)",
+						      (unsigned long long) block_number,
+						      entry_point.entry_count,
+						      entry.sbn, slab_size);
+		}
+
+		result = replay_reference_count_change(slab, &entry_point, entry);
+		if (result != VDO_SUCCESS) {
+			vdo_log_error_strerror(result,
+					       "vdo_slab journal entry (%llu, %u) (%s of offset %u) could not be applied in slab %u",
+					       (unsigned long long) block_number,
+					       entry_point.entry_count,
+					       vdo_get_journal_operation_name(entry.operation),
+					       entry.sbn, slab->slab_number);
+			return result;
+		}
+		entry_point.entry_count++;
+	}
+
+	return VDO_SUCCESS;
+}
+
+/**
+ * apply_journal_entries() - Find the relevant vio of the slab journal and apply all valid entries.
+ * @completion: The metadata read vio completion.
+ *
+ * Walks the journal blocks from the head recorded in the last written block up to (but not
+ * including) the journal's tail, validating each block header and replaying its entries. On any
+ * validation or replay failure, scrubbing is aborted. On success, the slab is started on a
+ * VDO_ADMIN_STATE_SAVE_FOR_SCRUBBING operation with slab_scrubbed() as the callback.
+ *
+ * This is a callback registered in start_scrubbing().
+ */
+static void apply_journal_entries(struct vdo_completion *completion)
+{
+ int result;
+ struct slab_scrubber *scrubber =
+ container_of(as_vio(completion), struct slab_scrubber, vio);
+ struct vdo_slab *slab = scrubber->slab;
+ struct slab_journal *journal = &slab->journal;
+
+ /* Find the boundaries of the useful part of the journal. */
+ sequence_number_t tail = journal->tail;
+ /* NOTE(review): (tail - 1) presumes tail is at least 1 here -- confirm. */
+ tail_block_offset_t end_index = (tail - 1) % journal->size;
+ char *end_data = scrubber->vio.data + (end_index * VDO_BLOCK_SIZE);
+ struct packed_slab_journal_block *end_block =
+ (struct packed_slab_journal_block *) end_data;
+
+ /* The head is taken from the header of the block just before the tail. */
+ sequence_number_t head = __le64_to_cpu(end_block->header.head);
+ tail_block_offset_t head_index = head % journal->size;
+ block_count_t index = head_index;
+
+ struct journal_point ref_counts_point = slab->slab_journal_point;
+ struct journal_point last_entry_applied = ref_counts_point;
+ sequence_number_t sequence;
+
+ for (sequence = head; sequence < tail; sequence++) {
+ char *block_data = scrubber->vio.data + (index * VDO_BLOCK_SIZE);
+ struct packed_slab_journal_block *block =
+ (struct packed_slab_journal_block *) block_data;
+ struct slab_journal_block_header header;
+
+ vdo_unpack_slab_journal_block_header(&block->header, &header);
+
+ if ((header.nonce != slab->allocator->nonce) ||
+ (header.metadata_type != VDO_METADATA_SLAB_JOURNAL) ||
+ (header.sequence_number != sequence) ||
+ (header.entry_count > journal->entries_per_block) ||
+ (header.has_block_map_increments &&
+ (header.entry_count > journal->full_entries_per_block))) {
+ /* The block is not what we expect it to be. */
+ vdo_log_error("vdo_slab journal block for slab %u was invalid",
+ slab->slab_number);
+ abort_scrubbing(scrubber, VDO_CORRUPT_JOURNAL);
+ return;
+ }
+
+ result = apply_block_entries(block, header.entry_count, sequence, slab);
+ if (result != VDO_SUCCESS) {
+ abort_scrubbing(scrubber, result);
+ return;
+ }
+
+ last_entry_applied.sequence_number = sequence;
+ /*
+ * NOTE(review): a header with entry_count == 0 passes the checks above and would
+ * make this subtraction wrap -- confirm that blocks between head and tail always
+ * hold at least one entry.
+ */
+ last_entry_applied.entry_count = header.entry_count - 1;
+ /* Advance to the next block, wrapping at the end of the journal. */
+ index++;
+ if (index == journal->size)
+ index = 0;
+ }
+
+ /*
+ * At the end of rebuild, the reference counters should be accurate to the end of the
+ * journal we just applied.
+ */
+ result = VDO_ASSERT(!vdo_before_journal_point(&last_entry_applied,
+ &ref_counts_point),
+ "Refcounts are not more accurate than the slab journal");
+ if (result != VDO_SUCCESS) {
+ abort_scrubbing(scrubber, result);
+ return;
+ }
+
+ /* Save out the rebuilt reference blocks. */
+ vdo_prepare_completion(completion, slab_scrubbed, handle_scrubber_error,
+ slab->allocator->thread_id, completion->parent);
+ vdo_start_operation_with_waiter(&slab->state,
+ VDO_ADMIN_STATE_SAVE_FOR_SCRUBBING,
+ completion, initiate_slab_action);
+}
+
+/*
+ * Bio completion handler for the slab journal read submitted by start_scrubbing(); continues
+ * with apply_journal_entries() on the thread of the allocator which owns the slab.
+ */
+static void read_slab_journal_endio(struct bio *bio)
+{
+	struct vio *vio = bio->bi_private;
+	struct slab_scrubber *scrubber = container_of(vio, struct slab_scrubber, vio);
+	thread_id_t allocator_thread = scrubber->slab->allocator->thread_id;
+
+	continue_vio_after_io(vio, apply_journal_entries, allocator_thread);
+}
+
+/**
+ * start_scrubbing() - Read the current slab's journal from disk now that it has been flushed.
+ * @completion: The scrubber's vio completion.
+ *
+ * A slab whose summary entry is not dirty needs no journal replay and is treated as scrubbed
+ * immediately.
+ *
+ * This callback is registered in scrub_next_slab().
+ */
+static void start_scrubbing(struct vdo_completion *completion)
+{
+	struct vio *vio = as_vio(completion);
+	struct slab_scrubber *scrubber = container_of(vio, struct slab_scrubber, vio);
+	struct vdo_slab *slab = scrubber->slab;
+	bool needs_replay = slab->allocator->summary_entries[slab->slab_number].is_dirty;
+
+	if (needs_replay) {
+		vdo_submit_metadata_vio(&scrubber->vio, slab->journal_origin,
+					read_slab_journal_endio, handle_scrubber_error,
+					REQ_OP_READ);
+		return;
+	}
+
+	slab_scrubbed(completion);
+}
+
+/**
+ * scrub_next_slab() - Scrub the next slab if there is one.
+ * @scrubber: The scrubber.
+ *
+ * Scrubbing finishes instead if the VDO is read-only, if no slab remains, or if this is a
+ * high-priority-only scrub and the high priority list is empty.
+ */
+static void scrub_next_slab(struct slab_scrubber *scrubber)
+{
+ struct vdo_completion *completion = &scrubber->vio.completion;
+ struct vdo_slab *slab;
+
+ /*
+ * Note: this notify call is always safe only because scrubbing can only be started when
+ * the VDO is quiescent.
+ */
+ vdo_waitq_notify_all_waiters(&scrubber->waiters, NULL, NULL);
+
+ if (vdo_is_read_only(completion->vdo)) {
+ finish_scrubbing(scrubber, VDO_READ_ONLY);
+ return;
+ }
+
+ slab = get_next_slab(scrubber);
+ if ((slab == NULL) ||
+ (scrubber->high_priority_only && list_empty(&scrubber->high_priority_slabs))) {
+ finish_scrubbing(scrubber, VDO_SUCCESS);
+ return;
+ }
+
+ /*
+ * Presumably vdo_finish_draining() returns true only when a drain of the scrubber was in
+ * progress, in which case no further slab is started -- confirm against admin-state.c.
+ */
+ if (vdo_finish_draining(&scrubber->admin_state))
+ return;
+
+ /* Take the slab off its scrubbing list and make it the scrubber's current slab. */
+ list_del_init(&slab->allocq_entry);
+ scrubber->slab = slab;
+ vdo_prepare_completion(completion, start_scrubbing, handle_scrubber_error,
+ slab->allocator->thread_id, completion->parent);
+ vdo_start_operation_with_waiter(&slab->state, VDO_ADMIN_STATE_SCRUBBING,
+ completion, initiate_slab_action);
+}
+
+/**
+ * scrub_slabs() - Scrub all of an allocator's slabs that are eligible for scrubbing.
+ * @allocator: The block_allocator to scrub.
+ * @parent: The completion to notify when scrubbing is done, implies high_priority, may be NULL.
+ *
+ * When a parent is supplied, only high priority slabs are scrubbed. If such a scrub starts with
+ * an empty priority table and no high priority slabs, the next normal-priority slab is promoted.
+ */
+static void scrub_slabs(struct block_allocator *allocator, struct vdo_completion *parent)
+{
+	struct slab_scrubber *scrubber = &allocator->scrubber;
+	bool urgent_only = (parent != NULL);
+
+	scrubber->vio.completion.parent = parent;
+	scrubber->high_priority_only = urgent_only;
+
+	/* With nothing queued, there is no work to start. */
+	if (!has_slabs_to_scrub(scrubber)) {
+		finish_scrubbing(scrubber, VDO_SUCCESS);
+		return;
+	}
+
+	if (urgent_only &&
+	    vdo_is_priority_table_empty(allocator->prioritized_slabs) &&
+	    list_empty(&scrubber->high_priority_slabs)) {
+		register_slab_for_scrubbing(get_next_slab(scrubber), true);
+	}
+
+	vdo_resume_if_quiescent(&scrubber->admin_state);
+	scrub_next_slab(scrubber);
+}
+
+/*
+ * Log an assertion failure if the current callback thread is not the given thread.
+ * @thread_id is the thread the caller expects to be running on; @function_name is used only in
+ * the assertion message.
+ */
+static inline void assert_on_allocator_thread(thread_id_t thread_id,
+ const char *function_name)
+{
+ VDO_ASSERT_LOG_ONLY((vdo_get_callback_thread_id() == thread_id),
+ "%s called on correct thread", function_name);
+}
+
+/*
+ * Account for a slab in its allocator: the slab becomes the allocator's last slab and the
+ * allocator's slab count grows by one.
+ */
+static void register_slab_with_allocator(struct block_allocator *allocator,
+					 struct vdo_slab *slab)
+{
+	allocator->last_slab = slab->slab_number;
+	allocator->slab_count += 1;
+}
+
+/**
+ * get_depot_slab_iterator() - Return a slab_iterator over the slabs in a slab_depot.
+ * @depot: The depot over which to iterate.
+ * @start: The number of the slab to start iterating from.
+ * @end: The number of the last slab which may be returned.
+ * @stride: The difference in slab number between successive slabs.
+ *
+ * Iteration always occurs from higher to lower numbered slabs. The iterator starts out exhausted
+ * if the depot has no slab array or if @start is below @end.
+ *
+ * Return: An initialized iterator structure.
+ */
+static struct slab_iterator get_depot_slab_iterator(struct slab_depot *depot,
+						    slab_count_t start, slab_count_t end,
+						    slab_count_t stride)
+{
+	struct vdo_slab **slabs = depot->slabs;
+	struct vdo_slab *first = NULL;
+
+	if ((slabs != NULL) && (start >= end))
+		first = slabs[start];
+
+	return (struct slab_iterator) {
+		.slabs = slabs,
+		.next = first,
+		.end = end,
+		.stride = stride,
+	};
+}
+
+/*
+ * Return an iterator over one allocator's slabs: from the allocator's last slab down to the slab
+ * numbered the same as its zone, stepping by the depot's zone count.
+ */
+static struct slab_iterator get_slab_iterator(const struct block_allocator *allocator)
+{
+	struct slab_depot *depot = allocator->depot;
+
+	return get_depot_slab_iterator(depot, allocator->last_slab,
+				       allocator->zone_number, depot->zone_count);
+}
+
+/**
+ * next_slab() - Get the next slab from a slab_iterator and advance the iterator
+ * @iterator: The slab_iterator.
+ *
+ * The iterator becomes exhausted once stepping back by one stride would pass below its end.
+ *
+ * Return: The next slab or NULL if the iterator is exhausted.
+ */
+static struct vdo_slab *next_slab(struct slab_iterator *iterator)
+{
+	struct vdo_slab *this_slab = iterator->next;
+
+	if ((this_slab != NULL) &&
+	    (this_slab->slab_number >= iterator->end + iterator->stride)) {
+		iterator->next =
+			iterator->slabs[this_slab->slab_number - iterator->stride];
+	} else {
+		iterator->next = NULL;
+	}
+
+	return this_slab;
+}
+
+/**
+ * abort_waiter() - Abort vios waiting to make journal entries when read-only.
+ * @waiter: The waiter embedded in a reference_updater.
+ * @context: Unused.
+ *
+ * This callback is invoked on all vios waiting to make slab journal entries after the VDO has gone
+ * into read-only mode. An incrementing updater fails its data_vio; a decrementing one continues
+ * the data_vio's decrement completion. Both receive VDO_READ_ONLY.
+ *
+ * Implements waiter_callback_fn.
+ */
+static void abort_waiter(struct vdo_waiter *waiter, void *context __always_unused)
+{
+	struct reference_updater *updater =
+		container_of(waiter, struct reference_updater, waiter);
+	struct data_vio *data_vio = data_vio_from_reference_updater(updater);
+
+	if (!updater->increment) {
+		vdo_continue_completion(&data_vio->decrement_completion, VDO_READ_ONLY);
+		return;
+	}
+
+	continue_data_vio_with_error(data_vio, VDO_READ_ONLY);
+}
+
+/*
+ * Abort every waiter on each of the allocator's slab journals, check whether each slab has
+ * thereby drained, and then finish the parent completion.
+ *
+ * Implements vdo_read_only_notification_fn.
+ */
+static void notify_block_allocator_of_read_only_mode(void *listener,
+						     struct vdo_completion *parent)
+{
+	struct block_allocator *allocator = listener;
+	struct slab_iterator slabs;
+	struct vdo_slab *slab;
+
+	assert_on_allocator_thread(allocator->thread_id, __func__);
+
+	slabs = get_slab_iterator(allocator);
+	while ((slab = next_slab(&slabs)) != NULL) {
+		vdo_waitq_notify_all_waiters(&slab->journal.entry_waiters,
+					     abort_waiter, &slab->journal);
+		check_if_slab_drained(slab);
+	}
+
+	vdo_finish_completion(parent);
+}
+
+/**
+ * vdo_acquire_provisional_reference() - Acquire a provisional reference on behalf of a PBN lock if
+ * the block it locks is unreferenced.
+ * @slab: The slab which contains the block.
+ * @pbn: The physical block to reference.
+ * @lock: The lock.
+ *
+ * Return: VDO_SUCCESS or an error; VDO_INVALID_ADMIN_STATE if the slab is not open.
+ */
+int vdo_acquire_provisional_reference(struct vdo_slab *slab, physical_block_number_t pbn,
+ struct pbn_lock *lock)
+{
+ slab_block_number block_number;
+ int result;
+
+ /*
+ * Nothing to do if the lock already holds a provisional reference.
+ * NOTE(review): the NULL check further down implies @lock may be NULL, so
+ * vdo_pbn_lock_has_provisional_reference() presumably tolerates NULL -- confirm.
+ */
+ if (vdo_pbn_lock_has_provisional_reference(lock))
+ return VDO_SUCCESS;
+
+ if (!is_slab_open(slab))
+ return VDO_INVALID_ADMIN_STATE;
+
+ result = slab_block_number_from_pbn(slab, pbn, &block_number);
+ if (result != VDO_SUCCESS)
+ return result;
+
+ /* Only a block with no references at all is given a provisional one. */
+ if (slab->counters[block_number] == EMPTY_REFERENCE_COUNT) {
+ make_provisional_reference(slab, block_number);
+ if (lock != NULL)
+ vdo_assign_pbn_lock_provisional_reference(lock);
+ }
+
+ /* The free block count is adjusted only if the lock now holds the reference. */
+ if (vdo_pbn_lock_has_provisional_reference(lock))
+ adjust_free_block_count(slab, false);
+
+ return VDO_SUCCESS;
+}
+
+/*
+ * Allocate one free block from a slab, giving it a provisional reference. Returns
+ * VDO_INVALID_ADMIN_STATE if the slab is not open, VDO_NO_SPACE if the search finds no free
+ * block, and otherwise VDO_SUCCESS with the allocated block number stored in *block_number_ptr.
+ */
+static int __must_check allocate_slab_block(struct vdo_slab *slab,
+					    physical_block_number_t *block_number_ptr)
+{
+	slab_block_number found;
+
+	if (!is_slab_open(slab))
+		return VDO_INVALID_ADMIN_STATE;
+
+	if (!search_reference_blocks(slab, &found))
+		return VDO_NO_SPACE;
+
+	VDO_ASSERT_LOG_ONLY((slab->counters[found] == EMPTY_REFERENCE_COUNT),
+			    "free block must have ref count of zero");
+	make_provisional_reference(slab, found);
+	adjust_free_block_count(slab, false);
+
+	/* The next search resumes at the index just past the block claimed here. */
+	slab->search_cursor.index = found + 1;
+
+	*block_number_ptr = slab->start + found;
+	return VDO_SUCCESS;
+}
+
+/**
+ * open_slab() - Prepare a slab to be allocated from.
+ * @slab: The slab.
+ *
+ * Resets the slab's search cursor and makes it the allocator's open slab. A slab whose journal is
+ * blank counts as opened and has all of its reference blocks dirtied; any other slab counts as
+ * reopened.
+ */
+static void open_slab(struct vdo_slab *slab)
+{
+	struct block_allocator *allocator = slab->allocator;
+
+	reset_search_cursor(slab);
+	if (!is_slab_journal_blank(slab)) {
+		WRITE_ONCE(allocator->statistics.slabs_reopened,
+			   allocator->statistics.slabs_reopened + 1);
+	} else {
+		WRITE_ONCE(allocator->statistics.slabs_opened,
+			   allocator->statistics.slabs_opened + 1);
+		dirty_all_reference_blocks(slab);
+	}
+
+	allocator->open_slab = slab;
+}
+
+
+/*
+ * Allocate a block from the allocator's open slab, opening the highest priority slab when there
+ * is no open slab or the open one is exhausted.
+ *
+ * The block allocated will have a provisional reference and the reference must be either confirmed
+ * with a subsequent increment or vacated with a subsequent decrement via
+ * vdo_release_block_reference().
+ *
+ * Returns VDO_SUCCESS with the block number stored in *block_number_ptr, VDO_NO_SPACE if the
+ * newly opened slab has no free block either, or another error from allocate_slab_block().
+ */
+int vdo_allocate_block(struct block_allocator *allocator,
+		       physical_block_number_t *block_number_ptr)
+{
+	int result;
+
+	if (allocator->open_slab != NULL) {
+		/* Try to allocate the next block in the currently open slab. */
+		result = allocate_slab_block(allocator->open_slab, block_number_ptr);
+
+		/* Success, and any failure other than exhaustion, is final. */
+		if (result != VDO_NO_SPACE)
+			return result;
+
+		/* Put the exhausted open slab back into the priority table. */
+		prioritize_slab(allocator->open_slab);
+	}
+
+	/*
+	 * Remove the highest priority slab from the priority table and make it the open slab.
+	 * NOTE(review): this assumes the dequeue always yields an entry; confirm that the table
+	 * cannot be empty here.
+	 */
+	open_slab(list_entry(vdo_priority_table_dequeue(allocator->prioritized_slabs),
+			     struct vdo_slab, allocq_entry));
+
+	/*
+	 * Try allocating again. If we're out of space immediately after opening a slab, then every
+	 * slab must be fully allocated.
+	 */
+	return allocate_slab_block(allocator->open_slab, block_number_ptr);
+}
+
+/**
+ * vdo_enqueue_clean_slab_waiter() - Wait for a clean slab.
+ * @allocator: The block_allocator on which to wait.
+ * @waiter: The waiter.
+ *
+ * The waiter is queued on the allocator's scrubber unless the VDO is read-only or the scrubber is
+ * quiescent.
+ *
+ * Return: VDO_SUCCESS if the waiter was queued, VDO_NO_SPACE if there are no slabs to scrub, and
+ *         some other error otherwise.
+ */
+int vdo_enqueue_clean_slab_waiter(struct block_allocator *allocator,
+				  struct vdo_waiter *waiter)
+{
+	struct slab_scrubber *scrubber = &allocator->scrubber;
+
+	if (vdo_is_read_only(allocator->depot->vdo))
+		return VDO_READ_ONLY;
+
+	/* A quiescent scrubber will not produce any more clean slabs. */
+	if (vdo_is_state_quiescent(&scrubber->admin_state))
+		return VDO_NO_SPACE;
+
+	vdo_waitq_enqueue_waiter(&scrubber->waiters, waiter);
+	return VDO_SUCCESS;
+}
+
+/**
+ * vdo_modify_reference_count() - Modify the reference count of a block by first making a slab
+ *                                journal entry and then updating the reference counter.
+ * @completion: The completion to continue with an error if the slab is not open
+ *              (VDO_INVALID_ADMIN_STATE) or the VDO is read-only (VDO_READ_ONLY).
+ * @updater: The reference updater being submitted; its waiter is queued on the journal of the
+ *           slab containing the updater's pbn.
+ */
+void vdo_modify_reference_count(struct vdo_completion *completion,
+				struct reference_updater *updater)
+{
+	struct vdo_slab *slab = vdo_get_slab(completion->vdo->depot, updater->zpbn.pbn);
+	struct slab_journal *journal;
+
+	if (!is_slab_open(slab)) {
+		vdo_continue_completion(completion, VDO_INVALID_ADMIN_STATE);
+		return;
+	}
+
+	if (vdo_is_read_only(completion->vdo)) {
+		vdo_continue_completion(completion, VDO_READ_ONLY);
+		return;
+	}
+
+	journal = &slab->journal;
+	vdo_waitq_enqueue_waiter(&journal->entry_waiters, &updater->waiter);
+
+	/* An unrebuilt slab whose journal needs reaping is scrubbed at high priority. */
+	if ((slab->status != VDO_SLAB_REBUILT) && requires_reaping(journal))
+		register_slab_for_scrubbing(slab, true);
+
+	add_entries(journal);
+}
+
+/*
+ * Release an unused provisional reference by applying a data remapping decrement to the block.
+ * The zero block never holds a reference, so releasing it succeeds without doing anything.
+ */
+int vdo_release_block_reference(struct block_allocator *allocator,
+				physical_block_number_t pbn)
+{
+	struct reference_updater updater = {
+		.operation = VDO_JOURNAL_DATA_REMAPPING,
+		.increment = false,
+		.zpbn = {
+			.pbn = pbn,
+		},
+	};
+	struct vdo_slab *slab;
+
+	if (pbn == VDO_ZERO_BLOCK)
+		return VDO_SUCCESS;
+
+	slab = vdo_get_slab(allocator->depot, pbn);
+	return adjust_reference_count(slab, &updater, NULL);
+}
+
+/*
+ * This min_heap callback orders slab_status structures using the 'is_clean' field as the primary
+ * key, the 'emptiness' field as the secondary key, and the slab number as the final tie-breaker.
+ *
+ * Slabs need to be pushed onto the rings in the same order they are to be popped off. Popping
+ * should always get the most empty first, so pushing should be from most empty to least empty.
+ * Thus, the ordering is reversed from the usual sense since min_heap returns smaller elements
+ * before larger ones: clean sorts before dirty, and emptier sorts before fuller.
+ */
+static bool slab_status_is_less_than(const void *item1, const void *item2)
+{
+	const struct slab_status *lhs = item1;
+	const struct slab_status *rhs = item2;
+
+	if (lhs->is_clean == rhs->is_clean) {
+		if (lhs->emptiness == rhs->emptiness)
+			return lhs->slab_number < rhs->slab_number;
+
+		return lhs->emptiness > rhs->emptiness;
+	}
+
+	return lhs->is_clean;
+}
+
+/* min_heap callback which exchanges the contents of two slab_status structures. */
+static void swap_slab_statuses(void *item1, void *item2)
+{
+	struct slab_status *first = item1;
+	struct slab_status *second = item2;
+	struct slab_status saved = *first;
+
+	*first = *second;
+	*second = saved;
+}
+
+/* The min_heap callbacks used to order an array of slab_status structures. */
+static const struct min_heap_callbacks slab_status_min_heap = {
+ .elem_size = sizeof(struct slab_status),
+ .less = slab_status_is_less_than,
+ .swp = swap_slab_statuses,
+};
+
+/*
+ * Inform the slab actor that an action has finished on some slab; used by apply_to_slabs().
+ * The actor's callback runs once the outstanding action count reaches zero; until then the
+ * completion is reset so that it can be reused.
+ */
+static void slab_action_callback(struct vdo_completion *completion)
+{
+	struct block_allocator *allocator = vdo_as_block_allocator(completion);
+	struct slab_actor *actor = &allocator->slab_actor;
+
+	actor->slab_action_count--;
+	if (actor->slab_action_count != 0) {
+		vdo_reset_completion(completion);
+		return;
+	}
+
+	actor->callback(completion);
+}
+
+/*
+ * Preserve the error from part of an action and continue: the error is recorded on the waiter of
+ * the allocator's admin state, if there is one, and then the completion's normal callback runs.
+ */
+static void handle_operation_error(struct vdo_completion *completion)
+{
+	struct block_allocator *allocator = vdo_as_block_allocator(completion);
+	int error = completion->result;
+
+	if (allocator->state.waiter != NULL)
+		vdo_set_completion_result(allocator->state.waiter, error);
+
+	completion->callback(completion);
+}
+
+/*
+ * Perform an action on each of an allocator's slabs in parallel.
+ *
+ * Each slab is removed from whatever list it is on and started on the allocator's current admin
+ * operation, with the allocator's completion as the waiter. The supplied callback is invoked via
+ * slab_action_callback() once every slab has finished.
+ */
+static void apply_to_slabs(struct block_allocator *allocator, vdo_action_fn callback)
+{
+ struct slab_iterator iterator;
+
+ vdo_prepare_completion(&allocator->completion, slab_action_callback,
+ handle_operation_error, allocator->thread_id, NULL);
+ allocator->completion.requeue = false;
+
+ /*
+ * Since we are going to dequeue all of the slabs, the open slab will become invalid, so
+ * clear it.
+ */
+ allocator->open_slab = NULL;
+
+ /* Ensure that we don't finish before we're done starting. */
+ allocator->slab_actor = (struct slab_actor) {
+ .slab_action_count = 1,
+ .callback = callback,
+ };
+
+ iterator = get_slab_iterator(allocator);
+ while (iterator.next != NULL) {
+ const struct admin_state_code *operation =
+ vdo_get_admin_state_code(&allocator->state);
+ struct vdo_slab *slab = next_slab(&iterator);
+
+ list_del_init(&slab->allocq_entry);
+ /* Count the slab before starting it, so its completion cannot be missed. */
+ allocator->slab_actor.slab_action_count++;
+ vdo_start_operation_with_waiter(&slab->state, operation,
+ &allocator->completion,
+ initiate_slab_action);
+ }
+
+ /*
+ * Drop the extra count taken when the actor was set up; if every slab has already
+ * finished, this invokes the callback.
+ */
+ slab_action_callback(&allocator->completion);
+}
+
+/*
+ * Finish loading an allocator: destroy the kcopyd client used for erasing slab journals, if
+ * there is one, and then either replay into the slab journals (when loading for recovery) or
+ * finish the load operation.
+ */
+static void finish_loading_allocator(struct vdo_completion *completion)
+{
+	struct block_allocator *allocator = vdo_as_block_allocator(completion);
+	const struct admin_state_code *operation =
+		vdo_get_admin_state_code(&allocator->state);
+
+	if (allocator->eraser != NULL)
+		dm_kcopyd_client_destroy(vdo_forget(allocator->eraser));
+
+	if (operation != VDO_ADMIN_STATE_LOADING_FOR_RECOVERY) {
+		vdo_finish_loading(&allocator->state);
+		return;
+	}
+
+	vdo_replay_into_slab_journals(allocator,
+				      vdo_get_current_action_context(allocator->depot->action_manager));
+}
+
+static void erase_next_slab_journal(struct block_allocator *allocator);
+
+/*
+ * kcopyd completion callback for erasing one slab journal. Any read or write error fails the
+ * allocator's completion with -EIO; otherwise the next slab journal is erased.
+ */
+static void copy_callback(int read_err, unsigned long write_err, void *context)
+{
+	struct block_allocator *allocator = context;
+
+	if ((read_err != 0) || (write_err != 0)) {
+		vdo_fail_completion(&allocator->completion, -EIO);
+		return;
+	}
+
+	erase_next_slab_journal(allocator);
+}
+
+/*
+ * Zero the journal of the next slab in the allocator's slabs_to_erase iterator, or finish the
+ * allocator's completion if the iterator is exhausted. copy_callback() is invoked when the
+ * zeroing completes.
+ */
+static void erase_next_slab_journal(struct block_allocator *allocator)
+{
+	struct slab_depot *depot = allocator->depot;
+	block_count_t blocks = depot->slab_config.slab_journal_blocks;
+	struct dm_io_region region;
+	struct vdo_slab *slab;
+	physical_block_number_t origin;
+
+	if (allocator->slabs_to_erase.next == NULL) {
+		vdo_finish_completion(&allocator->completion);
+		return;
+	}
+
+	slab = next_slab(&allocator->slabs_to_erase);
+
+	/* Convert the journal origin to an offset on the backing device. */
+	origin = slab->journal_origin - depot->vdo->geometry.bio_offset;
+	region = (struct dm_io_region) {
+		.bdev = vdo_get_backing_device(depot->vdo),
+		.sector = origin * VDO_SECTORS_PER_BLOCK,
+		.count = blocks * VDO_SECTORS_PER_BLOCK,
+	};
+	dm_kcopyd_zero(allocator->eraser, 1, &region, 0, copy_callback, allocator);
+}
+
+/*
+ * Start loading a block allocator. When loading for rebuild, every slab journal is zeroed using
+ * a kcopyd client; for any other load operation, that operation is applied to each slab. Both
+ * paths end in finish_loading_allocator().
+ *
+ * Implements vdo_admin_initiator_fn.
+ */
+static void initiate_load(struct admin_state *state)
+{
+ struct block_allocator *allocator =
+ container_of(state, struct block_allocator, state);
+ const struct admin_state_code *operation = vdo_get_admin_state_code(state);
+
+ if (operation == VDO_ADMIN_STATE_LOADING_FOR_REBUILD) {
+ /*
+ * Must requeue because the kcopyd client cannot be freed in the same stack frame
+ * as the kcopyd callback, lest it deadlock.
+ */
+ vdo_prepare_completion_for_requeue(&allocator->completion,
+ finish_loading_allocator,
+ handle_operation_error,
+ allocator->thread_id, NULL);
+ allocator->eraser = dm_kcopyd_client_create(NULL);
+ if (IS_ERR(allocator->eraser)) {
+ /* Fail the load, and clear the error pointer so it is never destroyed. */
+ vdo_fail_completion(&allocator->completion,
+ PTR_ERR(allocator->eraser));
+ allocator->eraser = NULL;
+ return;
+ }
+ allocator->slabs_to_erase = get_slab_iterator(allocator);
+
+ erase_next_slab_journal(allocator);
+ return;
+ }
+
+ apply_to_slabs(allocator, finish_loading_allocator);
+}
+
+/**
+ * vdo_notify_slab_journals_are_recovered() - Inform a block allocator that its slab journals have
+ *                                            been recovered from the recovery journal.
+ * @completion: The allocator completion; its result becomes the result of the load.
+ */
+void vdo_notify_slab_journals_are_recovered(struct vdo_completion *completion)
+{
+	struct block_allocator *allocator = vdo_as_block_allocator(completion);
+	int load_result = completion->result;
+
+	vdo_finish_loading_with_result(&allocator->state, load_result);
+}
+
+/*
+ * Build an array holding the summary status of each of an allocator's slabs, in iterator order.
+ * On success the array is stored in *statuses_ptr and the caller is responsible for freeing it.
+ * Returns VDO_SUCCESS or the error from the allocation.
+ */
+static int get_slab_statuses(struct block_allocator *allocator,
+			     struct slab_status **statuses_ptr)
+{
+	struct slab_iterator iterator = get_slab_iterator(allocator);
+	struct slab_status *statuses;
+	size_t filled;
+	int result;
+
+	result = vdo_allocate(allocator->slab_count, struct slab_status, __func__,
+			      &statuses);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	*statuses_ptr = statuses;
+
+	for (filled = 0; iterator.next != NULL; filled++) {
+		slab_count_t slab_number = next_slab(&iterator)->slab_number;
+
+		statuses[filled] = (struct slab_status) {
+			.slab_number = slab_number,
+			.is_clean = !allocator->summary_entries[slab_number].is_dirty,
+			.emptiness = allocator->summary_entries[slab_number].fullness_hint,
+		};
+	}
+
+	return VDO_SUCCESS;
+}
+
+/*
+ * Prepare slabs for allocation or scrubbing.
+ *
+ * Slabs are visited in heap order (clean before dirty, then by emptiness hint). Each slab is
+ * either queued directly or marked as requiring scrubbing and registered with the scrubber.
+ * Returns VDO_SUCCESS, or the error from building the slab status array.
+ */
+static int __must_check vdo_prepare_slabs_for_allocation(struct block_allocator *allocator)
+{
+ struct slab_status current_slab_status;
+ struct min_heap heap;
+ int result;
+ struct slab_status *slab_statuses;
+ struct slab_depot *depot = allocator->depot;
+
+ /*
+ * Start by counting every data block as allocated; queue_slab() subtracts the free blocks
+ * of each slab it queues.
+ */
+ WRITE_ONCE(allocator->allocated_blocks,
+ allocator->slab_count * depot->slab_config.data_blocks);
+ result = get_slab_statuses(allocator, &slab_statuses);
+ if (result != VDO_SUCCESS)
+ return result;
+
+ /* Sort the slabs by cleanliness, then by emptiness hint. */
+ heap = (struct min_heap) {
+ .data = slab_statuses,
+ .nr = allocator->slab_count,
+ .size = allocator->slab_count,
+ };
+ min_heapify_all(&heap, &slab_status_min_heap);
+
+ while (heap.nr > 0) {
+ bool high_priority;
+ struct vdo_slab *slab;
+ struct slab_journal *journal;
+
+ /* Copy the top of the heap before popping, since popping reorders the array. */
+ current_slab_status = slab_statuses[0];
+ min_heap_pop(&heap, &slab_status_min_heap);
+ slab = depot->slabs[current_slab_status.slab_number];
+
+ /*
+ * On a rebuild load, or for a clean slab whose reference counts need not be loaded,
+ * the slab can be queued without scrubbing.
+ */
+ if ((depot->load_type == VDO_SLAB_DEPOT_REBUILD_LOAD) ||
+ (!allocator->summary_entries[slab->slab_number].load_ref_counts &&
+ current_slab_status.is_clean)) {
+ queue_slab(slab);
+ continue;
+ }
+
+ slab->status = VDO_SLAB_REQUIRES_SCRUBBING;
+ journal = &slab->journal;
+ /*
+ * Scrub at high priority if the slab is clean on a normal load, or if its journal
+ * has reached the scrubbing threshold.
+ */
+ high_priority = ((current_slab_status.is_clean &&
+ (depot->load_type == VDO_SLAB_DEPOT_NORMAL_LOAD)) ||
+ (journal_length(journal) >= journal->scrubbing_threshold));
+ register_slab_for_scrubbing(slab, high_priority);
+ }
+
+ vdo_free(slab_statuses);
+ return VDO_SUCCESS;
+}
+
+/* Return a short name for a slab rebuild status, or "UNKNOWN" for an unrecognized value. */
+static const char *status_to_string(enum slab_rebuild_status status)
+{
+	if (status == VDO_SLAB_REBUILT)
+		return "REBUILT";
+
+	if (status == VDO_SLAB_REQUIRES_SCRUBBING)
+		return "SCRUBBING";
+
+	if (status == VDO_SLAB_REQUIRES_HIGH_PRIORITY_SCRUBBING)
+		return "PRIORITY_SCRUBBING";
+
+	if (status == VDO_SLAB_REBUILDING)
+		return "REBUILDING";
+
+	if (status == VDO_SLAB_REPLAYING)
+		return "REPLAYING";
+
+	return "UNKNOWN";
+}
+
+/*
+ * Log the state of a block allocator: one group of lines per slab (summary, slab journal, and
+ * reference counters), followed by a line describing the slab scrubber. Logging pauses after
+ * every 32 slabs.
+ */
+void vdo_dump_block_allocator(const struct block_allocator *allocator)
+{
+ unsigned int pause_counter = 0;
+ struct slab_iterator iterator = get_slab_iterator(allocator);
+ const struct slab_scrubber *scrubber = &allocator->scrubber;
+
+ vdo_log_info("block_allocator zone %u", allocator->zone_number);
+ while (iterator.next != NULL) {
+ struct vdo_slab *slab = next_slab(&iterator);
+ struct slab_journal *journal = &slab->journal;
+
+ /* Without reference blocks, only the rebuild status is reported. */
+ if (slab->reference_blocks != NULL) {
+ /* Terse because there are a lot of slabs to dump and syslog is lossy. */
+ vdo_log_info("slab %u: P%u, %llu free", slab->slab_number,
+ slab->priority,
+ (unsigned long long) slab->free_blocks);
+ } else {
+ vdo_log_info("slab %u: status %s", slab->slab_number,
+ status_to_string(slab->status));
+ }
+
+ /* The "dirty" field reports whether the journal holds a recovery journal lock. */
+ vdo_log_info(" slab journal: entry_waiters=%zu waiting_to_commit=%s updating_slab_summary=%s head=%llu unreapable=%llu tail=%llu next_commit=%llu summarized=%llu last_summarized=%llu recovery_lock=%llu dirty=%s",
+ vdo_waitq_num_waiters(&journal->entry_waiters),
+ vdo_bool_to_string(journal->waiting_to_commit),
+ vdo_bool_to_string(journal->updating_slab_summary),
+ (unsigned long long) journal->head,
+ (unsigned long long) journal->unreapable,
+ (unsigned long long) journal->tail,
+ (unsigned long long) journal->next_commit,
+ (unsigned long long) journal->summarized,
+ (unsigned long long) journal->last_summarized,
+ (unsigned long long) journal->recovery_lock,
+ vdo_bool_to_string(journal->recovery_lock != 0));
+ /*
+ * Given the frequency with which the locks are just a tiny bit off, it might be
+ * worth dumping all the locks, but that might be too much logging.
+ */
+
+ if (slab->counters != NULL) {
+ /* Terse because there are a lot of slabs to dump and syslog is lossy. */
+ vdo_log_info(" slab: free=%u/%u blocks=%u dirty=%zu active=%zu journal@(%llu,%u)",
+ slab->free_blocks, slab->block_count,
+ slab->reference_block_count,
+ vdo_waitq_num_waiters(&slab->dirty_blocks),
+ slab->active_count,
+ (unsigned long long) slab->slab_journal_point.sequence_number,
+ slab->slab_journal_point.entry_count);
+ } else {
+ vdo_log_info(" no counters");
+ }
+
+ /*
+ * Wait for a while after each batch of 32 slabs dumped, an arbitrary number,
+ * allowing the kernel log a chance to be flushed instead of being overrun.
+ */
+ if (pause_counter++ == 31) {
+ pause_counter = 0;
+ vdo_pause_for_logger();
+ }
+ }
+
+ vdo_log_info("slab_scrubber slab_count %u waiters %zu %s%s",
+ READ_ONCE(scrubber->slab_count),
+ vdo_waitq_num_waiters(&scrubber->waiters),
+ vdo_get_admin_state_code(&scrubber->admin_state)->name,
+ scrubber->high_priority_only ? ", high_priority_only " : "");
+}
+
+/*
+ * Unlink a slab from whatever allocator queue it is on and release the slab together with
+ * its journal block, journal locks, counters, and reference blocks. A NULL slab is ignored.
+ */
+static void free_slab(struct vdo_slab *slab)
+{
+	if (slab != NULL) {
+		struct slab_journal *journal = &slab->journal;
+
+		list_del(&slab->allocq_entry);
+		vdo_free(vdo_forget(journal->block));
+		vdo_free(vdo_forget(journal->locks));
+		vdo_free(vdo_forget(slab->counters));
+		vdo_free(vdo_forget(slab->reference_blocks));
+		vdo_free(slab);
+	}
+}
+
+/*
+ * Set up the journal embedded in a slab: allocate its lock array and its one-block buffer,
+ * copy the thresholds from the depot's slab configuration, and reset head and tail to 1.
+ *
+ * On an allocation failure this returns without releasing anything already allocated; the
+ * pointers are left in the journal so that free_slab() can release them (make_slab() does
+ * this on failure).
+ *
+ * Return: VDO_SUCCESS or the error from vdo_allocate().
+ */
+static int initialize_slab_journal(struct vdo_slab *slab)
+{
+ struct slab_journal *journal = &slab->journal;
+ const struct slab_config *slab_config = &slab->allocator->depot->slab_config;
+ int result;
+
+ /* One journal_lock per slab journal block. */
+ result = vdo_allocate(slab_config->slab_journal_blocks, struct journal_lock,
+ __func__, &journal->locks);
+ if (result != VDO_SUCCESS)
+ return result;
+
+ /* A single block-sized buffer, allocated as bytes and stored in journal->block. */
+ result = vdo_allocate(VDO_BLOCK_SIZE, char, "struct packed_slab_journal_block",
+ (char **) &journal->block);
+ if (result != VDO_SUCCESS)
+ return result;
+
+ journal->slab = slab;
+ journal->size = slab_config->slab_journal_blocks;
+ journal->flushing_threshold = slab_config->slab_journal_flushing_threshold;
+ journal->blocking_threshold = slab_config->slab_journal_blocking_threshold;
+ journal->scrubbing_threshold = slab_config->slab_journal_scrubbing_threshold;
+ journal->entries_per_block = VDO_SLAB_JOURNAL_ENTRIES_PER_BLOCK;
+ journal->full_entries_per_block = VDO_SLAB_JOURNAL_FULL_ENTRIES_PER_BLOCK;
+ journal->events = &slab->allocator->slab_journal_statistics;
+ journal->recovery_journal = slab->allocator->depot->vdo->recovery_journal;
+ journal->tail = 1;
+ journal->head = 1;
+
+ /* Default the deadline to the flushing threshold; it may be moved later just below. */
+ journal->flushing_deadline = journal->flushing_threshold;
+ /*
+ * Set there to be some time between the deadline and the blocking threshold, so that
+ * hopefully all are done before blocking.
+ */
+ if ((journal->blocking_threshold - journal->flushing_threshold) > 5)
+ journal->flushing_deadline = journal->blocking_threshold - 5;
+
+ journal->slab_summary_waiter.callback = release_journal_locks;
+
+ INIT_LIST_HEAD(&journal->dirty_entry);
+ INIT_LIST_HEAD(&journal->uncommitted_blocks);
+
+ journal->tail_header.nonce = slab->allocator->nonce;
+ journal->tail_header.metadata_type = VDO_METADATA_SLAB_JOURNAL;
+ initialize_journal_state(journal);
+ return VDO_SUCCESS;
+}
+
+/**
+ * make_slab() - Allocate and initialize a slab with no allocated blocks.
+ * @slab_origin: The physical block number, within the block allocator partition, of the slab's
+ *               first block.
+ * @allocator: The block allocator which will own the slab.
+ * @slab_number: The number of the slab being made.
+ * @is_new: true if the slab is being created for a resize; such slabs start in the NEW admin
+ *          state and get their counters allocated immediately.
+ * @slab_ptr: Where to store the new slab on success.
+ *
+ * On any failure the partially constructed slab is freed and @slab_ptr is left untouched.
+ *
+ * Return: VDO_SUCCESS or an error code.
+ */
+static int __must_check make_slab(physical_block_number_t slab_origin,
+				  struct block_allocator *allocator,
+				  slab_count_t slab_number, bool is_new,
+				  struct vdo_slab **slab_ptr)
+{
+	const struct slab_config *config = &allocator->depot->slab_config;
+	struct vdo_slab *new_slab;
+	int result;
+
+	result = vdo_allocate(1, struct vdo_slab, __func__, &new_slab);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	*new_slab = (struct vdo_slab) {
+		.allocator = allocator,
+		.slab_number = slab_number,
+		.start = slab_origin,
+		.end = slab_origin + config->slab_blocks,
+		.journal_origin = vdo_get_slab_journal_start_block(config, slab_origin),
+		.ref_counts_origin = slab_origin + config->data_blocks,
+		.block_count = config->data_blocks,
+		.free_blocks = config->data_blocks,
+		.reference_block_count =
+			vdo_get_saved_reference_count_size(config->data_blocks),
+	};
+	INIT_LIST_HEAD(&new_slab->allocq_entry);
+
+	result = initialize_slab_journal(new_slab);
+	if (result != VDO_SUCCESS)
+		goto fail;
+
+	if (!is_new) {
+		vdo_set_admin_state_code(&new_slab->state,
+					 VDO_ADMIN_STATE_NORMAL_OPERATION);
+	} else {
+		vdo_set_admin_state_code(&new_slab->state, VDO_ADMIN_STATE_NEW);
+		result = allocate_slab_counters(new_slab);
+		if (result != VDO_SUCCESS)
+			goto fail;
+	}
+
+	*slab_ptr = new_slab;
+	return VDO_SUCCESS;
+
+fail:
+	free_slab(new_slab);
+	return result;
+}
+
+/**
+ * allocate_slabs() - Build a new slab pointer array of the requested size.
+ * @depot: The depot.
+ * @slab_count: The number of slabs the new array should describe.
+ *
+ * Pointers to the depot's existing slabs are copied into the new array and any additional
+ * slabs are created. The new slabs are not handed to the block allocators here. On failure,
+ * depot->new_slabs and depot->new_slab_count describe what had been built so far; the caller
+ * is responsible for cleaning that up.
+ *
+ * Return: VDO_SUCCESS or an error code.
+ */
+static int allocate_slabs(struct slab_depot *depot, slab_count_t slab_count)
+{
+	const block_count_t blocks_per_slab = depot->slab_config.slab_blocks;
+	/* Having an existing slab array means this call is growing the depot. */
+	const bool resizing = (depot->slabs != NULL);
+	physical_block_number_t origin;
+	int result;
+
+	result = vdo_allocate(slab_count, struct vdo_slab *,
+			      "slab pointer array", &depot->new_slabs);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	if (resizing) {
+		memcpy(depot->new_slabs, depot->slabs,
+		       depot->slab_count * sizeof(struct vdo_slab *));
+	}
+
+	origin = depot->first_block + (depot->slab_count * blocks_per_slab);
+	depot->new_slab_count = depot->slab_count;
+	while (depot->new_slab_count < slab_count) {
+		slab_count_t number = depot->new_slab_count;
+
+		/* Slabs are assigned to allocators round-robin by slab number. */
+		result = make_slab(origin,
+				   &depot->allocators[number % depot->zone_count],
+				   number, resizing, &depot->new_slabs[number]);
+		if (result != VDO_SUCCESS)
+			return result;
+
+		depot->new_slab_count++;
+		origin += blocks_per_slab;
+	}
+
+	return VDO_SUCCESS;
+}
+
+/**
+ * vdo_abandon_new_slabs() - Discard any slabs which were prepared but not yet put into use.
+ * @depot: The depot.
+ *
+ * Frees every slab in the new array beyond the depot's current slab count, then the array
+ * itself, and resets the pending slab count and size. Does nothing if there is no new array.
+ */
+void vdo_abandon_new_slabs(struct slab_depot *depot)
+{
+	slab_count_t index;
+
+	if (depot->new_slabs == NULL)
+		return;
+
+	index = depot->slab_count;
+	while (index < depot->new_slab_count) {
+		free_slab(vdo_forget(depot->new_slabs[index]));
+		index++;
+	}
+
+	depot->new_size = 0;
+	depot->new_slab_count = 0;
+	vdo_free(vdo_forget(depot->new_slabs));
+}
+
+/**
+ * get_allocator_thread_id() - Look up the thread ID recorded for one zone's allocator.
+ * @context: The slab depot, passed as an opaque pointer.
+ * @zone_number: The zone whose allocator is being queried.
+ *
+ * Implements vdo_zone_thread_getter_fn.
+ *
+ * Return: The thread_id stored in the allocator for @zone_number.
+ */
+static thread_id_t get_allocator_thread_id(void *context, zone_count_t zone_number)
+{
+	struct slab_depot *depot = context;
+
+	return depot->allocators[zone_number].thread_id;
+}
+
+/**
+ * release_recovery_journal_lock() - Request the slab journal to release the recovery journal lock
+ * it may hold on a specified recovery journal block.
+ * @journal: The slab journal.
+ * @recovery_lock: The sequence number of the recovery journal block whose locks should be
+ * released.
+ *
+ * A commit of the journal's tail is only requested when @recovery_lock exactly equals the
+ * journal's current recovery_lock and the VDO is not read-only.
+ *
+ * Return: true if the journal does hold a lock on the specified block (which it will release).
+ */
+static bool __must_check release_recovery_journal_lock(struct slab_journal *journal,
+ sequence_number_t recovery_lock)
+{
+ if (recovery_lock > journal->recovery_lock) {
+ /*
+ * The asserted condition contradicts the enclosing test, so it can never hold
+ * on this path; the assertion is here purely to report the situation.
+ * NOTE(review): presumably VDO_ASSERT_LOG_ONLY only logs on failure; confirm.
+ */
+ VDO_ASSERT_LOG_ONLY((recovery_lock < journal->recovery_lock),
+ "slab journal recovery lock is not older than the recovery journal head");
+ return false;
+ }
+
+ /* Nothing to release if the journal's lock is on a later block, or if read-only. */
+ if ((recovery_lock < journal->recovery_lock) ||
+ vdo_is_read_only(journal->slab->allocator->depot->vdo))
+ return false;
+
+ /* All locks are held by the block which is in progress; write it. */
+ commit_tail(journal);
+ return true;
+}
+
+/*
+ * Request a commit of all dirty tail blocks which are locking the recovery journal block the depot
+ * is seeking to release.
+ *
+ * Walks the zone's dirty_slab_journals list in order and stops at the first journal for which
+ * release_recovery_journal_lock() returns false. The parent completion is always finished.
+ *
+ * Implements vdo_zone_action_fn.
+ */
+static void release_tail_block_locks(void *context, zone_count_t zone_number,
+ struct vdo_completion *parent)
+{
+ struct slab_journal *journal, *tmp;
+ struct slab_depot *depot = context;
+ struct list_head *list = &depot->allocators[zone_number].dirty_slab_journals;
+
+ /*
+ * The _safe iterator tolerates the current entry being unlinked during the walk.
+ * NOTE(review): presumably the list is ordered by recovery lock, which would make
+ * stopping at the first non-match correct; confirm against the code maintaining it.
+ */
+ list_for_each_entry_safe(journal, tmp, list, dirty_entry) {
+ if (!release_recovery_journal_lock(journal,
+ depot->active_release_request))
+ break;
+ }
+
+ vdo_finish_completion(parent);
+}
+
+/**
+ * prepare_for_tail_block_commit() - Prepare to commit oldest tail blocks.
+ * @context: The slab depot, passed as an opaque pointer.
+ * @parent: The completion to finish once the request has been latched.
+ *
+ * Copies the depot's pending release request into its active release request, which is the
+ * value release_tail_block_locks() then acts on.
+ *
+ * Implements vdo_action_preamble_fn.
+ */
+static void prepare_for_tail_block_commit(void *context, struct vdo_completion *parent)
+{
+ struct slab_depot *depot = context;
+
+ depot->active_release_request = depot->new_release_request;
+ vdo_finish_completion(parent);
+}
+
+/**
+ * schedule_tail_block_commit() - Schedule a tail block commit when a new release is pending.
+ * @context: The slab depot, passed as an opaque pointer.
+ *
+ * This method should not be called directly. Rather, call vdo_schedule_default_action() on the
+ * depot's action manager.
+ *
+ * Implements vdo_action_scheduler_fn.
+ *
+ * Return: false if the pending release request matches the active one; otherwise whatever
+ *         vdo_schedule_action() returns.
+ */
+static bool schedule_tail_block_commit(void *context)
+{
+	struct slab_depot *depot = context;
+
+	if (depot->new_release_request != depot->active_release_request) {
+		return vdo_schedule_action(depot->action_manager,
+					   prepare_for_tail_block_commit,
+					   release_tail_block_locks,
+					   NULL, NULL);
+	}
+
+	return false;
+}
+
+/**
+ * initialize_slab_scrubber() - Set up the slab scrubber embedded in an allocator.
+ * @allocator: The allocator being initialized.
+ *
+ * Allocates a buffer large enough for a whole slab journal and attaches it to the scrubber's
+ * vio. The buffer is freed again if the vio components cannot be allocated. The scrubber is
+ * left in the suspended admin state with empty slab lists.
+ *
+ * Return: VDO_SUCCESS or an error.
+ */
+static int initialize_slab_scrubber(struct block_allocator *allocator)
+{
+	struct slab_scrubber *scrubber = &allocator->scrubber;
+	block_count_t journal_blocks = allocator->depot->slab_config.slab_journal_blocks;
+	char *buffer;
+	int result;
+
+	result = vdo_allocate(VDO_BLOCK_SIZE * journal_blocks, char, __func__, &buffer);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	result = allocate_vio_components(allocator->completion.vdo,
+					 VIO_TYPE_SLAB_JOURNAL,
+					 VIO_PRIORITY_METADATA,
+					 allocator, journal_blocks,
+					 buffer, &scrubber->vio);
+	if (result == VDO_SUCCESS) {
+		INIT_LIST_HEAD(&scrubber->high_priority_slabs);
+		INIT_LIST_HEAD(&scrubber->slabs);
+		vdo_set_admin_state_code(&scrubber->admin_state,
+					 VDO_ADMIN_STATE_SUSPENDED);
+		return VDO_SUCCESS;
+	}
+
+	vdo_free(buffer);
+	return result;
+}
+
+/**
+ * initialize_slab_summary_block() - Set up one slab_summary_block of an allocator.
+ * @allocator: The allocator which owns the block.
+ * @index: The index of this block in its zone's summary.
+ *
+ * Allocates the block's outgoing buffer and its vio, then points the block at its slice of the
+ * allocator's summary entries. Nothing is released here on failure; the outgoing buffer stays
+ * attached to the block for later teardown.
+ *
+ * Return: VDO_SUCCESS or an error.
+ */
+static int __must_check initialize_slab_summary_block(struct block_allocator *allocator,
+						       block_count_t index)
+{
+	struct slab_summary_block *summary_block = &allocator->summary_blocks[index];
+	int result;
+
+	result = vdo_allocate(VDO_BLOCK_SIZE, char, __func__,
+			      &summary_block->outgoing_entries);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	result = allocate_vio_components(allocator->depot->vdo, VIO_TYPE_SLAB_SUMMARY,
+					 VIO_PRIORITY_METADATA, NULL, 1,
+					 summary_block->outgoing_entries,
+					 &summary_block->vio);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	summary_block->index = index;
+	summary_block->allocator = allocator;
+	summary_block->entries =
+		&allocator->summary_entries[VDO_SLAB_SUMMARY_ENTRIES_PER_BLOCK * index];
+	return VDO_SUCCESS;
+}
+
+/*
+ * Initialize the block allocator for one physical zone of a depot: reset the structure,
+ * register it as a read-only listener, and create its vio pool, slab scrubber, priority
+ * table, and slab summary blocks. Nothing is released here on failure; vdo_free_slab_depot()
+ * tears down whatever was created.
+ *
+ * Return: VDO_SUCCESS or the first error encountered.
+ */
+static int __must_check initialize_block_allocator(struct slab_depot *depot,
+ zone_count_t zone)
+{
+ int result;
+ block_count_t i;
+ struct block_allocator *allocator = &depot->allocators[zone];
+ struct vdo *vdo = depot->vdo;
+ block_count_t max_free_blocks = depot->slab_config.data_blocks;
+ unsigned int max_priority = (2 + ilog2(max_free_blocks));
+
+ /* All fields not named here are zeroed by the compound literal. */
+ *allocator = (struct block_allocator) {
+ .depot = depot,
+ .zone_number = zone,
+ .thread_id = vdo->thread_config.physical_threads[zone],
+ .nonce = vdo->states.vdo.nonce,
+ };
+
+ INIT_LIST_HEAD(&allocator->dirty_slab_journals);
+ vdo_set_admin_state_code(&allocator->state, VDO_ADMIN_STATE_NORMAL_OPERATION);
+ result = vdo_register_read_only_listener(vdo, allocator,
+ notify_block_allocator_of_read_only_mode,
+ allocator->thread_id);
+ if (result != VDO_SUCCESS)
+ return result;
+
+ vdo_initialize_completion(&allocator->completion, vdo, VDO_BLOCK_ALLOCATOR_COMPLETION);
+ result = make_vio_pool(vdo, BLOCK_ALLOCATOR_VIO_POOL_SIZE, allocator->thread_id,
+ VIO_TYPE_SLAB_JOURNAL, VIO_PRIORITY_METADATA,
+ allocator, &allocator->vio_pool);
+ if (result != VDO_SUCCESS)
+ return result;
+
+ result = initialize_slab_scrubber(allocator);
+ if (result != VDO_SUCCESS)
+ return result;
+
+ result = vdo_make_priority_table(max_priority, &allocator->prioritized_slabs);
+ if (result != VDO_SUCCESS)
+ return result;
+
+ result = vdo_allocate(VDO_SLAB_SUMMARY_BLOCKS_PER_ZONE,
+ struct slab_summary_block, __func__,
+ &allocator->summary_blocks);
+ if (result != VDO_SUCCESS)
+ return result;
+
+ vdo_set_admin_state_code(&allocator->summary_state,
+ VDO_ADMIN_STATE_NORMAL_OPERATION);
+ /* Each zone's entries start MAX_VDO_SLABS entries after the previous zone's. */
+ allocator->summary_entries = depot->summary_entries + (MAX_VDO_SLABS * zone);
+
+ /* Initialize each summary block. */
+ for (i = 0; i < VDO_SLAB_SUMMARY_BLOCKS_PER_ZONE; i++) {
+ result = initialize_slab_summary_block(allocator, i);
+ if (result != VDO_SUCCESS)
+ return result;
+ }
+
+ /*
+ * Performing well atop thin provisioned storage requires either that VDO discards freed
+ * blocks, or that the block allocator try to use slabs that already have allocated blocks
+ * in preference to slabs that have never been opened. For reasons we have not been able to
+ * fully understand, some SSD machines have been very sensitive (50% reduction in
+ * test throughput) to very slight differences in the timing and locality of block
+ * allocation. Assigning a low priority to unopened slabs (max_priority/2, say) would be
+ * ideal for the story, but anything less than a very high threshold (max_priority - 1)
+ * hurts on these machines.
+ *
+ * This sets the free block threshold for preferring to open an unopened slab to the binary
+ * floor of 3/4ths the total number of data blocks in a slab, which will generally evaluate
+ * to about half the slab size.
+ */
+ allocator->unopened_slab_priority = (1 + ilog2((max_free_blocks * 3) / 4));
+
+ return VDO_SUCCESS;
+}
+
+/*
+ * Allocate and initialize everything a freshly created depot owns: the action manager, the
+ * in-memory slab summary entries, one block allocator per zone, and the slabs themselves,
+ * which are then registered with their allocators.
+ *
+ * Nothing is released here on failure; vdo_decode_slab_depot() calls vdo_free_slab_depot()
+ * when this returns an error.
+ *
+ * Return: VDO_SUCCESS or an error code.
+ */
+static int allocate_components(struct slab_depot *depot,
+ struct partition *summary_partition)
+{
+ int result;
+ zone_count_t zone;
+ slab_count_t slab_count;
+ u8 hint;
+ u32 i;
+ const struct thread_config *thread_config = &depot->vdo->thread_config;
+
+ result = vdo_make_action_manager(depot->zone_count, get_allocator_thread_id,
+ thread_config->journal_thread, depot,
+ schedule_tail_block_commit,
+ depot->vdo, &depot->action_manager);
+ if (result != VDO_SUCCESS)
+ return result;
+
+ depot->origin = depot->first_block;
+
+ /* block size must be a multiple of entry size */
+ BUILD_BUG_ON((VDO_BLOCK_SIZE % sizeof(struct slab_summary_entry)) != 0);
+
+ depot->summary_origin = summary_partition->offset;
+ depot->hint_shift = vdo_get_slab_summary_hint_shift(depot->slab_size_shift);
+ result = vdo_allocate(MAXIMUM_VDO_SLAB_SUMMARY_ENTRIES,
+ struct slab_summary_entry, __func__,
+ &depot->summary_entries);
+ if (result != VDO_SUCCESS)
+ return result;
+
+
+ /* Initialize all the entries. */
+ hint = compute_fullness_hint(depot, depot->slab_config.data_blocks);
+ for (i = 0; i < MAXIMUM_VDO_SLAB_SUMMARY_ENTRIES; i++) {
+ /*
+ * This default tail block offset must be reflected in
+ * slabJournal.c::read_slab_journal_tail().
+ *
+ * NOTE(review): "slabJournal.c" does not match the file naming used elsewhere in
+ * this tree; confirm where read_slab_journal_tail() now lives.
+ */
+ depot->summary_entries[i] = (struct slab_summary_entry) {
+ .tail_block_offset = 0,
+ .fullness_hint = hint,
+ .load_ref_counts = false,
+ .is_dirty = false,
+ };
+ }
+
+ slab_count = vdo_compute_slab_count(depot->first_block, depot->last_block,
+ depot->slab_size_shift);
+ /* Reject configurations with more physical zones than slabs. */
+ if (thread_config->physical_zone_count > slab_count) {
+ return vdo_log_error_strerror(VDO_BAD_CONFIGURATION,
+ "%u physical zones exceeds slab count %u",
+ thread_config->physical_zone_count,
+ slab_count);
+ }
+
+ /* Initialize the block allocators. */
+ for (zone = 0; zone < depot->zone_count; zone++) {
+ result = initialize_block_allocator(depot, zone);
+ if (result != VDO_SUCCESS)
+ return result;
+ }
+
+ /* Allocate slabs. */
+ result = allocate_slabs(depot, slab_count);
+ if (result != VDO_SUCCESS)
+ return result;
+
+ /* Use the new slabs. */
+ for (i = depot->slab_count; i < depot->new_slab_count; i++) {
+ struct vdo_slab *slab = depot->new_slabs[i];
+
+ register_slab_with_allocator(slab->allocator, slab);
+ /* Published with WRITE_ONCE; readers elsewhere in this file use READ_ONCE. */
+ WRITE_ONCE(depot->slab_count, depot->slab_count + 1);
+ }
+
+ /* The new array becomes the depot's slab array; nothing is left pending. */
+ depot->slabs = depot->new_slabs;
+ depot->new_slabs = NULL;
+ depot->new_slab_count = 0;
+
+ return VDO_SUCCESS;
+}
+
+/**
+ * vdo_decode_slab_depot() - Create a slab depot configured from the state stored in the super
+ *                           block.
+ * @state: The slab depot state from the super block.
+ * @vdo: The VDO which will own the depot.
+ * @summary_partition: The partition which holds the slab summary.
+ * @depot_ptr: Where to store the new depot on success.
+ *
+ * The slab size in @state must be a power of two, since block numbers are mapped to slabs with
+ * a bit shift. If component allocation fails, the partially built depot is freed.
+ *
+ * Return: A success or error code.
+ */
+int vdo_decode_slab_depot(struct slab_depot_state_2_0 state, struct vdo *vdo,
+			  struct partition *summary_partition,
+			  struct slab_depot **depot_ptr)
+{
+	const block_count_t slab_blocks = state.slab_config.slab_blocks;
+	struct slab_depot *new_depot;
+	int result;
+
+	if (!is_power_of_2(slab_blocks)) {
+		return vdo_log_error_strerror(UDS_INVALID_ARGUMENT,
+					      "slab size must be a power of two");
+	}
+
+	/* The depot is allocated with one trailing block_allocator per physical zone. */
+	result = vdo_allocate_extended(struct slab_depot,
+				       vdo->thread_config.physical_zone_count,
+				       struct block_allocator, __func__, &new_depot);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	new_depot->vdo = vdo;
+	new_depot->slab_config = state.slab_config;
+	new_depot->slab_size_shift = ilog2(slab_blocks);
+	new_depot->first_block = state.first_block;
+	new_depot->last_block = state.last_block;
+	new_depot->zone_count = vdo->thread_config.physical_zone_count;
+	new_depot->old_zone_count = state.zone_count;
+
+	result = allocate_components(new_depot, summary_partition);
+	if (result == VDO_SUCCESS) {
+		*depot_ptr = new_depot;
+		return VDO_SUCCESS;
+	}
+
+	vdo_free_slab_depot(new_depot);
+	return result;
+}
+
+/*
+ * Release an allocator's slab summary blocks: each block's vio components and outgoing
+ * buffer, then the block array itself. Does nothing if the array was never allocated.
+ */
+static void uninitialize_allocator_summary(struct block_allocator *allocator)
+{
+	block_count_t index;
+
+	if (allocator->summary_blocks == NULL)
+		return;
+
+	for (index = 0; index < VDO_SLAB_SUMMARY_BLOCKS_PER_ZONE; index++) {
+		struct slab_summary_block *block = &allocator->summary_blocks[index];
+
+		free_vio_components(&block->vio);
+		vdo_free(vdo_forget(block->outgoing_entries));
+	}
+
+	vdo_free(vdo_forget(allocator->summary_blocks));
+}
+
+/**
+ * vdo_free_slab_depot() - Destroy a slab depot.
+ * @depot: The depot to destroy; may be NULL, in which case nothing is done.
+ *
+ * Discards any pending new slabs, tears down each zone's allocator resources, frees every
+ * slab in use, and finally frees the depot's own arrays and the depot itself.
+ */
+void vdo_free_slab_depot(struct slab_depot *depot)
+{
+ zone_count_t zone = 0;
+
+ if (depot == NULL)
+ return;
+
+ /* Slabs beyond slab_count live only in new_slabs, so release those first. */
+ vdo_abandon_new_slabs(depot);
+
+ for (zone = 0; zone < depot->zone_count; zone++) {
+ struct block_allocator *allocator = &depot->allocators[zone];
+
+ if (allocator->eraser != NULL)
+ dm_kcopyd_client_destroy(vdo_forget(allocator->eraser));
+
+ uninitialize_allocator_summary(allocator);
+ uninitialize_scrubber_vio(&allocator->scrubber);
+ free_vio_pool(vdo_forget(allocator->vio_pool));
+ vdo_free_priority_table(vdo_forget(allocator->prioritized_slabs));
+ }
+
+ /* The slab array may be absent if depot construction failed part way. */
+ if (depot->slabs != NULL) {
+ slab_count_t i;
+
+ for (i = 0; i < depot->slab_count; i++)
+ free_slab(vdo_forget(depot->slabs[i]));
+ }
+
+ vdo_free(vdo_forget(depot->slabs));
+ vdo_free(vdo_forget(depot->action_manager));
+ vdo_free(vdo_forget(depot->summary_entries));
+ vdo_free(depot);
+}
+
+/**
+ * vdo_record_slab_depot() - Capture a depot's state for encoding into the super block.
+ * @depot: The depot to encode.
+ *
+ * Return: The depot state.
+ */
+struct slab_depot_state_2_0 vdo_record_slab_depot(const struct slab_depot *depot)
+{
+	/*
+	 * A depot using 0 zones must have been synchronously loaded by a tool and is now being
+	 * saved. The slab summary was not loaded and combined in that case, so that still has
+	 * to happen on the next load, which needs the old zone count rather than 0.
+	 */
+	zone_count_t zone_count =
+		(depot->zone_count != 0) ? depot->zone_count : depot->old_zone_count;
+
+	return (struct slab_depot_state_2_0) {
+		.slab_config = depot->slab_config,
+		.first_block = depot->first_block,
+		.last_block = depot->last_block,
+		.zone_count = zone_count,
+	};
+}
+
+/**
+ * vdo_allocate_reference_counters() - Allocate the reference counters for every slab in the
+ *                                     depot.
+ * @depot: The depot whose slabs need counters.
+ *
+ * Stops at the first slab whose counters cannot be allocated.
+ *
+ * Context: This method may be called only before entering normal operation from the load thread.
+ *
+ * Return: VDO_SUCCESS or an error.
+ */
+int vdo_allocate_reference_counters(struct slab_depot *depot)
+{
+	struct slab_iterator iterator =
+		get_depot_slab_iterator(depot, depot->slab_count - 1, 0, 1);
+	int result = VDO_SUCCESS;
+
+	while ((result == VDO_SUCCESS) && (iterator.next != NULL))
+		result = allocate_slab_counters(next_slab(&iterator));
+
+	return result;
+}
+
+/**
+ * get_slab_number() - Compute the number of the slab which contains a given block.
+ * @depot: The slab depot.
+ * @pbn: The physical block number.
+ * @slab_number_ptr: Where to store the slab number; written only on success.
+ *
+ * Return: VDO_SUCCESS, or VDO_OUT_OF_RANGE if @pbn lies before the depot's first block or
+ *         beyond its last slab.
+ */
+static int __must_check get_slab_number(const struct slab_depot *depot,
+					 physical_block_number_t pbn,
+					 slab_count_t *slab_number_ptr)
+{
+	if (pbn >= depot->first_block) {
+		slab_count_t number =
+			(pbn - depot->first_block) >> depot->slab_size_shift;
+
+		if (number < depot->slab_count) {
+			*slab_number_ptr = number;
+			return VDO_SUCCESS;
+		}
+	}
+
+	return VDO_OUT_OF_RANGE;
+}
+
+/**
+ * vdo_get_slab() - Find the slab which contains a given block.
+ * @depot: The slab depot.
+ * @pbn: The physical block number.
+ *
+ * The zero block quietly yields NULL. Any other block number outside the depot's slabs puts
+ * the VDO into read-only mode before NULL is returned.
+ *
+ * Return: The slab containing the block, or NULL if the block number is the zero block or
+ *         otherwise out of range.
+ */
+struct vdo_slab *vdo_get_slab(const struct slab_depot *depot,
+			      physical_block_number_t pbn)
+{
+	slab_count_t index;
+	int result;
+
+	if (pbn == VDO_ZERO_BLOCK)
+		return NULL;
+
+	result = get_slab_number(depot, pbn, &index);
+	if (result == VDO_SUCCESS)
+		return depot->slabs[index];
+
+	vdo_enter_read_only_mode(depot->vdo, result);
+	return NULL;
+}
+
+/**
+ * vdo_get_increment_limit() - Report how many more references a block can take on.
+ * @depot: The slab depot.
+ * @pbn: The physical block number that is being queried.
+ *
+ * Yields 0 when the block has no slab, when its slab is not in the rebuilt state, or when its
+ * reference counter cannot be looked up. A provisionally referenced block is treated as
+ * holding a single reference.
+ *
+ * Context: This method must be called from the physical zone thread of the PBN.
+ *
+ * Return: The number of available references.
+ */
+u8 vdo_get_increment_limit(struct slab_depot *depot, physical_block_number_t pbn)
+{
+	struct vdo_slab *slab = vdo_get_slab(depot, pbn);
+	vdo_refcount_t *counter = NULL;
+
+	if (slab == NULL)
+		return 0;
+
+	if (slab->status != VDO_SLAB_REBUILT)
+		return 0;
+
+	if (get_reference_counter(slab, pbn, &counter) != VDO_SUCCESS)
+		return 0;
+
+	return (MAXIMUM_REFERENCE_COUNT -
+		((*counter == PROVISIONAL_REFERENCE_COUNT) ? 1 : *counter));
+}
+
+/**
+ * vdo_is_physical_data_block() - Check whether a PBN refers to a data block.
+ * @depot: The depot.
+ * @pbn: The physical block number to ask about.
+ *
+ * The zero block always counts as a data block. Any other PBN must fall within one of the
+ * depot's slabs and convert successfully to a slab block number within that slab.
+ *
+ * Return: True if the PBN corresponds to a data block.
+ */
+bool vdo_is_physical_data_block(const struct slab_depot *depot,
+				physical_block_number_t pbn)
+{
+	slab_count_t slab_number;
+	slab_block_number sbn;
+
+	if (pbn == VDO_ZERO_BLOCK)
+		return true;
+
+	if (get_slab_number(depot, pbn, &slab_number) != VDO_SUCCESS)
+		return false;
+
+	return (slab_block_number_from_pbn(depot->slabs[slab_number], pbn, &sbn) ==
+		VDO_SUCCESS);
+}
+
+/**
+ * vdo_get_slab_depot_allocated_blocks() - Sum the allocated data blocks over every zone of the
+ *                                         depot.
+ * @depot: The slab depot.
+ *
+ * This is the total number of blocks with a non-zero reference count.
+ *
+ * Context: This may be called from any thread.
+ *
+ * Return: The total number of blocks with a non-zero reference count.
+ */
+block_count_t vdo_get_slab_depot_allocated_blocks(const struct slab_depot *depot)
+{
+	block_count_t allocated = 0;
+	zone_count_t zone = 0;
+
+	while (zone < depot->zone_count) {
+		/* The allocators are responsible for thread safety. */
+		allocated += READ_ONCE(depot->allocators[zone].allocated_blocks);
+		zone++;
+	}
+
+	return allocated;
+}
+
+/**
+ * vdo_get_slab_depot_data_blocks() - Compute the data block capacity of all slabs in the depot.
+ * @depot: The slab depot.
+ *
+ * The result is the current slab count multiplied by the number of data blocks in each slab.
+ *
+ * Context: This may be called from any thread.
+ *
+ * Return: The total number of data blocks in all slabs.
+ */
+block_count_t vdo_get_slab_depot_data_blocks(const struct slab_depot *depot)
+{
+	return (depot->slab_config.data_blocks * READ_ONCE(depot->slab_count));
+}
+
+/**
+ * finish_combining_zones() - Clean up after saving out the combined slab summary.
+ * @completion: The vio which was used to write the summary data.
+ *
+ * Frees the vio and passes its result on to the parent completion. Also reached from
+ * handle_combining_error().
+ */
+static void finish_combining_zones(struct vdo_completion *completion)
+{
+ /* Both values must be copied out before the vio holding the completion is freed. */
+ int result = completion->result;
+ struct vdo_completion *parent = completion->parent;
+
+ free_vio(as_vio(vdo_forget(completion)));
+ /*
+ * NOTE(review): this is called with the result on the success path as well; presumably
+ * vdo_fail_completion() simply finishes the parent when result is VDO_SUCCESS. Confirm.
+ */
+ vdo_fail_completion(parent, result);
+}
+
+/*
+ * Error handler for the slab summary read and write vios: record the metadata I/O error,
+ * then clean up exactly as on success.
+ */
+static void handle_combining_error(struct vdo_completion *completion)
+{
+ vio_record_metadata_io_error(as_vio(completion));
+ finish_combining_zones(completion);
+}
+
+/*
+ * Bio completion callback for the combined slab summary write: continue the vio in
+ * finish_combining_zones() on the admin thread.
+ */
+static void write_summary_endio(struct bio *bio)
+{
+	struct vio *vio = bio->bi_private;
+	const struct thread_config *config = &vio->completion.vdo->thread_config;
+
+	continue_vio_after_io(vio, finish_combining_zones, config->admin_thread);
+}
+
+/**
+ * combine_summaries() - Treating the current entries buffer as the on-disk value of all zones,
+ * update every zone to the correct values for every slab.
+ * @depot: The depot whose summary entries should be combined.
+ *
+ * For each slab, the entry is taken from the region of the old zone selected by cycling
+ * through the old zones in slab order (slab number modulo the old zone count). The combined
+ * region is then replicated to every other zone's region.
+ */
+static void combine_summaries(struct slab_depot *depot)
+{
+ /*
+ * Combine all the old summary data into the portion of the buffer corresponding to the
+ * first zone.
+ */
+ zone_count_t zone = 0;
+ struct slab_summary_entry *entries = depot->summary_entries;
+
+ /* With at most one old zone, the first region is already the combined summary. */
+ if (depot->old_zone_count > 1) {
+ slab_count_t entry_number;
+
+ for (entry_number = 0; entry_number < MAX_VDO_SLABS; entry_number++) {
+ /* Entries taken from zone 0 are already in place. */
+ if (zone != 0) {
+ memcpy(entries + entry_number,
+ entries + (zone * MAX_VDO_SLABS) + entry_number,
+ sizeof(struct slab_summary_entry));
+ }
+
+ zone++;
+ if (zone == depot->old_zone_count)
+ zone = 0;
+ }
+ }
+
+ /* Copy the combined data to each zone's region of the buffer. */
+ for (zone = 1; zone < MAX_VDO_PHYSICAL_ZONES; zone++) {
+ memcpy(entries + (zone * MAX_VDO_SLABS), entries,
+ MAX_VDO_SLABS * sizeof(struct slab_summary_entry));
+ }
+}
+
+/**
+ * finish_loading_summary() - Finish loading slab summary data.
+ * @completion: The vio which was used to read the summary data.
+ *
+ * Combines the slab summary data from all the previously written zones and copies the combined
+ * summary to each partition's data region. Then writes the combined summary back out to disk. This
+ * callback is registered in load_summary_endio(), and is also called directly by
+ * load_slab_summary() when no read is performed.
+ */
+static void finish_loading_summary(struct vdo_completion *completion)
+{
+ struct slab_depot *depot = completion->vdo->depot;
+
+ /* Combine the summary from each zone so each zone is correct for all slabs. */
+ combine_summaries(depot);
+
+ /* Write the combined summary back out. */
+ vdo_submit_metadata_vio(as_vio(completion), depot->summary_origin,
+ write_summary_endio, handle_combining_error,
+ REQ_OP_WRITE);
+}
+
+/*
+ * Bio completion callback for the slab summary read: continue the vio in
+ * finish_loading_summary() on the admin thread.
+ */
+static void load_summary_endio(struct bio *bio)
+{
+	struct vio *vio = bio->bi_private;
+	const struct thread_config *config = &vio->completion.vdo->thread_config;
+
+	continue_vio_after_io(vio, finish_loading_summary, config->admin_thread);
+}
+
+/**
+ * load_slab_summary() - The preamble of a load operation.
+ * @context: The slab depot, passed as an opaque pointer.
+ * @parent: The completion to notify when the summary has been loaded and rewritten, or on error.
+ *
+ * Creates a vio covering the whole in-memory slab summary. For a format or a load-for-rebuild
+ * the on-disk summary is not read; the in-memory entries are combined and written directly.
+ * Otherwise the summary is read from disk first.
+ *
+ * Implements vdo_action_preamble_fn.
+ */
+static void load_slab_summary(void *context, struct vdo_completion *parent)
+{
+ int result;
+ struct vio *vio;
+ struct slab_depot *depot = context;
+ const struct admin_state_code *operation =
+ vdo_get_current_manager_operation(depot->action_manager);
+
+ /* The vio's data buffer is the depot's summary entry array itself. */
+ result = create_multi_block_metadata_vio(depot->vdo, VIO_TYPE_SLAB_SUMMARY,
+ VIO_PRIORITY_METADATA, parent,
+ VDO_SLAB_SUMMARY_BLOCKS,
+ (char *) depot->summary_entries, &vio);
+ if (result != VDO_SUCCESS) {
+ vdo_fail_completion(parent, result);
+ return;
+ }
+
+ /* Skip the read and go straight to combining and writing. */
+ if ((operation == VDO_ADMIN_STATE_FORMATTING) ||
+ (operation == VDO_ADMIN_STATE_LOADING_FOR_REBUILD)) {
+ finish_loading_summary(&vio->completion);
+ return;
+ }
+
+ vdo_submit_metadata_vio(vio, depot->summary_origin, load_summary_endio,
+ handle_combining_error, REQ_OP_READ);
+}
+
+/*
+ * Start loading one zone's block allocator using the action manager's current operation.
+ *
+ * Implements vdo_zone_action_fn.
+ */
+static void load_allocator(void *context, zone_count_t zone_number,
+			   struct vdo_completion *parent)
+{
+	struct slab_depot *depot = context;
+	struct block_allocator *allocator = &depot->allocators[zone_number];
+	const struct admin_state_code *operation =
+		vdo_get_current_manager_operation(depot->action_manager);
+
+	vdo_start_loading(&allocator->state, operation, parent, initiate_load);
+}
+
+/**
+ * vdo_load_slab_depot() - Asynchronously load the slab depot state which is not stored in the
+ *                         super_block component.
+ * @depot: The depot to load.
+ * @operation: The type of load to perform.
+ * @parent: The completion to notify when the load is complete.
+ * @context: Additional context for the load operation; may be NULL.
+ *
+ * Nothing is scheduled if vdo_assert_load_operation() rejects @operation.
+ *
+ * This method may be called only before entering normal operation from the load thread.
+ */
+void vdo_load_slab_depot(struct slab_depot *depot,
+			 const struct admin_state_code *operation,
+			 struct vdo_completion *parent, void *context)
+{
+	if (vdo_assert_load_operation(operation, parent)) {
+		vdo_schedule_operation_with_context(depot->action_manager, operation,
+						    load_slab_summary, load_allocator,
+						    NULL, context, parent);
+	}
+}
+
+/*
+ * Prepare one zone's slabs for allocation and then start scrubbing them. If preparation
+ * fails, the parent completion is failed and no scrubbing is started.
+ *
+ * Implements vdo_zone_action_fn.
+ */
+static void prepare_to_allocate(void *context, zone_count_t zone_number,
+				struct vdo_completion *parent)
+{
+	struct slab_depot *depot = context;
+	struct block_allocator *allocator = &depot->allocators[zone_number];
+	int result = vdo_prepare_slabs_for_allocation(allocator);
+
+	if (result == VDO_SUCCESS)
+		scrub_slabs(allocator, parent);
+	else
+		vdo_fail_completion(parent, result);
+}
+
+/**
+ * vdo_prepare_slab_depot_to_allocate() - Prepare the slab depot to come online and start
+ * allocating blocks.
+ * @depot: The depot to prepare.
+ * @load_type: The load type.
+ * @parent: The completion to notify when the operation is complete.
+ *
+ * Records the load type, sets the count of zones still to scrub to the depot's zone count, and
+ * schedules prepare_to_allocate() for every zone.
+ *
+ * This method may be called only before entering normal operation from the load thread. It must be
+ * called before allocation may proceed.
+ */
+void vdo_prepare_slab_depot_to_allocate(struct slab_depot *depot,
+ enum slab_depot_load_type load_type,
+ struct vdo_completion *parent)
+{
+ depot->load_type = load_type;
+ atomic_set(&depot->zones_to_scrub, depot->zone_count);
+ /* No preamble or conclusion; only the per-zone action is supplied. */
+ vdo_schedule_action(depot->action_manager, NULL,
+ prepare_to_allocate, NULL, parent);
+}
+
+/**
+ * vdo_update_slab_depot_size() - Update the slab depot to reflect its new size in memory.
+ * @depot: The depot to update.
+ *
+ * This size is saved to disk as part of the super block.
+ *
+ * The new value is taken from depot->new_last_block, which vdo_prepare_to_grow_slab_depot()
+ * records when it prepares a resize.
+ */
+void vdo_update_slab_depot_size(struct slab_depot *depot)
+{
+ depot->last_block = depot->new_last_block;
+}
+
+/**
+ * vdo_prepare_to_grow_slab_depot() - Allocate new memory needed for a resize of a slab depot to
+ * the given size.
+ * @depot: The depot to prepare to resize.
+ * @partition: The new depot partition
+ *
+ * On success the depot's new_size, old_last_block, and new_last_block fields describe the pending
+ * resize; last_block itself is not changed here.
+ *
+ * Return: VDO_SUCCESS or an error. VDO_INCREMENT_TOO_SMALL is returned when the new partition
+ * would not add at least one slab.
+ */
+int vdo_prepare_to_grow_slab_depot(struct slab_depot *depot,
+ const struct partition *partition)
+{
+ struct slab_depot_state_2_0 new_state;
+ int result;
+ slab_count_t new_slab_count;
+
+ /* Cheap rejection: the partition must hold more whole slabs than the depot already has. */
+ if ((partition->count >> depot->slab_size_shift) <= depot->slab_count)
+ return VDO_INCREMENT_TOO_SMALL;
+
+ /* Generate the depot configuration for the new block count. */
+ VDO_ASSERT_LOG_ONLY(depot->first_block == partition->offset,
+ "New slab depot partition doesn't change origin");
+ result = vdo_configure_slab_depot(partition, depot->slab_config,
+ depot->zone_count, &new_state);
+ if (result != VDO_SUCCESS)
+ return result;
+
+ /* Re-check using the slab count derived from the configured last block. */
+ new_slab_count = vdo_compute_slab_count(depot->first_block,
+ new_state.last_block,
+ depot->slab_size_shift);
+ if (new_slab_count <= depot->slab_count)
+ return vdo_log_error_strerror(VDO_INCREMENT_TOO_SMALL,
+ "Depot can only grow");
+ if (new_slab_count == depot->new_slab_count) {
+ /* Check it out, we've already got all the new slabs allocated! */
+ return VDO_SUCCESS;
+ }
+
+ /* Drop any new slabs prepared for a different count before allocating for this one. */
+ vdo_abandon_new_slabs(depot);
+ result = allocate_slabs(depot, new_slab_count);
+ if (result != VDO_SUCCESS) {
+ /* Do not leave a partially prepared set of new slabs behind on failure. */
+ vdo_abandon_new_slabs(depot);
+ return result;
+ }
+
+ depot->new_size = partition->count;
+ depot->old_last_block = depot->last_block;
+ depot->new_last_block = new_state.last_block;
+
+ return VDO_SUCCESS;
+}
+
+/**
+ * finish_registration() - Finish registering new slabs now that all of the allocators have
+ * received their new slabs.
+ * @context: The slab depot, passed as an untyped pointer.
+ *
+ * Publishes the new slab count, frees the old array of slab pointers, and installs new_slabs as
+ * the depot's slab array. The new_slabs and new_slab_count fields are then cleared.
+ *
+ * Implements vdo_action_conclusion_fn.
+ *
+ * Return: VDO_SUCCESS, always.
+ */
+static int finish_registration(void *context)
+{
+ struct slab_depot *depot = context;
+
+ /* WRITE_ONCE pairs with the READ_ONCE of slab_count used when gathering statistics. */
+ WRITE_ONCE(depot->slab_count, depot->new_slab_count);
+ /* Only the old pointer array is freed here; the depot then adopts the larger array. */
+ vdo_free(depot->slabs);
+ depot->slabs = depot->new_slabs;
+ depot->new_slabs = NULL;
+ depot->new_slab_count = 0;
+ return VDO_SUCCESS;
+}
+
+/*
+ * Register with one zone's allocator each newly added slab which that allocator owns, then finish
+ * the parent completion.
+ *
+ * Implements vdo_zone_action_fn.
+ */
+static void register_new_slabs(void *context, zone_count_t zone_number,
+			       struct vdo_completion *parent)
+{
+	struct slab_depot *depot = context;
+	struct block_allocator *target = &depot->allocators[zone_number];
+	slab_count_t slab_number = depot->slab_count;
+
+	/* Only the slabs at indexes [slab_count, new_slab_count) of new_slabs are new. */
+	while (slab_number < depot->new_slab_count) {
+		struct vdo_slab *candidate = depot->new_slabs[slab_number++];
+
+		if (candidate->allocator == target)
+			register_slab_with_allocator(target, candidate);
+	}
+
+	vdo_finish_completion(parent);
+}
+
+/**
+ * vdo_use_new_slabs() - Use the new slabs allocated for resize.
+ * @depot: The depot.
+ * @parent: The object to notify when complete.
+ *
+ * Schedules register_new_slabs() as the per-zone action and finish_registration() as the
+ * conclusion, under the VDO_ADMIN_STATE_SUSPENDED_OPERATION operation. The depot is expected to
+ * have new slabs prepared already; a missing new_slabs array is only logged by the assertion.
+ */
+void vdo_use_new_slabs(struct slab_depot *depot, struct vdo_completion *parent)
+{
+ VDO_ASSERT_LOG_ONLY(depot->new_slabs != NULL, "Must have new slabs to use");
+ vdo_schedule_operation(depot->action_manager,
+ VDO_ADMIN_STATE_SUSPENDED_OPERATION,
+ NULL, register_new_slabs,
+ finish_registration, parent);
+}
+
+/**
+ * stop_scrubbing() - Tell the scrubber to stop scrubbing after it finishes the slab it is
+ * currently working on.
+ * @allocator: The block allocator whose scrubber should stop. The allocator's own completion is
+ * the one notified when scrubbing has stopped.
+ */
+static void stop_scrubbing(struct block_allocator *allocator)
+{
+ struct slab_scrubber *scrubber = &allocator->scrubber;
+
+ if (vdo_is_state_quiescent(&scrubber->admin_state)) {
+ /* The scrubber is already quiescent, so there is nothing to wait for. */
+ vdo_finish_completion(&allocator->completion);
+ } else {
+ /* No initiator is supplied for this drain (the last argument is NULL). */
+ vdo_start_draining(&scrubber->admin_state,
+ VDO_ADMIN_STATE_SUSPENDING,
+ &allocator->completion, NULL);
+ }
+}
+
+/*
+ * Start draining an allocator's slab summary by checking whether the drain is already complete.
+ * The state passed in is the summary_state member of a block_allocator, from which the enclosing
+ * allocator is recovered.
+ *
+ * Implements vdo_admin_initiator_fn.
+ */
+static void initiate_summary_drain(struct admin_state *state)
+{
+ check_summary_drain_complete(container_of(state, struct block_allocator,
+ summary_state));
+}
+
+/*
+ * Perform the next step of draining a block allocator.
+ *
+ * The allocator's completion is re-prepared with this function as its callback (and
+ * handle_operation_error as its error handler) on the allocator's thread, so each step which
+ * finishes that completion comes back here for the following step. The steps run in the order of
+ * enum block_allocator_drain_step: scrubber, slabs, summary, finished.
+ */
+static void do_drain_step(struct vdo_completion *completion)
+{
+ struct block_allocator *allocator = vdo_as_block_allocator(completion);
+
+ vdo_prepare_completion_for_requeue(&allocator->completion, do_drain_step,
+ handle_operation_error, allocator->thread_id,
+ NULL);
+ /* Pre-increment: the first call after initiate_drain() moves from START to STEP_SCRUBBER. */
+ switch (++allocator->drain_step) {
+ case VDO_DRAIN_ALLOCATOR_STEP_SCRUBBER:
+ stop_scrubbing(allocator);
+ return;
+
+ case VDO_DRAIN_ALLOCATOR_STEP_SLABS:
+ apply_to_slabs(allocator, do_drain_step);
+ return;
+
+ case VDO_DRAIN_ALLOCATOR_STEP_SUMMARY:
+ /* The summary is drained with the same state code as the allocator itself. */
+ vdo_start_draining(&allocator->summary_state,
+ vdo_get_admin_state_code(&allocator->state),
+ completion, initiate_summary_drain);
+ return;
+
+ case VDO_DRAIN_ALLOCATOR_STEP_FINISHED:
+ VDO_ASSERT_LOG_ONLY(!is_vio_pool_busy(allocator->vio_pool),
+ "vio pool not busy");
+ vdo_finish_draining_with_result(&allocator->state, completion->result);
+ return;
+
+ default:
+ /* drain_step went past STEP_FINISHED; end the drain with an error. */
+ vdo_finish_draining_with_result(&allocator->state, UDS_BAD_STATE);
+ }
+}
+
+/*
+ * Begin draining an allocator: reset its drain step to the start and run the first step. The
+ * state passed in is the state member of a block_allocator.
+ *
+ * Implements vdo_admin_initiator_fn.
+ */
+static void initiate_drain(struct admin_state *state)
+{
+ struct block_allocator *allocator =
+ container_of(state, struct block_allocator, state);
+
+ allocator->drain_step = VDO_DRAIN_ALLOCATOR_START;
+ do_drain_step(&allocator->completion);
+}
+
+/*
+ * Start draining all of the I/O of a single zone's allocator. The kind of drain is whatever
+ * operation the depot's action manager is currently performing; depending on that kind, some or
+ * all dirty metadata may be written to disk.
+ *
+ * Implements vdo_zone_action_fn.
+ */
+static void drain_allocator(void *context, zone_count_t zone_number,
+			    struct vdo_completion *parent)
+{
+	struct slab_depot *depot = context;
+	struct block_allocator *allocator = &depot->allocators[zone_number];
+
+	vdo_start_draining(&allocator->state,
+			   vdo_get_current_manager_operation(depot->action_manager),
+			   parent, initiate_drain);
+}
+
+/**
+ * vdo_drain_slab_depot() - Drain all slab depot I/O.
+ * @depot: The depot to drain.
+ * @operation: The drain operation (flush, rebuild, suspend, or save).
+ * @parent: The completion to finish when the drain is complete.
+ *
+ * If saving, or flushing, all dirty depot metadata will be written out. If saving or suspending,
+ * the depot will be left in a suspended state.
+ *
+ * The work is done per zone by drain_allocator(), scheduled through the depot's action manager.
+ */
+void vdo_drain_slab_depot(struct slab_depot *depot,
+ const struct admin_state_code *operation,
+ struct vdo_completion *parent)
+{
+ vdo_schedule_operation(depot->action_manager, operation,
+ NULL, drain_allocator, NULL, parent);
+}
+
+/**
+ * resume_scrubbing() - Tell the scrubber to resume scrubbing if it has been stopped.
+ * @allocator: The allocator being resumed.
+ *
+ * The allocator's completion is finished once scrubbing has been restarted or if there is nothing
+ * to scrub; it is failed instead if the scrubber's admin state cannot be resumed.
+ */
+static void resume_scrubbing(struct block_allocator *allocator)
+{
+	struct slab_scrubber *scrubber = &allocator->scrubber;
+	struct vdo_completion *completion = &allocator->completion;
+	int result;
+
+	if (has_slabs_to_scrub(scrubber)) {
+		result = vdo_resume_if_quiescent(&scrubber->admin_state);
+		if (result != VDO_SUCCESS) {
+			vdo_fail_completion(completion, result);
+			return;
+		}
+
+		scrub_next_slab(scrubber);
+	}
+
+	vdo_finish_completion(completion);
+}
+
+/*
+ * Perform the next step of resuming a block allocator.
+ *
+ * This mirrors do_drain_step(): the allocator's completion is re-prepared with this function as
+ * its callback, and the steps of enum block_allocator_drain_step are walked in reverse order
+ * (summary, slabs, scrubber) until the start value is reached.
+ */
+static void do_resume_step(struct vdo_completion *completion)
+{
+ struct block_allocator *allocator = vdo_as_block_allocator(completion);
+
+ vdo_prepare_completion_for_requeue(&allocator->completion, do_resume_step,
+ handle_operation_error,
+ allocator->thread_id, NULL);
+ /* Pre-decrement: the first call after initiate_resume() moves from FINISHED to STEP_SUMMARY. */
+ switch (--allocator->drain_step) {
+ case VDO_DRAIN_ALLOCATOR_STEP_SUMMARY:
+ /*
+ * NOTE(review): this call is made whether or not the resume succeeded; presumably
+ * vdo_fail_completion() with VDO_SUCCESS just continues to the next step -- confirm.
+ */
+ vdo_fail_completion(completion,
+ vdo_resume_if_quiescent(&allocator->summary_state));
+ return;
+
+ case VDO_DRAIN_ALLOCATOR_STEP_SLABS:
+ apply_to_slabs(allocator, do_resume_step);
+ return;
+
+ case VDO_DRAIN_ALLOCATOR_STEP_SCRUBBER:
+ resume_scrubbing(allocator);
+ return;
+
+ case VDO_DRAIN_ALLOCATOR_START:
+ vdo_finish_resuming_with_result(&allocator->state, completion->result);
+ return;
+
+ default:
+ /* drain_step is outside the known steps; end the resume with an error. */
+ vdo_finish_resuming_with_result(&allocator->state, UDS_BAD_STATE);
+ }
+}
+
+/*
+ * Begin resuming an allocator. The drain step is set to STEP_FINISHED so that the pre-decrement
+ * in do_resume_step() selects STEP_SUMMARY first. The state passed in is the state member of a
+ * block_allocator.
+ *
+ * Implements vdo_admin_initiator_fn.
+ */
+static void initiate_resume(struct admin_state *state)
+{
+ struct block_allocator *allocator =
+ container_of(state, struct block_allocator, state);
+
+ allocator->drain_step = VDO_DRAIN_ALLOCATOR_STEP_FINISHED;
+ do_resume_step(&allocator->completion);
+}
+
+/*
+ * Start resuming a single zone's allocator, using the operation which the depot's action manager
+ * is currently performing.
+ *
+ * Implements vdo_zone_action_fn.
+ */
+static void resume_allocator(void *context, zone_count_t zone_number,
+			     struct vdo_completion *parent)
+{
+	struct slab_depot *depot = context;
+	struct block_allocator *allocator = &depot->allocators[zone_number];
+
+	vdo_start_resuming(&allocator->state,
+			   vdo_get_current_manager_operation(depot->action_manager),
+			   parent, initiate_resume);
+}
+
+/**
+ * vdo_resume_slab_depot() - Resume a suspended slab depot.
+ * @depot: The depot to resume.
+ * @parent: The completion to finish when the depot has resumed.
+ *
+ * If the vdo is read-only, nothing is resumed and @parent is continued with VDO_READ_ONLY.
+ */
+void vdo_resume_slab_depot(struct slab_depot *depot, struct vdo_completion *parent)
+{
+	if (!vdo_is_read_only(depot->vdo)) {
+		vdo_schedule_operation(depot->action_manager, VDO_ADMIN_STATE_RESUMING,
+				       NULL, resume_allocator, NULL, parent);
+		return;
+	}
+
+	vdo_continue_completion(parent, VDO_READ_ONLY);
+}
+
+/**
+ * vdo_commit_oldest_slab_journal_tail_blocks() - Commit all dirty tail blocks which are locking a
+ *                                                given recovery journal block.
+ * @depot: The depot; a NULL depot makes this call a no-op.
+ * @recovery_block_number: The sequence number of the recovery journal block whose locks should be
+ *                         released.
+ *
+ * The request is recorded in the depot and the action manager's default action is scheduled.
+ *
+ * Context: This method must be called from the journal zone thread.
+ */
+void vdo_commit_oldest_slab_journal_tail_blocks(struct slab_depot *depot,
+						sequence_number_t recovery_block_number)
+{
+	if (depot != NULL) {
+		depot->new_release_request = recovery_block_number;
+		vdo_schedule_default_action(depot->action_manager);
+	}
+}
+
+/*
+ * Launch scrubbing for one zone's allocator. No parent is given to scrub_slabs(); the zone
+ * action's parent is launched immediately afterwards rather than being handed to the scrubber.
+ *
+ * Implements vdo_zone_action_fn.
+ */
+static void scrub_all_unrecovered_slabs(void *context, zone_count_t zone_number,
+ struct vdo_completion *parent)
+{
+ struct slab_depot *depot = context;
+
+ scrub_slabs(&depot->allocators[zone_number], NULL);
+ vdo_launch_completion(parent);
+}
+
+/**
+ * vdo_scrub_all_unrecovered_slabs() - Scrub all unrecovered slabs.
+ * @depot: The depot to scrub.
+ * @parent: The object to notify when scrubbing has been launched for all zones.
+ *
+ * The per-zone work is done by scrub_all_unrecovered_slabs(), scheduled through the depot's
+ * action manager.
+ */
+void vdo_scrub_all_unrecovered_slabs(struct slab_depot *depot,
+ struct vdo_completion *parent)
+{
+ vdo_schedule_action(depot->action_manager, NULL,
+ scrub_all_unrecovered_slabs,
+ NULL, parent);
+}
+
+/**
+ * get_block_allocator_statistics() - Sum the block allocator statistics of every zone in the
+ *                                    depot.
+ * @depot: The slab depot.
+ *
+ * Return: The totals of slab_count, slabs_opened, and slabs_reopened across all of the depot's
+ *         block allocators.
+ */
+static struct block_allocator_statistics __must_check
+get_block_allocator_statistics(const struct slab_depot *depot)
+{
+	struct block_allocator_statistics totals;
+	const struct block_allocator *allocator = depot->allocators;
+	zone_count_t remaining;
+
+	memset(&totals, 0, sizeof(totals));
+
+	/* Walk the allocator array once, one entry per zone. */
+	for (remaining = depot->zone_count; remaining > 0; remaining--, allocator++) {
+		totals.slab_count += allocator->slab_count;
+		totals.slabs_opened += READ_ONCE(allocator->statistics.slabs_opened);
+		totals.slabs_reopened += READ_ONCE(allocator->statistics.slabs_reopened);
+	}
+
+	return totals;
+}
+
+/**
+ * get_ref_counts_statistics() - Sum the ref_counts statistics of every zone in the depot.
+ * @depot: The slab depot.
+ *
+ * Return: The cumulative statistics for all ref_counts in the depot.
+ */
+static struct ref_counts_statistics __must_check
+get_ref_counts_statistics(const struct slab_depot *depot)
+{
+	struct ref_counts_statistics totals;
+	zone_count_t zone = 0;
+
+	memset(&totals, 0, sizeof(totals));
+
+	while (zone < depot->zone_count) {
+		const struct block_allocator *allocator = &depot->allocators[zone++];
+
+		totals.blocks_written +=
+			READ_ONCE(allocator->ref_counts_statistics.blocks_written);
+	}
+
+	return totals;
+}
+
+/**
+ * get_slab_journal_statistics() - Sum the slab journal statistics of every zone in the depot.
+ * @depot: The slab depot.
+ *
+ * Return: The aggregated statistics for all slab journals in the depot.
+ */
+static struct slab_journal_statistics __must_check
+get_slab_journal_statistics(const struct slab_depot *depot)
+{
+	struct slab_journal_statistics sum;
+	zone_count_t index;
+
+	memset(&sum, 0, sizeof(sum));
+
+	for (index = 0; index < depot->zone_count; index++) {
+		const struct block_allocator *allocator = &depot->allocators[index];
+		const struct slab_journal_statistics *zone_stats =
+			&allocator->slab_journal_statistics;
+
+		sum.disk_full_count += READ_ONCE(zone_stats->disk_full_count);
+		sum.flush_count += READ_ONCE(zone_stats->flush_count);
+		sum.blocked_count += READ_ONCE(zone_stats->blocked_count);
+		sum.blocks_written += READ_ONCE(zone_stats->blocks_written);
+		sum.tail_busy_count += READ_ONCE(zone_stats->tail_busy_count);
+	}
+
+	return sum;
+}
+
+/**
+ * vdo_get_slab_depot_statistics() - Get all the vdo_statistics fields that are properties of the
+ * slab depot.
+ * @depot: The slab depot.
+ * @stats: The vdo statistics structure to partially fill.
+ *
+ * Fills in recovery_percentage, allocator, ref_counts, slab_journal, and slab_summary.
+ */
+void vdo_get_slab_depot_statistics(const struct slab_depot *depot,
+ struct vdo_statistics *stats)
+{
+ slab_count_t slab_count = READ_ONCE(depot->slab_count);
+ slab_count_t unrecovered = 0;
+ zone_count_t zone;
+
+ for (zone = 0; zone < depot->zone_count; zone++) {
+ /* The allocators are responsible for thread safety. */
+ unrecovered += READ_ONCE(depot->allocators[zone].scrubber.slab_count);
+ }
+
+ /*
+ * The percentage of slabs which are not awaiting or undergoing scrubbing.
+ * NOTE(review): this divides by slab_count with no guard; presumably a depot always has at
+ * least one slab by the time statistics are gathered -- confirm.
+ */
+ stats->recovery_percentage = (slab_count - unrecovered) * 100 / slab_count;
+ stats->allocator = get_block_allocator_statistics(depot);
+ stats->ref_counts = get_ref_counts_statistics(depot);
+ stats->slab_journal = get_slab_journal_statistics(depot);
+ stats->slab_summary = (struct slab_summary_statistics) {
+ .blocks_written = atomic64_read(&depot->summary_statistics.blocks_written),
+ };
+}
+
+/**
+ * vdo_dump_slab_depot() - Dump the slab depot, in a thread-unsafe fashion.
+ * @depot: The slab depot.
+ *
+ * Logs the zone counts, the slab count, and the active and new release requests at info level.
+ */
+void vdo_dump_slab_depot(const struct slab_depot *depot)
+{
+ vdo_log_info("vdo slab depot");
+ /* The casts make each argument match its %u or %llu conversion. */
+ vdo_log_info(" zone_count=%u old_zone_count=%u slabCount=%u active_release_request=%llu new_release_request=%llu",
+ (unsigned int) depot->zone_count,
+ (unsigned int) depot->old_zone_count, READ_ONCE(depot->slab_count),
+ (unsigned long long) depot->active_release_request,
+ (unsigned long long) depot->new_release_request);
+}
diff --git a/drivers/md/dm-vdo/slab-depot.h b/drivers/md/dm-vdo/slab-depot.h
new file mode 100644
index 000000000000..f234853501ca
--- /dev/null
+++ b/drivers/md/dm-vdo/slab-depot.h
@@ -0,0 +1,601 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#ifndef VDO_SLAB_DEPOT_H
+#define VDO_SLAB_DEPOT_H
+
+#include <linux/atomic.h>
+#include <linux/dm-kcopyd.h>
+#include <linux/list.h>
+
+#include "numeric.h"
+
+#include "admin-state.h"
+#include "completion.h"
+#include "data-vio.h"
+#include "encodings.h"
+#include "physical-zone.h"
+#include "priority-table.h"
+#include "recovery-journal.h"
+#include "statistics.h"
+#include "types.h"
+#include "vio.h"
+#include "wait-queue.h"
+
+/*
+ * A slab_depot is responsible for managing all of the slabs and block allocators of a VDO. It has
+ * a single array of slabs in order to eliminate the need for additional math in order to compute
+ * which physical zone a PBN is in. It also has a block_allocator per zone.
+ *
+ * Each physical zone has a single dedicated queue and thread for performing all updates to the
+ * slabs assigned to that zone. The concurrency guarantees of this single-threaded model allow the
+ * code to omit more fine-grained locking for the various slab structures. Each physical zone
+ * maintains a separate copy of the slab summary to remove the need for explicit locking on that
+ * structure as well.
+ *
+ * Load operations must be performed on the admin thread. Normal operations, such as allocations
+ * and reference count updates, must be performed on the appropriate physical zone thread. Requests
+ * from the recovery journal to commit slab journal tail blocks must be scheduled from the recovery
+ * journal thread to run on the appropriate physical zone thread. Save operations must be launched
+ * from the same admin thread as the original load operation.
+ */
+
+/* Sizing constants for the block allocator. */
+enum {
+ /*
+ * The number of vios in the vio pool is proportional to the throughput of the VDO.
+ * Presumably this is the size of each block_allocator's vio_pool -- confirm where the pool
+ * is created.
+ */
+ BLOCK_ALLOCATOR_VIO_POOL_SIZE = 128,
+};
+
+/*
+ * Represents the possible status of a block, in terms of how many references it has.
+ */
+enum reference_status {
+ RS_FREE, /* this block is free */
+ RS_SINGLE, /* this block is singly-referenced */
+ RS_SHARED, /* this block is shared */
+ RS_PROVISIONAL /* this block is provisionally allocated */
+};
+
+struct vdo_slab;
+
+/*
+ * A lock on a block of a slab journal. A slab_journal points to an array of these (one for each
+ * on-disk block) through its locks member, and to the one for its oldest unreaped block through
+ * its reap_lock member.
+ */
+struct journal_lock {
+ /* Presumably the number of holders of this lock -- TODO confirm */
+ u16 count;
+ /* Presumably the recovery journal sequence number associated with this lock -- TODO confirm */
+ sequence_number_t recovery_start;
+};
+
+/* The in-memory state of the journal of a single slab; embedded in struct vdo_slab. */
+struct slab_journal {
+ /* A waiter object for getting a VIO pool entry */
+ struct vdo_waiter resource_waiter;
+ /* A waiter object for updating the slab summary */
+ struct vdo_waiter slab_summary_waiter;
+ /* A waiter object for getting a vio with which to flush */
+ struct vdo_waiter flush_waiter;
+ /* The queue of VIOs waiting to make an entry */
+ struct vdo_wait_queue entry_waiters;
+ /* The parent slab reference of this journal */
+ struct vdo_slab *slab;
+
+ /* Whether a tail block commit is pending */
+ bool waiting_to_commit;
+ /* Whether the journal is updating the slab summary */
+ bool updating_slab_summary;
+ /* Whether the journal is adding entries from the entry_waiters queue */
+ bool adding_entries;
+ /* Whether a partial write is in progress */
+ bool partial_write_in_progress;
+
+ /* The oldest block in the journal on disk */
+ sequence_number_t head;
+ /* The oldest block in the journal which may not be reaped */
+ sequence_number_t unreapable;
+ /* The end of the half-open interval of the active journal */
+ sequence_number_t tail;
+ /* The next journal block to be committed */
+ sequence_number_t next_commit;
+ /* The tail sequence number that is written in the slab summary */
+ sequence_number_t summarized;
+ /* The tail sequence number that was last summarized in slab summary */
+ sequence_number_t last_summarized;
+
+ /* The sequence number of the recovery journal lock */
+ sequence_number_t recovery_lock;
+
+ /*
+ * The number of entries which fit in a single block. Can't use the constant because unit
+ * tests change this number.
+ */
+ journal_entry_count_t entries_per_block;
+ /*
+ * The number of full entries which fit in a single block. Can't use the constant because
+ * unit tests change this number.
+ */
+ journal_entry_count_t full_entries_per_block;
+
+ /* The recovery journal of the VDO (slab journal holds locks on it) */
+ struct recovery_journal *recovery_journal;
+
+ /* The statistics shared by all slab journals in our physical zone */
+ struct slab_journal_statistics *events;
+ /* A list of the VIO pool entries for outstanding journal block writes */
+ struct list_head uncommitted_blocks;
+
+ /*
+ * The current tail block header state. This will be packed into the block just before it
+ * is written.
+ */
+ struct slab_journal_block_header tail_header;
+ /* A pointer to a block-sized buffer holding the packed block data */
+ struct packed_slab_journal_block *block;
+
+ /* The number of blocks in the on-disk journal */
+ block_count_t size;
+ /* The number of blocks at which to start pushing reference blocks */
+ block_count_t flushing_threshold;
+ /* The number of blocks at which all reference blocks should be writing */
+ block_count_t flushing_deadline;
+ /* The number of blocks at which to wait for reference blocks to write */
+ block_count_t blocking_threshold;
+ /* The number of blocks at which to scrub the slab before coming online */
+ block_count_t scrubbing_threshold;
+
+ /* This list entry is for block_allocator to keep a queue of dirty journals */
+ struct list_head dirty_entry;
+
+ /* The lock for the oldest unreaped block of the journal */
+ struct journal_lock *reap_lock;
+ /* The locks for each on disk block */
+ struct journal_lock *locks;
+};
+
+/*
+ * Reference_block structure
+ *
+ * Blocks are used as a proxy, permitting saves of partial refcounts. A slab holds an array of
+ * these in its reference_blocks member.
+ */
+struct reference_block {
+ /* This block waits on the ref_counts to tell it to write */
+ struct vdo_waiter waiter;
+ /* The slab to which this reference_block belongs */
+ struct vdo_slab *slab;
+ /* The number of references in this block that represent allocations */
+ block_size_t allocated_count;
+ /* The slab journal block on which this block must hold a lock */
+ sequence_number_t slab_journal_lock;
+ /* The slab journal block which should be released when this block is committed */
+ sequence_number_t slab_journal_lock_to_release;
+ /* The point up to which each sector is accurate on disk (one entry per sector) */
+ struct journal_point commit_points[VDO_SECTORS_PER_BLOCK];
+ /* Whether this block has been modified since it was written to disk */
+ bool is_dirty;
+ /* Whether this block is currently writing */
+ bool is_writing;
+};
+
+/*
+ * The search_cursor represents the saved position of a free block search. Each vdo_slab embeds
+ * one as its search_cursor member.
+ */
+struct search_cursor {
+ /* The reference block containing the current search index */
+ struct reference_block *block;
+ /* The position at which to start searching for the next free counter */
+ slab_block_number index;
+ /* The position just past the last valid counter in the current block */
+ slab_block_number end_index;
+
+ /* A pointer to the first reference block in the slab */
+ struct reference_block *first_block;
+ /* A pointer to the last reference block in the slab */
+ struct reference_block *last_block;
+};
+
+/*
+ * The rebuild status of a slab, stored in the status member of struct vdo_slab.
+ * NOTE(review): the precise meaning of each value is not documented here; the names suggest
+ * stages of recovery and scrubbing -- confirm against the code which sets them.
+ */
+enum slab_rebuild_status {
+ VDO_SLAB_REBUILT,
+ VDO_SLAB_REPLAYING,
+ VDO_SLAB_REQUIRES_SCRUBBING,
+ VDO_SLAB_REQUIRES_HIGH_PRIORITY_SCRUBBING,
+ VDO_SLAB_REBUILDING,
+};
+
+/*
+ * This is the type declaration for the vdo_slab type. A vdo_slab currently consists of a run of
+ * 2^23 data blocks, but that will soon change to dedicate a small number of those blocks for
+ * metadata storage for the reference counts and slab journal for the slab.
+ *
+ * NOTE(review): the "will soon change" sentence may be stale, since this structure already
+ * records journal_origin and ref_counts_origin -- confirm and reword.
+ *
+ * A reference count is maintained for each physical block number. The vast majority of blocks have
+ * a very small reference count (usually 0 or 1). For references less than or equal to MAXIMUM_REFS
+ * (254) the reference count is stored in counters[pbn].
+ */
+struct vdo_slab {
+ /* A list entry to queue this slab in a block_allocator list */
+ struct list_head allocq_entry;
+
+ /* The struct block_allocator that owns this slab */
+ struct block_allocator *allocator;
+
+ /* The journal for this slab */
+ struct slab_journal journal;
+
+ /* The slab number of this slab */
+ slab_count_t slab_number;
+ /* The offset in the allocator partition of the first block in this slab */
+ physical_block_number_t start;
+ /* The offset of the first block past the end of this slab */
+ physical_block_number_t end;
+ /* The starting translated PBN of the slab journal */
+ physical_block_number_t journal_origin;
+ /* The starting translated PBN of the reference counts */
+ physical_block_number_t ref_counts_origin;
+
+ /* The administrative state of the slab */
+ struct admin_state state;
+ /* The status of the slab */
+ enum slab_rebuild_status status;
+ /* Whether the slab was ever queued for scrubbing */
+ bool was_queued_for_scrubbing;
+
+ /* The priority at which this slab has been queued for allocation */
+ u8 priority;
+
+ /* Fields beyond this point are the reference counts for the data blocks in this slab. */
+ /* The size of the counters array */
+ u32 block_count;
+ /* The number of free blocks */
+ u32 free_blocks;
+ /* The array of reference counts */
+ vdo_refcount_t *counters; /* use vdo_allocate() to align data ptr */
+
+ /* The saved block pointer and array indexes for the free block search */
+ struct search_cursor search_cursor;
+
+ /* A list of the dirty blocks waiting to be written out */
+ struct vdo_wait_queue dirty_blocks;
+ /* The number of blocks which are currently writing */
+ size_t active_count;
+
+ /* A waiter object for updating the slab summary */
+ struct vdo_waiter summary_waiter;
+
+ /* The latest slab journal for which there has been a reference count update */
+ struct journal_point slab_journal_point;
+
+ /* The number of reference count blocks */
+ u32 reference_block_count;
+ /* reference count block array */
+ struct reference_block *reference_blocks;
+};
+
+/*
+ * The steps of draining a block allocator, in the order they are performed. The order of these
+ * values matters: do_drain_step() in slab-depot.c advances block_allocator.drain_step by
+ * incrementing it, and do_resume_step() walks the same values backwards by decrementing it.
+ */
+enum block_allocator_drain_step {
+ VDO_DRAIN_ALLOCATOR_START,
+ VDO_DRAIN_ALLOCATOR_STEP_SCRUBBER,
+ VDO_DRAIN_ALLOCATOR_STEP_SLABS,
+ VDO_DRAIN_ALLOCATOR_STEP_SUMMARY,
+ VDO_DRAIN_ALLOCATOR_STEP_FINISHED,
+};
+
+/* The state of slab scrubbing for one block allocator; embedded in struct block_allocator. */
+struct slab_scrubber {
+ /* The queue of slabs to scrub first */
+ struct list_head high_priority_slabs;
+ /* The queue of slabs to scrub once there are no high_priority_slabs */
+ struct list_head slabs;
+ /* The queue of VIOs waiting for a slab to be scrubbed */
+ struct vdo_wait_queue waiters;
+
+ /*
+ * The number of slabs that are unrecovered or being scrubbed. This field is modified by
+ * the physical zone thread, but is queried by other threads.
+ */
+ slab_count_t slab_count;
+
+ /* The administrative state of the scrubber */
+ struct admin_state admin_state;
+ /* Whether to only scrub high-priority slabs */
+ bool high_priority_only;
+ /* The slab currently being scrubbed */
+ struct vdo_slab *slab;
+ /* The vio for loading slab journal blocks */
+ struct vio vio;
+};
+
+/*
+ * A sub-structure for applying actions in parallel to all an allocator's slabs. Embedded in
+ * struct block_allocator as its slab_actor member.
+ */
+struct slab_actor {
+ /* The number of slabs performing a slab action */
+ slab_count_t slab_action_count;
+ /* The method to call when a slab action has been completed by all slabs */
+ vdo_action_fn callback;
+};
+
+/*
+ * A slab_iterator is a structure for iterating over a set of slabs.
+ * NOTE(review): the field meanings below are inferred from their names and types only -- confirm
+ * against the iterator's implementation.
+ */
+struct slab_iterator {
+ /* Presumably the array of slab pointers being iterated over */
+ struct vdo_slab **slabs;
+ /* Presumably the next slab the iterator will yield */
+ struct vdo_slab *next;
+ /* Presumably the slab number at which iteration stops */
+ slab_count_t end;
+ /* Presumably the distance in slab numbers between successive slabs */
+ slab_count_t stride;
+};
+
+/*
+ * The slab_summary provides hints during load and recovery about the state of the slabs in order
+ * to avoid the need to read the slab journals in their entirety before a VDO can come online.
+ *
+ * The information in the summary for each slab includes the rough number of free blocks (which is
+ * used to prioritize scrubbing), the cleanliness of a slab (so that clean slabs containing free
+ * space will be used on restart), and the location of the tail block of the slab's journal.
+ *
+ * The slab_summary has its own partition at the end of the volume which is sized to allow for a
+ * complete copy of the summary for each of up to 16 physical zones.
+ *
+ * During resize, the slab_summary moves its backing partition and is saved once moved; the
+ * slab_summary is not permitted to overwrite the previous recovery journal space.
+ *
+ * The slab_summary does not have its own version information, but relies on the VDO volume version
+ * number.
+ */
+
+/*
+ * A slab status is a very small structure for use in determining the ordering of slabs in the
+ * scrubbing process.
+ */
+struct slab_status {
+ /* The number of the slab this status describes */
+ slab_count_t slab_number;
+ /* Presumably whether the slab summary recorded the slab as clean -- TODO confirm */
+ bool is_clean;
+ /* Presumably a hint of how empty the slab is, used for ordering -- TODO confirm */
+ u8 emptiness;
+};
+
+/* One block of an allocator's portion of the slab summary, with the vio used to write it. */
+struct slab_summary_block {
+ /* The block_allocator to which this block belongs */
+ struct block_allocator *allocator;
+ /* The index of this block in its zone's summary */
+ block_count_t index;
+ /* Whether this block has a write outstanding */
+ bool writing;
+ /* Queue of updates waiting on the outstanding write */
+ struct vdo_wait_queue current_update_waiters;
+ /* Queue of updates waiting on the next write */
+ struct vdo_wait_queue next_update_waiters;
+ /* The active slab_summary_entry array for this block */
+ struct slab_summary_entry *entries;
+ /* The vio used to write this block */
+ struct vio vio;
+ /* The packed entries, one block long, backing the vio */
+ char *outgoing_entries;
+};
+
+/*
+ * The statistics for all the slab summary zones owned by this slab summary. These fields are all
+ * mutated only by their physical zone threads, but are read by other threads when gathering
+ * statistics for the entire depot. The slab_depot holds one instance as summary_statistics.
+ */
+struct atomic_slab_summary_statistics {
+ /* Number of blocks written */
+ atomic64_t blocks_written;
+};
+
+/*
+ * The per-zone state of the slab depot: one block_allocator exists for each physical zone, stored
+ * in the allocators[] array at the end of struct slab_depot.
+ */
+struct block_allocator {
+ /* The completion for this allocator; see vdo_as_block_allocator() for the reverse mapping */
+ struct vdo_completion completion;
+ /* The slab depot for this allocator */
+ struct slab_depot *depot;
+ /* The nonce of the VDO */
+ nonce_t nonce;
+ /* The physical zone number of this allocator */
+ zone_count_t zone_number;
+ /* The thread ID for this allocator's physical zone */
+ thread_id_t thread_id;
+ /* The number of slabs in this allocator */
+ slab_count_t slab_count;
+ /* The number of the last slab owned by this allocator */
+ slab_count_t last_slab;
+ /* The reduced priority level used to preserve unopened slabs */
+ unsigned int unopened_slab_priority;
+ /* The state of this allocator */
+ struct admin_state state;
+ /* The actor for applying an action to all slabs */
+ struct slab_actor slab_actor;
+
+ /* The slab from which blocks are currently being allocated */
+ struct vdo_slab *open_slab;
+ /* A priority queue containing all slabs available for allocation */
+ struct priority_table *prioritized_slabs;
+ /* The slab scrubber */
+ struct slab_scrubber scrubber;
+ /* What phase of the close operation the allocator is to perform */
+ enum block_allocator_drain_step drain_step;
+
+ /*
+ * These statistics are all mutated only by the physical zone thread, but are read by other
+ * threads when gathering statistics for the entire depot.
+ */
+ /*
+ * The count of allocated blocks in this zone. Not in block_allocator_statistics for
+ * historical reasons.
+ */
+ u64 allocated_blocks;
+ /* Statistics for this block allocator */
+ struct block_allocator_statistics statistics;
+ /* Cumulative statistics for the slab journals in this zone */
+ struct slab_journal_statistics slab_journal_statistics;
+ /* Cumulative statistics for the reference counters in this zone */
+ struct ref_counts_statistics ref_counts_statistics;
+
+ /*
+ * This is the head of a queue of slab journals which have entries in their tail blocks
+ * which have not yet started to commit. When the recovery journal is under space pressure,
+ * slab journals which have uncommitted entries holding a lock on the recovery journal head
+ * are forced to commit their blocks early. This list is kept in order, with the tail
+ * containing the slab journal holding the most recent recovery journal lock.
+ */
+ struct list_head dirty_slab_journals;
+
+ /* The vio pool for reading and writing block allocator metadata */
+ struct vio_pool *vio_pool;
+ /* The dm_kcopyd client for erasing slab journals */
+ struct dm_kcopyd_client *eraser;
+ /* Iterator over the slabs to be erased */
+ struct slab_iterator slabs_to_erase;
+
+ /* The portion of the slab summary managed by this allocator */
+ /* The state of the slab summary */
+ struct admin_state summary_state;
+ /* The number of outstanding summary writes */
+ block_count_t summary_write_count;
+ /* The array (owned by the blocks) of all entries */
+ struct slab_summary_entry *summary_entries;
+ /* The array of slab_summary_blocks */
+ struct slab_summary_block *summary_blocks;
+};
+
+/*
+ * The kind of load being performed on a slab depot. The value is stored in slab_depot.load_type by
+ * vdo_prepare_slab_depot_to_allocate() and determines how slabs should be queued during load.
+ */
+enum slab_depot_load_type {
+ VDO_SLAB_DEPOT_NORMAL_LOAD,
+ VDO_SLAB_DEPOT_RECOVERY_LOAD,
+ VDO_SLAB_DEPOT_REBUILD_LOAD
+};
+
+/*
+ * The slab depot itself: the slabs of a VDO plus one block_allocator per zone. The allocators are
+ * stored in a flexible array at the end of the structure, indexed by zone number.
+ */
+struct slab_depot {
+ /* The number of zones, which is also the number of entries in allocators[] */
+ zone_count_t zone_count;
+ /* Presumably the zone count from before the most recent load -- TODO confirm */
+ zone_count_t old_zone_count;
+ /* The vdo to which this depot belongs */
+ struct vdo *vdo;
+ /* The slab configuration, used when configuring the depot for a new partition size */
+ struct slab_config slab_config;
+ /* The action manager through which per-zone actions and operations are scheduled */
+ struct action_manager *action_manager;
+
+ /* The first block of the depot; expected to equal the depot partition's offset */
+ physical_block_number_t first_block;
+ /* The current last block of the depot; updated by vdo_update_slab_depot_size() */
+ physical_block_number_t last_block;
+ /* NOTE(review): not used in the code visible here; presumably a base PBN -- confirm */
+ physical_block_number_t origin;
+
+ /* slab_size == (1 << slab_size_shift) */
+ unsigned int slab_size_shift;
+
+ /* Determines how slabs should be queued during load */
+ enum slab_depot_load_type load_type;
+
+ /* The state for notifying slab journals to release recovery journal */
+ sequence_number_t active_release_request;
+ sequence_number_t new_release_request;
+
+ /* State variables for scrubbing complete handling */
+ atomic_t zones_to_scrub;
+
+ /* Array of pointers to individually allocated slabs */
+ struct vdo_slab **slabs;
+ /* The number of slabs currently allocated and stored in 'slabs' */
+ slab_count_t slab_count;
+
+ /* Array of pointers to a larger set of slabs (used during resize) */
+ struct vdo_slab **new_slabs;
+ /* The number of slabs currently allocated and stored in 'new_slabs' */
+ slab_count_t new_slab_count;
+ /* The size that 'new_slabs' was allocated for */
+ block_count_t new_size;
+
+ /* The last block before resize, for rollback */
+ physical_block_number_t old_last_block;
+ /* The last block after resize, for resize */
+ physical_block_number_t new_last_block;
+
+ /* The statistics for the slab summary */
+ struct atomic_slab_summary_statistics summary_statistics;
+ /* The start of the slab summary partition */
+ physical_block_number_t summary_origin;
+ /* The number of bits to shift to get a 7-bit fullness hint */
+ unsigned int hint_shift;
+ /* The slab summary entries for all of the zones the partition can hold */
+ struct slab_summary_entry *summary_entries;
+
+ /* The block allocators for this depot */
+ struct block_allocator allocators[];
+};
+
+struct reference_updater;
+
+bool __must_check vdo_attempt_replay_into_slab(struct vdo_slab *slab,
+ physical_block_number_t pbn,
+ enum journal_operation operation,
+ bool increment,
+ struct journal_point *recovery_point,
+ struct vdo_completion *parent);
+
+int __must_check vdo_adjust_reference_count_for_rebuild(struct slab_depot *depot,
+ physical_block_number_t pbn,
+ enum journal_operation operation);
+
+/*
+ * Convert a generic completion into the block_allocator which embeds it, after asserting that
+ * the completion is of type VDO_BLOCK_ALLOCATOR_COMPLETION.
+ */
+static inline struct block_allocator *vdo_as_block_allocator(struct vdo_completion *completion)
+{
+	struct block_allocator *allocator;
+
+	vdo_assert_completion_type(completion, VDO_BLOCK_ALLOCATOR_COMPLETION);
+	allocator = container_of(completion, struct block_allocator, completion);
+	return allocator;
+}
+
+int __must_check vdo_acquire_provisional_reference(struct vdo_slab *slab,
+ physical_block_number_t pbn,
+ struct pbn_lock *lock);
+
+int __must_check vdo_allocate_block(struct block_allocator *allocator,
+ physical_block_number_t *block_number_ptr);
+
+int vdo_enqueue_clean_slab_waiter(struct block_allocator *allocator,
+ struct vdo_waiter *waiter);
+
+void vdo_modify_reference_count(struct vdo_completion *completion,
+ struct reference_updater *updater);
+
+int __must_check vdo_release_block_reference(struct block_allocator *allocator,
+ physical_block_number_t pbn);
+
+void vdo_notify_slab_journals_are_recovered(struct vdo_completion *completion);
+
+void vdo_dump_block_allocator(const struct block_allocator *allocator);
+
+int __must_check vdo_decode_slab_depot(struct slab_depot_state_2_0 state,
+ struct vdo *vdo,
+ struct partition *summary_partition,
+ struct slab_depot **depot_ptr);
+
+void vdo_free_slab_depot(struct slab_depot *depot);
+
+struct slab_depot_state_2_0 __must_check vdo_record_slab_depot(const struct slab_depot *depot);
+
+int __must_check vdo_allocate_reference_counters(struct slab_depot *depot);
+
+struct vdo_slab * __must_check vdo_get_slab(const struct slab_depot *depot,
+ physical_block_number_t pbn);
+
+u8 __must_check vdo_get_increment_limit(struct slab_depot *depot,
+ physical_block_number_t pbn);
+
+bool __must_check vdo_is_physical_data_block(const struct slab_depot *depot,
+ physical_block_number_t pbn);
+
+block_count_t __must_check vdo_get_slab_depot_allocated_blocks(const struct slab_depot *depot);
+
+block_count_t __must_check vdo_get_slab_depot_data_blocks(const struct slab_depot *depot);
+
+void vdo_get_slab_depot_statistics(const struct slab_depot *depot,
+ struct vdo_statistics *stats);
+
+void vdo_load_slab_depot(struct slab_depot *depot,
+ const struct admin_state_code *operation,
+ struct vdo_completion *parent, void *context);
+
+void vdo_prepare_slab_depot_to_allocate(struct slab_depot *depot,
+ enum slab_depot_load_type load_type,
+ struct vdo_completion *parent);
+
+void vdo_update_slab_depot_size(struct slab_depot *depot);
+
+int __must_check vdo_prepare_to_grow_slab_depot(struct slab_depot *depot,
+ const struct partition *partition);
+
+void vdo_use_new_slabs(struct slab_depot *depot, struct vdo_completion *parent);
+
+void vdo_abandon_new_slabs(struct slab_depot *depot);
+
+void vdo_drain_slab_depot(struct slab_depot *depot,
+ const struct admin_state_code *operation,
+ struct vdo_completion *parent);
+
+void vdo_resume_slab_depot(struct slab_depot *depot, struct vdo_completion *parent);
+
+void vdo_commit_oldest_slab_journal_tail_blocks(struct slab_depot *depot,
+ sequence_number_t recovery_block_number);
+
+void vdo_scrub_all_unrecovered_slabs(struct slab_depot *depot,
+ struct vdo_completion *parent);
+
+void vdo_dump_slab_depot(const struct slab_depot *depot);
+
+#endif /* VDO_SLAB_DEPOT_H */
diff --git a/drivers/md/dm-vdo/statistics.h b/drivers/md/dm-vdo/statistics.h
new file mode 100644
index 000000000000..c88a75dffba3
--- /dev/null
+++ b/drivers/md/dm-vdo/statistics.h
@@ -0,0 +1,278 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#ifndef STATISTICS_H
+#define STATISTICS_H
+
+#include "types.h"
+
+enum {
+ STATISTICS_VERSION = 36,
+};
+
+struct block_allocator_statistics {
+ /* The total number of slabs from which blocks may be allocated */
+ u64 slab_count;
+ /* The total number of slabs from which blocks have ever been allocated */
+ u64 slabs_opened;
+ /* The number of times since loading that a slab has been re-opened */
+ u64 slabs_reopened;
+};
+
+/*
+ * Counters for tracking the number of items written (blocks, requests, etc.)
+ * that keep track of totals at steps in the write pipeline. Three counters
+ * allow the number of buffered, in-memory items and the number of in-flight,
+ * unacknowledged writes to be derived, while still tracking totals for
+ * reporting purposes.
+ */
+struct commit_statistics {
+ /* The total number of items on which processing has started */
+ u64 started;
+ /* The total number of items for which a write operation has been issued */
+ u64 written;
+ /* The total number of items for which a write operation has completed */
+ u64 committed;
+};
+
+/** Counters for events in the recovery journal */
+struct recovery_journal_statistics {
+ /* Number of times the on-disk journal was full */
+ u64 disk_full;
+ /* Number of times the recovery journal requested slab journal commits. */
+ u64 slab_journal_commits_requested;
+ /* Write/Commit totals for individual journal entries */
+ struct commit_statistics entries;
+ /* Write/Commit totals for journal blocks */
+ struct commit_statistics blocks;
+};
+
+/** The statistics for the compressed block packer. */
+struct packer_statistics {
+ /* Number of compressed data items written since startup */
+ u64 compressed_fragments_written;
+ /* Number of blocks containing compressed items written since startup */
+ u64 compressed_blocks_written;
+ /* Number of VIOs that are pending in the packer */
+ u64 compressed_fragments_in_packer;
+};
+
+/** The statistics for the slab journals. */
+struct slab_journal_statistics {
+ /* Number of times the on-disk journal was full */
+ u64 disk_full_count;
+ /* Number of times an entry was added over the flush threshold */
+ u64 flush_count;
+ /* Number of times an entry was added over the block threshold */
+ u64 blocked_count;
+ /* Number of times a tail block was written */
+ u64 blocks_written;
+ /* Number of times we had to wait for the tail to write */
+ u64 tail_busy_count;
+};
+
+/** The statistics for the slab summary. */
+struct slab_summary_statistics {
+ /* Number of blocks written */
+ u64 blocks_written;
+};
+
+/** The statistics for the reference counts. */
+struct ref_counts_statistics {
+ /* Number of reference blocks written */
+ u64 blocks_written;
+};
+
+/** The statistics for the block map. */
+struct block_map_statistics {
+ /* number of dirty (resident) pages */
+ u32 dirty_pages;
+ /* number of clean (resident) pages */
+ u32 clean_pages;
+ /* number of free pages */
+ u32 free_pages;
+ /* number of pages in failed state */
+ u32 failed_pages;
+ /* number of pages incoming */
+ u32 incoming_pages;
+ /* number of pages outgoing */
+ u32 outgoing_pages;
+ /* how many times free page not avail */
+ u32 cache_pressure;
+ /* number of get_vdo_page() calls for read */
+ u64 read_count;
+ /* number of get_vdo_page() calls for write */
+ u64 write_count;
+ /* number of times pages failed to read */
+ u64 failed_reads;
+ /* number of times pages failed to write */
+ u64 failed_writes;
+ /* number of gets that are reclaimed */
+ u64 reclaimed;
+ /* number of gets for outgoing pages */
+ u64 read_outgoing;
+ /* number of gets that were already there */
+ u64 found_in_cache;
+ /* number of gets requiring discard */
+ u64 discard_required;
+ /* number of gets enqueued for their page */
+ u64 wait_for_page;
+ /* number of gets that have to fetch */
+ u64 fetch_required;
+ /* number of page fetches */
+ u64 pages_loaded;
+ /* number of page saves */
+ u64 pages_saved;
+ /* the number of flushes issued */
+ u64 flush_count;
+};
+
+/** The dedupe statistics from hash locks */
+struct hash_lock_statistics {
+ /* Number of times the UDS advice proved correct */
+ u64 dedupe_advice_valid;
+ /* Number of times the UDS advice proved incorrect */
+ u64 dedupe_advice_stale;
+ /* Number of writes with the same data as another in-flight write */
+ u64 concurrent_data_matches;
+ /* Number of writes whose hash collided with an in-flight write */
+ u64 concurrent_hash_collisions;
+ /* Current number of dedupe queries that are in flight */
+ u32 curr_dedupe_queries;
+};
+
+/** Counts of error conditions in VDO. */
+struct error_statistics {
+ /* number of times VDO got an invalid dedupe advice PBN from UDS */
+ u64 invalid_advice_pbn_count;
+ /* number of times a VIO completed with a VDO_NO_SPACE error */
+ u64 no_space_error_count;
+ /* number of times a VIO completed with a VDO_READ_ONLY error */
+ u64 read_only_error_count;
+};
+
+struct bio_stats {
+ /* Number of REQ_OP_READ bios */
+ u64 read;
+ /* Number of REQ_OP_WRITE bios with data */
+ u64 write;
+ /* Number of bios tagged with REQ_PREFLUSH and containing no data */
+ u64 empty_flush;
+ /* Number of REQ_OP_DISCARD bios */
+ u64 discard;
+ /* Number of bios tagged with REQ_PREFLUSH */
+ u64 flush;
+ /* Number of bios tagged with REQ_FUA */
+ u64 fua;
+};
+
+struct memory_usage {
+ /* Tracked bytes currently allocated. */
+ u64 bytes_used;
+ /* Maximum tracked bytes allocated. */
+ u64 peak_bytes_used;
+};
+
+/** UDS index statistics */
+struct index_statistics {
+ /* Number of records stored in the index */
+ u64 entries_indexed;
+ /* Number of post calls that found an existing entry */
+ u64 posts_found;
+ /* Number of post calls that added a new entry */
+ u64 posts_not_found;
+ /* Number of query calls that found an existing entry */
+ u64 queries_found;
+ /* Number of query calls that did not find an existing entry */
+ u64 queries_not_found;
+ /* Number of update calls that found an existing entry */
+ u64 updates_found;
+ /* Number of update calls that added a new entry */
+ u64 updates_not_found;
+ /* Number of entries discarded */
+ u64 entries_discarded;
+};
+
+/** The statistics of the vdo service. */
+struct vdo_statistics {
+ u32 version;
+ /* Number of blocks used for data */
+ u64 data_blocks_used;
+ /* Number of blocks used for VDO metadata */
+ u64 overhead_blocks_used;
+ /* Number of logical blocks that are currently mapped to physical blocks */
+ u64 logical_blocks_used;
+ /* number of physical blocks */
+ block_count_t physical_blocks;
+ /* number of logical blocks */
+ block_count_t logical_blocks;
+ /* Size of the block map page cache, in bytes */
+ u64 block_map_cache_size;
+ /* The physical block size */
+ u64 block_size;
+ /* Number of times the VDO has successfully recovered */
+ u64 complete_recoveries;
+ /* Number of times the VDO has recovered from read-only mode */
+ u64 read_only_recoveries;
+ /* String describing the operating mode of the VDO */
+ char mode[15];
+ /* Whether the VDO is in recovery mode */
+ bool in_recovery_mode;
+ /* What percentage of recovery mode work has been completed */
+ u8 recovery_percentage;
+ /* The statistics for the compressed block packer */
+ struct packer_statistics packer;
+ /* Counters for events in the block allocator */
+ struct block_allocator_statistics allocator;
+ /* Counters for events in the recovery journal */
+ struct recovery_journal_statistics journal;
+ /* The statistics for the slab journals */
+ struct slab_journal_statistics slab_journal;
+ /* The statistics for the slab summary */
+ struct slab_summary_statistics slab_summary;
+ /* The statistics for the reference counts */
+ struct ref_counts_statistics ref_counts;
+ /* The statistics for the block map */
+ struct block_map_statistics block_map;
+ /* The dedupe statistics from hash locks */
+ struct hash_lock_statistics hash_lock;
+ /* Counts of error conditions */
+ struct error_statistics errors;
+ /* The VDO instance */
+ u32 instance;
+ /* Current number of active VIOs */
+ u32 current_vios_in_progress;
+ /* Maximum number of active VIOs */
+ u32 max_vios;
+ /* Number of times the UDS index was too slow in responding */
+ u64 dedupe_advice_timeouts;
+ /* Number of flush requests submitted to the storage device */
+ u64 flush_out;
+ /* Logical block size */
+ u64 logical_block_size;
+ /* Bios submitted into VDO from above */
+ struct bio_stats bios_in;
+ struct bio_stats bios_in_partial;
+ /* Bios submitted onward for user data */
+ struct bio_stats bios_out;
+ /* Bios submitted onward for metadata */
+ struct bio_stats bios_meta;
+ struct bio_stats bios_journal;
+ struct bio_stats bios_page_cache;
+ struct bio_stats bios_out_completed;
+ struct bio_stats bios_meta_completed;
+ struct bio_stats bios_journal_completed;
+ struct bio_stats bios_page_cache_completed;
+ struct bio_stats bios_acknowledged;
+ struct bio_stats bios_acknowledged_partial;
+ /* Current number of bios in progress */
+ struct bio_stats bios_in_progress;
+ /* Memory usage stats. */
+ struct memory_usage memory_usage;
+ /* The statistics for the UDS index */
+ struct index_statistics index;
+};
+
+#endif /* not STATISTICS_H */
diff --git a/drivers/md/dm-vdo/status-codes.c b/drivers/md/dm-vdo/status-codes.c
new file mode 100644
index 000000000000..d3493450b169
--- /dev/null
+++ b/drivers/md/dm-vdo/status-codes.c
@@ -0,0 +1,94 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#include "status-codes.h"
+
+#include "errors.h"
+#include "logger.h"
+#include "permassert.h"
+#include "thread-utils.h"
+
+/*
+ * The name and message for each VDO status code. There is one entry per code, listed in the same
+ * order as enum vdo_status_codes starting from VDO_STATUS_CODE_BASE; vdo_register_status_codes()
+ * checks at build time that the number of entries matches the number of codes, so a new code needs
+ * a new entry here in the matching position.
+ */
+const struct error_info vdo_status_list[] = {
+ { "VDO_NOT_IMPLEMENTED", "Not implemented" },
+ { "VDO_OUT_OF_RANGE", "Out of range" },
+ { "VDO_REF_COUNT_INVALID", "Reference count would become invalid" },
+ { "VDO_NO_SPACE", "Out of space" },
+ { "VDO_BAD_CONFIGURATION", "Bad configuration option" },
+ { "VDO_COMPONENT_BUSY", "Prior operation still in progress" },
+ { "VDO_BAD_PAGE", "Corrupt or incorrect page" },
+ { "VDO_UNSUPPORTED_VERSION", "Unsupported component version" },
+ { "VDO_INCORRECT_COMPONENT", "Component id mismatch in decoder" },
+ { "VDO_PARAMETER_MISMATCH", "Parameters have conflicting values" },
+ { "VDO_UNKNOWN_PARTITION", "No partition exists with a given id" },
+ { "VDO_PARTITION_EXISTS", "A partition already exists with a given id" },
+ { "VDO_INCREMENT_TOO_SMALL", "Physical block growth of too few blocks" },
+ { "VDO_CHECKSUM_MISMATCH", "Incorrect checksum" },
+ { "VDO_LOCK_ERROR", "A lock is held incorrectly" },
+ { "VDO_READ_ONLY", "The device is in read-only mode" },
+ { "VDO_SHUTTING_DOWN", "The device is shutting down" },
+ { "VDO_CORRUPT_JOURNAL", "Recovery journal entries corrupted" },
+ { "VDO_TOO_MANY_SLABS", "Exceeds maximum number of slabs supported" },
+ { "VDO_INVALID_FRAGMENT", "Compressed block fragment is invalid" },
+ { "VDO_RETRY_AFTER_REBUILD", "Retry operation after rebuilding finishes" },
+ { "VDO_BAD_MAPPING", "Invalid page mapping" },
+ { "VDO_BIO_CREATION_FAILED", "Bio creation failed" },
+ { "VDO_BAD_MAGIC", "Bad magic number" },
+ { "VDO_BAD_NONCE", "Bad nonce" },
+ { "VDO_JOURNAL_OVERFLOW", "Journal sequence number overflow" },
+ { "VDO_INVALID_ADMIN_STATE", "Invalid operation for current state" },
+};
+
+/**
+ * vdo_register_status_codes() - Register the VDO status code names and messages.
+ *
+ * Registers vdo_status_list as the "VDO Status" error block covering VDO_STATUS_CODE_BASE up to
+ * VDO_STATUS_CODE_BLOCK_END. A build-time check ensures the table has exactly one entry for each
+ * defined status code.
+ *
+ * Return: VDO_SUCCESS, or the error code from uds_register_error_block().
+ */
+int vdo_register_status_codes(void)
+{
+	int result;
+
+	BUILD_BUG_ON(ARRAY_SIZE(vdo_status_list) !=
+		     (VDO_STATUS_CODE_LAST - VDO_STATUS_CODE_BASE));
+
+	result = uds_register_error_block("VDO Status", VDO_STATUS_CODE_BASE,
+					  VDO_STATUS_CODE_BLOCK_END, vdo_status_list,
+					  sizeof(vdo_status_list));
+	if (result != UDS_SUCCESS)
+		return result;
+
+	return VDO_SUCCESS;
+}
+
+/**
+ * vdo_status_to_errno() - Convert a status code into a value that can be returned to the OS.
+ * @error: The error code to convert.
+ *
+ * Zero and negative values are taken to be success or a system error code and are returned
+ * unchanged. Positive values below 1024 are treated as errno macro values and are negated.
+ * Anything else is a UDS or VDO status code: VDO_NO_SPACE becomes -ENOSPC and every other code
+ * becomes -EIO, with an informational log message for all codes except VDO_READ_ONLY.
+ *
+ * Return: A system error code value.
+ */
+int vdo_status_to_errno(int error)
+{
+	char error_name[VDO_MAX_ERROR_NAME_SIZE];
+	char error_message[VDO_MAX_ERROR_MESSAGE_SIZE];
+
+	/* Success, or a system error code which is already negative. */
+	if (likely(error <= 0))
+		return error;
+
+	/* A positive errno value used by our own code. */
+	if (error < 1024)
+		return -error;
+
+	/* Whatever remains is a VDO or UDS status code. */
+	if (error == VDO_NO_SPACE)
+		return -ENOSPC;
+
+	if (error != VDO_READ_ONLY) {
+		vdo_log_info("%s: mapping internal status code %d (%s: %s) to EIO",
+			     __func__, error,
+			     uds_string_error_name(error, error_name, sizeof(error_name)),
+			     uds_string_error(error, error_message, sizeof(error_message)));
+	}
+
+	return -EIO;
+}
diff --git a/drivers/md/dm-vdo/status-codes.h b/drivers/md/dm-vdo/status-codes.h
new file mode 100644
index 000000000000..72da04159f88
--- /dev/null
+++ b/drivers/md/dm-vdo/status-codes.h
@@ -0,0 +1,86 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#ifndef VDO_STATUS_CODES_H
+#define VDO_STATUS_CODES_H
+
+#include "errors.h"
+
+enum {
+ UDS_ERRORS_BLOCK_SIZE = UDS_ERROR_CODE_BLOCK_END - UDS_ERROR_CODE_BASE,
+ VDO_ERRORS_BLOCK_START = UDS_ERROR_CODE_BLOCK_END,
+ VDO_ERRORS_BLOCK_END = VDO_ERRORS_BLOCK_START + UDS_ERRORS_BLOCK_SIZE,
+};
+
+/* VDO-specific status codes. */
+enum vdo_status_codes {
+ /* base of all VDO errors */
+ VDO_STATUS_CODE_BASE = VDO_ERRORS_BLOCK_START,
+ /* we haven't written this yet */
+ VDO_NOT_IMPLEMENTED = VDO_STATUS_CODE_BASE,
+ /* input out of range */
+ VDO_OUT_OF_RANGE,
+ /* an invalid reference count would result */
+ VDO_REF_COUNT_INVALID,
+ /* a free block could not be allocated */
+ VDO_NO_SPACE,
+ /* improper or missing configuration option */
+ VDO_BAD_CONFIGURATION,
+ /* prior operation still in progress */
+ VDO_COMPONENT_BUSY,
+ /* page contents incorrect or corrupt data */
+ VDO_BAD_PAGE,
+ /* unsupported version of some component */
+ VDO_UNSUPPORTED_VERSION,
+ /* component id mismatch in decoder */
+ VDO_INCORRECT_COMPONENT,
+ /* parameters have conflicting values */
+ VDO_PARAMETER_MISMATCH,
+ /* no partition exists with a given id */
+ VDO_UNKNOWN_PARTITION,
+ /* a partition already exists with a given id */
+ VDO_PARTITION_EXISTS,
+ /* physical block growth of too few blocks */
+ VDO_INCREMENT_TOO_SMALL,
+ /* incorrect checksum */
+ VDO_CHECKSUM_MISMATCH,
+ /* a lock is held incorrectly */
+ VDO_LOCK_ERROR,
+ /* the VDO is in read-only mode */
+ VDO_READ_ONLY,
+ /* the VDO is shutting down */
+ VDO_SHUTTING_DOWN,
+ /* the recovery journal has corrupt entries */
+ VDO_CORRUPT_JOURNAL,
+ /* exceeds maximum number of slabs supported */
+ VDO_TOO_MANY_SLABS,
+ /* a compressed block fragment is invalid */
+ VDO_INVALID_FRAGMENT,
+ /* action is unsupported while rebuilding */
+ VDO_RETRY_AFTER_REBUILD,
+ /* a block map entry is invalid */
+ VDO_BAD_MAPPING,
+ /* bio_add_page failed */
+ VDO_BIO_CREATION_FAILED,
+ /* bad magic number */
+ VDO_BAD_MAGIC,
+ /* bad nonce */
+ VDO_BAD_NONCE,
+ /* sequence number overflow */
+ VDO_JOURNAL_OVERFLOW,
+ /* the VDO is not in a state to perform an admin operation */
+ VDO_INVALID_ADMIN_STATE,
+ /* one more than last error code */
+ VDO_STATUS_CODE_LAST,
+ VDO_STATUS_CODE_BLOCK_END = VDO_ERRORS_BLOCK_END
+};
+
+extern const struct error_info vdo_status_list[];
+
+int vdo_register_status_codes(void);
+
+int vdo_status_to_errno(int error);
+
+#endif /* VDO_STATUS_CODES_H */
diff --git a/drivers/md/dm-vdo/string-utils.c b/drivers/md/dm-vdo/string-utils.c
new file mode 100644
index 000000000000..71e44b4683ea
--- /dev/null
+++ b/drivers/md/dm-vdo/string-utils.c
@@ -0,0 +1,22 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#include "string-utils.h"
+
+/*
+ * Format a string into the space between buffer and buf_end.
+ *
+ * Returns a pointer just past the characters written, which is where the next append should
+ * start, or buf_end if the formatted text did not fit in the available space.
+ */
+char *vdo_append_to_buffer(char *buffer, char *buf_end, const char *fmt, ...)
+{
+	size_t space = buf_end - buffer;
+	size_t needed;
+	va_list ap;
+
+	va_start(ap, fmt);
+	needed = vsnprintf(buffer, space, fmt, ap);
+	va_end(ap);
+
+	return (needed < space) ? (buffer + needed) : buf_end;
+}
diff --git a/drivers/md/dm-vdo/string-utils.h b/drivers/md/dm-vdo/string-utils.h
new file mode 100644
index 000000000000..96eecd38b1c2
--- /dev/null
+++ b/drivers/md/dm-vdo/string-utils.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#ifndef VDO_STRING_UTILS_H
+#define VDO_STRING_UTILS_H
+
+#include <linux/kernel.h>
+#include <linux/string.h>
+
+/* Utilities related to string manipulation */
+
+/* Return the string "true" or "false" for a boolean value. */
+static inline const char *vdo_bool_to_string(bool value)
+{
+	if (value)
+		return "true";
+
+	return "false";
+}
+
+/* Append a formatted string to the end of a buffer. */
+char *vdo_append_to_buffer(char *buffer, char *buf_end, const char *fmt, ...)
+ __printf(3, 4);
+
+#endif /* VDO_STRING_UTILS_H */
diff --git a/drivers/md/dm-vdo/thread-device.c b/drivers/md/dm-vdo/thread-device.c
new file mode 100644
index 000000000000..df13ca914db8
--- /dev/null
+++ b/drivers/md/dm-vdo/thread-device.c
@@ -0,0 +1,34 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#include "thread-device.h"
+
+/* A registry of threads associated with device id numbers. */
+static struct thread_registry device_id_thread_registry;
+
+/*
+ * Associate the calling thread with the device id stored at *id_ptr.
+ *
+ * The registry stores id_ptr itself rather than a copy of the value, and links new_thread into
+ * its list, so both must remain valid until the thread calls vdo_unregister_thread_device_id().
+ *
+ * Any registered thread must be unregistered.
+ */
+void vdo_register_thread_device_id(struct registered_thread *new_thread,
+ unsigned int *id_ptr)
+{
+ vdo_register_thread(&device_id_thread_registry, new_thread, id_ptr);
+}
+
+/* Remove the calling thread's entry from the device id registry. */
+void vdo_unregister_thread_device_id(void)
+{
+ vdo_unregister_thread(&device_id_thread_registry);
+}
+
+/*
+ * Look up the device id registered for the calling thread.
+ *
+ * Returns the value stored at the registered id pointer, or -1 if the calling thread has no entry
+ * in the device id registry.
+ */
+int vdo_get_thread_device_id(void)
+{
+	const unsigned int *id_ptr = vdo_lookup_thread(&device_id_thread_registry);
+
+	if (id_ptr == NULL)
+		return -1;
+
+	return *id_ptr;
+}
+
+/* Initialize the registry which associates threads with device ids. */
+void vdo_initialize_thread_device_registry(void)
+{
+ vdo_initialize_thread_registry(&device_id_thread_registry);
+}
diff --git a/drivers/md/dm-vdo/thread-device.h b/drivers/md/dm-vdo/thread-device.h
new file mode 100644
index 000000000000..494d9c9ef3f6
--- /dev/null
+++ b/drivers/md/dm-vdo/thread-device.h
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#ifndef VDO_THREAD_DEVICE_H
+#define VDO_THREAD_DEVICE_H
+
+#include "thread-registry.h"
+
+void vdo_register_thread_device_id(struct registered_thread *new_thread,
+ unsigned int *id_ptr);
+
+void vdo_unregister_thread_device_id(void);
+
+int vdo_get_thread_device_id(void);
+
+void vdo_initialize_thread_device_registry(void);
+
+#endif /* VDO_THREAD_DEVICE_H */
diff --git a/drivers/md/dm-vdo/thread-registry.c b/drivers/md/dm-vdo/thread-registry.c
new file mode 100644
index 000000000000..d4a077d58c60
--- /dev/null
+++ b/drivers/md/dm-vdo/thread-registry.c
@@ -0,0 +1,93 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#include "thread-registry.h"
+
+#include <asm/current.h>
+#include <linux/rculist.h>
+
+#include "permassert.h"
+
+/*
+ * We need to be careful when using other facilities that may use thread registry functions in
+ * their normal operation. For example, we do not want to invoke the logger while holding a lock.
+ */
+
+/* Set up an empty registry by initializing its list head and its spinlock. */
+void vdo_initialize_thread_registry(struct thread_registry *registry)
+{
+ INIT_LIST_HEAD(&registry->links);
+ spin_lock_init(&registry->lock);
+}
+
+/*
+ * Register the current thread and associate it with a data pointer.
+ *
+ * new_thread is linked into the registry directly and only the value of pointer is stored, so
+ * both the entry and whatever pointer refers to must remain valid until the same thread calls
+ * vdo_unregister_thread(). If the current task already has an entry, that stale entry is unlinked
+ * and replaced by new_thread, and the condition is flagged by the assertion below.
+ */
+void vdo_register_thread(struct thread_registry *registry,
+ struct registered_thread *new_thread, const void *pointer)
+{
+ struct registered_thread *thread;
+ bool found_it = false;
+
+ /* Fill in the new entry completely before it becomes visible to lookups. */
+ INIT_LIST_HEAD(&new_thread->links);
+ new_thread->pointer = pointer;
+ new_thread->task = current;
+
+ spin_lock(&registry->lock);
+ list_for_each_entry(thread, &registry->links, links) {
+ if (thread->task == current) {
+ /* There should be no existing entry. */
+ list_del_rcu(&thread->links);
+ found_it = true;
+ break;
+ }
+ }
+ list_add_tail_rcu(&new_thread->links, &registry->links);
+ spin_unlock(&registry->lock);
+
+ /* The assertion waits until the lock is dropped; see the note at the top of this file. */
+ VDO_ASSERT_LOG_ONLY(!found_it, "new thread not already in registry");
+ if (found_it) {
+ /* Ensure no RCU iterators see it before re-initializing. */
+ synchronize_rcu();
+ INIT_LIST_HEAD(&thread->links);
+ }
+}
+
+/*
+ * Remove the calling thread's entry from the registry.
+ *
+ * The entry is unlinked under the registry lock, but its list linkage is re-initialized only
+ * after synchronize_rcu() returns, so the registered_thread storage should not be reused until
+ * this function has returned. If the calling thread has no entry, the assertion below flags it
+ * and nothing else is done.
+ */
+void vdo_unregister_thread(struct thread_registry *registry)
+{
+ struct registered_thread *thread;
+ bool found_it = false;
+
+ spin_lock(&registry->lock);
+ list_for_each_entry(thread, &registry->links, links) {
+ if (thread->task == current) {
+ list_del_rcu(&thread->links);
+ found_it = true;
+ break;
+ }
+ }
+ spin_unlock(&registry->lock);
+
+ /* The assertion waits until the lock is dropped; see the note at the top of this file. */
+ VDO_ASSERT_LOG_ONLY(found_it, "thread found in registry");
+ if (found_it) {
+ /* Ensure no RCU iterators see it before re-initializing. */
+ synchronize_rcu();
+ INIT_LIST_HEAD(&thread->links);
+ }
+}
+
+/*
+ * Find the data pointer which the calling thread registered.
+ *
+ * The list is walked inside an RCU read-side critical section rather than under the registry
+ * spinlock. Returns NULL if the calling thread has no entry.
+ */
+const void *vdo_lookup_thread(struct thread_registry *registry)
+{
+	struct registered_thread *entry;
+	const void *pointer = NULL;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(entry, &registry->links, links) {
+		if (entry->task != current)
+			continue;
+
+		pointer = entry->pointer;
+		break;
+	}
+	rcu_read_unlock();
+
+	return pointer;
+}
diff --git a/drivers/md/dm-vdo/thread-registry.h b/drivers/md/dm-vdo/thread-registry.h
new file mode 100644
index 000000000000..cc6d78312b9e
--- /dev/null
+++ b/drivers/md/dm-vdo/thread-registry.h
@@ -0,0 +1,32 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#ifndef VDO_THREAD_REGISTRY_H
+#define VDO_THREAD_REGISTRY_H
+
+#include <linux/list.h>
+#include <linux/spinlock.h>
+
+/* A set of registered threads, each associated with a caller-supplied pointer. */
+struct thread_registry {
+ /* The list of registered_thread entries */
+ struct list_head links;
+ /* Held while entries are added to or removed from the list */
+ spinlock_t lock;
+};
+
+/* One registry entry. The storage is supplied by the caller of vdo_register_thread(). */
+struct registered_thread {
+ /* Linkage in thread_registry.links */
+ struct list_head links;
+ /* The data pointer which vdo_lookup_thread() returns for this thread */
+ const void *pointer;
+ /* The task which registered this entry */
+ struct task_struct *task;
+};
+
+void vdo_initialize_thread_registry(struct thread_registry *registry);
+
+void vdo_register_thread(struct thread_registry *registry,
+ struct registered_thread *new_thread, const void *pointer);
+
+void vdo_unregister_thread(struct thread_registry *registry);
+
+const void *vdo_lookup_thread(struct thread_registry *registry);
+
+#endif /* VDO_THREAD_REGISTRY_H */
diff --git a/drivers/md/dm-vdo/thread-utils.c b/drivers/md/dm-vdo/thread-utils.c
new file mode 100644
index 000000000000..ec08478dd013
--- /dev/null
+++ b/drivers/md/dm-vdo/thread-utils.c
@@ -0,0 +1,108 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#include "thread-utils.h"
+
+#include <asm/current.h>
+#include <linux/delay.h>
+#include <linux/kthread.h>
+#include <linux/mutex.h>
+#include <linux/types.h>
+
+#include "errors.h"
+#include "logger.h"
+#include "memory-alloc.h"
+
+static struct hlist_head thread_list;
+static struct mutex thread_mutex;
+
+/* Bookkeeping for a thread started by vdo_create_thread(). */
+struct thread {
+ /* The function the new thread runs, and the argument passed to it */
+ void (*thread_function)(void *thread_data);
+ void *thread_data;
+ /* Linkage in thread_list; added and removed while holding thread_mutex */
+ struct hlist_node thread_links;
+ /* The task running the thread, recorded by the thread itself when it starts */
+ struct task_struct *thread_task;
+ /* Completed once thread_function returns; vdo_join_threads() waits on it */
+ struct completion thread_done;
+};
+
+/* Initialize thread_mutex, the mutex held while thread_list is modified. */
+void vdo_initialize_threads_mutex(void)
+{
+ mutex_init(&thread_mutex);
+}
+
+/*
+ * The entry point handed to kthread_run(). Records the new task, adds the thread to thread_list,
+ * and runs the caller's function bracketed by allocating-thread registration. Signals thread_done
+ * once the function has returned so that vdo_join_threads() can proceed.
+ */
+static int thread_starter(void *arg)
+{
+	struct thread *self = arg;
+	struct registered_thread registration;
+
+	self->thread_task = current;
+
+	mutex_lock(&thread_mutex);
+	hlist_add_head(&self->thread_links, &thread_list);
+	mutex_unlock(&thread_mutex);
+
+	vdo_register_allocating_thread(&registration, NULL);
+	self->thread_function(self->thread_data);
+	vdo_unregister_allocating_thread();
+
+	/* Do not touch *self after this point; vdo_join_threads() frees it. */
+	complete(&self->thread_done);
+	return 0;
+}
+
+/*
+ * Allocate a thread descriptor and start a kernel thread running thread_function(thread_data).
+ *
+ * On success, the descriptor is stored in *new_thread and VDO_SUCCESS is returned; the descriptor
+ * is released by vdo_join_threads(). On failure, the result is either the allocation error or the
+ * error encoded in the pointer returned by kthread_run(), and *new_thread is not modified.
+ */
+int vdo_create_thread(void (*thread_function)(void *), void *thread_data,
+		      const char *name, struct thread **new_thread)
+{
+	char *name_colon = strchr(name, ':');
+	char *parent_colon = strchr(current->comm, ':');
+	struct thread *thread;
+	struct task_struct *task;
+	int result;
+
+	result = vdo_allocate(1, struct thread, __func__, &thread);
+	if (result != VDO_SUCCESS) {
+		vdo_log_warning("Error allocating memory for %s", name);
+		return result;
+	}
+
+	init_completion(&thread->thread_done);
+	thread->thread_data = thread_data;
+	thread->thread_function = thread_function;
+
+	/*
+	 * Choose the name of the new thread.
+	 *
+	 * A supplied name which already contains a colon is used as given, which is how uds module
+	 * threads get names like "uds:callbackW" and the main test runner thread gets the name
+	 * "zub:runtest".
+	 *
+	 * A supplied name without a colon inherits the prefix of the creating thread when that
+	 * thread's own name contains a colon: everything before the colon is copied and the
+	 * supplied name is appended after a colon. Thus the threads of an index opened by the
+	 * "kvdo0:dedupeQ" thread get names like "kvdo0:foo".
+	 *
+	 * When neither name contains a colon, the supplied name is used alone. This should be a
+	 * rare occurrence.
+	 */
+	if ((name_colon != NULL) || (parent_colon == NULL)) {
+		task = kthread_run(thread_starter, thread, "%s", name);
+	} else {
+		int prefix_length = parent_colon - current->comm;
+
+		task = kthread_run(thread_starter, thread, "%.*s:%s",
+				   prefix_length, current->comm, name);
+	}
+
+	if (IS_ERR(task)) {
+		vdo_free(thread);
+		return PTR_ERR(task);
+	}
+
+	*new_thread = thread;
+	return VDO_SUCCESS;
+}
+
+/*
+ * Wait for a thread created by vdo_create_thread() to finish running its function, then remove
+ * it from thread_list and free its descriptor. The pointer must not be used afterwards.
+ */
+void vdo_join_threads(struct thread *thread)
+{
+	/* A non-zero result means the wait was cut short; pause briefly and wait again. */
+	for (;;) {
+		if (wait_for_completion_interruptible(&thread->thread_done) == 0)
+			break;
+
+		fsleep(1000);
+	}
+
+	mutex_lock(&thread_mutex);
+	hlist_del(&thread->thread_links);
+	mutex_unlock(&thread_mutex);
+
+	vdo_free(thread);
+}
diff --git a/drivers/md/dm-vdo/thread-utils.h b/drivers/md/dm-vdo/thread-utils.h
new file mode 100644
index 000000000000..687ab43e2cee
--- /dev/null
+++ b/drivers/md/dm-vdo/thread-utils.h
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#ifndef THREAD_UTILS_H
+#define THREAD_UTILS_H
+
+#include <linux/atomic.h>
+
+/* Thread and synchronization utilities */
+
+struct thread;
+
+void vdo_initialize_threads_mutex(void);
+int __must_check vdo_create_thread(void (*thread_function)(void *), void *thread_data,
+ const char *name, struct thread **new_thread);
+void vdo_join_threads(struct thread *thread);
+
+#endif /* THREAD_UTILS_H */
diff --git a/drivers/md/dm-vdo/time-utils.h b/drivers/md/dm-vdo/time-utils.h
new file mode 100644
index 000000000000..5f1e850fd826
--- /dev/null
+++ b/drivers/md/dm-vdo/time-utils.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#ifndef UDS_TIME_UTILS_H
+#define UDS_TIME_UTILS_H
+
+#include <linux/ktime.h>
+#include <linux/time.h>
+#include <linux/types.h>
+
+/* Convert a ktime_t value to seconds by dividing it by NSEC_PER_SEC (integer division). */
+static inline s64 ktime_to_seconds(ktime_t reltime)
+{
+	s64 seconds = reltime / NSEC_PER_SEC;
+
+	return seconds;
+}
+
+/*
+ * Read the current time in nanoseconds. CLOCK_MONOTONIC selects ktime_get_ns(); every other clock
+ * id selects ktime_get_real_ns().
+ */
+static inline ktime_t current_time_ns(clockid_t clock)
+{
+	if (clock == CLOCK_MONOTONIC)
+		return ktime_get_ns();
+
+	return ktime_get_real_ns();
+}
+
+/* Read the CLOCK_REALTIME time and scale it from nanoseconds down to microseconds. */
+static inline ktime_t current_time_us(void)
+{
+	ktime_t now_ns = current_time_ns(CLOCK_REALTIME);
+
+	return now_ns / NSEC_PER_USEC;
+}
+
+#endif /* UDS_TIME_UTILS_H */
diff --git a/drivers/md/dm-vdo/types.h b/drivers/md/dm-vdo/types.h
new file mode 100644
index 000000000000..dbe892b10f26
--- /dev/null
+++ b/drivers/md/dm-vdo/types.h
@@ -0,0 +1,393 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#ifndef VDO_TYPES_H
+#define VDO_TYPES_H
+
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/device-mapper.h>
+#include <linux/list.h>
+#include <linux/compiler_attributes.h>
+#include <linux/types.h>
+
+#include "funnel-queue.h"
+
+/* A size type in blocks. */
+typedef u64 block_count_t;
+
+/* The size of a block. */
+typedef u16 block_size_t;
+
+/* A counter for data_vios */
+typedef u16 data_vio_count_t;
+
+/* A height within a tree. */
+typedef u8 height_t;
+
+/* The logical block number as used by the consumer. */
+typedef u64 logical_block_number_t;
+
+/* The type of the nonce used to identify instances of VDO. */
+typedef u64 nonce_t;
+
+/* A size in pages. */
+typedef u32 page_count_t;
+
+/* A page number. */
+typedef u32 page_number_t;
+
+/*
+ * The physical (well, less logical) block number at which the block is found on the underlying
+ * device.
+ */
+typedef u64 physical_block_number_t;
+
+/* A count of tree roots. */
+typedef u8 root_count_t;
+
+/* A number of sectors. */
+typedef u8 sector_count_t;
+
+/* A sequence number. */
+typedef u64 sequence_number_t;
+
+/* The offset of a block within a slab. */
+typedef u32 slab_block_number;
+
+/* A size type in slabs. */
+typedef u16 slab_count_t;
+
+/* A slot in a bin or block map page. */
+typedef u16 slot_number_t;
+
+/* typedef thread_count_t - A thread counter. */
+typedef u8 thread_count_t;
+
+/* typedef thread_id_t - A thread ID, vdo threads are numbered sequentially from 0. */
+typedef u8 thread_id_t;
+
+/* A zone counter */
+typedef u8 zone_count_t;
+
+/* The following enums are persisted on storage, so the values must be preserved. */
+
+/* The current operating mode of the VDO. */
+enum vdo_state {
+ VDO_DIRTY = 0,
+ VDO_NEW = 1,
+ VDO_CLEAN = 2,
+ VDO_READ_ONLY_MODE = 3,
+ VDO_FORCE_REBUILD = 4,
+ VDO_RECOVERING = 5,
+ VDO_REPLAYING = 6, /* VDO_REPLAYING is never set anymore, but retained for upgrade */
+ VDO_REBUILD_FOR_UPGRADE = 7,
+
+ /* Keep VDO_STATE_COUNT at the bottom. */
+ VDO_STATE_COUNT
+};
+
+/**
+ * vdo_state_requires_read_only_rebuild() - Check whether a vdo_state indicates
+ * that a read-only rebuild is required.
+ * @state: The vdo_state to check.
+ *
+ * Return: true if the state indicates a rebuild is required
+ */
+static inline bool __must_check vdo_state_requires_read_only_rebuild(enum vdo_state state)
+{
+ return ((state == VDO_FORCE_REBUILD) || (state == VDO_REBUILD_FOR_UPGRADE));
+}
+
+/**
+ * vdo_state_requires_recovery() - Check whether a vdo state indicates that recovery is needed.
+ * @state: The state to check.
+ *
+ * Return: true if the state indicates a recovery is required
+ */
+static inline bool __must_check vdo_state_requires_recovery(enum vdo_state state)
+{
+ return ((state == VDO_DIRTY) || (state == VDO_REPLAYING) || (state == VDO_RECOVERING));
+}
+
+/*
+ * The current operation on a physical block (from the point of view of the recovery journal, slab
+ * journals, and reference counts).
+ */
+enum journal_operation {
+ VDO_JOURNAL_DATA_REMAPPING = 0,
+ VDO_JOURNAL_BLOCK_MAP_REMAPPING = 1,
+} __packed;
+
+/* Partition IDs encoded in the volume layout in the super block. */
+enum partition_id {
+ VDO_BLOCK_MAP_PARTITION = 0,
+ VDO_SLAB_DEPOT_PARTITION = 1,
+ VDO_RECOVERY_JOURNAL_PARTITION = 2,
+ VDO_SLAB_SUMMARY_PARTITION = 3,
+} __packed;
+
+/* Metadata types for the vdo. */
+enum vdo_metadata_type {
+ VDO_METADATA_RECOVERY_JOURNAL = 1,
+ VDO_METADATA_SLAB_JOURNAL = 2,
+ VDO_METADATA_RECOVERY_JOURNAL_2 = 3,
+} __packed;
+
+/* A position in the block map where a block map entry is stored. */
+struct block_map_slot {
+ physical_block_number_t pbn;
+ slot_number_t slot;
+};
+
+/*
+ * Four bits of each five-byte block map entry contain a mapping state value used to distinguish
+ * unmapped or discarded logical blocks (which are treated as mapped to the zero block) from entries
+ * that have been mapped to a physical block, including the zero block.
+ *
+ * FIXME: these should maybe be defines.
+ */
+enum block_mapping_state {
+ VDO_MAPPING_STATE_UNMAPPED = 0, /* Must be zero to be the default value */
+ VDO_MAPPING_STATE_UNCOMPRESSED = 1, /* A normal (uncompressed) block */
+ VDO_MAPPING_STATE_COMPRESSED_BASE = 2, /* Compressed in slot 0 */
+ VDO_MAPPING_STATE_COMPRESSED_MAX = 15, /* Compressed in slot 13 */
+};
+
+enum {
+ VDO_MAX_COMPRESSION_SLOTS =
+ (VDO_MAPPING_STATE_COMPRESSED_MAX - VDO_MAPPING_STATE_COMPRESSED_BASE + 1),
+};
+
+
+struct data_location {
+ physical_block_number_t pbn;
+ enum block_mapping_state state;
+};
+
+/* The configuration of a single slab derived from the configured block size and slab size. */
+struct slab_config {
+ /* total number of blocks in the slab */
+ block_count_t slab_blocks;
+ /* number of blocks available for data */
+ block_count_t data_blocks;
+ /* number of blocks for reference counts */
+ block_count_t reference_count_blocks;
+ /* number of blocks for the slab journal */
+ block_count_t slab_journal_blocks;
+ /*
+ * Number of blocks after which the slab journal starts pushing out a reference_block for
+ * each new entry it receives.
+ */
+ block_count_t slab_journal_flushing_threshold;
+ /*
+ * Number of blocks after which the slab journal pushes out all reference_blocks and makes
+ * all vios wait.
+ */
+ block_count_t slab_journal_blocking_threshold;
+ /* Number of blocks after which the slab must be scrubbed before coming online. */
+ block_count_t slab_journal_scrubbing_threshold;
+} __packed;
+
+/*
+ * This structure is memcmp'd for equality. Keep it packed and don't add any fields that are not
+ * properly set in both extant and parsed configs.
+ */
+struct thread_count_config {
+ unsigned int bio_ack_threads;
+ unsigned int bio_threads;
+ unsigned int bio_rotation_interval;
+ unsigned int cpu_threads;
+ unsigned int logical_zones;
+ unsigned int physical_zones;
+ unsigned int hash_zones;
+} __packed;
+
+struct device_config {
+ struct dm_target *owning_target;
+ struct dm_dev *owned_device;
+ struct vdo *vdo;
+ /* All configs referencing a layer are kept on a list in the layer */
+ struct list_head config_list;
+ char *original_string;
+ unsigned int version;
+ char *parent_device_name;
+ block_count_t physical_blocks;
+ /*
+ * This is the number of logical blocks from VDO's internal point of view. It is the number
+ * of 4K blocks regardless of the value of the logical_block_size parameter below.
+ */
+ block_count_t logical_blocks;
+ unsigned int logical_block_size;
+ unsigned int cache_size;
+ unsigned int block_map_maximum_age;
+ bool deduplication;
+ bool compression;
+ struct thread_count_config thread_counts;
+ block_count_t max_discard_blocks;
+};
+
+enum vdo_completion_type {
+ /* Keep VDO_UNSET_COMPLETION_TYPE at the top. */
+ VDO_UNSET_COMPLETION_TYPE,
+ VDO_ACTION_COMPLETION,
+ VDO_ADMIN_COMPLETION,
+ VDO_BLOCK_ALLOCATOR_COMPLETION,
+ VDO_DATA_VIO_POOL_COMPLETION,
+ VDO_DECREMENT_COMPLETION,
+ VDO_FLUSH_COMPLETION,
+ VDO_FLUSH_NOTIFICATION_COMPLETION,
+ VDO_GENERATION_FLUSHED_COMPLETION,
+ VDO_HASH_ZONE_COMPLETION,
+ VDO_HASH_ZONES_COMPLETION,
+ VDO_LOCK_COUNTER_COMPLETION,
+ VDO_PAGE_COMPLETION,
+ VDO_READ_ONLY_MODE_COMPLETION,
+ VDO_REPAIR_COMPLETION,
+ VDO_SYNC_COMPLETION,
+ VIO_COMPLETION,
+} __packed;
+
+struct vdo_completion;
+
+/**
+ * typedef vdo_action_fn - An asynchronous VDO operation.
+ * @completion: The completion of the operation.
+ */
+typedef void (*vdo_action_fn)(struct vdo_completion *completion);
+
+enum vdo_completion_priority {
+ BIO_ACK_Q_ACK_PRIORITY = 0,
+ BIO_ACK_Q_MAX_PRIORITY = 0,
+ BIO_Q_COMPRESSED_DATA_PRIORITY = 0,
+ BIO_Q_DATA_PRIORITY = 0,
+ BIO_Q_FLUSH_PRIORITY = 2,
+ BIO_Q_HIGH_PRIORITY = 2,
+ BIO_Q_METADATA_PRIORITY = 1,
+ BIO_Q_VERIFY_PRIORITY = 1,
+ BIO_Q_MAX_PRIORITY = 2,
+ CPU_Q_COMPLETE_VIO_PRIORITY = 0,
+ CPU_Q_COMPLETE_READ_PRIORITY = 0,
+ CPU_Q_COMPRESS_BLOCK_PRIORITY = 0,
+ CPU_Q_EVENT_REPORTER_PRIORITY = 0,
+ CPU_Q_HASH_BLOCK_PRIORITY = 0,
+ CPU_Q_MAX_PRIORITY = 0,
+ UDS_Q_PRIORITY = 0,
+ UDS_Q_MAX_PRIORITY = 0,
+ VDO_DEFAULT_Q_COMPLETION_PRIORITY = 1,
+ VDO_DEFAULT_Q_FLUSH_PRIORITY = 2,
+ VDO_DEFAULT_Q_MAP_BIO_PRIORITY = 0,
+ VDO_DEFAULT_Q_SYNC_PRIORITY = 2,
+ VDO_DEFAULT_Q_VIO_CALLBACK_PRIORITY = 1,
+ VDO_DEFAULT_Q_MAX_PRIORITY = 2,
+ /* The maximum allowable priority */
+ VDO_WORK_Q_MAX_PRIORITY = 2,
+ /* A value which must be out of range for a valid priority */
+ VDO_WORK_Q_DEFAULT_PRIORITY = VDO_WORK_Q_MAX_PRIORITY + 1,
+};
+
+struct vdo_completion {
+ /* The type of completion this is */
+ enum vdo_completion_type type;
+
+ /*
+ * True once the processing of the operation is complete. This flag should not
+ * be used by waiters external to the VDO base as it is used to gate calling the callback.
+ */
+ bool complete;
+
+ /*
+ * If true, queue this completion on the next callback invocation, even if it is already
+ * running on the correct thread.
+ */
+ bool requeue;
+
+ /* The ID of the thread which should run the next callback */
+ thread_id_t callback_thread_id;
+
+ /* The result of the operation */
+ int result;
+
+ /* The VDO on which this completion operates */
+ struct vdo *vdo;
+
+ /* The callback which will be called once the operation is complete */
+ vdo_action_fn callback;
+
+ /* Callback which, if set, will be called if an error result is set */
+ vdo_action_fn error_handler;
+
+ /* The parent object, if any, that spawned this completion */
+ void *parent;
+
+ /* Entry link for lock-free work queue */
+ struct funnel_queue_entry work_queue_entry_link;
+ enum vdo_completion_priority priority;
+ struct vdo_work_queue *my_queue;
+};
+
+struct block_allocator;
+struct data_vio;
+struct vdo;
+struct vdo_config;
+
+/* vio types for statistics and instrumentation. */
+enum vio_type {
+ VIO_TYPE_UNINITIALIZED = 0,
+ VIO_TYPE_DATA,
+ VIO_TYPE_BLOCK_ALLOCATOR,
+ VIO_TYPE_BLOCK_MAP,
+ VIO_TYPE_BLOCK_MAP_INTERIOR,
+ VIO_TYPE_GEOMETRY,
+ VIO_TYPE_PARTITION_COPY,
+ VIO_TYPE_RECOVERY_JOURNAL,
+ VIO_TYPE_SLAB_JOURNAL,
+ VIO_TYPE_SLAB_SUMMARY,
+ VIO_TYPE_SUPER_BLOCK,
+} __packed;
+
+/* Priority levels for asynchronous I/O operations performed on a vio. */
+enum vio_priority {
+ VIO_PRIORITY_LOW = 0,
+ VIO_PRIORITY_DATA = VIO_PRIORITY_LOW,
+ VIO_PRIORITY_COMPRESSED_DATA = VIO_PRIORITY_DATA,
+ VIO_PRIORITY_METADATA,
+ VIO_PRIORITY_HIGH,
+} __packed;
+
+/*
+ * A wrapper for a bio. All I/O to the storage below a vdo is conducted via vios.
+ */
+struct vio {
+ /* The completion for this vio */
+ struct vdo_completion completion;
+
+ /* The bio zone in which I/O should be processed */
+ zone_count_t bio_zone;
+
+ /* The queueing priority of the vio operation */
+ enum vio_priority priority;
+
+ /* The vio type is used for statistics and instrumentation. */
+ enum vio_type type;
+
+ /* The size of this vio in blocks */
+ unsigned int block_count;
+
+ /* The data being read or written. */
+ char *data;
+
+ /* The VDO-owned bio to use for all IO for this vio */
+ struct bio *bio;
+
+ /*
+ * A list of enqueued bios with consecutive block numbers, stored by vdo_submit_bio() under
+ * the first-enqueued vio. The other vios are found via their bio entries in this list, and
+ * are not added to the work queue as separate completions.
+ */
+ struct bio_list bios_merged;
+};
+
+#endif /* VDO_TYPES_H */
diff --git a/drivers/md/dm-vdo/vdo.c b/drivers/md/dm-vdo/vdo.c
new file mode 100644
index 000000000000..fff847767755
--- /dev/null
+++ b/drivers/md/dm-vdo/vdo.c
@@ -0,0 +1,1730 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2023 Red Hat
+ */
+
+/*
+ * This file contains the main entry points for normal operations on a vdo as well as functions for
+ * constructing and destroying vdo instances (in memory).
+ */
+
+/**
+ * DOC:
+ *
+ * A read_only_notifier has a single completion which is used to perform read-only notifications,
+ * however, vdo_enter_read_only_mode() may be called from any thread. A pair of fields, protected
+ * by a spinlock, are used to control the read-only mode entry process. The first field holds the
+ * read-only error. The second is the state field, which may hold any of the four special values
+ * enumerated here.
+ *
+ * When vdo_enter_read_only_mode() is called from some vdo thread, if the read_only_error field
+ * already contains an error (i.e. its value is not VDO_SUCCESS), then some other error has already
+ * initiated the read-only process, and nothing more is done. Otherwise, the new error is stored in
+ * the read_only_error field, and the state field is consulted. If the state is MAY_NOTIFY, it is
+ * set to NOTIFYING, and the notification process begins. If the state is MAY_NOT_NOTIFY, then
+ * notifications are currently disallowed, generally due to the vdo being suspended. In this case,
+ * nothing more will be done until the vdo is resumed, at which point the notification will be
+ * performed. In any other case, the vdo is already read-only, and there is nothing more to do.
+ */
+
+#include "vdo.h"
+
+#include <linux/completion.h>
+#include <linux/device-mapper.h>
+#include <linux/kernel.h>
+#include <linux/lz4.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/spinlock.h>
+#include <linux/types.h>
+
+#include "logger.h"
+#include "memory-alloc.h"
+#include "permassert.h"
+#include "string-utils.h"
+
+#include "block-map.h"
+#include "completion.h"
+#include "data-vio.h"
+#include "dedupe.h"
+#include "encodings.h"
+#include "funnel-workqueue.h"
+#include "io-submitter.h"
+#include "logical-zone.h"
+#include "packer.h"
+#include "physical-zone.h"
+#include "recovery-journal.h"
+#include "slab-depot.h"
+#include "statistics.h"
+#include "status-codes.h"
+#include "vio.h"
+
+#define PARANOID_THREAD_CONSISTENCY_CHECKS 0
+
+struct sync_completion {
+ struct vdo_completion vdo_completion;
+ struct completion completion;
+};
+
+/* A linked list is adequate for the small number of entries we expect. */
+struct device_registry {
+ struct list_head links;
+ /* TODO: Convert to rcu per kernel recommendation. */
+ rwlock_t lock;
+};
+
+static struct device_registry registry;
+
+/**
+ * vdo_initialize_device_registry_once() - Initialize the necessary structures for the device
+ * registry.
+ */
+void vdo_initialize_device_registry_once(void)
+{
+ INIT_LIST_HEAD(&registry.links);
+ rwlock_init(&registry.lock);
+}
+
+/** vdo_is_equal() - Implements vdo_filter_fn. */
+static bool vdo_is_equal(struct vdo *vdo, const void *context)
+{
+ return (vdo == context);
+}
+
+/**
+ * filter_vdos_locked() - Find a vdo in the registry if it exists there.
+ * @filter: The filter function to apply to devices.
+ * @context: A bit of context to provide the filter.
+ *
+ * Context: Must be called holding the lock.
+ *
+ * Return: the vdo object found, if any.
+ */
+static struct vdo * __must_check filter_vdos_locked(vdo_filter_fn filter,
+ const void *context)
+{
+ struct vdo *vdo;
+
+ list_for_each_entry(vdo, &registry.links, registration) {
+ if (filter(vdo, context))
+ return vdo;
+ }
+
+ return NULL;
+}
+
+/**
+ * vdo_find_matching() - Find and return the first (if any) vdo matching a given filter function.
+ * @filter: The filter function to apply to vdos.
+ * @context: A bit of context to provide the filter.
+ */
+struct vdo *vdo_find_matching(vdo_filter_fn filter, const void *context)
+{
+ struct vdo *vdo;
+
+ read_lock(&registry.lock);
+ vdo = filter_vdos_locked(filter, context);
+ read_unlock(&registry.lock);
+
+ return vdo;
+}
+
+static void start_vdo_request_queue(void *ptr)
+{
+ struct vdo_thread *thread = vdo_get_work_queue_owner(vdo_get_current_work_queue());
+
+ vdo_register_allocating_thread(&thread->allocating_thread,
+ &thread->vdo->allocations_allowed);
+}
+
+static void finish_vdo_request_queue(void *ptr)
+{
+ vdo_unregister_allocating_thread();
+}
+
+#ifdef MODULE
+#define MODULE_NAME THIS_MODULE->name
+#else
+#define MODULE_NAME "dm-vdo"
+#endif /* MODULE */
+
+static const struct vdo_work_queue_type default_queue_type = {
+ .start = start_vdo_request_queue,
+ .finish = finish_vdo_request_queue,
+ .max_priority = VDO_DEFAULT_Q_MAX_PRIORITY,
+ .default_priority = VDO_DEFAULT_Q_COMPLETION_PRIORITY,
+};
+
+static const struct vdo_work_queue_type bio_ack_q_type = {
+ .start = NULL,
+ .finish = NULL,
+ .max_priority = BIO_ACK_Q_MAX_PRIORITY,
+ .default_priority = BIO_ACK_Q_ACK_PRIORITY,
+};
+
+static const struct vdo_work_queue_type cpu_q_type = {
+ .start = NULL,
+ .finish = NULL,
+ .max_priority = CPU_Q_MAX_PRIORITY,
+ .default_priority = CPU_Q_MAX_PRIORITY,
+};
+
+static void uninitialize_thread_config(struct thread_config *config)
+{
+ vdo_free(vdo_forget(config->logical_threads));
+ vdo_free(vdo_forget(config->physical_threads));
+ vdo_free(vdo_forget(config->hash_zone_threads));
+ vdo_free(vdo_forget(config->bio_threads));
+ memset(config, 0, sizeof(struct thread_config));
+}
+
+static void assign_thread_ids(struct thread_config *config,
+ thread_id_t thread_ids[], zone_count_t count)
+{
+ zone_count_t zone;
+
+ for (zone = 0; zone < count; zone++)
+ thread_ids[zone] = config->thread_count++;
+}
+
+/**
+ * initialize_thread_config() - Initialize the thread mapping
+ *
+ * If the logical, physical, and hash zone counts are all 0, a single thread will be shared by all
+ * three plus the packer and recovery journal. Otherwise, there must be at least one of each type,
+ * and each will have its own thread, as will the packer and recovery journal.
+ *
+ * Return: VDO_SUCCESS or an error.
+ */
+static int __must_check initialize_thread_config(struct thread_count_config counts,
+ struct thread_config *config)
+{
+ int result;
+ bool single = ((counts.logical_zones + counts.physical_zones + counts.hash_zones) == 0);
+
+ config->bio_thread_count = counts.bio_threads;
+ if (single) {
+ config->logical_zone_count = 1;
+ config->physical_zone_count = 1;
+ config->hash_zone_count = 1;
+ } else {
+ config->logical_zone_count = counts.logical_zones;
+ config->physical_zone_count = counts.physical_zones;
+ config->hash_zone_count = counts.hash_zones;
+ }
+
+ result = vdo_allocate(config->logical_zone_count, thread_id_t,
+ "logical thread array", &config->logical_threads);
+ if (result != VDO_SUCCESS) {
+ uninitialize_thread_config(config);
+ return result;
+ }
+
+ result = vdo_allocate(config->physical_zone_count, thread_id_t,
+ "physical thread array", &config->physical_threads);
+ if (result != VDO_SUCCESS) {
+ uninitialize_thread_config(config);
+ return result;
+ }
+
+ result = vdo_allocate(config->hash_zone_count, thread_id_t,
+ "hash thread array", &config->hash_zone_threads);
+ if (result != VDO_SUCCESS) {
+ uninitialize_thread_config(config);
+ return result;
+ }
+
+ result = vdo_allocate(config->bio_thread_count, thread_id_t,
+ "bio thread array", &config->bio_threads);
+ if (result != VDO_SUCCESS) {
+ uninitialize_thread_config(config);
+ return result;
+ }
+
+ if (single) {
+ config->logical_threads[0] = config->thread_count;
+ config->physical_threads[0] = config->thread_count;
+ config->hash_zone_threads[0] = config->thread_count++;
+ } else {
+ config->admin_thread = config->thread_count;
+ config->journal_thread = config->thread_count++;
+ config->packer_thread = config->thread_count++;
+ assign_thread_ids(config, config->logical_threads, counts.logical_zones);
+ assign_thread_ids(config, config->physical_threads, counts.physical_zones);
+ assign_thread_ids(config, config->hash_zone_threads, counts.hash_zones);
+ }
+
+ config->dedupe_thread = config->thread_count++;
+ config->bio_ack_thread =
+ ((counts.bio_ack_threads > 0) ? config->thread_count++ : VDO_INVALID_THREAD_ID);
+ config->cpu_thread = config->thread_count++;
+ assign_thread_ids(config, config->bio_threads, counts.bio_threads);
+ return VDO_SUCCESS;
+}
+
+/**
+ * read_geometry_block() - Synchronously read the geometry block from a vdo's underlying block
+ * device.
+ * @vdo: The vdo whose geometry is to be read.
+ *
+ * Return: VDO_SUCCESS or an error code.
+ */
+static int __must_check read_geometry_block(struct vdo *vdo)
+{
+ struct vio *vio;
+ char *block;
+ int result;
+
+ result = vdo_allocate(VDO_BLOCK_SIZE, u8, __func__, &block);
+ if (result != VDO_SUCCESS)
+ return result;
+
+ result = create_metadata_vio(vdo, VIO_TYPE_GEOMETRY, VIO_PRIORITY_HIGH, NULL,
+ block, &vio);
+ if (result != VDO_SUCCESS) {
+ vdo_free(block);
+ return result;
+ }
+
+ /*
+ * This is only safe because, having not already loaded the geometry, the vdo's geometry's
+ * bio_offset field is 0, so the fact that vio_reset_bio() will subtract that offset from
+ * the supplied pbn is not a problem.
+ */
+ result = vio_reset_bio(vio, block, NULL, REQ_OP_READ,
+ VDO_GEOMETRY_BLOCK_LOCATION);
+ if (result != VDO_SUCCESS) {
+ free_vio(vdo_forget(vio));
+ vdo_free(block);
+ return result;
+ }
+
+ bio_set_dev(vio->bio, vdo_get_backing_device(vdo));
+ submit_bio_wait(vio->bio);
+ result = blk_status_to_errno(vio->bio->bi_status);
+ free_vio(vdo_forget(vio));
+ if (result != 0) {
+ vdo_log_error_strerror(result, "synchronous read failed");
+ vdo_free(block);
+ return -EIO;
+ }
+
+ result = vdo_parse_geometry_block((u8 *) block, &vdo->geometry);
+ vdo_free(block);
+ return result;
+}
+
+static bool get_zone_thread_name(const thread_id_t thread_ids[], zone_count_t count,
+ thread_id_t id, const char *prefix,
+ char *buffer, size_t buffer_length)
+{
+ if (id >= thread_ids[0]) {
+ thread_id_t index = id - thread_ids[0];
+
+ if (index < count) {
+ snprintf(buffer, buffer_length, "%s%d", prefix, index);
+ return true;
+ }
+ }
+
+ return false;
+}
+
+/**
+ * get_thread_name() - Format the name of the worker thread desired to support a given work queue.
+ * @thread_config: The thread configuration.
+ * @thread_id: The thread id.
+ * @buffer: Where to put the formatted name.
+ * @buffer_length: Size of the output buffer.
+ *
+ * The physical layer may add a prefix identifying the product; the output from this function
+ * should just identify the thread.
+ */
+static void get_thread_name(const struct thread_config *thread_config,
+ thread_id_t thread_id, char *buffer, size_t buffer_length)
+{
+ if (thread_id == thread_config->journal_thread) {
+ if (thread_config->packer_thread == thread_id) {
+ /*
+ * This is the "single thread" config where one thread is used for the
+ * journal, packer, logical, physical, and hash zones. In that case, it is
+ * known as the "request queue."
+ */
+ snprintf(buffer, buffer_length, "reqQ");
+ return;
+ }
+
+ snprintf(buffer, buffer_length, "journalQ");
+ return;
+ } else if (thread_id == thread_config->admin_thread) {
+ /* Theoretically this could be different from the journal thread. */
+ snprintf(buffer, buffer_length, "adminQ");
+ return;
+ } else if (thread_id == thread_config->packer_thread) {
+ snprintf(buffer, buffer_length, "packerQ");
+ return;
+ } else if (thread_id == thread_config->dedupe_thread) {
+ snprintf(buffer, buffer_length, "dedupeQ");
+ return;
+ } else if (thread_id == thread_config->bio_ack_thread) {
+ snprintf(buffer, buffer_length, "ackQ");
+ return;
+ } else if (thread_id == thread_config->cpu_thread) {
+ snprintf(buffer, buffer_length, "cpuQ");
+ return;
+ }
+
+ if (get_zone_thread_name(thread_config->logical_threads,
+ thread_config->logical_zone_count,
+ thread_id, "logQ", buffer, buffer_length))
+ return;
+
+ if (get_zone_thread_name(thread_config->physical_threads,
+ thread_config->physical_zone_count,
+ thread_id, "physQ", buffer, buffer_length))
+ return;
+
+ if (get_zone_thread_name(thread_config->hash_zone_threads,
+ thread_config->hash_zone_count,
+ thread_id, "hashQ", buffer, buffer_length))
+ return;
+
+ if (get_zone_thread_name(thread_config->bio_threads,
+ thread_config->bio_thread_count,
+ thread_id, "bioQ", buffer, buffer_length))
+ return;
+
+ /* Some sort of misconfiguration? */
+ snprintf(buffer, buffer_length, "reqQ%d", thread_id);
+}
+
+/**
+ * vdo_make_thread() - Construct a single vdo work_queue and its associated thread (or threads for
+ * round-robin queues).
+ * @vdo: The vdo which owns the thread.
+ * @thread_id: The id of the thread to create (as determined by the thread_config).
+ * @type: The description of the work queue for this thread.
+ * @queue_count: The number of actual threads/queues contained in the "thread".
+ * @contexts: An array of queue_count contexts, one for each individual queue; may be NULL.
+ *
+ * Each "thread" constructed by this method is represented by a unique thread id in the thread
+ * config, and completions can be enqueued to the queue and run on the threads comprising this
+ * entity.
+ *
+ * Return: VDO_SUCCESS or an error.
+ */
+int vdo_make_thread(struct vdo *vdo, thread_id_t thread_id,
+ const struct vdo_work_queue_type *type,
+ unsigned int queue_count, void *contexts[])
+{
+ struct vdo_thread *thread = &vdo->threads[thread_id];
+ char queue_name[MAX_VDO_WORK_QUEUE_NAME_LEN];
+
+ if (type == NULL)
+ type = &default_queue_type;
+
+ if (thread->queue != NULL) {
+ return VDO_ASSERT(vdo_work_queue_type_is(thread->queue, type),
+ "already constructed vdo thread %u is of the correct type",
+ thread_id);
+ }
+
+ thread->vdo = vdo;
+ thread->thread_id = thread_id;
+ get_thread_name(&vdo->thread_config, thread_id, queue_name, sizeof(queue_name));
+ return vdo_make_work_queue(vdo->thread_name_prefix, queue_name, thread,
+ type, queue_count, contexts, &thread->queue);
+}
+
+/**
+ * register_vdo() - Register a VDO; it must not already be registered.
+ * @vdo: The vdo to register.
+ *
+ * Return: VDO_SUCCESS or an error.
+ */
+static int register_vdo(struct vdo *vdo)
+{
+ int result;
+
+ write_lock(&registry.lock);
+ result = VDO_ASSERT(filter_vdos_locked(vdo_is_equal, vdo) == NULL,
+ "VDO not already registered");
+ if (result == VDO_SUCCESS) {
+ INIT_LIST_HEAD(&vdo->registration);
+ list_add_tail(&vdo->registration, &registry.links);
+ }
+ write_unlock(&registry.lock);
+
+ return result;
+}
+
+/**
+ * initialize_vdo() - Do the portion of initializing a vdo which will clean up after itself on
+ * error.
+ * @vdo: The vdo being initialized
+ * @config: The configuration of the vdo
+ * @instance: The instance number of the vdo
+ * @reason: The buffer to hold the failure reason on error
+ */
+static int initialize_vdo(struct vdo *vdo, struct device_config *config,
+ unsigned int instance, char **reason)
+{
+ int result;
+ zone_count_t i;
+
+ vdo->device_config = config;
+ vdo->starting_sector_offset = config->owning_target->begin;
+ vdo->instance = instance;
+ vdo->allocations_allowed = true;
+ vdo_set_admin_state_code(&vdo->admin.state, VDO_ADMIN_STATE_NEW);
+ INIT_LIST_HEAD(&vdo->device_config_list);
+ vdo_initialize_completion(&vdo->admin.completion, vdo, VDO_ADMIN_COMPLETION);
+ init_completion(&vdo->admin.callback_sync);
+ mutex_init(&vdo->stats_mutex);
+ result = read_geometry_block(vdo);
+ if (result != VDO_SUCCESS) {
+ *reason = "Could not load geometry block";
+ return result;
+ }
+
+ result = initialize_thread_config(config->thread_counts, &vdo->thread_config);
+ if (result != VDO_SUCCESS) {
+ *reason = "Cannot create thread configuration";
+ return result;
+ }
+
+ vdo_log_info("zones: %d logical, %d physical, %d hash; total threads: %d",
+ config->thread_counts.logical_zones,
+ config->thread_counts.physical_zones,
+ config->thread_counts.hash_zones, vdo->thread_config.thread_count);
+
+ /* Compression context storage */
+ result = vdo_allocate(config->thread_counts.cpu_threads, char *, "LZ4 context",
+ &vdo->compression_context);
+ if (result != VDO_SUCCESS) {
+ *reason = "cannot allocate LZ4 context";
+ return result;
+ }
+
+ for (i = 0; i < config->thread_counts.cpu_threads; i++) {
+ result = vdo_allocate(LZ4_MEM_COMPRESS, char, "LZ4 context",
+ &vdo->compression_context[i]);
+ if (result != VDO_SUCCESS) {
+ *reason = "cannot allocate LZ4 context";
+ return result;
+ }
+ }
+
+ result = register_vdo(vdo);
+ if (result != VDO_SUCCESS) {
+ *reason = "Cannot add VDO to device registry";
+ return result;
+ }
+
+ vdo_set_admin_state_code(&vdo->admin.state, VDO_ADMIN_STATE_INITIALIZED);
+ return result;
+}
+
+/**
+ * vdo_make() - Allocate and initialize a vdo.
+ * @instance: Device instantiation counter.
+ * @config: The device configuration.
+ * @reason: The reason for any failure during this call.
+ * @vdo_ptr: A pointer to hold the created vdo.
+ *
+ * Return: VDO_SUCCESS or an error.
+ */
+int vdo_make(unsigned int instance, struct device_config *config, char **reason,
+ struct vdo **vdo_ptr)
+{
+ int result;
+ struct vdo *vdo;
+
+ /* Initialize with a generic failure reason to prevent returning garbage. */
+ *reason = "Unspecified error";
+
+ result = vdo_allocate(1, struct vdo, __func__, &vdo);
+ if (result != VDO_SUCCESS) {
+ *reason = "Cannot allocate VDO";
+ return result;
+ }
+
+ result = initialize_vdo(vdo, config, instance, reason);
+ if (result != VDO_SUCCESS) {
+ vdo_destroy(vdo);
+ return result;
+ }
+
+ /* From here on, the caller will clean up if there is an error. */
+ *vdo_ptr = vdo;
+
+ snprintf(vdo->thread_name_prefix, sizeof(vdo->thread_name_prefix),
+ "%s%u", MODULE_NAME, instance);
+ BUG_ON(vdo->thread_name_prefix[0] == '\0');
+ result = vdo_allocate(vdo->thread_config.thread_count,
+ struct vdo_thread, __func__, &vdo->threads);
+ if (result != VDO_SUCCESS) {
+ *reason = "Cannot allocate thread structures";
+ return result;
+ }
+
+ result = vdo_make_thread(vdo, vdo->thread_config.admin_thread,
+ &default_queue_type, 1, NULL);
+ if (result != VDO_SUCCESS) {
+ *reason = "Cannot make admin thread";
+ return result;
+ }
+
+ result = vdo_make_flusher(vdo);
+ if (result != VDO_SUCCESS) {
+ *reason = "Cannot make flusher zones";
+ return result;
+ }
+
+ result = vdo_make_packer(vdo, DEFAULT_PACKER_BINS, &vdo->packer);
+ if (result != VDO_SUCCESS) {
+ *reason = "Cannot make packer zones";
+ return result;
+ }
+
+ BUG_ON(vdo->device_config->logical_block_size <= 0);
+ BUG_ON(vdo->device_config->owned_device == NULL);
+ result = make_data_vio_pool(vdo, MAXIMUM_VDO_USER_VIOS,
+ MAXIMUM_VDO_USER_VIOS * 3 / 4,
+ &vdo->data_vio_pool);
+ if (result != VDO_SUCCESS) {
+ *reason = "Cannot allocate data_vio pool";
+ return result;
+ }
+
+ result = vdo_make_io_submitter(config->thread_counts.bio_threads,
+ config->thread_counts.bio_rotation_interval,
+ get_data_vio_pool_request_limit(vdo->data_vio_pool),
+ vdo, &vdo->io_submitter);
+ if (result != VDO_SUCCESS) {
+ *reason = "bio submission initialization failed";
+ return result;
+ }
+
+ if (vdo_uses_bio_ack_queue(vdo)) {
+ result = vdo_make_thread(vdo, vdo->thread_config.bio_ack_thread,
+ &bio_ack_q_type,
+ config->thread_counts.bio_ack_threads, NULL);
+ if (result != VDO_SUCCESS) {
+ *reason = "bio ack queue initialization failed";
+ return result;
+ }
+ }
+
+ result = vdo_make_thread(vdo, vdo->thread_config.cpu_thread, &cpu_q_type,
+ config->thread_counts.cpu_threads,
+ (void **) vdo->compression_context);
+ if (result != VDO_SUCCESS) {
+ *reason = "CPU queue initialization failed";
+ return result;
+ }
+
+ return VDO_SUCCESS;
+}
+
+static void finish_vdo(struct vdo *vdo)
+{
+ int i;
+
+ if (vdo->threads == NULL)
+ return;
+
+ vdo_cleanup_io_submitter(vdo->io_submitter);
+ vdo_finish_dedupe_index(vdo->hash_zones);
+
+ for (i = 0; i < vdo->thread_config.thread_count; i++)
+ vdo_finish_work_queue(vdo->threads[i].queue);
+}
+
+/**
+ * free_listeners() - Free the list of read-only listeners associated with a thread.
+ * @thread: The thread holding the list to free.
+ */
+static void free_listeners(struct vdo_thread *thread)
+{
+ struct read_only_listener *listener, *next;
+
+ for (listener = vdo_forget(thread->listeners); listener != NULL; listener = next) {
+ next = vdo_forget(listener->next);
+ vdo_free(listener);
+ }
+}
+
+static void uninitialize_super_block(struct vdo_super_block *super_block)
+{
+ free_vio_components(&super_block->vio);
+ vdo_free(super_block->buffer);
+}
+
+/**
+ * unregister_vdo() - Take a vdo out of the device registry.
+ * @vdo: The vdo to remove.
+ *
+ * The vdo is only unlinked if the registry lookup finds this very object; the whole operation
+ * runs under the registry write lock.
+ */
+static void unregister_vdo(struct vdo *vdo)
+{
+	struct vdo *registered;
+
+	write_lock(&registry.lock);
+	registered = filter_vdos_locked(vdo_is_equal, vdo);
+	if (registered == vdo)
+		list_del_init(&vdo->registration);
+
+	write_unlock(&registry.lock);
+}
+
+/**
+ * vdo_destroy() - Destroy a vdo instance.
+ * @vdo: The vdo to destroy (may be NULL).
+ *
+ * Tears the vdo down in a fixed order: the asynchronous machinery is finished, the vdo is removed
+ * from the registry, each component is freed, then the per-thread listener lists and work queues,
+ * the thread configuration, the compression contexts, and finally the vdo structure itself.
+ *
+ * This function hits BUG_ON() if the vdo's admin state is still marked normal.
+ */
+void vdo_destroy(struct vdo *vdo)
+{
+ unsigned int i;
+
+ if (vdo == NULL)
+ return;
+
+ /* A running VDO should never be destroyed without suspending first. */
+ BUG_ON(vdo_get_admin_state(vdo)->normal);
+
+ /* NOTE(review): presumably lets teardown paths allocate if they need to -- confirm. */
+ vdo->allocations_allowed = true;
+
+ /* Stop the queues and drop the registry entry before freeing any component. */
+ finish_vdo(vdo);
+ unregister_vdo(vdo);
+ free_data_vio_pool(vdo->data_vio_pool);
+ vdo_free_io_submitter(vdo_forget(vdo->io_submitter));
+ vdo_free_flusher(vdo_forget(vdo->flusher));
+ vdo_free_packer(vdo_forget(vdo->packer));
+ vdo_free_recovery_journal(vdo_forget(vdo->recovery_journal));
+ vdo_free_slab_depot(vdo_forget(vdo->depot));
+ vdo_uninitialize_layout(&vdo->layout);
+ vdo_uninitialize_layout(&vdo->next_layout);
+ /* The kcopyd client is only destroyed if one was ever created. */
+ if (vdo->partition_copier)
+ dm_kcopyd_client_destroy(vdo_forget(vdo->partition_copier));
+ uninitialize_super_block(&vdo->super_block);
+ vdo_free_block_map(vdo_forget(vdo->block_map));
+ vdo_free_hash_zones(vdo_forget(vdo->hash_zones));
+ vdo_free_physical_zones(vdo_forget(vdo->physical_zones));
+ vdo_free_logical_zones(vdo_forget(vdo->logical_zones));
+
+ /* Each thread owns a listener list and a work queue; release both, then the array. */
+ if (vdo->threads != NULL) {
+ for (i = 0; i < vdo->thread_config.thread_count; i++) {
+ free_listeners(&vdo->threads[i]);
+ vdo_free_work_queue(vdo_forget(vdo->threads[i].queue));
+ }
+ vdo_free(vdo_forget(vdo->threads));
+ }
+
+ uninitialize_thread_config(&vdo->thread_config);
+
+ /* There is one compression context slot per configured CPU thread. */
+ if (vdo->compression_context != NULL) {
+ for (i = 0; i < vdo->device_config->thread_counts.cpu_threads; i++)
+ vdo_free(vdo_forget(vdo->compression_context[i]));
+
+ vdo_free(vdo_forget(vdo->compression_context));
+ }
+ vdo_free(vdo);
+}
+
+/*
+ * initialize_super_block() - Allocate the encoded buffer and vio components of a super block.
+ * @vdo: The vdo which will own the super block vio.
+ * @super_block: The super block to initialize.
+ *
+ * All allocations are made through @super_block so that the buffer which is allocated is the same
+ * one handed to the vio. Nothing is freed here on failure; uninitialize_super_block() releases
+ * both the vio components and the buffer.
+ *
+ * Return: VDO_SUCCESS or an error from one of the allocations.
+ */
+static int initialize_super_block(struct vdo *vdo, struct vdo_super_block *super_block)
+{
+	int result;
+
+	result = vdo_allocate(VDO_BLOCK_SIZE, char, "encoded super block",
+			      (char **) &super_block->buffer);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	return allocate_vio_components(vdo, VIO_TYPE_SUPER_BLOCK,
+				       VIO_PRIORITY_METADATA, NULL, 1,
+				       (char *) super_block->buffer,
+				       &super_block->vio);
+}
+
+/**
+ * finish_reading_super_block() - Decode the super block and resume the waiting parent.
+ * @completion: The super block vio.
+ *
+ * The parent pointer is cleared from the vio's completion and the parent is continued with the
+ * result of decoding the super block buffer.
+ *
+ * This callback is registered in vdo_load_super_block().
+ */
+static void finish_reading_super_block(struct vdo_completion *completion)
+{
+	struct vio *vio = as_vio(completion);
+	struct vdo_super_block *super_block =
+		container_of(vio, struct vdo_super_block, vio);
+	struct vdo_completion *parent = vdo_forget(completion->parent);
+	int result = vdo_decode_super_block(super_block->buffer);
+
+	vdo_continue_completion(parent, result);
+}
+
+/**
+ * handle_super_block_read_error() - Record a failed super block read, then finish as usual.
+ * @completion: The super block vio.
+ *
+ * After recording the metadata I/O error, the normal read-completion path is taken.
+ *
+ * This error handler is registered in vdo_load_super_block().
+ */
+static void handle_super_block_read_error(struct vdo_completion *completion)
+{
+	struct vio *vio = as_vio(completion);
+
+	vio_record_metadata_io_error(vio);
+	finish_reading_super_block(completion);
+}
+
+/*
+ * read_super_block_endio() - Bio completion handler for the super block read.
+ * @bio: The completed bio; its private data is the super block vio.
+ *
+ * Continues the vio with finish_reading_super_block() on the callback thread of the vio's parent.
+ */
+static void read_super_block_endio(struct bio *bio)
+{
+	struct vio *vio = bio->bi_private;
+	struct vdo_completion *waiting = vio->completion.parent;
+	thread_id_t thread_id = waiting->callback_thread_id;
+
+	continue_vio_after_io(vio, finish_reading_super_block, thread_id);
+}
+
+/**
+ * vdo_load_super_block() - Allocate a super block and read its contents from storage.
+ * @vdo: The vdo containing the super block on disk.
+ * @parent: The completion to notify after loading the super block.
+ *
+ * If the super block cannot be initialized, @parent is continued immediately with the error.
+ * Otherwise a metadata read is submitted at the start of the data region.
+ */
+void vdo_load_super_block(struct vdo *vdo, struct vdo_completion *parent)
+{
+	struct vdo_super_block *super_block = &vdo->super_block;
+	int result = initialize_super_block(vdo, super_block);
+
+	if (result == VDO_SUCCESS) {
+		super_block->vio.completion.parent = parent;
+		vdo_submit_metadata_vio(&super_block->vio,
+					vdo_get_data_region_start(vdo->geometry),
+					read_super_block_endio,
+					handle_super_block_read_error,
+					REQ_OP_READ);
+		return;
+	}
+
+	vdo_continue_completion(parent, result);
+}
+
+/**
+ * vdo_get_backing_device() - Look up the block device beneath a vdo.
+ * @vdo: The vdo.
+ *
+ * Return: The block device of the device owned by the vdo's current device config.
+ */
+struct block_device *vdo_get_backing_device(const struct vdo *vdo)
+{
+	struct block_device *backing = vdo->device_config->owned_device->bdev;
+
+	return backing;
+}
+
+/**
+ * vdo_get_device_name() - Get the device name associated with the vdo target.
+ * @target: The target device interface.
+ *
+ * Return: The name of the mapped device which owns the target's table.
+ */
+const char *vdo_get_device_name(const struct dm_target *target)
+{
+	struct mapped_device *md = dm_table_get_md(target->table);
+
+	return dm_device_name(md);
+}
+
+/**
+ * vdo_synchronous_flush() - Send a flush to the backing device and wait for the outcome.
+ * @vdo: The vdo.
+ *
+ * An empty write bio with REQ_PREFLUSH is submitted and waited on. The flush_out statistic is
+ * incremented whether or not the flush succeeds. Any failure is logged and reported as -EIO.
+ *
+ * Return: VDO_SUCCESS or an error.
+ */
+int vdo_synchronous_flush(struct vdo *vdo)
+{
+	struct bio flush_bio;
+	int status;
+
+	bio_init(&flush_bio, vdo_get_backing_device(vdo), NULL, 0,
+		 REQ_OP_WRITE | REQ_PREFLUSH);
+	submit_bio_wait(&flush_bio);
+	status = blk_status_to_errno(flush_bio.bi_status);
+
+	atomic64_inc(&vdo->stats.flush_out);
+	if (status == 0) {
+		bio_uninit(&flush_bio);
+		return status;
+	}
+
+	vdo_log_error_strerror(status, "synchronous flush failed");
+	bio_uninit(&flush_bio);
+	return -EIO;
+}
+
+/**
+ * vdo_get_state() - Get the current state of the vdo.
+ * @vdo: The vdo.
+ *
+ * The state is read atomically and followed by a read barrier, which is the counterpart of the
+ * write barrier issued in vdo_set_state().
+ *
+ * Context: This method may be called from any thread.
+ *
+ * Return: The current state of the vdo.
+ */
+enum vdo_state vdo_get_state(const struct vdo *vdo)
+{
+ enum vdo_state state = atomic_read(&vdo->state);
+
+ /* pairs with barriers where state field is changed */
+ smp_rmb();
+ return state;
+}
+
+/**
+ * vdo_set_state() - Set the current state of the vdo.
+ * @vdo: The vdo whose state is to be set.
+ * @state: The new state of the vdo.
+ *
+ * A write barrier is issued before the state is stored atomically; vdo_get_state() issues the
+ * matching read barrier after loading the state.
+ *
+ * Context: This method may be called from any thread.
+ */
+void vdo_set_state(struct vdo *vdo, enum vdo_state state)
+{
+ /* pairs with barrier in vdo_get_state */
+ smp_wmb();
+ atomic_set(&vdo->state, state);
+}
+
+/**
+ * vdo_get_admin_state() - Look up the administrative state of the vdo.
+ * @vdo: The vdo.
+ *
+ * Return: The code for the vdo's current admin state.
+ */
+const struct admin_state_code *vdo_get_admin_state(const struct vdo *vdo)
+{
+	const struct admin_state_code *code = vdo_get_admin_state_code(&vdo->admin.state);
+
+	return code;
+}
+
+/**
+ * record_vdo() - Record the state of the VDO for encoding in the super block.
+ * @vdo: The vdo whose component states are to be refreshed.
+ *
+ * Copies the current vdo state, the recorded block map, recovery journal and slab depot states,
+ * and the layout into vdo->states. vdo_save_components() encodes vdo->states immediately after
+ * calling this.
+ */
+static void record_vdo(struct vdo *vdo)
+{
+ /* This is for backwards compatibility. */
+ vdo->states.unused = vdo->geometry.unused;
+ vdo->states.vdo.state = vdo_get_state(vdo);
+ vdo->states.block_map = vdo_record_block_map(vdo->block_map);
+ vdo->states.recovery_journal = vdo_record_recovery_journal(vdo->recovery_journal);
+ vdo->states.slab_depot = vdo_record_slab_depot(vdo->depot);
+ vdo->states.layout = vdo->layout;
+}
+
+/**
+ * continue_super_block_parent() - Hand the outcome of a super block save to its parent.
+ * @completion: The super block vio.
+ *
+ * The parent pointer is cleared from the vio's completion before the parent is continued with
+ * the vio's result.
+ *
+ * This callback is registered in vdo_save_components().
+ */
+static void continue_super_block_parent(struct vdo_completion *completion)
+{
+	struct vdo_completion *parent = vdo_forget(completion->parent);
+
+	vdo_continue_completion(parent, completion->result);
+}
+
+/**
+ * handle_save_error() - Record and log a failed super block save.
+ * @completion: The super block vio.
+ *
+ * Besides logging, the super block is marked unwritable and the completion's normal callback is
+ * then invoked directly.
+ *
+ * This error handler is registered in vdo_save_components().
+ */
+static void handle_save_error(struct vdo_completion *completion)
+{
+	struct vio *vio = as_vio(completion);
+	struct vdo_super_block *super_block =
+		container_of(vio, struct vdo_super_block, vio);
+
+	vio_record_metadata_io_error(vio);
+	vdo_log_error_strerror(completion->result, "super block save failed");
+
+	/*
+	 * Never try to write this super block again. Otherwise a growth attempt could fail to
+	 * write the super block with the new size while a later write of the read-only state
+	 * succeeds. Writes made just before the suspend would then be invisible if the VDO were
+	 * restarted without rebuilding, yet would reappear after a read-only rebuild.
+	 */
+	super_block->unwritable = true;
+	completion->callback(completion);
+}
+
+/*
+ * super_block_write_endio() - Bio completion handler for the super block write.
+ * @bio: The completed bio; its private data is the super block vio.
+ *
+ * Continues the vio with continue_super_block_parent() on the callback thread of the vio's parent.
+ */
+static void super_block_write_endio(struct bio *bio)
+{
+	struct vio *vio = bio->bi_private;
+	struct vdo_completion *waiting = vio->completion.parent;
+	thread_id_t thread_id = waiting->callback_thread_id;
+
+	continue_vio_after_io(vio, continue_super_block_parent, thread_id);
+}
+
+/**
+ * vdo_save_components() - Encode the vdo and save the super block asynchronously.
+ * @vdo: The vdo whose state is being saved.
+ * @parent: The completion to notify when the save is complete.
+ *
+ * @parent is continued immediately with VDO_READ_ONLY if the super block has been marked
+ * unwritable, or with VDO_COMPONENT_BUSY if the super block vio already has a parent. Otherwise
+ * the current state is recorded, encoded, and written with a preflush and FUA.
+ */
+void vdo_save_components(struct vdo *vdo, struct vdo_completion *parent)
+{
+	struct vdo_super_block *super_block = &vdo->super_block;
+	struct vio *vio = &super_block->vio;
+
+	if (super_block->unwritable) {
+		vdo_continue_completion(parent, VDO_READ_ONLY);
+		return;
+	}
+
+	/* A non-NULL parent means the vio is already in use by another save or load. */
+	if (vio->completion.parent != NULL) {
+		vdo_continue_completion(parent, VDO_COMPONENT_BUSY);
+		return;
+	}
+
+	record_vdo(vdo);
+	vdo_encode_super_block(super_block->buffer, &vdo->states);
+
+	vio->completion.parent = parent;
+	vio->completion.callback_thread_id = parent->callback_thread_id;
+	vdo_submit_metadata_vio(vio, vdo_get_data_region_start(vdo->geometry),
+				super_block_write_endio, handle_save_error,
+				REQ_OP_WRITE | REQ_PREFLUSH | REQ_FUA);
+}
+
+/**
+ * vdo_register_read_only_listener() - Register a listener to be notified when the VDO goes
+ *                                     read-only.
+ * @vdo: The vdo to register with.
+ * @listener: The object to notify.
+ * @notification: The function to call to send the notification.
+ * @thread_id: The id of the thread on which to send the notification.
+ *
+ * Registration on the dedupe thread is rejected. A new listener is placed at the head of the
+ * thread's listener list.
+ *
+ * Return: VDO_SUCCESS or an error.
+ */
+int vdo_register_read_only_listener(struct vdo *vdo, void *listener,
+				    vdo_read_only_notification_fn notification,
+				    thread_id_t thread_id)
+{
+	struct read_only_listener *entry;
+	struct vdo_thread *thread;
+	int result;
+
+	result = VDO_ASSERT(thread_id != vdo->thread_config.dedupe_thread,
+			    "read only listener not registered on dedupe thread");
+	if (result != VDO_SUCCESS)
+		return result;
+
+	result = vdo_allocate(1, struct read_only_listener, __func__, &entry);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	/* Push the new entry onto the front of the thread's list. */
+	thread = &vdo->threads[thread_id];
+	entry->listener = listener;
+	entry->notify = notification;
+	entry->next = thread->listeners;
+	thread->listeners = entry;
+	return VDO_SUCCESS;
+}
+
+/**
+ * notify_vdo_of_read_only_mode() - Notify a vdo that it is going read-only.
+ * @listener: The vdo.
+ * @parent: The completion to notify in order to acknowledge the notification.
+ *
+ * If the vdo is already in read-only mode there is nothing to record, so the notification is
+ * acknowledged at once. Otherwise the state is changed and the read-only state is saved to the
+ * super block; @parent is then notified by the save.
+ *
+ * Implements vdo_read_only_notification_fn.
+ */
+static void notify_vdo_of_read_only_mode(void *listener, struct vdo_completion *parent)
+{
+	struct vdo *vdo = listener;
+
+	if (vdo_in_read_only_mode(vdo)) {
+		/*
+		 * The parent must be completed exactly once; falling through would hand the
+		 * already-finished completion to vdo_save_components() as well.
+		 */
+		vdo_finish_completion(parent);
+		return;
+	}
+
+	vdo_set_state(vdo, VDO_READ_ONLY_MODE);
+	vdo_save_components(vdo, parent);
+}
+
+/**
+ * vdo_enable_read_only_entry() - Prepare a vdo to enter read-only mode on errors.
+ * @vdo: The vdo to enable.
+ *
+ * Initializes the read-only notifier and every thread's read-only flag from the vdo's current
+ * state, then registers the vdo itself as a read-only listener on the admin thread.
+ *
+ * Return: VDO_SUCCESS or an error.
+ */
+int vdo_enable_read_only_entry(struct vdo *vdo)
+{
+	struct read_only_notifier *notifier = &vdo->read_only_notifier;
+	bool is_read_only = vdo_in_read_only_mode(vdo);
+	thread_id_t id = 0;
+
+	/* A vdo which is already read-only has nothing left to notify. */
+	if (is_read_only)
+		notifier->read_only_error = VDO_READ_ONLY;
+
+	notifier->state = is_read_only ? NOTIFIED : MAY_NOT_NOTIFY;
+
+	spin_lock_init(&notifier->lock);
+	vdo_initialize_completion(&notifier->completion, vdo,
+				  VDO_READ_ONLY_MODE_COMPLETION);
+
+	while (id < vdo->thread_config.thread_count) {
+		vdo->threads[id].is_read_only = is_read_only;
+		id++;
+	}
+
+	return vdo_register_read_only_listener(vdo, vdo, notify_vdo_of_read_only_mode,
+						vdo->thread_config.admin_thread);
+}
+
+/**
+ * vdo_wait_until_not_entering_read_only_mode() - Wait until no read-only notifications are in
+ * progress and prevent any subsequent
+ * notifications.
+ * @parent: The completion to notify when no threads are entering read-only mode.
+ *
+ * If another completion is already waiting on the notifier, @parent is continued with
+ * VDO_COMPONENT_BUSY. If a notification is in progress, @parent becomes the notifier's waiter and
+ * is continued by finish_entering_read_only_mode(). Otherwise @parent is launched immediately.
+ *
+ * Notifications may be re-enabled by calling vdo_allow_read_only_mode_entry().
+ */
+void vdo_wait_until_not_entering_read_only_mode(struct vdo_completion *parent)
+{
+ struct vdo *vdo = parent->vdo;
+ struct read_only_notifier *notifier = &vdo->read_only_notifier;
+
+ vdo_assert_on_admin_thread(vdo, __func__);
+
+ /* Only one waiter is supported at a time. */
+ if (notifier->waiter != NULL) {
+ vdo_continue_completion(parent, VDO_COMPONENT_BUSY);
+ return;
+ }
+
+ /*
+ * Under the lock, either sign up as the waiter for an in-progress notification or forbid
+ * future notifications. The MAY_NOT_NOTIFY and NOTIFIED states are left unchanged.
+ */
+ spin_lock(&notifier->lock);
+ if (notifier->state == NOTIFYING)
+ notifier->waiter = parent;
+ else if (notifier->state == MAY_NOTIFY)
+ notifier->state = MAY_NOT_NOTIFY;
+ spin_unlock(&notifier->lock);
+
+ if (notifier->waiter == NULL) {
+ /*
+ * A notification was not in progress, and now they are
+ * disallowed.
+ */
+ vdo_launch_completion(parent);
+ return;
+ }
+}
+
+/**
+ * as_notifier() - Recover the read_only_notifier which embeds a completion.
+ * @completion: The completion to convert.
+ *
+ * The completion's type is asserted to be VDO_READ_ONLY_MODE_COMPLETION first.
+ *
+ * Return: The completion as a read_only_notifier.
+ */
+static inline struct read_only_notifier *as_notifier(struct vdo_completion *completion)
+{
+	struct read_only_notifier *notifier;
+
+	vdo_assert_completion_type(completion, VDO_READ_ONLY_MODE_COMPLETION);
+	notifier = container_of(completion, struct read_only_notifier, completion);
+	return notifier;
+}
+
+/**
+ * finish_entering_read_only_mode() - Complete the process of entering read only mode.
+ * @completion: The read-only mode completion.
+ *
+ * Marks the notifier as NOTIFIED under its lock and, if a completion was waiting for the
+ * notification to end, continues it with this completion's result.
+ */
+static void finish_entering_read_only_mode(struct vdo_completion *completion)
+{
+	struct read_only_notifier *notifier = as_notifier(completion);
+
+	vdo_assert_on_admin_thread(completion->vdo, __func__);
+
+	spin_lock(&notifier->lock);
+	notifier->state = NOTIFIED;
+	spin_unlock(&notifier->lock);
+
+	if (notifier->waiter == NULL)
+		return;
+
+	vdo_continue_completion(vdo_forget(notifier->waiter), completion->result);
+}
+
+/**
+ * make_thread_read_only() - Inform each thread that the VDO is in read-only mode.
+ * @completion: The read-only mode completion.
+ *
+ * This function is both the callback and the error handler of the notifier's completion, and is
+ * re-entered once per listener and once per thread. completion->parent tracks progress: NULL
+ * means this is the first visit to the current thread, otherwise it is the listener which was
+ * just notified. When a thread's listeners are exhausted, the completion moves to the next thread
+ * (skipping the dedupe thread), and after the last thread it is sent to the admin thread to run
+ * finish_entering_read_only_mode().
+ */
+static void make_thread_read_only(struct vdo_completion *completion)
+{
+ struct vdo *vdo = completion->vdo;
+ thread_id_t thread_id = completion->callback_thread_id;
+ struct read_only_notifier *notifier = as_notifier(completion);
+ struct read_only_listener *listener = completion->parent;
+
+ if (listener == NULL) {
+ /* This is the first call on this thread */
+ struct vdo_thread *thread = &vdo->threads[thread_id];
+
+ thread->is_read_only = true;
+ listener = thread->listeners;
+ /* The error is logged only once, on the first thread visited. */
+ if (thread_id == 0)
+ vdo_log_error_strerror(READ_ONCE(notifier->read_only_error),
+ "Unrecoverable error, entering read-only mode");
+ } else {
+ /* We've just finished notifying a listener */
+ listener = listener->next;
+ }
+
+ if (listener != NULL) {
+ /* We have a listener to notify */
+ vdo_prepare_completion(completion, make_thread_read_only,
+ make_thread_read_only, thread_id,
+ listener);
+ listener->notify(listener->listener, completion);
+ return;
+ }
+
+ /* We're done with this thread */
+ if (++thread_id == vdo->thread_config.dedupe_thread) {
+ /*
+ * We don't want to notify the dedupe thread since it may be
+ * blocked rebuilding the index.
+ */
+ thread_id++;
+ }
+
+ if (thread_id >= vdo->thread_config.thread_count) {
+ /* There are no more threads */
+ vdo_prepare_completion(completion, finish_entering_read_only_mode,
+ finish_entering_read_only_mode,
+ vdo->thread_config.admin_thread, NULL);
+ } else {
+ /* A NULL parent marks the first visit to the next thread. */
+ vdo_prepare_completion(completion, make_thread_read_only,
+ make_thread_read_only, thread_id, NULL);
+ }
+
+ vdo_launch_completion(completion);
+}
+
+/**
+ * vdo_allow_read_only_mode_entry() - Allow the notifier to put the VDO into read-only mode,
+ * reversing the effects of
+ * vdo_wait_until_not_entering_read_only_mode().
+ * @parent: The object to notify once the operation is complete.
+ *
+ * If some thread tried to put the vdo into read-only mode while notifications were disallowed, it
+ * will be done when this method is called. If that happens, the parent will not be notified until
+ * the vdo has actually entered read-only mode and attempted to save the super block.
+ *
+ * If another completion is already waiting on the notifier, @parent is continued with
+ * VDO_COMPONENT_BUSY instead.
+ *
+ * Context: This method may only be called from the admin thread.
+ */
+void vdo_allow_read_only_mode_entry(struct vdo_completion *parent)
+{
+ struct vdo *vdo = parent->vdo;
+ struct read_only_notifier *notifier = &vdo->read_only_notifier;
+
+ vdo_assert_on_admin_thread(vdo, __func__);
+
+ if (notifier->waiter != NULL) {
+ vdo_continue_completion(parent, VDO_COMPONENT_BUSY);
+ return;
+ }
+
+ /*
+ * Only the MAY_NOT_NOTIFY state is changed here. With no error recorded, notifications are
+ * simply re-enabled; with an error pending, the deferred notification starts now and the
+ * parent waits for it.
+ */
+ spin_lock(&notifier->lock);
+ if (notifier->state == MAY_NOT_NOTIFY) {
+ if (notifier->read_only_error == VDO_SUCCESS) {
+ notifier->state = MAY_NOTIFY;
+ } else {
+ notifier->state = NOTIFYING;
+ notifier->waiter = parent;
+ }
+ }
+ spin_unlock(&notifier->lock);
+
+ if (notifier->waiter == NULL) {
+ /* We're done */
+ vdo_launch_completion(parent);
+ return;
+ }
+
+ /* Do the pending notification. */
+ make_thread_read_only(&notifier->completion);
+}
+
+/**
+ * vdo_enter_read_only_mode() - Put a VDO into read-only mode and save the read-only state in the
+ * super block.
+ * @vdo: The vdo.
+ * @error_code: The error which caused the VDO to enter read-only mode.
+ *
+ * This method is a no-op if the VDO is already read-only.
+ *
+ * Only the first error reported is recorded in the notifier. The notification of all threads is
+ * launched only if the notifier was in the MAY_NOTIFY state when that first error arrived;
+ * otherwise the error is just recorded.
+ */
+void vdo_enter_read_only_mode(struct vdo *vdo, int error_code)
+{
+ bool notify = false;
+ thread_id_t thread_id = vdo_get_callback_thread_id();
+ struct read_only_notifier *notifier = &vdo->read_only_notifier;
+ struct vdo_thread *thread;
+
+ /* When not called from a vdo thread there is no per-thread flag to check or set. */
+ if (thread_id != VDO_INVALID_THREAD_ID) {
+ thread = &vdo->threads[thread_id];
+ if (thread->is_read_only) {
+ /* This thread has already gone read-only. */
+ return;
+ }
+
+ /* Record for this thread that the VDO is read-only. */
+ thread->is_read_only = true;
+ }
+
+ spin_lock(&notifier->lock);
+ if (notifier->read_only_error == VDO_SUCCESS) {
+ /* WRITE_ONCE pairs with the lockless READ_ONCE in make_thread_read_only(). */
+ WRITE_ONCE(notifier->read_only_error, error_code);
+ if (notifier->state == MAY_NOTIFY) {
+ notifier->state = NOTIFYING;
+ notify = true;
+ }
+ }
+ spin_unlock(&notifier->lock);
+
+ if (!notify) {
+ /* The notifier is already aware of a read-only error */
+ return;
+ }
+
+ /* Initiate a notification starting on the lowest numbered thread. */
+ vdo_launch_completion_callback(&notifier->completion, make_thread_read_only, 0);
+}
+
+/**
+ * vdo_is_read_only() - Check whether the VDO is read-only.
+ * @vdo: The vdo.
+ *
+ * The answer comes from the read-only flag of the vdo thread on which the caller is running, as
+ * opposed to the VDO's state field which is only safe to check from the admin thread.
+ *
+ * NOTE(review): the callback thread id is used as an array index without checking for
+ * VDO_INVALID_THREAD_ID -- confirm that all callers run on a vdo thread.
+ *
+ * Return: true if the vdo is read-only.
+ */
+bool vdo_is_read_only(struct vdo *vdo)
+{
+	thread_id_t thread_id = vdo_get_callback_thread_id();
+
+	return vdo->threads[thread_id].is_read_only;
+}
+
+/**
+ * vdo_in_read_only_mode() - Check whether a vdo's state is read-only mode.
+ * @vdo: The vdo to query.
+ *
+ * Return: true if the vdo is in read-only mode.
+ */
+bool vdo_in_read_only_mode(const struct vdo *vdo)
+{
+	enum vdo_state state = vdo_get_state(vdo);
+
+	return state == VDO_READ_ONLY_MODE;
+}
+
+/**
+ * vdo_in_recovery_mode() - Check whether a vdo's state is recovery mode.
+ * @vdo: The vdo to query.
+ *
+ * Return: true if the vdo is in recovery mode.
+ */
+bool vdo_in_recovery_mode(const struct vdo *vdo)
+{
+	enum vdo_state state = vdo_get_state(vdo);
+
+	return state == VDO_RECOVERING;
+}
+
+/**
+ * vdo_enter_recovery_mode() - Put the vdo into recovery mode.
+ * @vdo: The vdo.
+ *
+ * Nothing is changed if the vdo is already in read-only mode. Expected to run on the admin
+ * thread.
+ */
+void vdo_enter_recovery_mode(struct vdo *vdo)
+{
+	vdo_assert_on_admin_thread(vdo, __func__);
+
+	if (!vdo_in_read_only_mode(vdo)) {
+		vdo_log_info("Entering recovery mode");
+		vdo_set_state(vdo, VDO_RECOVERING);
+	}
+}
+
+/**
+ * complete_synchronous_action() - Wake the thread waiting on a synchronous action.
+ * @completion: The sync completion.
+ *
+ * The completion must be the vdo_completion embedded in a struct sync_completion.
+ */
+static void complete_synchronous_action(struct vdo_completion *completion)
+{
+	struct sync_completion *sync;
+
+	vdo_assert_completion_type(completion, VDO_SYNC_COMPLETION);
+	sync = container_of(completion, struct sync_completion, vdo_completion);
+	complete(&sync->completion);
+}
+
+/**
+ * perform_synchronous_action() - Launch an action on a VDO thread and wait for it to complete.
+ * @vdo: The vdo.
+ * @action: The callback to launch.
+ * @thread_id: The thread on which to run the action.
+ * @parent: The parent of the sync completion (may be NULL).
+ *
+ * The calling thread blocks until the action signals the embedded kernel completion, which the
+ * callbacks in this file do via complete_synchronous_action().
+ *
+ * Return: The result stored in the sync completion once the action has finished.
+ */
+static int perform_synchronous_action(struct vdo *vdo, vdo_action_fn action,
+				      thread_id_t thread_id, void *parent)
+{
+	struct sync_completion waiter;
+	struct vdo_completion *completion = &waiter.vdo_completion;
+
+	vdo_initialize_completion(completion, vdo, VDO_SYNC_COMPLETION);
+	init_completion(&waiter.completion);
+	completion->parent = parent;
+	vdo_launch_completion_callback(completion, action, thread_id);
+	wait_for_completion(&waiter.completion);
+	return completion->result;
+}
+
+/**
+ * set_compression_callback() - Callback to turn compression on or off.
+ * @completion: The completion; its parent points at the requested setting.
+ *
+ * On return the bool referenced by the parent holds the previous setting instead. The packer is
+ * flushed when compression goes from enabled to disabled.
+ */
+static void set_compression_callback(struct vdo_completion *completion)
+{
+	struct vdo *vdo = completion->vdo;
+	bool *enable = completion->parent;
+	bool requested = *enable;
+	bool previous = vdo_get_compressing(vdo);
+
+	if (requested != previous) {
+		WRITE_ONCE(vdo->compressing, requested);
+		if (previous) {
+			/* Signal the packer to flush since compression has been disabled. */
+			vdo_flush_packer(vdo->packer);
+		}
+	}
+
+	vdo_log_info("compression is %s", (requested ? "enabled" : "disabled"));
+	*enable = previous;
+	complete_synchronous_action(completion);
+}
+
+/**
+ * vdo_set_compressing() - Turn compression on or off.
+ * @vdo: The vdo.
+ * @enable: Whether to enable or disable compression.
+ *
+ * The change is made synchronously on the packer thread.
+ *
+ * Return: Whether compression was previously on or off.
+ */
+bool vdo_set_compressing(struct vdo *vdo, bool enable)
+{
+	/* In: the requested setting. Out: the setting which it replaced. */
+	bool setting = enable;
+
+	perform_synchronous_action(vdo, set_compression_callback,
+				   vdo->thread_config.packer_thread,
+				   &setting);
+	return setting;
+}
+
+/**
+ * vdo_get_compressing() - Report whether compression is enabled in a vdo.
+ * @vdo: The vdo.
+ *
+ * Return: State of compression.
+ */
+bool vdo_get_compressing(struct vdo *vdo)
+{
+	bool compressing = READ_ONCE(vdo->compressing);
+
+	return compressing;
+}
+
+/*
+ * get_block_map_cache_size() - Compute the block map cache size in bytes.
+ * @vdo: The vdo.
+ *
+ * Return: The configured cache size multiplied by VDO_BLOCK_SIZE.
+ */
+static size_t get_block_map_cache_size(const struct vdo *vdo)
+{
+	/* Widen to size_t before multiplying. */
+	size_t cache_blocks = (size_t) vdo->device_config->cache_size;
+
+	return cache_blocks * VDO_BLOCK_SIZE;
+}
+
+/*
+ * get_vdo_error_statistics() - Snapshot the vdo's error counters.
+ * @vdo: The vdo.
+ *
+ * Return: A copy of the three error counts.
+ */
+static struct error_statistics __must_check get_vdo_error_statistics(const struct vdo *vdo)
+{
+	/*
+	 * Arbitrary threads may bump these counters, hence the atomic increments elsewhere. They
+	 * are only statistics, though, and nothing depends on their memory ordering, so plain
+	 * unfenced reads suffice here.
+	 */
+	const struct atomic_statistics *atoms = &vdo->stats;
+	struct error_statistics errors = {
+		.invalid_advice_pbn_count = atomic64_read(&atoms->invalid_advice_pbn_count),
+		.no_space_error_count = atomic64_read(&atoms->no_space_error_count),
+		.read_only_error_count = atomic64_read(&atoms->read_only_error_count),
+	};
+
+	return errors;
+}
+
+/*
+ * copy_bio_stat() - Copy a set of atomic bio counters into a plain bio_stats structure.
+ * @b: The destination.
+ * @a: The atomic counters to read.
+ *
+ * Each counter is read individually, so the copy is not a single consistent snapshot.
+ */
+static void copy_bio_stat(struct bio_stats *b, const struct atomic_bio_stats *a)
+{
+	struct bio_stats *snapshot = b;
+	const struct atomic_bio_stats *counters = a;
+
+	snapshot->read = atomic64_read(&counters->read);
+	snapshot->write = atomic64_read(&counters->write);
+	snapshot->discard = atomic64_read(&counters->discard);
+	snapshot->flush = atomic64_read(&counters->flush);
+	snapshot->empty_flush = atomic64_read(&counters->empty_flush);
+	snapshot->fua = atomic64_read(&counters->fua);
+}
+
+/*
+ * subtract_bio_stats() - Compute the field-by-field difference of two bio_stats.
+ * @minuend: The counters to subtract from.
+ * @subtrahend: The counters to subtract.
+ *
+ * Return: A bio_stats holding minuend - subtrahend for each counter.
+ */
+static struct bio_stats subtract_bio_stats(struct bio_stats minuend,
+					   struct bio_stats subtrahend)
+{
+	struct bio_stats difference = {
+		.read = minuend.read - subtrahend.read,
+		.write = minuend.write - subtrahend.write,
+		.discard = minuend.discard - subtrahend.discard,
+		.flush = minuend.flush - subtrahend.flush,
+		.empty_flush = minuend.empty_flush - subtrahend.empty_flush,
+		.fua = minuend.fua - subtrahend.fua,
+	};
+
+	return difference;
+}
+
+/**
+ * vdo_get_physical_blocks_allocated() - Get the number of physical blocks in use by user data.
+ * @vdo: The vdo.
+ *
+ * Computed as the slab depot's allocated block count less the block map data blocks reported by
+ * the recovery journal.
+ *
+ * Return: The number of blocks allocated for user data.
+ */
+static block_count_t __must_check vdo_get_physical_blocks_allocated(const struct vdo *vdo)
+{
+	block_count_t user_blocks =
+		(vdo_get_slab_depot_allocated_blocks(vdo->depot) -
+		 vdo_get_journal_block_map_data_blocks_used(vdo->recovery_journal));
+
+	return user_blocks;
+}
+
+/**
+ * vdo_get_physical_blocks_overhead() - Get the number of physical blocks used by vdo metadata.
+ * @vdo: The vdo.
+ *
+ * Computed as the configured physical block count, less the slab depot's data blocks, plus the
+ * block map data blocks reported by the recovery journal.
+ *
+ * Return: The number of overhead blocks.
+ */
+static block_count_t __must_check vdo_get_physical_blocks_overhead(const struct vdo *vdo)
+{
+	/*
+	 * config.physical_blocks lives in a packed structure and changes during resize, but
+	 * resize runs on the admin thread.
+	 * TODO: Verify that this is always safe.
+	 */
+	block_count_t overhead =
+		(vdo->states.vdo.config.physical_blocks -
+		 vdo_get_slab_depot_data_blocks(vdo->depot) +
+		 vdo_get_journal_block_map_data_blocks_used(vdo->recovery_journal));
+
+	return overhead;
+}
+
+/*
+ * vdo_describe_state() - Get the mode name reported in statistics for a vdo state.
+ * @state: The state to describe.
+ *
+ * Return: "recovering", "read-only", or "normal" for every other state.
+ */
+static const char *vdo_describe_state(enum vdo_state state)
+{
+	/* These strings should all fit in the 15 chars of VDOStatistics.mode. */
+	if (state == VDO_RECOVERING)
+		return "recovering";
+
+	if (state == VDO_READ_ONLY_MODE)
+		return "read-only";
+
+	return "normal";
+}
+
+/**
+ * get_vdo_statistics() - Populate a vdo_statistics structure on the admin thread.
+ * @vdo: The vdo.
+ * @stats: The statistics structure to populate.
+ *
+ * The structure is zeroed first, so any field not explicitly set below reads as zero. The vdo
+ * state is sampled once at entry and used for both the recovery flag and the mode string.
+ */
+static void get_vdo_statistics(const struct vdo *vdo, struct vdo_statistics *stats)
+{
+ struct recovery_journal *journal = vdo->recovery_journal;
+ enum vdo_state state = vdo_get_state(vdo);
+
+ vdo_assert_on_admin_thread(vdo, __func__);
+
+ /* start with a clean slate */
+ memset(stats, 0, sizeof(struct vdo_statistics));
+
+ /*
+ * These are immutable properties of the vdo object, so it is safe to query them from any
+ * thread.
+ */
+ stats->version = STATISTICS_VERSION;
+ stats->logical_blocks = vdo->states.vdo.config.logical_blocks;
+ /*
+ * config.physical_blocks is mutated during resize and is in a packed structure, but resize
+ * runs on the admin thread.
+ * TODO: verify that this is always safe
+ */
+ stats->physical_blocks = vdo->states.vdo.config.physical_blocks;
+ stats->block_size = VDO_BLOCK_SIZE;
+ stats->complete_recoveries = vdo->states.vdo.complete_recoveries;
+ stats->read_only_recoveries = vdo->states.vdo.read_only_recoveries;
+ stats->block_map_cache_size = get_block_map_cache_size(vdo);
+
+ /* The callees are responsible for thread-safety. */
+ stats->data_blocks_used = vdo_get_physical_blocks_allocated(vdo);
+ stats->overhead_blocks_used = vdo_get_physical_blocks_overhead(vdo);
+ stats->logical_blocks_used = vdo_get_recovery_journal_logical_blocks_used(journal);
+ vdo_get_slab_depot_statistics(vdo->depot, stats);
+ stats->journal = vdo_get_recovery_journal_statistics(journal);
+ stats->packer = vdo_get_packer_statistics(vdo->packer);
+ stats->block_map = vdo_get_block_map_statistics(vdo->block_map);
+ vdo_get_dedupe_statistics(vdo->hash_zones, stats);
+ stats->errors = get_vdo_error_statistics(vdo);
+ stats->in_recovery_mode = (state == VDO_RECOVERING);
+ /* snprintf() bounds the copy to the size of the mode field. */
+ snprintf(stats->mode, sizeof(stats->mode), "%s", vdo_describe_state(state));
+
+ stats->instance = vdo->instance;
+ stats->current_vios_in_progress = get_data_vio_pool_active_requests(vdo->data_vio_pool);
+ stats->max_vios = get_data_vio_pool_maximum_requests(vdo->data_vio_pool);
+
+ stats->flush_out = atomic64_read(&vdo->stats.flush_out);
+ stats->logical_block_size = vdo->device_config->logical_block_size;
+ /* Copy each group of atomic bio counters into its plain counterpart. */
+ copy_bio_stat(&stats->bios_in, &vdo->stats.bios_in);
+ copy_bio_stat(&stats->bios_in_partial, &vdo->stats.bios_in_partial);
+ copy_bio_stat(&stats->bios_out, &vdo->stats.bios_out);
+ copy_bio_stat(&stats->bios_meta, &vdo->stats.bios_meta);
+ copy_bio_stat(&stats->bios_journal, &vdo->stats.bios_journal);
+ copy_bio_stat(&stats->bios_page_cache, &vdo->stats.bios_page_cache);
+ copy_bio_stat(&stats->bios_out_completed, &vdo->stats.bios_out_completed);
+ copy_bio_stat(&stats->bios_meta_completed, &vdo->stats.bios_meta_completed);
+ copy_bio_stat(&stats->bios_journal_completed,
+ &vdo->stats.bios_journal_completed);
+ copy_bio_stat(&stats->bios_page_cache_completed,
+ &vdo->stats.bios_page_cache_completed);
+ copy_bio_stat(&stats->bios_acknowledged, &vdo->stats.bios_acknowledged);
+ copy_bio_stat(&stats->bios_acknowledged_partial, &vdo->stats.bios_acknowledged_partial);
+ /* In-progress counts are derived from the two copies made just above. */
+ stats->bios_in_progress =
+ subtract_bio_stats(stats->bios_in, stats->bios_acknowledged);
+ vdo_get_memory_stats(&stats->memory_usage.bytes_used,
+ &stats->memory_usage.peak_bytes_used);
+}
+
+/**
+ * vdo_fetch_statistics_callback() - Action to populate a vdo_statistics structure on the admin
+ *                                   thread.
+ * @completion: The completion; its parent is the statistics structure to fill in.
+ *
+ * This callback is registered in vdo_fetch_statistics().
+ */
+static void vdo_fetch_statistics_callback(struct vdo_completion *completion)
+{
+	struct vdo_statistics *stats = completion->parent;
+
+	get_vdo_statistics(completion->vdo, stats);
+	complete_synchronous_action(completion);
+}
+
+/**
+ * vdo_fetch_statistics() - Fetch statistics on the correct thread.
+ * @vdo: The vdo.
+ * @stats: The vdo statistics are returned here.
+ *
+ * The statistics are gathered on the admin thread and the caller blocks until they are ready.
+ */
+void vdo_fetch_statistics(struct vdo *vdo, struct vdo_statistics *stats)
+{
+	thread_id_t admin_thread = vdo->thread_config.admin_thread;
+
+	perform_synchronous_action(vdo, vdo_fetch_statistics_callback, admin_thread, stats);
+}
+
+/**
+ * vdo_get_callback_thread_id() - Get the id of the callback thread on which a completion is
+ *                                currently running.
+ *
+ * Return: The current thread ID, or VDO_INVALID_THREAD_ID if the caller is not running on a vdo
+ *         work queue.
+ */
+thread_id_t vdo_get_callback_thread_id(void)
+{
+	struct vdo_work_queue *queue = vdo_get_current_work_queue();
+	struct vdo_thread *owner;
+
+	if (queue == NULL)
+		return VDO_INVALID_THREAD_ID;
+
+	owner = vdo_get_work_queue_owner(queue);
+
+	if (PARANOID_THREAD_CONSISTENCY_CHECKS) {
+		/* The id must be in range and must index this very thread structure. */
+		BUG_ON(owner->thread_id >= owner->vdo->thread_config.thread_count);
+		BUG_ON(owner != &owner->vdo->threads[owner->thread_id]);
+	}
+
+	return owner->thread_id;
+}
+
+/**
+ * vdo_dump_status() - Dump status information about a vdo to the log for debugging.
+ * @vdo: The vdo to dump.
+ *
+ * Dumps the flusher, recovery journal statistics, packer and slab depot, then every logical and
+ * physical zone, and finally the hash zones.
+ */
+void vdo_dump_status(const struct vdo *vdo)
+{
+	zone_count_t zone = 0;
+
+	vdo_dump_flusher(vdo->flusher);
+	vdo_dump_recovery_journal_statistics(vdo->recovery_journal);
+	vdo_dump_packer(vdo->packer);
+	vdo_dump_slab_depot(vdo->depot);
+
+	while (zone < vdo->thread_config.logical_zone_count) {
+		vdo_dump_logical_zone(&vdo->logical_zones->zones[zone]);
+		zone++;
+	}
+
+	zone = 0;
+	while (zone < vdo->thread_config.physical_zone_count) {
+		vdo_dump_physical_zone(&vdo->physical_zones->zones[zone]);
+		zone++;
+	}
+
+	vdo_dump_hash_zones(vdo->hash_zones);
+}
+
+/**
+ * vdo_assert_on_admin_thread() - Assert that we are running on the admin thread.
+ * @vdo: The vdo.
+ * @name: The name of the function which should be running on the admin thread (for logging).
+ *
+ * The current callback thread id is compared with the configured admin thread. Nothing is
+ * returned, so the outcome of the check is not reported to the caller.
+ */
+void vdo_assert_on_admin_thread(const struct vdo *vdo, const char *name)
+{
+ VDO_ASSERT_LOG_ONLY((vdo_get_callback_thread_id() == vdo->thread_config.admin_thread),
+ "%s called on admin thread", name);
+}
+
+/**
+ * vdo_assert_on_logical_zone_thread() - Assert that this function was called on the specified
+ * logical zone thread.
+ * @vdo: The vdo.
+ * @logical_zone: The number of the logical zone.
+ * @name: The name of the calling function.
+ *
+ * @logical_zone is used directly as an index into the configured logical thread ids and is not
+ * range-checked here. Nothing is returned, so the outcome of the check is not reported to the
+ * caller.
+ */
+void vdo_assert_on_logical_zone_thread(const struct vdo *vdo, zone_count_t logical_zone,
+ const char *name)
+{
+ VDO_ASSERT_LOG_ONLY((vdo_get_callback_thread_id() ==
+ vdo->thread_config.logical_threads[logical_zone]),
+ "%s called on logical thread", name);
+}
+
+/**
+ * vdo_assert_on_physical_zone_thread() - Assert that this function was called on the specified
+ * physical zone thread.
+ * @vdo: The vdo.
+ * @physical_zone: The number of the physical zone.
+ * @name: The name of the calling function.
+ *
+ * @physical_zone is used directly as an index into the configured physical thread ids and is not
+ * range-checked here. Nothing is returned, so the outcome of the check is not reported to the
+ * caller.
+ */
+void vdo_assert_on_physical_zone_thread(const struct vdo *vdo,
+ zone_count_t physical_zone, const char *name)
+{
+ VDO_ASSERT_LOG_ONLY((vdo_get_callback_thread_id() ==
+ vdo->thread_config.physical_threads[physical_zone]),
+ "%s called on physical thread", name);
+}
+
+/**
+ * vdo_get_physical_zone() - Get the physical zone responsible for a given physical block number.
+ * @vdo: The vdo containing the physical zones.
+ * @pbn: The PBN of the data block.
+ * @zone_ptr: A pointer to return the physical zone.
+ *
+ * For the zero block, a NULL zone is returned with VDO_SUCCESS. For the PBN of a data block in
+ * this vdo, the zone of the slab containing it is returned. Any other block number yields an
+ * error and leaves @zone_ptr untouched. This function is safe to call on invalid block numbers;
+ * it will not put the vdo into read-only mode.
+ *
+ * Return: VDO_SUCCESS or VDO_OUT_OF_RANGE if the block number is invalid or an error code for any
+ *         other failure.
+ */
+int vdo_get_physical_zone(const struct vdo *vdo, physical_block_number_t pbn,
+			  struct physical_zone **zone_ptr)
+{
+	struct vdo_slab *slab;
+	int result;
+
+	if (pbn == VDO_ZERO_BLOCK) {
+		*zone_ptr = NULL;
+		return VDO_SUCCESS;
+	}
+
+	/*
+	 * This check is stricter about bounds than vdo_get_slab(), and it goes first because an
+	 * invalid PBN will not trigger read-only mode here.
+	 */
+	if (!vdo_is_physical_data_block(vdo->depot, pbn))
+		return VDO_OUT_OF_RANGE;
+
+	/* With the PBN already checked, we should always succeed in finding a slab. */
+	slab = vdo_get_slab(vdo->depot, pbn);
+	result = VDO_ASSERT(slab != NULL, "vdo_get_slab must succeed on all valid PBNs");
+	if (result == VDO_SUCCESS)
+		*zone_ptr = &vdo->physical_zones->zones[slab->allocator->zone_number];
+
+	return result;
+}
diff --git a/drivers/md/dm-vdo/vdo.h b/drivers/md/dm-vdo/vdo.h
new file mode 100644
index 000000000000..483ae873e002
--- /dev/null
+++ b/drivers/md/dm-vdo/vdo.h
@@ -0,0 +1,362 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#ifndef VDO_H
+#define VDO_H
+
+#include <linux/atomic.h>
+#include <linux/blk_types.h>
+#include <linux/completion.h>
+#include <linux/dm-kcopyd.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+
+#include "admin-state.h"
+#include "encodings.h"
+#include "funnel-workqueue.h"
+#include "packer.h"
+#include "physical-zone.h"
+#include "statistics.h"
+#include "thread-registry.h"
+#include "types.h"
+
+/* The possible values of the 'state' field of struct read_only_notifier. */
+enum notifier_state {
+ /* Notifications are allowed but not in progress */
+ MAY_NOTIFY,
+ /* A notification is in progress */
+ NOTIFYING,
+ /* Notifications are not allowed */
+ MAY_NOT_NOTIFY,
+ /* A notification has completed */
+ NOTIFIED,
+};
+
+/**
+ * typedef vdo_read_only_notification_fn - A function to notify a listener that the VDO has gone
+ * read-only.
+ * @listener: The object to notify.
+ * @parent: The completion to notify in order to acknowledge the notification.
+ */
+typedef void (*vdo_read_only_notification_fn)(void *listener, struct vdo_completion *parent);
+
+/*
+ * An object to be notified when the VDO enters read-only mode. Listeners are chained into a
+ * singly linked list through their 'next' field (see the 'listeners' field of struct vdo_thread).
+ */
+struct read_only_listener {
+ /* The listener */
+ void *listener;
+ /* The method to call to notify the listener */
+ vdo_read_only_notification_fn notify;
+ /* A pointer to the next listener */
+ struct read_only_listener *next;
+};
+
+/* The state a vdo keeps for one of its threads (see the 'threads' field of struct vdo). */
+struct vdo_thread {
+ /* The vdo this thread state refers to */
+ struct vdo *vdo;
+ /* The id of this thread */
+ thread_id_t thread_id;
+ /* The work queue associated with this thread */
+ struct vdo_work_queue *queue;
+ /*
+ * Each thread maintains its own notion of whether the VDO is read-only so that the
+ * read-only state can be checked from any base thread without worrying about
+ * synchronization or thread safety. This does mean that knowledge of the VDO going
+ * read-only does not occur simultaneously across the VDO's threads, but that does not seem
+ * to cause any problems.
+ */
+ bool is_read_only;
+ /*
+ * A list of objects waiting to be notified on this thread that the VDO has entered
+ * read-only mode.
+ */
+ struct read_only_listener *listeners;
+ /* NOTE(review): presumably registration data from thread-registry.h; confirm its purpose there */
+ struct registered_thread allocating_thread;
+};
+
+/*
+ * Keep struct bio statistics atomically. The per-field descriptions below follow what
+ * vdo_count_bios() in vio.c increments.
+ */
+struct atomic_bio_stats {
+ atomic64_t read; /* Number of REQ_OP_READ bios */
+ atomic64_t write; /* Number of REQ_OP_WRITE bios */
+ atomic64_t discard; /* Number of REQ_OP_DISCARD bios */
+ atomic64_t flush; /* Number of bios with REQ_PREFLUSH set (includes empty flushes) */
+ atomic64_t empty_flush; /* Number of REQ_PREFLUSH bios without data */
+ atomic64_t fua; /* Number of bios with REQ_FUA set (empty flushes are not counted) */
+};
+
+/* Counters are atomic since updates can arrive concurrently from arbitrary threads. */
+struct atomic_statistics {
+ atomic64_t bios_submitted;
+ /* Incremented once per bio by vdo_count_completed_bios() */
+ atomic64_t bios_completed;
+ atomic64_t flush_out;
+ atomic64_t invalid_advice_pbn_count;
+ /* Incremented by update_vio_error_stats() for a vio whose result is VDO_NO_SPACE */
+ atomic64_t no_space_error_count;
+ /* Incremented by update_vio_error_stats() for a vio whose result is VDO_READ_ONLY */
+ atomic64_t read_only_error_count;
+ struct atomic_bio_stats bios_in;
+ struct atomic_bio_stats bios_in_partial;
+ struct atomic_bio_stats bios_out;
+ /* Completed bios belonging to data vios */
+ struct atomic_bio_stats bios_out_completed;
+ struct atomic_bio_stats bios_acknowledged;
+ struct atomic_bio_stats bios_acknowledged_partial;
+ struct atomic_bio_stats bios_meta;
+ /* Completed bios belonging to any vio which is not a data vio */
+ struct atomic_bio_stats bios_meta_completed;
+ struct atomic_bio_stats bios_journal;
+ /* Completed bios belonging to vios of type VIO_TYPE_RECOVERY_JOURNAL */
+ struct atomic_bio_stats bios_journal_completed;
+ struct atomic_bio_stats bios_page_cache;
+ /* Completed bios belonging to vios of type VIO_TYPE_BLOCK_MAP */
+ struct atomic_bio_stats bios_page_cache_completed;
+};
+
+/* The state used to notify listeners that the VDO has entered read-only mode. */
+struct read_only_notifier {
+ /* The completion for entering read-only mode */
+ struct vdo_completion completion;
+ /* A completion waiting for notifications to be drained or enabled */
+ struct vdo_completion *waiter;
+ /* Lock to protect the next two fields */
+ spinlock_t lock;
+ /* The code of the error which put the VDO into read-only mode */
+ int read_only_error;
+ /* The current state of the notifier (one of the enum notifier_state values) */
+ enum notifier_state state;
+};
+
+/*
+ * The thread ID returned when the current thread is not a vdo thread, or can not be determined
+ * (usually due to being at interrupt context).
+ */
+#define VDO_INVALID_THREAD_ID ((thread_id_t) -1)
+
+/*
+ * The mapping from the roles played by a vdo's threads to thread ids.
+ * NOTE(review): the *_count fields presumably give the lengths of the id arrays at the end of
+ * the structure; confirm where the configuration is constructed.
+ */
+struct thread_config {
+ zone_count_t logical_zone_count;
+ zone_count_t physical_zone_count;
+ zone_count_t hash_zone_count;
+ thread_count_t bio_thread_count;
+ thread_count_t thread_count;
+ thread_id_t admin_thread;
+ thread_id_t journal_thread;
+ thread_id_t packer_thread;
+ thread_id_t dedupe_thread;
+ thread_id_t bio_ack_thread;
+ thread_id_t cpu_thread;
+ thread_id_t *logical_threads;
+ /* Indexed by physical zone number (see vdo_assert_on_physical_zone_thread()) */
+ thread_id_t *physical_threads;
+ thread_id_t *hash_zone_threads;
+ /* Indexed by a vio's bio_zone (see get_vio_bio_zone_thread_id() in vio.h) */
+ thread_id_t *bio_threads;
+};
+
+struct thread_count_config;
+
+/* The in-memory state for a vdo's super block (embedded in struct vdo as 'super_block'). */
+struct vdo_super_block {
+ /* The vio for reading and writing the super block to disk */
+ struct vio vio;
+ /* A buffer to hold the super block */
+ u8 *buffer;
+ /* Whether this super block may not be written */
+ bool unwritable;
+};
+
+struct data_vio_pool;
+
+/*
+ * The manager for administrative operations (embedded in struct vdo as 'admin').
+ * NOTE(review): the field notes below are inferred from names and types only; confirm against
+ * the code which drives admin operations.
+ */
+struct vdo_administrator {
+ struct vdo_completion completion;
+ struct admin_state state;
+ /* presumably non-zero while an admin operation is running -- TODO confirm */
+ atomic_t busy;
+ /* presumably the step reached within the current operation -- TODO confirm */
+ u32 phase;
+ struct completion callback_sync;
+};
+
+/* The state of one vdo device; most functions declared in this header operate on it. */
+struct vdo {
+ char thread_name_prefix[MAX_VDO_WORK_QUEUE_NAME_LEN];
+ /* Per-thread state; presumably one entry per thread in thread_config -- TODO confirm */
+ struct vdo_thread *threads;
+ vdo_action_fn action;
+ struct vdo_completion *completion;
+ struct vio_tracer *vio_tracer;
+
+ /* The atomic version of the state of this vdo */
+ atomic_t state;
+ /* The full state of all components */
+ struct vdo_component_states states;
+ /*
+ * A counter value to attach to thread names and log messages to identify the individual
+ * device.
+ */
+ unsigned int instance;
+ /* The read-only notifier */
+ struct read_only_notifier read_only_notifier;
+ /* The load-time configuration of this vdo */
+ struct device_config *device_config;
+ /* The thread mapping */
+ struct thread_config thread_config;
+
+ /* The super block */
+ struct vdo_super_block super_block;
+
+ /* The partitioning of the underlying storage */
+ struct layout layout;
+ struct layout next_layout;
+ struct dm_kcopyd_client *partition_copier;
+
+ /* The block map */
+ struct block_map *block_map;
+
+ /* The journal for block map recovery */
+ struct recovery_journal *recovery_journal;
+
+ /* The slab depot */
+ struct slab_depot *depot;
+
+ /* The compressed-block packer */
+ struct packer *packer;
+ /* Whether incoming data should be compressed */
+ bool compressing;
+
+ /* The handler for flush requests */
+ struct flusher *flusher;
+
+ /* The state the vdo was in when loaded (primarily for unit tests) */
+ enum vdo_state load_state;
+
+ /* The logical zones of this vdo */
+ struct logical_zones *logical_zones;
+
+ /* The physical zones of this vdo */
+ struct physical_zones *physical_zones;
+
+ /* The hash lock zones of this vdo */
+ struct hash_zones *hash_zones;
+
+ /* Bio submission manager used for sending bios to the storage device. */
+ struct io_submitter *io_submitter;
+
+ /* The pool of data_vios for servicing incoming bios */
+ struct data_vio_pool *data_vio_pool;
+
+ /* The manager for administrative operations */
+ struct vdo_administrator admin;
+
+ /* Flags controlling administrative operations */
+ const struct admin_state_code *suspend_type;
+ bool allocations_allowed;
+ bool dump_on_shutdown;
+ atomic_t processing_message;
+
+ /*
+ * Statistics
+ * Atomic stats counters
+ */
+ struct atomic_statistics stats;
+ /* Used to gather statistics without allocating memory */
+ struct vdo_statistics stats_buffer;
+ /* Protects the stats_buffer */
+ struct mutex stats_mutex;
+
+ /* A list of all device_configs referencing this vdo */
+ struct list_head device_config_list;
+
+ /* This VDO's list entry for the device registry */
+ struct list_head registration;
+
+ /* Underlying block device info. */
+ u64 starting_sector_offset;
+ /* Includes bio_offset, which vio.c applies when converting between PBNs and bio sectors */
+ struct volume_geometry geometry;
+
+ /* N blobs of context data for LZ4 code, one per CPU thread. */
+ char **compression_context;
+};
+
+/**
+ * vdo_uses_bio_ack_queue() - Indicate whether the vdo is configured to use a separate work queue
+ * for acknowledging received and processed bios.
+ * @vdo: The vdo.
+ *
+ * Note that this directly controls the handling of write operations, but the compile-time flag
+ * VDO_USE_BIO_ACK_QUEUE_FOR_READ is also checked for read operations.
+ *
+ * Return: Whether a bio-acknowledgement work queue is in use.
+ */
+static inline bool vdo_uses_bio_ack_queue(struct vdo *vdo)
+{
+ /* The queue is in use exactly when the configured bio_ack_threads count is non-zero. */
+ return vdo->device_config->thread_counts.bio_ack_threads > 0;
+}
+
+/**
+ * typedef vdo_filter_fn - Method type for vdo matching methods.
+ *
+ * A filter function returns false if the vdo doesn't match.
+ */
+typedef bool (*vdo_filter_fn)(struct vdo *vdo, const void *context);
+
+void vdo_initialize_device_registry_once(void);
+struct vdo * __must_check vdo_find_matching(vdo_filter_fn filter, const void *context);
+
+int __must_check vdo_make_thread(struct vdo *vdo, thread_id_t thread_id,
+ const struct vdo_work_queue_type *type,
+ unsigned int queue_count, void *contexts[]);
+
+/**
+ * vdo_make_default_thread() - Make a thread using the default arguments of vdo_make_thread().
+ * @vdo: The vdo passed through to vdo_make_thread().
+ * @thread_id: The thread id passed through to vdo_make_thread().
+ *
+ * Calls vdo_make_thread() with a NULL work queue type, a queue count of 1, and NULL contexts.
+ *
+ * Return: The result of vdo_make_thread().
+ */
+static inline int __must_check vdo_make_default_thread(struct vdo *vdo,
+ thread_id_t thread_id)
+{
+ return vdo_make_thread(vdo, thread_id, NULL, 1, NULL);
+}
+
+int __must_check vdo_make(unsigned int instance, struct device_config *config,
+ char **reason, struct vdo **vdo_ptr);
+
+void vdo_destroy(struct vdo *vdo);
+
+void vdo_load_super_block(struct vdo *vdo, struct vdo_completion *parent);
+
+struct block_device * __must_check vdo_get_backing_device(const struct vdo *vdo);
+
+const char * __must_check vdo_get_device_name(const struct dm_target *target);
+
+int __must_check vdo_synchronous_flush(struct vdo *vdo);
+
+const struct admin_state_code * __must_check vdo_get_admin_state(const struct vdo *vdo);
+
+bool vdo_set_compressing(struct vdo *vdo, bool enable);
+
+bool vdo_get_compressing(struct vdo *vdo);
+
+void vdo_fetch_statistics(struct vdo *vdo, struct vdo_statistics *stats);
+
+thread_id_t vdo_get_callback_thread_id(void);
+
+enum vdo_state __must_check vdo_get_state(const struct vdo *vdo);
+
+void vdo_set_state(struct vdo *vdo, enum vdo_state state);
+
+void vdo_save_components(struct vdo *vdo, struct vdo_completion *parent);
+
+int vdo_register_read_only_listener(struct vdo *vdo, void *listener,
+ vdo_read_only_notification_fn notification,
+ thread_id_t thread_id);
+
+int vdo_enable_read_only_entry(struct vdo *vdo);
+
+void vdo_wait_until_not_entering_read_only_mode(struct vdo_completion *parent);
+
+void vdo_allow_read_only_mode_entry(struct vdo_completion *parent);
+
+void vdo_enter_read_only_mode(struct vdo *vdo, int error_code);
+
+bool __must_check vdo_is_read_only(struct vdo *vdo);
+
+bool __must_check vdo_in_read_only_mode(const struct vdo *vdo);
+
+bool __must_check vdo_in_recovery_mode(const struct vdo *vdo);
+
+void vdo_enter_recovery_mode(struct vdo *vdo);
+
+void vdo_assert_on_admin_thread(const struct vdo *vdo, const char *name);
+
+void vdo_assert_on_logical_zone_thread(const struct vdo *vdo, zone_count_t logical_zone,
+ const char *name);
+
+void vdo_assert_on_physical_zone_thread(const struct vdo *vdo, zone_count_t physical_zone,
+ const char *name);
+
+int __must_check vdo_get_physical_zone(const struct vdo *vdo, physical_block_number_t pbn,
+ struct physical_zone **zone_ptr);
+
+void vdo_dump_status(const struct vdo *vdo);
+
+#endif /* VDO_H */
diff --git a/drivers/md/dm-vdo/vio.c b/drivers/md/dm-vdo/vio.c
new file mode 100644
index 000000000000..b291578f726f
--- /dev/null
+++ b/drivers/md/dm-vdo/vio.c
@@ -0,0 +1,500 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#include "vio.h"
+
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/kernel.h>
+#include <linux/ratelimit.h>
+
+#include "logger.h"
+#include "memory-alloc.h"
+#include "permassert.h"
+
+#include "constants.h"
+#include "io-submitter.h"
+#include "vdo.h"
+
+/*
+ * A vio_pool is a collection of preallocated vios. The pool structure and its entries are a
+ * single allocation: 'vios' is a flexible array sized when the pool is made.
+ */
+struct vio_pool {
+ /* The number of objects managed by the pool */
+ size_t size;
+ /* The list of objects which are available */
+ struct list_head available;
+ /* The queue of requestors waiting for objects from the pool */
+ struct vdo_wait_queue waiting;
+ /* The number of objects currently in use */
+ size_t busy_count;
+ /* The list of objects which are in use */
+ struct list_head busy;
+ /* The ID of the thread on which this pool may be used */
+ thread_id_t thread_id;
+ /* The buffer backing the pool's vios (one VDO_BLOCK_SIZE slice per entry) */
+ char *buffer;
+ /* The pool entries */
+ struct pooled_vio vios[];
+};
+
+/**
+ * pbn_from_vio_bio() - Recover the physical block number which a vio's bio addresses.
+ * @bio: A bio whose bi_private field points to its vio.
+ *
+ * The bio's sector is converted to a block number. Unless that block is the geometry block
+ * location, the vdo's geometry.bio_offset is then added to it.
+ *
+ * Return: The physical block number.
+ */
+physical_block_number_t pbn_from_vio_bio(struct bio *bio)
+{
+	struct vio *owner = bio->bi_private;
+	physical_block_number_t block = bio->bi_iter.bi_sector / VDO_SECTORS_PER_BLOCK;
+
+	if (block != VDO_GEOMETRY_BLOCK_LOCATION)
+		block += owner->completion.vdo->geometry.bio_offset;
+
+	return block;
+}
+
+/*
+ * Allocate a bio with room for size + 1 inline bio_vecs. The new bio is stored through bio_ptr
+ * only when the allocation succeeds.
+ */
+static int create_multi_block_bio(block_count_t size, struct bio **bio_ptr)
+{
+	struct bio *new_bio = NULL;
+	int status;
+
+	status = vdo_allocate_extended(struct bio, size + 1, struct bio_vec,
+				       "bio", &new_bio);
+	if (status == VDO_SUCCESS)
+		*bio_ptr = new_bio;
+
+	return status;
+}
+
+/**
+ * vdo_create_bio() - Allocate a bio sized for a single block.
+ * @bio_ptr: A pointer to hold the new bio.
+ *
+ * Return: The result of create_multi_block_bio() for a size of 1.
+ */
+int vdo_create_bio(struct bio **bio_ptr)
+{
+ return create_multi_block_bio(1, bio_ptr);
+}
+
+/**
+ * vdo_free_bio() - Uninitialize and free a bio.
+ * @bio: The bio to free; NULL is accepted and ignored.
+ */
+void vdo_free_bio(struct bio *bio)
+{
+	if (bio != NULL) {
+		bio_uninit(bio);
+		vdo_free(vdo_forget(bio));
+	}
+}
+
+/**
+ * allocate_vio_components() - Allocate the bio for a metadata vio and initialize the vio.
+ * @vdo: The vdo on which the vio will operate.
+ * @vio_type: The type of vio; must be neither VIO_TYPE_UNINITIALIZED nor VIO_TYPE_DATA.
+ * @priority: The relative priority to assign to the vio.
+ * @parent: The parent to record in the vio's completion.
+ * @block_count: The size of the vio in blocks; must not exceed MAX_BLOCKS_PER_VIO.
+ * @data: The buffer.
+ * @vio: The vio to initialize.
+ *
+ * Return: VDO_SUCCESS or an error; on error the vio is left untouched.
+ */
+int allocate_vio_components(struct vdo *vdo, enum vio_type vio_type,
+			    enum vio_priority priority, void *parent,
+			    unsigned int block_count, char *data, struct vio *vio)
+{
+	struct bio *new_bio;
+	int status;
+
+	status = VDO_ASSERT(block_count <= MAX_BLOCKS_PER_VIO,
+			    "block count %u does not exceed maximum %u", block_count,
+			    MAX_BLOCKS_PER_VIO);
+	if (status != VDO_SUCCESS)
+		return status;
+
+	status = VDO_ASSERT(((vio_type != VIO_TYPE_UNINITIALIZED) && (vio_type != VIO_TYPE_DATA)),
+			    "%d is a metadata type", vio_type);
+	if (status != VDO_SUCCESS)
+		return status;
+
+	status = create_multi_block_bio(block_count, &new_bio);
+	if (status == VDO_SUCCESS) {
+		initialize_vio(vio, new_bio, block_count, vio_type, priority, vdo);
+		vio->completion.parent = parent;
+		vio->data = data;
+	}
+
+	return status;
+}
+
+/**
+ * create_multi_block_metadata_vio() - Allocate and initialize a metadata vio.
+ * @vdo: The vdo on which the vio will operate.
+ * @vio_type: The type of vio to create.
+ * @priority: The relative priority to assign to the vio.
+ * @parent: The parent of the vio.
+ * @block_count: The size of the vio in blocks.
+ * @data: The buffer.
+ * @vio_ptr: A pointer to hold the new vio; written only on success.
+ *
+ * Return: VDO_SUCCESS or an error.
+ */
+int create_multi_block_metadata_vio(struct vdo *vdo, enum vio_type vio_type,
+				    enum vio_priority priority, void *parent,
+				    unsigned int block_count, char *data,
+				    struct vio **vio_ptr)
+{
+	struct vio *new_vio;
+	int status;
+
+	BUILD_BUG_ON(sizeof(struct vio) > 256);
+
+	/*
+	 * The buffer pool is reserved for submissions from the linux block layer, so metadata
+	 * vios are allocated directly instead.
+	 */
+	status = vdo_allocate(1, struct vio, __func__, &new_vio);
+	if (status != VDO_SUCCESS) {
+		vdo_log_error("metadata vio allocation failure %d", status);
+		return status;
+	}
+
+	status = allocate_vio_components(vdo, vio_type, priority, parent, block_count,
+					 data, new_vio);
+	if (status == VDO_SUCCESS) {
+		*vio_ptr = new_vio;
+		return VDO_SUCCESS;
+	}
+
+	/* The components could not be set up, so release the bare vio again. */
+	vdo_free(new_vio);
+	return status;
+}
+
+/**
+ * free_vio_components() - Release the bio of a vio which is embedded in a larger structure.
+ * @vio: The vio whose components should be freed; NULL is accepted and ignored.
+ *
+ * The vio itself is not freed. Calling this on a data vio is a bug.
+ */
+void free_vio_components(struct vio *vio)
+{
+	if (vio != NULL) {
+		BUG_ON(is_data_vio(vio));
+		vdo_free_bio(vdo_forget(vio->bio));
+	}
+}
+
+/**
+ * free_vio() - Destroy a vio.
+ * @vio: The vio to destroy.
+ *
+ * Frees the vio's components via free_vio_components() and then the vio itself.
+ */
+void free_vio(struct vio *vio)
+{
+ free_vio_components(vio);
+ vdo_free(vio);
+}
+
+/*
+ * Set bio properties for a VDO read or write. The vdo's geometry.bio_offset is subtracted from
+ * pbn first; the adjusted block number selects the vio's bio zone and the bio's starting sector.
+ */
+void vdo_set_bio_properties(struct bio *bio, struct vio *vio, bio_end_io_t callback,
+			    blk_opf_t bi_opf, physical_block_number_t pbn)
+{
+	struct vdo *vdo = vio->completion.vdo;
+	struct device_config *config = vdo->device_config;
+	physical_block_number_t adjusted = pbn - vdo->geometry.bio_offset;
+
+	bio->bi_iter.bi_sector = adjusted * VDO_SECTORS_PER_BLOCK;
+	bio->bi_opf = bi_opf;
+	bio->bi_end_io = callback;
+	bio->bi_private = vio;
+
+	vio->bio_zone = ((adjusted / config->thread_counts.bio_rotation_interval) %
+			 config->thread_counts.bio_threads);
+}
+
+/*
+ * Resets the vio's bio and prepares it to perform IO on vio->block_count blocks at pbn. When
+ * data is not NULL, the buffer is attached to the bio one piece at a time, where no piece crosses
+ * a page boundary. May only be used on a VDO-allocated bio, since the bio's inline vecs are
+ * reused and bi_max_vecs is set to block_count + 1.
+ *
+ * NOTE(review): an earlier version of this comment said the bio wraps a 4k-aligned 4k buffer and
+ * need not have a vio; the code below takes a vio and handles an arbitrary offset within the
+ * first page, so that description looks stale -- confirm.
+ *
+ * Returns VDO_SUCCESS, or the value produced by vdo_log_error_strerror() for
+ * VDO_BIO_CREATION_FAILED if a piece could not be added to the bio in full.
+ */
+int vio_reset_bio(struct vio *vio, char *data, bio_end_io_t callback,
+ blk_opf_t bi_opf, physical_block_number_t pbn)
+{
+ int bvec_count, offset, len, i;
+ struct bio *bio = vio->bio;
+
+ bio_reset(bio, bio->bi_bdev, bi_opf);
+ vdo_set_bio_properties(bio, vio, callback, bi_opf, pbn);
+ /* Without a buffer there is nothing to attach. */
+ if (data == NULL)
+ return VDO_SUCCESS;
+
+ bio->bi_io_vec = bio->bi_inline_vecs;
+ bio->bi_max_vecs = vio->block_count + 1;
+ len = VDO_BLOCK_SIZE * vio->block_count;
+ offset = offset_in_page(data);
+ /* The number of pages touched by len bytes starting at offset within the first page. */
+ bvec_count = DIV_ROUND_UP(offset + len, PAGE_SIZE);
+
+ /*
+ * If we knew that data was always on one page, or contiguous pages, we wouldn't need the
+ * loop. But if we're using vmalloc, it's not impossible that the data is in different
+ * pages that can't be merged in bio_add_page...
+ */
+ for (i = 0; (i < bvec_count) && (len > 0); i++) {
+ struct page *page;
+ int bytes_added;
+ /* Take the rest of the current page, but no more than what remains of the buffer. */
+ int bytes = PAGE_SIZE - offset;
+
+ if (bytes > len)
+ bytes = len;
+
+ page = is_vmalloc_addr(data) ? vmalloc_to_page(data) : virt_to_page(data);
+ bytes_added = bio_add_page(bio, page, bytes, offset);
+
+ if (bytes_added != bytes) {
+ return vdo_log_error_strerror(VDO_BIO_CREATION_FAILED,
+ "Could only add %i bytes to bio",
+ bytes_added);
+ }
+
+ data += bytes;
+ len -= bytes;
+ /* Every piece after the first starts at the beginning of its page. */
+ offset = 0;
+ }
+
+ return VDO_SUCCESS;
+}
+
+/**
+ * update_vio_error_stats() - Update per-vio error stats and log the error.
+ * @vio: The vio which got an error.
+ * @format: The format of the message to log (a printf style format).
+ *
+ * A VDO_READ_ONLY result is only counted, never logged. A VDO_NO_SPACE result is counted and
+ * logged at debug priority. Anything else is logged at error priority. All logging here shares
+ * one rate limiter.
+ */
+void update_vio_error_stats(struct vio *vio, const char *format, ...)
+{
+	static DEFINE_RATELIMIT_STATE(error_limiter, DEFAULT_RATELIMIT_INTERVAL,
+				      DEFAULT_RATELIMIT_BURST);
+	struct vdo *vdo = vio->completion.vdo;
+	int log_level = VDO_LOG_ERR;
+	va_list args;
+
+	if (vio->completion.result == VDO_READ_ONLY) {
+		atomic64_inc(&vdo->stats.read_only_error_count);
+		return;
+	}
+
+	if (vio->completion.result == VDO_NO_SPACE) {
+		atomic64_inc(&vdo->stats.no_space_error_count);
+		log_level = VDO_LOG_DEBUG;
+	}
+
+	if (!__ratelimit(&error_limiter))
+		return;
+
+	va_start(args, format);
+	vdo_vlog_strerror(log_level, vio->completion.result, VDO_LOGGING_MODULE_NAME,
+			  format, args);
+	va_end(args);
+}
+
+/**
+ * vio_record_metadata_io_error() - Record and log an I/O error on a metadata vio.
+ * @vio: The vio whose bio failed.
+ *
+ * Builds a short description of the operation from the bio's op and its REQ_PREFLUSH and
+ * REQ_FUA flags, then hands the message to update_vio_error_stats().
+ */
+void vio_record_metadata_io_error(struct vio *vio)
+{
+	struct bio *bio = vio->bio;
+	physical_block_number_t pbn = pbn_from_vio_bio(bio);
+	bool preflush = ((bio->bi_opf & REQ_PREFLUSH) == REQ_PREFLUSH);
+	bool fua = ((bio->bi_opf & REQ_FUA) == REQ_FUA);
+	const char *description;
+
+	if (bio_op(bio) == REQ_OP_READ)
+		description = "read";
+	else if (preflush)
+		description = fua ? "write+preflush+fua" : "write+preflush";
+	else
+		description = fua ? "write+fua" : "write";
+
+	update_vio_error_stats(vio,
+			       "Completing %s vio of type %u for physical block %llu with error",
+			       description, vio->type, (unsigned long long) pbn);
+}
+
+/**
+ * make_vio_pool() - Create a new vio pool.
+ * @vdo: The vdo.
+ * @pool_size: The number of vios in the pool.
+ * @thread_id: The ID of the thread using this pool.
+ * @vio_type: The type of vios in the pool.
+ * @priority: The priority with which vios from the pool should be enqueued.
+ * @context: The context that each entry will have.
+ * @pool_ptr: The resulting pool; written only on success.
+ *
+ * Return: A success or error code.
+ */
+int make_vio_pool(struct vdo *vdo, size_t pool_size, thread_id_t thread_id,
+		  enum vio_type vio_type, enum vio_priority priority, void *context,
+		  struct vio_pool **pool_ptr)
+{
+	struct vio_pool *new_pool;
+	char *block;
+	int status;
+
+	status = vdo_allocate_extended(struct vio_pool, pool_size, struct pooled_vio,
+				       __func__, &new_pool);
+	if (status != VDO_SUCCESS)
+		return status;
+
+	new_pool->thread_id = thread_id;
+	INIT_LIST_HEAD(&new_pool->available);
+	INIT_LIST_HEAD(&new_pool->busy);
+
+	status = vdo_allocate(pool_size * VDO_BLOCK_SIZE, char,
+			      "VIO pool buffer", &new_pool->buffer);
+	if (status != VDO_SUCCESS) {
+		free_vio_pool(new_pool);
+		return status;
+	}
+
+	/*
+	 * The pool's size field counts the entries set up so far, so that free_vio_pool() sees a
+	 * consistent pool if construction fails part way through.
+	 */
+	block = new_pool->buffer;
+	new_pool->size = 0;
+	while (new_pool->size < pool_size) {
+		struct pooled_vio *entry = &new_pool->vios[new_pool->size];
+
+		status = allocate_vio_components(vdo, vio_type, priority, NULL, 1, block,
+						 &entry->vio);
+		if (status != VDO_SUCCESS) {
+			free_vio_pool(new_pool);
+			return status;
+		}
+
+		entry->context = context;
+		list_add_tail(&entry->pool_entry, &new_pool->available);
+
+		new_pool->size++;
+		block += VDO_BLOCK_SIZE;
+	}
+
+	*pool_ptr = new_pool;
+	return VDO_SUCCESS;
+}
+
+/**
+ * free_vio_pool() - Destroy a vio pool.
+ * @pool: The pool to free; NULL is accepted and ignored.
+ *
+ * The pool is expected to be idle: no waiters and no busy entries. Violations are reported by
+ * the assertions but do not stop the pool from being freed.
+ */
+void free_vio_pool(struct vio_pool *pool)
+{
+	struct pooled_vio *entry;
+
+	if (pool == NULL)
+		return;
+
+	VDO_ASSERT_LOG_ONLY(!vdo_waitq_has_waiters(&pool->waiting),
+			    "VIO pool must not have any waiters when being freed");
+	VDO_ASSERT_LOG_ONLY((pool->busy_count == 0),
+			    "VIO pool must not have %zu busy entries when being freed",
+			    pool->busy_count);
+	VDO_ASSERT_LOG_ONLY(list_empty(&pool->busy),
+			    "VIO pool must not have busy entries when being freed");
+
+	/* Remove all available vios from the object pool. */
+	while (!list_empty(&pool->available)) {
+		entry = list_first_entry(&pool->available, struct pooled_vio, pool_entry);
+		list_del(&entry->pool_entry);
+		free_vio_components(&entry->vio);
+		pool->size--;
+	}
+
+	VDO_ASSERT_LOG_ONLY(pool->size == 0,
+			    "VIO pool must not have missing entries when being freed");
+
+	vdo_free(vdo_forget(pool->buffer));
+	vdo_free(pool);
+}
+
+/**
+ * is_vio_pool_busy() - Check whether a vio pool has outstanding entries.
+ * @pool: The vio pool to check.
+ *
+ * Return: true if the pool is busy, i.e. its busy_count is non-zero.
+ */
+bool is_vio_pool_busy(struct vio_pool *pool)
+{
+ return (pool->busy_count != 0);
+}
+
+/**
+ * acquire_vio_from_pool() - Acquire a vio and buffer from the pool (asynchronous).
+ * @pool: The vio pool.
+ * @waiter: Object that is requesting a vio.
+ *
+ * If an entry is available, it is marked busy and the waiter's callback is invoked with it
+ * before this function returns. Otherwise the waiter is queued on the pool.
+ */
+void acquire_vio_from_pool(struct vio_pool *pool, struct vdo_waiter *waiter)
+{
+	struct pooled_vio *entry;
+
+	VDO_ASSERT_LOG_ONLY((pool->thread_id == vdo_get_callback_thread_id()),
+			    "acquire from active vio_pool called from correct thread");
+
+	if (!list_empty(&pool->available)) {
+		entry = list_first_entry(&pool->available, struct pooled_vio, pool_entry);
+		pool->busy_count++;
+		list_move_tail(&entry->pool_entry, &pool->busy);
+		waiter->callback(waiter, entry);
+		return;
+	}
+
+	/* Nothing is free right now, so the requestor has to wait for a return. */
+	vdo_waitq_enqueue_waiter(&pool->waiting, waiter);
+}
+
+/**
+ * return_vio_to_pool() - Return a vio to the pool
+ * @pool: The vio pool.
+ * @vio: The pooled vio to return.
+ *
+ * The vio's error handler and parent are cleared. If a requestor is waiting, the vio is handed
+ * straight to the next waiter and stays busy. Otherwise it goes back on the available list.
+ */
+void return_vio_to_pool(struct vio_pool *pool, struct pooled_vio *vio)
+{
+	struct vdo_completion *completion = &vio->vio.completion;
+
+	VDO_ASSERT_LOG_ONLY((pool->thread_id == vdo_get_callback_thread_id()),
+			    "vio pool entry returned on same thread as it was acquired");
+
+	completion->error_handler = NULL;
+	completion->parent = NULL;
+
+	if (!vdo_waitq_has_waiters(&pool->waiting)) {
+		list_move_tail(&vio->pool_entry, &pool->available);
+		--pool->busy_count;
+	} else {
+		vdo_waitq_notify_next_waiter(&pool->waiting, NULL, vio);
+	}
+}
+
+/*
+ * Various counting functions for statistics.
+ * These are used for bios coming into VDO, as well as bios generated by VDO.
+ *
+ * vdo_count_bios() classifies one bio into the given set of counters. An empty flush
+ * (REQ_PREFLUSH with no data) counts as both an empty flush and a flush, and as nothing else.
+ * Any other bio is counted by operation, and then again for each of REQ_PREFLUSH and REQ_FUA
+ * that it carries.
+ */
+void vdo_count_bios(struct atomic_bio_stats *bio_stats, struct bio *bio)
+{
+	const bool preflush = ((bio->bi_opf & REQ_PREFLUSH) != 0);
+
+	if (preflush && (bio->bi_iter.bi_size == 0)) {
+		atomic64_inc(&bio_stats->empty_flush);
+		atomic64_inc(&bio_stats->flush);
+		return;
+	}
+
+	switch (bio_op(bio)) {
+	case REQ_OP_READ:
+		atomic64_inc(&bio_stats->read);
+		break;
+	case REQ_OP_WRITE:
+		atomic64_inc(&bio_stats->write);
+		break;
+	case REQ_OP_DISCARD:
+		atomic64_inc(&bio_stats->discard);
+		break;
+	default:
+		/*
+		 * No other operation should get here: they are either filtered out before
+		 * reaching VDO's counters or never created by VDO. NOTE(review): this comment
+		 * used to name dmvdo.c as the filter; in this tree that is presumably
+		 * dm-vdo-target.c -- confirm.
+		 */
+		VDO_ASSERT_LOG_ONLY(0, "Bio operation %d not a write, read, discard, or empty flush",
+				    bio_op(bio));
+		break;
+	}
+
+	if (preflush)
+		atomic64_inc(&bio_stats->flush);
+	if (bio->bi_opf & REQ_FUA)
+		atomic64_inc(&bio_stats->fua);
+}
+
+/*
+ * Count a completed bio in the statistics which match its vio. Data vios go to
+ * bios_out_completed. Every other vio goes to bios_meta_completed, and recovery journal and
+ * block map vios are counted in their own statistics as well.
+ */
+static void count_all_bios_completed(struct vio *vio, struct bio *bio)
+{
+	struct atomic_statistics *stats = &vio->completion.vdo->stats;
+
+	if (is_data_vio(vio)) {
+		vdo_count_bios(&stats->bios_out_completed, bio);
+		return;
+	}
+
+	vdo_count_bios(&stats->bios_meta_completed, bio);
+
+	switch (vio->type) {
+	case VIO_TYPE_RECOVERY_JOURNAL:
+		vdo_count_bios(&stats->bios_journal_completed, bio);
+		break;
+	case VIO_TYPE_BLOCK_MAP:
+		vdo_count_bios(&stats->bios_page_cache_completed, bio);
+		break;
+	default:
+		break;
+	}
+}
+
+/**
+ * vdo_count_completed_bios() - Account for a bio which has finished.
+ * @bio: The completed bio; its bi_private field must point to its vio.
+ *
+ * Increments the vdo-wide bios_completed counter and then the per-category counters.
+ */
+void vdo_count_completed_bios(struct bio *bio)
+{
+	struct vio *owner = bio->bi_private;
+	struct atomic_statistics *stats = &owner->completion.vdo->stats;
+
+	atomic64_inc(&stats->bios_completed);
+	count_all_bios_completed(owner, bio);
+}
diff --git a/drivers/md/dm-vdo/vio.h b/drivers/md/dm-vdo/vio.h
new file mode 100644
index 000000000000..3490e9f59b04
--- /dev/null
+++ b/drivers/md/dm-vdo/vio.h
@@ -0,0 +1,199 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#ifndef VIO_H
+#define VIO_H
+
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/compiler.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+
+#include "completion.h"
+#include "constants.h"
+#include "types.h"
+#include "vdo.h"
+
+/*
+ * The largest number of blocks a single vio may span: the bytes covered by BIO_MAX_VECS pages,
+ * divided by the block size. allocate_vio_components() checks block counts against this limit.
+ */
+enum {
+ MAX_BLOCKS_PER_VIO = (BIO_MAX_VECS << PAGE_SHIFT) / VDO_BLOCK_SIZE,
+};
+
+/* A vio together with the bookkeeping needed to manage it as an entry of a vio_pool. */
+struct pooled_vio {
+ /* The underlying vio */
+ struct vio vio;
+ /* The list entry for chaining pooled vios together */
+ struct list_head list_entry;
+ /* The context set by the pool */
+ void *context;
+ /* The list entry used by the pool */
+ struct list_head pool_entry;
+};
+
+/**
+ * as_vio() - Convert a generic vdo_completion to a vio.
+ * @completion: The completion to convert.
+ *
+ * Return: The completion as a vio.
+ */
+static inline struct vio *as_vio(struct vdo_completion *completion)
+{
+ /* Check the completion's type before converting it to its containing vio. */
+ vdo_assert_completion_type(completion, VIO_COMPLETION);
+ return container_of(completion, struct vio, completion);
+}
+
+/**
+ * get_vio_bio_zone_thread_id() - Look up the thread which serves a vio's bio zone.
+ * @vio: The vio.
+ *
+ * Return: The id of the bio zone thread on which the vio should submit its I/O.
+ */
+static inline thread_id_t __must_check get_vio_bio_zone_thread_id(struct vio *vio)
+{
+	const struct thread_config *threads = &vio->completion.vdo->thread_config;
+
+	return threads->bio_threads[vio->bio_zone];
+}
+
+physical_block_number_t __must_check pbn_from_vio_bio(struct bio *bio);
+
+/**
+ * assert_vio_in_bio_zone() - Check that a vio is running on the correct thread for its bio zone.
+ * @vio: The vio to check.
+ */
+static inline void assert_vio_in_bio_zone(struct vio *vio)
+{
+ /* The thread configured for the vio's bio zone, and the thread actually running now. */
+ thread_id_t expected = get_vio_bio_zone_thread_id(vio);
+ thread_id_t thread_id = vdo_get_callback_thread_id();
+
+ VDO_ASSERT_LOG_ONLY((expected == thread_id),
+ "vio I/O for physical block %llu on thread %u, should be on bio zone thread %u",
+ (unsigned long long) pbn_from_vio_bio(vio->bio), thread_id,
+ expected);
+}
+
+int vdo_create_bio(struct bio **bio_ptr);
+void vdo_free_bio(struct bio *bio);
+int allocate_vio_components(struct vdo *vdo, enum vio_type vio_type,
+ enum vio_priority priority, void *parent,
+ unsigned int block_count, char *data, struct vio *vio);
+int __must_check create_multi_block_metadata_vio(struct vdo *vdo, enum vio_type vio_type,
+ enum vio_priority priority,
+ void *parent, unsigned int block_count,
+ char *data, struct vio **vio_ptr);
+
+/**
+ * create_metadata_vio() - Create a metadata vio which is a single block in size.
+ * @vdo: The vdo on which the vio will operate.
+ * @vio_type: The type of vio to create.
+ * @priority: The relative priority to assign to the vio.
+ * @parent: The parent of the vio.
+ * @data: The buffer.
+ * @vio_ptr: A pointer to hold the new vio.
+ *
+ * Return: The result of create_multi_block_metadata_vio() for a block count of 1.
+ */
+static inline int __must_check create_metadata_vio(struct vdo *vdo, enum vio_type vio_type,
+ enum vio_priority priority,
+ void *parent, char *data,
+ struct vio **vio_ptr)
+{
+ return create_multi_block_metadata_vio(vdo, vio_type, priority, parent, 1, data,
+ vio_ptr);
+}
+
+void free_vio_components(struct vio *vio);
+void free_vio(struct vio *vio);
+
+/**
+ * initialize_vio() - Set up the fields of a vio and its completion.
+ * @vio: The vio to initialize.
+ * @bio: The bio this vio should use for its I/O.
+ * @block_count: The size of this vio in vdo blocks.
+ * @vio_type: The vio type.
+ * @priority: The relative priority of the vio.
+ * @vdo: The vdo for this vio.
+ *
+ * It is a bug to give a vio of type VIO_TYPE_DATA any block count other than 1.
+ */
+static inline void initialize_vio(struct vio *vio, struct bio *bio,
+				  unsigned int block_count, enum vio_type vio_type,
+				  enum vio_priority priority, struct vdo *vdo)
+{
+	/* data_vio's may not span multiple blocks */
+	BUG_ON((vio_type == VIO_TYPE_DATA) && (block_count != 1));
+
+	vio->type = vio_type;
+	vio->priority = priority;
+	vio->block_count = block_count;
+	vio->bio = bio;
+
+	vdo_initialize_completion(&vio->completion, vdo, VIO_COMPLETION);
+}
+
+void vdo_set_bio_properties(struct bio *bio, struct vio *vio, bio_end_io_t callback,
+ blk_opf_t bi_opf, physical_block_number_t pbn);
+
+int vio_reset_bio(struct vio *vio, char *data, bio_end_io_t callback,
+ blk_opf_t bi_opf, physical_block_number_t pbn);
+
+void update_vio_error_stats(struct vio *vio, const char *format, ...)
+ __printf(2, 3);
+
+/**
+ * is_data_vio() - Check whether a vio is servicing an external data request.
+ * @vio: The vio to check.
+ *
+ * Return: true if the vio's type is VIO_TYPE_DATA.
+ */
+static inline bool is_data_vio(struct vio *vio)
+{
+ return (vio->type == VIO_TYPE_DATA);
+}
+
+/**
+ * get_metadata_priority() - Convert a vio's priority to a work item priority.
+ * @vio: The vio.
+ *
+ * Return: BIO_Q_HIGH_PRIORITY for a vio of VIO_PRIORITY_HIGH, and BIO_Q_METADATA_PRIORITY for
+ *         every other vio priority.
+ */
+static inline enum vdo_completion_priority get_metadata_priority(struct vio *vio)
+{
+	if (vio->priority == VIO_PRIORITY_HIGH)
+		return BIO_Q_HIGH_PRIORITY;
+
+	return BIO_Q_METADATA_PRIORITY;
+}
+
+/**
+ * continue_vio() - Enqueue a vio to run its next callback.
+ * @vio: The vio to continue.
+ * @result: The result of the current operation; any value other than VDO_SUCCESS is set as the
+ *          result of the vio's completion before the vio is enqueued.
+ */
+static inline void continue_vio(struct vio *vio, int result)
+{
+ if (unlikely(result != VDO_SUCCESS))
+ vdo_set_completion_result(&vio->completion, result);
+
+ vdo_enqueue_completion(&vio->completion, VDO_WORK_Q_DEFAULT_PRIORITY);
+}
+
+void vdo_count_bios(struct atomic_bio_stats *bio_stats, struct bio *bio);
+void vdo_count_completed_bios(struct bio *bio);
+
+/**
+ * continue_vio_after_io() - Continue a vio now that its I/O has returned.
+ * @vio: The vio whose bio has completed.
+ * @callback: The callback to set on the vio's completion.
+ * @thread: The thread id passed along with @callback to vdo_set_completion_callback().
+ *
+ * The completed bio is counted in the statistics first. The bio's status, converted by
+ * blk_status_to_errno(), is the result given to continue_vio().
+ */
+static inline void continue_vio_after_io(struct vio *vio, vdo_action_fn callback,
+ thread_id_t thread)
+{
+ vdo_count_completed_bios(vio->bio);
+ vdo_set_completion_callback(&vio->completion, callback, thread);
+ continue_vio(vio, blk_status_to_errno(vio->bio->bi_status));
+}
+
+void vio_record_metadata_io_error(struct vio *vio);
+
+/* A vio_pool is a collection of preallocated vios used to write arbitrary metadata blocks. */
+
+/*
+ * Convert a vio to the pooled_vio which contains it. The vio must be the 'vio' member of a
+ * struct pooled_vio; nothing here checks that.
+ */
+static inline struct pooled_vio *vio_as_pooled_vio(struct vio *vio)
+{
+ return container_of(vio, struct pooled_vio, vio);
+}
+
+struct vio_pool;
+
+int __must_check make_vio_pool(struct vdo *vdo, size_t pool_size, thread_id_t thread_id,
+ enum vio_type vio_type, enum vio_priority priority,
+ void *context, struct vio_pool **pool_ptr);
+void free_vio_pool(struct vio_pool *pool);
+bool __must_check is_vio_pool_busy(struct vio_pool *pool);
+void acquire_vio_from_pool(struct vio_pool *pool, struct vdo_waiter *waiter);
+void return_vio_to_pool(struct vio_pool *pool, struct pooled_vio *vio);
+
+#endif /* VIO_H */
diff --git a/drivers/md/dm-vdo/wait-queue.c b/drivers/md/dm-vdo/wait-queue.c
new file mode 100644
index 000000000000..6e1e739277ef
--- /dev/null
+++ b/drivers/md/dm-vdo/wait-queue.c
@@ -0,0 +1,205 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#include "wait-queue.h"
+
+#include <linux/device-mapper.h>
+
+#include "permassert.h"
+
+#include "status-codes.h"
+
+/**
+ * vdo_waitq_enqueue_waiter() - Append a waiter to a waitq as its newest entry.
+ * @waitq: The vdo_wait_queue that will receive the waiter.
+ * @waiter: The waiter to append.
+ *
+ * The waiter must not currently be in any waitq; a non-NULL next_waiter link trips BUG_ON().
+ */
+void vdo_waitq_enqueue_waiter(struct vdo_wait_queue *waitq, struct vdo_waiter *waiter)
+{
+	struct vdo_waiter *tail;
+
+	BUG_ON(waiter->next_waiter != NULL);
+
+	tail = waitq->last_waiter;
+	if (tail != NULL) {
+		/* Non-empty ring: the new waiter inherits the tail's link to the head. */
+		waiter->next_waiter = tail->next_waiter;
+		tail->next_waiter = waiter;
+	} else {
+		/* Empty waitq: a lone waiter forms the ring by pointing at itself. */
+		waiter->next_waiter = waiter;
+	}
+
+	/* Either way, the new waiter is now the tail and the count grows by one. */
+	waitq->last_waiter = waiter;
+	waitq->length++;
+}
+
+/**
+ * vdo_waitq_transfer_all_waiters() - Move all waiters from one waitq to the end of another.
+ * @from_waitq: The waitq containing the waiters to move; it is left empty.
+ * @to_waitq: The waitq that will receive the waiters, after any waiters it already holds.
+ *            NOTE(review): assumed to be distinct from @from_waitq -- confirm with callers.
+ */
+void vdo_waitq_transfer_all_waiters(struct vdo_wait_queue *from_waitq,
+ struct vdo_wait_queue *to_waitq)
+{
+ /* If the source waitq is empty, there's nothing to do. */
+ if (!vdo_waitq_has_waiters(from_waitq))
+ return;
+
+ if (vdo_waitq_has_waiters(to_waitq)) {
+ /*
+ * Both are non-empty. Splice the two circular lists together by swapping the next
+ * (head) pointers in the list tails, so the moved waiters follow the existing ones.
+ */
+ struct vdo_waiter *from_head = from_waitq->last_waiter->next_waiter;
+ struct vdo_waiter *to_head = to_waitq->last_waiter->next_waiter;
+
+ to_waitq->last_waiter->next_waiter = from_head;
+ from_waitq->last_waiter->next_waiter = to_head;
+ }
+
+ to_waitq->last_waiter = from_waitq->last_waiter;
+ to_waitq->length += from_waitq->length;
+ vdo_waitq_init(from_waitq);
+}
+
+/**
+ * vdo_waitq_notify_all_waiters() - Notify every waiter currently in a waitq.
+ * @waitq: The vdo_wait_queue whose waiters are to be notified.
+ * @callback: The function to invoke on each waiter, or NULL to use the callback field
+ *            registered in each waiter.
+ * @context: The context to hand to the callback function.
+ *
+ * The waitq's contents are first moved to a private waitq, leaving the original empty, and
+ * then each moved waiter is dequeued and notified in turn. Waiters added to the waitq while
+ * this call is running (for example, by a callback) are therefore not notified by it.
+ */
+void vdo_waitq_notify_all_waiters(struct vdo_wait_queue *waitq,
+				  vdo_waiter_callback_fn callback, void *context)
+{
+	struct vdo_wait_queue pending;
+
+	/*
+	 * Work from a snapshot so that a callback which re-enqueues waiters on the
+	 * original waitq cannot keep this loop running forever.
+	 */
+	vdo_waitq_init(&pending);
+	vdo_waitq_transfer_all_waiters(waitq, &pending);
+
+	/* Each call notifies one waiter and returns false once none are left. */
+	while (vdo_waitq_notify_next_waiter(&pending, callback, context))
+		;
+}
+
+/**
+ * vdo_waitq_get_first_waiter() - Look up the waiter at the head of a waitq without removing it.
+ * @waitq: The vdo_wait_queue to inspect.
+ *
+ * Return: The first (oldest) waiter in the waitq, or NULL if the waitq is empty.
+ */
+struct vdo_waiter *vdo_waitq_get_first_waiter(const struct vdo_wait_queue *waitq)
+{
+	const struct vdo_waiter *tail = waitq->last_waiter;
+
+	/*
+	 * Only the tail is tracked, but the list is circular, so the tail's link leads
+	 * to the head. A NULL tail means there are no waiters at all.
+	 */
+	if (tail != NULL)
+		return tail->next_waiter;
+	return NULL;
+}
+
+/**
+ * vdo_waitq_dequeue_matching_waiters() - Move all matching waiters from one waitq to another.
+ * @waitq: The vdo_wait_queue to process; non-matching waiters stay in it, in order.
+ * @waiter_match: The function that decides whether a waiter matches.
+ * @match_context: Contextual info passed to the match function.
+ * @matched_waitq: The vdo_wait_queue to which matching waiters are appended.
+ */
+void vdo_waitq_dequeue_matching_waiters(struct vdo_wait_queue *waitq,
+					vdo_waiter_match_fn waiter_match,
+					void *match_context,
+					struct vdo_wait_queue *matched_waitq)
+{
+	struct vdo_wait_queue unsorted;
+	struct vdo_waiter *candidate;
+
+	vdo_waitq_init(&unsorted);
+	vdo_waitq_transfer_all_waiters(waitq, &unsorted);
+
+	while ((candidate = vdo_waitq_dequeue_waiter(&unsorted)) != NULL) {
+		if (waiter_match(candidate, match_context))
+			vdo_waitq_enqueue_waiter(matched_waitq, candidate);
+		else
+			vdo_waitq_enqueue_waiter(waitq, candidate);
+	}
+}
+
+/**
+ * vdo_waitq_dequeue_waiter() - Remove the first (oldest) waiter from a waitq.
+ * @waitq: The vdo_wait_queue from which to remove the first entry.
+ *
+ * The waiter's callback is not invoked here; the caller will be responsible for waking
+ * the waiter by continuing its execution appropriately.
+ *
+ * Return: The first (oldest) waiter in the waitq, or NULL if the waitq is empty.
+ */
+struct vdo_waiter *vdo_waitq_dequeue_waiter(struct vdo_wait_queue *waitq)
+{
+ struct vdo_waiter *first_waiter = vdo_waitq_get_first_waiter(waitq);
+ struct vdo_waiter *last_waiter = waitq->last_waiter;
+
+ if (first_waiter == NULL)
+ return NULL;
+
+ if (first_waiter == last_waiter) {
+ /* The waitq has a single entry, so empty it by nulling the tail. */
+ waitq->last_waiter = NULL;
+ } else {
+ /*
+ * The waitq has multiple waiters, so splice the first waiter out of the
+ * circular waitq by linking the tail to the second waiter.
+ */
+ last_waiter->next_waiter = first_waiter->next_waiter;
+ }
+
+ /* Clear the link to mark the waiter as no longer being in a waitq. */
+ first_waiter->next_waiter = NULL;
+ waitq->length -= 1;
+
+ return first_waiter;
+}
+
+/**
+ * vdo_waitq_notify_next_waiter() - Dequeue and notify the oldest waiter in a waitq.
+ * @waitq: The vdo_wait_queue holding the waiter to notify.
+ * @callback: The function to invoke on the waiter, or NULL to use the callback field
+ *            registered in the waiter.
+ * @context: The context to hand to the callback function.
+ *
+ * The waiter is removed from the waitq before the callback runs, so the callback is free
+ * to enqueue the waiter again.
+ *
+ * Return: true if a waiter was notified, false if the waitq was empty.
+ */
+bool vdo_waitq_notify_next_waiter(struct vdo_wait_queue *waitq,
+				  vdo_waiter_callback_fn callback, void *context)
+{
+	struct vdo_waiter *next = vdo_waitq_dequeue_waiter(waitq);
+	vdo_waiter_callback_fn notify;
+
+	if (next == NULL)
+		return false;
+
+	/* An explicit callback takes precedence over the one stored in the waiter. */
+	notify = (callback != NULL) ? callback : next->callback;
+	notify(next, context);
+	return true;
+}
diff --git a/drivers/md/dm-vdo/wait-queue.h b/drivers/md/dm-vdo/wait-queue.h
new file mode 100644
index 000000000000..7e8ee6afe7c7
--- /dev/null
+++ b/drivers/md/dm-vdo/wait-queue.h
@@ -0,0 +1,138 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#ifndef VDO_WAIT_QUEUE_H
+#define VDO_WAIT_QUEUE_H
+
+#include <linux/compiler.h>
+#include <linux/types.h>
+
+/*
+ * A vdo_wait_queue is a circular singly linked list of entries waiting to be notified
+ * of a change in a condition. Keeping a circular list allows the vdo_wait_queue
+ * structure to track only the tail (newest) entry while still supporting
+ * constant-time enqueue and dequeue operations. A null tail pointer is an empty waitq.
+ *
+ * An empty waitq:
+ * waitq0.last_waiter -> NULL
+ *
+ * A singleton waitq:
+ * waitq1.last_waiter -> entry1 -> entry1 -> [...]
+ *
+ * A three-element waitq:
+ * waitq2.last_waiter -> entry3 -> entry1 -> entry2 -> entry3 -> [...]
+ *
+ * linux/wait.h's wait_queue_head is _not_ used because vdo_wait_queue's
+ * interface is much less complex (it doesn't need locking, priorities or timers).
+ * This is made possible by vdo's thread-based resource allocation and locking, and
+ * by the polling nature of vdo_wait_queue consumers.
+ *
+ * FIXME: this could be made to use linux/list.h's list_head, but its extra barriers
+ * really aren't needed, and neither is a doubly linked list. vdo_wait_queue could
+ * make use of __list_del_clearprev(), but that would compromise the ability
+ * to make full use of linux's list interface.
+ */
+
+struct vdo_waiter;
+
+struct vdo_wait_queue {
+ /* The tail of the queue, the last (most recently added) entry; NULL when the queue is empty */
+ struct vdo_waiter *last_waiter;
+ /* The number of waiters currently in the queue */
+ size_t length;
+};
+
+/**
+ * typedef vdo_waiter_callback_fn - Callback type that will be called to resume processing
+ * of a waiter after it has been removed from its wait queue.
+ */
+typedef void (*vdo_waiter_callback_fn)(struct vdo_waiter *waiter, void *context);
+
+/**
+ * typedef vdo_waiter_match_fn - Method type for waiter matching methods.
+ *
+ * Return: true if the waiter matches, false if the waiter does not match.
+ */
+typedef bool (*vdo_waiter_match_fn)(struct vdo_waiter *waiter, void *context);
+
+/* The structure for entries in a vdo_wait_queue. */
+struct vdo_waiter {
+ /*
+ * The next waiter in the waitq, or NULL if this waiter is not in any waitq. If this
+ * entry is the last waiter, then this is actually a pointer back to the head of the waitq.
+ */
+ struct vdo_waiter *next_waiter;
+
+ /* Optional waiter-specific callback, used when a notify call is not given a callback. */
+ vdo_waiter_callback_fn callback;
+};
+
+/**
+ * vdo_waiter_is_waiting() - Report whether a waiter is currently queued.
+ * @waiter: The waiter to examine.
+ *
+ * Return: true if the waiter is linked into some vdo_wait_queue, false otherwise.
+ */
+static inline bool vdo_waiter_is_waiting(struct vdo_waiter *waiter)
+{
+	return waiter->next_waiter != NULL;
+}
+
+/**
+ * vdo_waitq_init() - Reset a vdo_wait_queue to the empty state.
+ * @waitq: The vdo_wait_queue to initialize.
+ *
+ * Any previous contents of the structure are overwritten: no tail waiter, zero length.
+ */
+static inline void vdo_waitq_init(struct vdo_wait_queue *waitq)
+{
+	waitq->last_waiter = NULL;
+	waitq->length = 0;
+}
+
+/**
+ * vdo_waitq_has_waiters() - Report whether a vdo_wait_queue is non-empty.
+ * @waitq: The vdo_wait_queue to query.
+ *
+ * Return: true if at least one waiter is in the waitq, false if it is empty.
+ */
+static inline bool __must_check vdo_waitq_has_waiters(const struct vdo_wait_queue *waitq)
+{
+	return waitq->last_waiter != NULL;
+}
+
+void vdo_waitq_enqueue_waiter(struct vdo_wait_queue *waitq,
+ struct vdo_waiter *waiter);
+
+struct vdo_waiter *vdo_waitq_dequeue_waiter(struct vdo_wait_queue *waitq);
+
+void vdo_waitq_notify_all_waiters(struct vdo_wait_queue *waitq,
+ vdo_waiter_callback_fn callback, void *context);
+
+bool vdo_waitq_notify_next_waiter(struct vdo_wait_queue *waitq,
+ vdo_waiter_callback_fn callback, void *context);
+
+void vdo_waitq_transfer_all_waiters(struct vdo_wait_queue *from_waitq,
+ struct vdo_wait_queue *to_waitq);
+
+struct vdo_waiter *vdo_waitq_get_first_waiter(const struct vdo_wait_queue *waitq);
+
+void vdo_waitq_dequeue_matching_waiters(struct vdo_wait_queue *waitq,
+ vdo_waiter_match_fn waiter_match,
+ void *match_context,
+ struct vdo_wait_queue *matched_waitq);
+
+/**
+ * vdo_waitq_num_waiters() - Return the number of waiters in a vdo_wait_queue.
+ * @waitq: The vdo_wait_queue to query.
+ *
+ * Return: The number of waiters in the waitq, as recorded in its length field.
+ */
+static inline size_t __must_check vdo_waitq_num_waiters(const struct vdo_wait_queue *waitq)
+{
+ return waitq->length;
+}
+
+#endif /* VDO_WAIT_QUEUE_H */