diff options
117 files changed, 6276 insertions, 5028 deletions
diff --git a/Documentation/devicetree/bindings/i2c/i2c-arb-gpio-challenge.txt b/Documentation/devicetree/bindings/i2c/i2c-arb-gpio-challenge.txt new file mode 100644 index 000000000000..1ac8ea8ade1d --- /dev/null +++ b/Documentation/devicetree/bindings/i2c/i2c-arb-gpio-challenge.txt @@ -0,0 +1,80 @@ +GPIO-based I2C Arbitration Using a Challenge & Response Mechanism +================================================================= +This uses GPIO lines and a challenge & response mechanism to arbitrate who is +the master of an I2C bus in a multimaster situation. + +In many cases using GPIOs to arbitrate is not needed and a design can use +the standard I2C multi-master rules. Using GPIOs is generally useful in +the case where there is a device on the bus that has errata and/or bugs +that makes standard multimaster mode not feasible. + + +Algorithm: + +All masters on the bus have a 'bus claim' line which is an output that the +others can see. These are all active low with pull-ups enabled. We'll +describe these lines as: + +- OUR_CLAIM: output from us signaling to other hosts that we want the bus +- THEIR_CLAIMS: output from others signaling that they want the bus + +The basic algorithm is to assert your line when you want the bus, then make +sure that the other side doesn't want it also. A detailed explanation is best +done with an example. + +Let's say we want to claim the bus. We: +1. Assert OUR_CLAIM. +2. Waits a little bit for the other sides to notice (slew time, say 10 + microseconds). +3. Check THEIR_CLAIMS. If none are asserted then the we have the bus and we are + done. +4. Otherwise, wait for a few milliseconds and see if THEIR_CLAIMS are released. +5. If not, back off, release the claim and wait for a few more milliseconds. +6. Go back to 1 (until retry time has expired). + + +Required properties: +- compatible: i2c-arb-gpio-challenge +- our-claim-gpio: The GPIO that we use to claim the bus. +- their-claim-gpios: The GPIOs that the other sides use to claim the bus. + Note that some implementations may only support a single other master. +- Standard I2C mux properties. See mux.txt in this directory. +- Single I2C child bus node at reg 0. See mux.txt in this directory. + +Optional properties: +- slew-delay-us: microseconds to wait for a GPIO to go high. Default is 10 us. +- wait-retry-us: we'll attempt another claim after this many microseconds. + Default is 3000 us. +- wait-free-us: we'll give up after this many microseconds. Default is 50000 us. + + +Example: + i2c@12CA0000 { + compatible = "acme,some-i2c-device"; + #address-cells = <1>; + #size-cells = <0>; + }; + + i2c-arbitrator { + compatible = "i2c-arb-gpio-challenge"; + #address-cells = <1>; + #size-cells = <0>; + + i2c-parent = <&{/i2c@12CA0000}>; + + our-claim-gpio = <&gpf0 3 1>; + their-claim-gpios = <&gpe0 4 1>; + slew-delay-us = <10>; + wait-retry-us = <3000>; + wait-free-us = <50000>; + + i2c@0 { + reg = <0>; + #address-cells = <1>; + #size-cells = <0>; + + i2c@52 { + // Normal I2C device + }; + }; + }; diff --git a/Documentation/virtual/00-INDEX b/Documentation/virtual/00-INDEX index 924bd462675e..e952d30bbf0f 100644 --- a/Documentation/virtual/00-INDEX +++ b/Documentation/virtual/00-INDEX @@ -6,6 +6,3 @@ kvm/ - Kernel Virtual Machine. See also http://linux-kvm.org uml/ - User Mode Linux, builds/runs Linux kernel as a userspace program. -virtio.txt - - Text version of draft virtio spec. - See http://ozlabs.org/~rusty/virtio-spec diff --git a/Documentation/virtual/virtio-spec.txt b/Documentation/virtual/virtio-spec.txt deleted file mode 100644 index eb094039b50d..000000000000 --- a/Documentation/virtual/virtio-spec.txt +++ /dev/null @@ -1,3210 +0,0 @@ -[Generated file: see http://ozlabs.org/~rusty/virtio-spec/] -Virtio PCI Card Specification -v0.9.5 DRAFT -- - -Rusty Russell <rusty@rustcorp.com.au> IBM Corporation (Editor) - -2012 May 7. - -Purpose and Description - -This document describes the specifications of the “virtio” family -of PCI[LaTeX Command: nomenclature] devices. These are devices -are found in virtual environments[LaTeX Command: nomenclature], -yet by design they are not all that different from physical PCI -devices, and this document treats them as such. This allows the -guest to use standard PCI drivers and discovery mechanisms. - -The purpose of virtio and this specification is that virtual -environments and guests should have a straightforward, efficient, -standard and extensible mechanism for virtual devices, rather -than boutique per-environment or per-OS mechanisms. - - Straightforward: Virtio PCI devices use normal PCI mechanisms - of interrupts and DMA which should be familiar to any device - driver author. There is no exotic page-flipping or COW - mechanism: it's just a PCI device.[footnote: -This lack of page-sharing implies that the implementation of the -device (e.g. the hypervisor or host) needs full access to the -guest memory. Communication with untrusted parties (i.e. -inter-guest communication) requires copying. -] - - Efficient: Virtio PCI devices consist of rings of descriptors - for input and output, which are neatly separated to avoid cache - effects from both guest and device writing to the same cache - lines. - - Standard: Virtio PCI makes no assumptions about the environment - in which it operates, beyond supporting PCI. In fact the virtio - devices specified in the appendices do not require PCI at all: - they have been implemented on non-PCI buses.[footnote: -The Linux implementation further separates the PCI virtio code -from the specific virtio drivers: these drivers are shared with -the non-PCI implementations (currently lguest and S/390). -] - - Extensible: Virtio PCI devices contain feature bits which are - acknowledged by the guest operating system during device setup. - This allows forwards and backwards compatibility: the device - offers all the features it knows about, and the driver - acknowledges those it understands and wishes to use. - - Virtqueues - -The mechanism for bulk data transport on virtio PCI devices is -pretentiously called a virtqueue. Each device can have zero or -more virtqueues: for example, the network device has one for -transmit and one for receive. - -Each virtqueue occupies two or more physically-contiguous pages -(defined, for the purposes of this specification, as 4096 bytes), -and consists of three parts: - - -+-------------------+-----------------------------------+-----------+ -| Descriptor Table | Available Ring (padding) | Used Ring | -+-------------------+-----------------------------------+-----------+ - - -When the driver wants to send a buffer to the device, it fills in -a slot in the descriptor table (or chains several together), and -writes the descriptor index into the available ring. It then -notifies the device. When the device has finished a buffer, it -writes the descriptor into the used ring, and sends an interrupt. - -Specification - - PCI Discovery - -Any PCI device with Vendor ID 0x1AF4, and Device ID 0x1000 -through 0x103F inclusive is a virtio device[footnote: -The actual value within this range is ignored -]. The device must also have a Revision ID of 0 to match this -specification. - -The Subsystem Device ID indicates which virtio device is -supported by the device. The Subsystem Vendor ID should reflect -the PCI Vendor ID of the environment (it's currently only used -for informational purposes by the guest). - - -+----------------------+--------------------+---------------+ -| Subsystem Device ID | Virtio Device | Specification | -+----------------------+--------------------+---------------+ -+----------------------+--------------------+---------------+ -| 1 | network card | Appendix C | -+----------------------+--------------------+---------------+ -| 2 | block device | Appendix D | -+----------------------+--------------------+---------------+ -| 3 | console | Appendix E | -+----------------------+--------------------+---------------+ -| 4 | entropy source | Appendix F | -+----------------------+--------------------+---------------+ -| 5 | memory ballooning | Appendix G | -+----------------------+--------------------+---------------+ -| 6 | ioMemory | - | -+----------------------+--------------------+---------------+ -| 7 | rpmsg | Appendix H | -+----------------------+--------------------+---------------+ -| 8 | SCSI host | Appendix I | -+----------------------+--------------------+---------------+ -| 9 | 9P transport | - | -+----------------------+--------------------+---------------+ -| 10 | mac80211 wlan | - | -+----------------------+--------------------+---------------+ - - - Device Configuration - -To configure the device, we use the first I/O region of the PCI -device. This contains a virtio header followed by a -device-specific region. - -There may be different widths of accesses to the I/O region; the “ -natural” access method for each field in the virtio header must -be used (i.e. 32-bit accesses for 32-bit fields, etc), but the -device-specific region can be accessed using any width accesses, -and should obtain the same results. - -Note that this is possible because while the virtio header is PCI -(i.e. little) endian, the device-specific region is encoded in -the native endian of the guest (where such distinction is -applicable). - - Device Initialization Sequence<sub:Device-Initialization-Sequence> - -We start with an overview of device initialization, then expand -on the details of the device and how each step is preformed. - - Reset the device. This is not required on initial start up. - - The ACKNOWLEDGE status bit is set: we have noticed the device. - - The DRIVER status bit is set: we know how to drive the device. - - Device-specific setup, including reading the Device Feature - Bits, discovery of virtqueues for the device, optional MSI-X - setup, and reading and possibly writing the virtio - configuration space. - - The subset of Device Feature Bits understood by the driver is - written to the device. - - The DRIVER_OK status bit is set. - - The device can now be used (ie. buffers added to the - virtqueues)[footnote: -Historically, drivers have used the device before steps 5 and 6. -This is only allowed if the driver does not use any features -which would alter this early use of the device. -] - -If any of these steps go irrecoverably wrong, the guest should -set the FAILED status bit to indicate that it has given up on the -device (it can reset the device later to restart if desired). - -We now cover the fields required for general setup in detail. - - Virtio Header - -The virtio header looks as follows: - - -+------------++---------------------+---------------------+----------+--------+---------+---------+---------+--------+ -| Bits || 32 | 32 | 32 | 16 | 16 | 16 | 8 | 8 | -+------------++---------------------+---------------------+----------+--------+---------+---------+---------+--------+ -| Read/Write || R | R+W | R+W | R | R+W | R+W | R+W | R | -+------------++---------------------+---------------------+----------+--------+---------+---------+---------+--------+ -| Purpose || Device | Guest | Queue | Queue | Queue | Queue | Device | ISR | -| || Features bits 0:31 | Features bits 0:31 | Address | Size | Select | Notify | Status | Status | -+------------++---------------------+---------------------+----------+--------+---------+---------+---------+--------+ - - -If MSI-X is enabled for the device, two additional fields -immediately follow this header:[footnote: -ie. once you enable MSI-X on the device, the other fields move. -If you turn it off again, they move back! -] - - -+------------++----------------+--------+ -| Bits || 16 | 16 | - +----------------+--------+ -+------------++----------------+--------+ -| Read/Write || R+W | R+W | -+------------++----------------+--------+ -| Purpose || Configuration | Queue | -| (MSI-X) || Vector | Vector | -+------------++----------------+--------+ - - -Immediately following these general headers, there may be -device-specific headers: - - -+------------++--------------------+ -| Bits || Device Specific | - +--------------------+ -+------------++--------------------+ -| Read/Write || Device Specific | -+------------++--------------------+ -| Purpose || Device Specific... | -| || | -+------------++--------------------+ - - - Device Status - -The Device Status field is updated by the guest to indicate its -progress. This provides a simple low-level diagnostic: it's most -useful to imagine them hooked up to traffic lights on the console -indicating the status of each device. - -The device can be reset by writing a 0 to this field, otherwise -at least one bit should be set: - - ACKNOWLEDGE (1) Indicates that the guest OS has found the - device and recognized it as a valid virtio device. - - DRIVER (2) Indicates that the guest OS knows how to drive the - device. Under Linux, drivers can be loadable modules so there - may be a significant (or infinite) delay before setting this - bit. - - DRIVER_OK (4) Indicates that the driver is set up and ready to - drive the device. - - FAILED (128) Indicates that something went wrong in the guest, - and it has given up on the device. This could be an internal - error, or the driver didn't like the device for some reason, or - even a fatal error during device operation. The device must be - reset before attempting to re-initialize. - - Feature Bits<sub:Feature-Bits> - -Thefirst configuration field indicates the features that the -device supports. The bits are allocated as follows: - - 0 to 23 Feature bits for the specific device type - - 24 to 32 Feature bits reserved for extensions to the queue and - feature negotiation mechanisms - -For example, feature bit 0 for a network device (i.e. Subsystem -Device ID 1) indicates that the device supports checksumming of -packets. - -The feature bits are negotiated: the device lists all the -features it understands in the Device Features field, and the -guest writes the subset that it understands into the Guest -Features field. The only way to renegotiate is to reset the -device. - -In particular, new fields in the device configuration header are -indicated by offering a feature bit, so the guest can check -before accessing that part of the configuration space. - -This allows for forwards and backwards compatibility: if the -device is enhanced with a new feature bit, older guests will not -write that feature bit back to the Guest Features field and it -can go into backwards compatibility mode. Similarly, if a guest -is enhanced with a feature that the device doesn't support, it -will not see that feature bit in the Device Features field and -can go into backwards compatibility mode (or, for poor -implementations, set the FAILED Device Status bit). - - Configuration/Queue Vectors - -When MSI-X capability is present and enabled in the device -(through standard PCI configuration space) 4 bytes at byte offset -20 are used to map configuration change and queue interrupts to -MSI-X vectors. In this case, the ISR Status field is unused, and -device specific configuration starts at byte offset 24 in virtio -header structure. When MSI-X capability is not enabled, device -specific configuration starts at byte offset 20 in virtio header. - -Writing a valid MSI-X Table entry number, 0 to 0x7FF, to one of -Configuration/Queue Vector registers, maps interrupts triggered -by the configuration change/selected queue events respectively to -the corresponding MSI-X vector. To disable interrupts for a -specific event type, unmap it by writing a special NO_VECTOR -value: - -/* Vector value used to disable MSI for queue */ - -#define VIRTIO_MSI_NO_VECTOR 0xffff - -Reading these registers returns vector mapped to a given event, -or NO_VECTOR if unmapped. All queue and configuration change -events are unmapped by default. - -Note that mapping an event to vector might require allocating -internal device resources, and might fail. Devices report such -failures by returning the NO_VECTOR value when the relevant -Vector field is read. After mapping an event to vector, the -driver must verify success by reading the Vector field value: on -success, the previously written value is returned, and on -failure, NO_VECTOR is returned. If a mapping failure is detected, -the driver can retry mapping with fewervectors, or disable MSI-X. - - Virtqueue Configuration<sec:Virtqueue-Configuration> - -As a device can have zero or more virtqueues for bulk data -transport (for example, the network driver has two), the driver -needs to configure them as part of the device-specific -configuration. - -This is done as follows, for each virtqueue a device has: - - Write the virtqueue index (first queue is 0) to the Queue - Select field. - - Read the virtqueue size from the Queue Size field, which is - always a power of 2. This controls how big the virtqueue is - (see below). If this field is 0, the virtqueue does not exist. - - Allocate and zero virtqueue in contiguous physical memory, on a - 4096 byte alignment. Write the physical address, divided by - 4096 to the Queue Address field.[footnote: -The 4096 is based on the x86 page size, but it's also large -enough to ensure that the separate parts of the virtqueue are on -separate cache lines. -] - - Optionally, if MSI-X capability is present and enabled on the - device, select a vector to use to request interrupts triggered - by virtqueue events. Write the MSI-X Table entry number - corresponding to this vector in Queue Vector field. Read the - Queue Vector field: on success, previously written value is - returned; on failure, NO_VECTOR value is returned. - -The Queue Size field controls the total number of bytes required -for the virtqueue according to the following formula: - -#define ALIGN(x) (((x) + 4095) & ~4095) - -static inline unsigned vring_size(unsigned int qsz) - -{ - - return ALIGN(sizeof(struct vring_desc)*qsz + sizeof(u16)*(2 -+ qsz)) - - + ALIGN(sizeof(struct vring_used_elem)*qsz); - -} - -This currently wastes some space with padding, but also allows -future extensions. The virtqueue layout structure looks like this -(qsz is the Queue Size field, which is a variable, so this code -won't compile): - -struct vring { - - /* The actual descriptors (16 bytes each) */ - - struct vring_desc desc[qsz]; - - - - /* A ring of available descriptor heads with free-running -index. */ - - struct vring_avail avail; - - - - // Padding to the next 4096 boundary. - - char pad[]; - - - - // A ring of used descriptor heads with free-running index. - - struct vring_used used; - -}; - - A Note on Virtqueue Endianness - -Note that the endian of these fields and everything else in the -virtqueue is the native endian of the guest, not little-endian as -PCI normally is. This makes for simpler guest code, and it is -assumed that the host already has to be deeply aware of the guest -endian so such an “endian-aware” device is not a significant -issue. - - Descriptor Table - -The descriptor table refers to the buffers the guest is using for -the device. The addresses are physical addresses, and the buffers -can be chained via the next field. Each descriptor describes a -buffer which is read-only or write-only, but a chain of -descriptors can contain both read-only and write-only buffers. - -No descriptor chain may be more than 2^32 bytes long in total.struct vring_desc { - - /* Address (guest-physical). */ - - u64 addr; - - /* Length. */ - - u32 len; - -/* This marks a buffer as continuing via the next field. */ - -#define VRING_DESC_F_NEXT 1 - -/* This marks a buffer as write-only (otherwise read-only). */ - -#define VRING_DESC_F_WRITE 2 - -/* This means the buffer contains a list of buffer descriptors. -*/ - -#define VRING_DESC_F_INDIRECT 4 - - /* The flags as indicated above. */ - - u16 flags; - - /* Next field if flags & NEXT */ - - u16 next; - -}; - -The number of descriptors in the table is specified by the Queue -Size field for this virtqueue. - - <sub:Indirect-Descriptors>Indirect Descriptors - -Some devices benefit by concurrently dispatching a large number -of large requests. The VIRTIO_RING_F_INDIRECT_DESC feature can be -used to allow this (see [cha:Reserved-Feature-Bits]). To increase -ring capacity it is possible to store a table of indirect -descriptors anywhere in memory, and insert a descriptor in main -virtqueue (with flags&INDIRECT on) that refers to memory buffer -containing this indirect descriptor table; fields addr and len -refer to the indirect table address and length in bytes, -respectively. The indirect table layout structure looks like this -(len is the length of the descriptor that refers to this table, -which is a variable, so this code won't compile): - -struct indirect_descriptor_table { - - /* The actual descriptors (16 bytes each) */ - - struct vring_desc desc[len / 16]; - -}; - -The first indirect descriptor is located at start of the indirect -descriptor table (index 0), additional indirect descriptors are -chained by next field. An indirect descriptor without next field -(with flags&NEXT off) signals the end of the indirect descriptor -table, and transfers control back to the main virtqueue. An -indirect descriptor can not refer to another indirect descriptor -table (flags&INDIRECT must be off). A single indirect descriptor -table can include both read-only and write-only descriptors; -write-only flag (flags&WRITE) in the descriptor that refers to it -is ignored. - - Available Ring - -The available ring refers to what descriptors we are offering the -device: it refers to the head of a descriptor chain. The “flags” -field is currently 0 or 1: 1 indicating that we do not need an -interrupt when the device consumes a descriptor from the -available ring. Alternatively, the guest can ask the device to -delay interrupts until an entry with an index specified by the “ -used_event” field is written in the used ring (equivalently, -until the idx field in the used ring will reach the value -used_event + 1). The method employed by the device is controlled -by the VIRTIO_RING_F_EVENT_IDX feature bit (see [cha:Reserved-Feature-Bits] -). This interrupt suppression is merely an optimization; it may -not suppress interrupts entirely. - -The “idx” field indicates where we would put the next descriptor -entry (modulo the ring size). This starts at 0, and increases. - -struct vring_avail { - -#define VRING_AVAIL_F_NO_INTERRUPT 1 - - u16 flags; - - u16 idx; - - u16 ring[qsz]; /* qsz is the Queue Size field read from device -*/ - - u16 used_event; - -}; - - Used Ring - -The used ring is where the device returns buffers once it is done -with them. The flags field can be used by the device to hint that -no notification is necessary when the guest adds to the available -ring. Alternatively, the “avail_event” field can be used by the -device to hint that no notification is necessary until an entry -with an index specified by the “avail_event” is written in the -available ring (equivalently, until the idx field in the -available ring will reach the value avail_event + 1). The method -employed by the device is controlled by the guest through the -VIRTIO_RING_F_EVENT_IDX feature bit (see [cha:Reserved-Feature-Bits] -). [footnote: -These fields are kept here because this is the only part of the -virtqueue written by the device -]. - -Each entry in the ring is a pair: the head entry of the -descriptor chain describing the buffer (this matches an entry -placed in the available ring by the guest earlier), and the total -of bytes written into the buffer. The latter is extremely useful -for guests using untrusted buffers: if you do not know exactly -how much has been written by the device, you usually have to zero -the buffer to ensure no data leakage occurs. - -/* u32 is used here for ids for padding reasons. */ - -struct vring_used_elem { - - /* Index of start of used descriptor chain. */ - - u32 id; - - /* Total length of the descriptor chain which was used -(written to) */ - - u32 len; - -}; - - - -struct vring_used { - -#define VRING_USED_F_NO_NOTIFY 1 - - u16 flags; - - u16 idx; - - struct vring_used_elem ring[qsz]; - - u16 avail_event; - -}; - - Helpers for Managing Virtqueues - -The Linux Kernel Source code contains the definitions above and -helper routines in a more usable form, in -include/linux/virtio_ring.h. This was explicitly licensed by IBM -and Red Hat under the (3-clause) BSD license so that it can be -freely used by all other projects, and is reproduced (with slight -variation to remove Linux assumptions) in Appendix A. - - Device Operation<sec:Device-Operation> - -There are two parts to device operation: supplying new buffers to -the device, and processing used buffers from the device. As an -example, the virtio network device has two virtqueues: the -transmit virtqueue and the receive virtqueue. The driver adds -outgoing (read-only) packets to the transmit virtqueue, and then -frees them after they are used. Similarly, incoming (write-only) -buffers are added to the receive virtqueue, and processed after -they are used. - - Supplying Buffers to The Device - -Actual transfer of buffers from the guest OS to the device -operates as follows: - - Place the buffer(s) into free descriptor(s). - - If there are no free descriptors, the guest may choose to - notify the device even if notifications are suppressed (to - reduce latency).[footnote: -The Linux drivers do this only for read-only buffers: for -write-only buffers, it is assumed that the driver is merely -trying to keep the receive buffer ring full, and no notification -of this expected condition is necessary. -] - - Place the id of the buffer in the next ring entry of the - available ring. - - The steps (1) and (2) may be performed repeatedly if batching - is possible. - - A memory barrier should be executed to ensure the device sees - the updated descriptor table and available ring before the next - step. - - The available “idx” field should be increased by the number of - entries added to the available ring. - - A memory barrier should be executed to ensure that we update - the idx field before checking for notification suppression. - - If notifications are not suppressed, the device should be - notified of the new buffers. - -Note that the above code does not take precautions against the -available ring buffer wrapping around: this is not possible since -the ring buffer is the same size as the descriptor table, so step -(1) will prevent such a condition. - -In addition, the maximum queue size is 32768 (it must be a power -of 2 which fits in 16 bits), so the 16-bit “idx” value can always -distinguish between a full and empty buffer. - -Here is a description of each stage in more detail. - - Placing Buffers Into The Descriptor Table - -A buffer consists of zero or more read-only physically-contiguous -elements followed by zero or more physically-contiguous -write-only elements (it must have at least one element). This -algorithm maps it into the descriptor table: - - for each buffer element, b: - - Get the next free descriptor table entry, d - - Set d.addr to the physical address of the start of b - - Set d.len to the length of b. - - If b is write-only, set d.flags to VRING_DESC_F_WRITE, - otherwise 0. - - If there is a buffer element after this: - - Set d.next to the index of the next free descriptor element. - - Set the VRING_DESC_F_NEXT bit in d.flags. - -In practice, the d.next fields are usually used to chain free -descriptors, and a separate count kept to check there are enough -free descriptors before beginning the mappings. - - Updating The Available Ring - -The head of the buffer we mapped is the first d in the algorithm -above. A naive implementation would do the following: - -avail->ring[avail->idx % qsz] = head; - -However, in general we can add many descriptors before we update -the “idx” field (at which point they become visible to the -device), so we keep a counter of how many we've added: - -avail->ring[(avail->idx + added++) % qsz] = head; - - Updating The Index Field - -Once the idx field of the virtqueue is updated, the device will -be able to access the descriptor entries we've created and the -memory they refer to. This is why a memory barrier is generally -used before the idx update, to ensure it sees the most up-to-date -copy. - -The idx field always increments, and we let it wrap naturally at -65536: - -avail->idx += added; - - <sub:Notifying-The-Device>Notifying The Device - -Device notification occurs by writing the 16-bit virtqueue index -of this virtqueue to the Queue Notify field of the virtio header -in the first I/O region of the PCI device. This can be expensive, -however, so the device can suppress such notifications if it -doesn't need them. We have to be careful to expose the new idx -value before checking the suppression flag: it's OK to notify -gratuitously, but not to omit a required notification. So again, -we use a memory barrier here before reading the flags or the -avail_event field. - -If the VIRTIO_F_RING_EVENT_IDX feature is not negotiated, and if -the VRING_USED_F_NOTIFY flag is not set, we go ahead and write to -the PCI configuration space. - -If the VIRTIO_F_RING_EVENT_IDX feature is negotiated, we read the -avail_event field in the available ring structure. If the -available index crossed_the avail_event field value since the -last notification, we go ahead and write to the PCI configuration -space. The avail_event field wraps naturally at 65536 as well: - -(u16)(new_idx - avail_event - 1) < (u16)(new_idx - old_idx) - - <sub:Receiving-Used-Buffers>Receiving Used Buffers From The - Device - -Once the device has used a buffer (read from or written to it, or -parts of both, depending on the nature of the virtqueue and the -device), it sends an interrupt, following an algorithm very -similar to the algorithm used for the driver to send the device a -buffer: - - Write the head descriptor number to the next field in the used - ring. - - Update the used ring idx. - - Determine whether an interrupt is necessary: - - If the VIRTIO_F_RING_EVENT_IDX feature is not negotiated: check - if f the VRING_AVAIL_F_NO_INTERRUPT flag is not set in avail- - >flags - - If the VIRTIO_F_RING_EVENT_IDX feature is negotiated: check - whether the used index crossed the used_event field value - since the last update. The used_event field wraps naturally - at 65536 as well:(u16)(new_idx - used_event - 1) < (u16)(new_idx - old_idx) - - If an interrupt is necessary: - - If MSI-X capability is disabled: - - Set the lower bit of the ISR Status field for the device. - - Send the appropriate PCI interrupt for the device. - - If MSI-X capability is enabled: - - Request the appropriate MSI-X interrupt message for the - device, Queue Vector field sets the MSI-X Table entry - number. - - If Queue Vector field value is NO_VECTOR, no interrupt - message is requested for this event. - -The guest interrupt handler should: - - If MSI-X capability is disabled: read the ISR Status field, - which will reset it to zero. If the lower bit is zero, the - interrupt was not for this device. Otherwise, the guest driver - should look through the used rings of each virtqueue for the - device, to see if any progress has been made by the device - which requires servicing. - - If MSI-X capability is enabled: look through the used rings of - each virtqueue mapped to the specific MSI-X vector for the - device, to see if any progress has been made by the device - which requires servicing. - -For each ring, guest should then disable interrupts by writing -VRING_AVAIL_F_NO_INTERRUPT flag in avail structure, if required. -It can then process used ring entries finally enabling interrupts -by clearing the VRING_AVAIL_F_NO_INTERRUPT flag or updating the -EVENT_IDX field in the available structure, Guest should then -execute a memory barrier, and then recheck the ring empty -condition. This is necessary to handle the case where, after the -last check and before enabling interrupts, an interrupt has been -suppressed by the device: - -vring_disable_interrupts(vq); - -for (;;) { - - if (vq->last_seen_used != vring->used.idx) { - - vring_enable_interrupts(vq); - - mb(); - - if (vq->last_seen_used != vring->used.idx) - - break; - - } - - struct vring_used_elem *e = -vring.used->ring[vq->last_seen_used%vsz]; - - process_buffer(e); - - vq->last_seen_used++; - -} - - Dealing With Configuration Changes<sub:Dealing-With-Configuration> - -Some virtio PCI devices can change the device configuration -state, as reflected in the virtio header in the PCI configuration -space. In this case: - - If MSI-X capability is disabled: an interrupt is delivered and - the second highest bit is set in the ISR Status field to - indicate that the driver should re-examine the configuration - space.Note that a single interrupt can indicate both that one - or more virtqueue has been used and that the configuration - space has changed: even if the config bit is set, virtqueues - must be scanned. - - If MSI-X capability is enabled: an interrupt message is - requested. The Configuration Vector field sets the MSI-X Table - entry number to use. If Configuration Vector field value is - NO_VECTOR, no interrupt message is requested for this event. - -Creating New Device Types - -Various considerations are necessary when creating a new device -type: - - How Many Virtqueues? - -It is possible that a very simple device will operate entirely -through its configuration space, but most will need at least one -virtqueue in which it will place requests. A device with both -input and output (eg. console and network devices described here) -need two queues: one which the driver fills with buffers to -receive input, and one which the driver places buffers to -transmit output. - - What Configuration Space Layout? - -Configuration space is generally used for rarely-changing or -initialization-time parameters. But it is a limited resource, so -it might be better to use a virtqueue to update configuration -information (the network device does this for filtering, -otherwise the table in the config space could potentially be very -large). - -Note that this space is generally the guest's native endian, -rather than PCI's little-endian. - - What Device Number? - -Currently device numbers are assigned quite freely: a simple -request mail to the author of this document or the Linux -virtualization mailing list[footnote: - -https://lists.linux-foundation.org/mailman/listinfo/virtualization -] will be sufficient to secure a unique one. - -Meanwhile for experimental drivers, use 65535 and work backwards. - - How many MSI-X vectors? - -Using the optional MSI-X capability devices can speed up -interrupt processing by removing the need to read ISR Status -register by guest driver (which might be an expensive operation), -reducing interrupt sharing between devices and queues within the -device, and handling interrupts from multiple CPUs. However, some -systems impose a limit (which might be as low as 256) on the -total number of MSI-X vectors that can be allocated to all -devices. Devices and/or device drivers should take this into -account, limiting the number of vectors used unless the device is -expected to cause a high volume of interrupts. Devices can -control the number of vectors used by limiting the MSI-X Table -Size or not presenting MSI-X capability in PCI configuration -space. Drivers can control this by mapping events to as small -number of vectors as possible, or disabling MSI-X capability -altogether. - - Message Framing - -The descriptors used for a buffer should not effect the semantics -of the message, except for the total length of the buffer. For -example, a network buffer consists of a 10 byte header followed -by the network packet. Whether this is presented in the ring -descriptor chain as (say) a 10 byte buffer and a 1514 byte -buffer, or a single 1524 byte buffer, or even three buffers, -should have no effect. - -In particular, no implementation should use the descriptor -boundaries to determine the size of any header in a request.[footnote: -The current qemu device implementations mistakenly insist that -the first descriptor cover the header in these cases exactly, so -a cautious driver should arrange it so. -] - - Device Improvements - -Any change to configuration space, or new virtqueues, or -behavioural changes, should be indicated by negotiation of a new -feature bit. This establishes clarity[footnote: -Even if it does mean documenting design or implementation -mistakes! -] and avoids future expansion problems. - -Clusters of functionality which are always implemented together -can use a single bit, but if one feature makes sense without the -others they should not be gratuitously grouped together to -conserve feature bits. We can always extend the spec when the -first person needs more than 24 feature bits for their device. - -[LaTeX Command: printnomenclature] - -Appendix A: virtio_ring.h - -#ifndef VIRTIO_RING_H - -#define VIRTIO_RING_H - -/* An interface for efficient virtio implementation. - - * - - * This header is BSD licensed so anyone can use the definitions - - * to implement compatible drivers/servers. - - * - - * Copyright 2007, 2009, IBM Corporation - - * Copyright 2011, Red Hat, Inc - - * All rights reserved. - - * - - * Redistribution and use in source and binary forms, with or -without - - * modification, are permitted provided that the following -conditions - - * are met: - - * 1. Redistributions of source code must retain the above -copyright - - * notice, this list of conditions and the following -disclaimer. - - * 2. Redistributions in binary form must reproduce the above -copyright - - * notice, this list of conditions and the following -disclaimer in the - - * documentation and/or other materials provided with the -distribution. - - * 3. Neither the name of IBM nor the names of its contributors - - * may be used to endorse or promote products derived from -this software - - * without specific prior written permission. - - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND -CONTRIBUTORS ``AS IS'' AND - - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED -TO, THE - - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A -PARTICULAR PURPOSE - - * ARE DISCLAIMED. IN NO EVENT SHALL IBM OR CONTRIBUTORS BE -LIABLE - - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL - - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS - - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) - - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT - - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING -IN ANY WAY - - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF - - * SUCH DAMAGE. - - */ - - - -/* This marks a buffer as continuing via the next field. */ - -#define VRING_DESC_F_NEXT 1 - -/* This marks a buffer as write-only (otherwise read-only). */ - -#define VRING_DESC_F_WRITE 2 - - - -/* The Host uses this in used->flags to advise the Guest: don't -kick me - - * when you add a buffer. It's unreliable, so it's simply an - - * optimization. Guest will still kick if it's out of buffers. -*/ - -#define VRING_USED_F_NO_NOTIFY 1 - -/* The Guest uses this in avail->flags to advise the Host: don't - - * interrupt me when you consume a buffer. It's unreliable, so -it's - - * simply an optimization. */ - -#define VRING_AVAIL_F_NO_INTERRUPT 1 - - - -/* Virtio ring descriptors: 16 bytes. - - * These can chain together via "next". */ - -struct vring_desc { - - /* Address (guest-physical). */ - - uint64_t addr; - - /* Length. */ - - uint32_t len; - - /* The flags as indicated above. */ - - uint16_t flags; - - /* We chain unused descriptors via this, too */ - - uint16_t next; - -}; - - - -struct vring_avail { - - uint16_t flags; - - uint16_t idx; - - uint16_t ring[]; - - uint16_t used_event; - -}; - - - -/* u32 is used here for ids for padding reasons. */ - -struct vring_used_elem { - - /* Index of start of used descriptor chain. */ - - uint32_t id; - - /* Total length of the descriptor chain which was written -to. */ - - uint32_t len; - -}; - - - -struct vring_used { - - uint16_t flags; - - uint16_t idx; - - struct vring_used_elem ring[]; - - uint16_t avail_event; - -}; - - - -struct vring { - - unsigned int num; - - - - struct vring_desc *desc; - - struct vring_avail *avail; - - struct vring_used *used; - -}; - - - -/* The standard layout for the ring is a continuous chunk of -memory which - - * looks like this. We assume num is a power of 2. - - * - - * struct vring { - - * // The actual descriptors (16 bytes each) - - * struct vring_desc desc[num]; - - * - - * // A ring of available descriptor heads with free-running -index. - - * __u16 avail_flags; - - * __u16 avail_idx; - - * __u16 available[num]; - - * - - * // Padding to the next align boundary. - - * char pad[]; - - * - - * // A ring of used descriptor heads with free-running -index. - - * __u16 used_flags; - - * __u16 EVENT_IDX; - - * struct vring_used_elem used[num]; - - * }; - - * Note: for virtio PCI, align is 4096. - - */ - -static inline void vring_init(struct vring *vr, unsigned int num, -void *p, - - unsigned long align) - -{ - - vr->num = num; - - vr->desc = p; - - vr->avail = p + num*sizeof(struct vring_desc); - - vr->used = (void *)(((unsigned long)&vr->avail->ring[num] - - + align-1) - - & ~(align - 1)); - -} - - - -static inline unsigned vring_size(unsigned int num, unsigned long -align) - -{ - - return ((sizeof(struct vring_desc)*num + -sizeof(uint16_t)*(2+num) - - + align - 1) & ~(align - 1)) - - + sizeof(uint16_t)*3 + sizeof(struct -vring_used_elem)*num; - -} - - - -static inline int vring_need_event(uint16_t event_idx, uint16_t -new_idx, uint16_t old_idx) - -{ - - return (uint16_t)(new_idx - event_idx - 1) < -(uint16_t)(new_idx - old_idx); - -} - -#endif /* VIRTIO_RING_H */ - -<cha:Reserved-Feature-Bits>Appendix B: Reserved Feature Bits - -Currently there are five device-independent feature bits defined: - - VIRTIO_F_NOTIFY_ON_EMPTY (24) Negotiating this feature - indicates that the driver wants an interrupt if the device runs - out of available descriptors on a virtqueue, even though - interrupts are suppressed using the VRING_AVAIL_F_NO_INTERRUPT - flag or the used_event field. An example of this is the - networking driver: it doesn't need to know every time a packet - is transmitted, but it does need to free the transmitted - packets a finite time after they are transmitted. It can avoid - using a timer if the device interrupts it when all the packets - are transmitted. - - VIRTIO_F_RING_INDIRECT_DESC (28) Negotiating this feature - indicates that the driver can use descriptors with the - VRING_DESC_F_INDIRECT flag set, as described in [sub:Indirect-Descriptors] - . - - VIRTIO_F_RING_EVENT_IDX(29) This feature enables the used_event - and the avail_event fields. If set, it indicates that the - device should ignore the flags field in the available ring - structure. Instead, the used_event field in this structure is - used by guest to suppress device interrupts. Further, the - driver should ignore the flags field in the used ring - structure. Instead, the avail_event field in this structure is - used by the device to suppress notifications. If unset, the - driver should ignore the used_event field; the device should - ignore the avail_event field; the flags field is used - -Appendix C: Network Device - -The virtio network device is a virtual ethernet card, and is the -most complex of the devices supported so far by virtio. It has -enhanced rapidly and demonstrates clearly how support for new -features should be added to an existing device. Empty buffers are -placed in one virtqueue for receiving packets, and outgoing -packets are enqueued into another for transmission in that order. -A third command queue is used to control advanced filtering -features. - - Configuration - - Subsystem Device ID 1 - - Virtqueues 0:receiveq. 1:transmitq. 2:controlq[footnote: -Only if VIRTIO_NET_F_CTRL_VQ set -] - - Feature bits - - VIRTIO_NET_F_CSUM (0) Device handles packets with partial - checksum - - VIRTIO_NET_F_GUEST_CSUM (1) Guest handles packets with partial - checksum - - VIRTIO_NET_F_MAC (5) Device has given MAC address. - - VIRTIO_NET_F_GSO (6) (Deprecated) device handles packets with - any GSO type.[footnote: -It was supposed to indicate segmentation offload support, but -upon further investigation it became clear that multiple bits -were required. -] - - VIRTIO_NET_F_GUEST_TSO4 (7) Guest can receive TSOv4. - - VIRTIO_NET_F_GUEST_TSO6 (8) Guest can receive TSOv6. - - VIRTIO_NET_F_GUEST_ECN (9) Guest can receive TSO with ECN. - - VIRTIO_NET_F_GUEST_UFO (10) Guest can receive UFO. - - VIRTIO_NET_F_HOST_TSO4 (11) Device can receive TSOv4. - - VIRTIO_NET_F_HOST_TSO6 (12) Device can receive TSOv6. - - VIRTIO_NET_F_HOST_ECN (13) Device can receive TSO with ECN. - - VIRTIO_NET_F_HOST_UFO (14) Device can receive UFO. - - VIRTIO_NET_F_MRG_RXBUF (15) Guest can merge receive buffers. - - VIRTIO_NET_F_STATUS (16) Configuration status field is - available. - - VIRTIO_NET_F_CTRL_VQ (17) Control channel is available. - - VIRTIO_NET_F_CTRL_RX (18) Control channel RX mode support. - - VIRTIO_NET_F_CTRL_VLAN (19) Control channel VLAN filtering. - - VIRTIO_NET_F_GUEST_ANNOUNCE(21) Guest can send gratuitous - packets. - - Device configuration layout Two configuration fields are - currently defined. The mac address field always exists (though - is only valid if VIRTIO_NET_F_MAC is set), and the status field - only exists if VIRTIO_NET_F_STATUS is set. Two read-only bits - are currently defined for the status field: - VIRTIO_NET_S_LINK_UP and VIRTIO_NET_S_ANNOUNCE. #define VIRTIO_NET_S_LINK_UP 1 - -#define VIRTIO_NET_S_ANNOUNCE 2 - - - -struct virtio_net_config { - - u8 mac[6]; - - u16 status; - -}; - - Device Initialization - - The initialization routine should identify the receive and - transmission virtqueues. - - If the VIRTIO_NET_F_MAC feature bit is set, the configuration - space “mac” entry indicates the “physical” address of the the - network card, otherwise a private MAC address should be - assigned. All guests are expected to negotiate this feature if - it is set. - - If the VIRTIO_NET_F_CTRL_VQ feature bit is negotiated, identify - the control virtqueue. - - If the VIRTIO_NET_F_STATUS feature bit is negotiated, the link - status can be read from the bottom bit of the “status” config - field. Otherwise, the link should be assumed active. - - The receive virtqueue should be filled with receive buffers. - This is described in detail below in “Setting Up Receive - Buffers”. - - A driver can indicate that it will generate checksumless - packets by negotating the VIRTIO_NET_F_CSUM feature. This “ - checksum offload” is a common feature on modern network cards. - - If that feature is negotiated[footnote: -ie. VIRTIO_NET_F_HOST_TSO* and VIRTIO_NET_F_HOST_UFO are -dependent on VIRTIO_NET_F_CSUM; a dvice which offers the offload -features must offer the checksum feature, and a driver which -accepts the offload features must accept the checksum feature. -Similar logic applies to the VIRTIO_NET_F_GUEST_TSO4 features -depending on VIRTIO_NET_F_GUEST_CSUM. -], a driver can use TCP or UDP segmentation offload by - negotiating the VIRTIO_NET_F_HOST_TSO4 (IPv4 TCP), - VIRTIO_NET_F_HOST_TSO6 (IPv6 TCP) and VIRTIO_NET_F_HOST_UFO - (UDP fragmentation) features. It should not send TCP packets - requiring segmentation offload which have the Explicit - Congestion Notification bit set, unless the - VIRTIO_NET_F_HOST_ECN feature is negotiated.[footnote: -This is a common restriction in real, older network cards. -] - - The converse features are also available: a driver can save the - virtual device some work by negotiating these features.[footnote: -For example, a network packet transported between two guests on -the same system may not require checksumming at all, nor -segmentation, if both guests are amenable. -] The VIRTIO_NET_F_GUEST_CSUM feature indicates that partially - checksummed packets can be received, and if it can do that then - the VIRTIO_NET_F_GUEST_TSO4, VIRTIO_NET_F_GUEST_TSO6, - VIRTIO_NET_F_GUEST_UFO and VIRTIO_NET_F_GUEST_ECN are the input - equivalents of the features described above. See “Receiving - Packets” below. - - Device Operation - -Packets are transmitted by placing them in the transmitq, and -buffers for incoming packets are placed in the receiveq. In each -case, the packet itself is preceded by a header: - -struct virtio_net_hdr { - -#define VIRTIO_NET_HDR_F_NEEDS_CSUM 1 - - u8 flags; - -#define VIRTIO_NET_HDR_GSO_NONE 0 - -#define VIRTIO_NET_HDR_GSO_TCPV4 1 - -#define VIRTIO_NET_HDR_GSO_UDP 3 - -#define VIRTIO_NET_HDR_GSO_TCPV6 4 - -#define VIRTIO_NET_HDR_GSO_ECN 0x80 - - u8 gso_type; - - u16 hdr_len; - - u16 gso_size; - - u16 csum_start; - - u16 csum_offset; - -/* Only if VIRTIO_NET_F_MRG_RXBUF: */ - - u16 num_buffers - -}; - -The controlq is used to control device features such as -filtering. - - Packet Transmission - -Transmitting a single packet is simple, but varies depending on -the different features the driver negotiated. - - If the driver negotiated VIRTIO_NET_F_CSUM, and the packet has - not been fully checksummed, then the virtio_net_hdr's fields - are set as follows. Otherwise, the packet must be fully - checksummed, and flags is zero. - - flags has the VIRTIO_NET_HDR_F_NEEDS_CSUM set, - - <ite:csum_start-is-set>csum_start is set to the offset within - the packet to begin checksumming, and - - csum_offset indicates how many bytes after the csum_start the - new (16 bit ones' complement) checksum should be placed.[footnote: -For example, consider a partially checksummed TCP (IPv4) packet. -It will have a 14 byte ethernet header and 20 byte IP header -followed by the TCP header (with the TCP checksum field 16 bytes -into that header). csum_start will be 14+20 = 34 (the TCP -checksum includes the header), and csum_offset will be 16. The -value in the TCP checksum field should be initialized to the sum -of the TCP pseudo header, so that replacing it by the ones' -complement checksum of the TCP header and body will give the -correct result. -] - - <enu:If-the-driver>If the driver negotiated - VIRTIO_NET_F_HOST_TSO4, TSO6 or UFO, and the packet requires - TCP segmentation or UDP fragmentation, then the “gso_type” - field is set to VIRTIO_NET_HDR_GSO_TCPV4, TCPV6 or UDP. - (Otherwise, it is set to VIRTIO_NET_HDR_GSO_NONE). In this - case, packets larger than 1514 bytes can be transmitted: the - metadata indicates how to replicate the packet header to cut it - into smaller packets. The other gso fields are set: - - hdr_len is a hint to the device as to how much of the header - needs to be kept to copy into each packet, usually set to the - length of the headers, including the transport header.[footnote: -Due to various bugs in implementations, this field is not useful -as a guarantee of the transport header size. -] - - gso_size is the maximum size of each packet beyond that header - (ie. MSS). - - If the driver negotiated the VIRTIO_NET_F_HOST_ECN feature, the - VIRTIO_NET_HDR_GSO_ECN bit may be set in “gso_type” as well, - indicating that the TCP packet has the ECN bit set.[footnote: -This case is not handled by some older hardware, so is called out -specifically in the protocol. -] - - If the driver negotiated the VIRTIO_NET_F_MRG_RXBUF feature, - the num_buffers field is set to zero. - - The header and packet are added as one output buffer to the - transmitq, and the device is notified of the new entry (see [sub:Notifying-The-Device] - ).[footnote: -Note that the header will be two bytes longer for the -VIRTIO_NET_F_MRG_RXBUF case. -] - - Packet Transmission Interrupt - -Often a driver will suppress transmission interrupts using the -VRING_AVAIL_F_NO_INTERRUPT flag (see [sub:Receiving-Used-Buffers] -) and check for used packets in the transmit path of following -packets. However, it will still receive interrupts if the -VIRTIO_F_NOTIFY_ON_EMPTY feature is negotiated, indicating that -the transmission queue is completely emptied. - -The normal behavior in this interrupt handler is to retrieve and -new descriptors from the used ring and free the corresponding -headers and packets. - - Setting Up Receive Buffers - -It is generally a good idea to keep the receive virtqueue as -fully populated as possible: if it runs out, network performance -will suffer. - -If the VIRTIO_NET_F_GUEST_TSO4, VIRTIO_NET_F_GUEST_TSO6 or -VIRTIO_NET_F_GUEST_UFO features are used, the Guest will need to -accept packets of up to 65550 bytes long (the maximum size of a -TCP or UDP packet, plus the 14 byte ethernet header), otherwise -1514 bytes. So unless VIRTIO_NET_F_MRG_RXBUF is negotiated, every -buffer in the receive queue needs to be at least this length [footnote: -Obviously each one can be split across multiple descriptor -elements. -]. - -If VIRTIO_NET_F_MRG_RXBUF is negotiated, each buffer must be at -least the size of the struct virtio_net_hdr. - - Packet Receive Interrupt - -When a packet is copied into a buffer in the receiveq, the -optimal path is to disable further interrupts for the receiveq -(see [sub:Receiving-Used-Buffers]) and process packets until no -more are found, then re-enable them. - -Processing packet involves: - - If the driver negotiated the VIRTIO_NET_F_MRG_RXBUF feature, - then the “num_buffers” field indicates how many descriptors - this packet is spread over (including this one). This allows - receipt of large packets without having to allocate large - buffers. In this case, there will be at least “num_buffers” in - the used ring, and they should be chained together to form a - single packet. The other buffers will not begin with a struct - virtio_net_hdr. - - If the VIRTIO_NET_F_MRG_RXBUF feature was not negotiated, or - the “num_buffers” field is one, then the entire packet will be - contained within this buffer, immediately following the struct - virtio_net_hdr. - - If the VIRTIO_NET_F_GUEST_CSUM feature was negotiated, the - VIRTIO_NET_HDR_F_NEEDS_CSUM bit in the “flags” field may be - set: if so, the checksum on the packet is incomplete and the “ - csum_start” and “csum_offset” fields indicate how to calculate - it (see [ite:csum_start-is-set]). - - If the VIRTIO_NET_F_GUEST_TSO4, TSO6 or UFO options were - negotiated, then the “gso_type” may be something other than - VIRTIO_NET_HDR_GSO_NONE, and the “gso_size” field indicates the - desired MSS (see [enu:If-the-driver]). - - Control Virtqueue - -The driver uses the control virtqueue (if VIRTIO_NET_F_VTRL_VQ is -negotiated) to send commands to manipulate various features of -the device which would not easily map into the configuration -space. - -All commands are of the following form: - -struct virtio_net_ctrl { - - u8 class; - - u8 command; - - u8 command-specific-data[]; - - u8 ack; - -}; - - - -/* ack values */ - -#define VIRTIO_NET_OK 0 - -#define VIRTIO_NET_ERR 1 - -The class, command and command-specific-data are set by the -driver, and the device sets the ack byte. There is little it can -do except issue a diagnostic if the ack byte is not -VIRTIO_NET_OK. - - Packet Receive Filtering - -If the VIRTIO_NET_F_CTRL_RX feature is negotiated, the driver can -send control commands for promiscuous mode, multicast receiving, -and filtering of MAC addresses. - -Note that in general, these commands are best-effort: unwanted -packets may still arrive. - - Setting Promiscuous Mode - -#define VIRTIO_NET_CTRL_RX 0 - - #define VIRTIO_NET_CTRL_RX_PROMISC 0 - - #define VIRTIO_NET_CTRL_RX_ALLMULTI 1 - -The class VIRTIO_NET_CTRL_RX has two commands: -VIRTIO_NET_CTRL_RX_PROMISC turns promiscuous mode on and off, and -VIRTIO_NET_CTRL_RX_ALLMULTI turns all-multicast receive on and -off. The command-specific-data is one byte containing 0 (off) or -1 (on). - - Setting MAC Address Filtering - -struct virtio_net_ctrl_mac { - - u32 entries; - - u8 macs[entries][ETH_ALEN]; - -}; - - - -#define VIRTIO_NET_CTRL_MAC 1 - - #define VIRTIO_NET_CTRL_MAC_TABLE_SET 0 - -The device can filter incoming packets by any number of -destination MAC addresses.[footnote: -Since there are no guarantees, it can use a hash filter -orsilently switch to allmulti or promiscuous mode if it is given -too many addresses. -] This table is set using the class VIRTIO_NET_CTRL_MAC and the -command VIRTIO_NET_CTRL_MAC_TABLE_SET. The command-specific-data -is two variable length tables of 6-byte MAC addresses. The first -table contains unicast addresses, and the second contains -multicast addresses. - - VLAN Filtering - -If the driver negotiates the VIRTION_NET_F_CTRL_VLAN feature, it -can control a VLAN filter table in the device. - -#define VIRTIO_NET_CTRL_VLAN 2 - - #define VIRTIO_NET_CTRL_VLAN_ADD 0 - - #define VIRTIO_NET_CTRL_VLAN_DEL 1 - -Both the VIRTIO_NET_CTRL_VLAN_ADD and VIRTIO_NET_CTRL_VLAN_DEL -command take a 16-bit VLAN id as the command-specific-data. - - Gratuitous Packet Sending - -If the driver negotiates the VIRTIO_NET_F_GUEST_ANNOUNCE (depends -on VIRTIO_NET_F_CTRL_VQ), it can ask the guest to send gratuitous -packets; this is usually done after the guest has been physically -migrated, and needs to announce its presence on the new network -links. (As hypervisor does not have the knowledge of guest -network configuration (eg. tagged vlan) it is simplest to prod -the guest in this way). - -#define VIRTIO_NET_CTRL_ANNOUNCE 3 - - #define VIRTIO_NET_CTRL_ANNOUNCE_ACK 0 - -The Guest needs to check VIRTIO_NET_S_ANNOUNCE bit in status -field when it notices the changes of device configuration. The -command VIRTIO_NET_CTRL_ANNOUNCE_ACK is used to indicate that -driver has recevied the notification and device would clear the -VIRTIO_NET_S_ANNOUNCE bit in the status filed after it received -this command. - -Processing this notification involves: - - Sending the gratuitous packets or marking there are pending - gratuitous packets to be sent and letting deferred routine to - send them. - - Sending VIRTIO_NET_CTRL_ANNOUNCE_ACK command through control - vq. - - . - -Appendix D: Block Device - -The virtio block device is a simple virtual block device (ie. -disk). Read and write requests (and other exotic requests) are -placed in the queue, and serviced (probably out of order) by the -device except where noted. - - Configuration - - Subsystem Device ID 2 - - Virtqueues 0:requestq. - - Feature bits - - VIRTIO_BLK_F_BARRIER (0) Host supports request barriers. - - VIRTIO_BLK_F_SIZE_MAX (1) Maximum size of any single segment is - in “size_max”. - - VIRTIO_BLK_F_SEG_MAX (2) Maximum number of segments in a - request is in “seg_max”. - - VIRTIO_BLK_F_GEOMETRY (4) Disk-style geometry specified in “ - geometry”. - - VIRTIO_BLK_F_RO (5) Device is read-only. - - VIRTIO_BLK_F_BLK_SIZE (6) Block size of disk is in “blk_size”. - - VIRTIO_BLK_F_SCSI (7) Device supports scsi packet commands. - - VIRTIO_BLK_F_FLUSH (9) Cache flush command support. - - Device configuration layout The capacity of the device - (expressed in 512-byte sectors) is always present. The - availability of the others all depend on various feature bits - as indicated above. struct virtio_blk_config { - - u64 capacity; - - u32 size_max; - - u32 seg_max; - - struct virtio_blk_geometry { - - u16 cylinders; - - u8 heads; - - u8 sectors; - - } geometry; - - u32 blk_size; - - - -}; - - Device Initialization - - The device size should be read from the “capacity” - configuration field. No requests should be submitted which goes - beyond this limit. - - If the VIRTIO_BLK_F_BLK_SIZE feature is negotiated, the - blk_size field can be read to determine the optimal sector size - for the driver to use. This does not effect the units used in - the protocol (always 512 bytes), but awareness of the correct - value can effect performance. - - If the VIRTIO_BLK_F_RO feature is set by the device, any write - requests will fail. - - Device Operation - -The driver queues requests to the virtqueue, and they are used by -the device (not necessarily in order). Each request is of form: - -struct virtio_blk_req { - - - - u32 type; - - u32 ioprio; - - u64 sector; - - char data[][512]; - - u8 status; - -}; - -If the device has VIRTIO_BLK_F_SCSI feature, it can also support -scsi packet command requests, each of these requests is of form:struct virtio_scsi_pc_req { - - u32 type; - - u32 ioprio; - - u64 sector; - - char cmd[]; - - char data[][512]; - -#define SCSI_SENSE_BUFFERSIZE 96 - - u8 sense[SCSI_SENSE_BUFFERSIZE]; - - u32 errors; - - u32 data_len; - - u32 sense_len; - - u32 residual; - - u8 status; - -}; - -The type of the request is either a read (VIRTIO_BLK_T_IN), a -write (VIRTIO_BLK_T_OUT), a scsi packet command -(VIRTIO_BLK_T_SCSI_CMD or VIRTIO_BLK_T_SCSI_CMD_OUT[footnote: -the SCSI_CMD and SCSI_CMD_OUT types are equivalent, the device -does not distinguish between them -]) or a flush (VIRTIO_BLK_T_FLUSH or VIRTIO_BLK_T_FLUSH_OUT[footnote: -the FLUSH and FLUSH_OUT types are equivalent, the device does not -distinguish between them -]). If the device has VIRTIO_BLK_F_BARRIER feature the high bit -(VIRTIO_BLK_T_BARRIER) indicates that this request acts as a -barrier and that all preceding requests must be complete before -this one, and all following requests must not be started until -this is complete. Note that a barrier does not flush caches in -the underlying backend device in host, and thus does not serve as -data consistency guarantee. Driver must use FLUSH request to -flush the host cache. - -#define VIRTIO_BLK_T_IN 0 - -#define VIRTIO_BLK_T_OUT 1 - -#define VIRTIO_BLK_T_SCSI_CMD 2 - -#define VIRTIO_BLK_T_SCSI_CMD_OUT 3 - -#define VIRTIO_BLK_T_FLUSH 4 - -#define VIRTIO_BLK_T_FLUSH_OUT 5 - -#define VIRTIO_BLK_T_BARRIER 0x80000000 - -The ioprio field is a hint about the relative priorities of -requests to the device: higher numbers indicate more important -requests. - -The sector number indicates the offset (multiplied by 512) where -the read or write is to occur. This field is unused and set to 0 -for scsi packet commands and for flush commands. - -The cmd field is only present for scsi packet command requests, -and indicates the command to perform. This field must reside in a -single, separate read-only buffer; command length can be derived -from the length of this buffer. - -Note that these first three (four for scsi packet commands) -fields are always read-only: the data field is either read-only -or write-only, depending on the request. The size of the read or -write can be derived from the total size of the request buffers. - -The sense field is only present for scsi packet command requests, -and indicates the buffer for scsi sense data. - -The data_len field is only present for scsi packet command -requests, this field is deprecated, and should be ignored by the -driver. Historically, devices copied data length there. - -The sense_len field is only present for scsi packet command -requests and indicates the number of bytes actually written to -the sense buffer. - -The residual field is only present for scsi packet command -requests and indicates the residual size, calculated as data -length - number of bytes actually transferred. - -The final status byte is written by the device: either -VIRTIO_BLK_S_OK for success, VIRTIO_BLK_S_IOERR for host or guest -error or VIRTIO_BLK_S_UNSUPP for a request unsupported by host:#define VIRTIO_BLK_S_OK 0 - -#define VIRTIO_BLK_S_IOERR 1 - -#define VIRTIO_BLK_S_UNSUPP 2 - -Historically, devices assumed that the fields type, ioprio and -sector reside in a single, separate read-only buffer; the fields -errors, data_len, sense_len and residual reside in a single, -separate write-only buffer; the sense field in a separate -write-only buffer of size 96 bytes, by itself; the fields errors, -data_len, sense_len and residual in a single write-only buffer; -and the status field is a separate read-only buffer of size 1 -byte, by itself. - -Appendix E: Console Device - -The virtio console device is a simple device for data input and -output. A device may have one or more ports. Each port has a pair -of input and output virtqueues. Moreover, a device has a pair of -control IO virtqueues. The control virtqueues are used to -communicate information between the device and the driver about -ports being opened and closed on either side of the connection, -indication from the host about whether a particular port is a -console port, adding new ports, port hot-plug/unplug, etc., and -indication from the guest about whether a port or a device was -successfully added, port open/close, etc.. For data IO, one or -more empty buffers are placed in the receive queue for incoming -data and outgoing characters are placed in the transmit queue. - - Configuration - - Subsystem Device ID 3 - - Virtqueues 0:receiveq(port0). 1:transmitq(port0), 2:control - receiveq[footnote: -Ports 2 onwards only if VIRTIO_CONSOLE_F_MULTIPORT is set -], 3:control transmitq, 4:receiveq(port1), 5:transmitq(port1), - ... - - Feature bits - - VIRTIO_CONSOLE_F_SIZE (0) Configuration cols and rows fields - are valid. - - VIRTIO_CONSOLE_F_MULTIPORT(1) Device has support for multiple - ports; configuration fields nr_ports and max_nr_ports are - valid and control virtqueues will be used. - - Device configuration layout The size of the console is supplied - in the configuration space if the VIRTIO_CONSOLE_F_SIZE feature - is set. Furthermore, if the VIRTIO_CONSOLE_F_MULTIPORT feature - is set, the maximum number of ports supported by the device can - be fetched.struct virtio_console_config { - - u16 cols; - - u16 rows; - - - - u32 max_nr_ports; - -}; - - Device Initialization - - If the VIRTIO_CONSOLE_F_SIZE feature is negotiated, the driver - can read the console dimensions from the configuration fields. - - If the VIRTIO_CONSOLE_F_MULTIPORT feature is negotiated, the - driver can spawn multiple ports, not all of which may be - attached to a console. Some could be generic ports. In this - case, the control virtqueues are enabled and according to the - max_nr_ports configuration-space value, the appropriate number - of virtqueues are created. A control message indicating the - driver is ready is sent to the host. The host can then send - control messages for adding new ports to the device. After - creating and initializing each port, a - VIRTIO_CONSOLE_PORT_READY control message is sent to the host - for that port so the host can let us know of any additional - configuration options set for that port. - - The receiveq for each port is populated with one or more - receive buffers. - - Device Operation - - For output, a buffer containing the characters is placed in the - port's transmitq.[footnote: -Because this is high importance and low bandwidth, the current -Linux implementation polls for the buffer to be used, rather than -waiting for an interrupt, simplifying the implementation -significantly. However, for generic serial ports with the -O_NONBLOCK flag set, the polling limitation is relaxed and the -consumed buffers are freed upon the next write or poll call or -when a port is closed or hot-unplugged. -] - - When a buffer is used in the receiveq (signalled by an - interrupt), the contents is the input to the port associated - with the virtqueue for which the notification was received. - - If the driver negotiated the VIRTIO_CONSOLE_F_SIZE feature, a - configuration change interrupt may occur. The updated size can - be read from the configuration fields. - - If the driver negotiated the VIRTIO_CONSOLE_F_MULTIPORT - feature, active ports are announced by the host using the - VIRTIO_CONSOLE_PORT_ADD control message. The same message is - used for port hot-plug as well. - - If the host specified a port `name', a sysfs attribute is - created with the name filled in, so that udev rules can be - written that can create a symlink from the port's name to the - char device for port discovery by applications in the guest. - - Changes to ports' state are effected by control messages. - Appropriate action is taken on the port indicated in the - control message. The layout of the structure of the control - buffer and the events associated are:struct virtio_console_control { - - uint32_t id; /* Port number */ - - uint16_t event; /* The kind of control event */ - - uint16_t value; /* Extra information for the event */ - -}; - - - -/* Some events for the internal messages (control packets) */ - - - -#define VIRTIO_CONSOLE_DEVICE_READY 0 - -#define VIRTIO_CONSOLE_PORT_ADD 1 - -#define VIRTIO_CONSOLE_PORT_REMOVE 2 - -#define VIRTIO_CONSOLE_PORT_READY 3 - -#define VIRTIO_CONSOLE_CONSOLE_PORT 4 - -#define VIRTIO_CONSOLE_RESIZE 5 - -#define VIRTIO_CONSOLE_PORT_OPEN 6 - -#define VIRTIO_CONSOLE_PORT_NAME 7 - -Appendix F: Entropy Device - -The virtio entropy device supplies high-quality randomness for -guest use. - - Configuration - - Subsystem Device ID 4 - - Virtqueues 0:requestq. - - Feature bits None currently defined - - Device configuration layout None currently defined. - - Device Initialization - - The virtqueue is initialized - - Device Operation - -When the driver requires random bytes, it places the descriptor -of one or more buffers in the queue. It will be completely filled -by random data by the device. - -Appendix G: Memory Balloon Device - -The virtio memory balloon device is a primitive device for -managing guest memory: the device asks for a certain amount of -memory, and the guest supplies it (or withdraws it, if the device -has more than it asks for). This allows the guest to adapt to -changes in allowance of underlying physical memory. If the -feature is negotiated, the device can also be used to communicate -guest memory statistics to the host. - - Configuration - - Subsystem Device ID 5 - - Virtqueues 0:inflateq. 1:deflateq. 2:statsq.[footnote: -Only if VIRTIO_BALLON_F_STATS_VQ set -] - - Feature bits - - VIRTIO_BALLOON_F_MUST_TELL_HOST (0) Host must be told before - pages from the balloon are used. - - VIRTIO_BALLOON_F_STATS_VQ (1) A virtqueue for reporting guest - memory statistics is present. - - Device configuration layout Both fields of this configuration - are always available. Note that they are little endian, despite - convention that device fields are guest endian:struct virtio_balloon_config { - - u32 num_pages; - - u32 actual; - -}; - - Device Initialization - - The inflate and deflate virtqueues are identified. - - If the VIRTIO_BALLOON_F_STATS_VQ feature bit is negotiated: - - Identify the stats virtqueue. - - Add one empty buffer to the stats virtqueue and notify the - host. - -Device operation begins immediately. - - Device Operation - - Memory Ballooning The device is driven by the receipt of a - configuration change interrupt. - - The “num_pages” configuration field is examined. If this is - greater than the “actual” number of pages, memory must be given - to the balloon. If it is less than the “actual” number of - pages, memory may be taken back from the balloon for general - use. - - To supply memory to the balloon (aka. inflate): - - The driver constructs an array of addresses of unused memory - pages. These addresses are divided by 4096[footnote: -This is historical, and independent of the guest page size -] and the descriptor describing the resulting 32-bit array is - added to the inflateq. - - To remove memory from the balloon (aka. deflate): - - The driver constructs an array of addresses of memory pages it - has previously given to the balloon, as described above. This - descriptor is added to the deflateq. - - If the VIRTIO_BALLOON_F_MUST_TELL_HOST feature is set, the - guest may not use these requested pages until that descriptor - in the deflateq has been used by the device. - - Otherwise, the guest may begin to re-use pages previously given - to the balloon before the device has acknowledged their - withdrawl. [footnote: -In this case, deflation advice is merely a courtesy -] - - In either case, once the device has completed the inflation or - deflation, the “actual” field of the configuration should be - updated to reflect the new number of pages in the balloon.[footnote: -As updates to configuration space are not atomic, this field -isn't particularly reliable, but can be used to diagnose buggy -guests. -] - - Memory Statistics - -The stats virtqueue is atypical because communication is driven -by the device (not the driver). The channel becomes active at -driver initialization time when the driver adds an empty buffer -and notifies the device. A request for memory statistics proceeds -as follows: - - The device pushes the buffer onto the used ring and sends an - interrupt. - - The driver pops the used buffer and discards it. - - The driver collects memory statistics and writes them into a - new buffer. - - The driver adds the buffer to the virtqueue and notifies the - device. - - The device pops the buffer (retaining it to initiate a - subsequent request) and consumes the statistics. - - Memory Statistics Format Each statistic consists of a 16 bit - tag and a 64 bit value. Both quantities are represented in the - native endian of the guest. All statistics are optional and the - driver may choose which ones to supply. To guarantee backwards - compatibility, unsupported statistics should be omitted. - - struct virtio_balloon_stat { - -#define VIRTIO_BALLOON_S_SWAP_IN 0 - -#define VIRTIO_BALLOON_S_SWAP_OUT 1 - -#define VIRTIO_BALLOON_S_MAJFLT 2 - -#define VIRTIO_BALLOON_S_MINFLT 3 - -#define VIRTIO_BALLOON_S_MEMFREE 4 - -#define VIRTIO_BALLOON_S_MEMTOT 5 - - u16 tag; - - u64 val; - -} __attribute__((packed)); - - Tags - - VIRTIO_BALLOON_S_SWAP_IN The amount of memory that has been - swapped in (in bytes). - - VIRTIO_BALLOON_S_SWAP_OUT The amount of memory that has been - swapped out to disk (in bytes). - - VIRTIO_BALLOON_S_MAJFLT The number of major page faults that - have occurred. - - VIRTIO_BALLOON_S_MINFLT The number of minor page faults that - have occurred. - - VIRTIO_BALLOON_S_MEMFREE The amount of memory not being used - for any purpose (in bytes). - - VIRTIO_BALLOON_S_MEMTOT The total amount of memory available - (in bytes). - -Appendix H: Rpmsg: Remote Processor Messaging - -Virtio rpmsg devices represent remote processors on the system -which run in asymmetric multi-processing (AMP) configuration, and -which are usually used to offload cpu-intensive tasks from the -main application processor (a typical SoC methodology). - -Virtio is being used to communicate with those remote processors; -empty buffers are placed in one virtqueue for receiving messages, -and non-empty buffers, containing outbound messages, are enqueued -in a second virtqueue for transmission. - -Numerous communication channels can be multiplexed over those two -virtqueues, so different entities, running on the application and -remote processor, can directly communicate in a point-to-point -fashion. - - Configuration - - Subsystem Device ID 7 - - Virtqueues 0:receiveq. 1:transmitq. - - Feature bits - - VIRTIO_RPMSG_F_NS (0) Device sends (and capable of receiving) - name service messages announcing the creation (or - destruction) of a channel:/** - - * struct rpmsg_ns_msg - dynamic name service announcement -message - - * @name: name of remote service that is published - - * @addr: address of remote service that is published - - * @flags: indicates whether service is created or destroyed - - * - - * This message is sent across to publish a new service (or -announce - - * about its removal). When we receives these messages, an -appropriate - - * rpmsg channel (i.e device) is created/destroyed. - - */ - -struct rpmsg_ns_msgoon_config { - - char name[RPMSG_NAME_SIZE]; - - u32 addr; - - u32 flags; - -} __packed; - - - -/** - - * enum rpmsg_ns_flags - dynamic name service announcement flags - - * - - * @RPMSG_NS_CREATE: a new remote service was just created - - * @RPMSG_NS_DESTROY: a remote service was just destroyed - - */ - -enum rpmsg_ns_flags { - - RPMSG_NS_CREATE = 0, - - RPMSG_NS_DESTROY = 1, - -}; - - Device configuration layout - -At his point none currently defined. - - Device Initialization - - The initialization routine should identify the receive and - transmission virtqueues. - - The receive virtqueue should be filled with receive buffers. - - Device Operation - -Messages are transmitted by placing them in the transmitq, and -buffers for inbound messages are placed in the receiveq. In any -case, messages are always preceded by the following header: /** - - * struct rpmsg_hdr - common header for all rpmsg messages - - * @src: source address - - * @dst: destination address - - * @reserved: reserved for future use - - * @len: length of payload (in bytes) - - * @flags: message flags - - * @data: @len bytes of message payload data - - * - - * Every message sent(/received) on the rpmsg bus begins with -this header. - - */ - -struct rpmsg_hdr { - - u32 src; - - u32 dst; - - u32 reserved; - - u16 len; - - u16 flags; - - u8 data[0]; - -} __packed; - -Appendix I: SCSI Host Device - -The virtio SCSI host device groups together one or more virtual -logical units (such as disks), and allows communicating to them -using the SCSI protocol. An instance of the device represents a -SCSI host to which many targets and LUNs are attached. - -The virtio SCSI device services two kinds of requests: - - command requests for a logical unit; - - task management functions related to a logical unit, target or - command. - -The device is also able to send out notifications about added and -removed logical units. Together, these capabilities provide a -SCSI transport protocol that uses virtqueues as the transfer -medium. In the transport protocol, the virtio driver acts as the -initiator, while the virtio SCSI host provides one or more -targets that receive and process the requests. - - Configuration - - Subsystem Device ID 8 - - Virtqueues 0:controlq; 1:eventq; 2..n:request queues. - - Feature bits - - VIRTIO_SCSI_F_INOUT (0) A single request can include both - read-only and write-only data buffers. - - VIRTIO_SCSI_F_HOTPLUG (1) The host should enable - hot-plug/hot-unplug of new LUNs and targets on the SCSI bus. - - Device configuration layout All fields of this configuration - are always available. sense_size and cdb_size are writable by - the guest.struct virtio_scsi_config { - - u32 num_queues; - - u32 seg_max; - - u32 max_sectors; - - u32 cmd_per_lun; - - u32 event_info_size; - - u32 sense_size; - - u32 cdb_size; - - u16 max_channel; - - u16 max_target; - - u32 max_lun; - -}; - - num_queues is the total number of request virtqueues exposed by - the device. The driver is free to use only one request queue, - or it can use more to achieve better performance. - - seg_max is the maximum number of segments that can be in a - command. A bidirectional command can include seg_max input - segments and seg_max output segments. - - max_sectors is a hint to the guest about the maximum transfer - size it should use. - - cmd_per_lun is a hint to the guest about the maximum number of - linked commands it should send to one LUN. The actual value - to be used is the minimum of cmd_per_lun and the virtqueue - size. - - event_info_size is the maximum size that the device will fill - for buffers that the driver places in the eventq. The driver - should always put buffers at least of this size. It is - written by the device depending on the set of negotated - features. - - sense_size is the maximum size of the sense data that the - device will write. The default value is written by the device - and will always be 96, but the driver can modify it. It is - restored to the default when the device is reset. - - cdb_size is the maximum size of the CDB that the driver will - write. The default value is written by the device and will - always be 32, but the driver can likewise modify it. It is - restored to the default when the device is reset. - - max_channel, max_target and max_lun can be used by the driver - as hints to constrain scanning the logical units on the - host.h - - Device Initialization - -The initialization routine should first of all discover the -device's virtqueues. - -If the driver uses the eventq, it should then place at least a -buffer in the eventq. - -The driver can immediately issue requests (for example, INQUIRY -or REPORT LUNS) or task management functions (for example, I_T -RESET). - - Device Operation: request queues - -The driver queues requests to an arbitrary request queue, and -they are used by the device on that same queue. It is the -responsibility of the driver to ensure strict request ordering -for commands placed on different queues, because they will be -consumed with no order constraints. - -Requests have the following format: - -struct virtio_scsi_req_cmd { - - // Read-only - - u8 lun[8]; - - u64 id; - - u8 task_attr; - - u8 prio; - - u8 crn; - - char cdb[cdb_size]; - - char dataout[]; - - // Write-only part - - u32 sense_len; - - u32 residual; - - u16 status_qualifier; - - u8 status; - - u8 response; - - u8 sense[sense_size]; - - char datain[]; - -}; - - - -/* command-specific response values */ - -#define VIRTIO_SCSI_S_OK 0 - -#define VIRTIO_SCSI_S_OVERRUN 1 - -#define VIRTIO_SCSI_S_ABORTED 2 - -#define VIRTIO_SCSI_S_BAD_TARGET 3 - -#define VIRTIO_SCSI_S_RESET 4 - -#define VIRTIO_SCSI_S_BUSY 5 - -#define VIRTIO_SCSI_S_TRANSPORT_FAILURE 6 - -#define VIRTIO_SCSI_S_TARGET_FAILURE 7 - -#define VIRTIO_SCSI_S_NEXUS_FAILURE 8 - -#define VIRTIO_SCSI_S_FAILURE 9 - - - -/* task_attr */ - -#define VIRTIO_SCSI_S_SIMPLE 0 - -#define VIRTIO_SCSI_S_ORDERED 1 - -#define VIRTIO_SCSI_S_HEAD 2 - -#define VIRTIO_SCSI_S_ACA 3 - -The lun field addresses a target and logical unit in the -virtio-scsi device's SCSI domain. The only supported format for -the LUN field is: first byte set to 1, second byte set to target, -third and fourth byte representing a single level LUN structure, -followed by four zero bytes. With this representation, a -virtio-scsi device can serve up to 256 targets and 16384 LUNs per -target. - -The id field is the command identifier (“tag”). - -task_attr, prio and crn should be left to zero. task_attr defines -the task attribute as in the table above, but all task attributes -may be mapped to SIMPLE by the device; crn may also be provided -by clients, but is generally expected to be 0. The maximum CRN -value defined by the protocol is 255, since CRN is stored in an -8-bit integer. - -All of these fields are defined in SAM. They are always -read-only, as are the cdb and dataout field. The cdb_size is -taken from the configuration space. - -sense and subsequent fields are always write-only. The sense_len -field indicates the number of bytes actually written to the sense -buffer. The residual field indicates the residual size, -calculated as “data_length - number_of_transferred_bytes”, for -read or write operations. For bidirectional commands, the -number_of_transferred_bytes includes both read and written bytes. -A residual field that is less than the size of datain means that -the dataout field was processed entirely. A residual field that -exceeds the size of datain means that the dataout field was -processed partially and the datain field was not processed at -all. - -The status byte is written by the device to be the status code as -defined in SAM. - -The response byte is written by the device to be one of the -following: - - VIRTIO_SCSI_S_OK when the request was completed and the status - byte is filled with a SCSI status code (not necessarily - "GOOD"). - - VIRTIO_SCSI_S_OVERRUN if the content of the CDB requires - transferring more data than is available in the data buffers. - - VIRTIO_SCSI_S_ABORTED if the request was cancelled due to an - ABORT TASK or ABORT TASK SET task management function. - - VIRTIO_SCSI_S_BAD_TARGET if the request was never processed - because the target indicated by the lun field does not exist. - - VIRTIO_SCSI_S_RESET if the request was cancelled due to a bus - or device reset (including a task management function). - - VIRTIO_SCSI_S_TRANSPORT_FAILURE if the request failed due to a - problem in the connection between the host and the target - (severed link). - - VIRTIO_SCSI_S_TARGET_FAILURE if the target is suffering a - failure and the guest should not retry on other paths. - - VIRTIO_SCSI_S_NEXUS_FAILURE if the nexus is suffering a failure - but retrying on other paths might yield a different result. - - VIRTIO_SCSI_S_BUSY if the request failed but retrying on the - same path should work. - - VIRTIO_SCSI_S_FAILURE for other host or guest error. In - particular, if neither dataout nor datain is empty, and the - VIRTIO_SCSI_F_INOUT feature has not been negotiated, the - request will be immediately returned with a response equal to - VIRTIO_SCSI_S_FAILURE. - - Device Operation: controlq - -The controlq is used for other SCSI transport operations. -Requests have the following format: - -struct virtio_scsi_ctrl { - - u32 type; - - ... - - u8 response; - -}; - - - -/* response values valid for all commands */ - -#define VIRTIO_SCSI_S_OK 0 - -#define VIRTIO_SCSI_S_BAD_TARGET 3 - -#define VIRTIO_SCSI_S_BUSY 5 - -#define VIRTIO_SCSI_S_TRANSPORT_FAILURE 6 - -#define VIRTIO_SCSI_S_TARGET_FAILURE 7 - -#define VIRTIO_SCSI_S_NEXUS_FAILURE 8 - -#define VIRTIO_SCSI_S_FAILURE 9 - -#define VIRTIO_SCSI_S_INCORRECT_LUN 12 - -The type identifies the remaining fields. - -The following commands are defined: - - Task management function -#define VIRTIO_SCSI_T_TMF 0 - - - -#define VIRTIO_SCSI_T_TMF_ABORT_TASK 0 - -#define VIRTIO_SCSI_T_TMF_ABORT_TASK_SET 1 - -#define VIRTIO_SCSI_T_TMF_CLEAR_ACA 2 - -#define VIRTIO_SCSI_T_TMF_CLEAR_TASK_SET 3 - -#define VIRTIO_SCSI_T_TMF_I_T_NEXUS_RESET 4 - -#define VIRTIO_SCSI_T_TMF_LOGICAL_UNIT_RESET 5 - -#define VIRTIO_SCSI_T_TMF_QUERY_TASK 6 - -#define VIRTIO_SCSI_T_TMF_QUERY_TASK_SET 7 - - - -struct virtio_scsi_ctrl_tmf - -{ - - // Read-only part - - u32 type; - - u32 subtype; - - u8 lun[8]; - - u64 id; - - // Write-only part - - u8 response; - -} - - - -/* command-specific response values */ - -#define VIRTIO_SCSI_S_FUNCTION_COMPLETE 0 - -#define VIRTIO_SCSI_S_FUNCTION_SUCCEEDED 10 - -#define VIRTIO_SCSI_S_FUNCTION_REJECTED 11 - - The type is VIRTIO_SCSI_T_TMF; the subtype field defines. All - fields except response are filled by the driver. The subtype - field must always be specified and identifies the requested - task management function. - - Other fields may be irrelevant for the requested TMF; if so, - they are ignored but they should still be present. The lun - field is in the same format specified for request queues; the - single level LUN is ignored when the task management function - addresses a whole I_T nexus. When relevant, the value of the id - field is matched against the id values passed on the requestq. - - The outcome of the task management function is written by the - device in the response field. The command-specific response - values map 1-to-1 with those defined in SAM. - - Asynchronous notification query -#define VIRTIO_SCSI_T_AN_QUERY 1 - - - -struct virtio_scsi_ctrl_an { - - // Read-only part - - u32 type; - - u8 lun[8]; - - u32 event_requested; - - // Write-only part - - u32 event_actual; - - u8 response; - -} - - - -#define VIRTIO_SCSI_EVT_ASYNC_OPERATIONAL_CHANGE 2 - -#define VIRTIO_SCSI_EVT_ASYNC_POWER_MGMT 4 - -#define VIRTIO_SCSI_EVT_ASYNC_EXTERNAL_REQUEST 8 - -#define VIRTIO_SCSI_EVT_ASYNC_MEDIA_CHANGE 16 - -#define VIRTIO_SCSI_EVT_ASYNC_MULTI_HOST 32 - -#define VIRTIO_SCSI_EVT_ASYNC_DEVICE_BUSY 64 - - By sending this command, the driver asks the device which - events the given LUN can report, as described in paragraphs 6.6 - and A.6 of the SCSI MMC specification. The driver writes the - events it is interested in into the event_requested; the device - responds by writing the events that it supports into - event_actual. - - The type is VIRTIO_SCSI_T_AN_QUERY. The lun and event_requested - fields are written by the driver. The event_actual and response - fields are written by the device. - - No command-specific values are defined for the response byte. - - Asynchronous notification subscription -#define VIRTIO_SCSI_T_AN_SUBSCRIBE 2 - - - -struct virtio_scsi_ctrl_an { - - // Read-only part - - u32 type; - - u8 lun[8]; - - u32 event_requested; - - // Write-only part - - u32 event_actual; - - u8 response; - -} - - By sending this command, the driver asks the specified LUN to - report events for its physical interface, again as described in - the SCSI MMC specification. The driver writes the events it is - interested in into the event_requested; the device responds by - writing the events that it supports into event_actual. - - Event types are the same as for the asynchronous notification - query message. - - The type is VIRTIO_SCSI_T_AN_SUBSCRIBE. The lun and - event_requested fields are written by the driver. The - event_actual and response fields are written by the device. - - No command-specific values are defined for the response byte. - - Device Operation: eventq - -The eventq is used by the device to report information on logical -units that are attached to it. The driver should always leave a -few buffers ready in the eventq. In general, the device will not -queue events to cope with an empty eventq, and will end up -dropping events if it finds no buffer ready. However, when -reporting events for many LUNs (e.g. when a whole target -disappears), the device can throttle events to avoid dropping -them. For this reason, placing 10-15 buffers on the event queue -should be enough. - -Buffers are placed in the eventq and filled by the device when -interesting events occur. The buffers should be strictly -write-only (device-filled) and the size of the buffers should be -at least the value given in the device's configuration -information. - -Buffers returned by the device on the eventq will be referred to -as "events" in the rest of this section. Events have the -following format: - -#define VIRTIO_SCSI_T_EVENTS_MISSED 0x80000000 - - - -struct virtio_scsi_event { - - // Write-only part - - u32 event; - - ... - -} - -If bit 31 is set in the event field, the device failed to report -an event due to missing buffers. In this case, the driver should -poll the logical units for unit attention conditions, and/or do -whatever form of bus scan is appropriate for the guest operating -system. - -Other data that the device writes to the buffer depends on the -contents of the event field. The following events are defined: - - No event -#define VIRTIO_SCSI_T_NO_EVENT 0 - - This event is fired in the following cases: - - When the device detects in the eventq a buffer that is shorter - than what is indicated in the configuration field, it might - use it immediately and put this dummy value in the event - field. A well-written driver will never observe this - situation. - - When events are dropped, the device may signal this event as - soon as the drivers makes a buffer available, in order to - request action from the driver. In this case, of course, this - event will be reported with the VIRTIO_SCSI_T_EVENTS_MISSED - flag. - - Transport reset -#define VIRTIO_SCSI_T_TRANSPORT_RESET 1 - - - -struct virtio_scsi_event_reset { - - // Write-only part - - u32 event; - - u8 lun[8]; - - u32 reason; - -} - - - -#define VIRTIO_SCSI_EVT_RESET_HARD 0 - -#define VIRTIO_SCSI_EVT_RESET_RESCAN 1 - -#define VIRTIO_SCSI_EVT_RESET_REMOVED 2 - - By sending this event, the device signals that a logical unit - on a target has been reset, including the case of a new device - appearing or disappearing on the bus.The device fills in all - fields. The event field is set to - VIRTIO_SCSI_T_TRANSPORT_RESET. The lun field addresses a - logical unit in the SCSI host. - - The reason value is one of the three #define values appearing - above: - - VIRTIO_SCSI_EVT_RESET_REMOVED (“LUN/target removed”) is used if - the target or logical unit is no longer able to receive - commands. - - VIRTIO_SCSI_EVT_RESET_HARD (“LUN hard reset”) is used if the - logical unit has been reset, but is still present. - - VIRTIO_SCSI_EVT_RESET_RESCAN (“rescan LUN/target”) is used if a - target or logical unit has just appeared on the device. - - The “removed” and “rescan” events, when sent for LUN 0, may - apply to the entire target. After receiving them the driver - should ask the initiator to rescan the target, in order to - detect the case when an entire target has appeared or - disappeared. These two events will never be reported unless the - VIRTIO_SCSI_F_HOTPLUG feature was negotiated between the host - and the guest. - - Events will also be reported via sense codes (this obviously - does not apply to newly appeared buses or targets, since the - application has never discovered them): - - “LUN/target removed” maps to sense key ILLEGAL REQUEST, asc - 0x25, ascq 0x00 (LOGICAL UNIT NOT SUPPORTED) - - “LUN hard reset” maps to sense key UNIT ATTENTION, asc 0x29 - (POWER ON, RESET OR BUS DEVICE RESET OCCURRED) - - “rescan LUN/target” maps to sense key UNIT ATTENTION, asc 0x3f, - ascq 0x0e (REPORTED LUNS DATA HAS CHANGED) - - The preferred way to detect transport reset is always to use - events, because sense codes are only seen by the driver when it - sends a SCSI command to the logical unit or target. However, in - case events are dropped, the initiator will still be able to - synchronize with the actual state of the controller if the - driver asks the initiator to rescan of the SCSI bus. During the - rescan, the initiator will be able to observe the above sense - codes, and it will process them as if it the driver had - received the equivalent event. - - Asynchronous notification -#define VIRTIO_SCSI_T_ASYNC_NOTIFY 2 - - - -struct virtio_scsi_event_an { - - // Write-only part - - u32 event; - - u8 lun[8]; - - u32 reason; - -} - - By sending this event, the device signals that an asynchronous - event was fired from a physical interface. - - All fields are written by the device. The event field is set to - VIRTIO_SCSI_T_ASYNC_NOTIFY. The lun field addresses a logical - unit in the SCSI host. The reason field is a subset of the - events that the driver has subscribed to via the "Asynchronous - notification subscription" command. - - When dropped events are reported, the driver should poll for - asynchronous events manually using SCSI commands. - -Appendix X: virtio-mmio - -Virtual environments without PCI support (a common situation in -embedded devices models) might use simple memory mapped device (“ -virtio-mmio”) instead of the PCI device. - -The memory mapped virtio device behaviour is based on the PCI -device specification. Therefore most of operations like device -initialization, queues configuration and buffer transfers are -nearly identical. Existing differences are described in the -following sections. - - Device Initialization - -Instead of using the PCI IO space for virtio header, the “ -virtio-mmio” device provides a set of memory mapped control -registers, all 32 bits wide, followed by device-specific -configuration space. The following list presents their layout: - - Offset from the device base address | Direction | Name - Description - - 0x000 | R | MagicValue - “virt” string. - - 0x004 | R | Version - Device version number. Currently must be 1. - - 0x008 | R | DeviceID - Virtio Subsystem Device ID (ie. 1 for network card). - - 0x00c | R | VendorID - Virtio Subsystem Vendor ID. - - 0x010 | R | HostFeatures - Flags representing features the device supports. - Reading from this register returns 32 consecutive flag bits, - first bit depending on the last value written to - HostFeaturesSel register. Access to this register returns bits HostFeaturesSel*32 - - to (HostFeaturesSel*32)+31 -, eg. feature bits 0 to 31 if - HostFeaturesSel is set to 0 and features bits 32 to 63 if - HostFeaturesSel is set to 1. Also see [sub:Feature-Bits] - - 0x014 | W | HostFeaturesSel - Device (Host) features word selection. - Writing to this register selects a set of 32 device feature bits - accessible by reading from HostFeatures register. Device driver - must write a value to the HostFeaturesSel register before - reading from the HostFeatures register. - - 0x020 | W | GuestFeatures - Flags representing device features understood and activated by - the driver. - Writing to this register sets 32 consecutive flag bits, first - bit depending on the last value written to GuestFeaturesSel - register. Access to this register sets bits GuestFeaturesSel*32 - - to (GuestFeaturesSel*32)+31 -, eg. feature bits 0 to 31 if - GuestFeaturesSel is set to 0 and features bits 32 to 63 if - GuestFeaturesSel is set to 1. Also see [sub:Feature-Bits] - - 0x024 | W | GuestFeaturesSel - Activated (Guest) features word selection. - Writing to this register selects a set of 32 activated feature - bits accessible by writing to the GuestFeatures register. - Device driver must write a value to the GuestFeaturesSel - register before writing to the GuestFeatures register. - - 0x028 | W | GuestPageSize - Guest page size. - Device driver must write the guest page size in bytes to the - register during initialization, before any queues are used. - This value must be a power of 2 and is used by the Host to - calculate Guest address of the first queue page (see QueuePFN). - - 0x030 | W | QueueSel - Virtual queue index (first queue is 0). - Writing to this register selects the virtual queue that the - following operations on QueueNum, QueueAlign and QueuePFN apply - to. - - 0x034 | R | QueueNumMax - Maximum virtual queue size. - Reading from the register returns the maximum size of the queue - the Host is ready to process or zero (0x0) if the queue is not - available. This applies to the queue selected by writing to - QueueSel and is allowed only when QueuePFN is set to zero - (0x0), so when the queue is not actively used. - - 0x038 | W | QueueNum - Virtual queue size. - Queue size is a number of elements in the queue, therefore size - of the descriptor table and both available and used rings. - Writing to this register notifies the Host what size of the - queue the Guest will use. This applies to the queue selected by - writing to QueueSel. - - 0x03c | W | QueueAlign - Used Ring alignment in the virtual queue. - Writing to this register notifies the Host about alignment - boundary of the Used Ring in bytes. This value must be a power - of 2 and applies to the queue selected by writing to QueueSel. - - 0x040 | RW | QueuePFN - Guest physical page number of the virtual queue. - Writing to this register notifies the host about location of the - virtual queue in the Guest's physical address space. This value - is the index number of a page starting with the queue - Descriptor Table. Value zero (0x0) means physical address zero - (0x00000000) and is illegal. When the Guest stops using the - queue it must write zero (0x0) to this register. - Reading from this register returns the currently used page - number of the queue, therefore a value other than zero (0x0) - means that the queue is in use. - Both read and write accesses apply to the queue selected by - writing to QueueSel. - - 0x050 | W | QueueNotify - Queue notifier. - Writing a queue index to this register notifies the Host that - there are new buffers to process in the queue. - - 0x60 | R | InterruptStatus -Interrupt status. -Reading from this register returns a bit mask of interrupts - asserted by the device. An interrupt is asserted if the - corresponding bit is set, ie. equals one (1). - - Bit 0 | Used Ring Update -This interrupt is asserted when the Host has updated the Used - Ring in at least one of the active virtual queues. - - Bit 1 | Configuration change -This interrupt is asserted when configuration of the device has - changed. - - 0x064 | W | InterruptACK - Interrupt acknowledge. - Writing to this register notifies the Host that the Guest - finished handling interrupts. Set bits in the value clear the - corresponding bits of the InterruptStatus register. - - 0x070 | RW | Status - Device status. - Reading from this register returns the current device status - flags. - Writing non-zero values to this register sets the status flags, - indicating the Guest progress. Writing zero (0x0) to this - register triggers a device reset. - Also see [sub:Device-Initialization-Sequence] - - 0x100+ | RW | Config - Device-specific configuration space starts at an offset 0x100 - and is accessed with byte alignment. Its meaning and size - depends on the device and the driver. - -Virtual queue size is a number of elements in the queue, -therefore size of the descriptor table and both available and -used rings. - -The endianness of the registers follows the native endianness of -the Guest. Writing to registers described as “R” and reading from -registers described as “W” is not permitted and can cause -undefined behavior. - -The device initialization is performed as described in [sub:Device-Initialization-Sequence] - with one exception: the Guest must notify the Host about its -page size, writing the size in bytes to GuestPageSize register -before the initialization is finished. - -The memory mapped virtio devices generate single interrupt only, -therefore no special configuration is required. - - Virtqueue Configuration - -The virtual queue configuration is performed in a similar way to -the one described in [sec:Virtqueue-Configuration] with a few -additional operations: - - Select the queue writing its index (first queue is 0) to the - QueueSel register. - - Check if the queue is not already in use: read QueuePFN - register, returned value should be zero (0x0). - - Read maximum queue size (number of elements) from the - QueueNumMax register. If the returned value is zero (0x0) the - queue is not available. - - Allocate and zero the queue pages in contiguous virtual memory, - aligning the Used Ring to an optimal boundary (usually page - size). Size of the allocated queue may be smaller than or equal - to the maximum size returned by the Host. - - Notify the Host about the queue size by writing the size to - QueueNum register. - - Notify the Host about the used alignment by writing its value - in bytes to QueueAlign register. - - Write the physical number of the first page of the queue to the - QueuePFN register. - -The queue and the device are ready to begin normal operations -now. - - Device Operation - -The memory mapped virtio device behaves in the same way as -described in [sec:Device-Operation], with the following -exceptions: - - The device is notified about new buffers available in a queue - by writing the queue index to register QueueNum instead of the - virtio header in PCI I/O space ([sub:Notifying-The-Device]). - - The memory mapped virtio device is using single, dedicated - interrupt signal, which is raised when at least one of the - interrupts described in the InterruptStatus register - description is asserted. After receiving an interrupt, the - driver must read the InterruptStatus register to check what - caused the interrupt (see the register description). After the - interrupt is handled, the driver must acknowledge it by writing - a bit mask corresponding to the serviced interrupt to the - InterruptACK register. - diff --git a/MAINTAINERS b/MAINTAINERS index ee468fac7dbf..59d708f29249 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3919,7 +3919,6 @@ F: drivers/i2c/i2c-stub.c I2C SUBSYSTEM M: Wolfram Sang <wsa@the-dreams.de> -M: "Ben Dooks (embedded platforms)" <ben-linux@fluff.org> L: linux-i2c@vger.kernel.org W: http://i2c.wiki.kernel.org/ T: git git://git.kernel.org/pub/scm/linux/kernel/git/wsa/linux.git @@ -8743,6 +8742,7 @@ F: drivers/virtio/ F: drivers/net/virtio_net.c F: drivers/block/virtio_blk.c F: include/linux/virtio_*.h +F: include/uapi/linux/virtio_*.h VIRTIO HOST (VHOST) M: "Michael S. Tsirkin" <mst@redhat.com> diff --git a/arch/arm/mach-s3c24xx/mach-rx1950.c b/arch/arm/mach-s3c24xx/mach-rx1950.c index 1f9ba2ae5288..43f3ac5a1c7a 100644 --- a/arch/arm/mach-s3c24xx/mach-rx1950.c +++ b/arch/arm/mach-s3c24xx/mach-rx1950.c @@ -56,7 +56,6 @@ #include <plat/cpu.h> #include <plat/devs.h> #include <plat/pm.h> -#include <plat/regs-iic.h> #include <plat/regs-serial.h> #include "common.h" diff --git a/arch/arm/mach-tegra/tegra.c b/arch/arm/mach-tegra/tegra.c index 84deeab23ee7..9cf1ab17afeb 100644 --- a/arch/arm/mach-tegra/tegra.c +++ b/arch/arm/mach-tegra/tegra.c @@ -31,8 +31,6 @@ #include <linux/pda_power.h> #include <linux/platform_data/tegra_usb.h> #include <linux/io.h> -#include <linux/i2c.h> -#include <linux/i2c-tegra.h> #include <linux/slab.h> #include <linux/sys_soc.h> #include <linux/usb/tegra_usb_phy.h> diff --git a/arch/arm/plat-samsung/devs.c b/arch/arm/plat-samsung/devs.c index f55d3f9b508d..8f456bed4fa0 100644 --- a/arch/arm/plat-samsung/devs.c +++ b/arch/arm/plat-samsung/devs.c @@ -63,7 +63,6 @@ #include <linux/platform_data/usb-s3c2410_udc.h> #include <linux/platform_data/usb-ohci-s3c2410.h> #include <plat/usb-phy.h> -#include <plat/regs-iic.h> #include <plat/regs-serial.h> #include <plat/regs-spi.h> #include <linux/platform_data/spi-s3c64xx.h> diff --git a/arch/arm/plat-samsung/include/plat/regs-iic.h b/arch/arm/plat-samsung/include/plat/regs-iic.h deleted file mode 100644 index 2f7c17de8ac8..000000000000 --- a/arch/arm/plat-samsung/include/plat/regs-iic.h +++ /dev/null @@ -1,56 +0,0 @@ -/* arch/arm/mach-s3c2410/include/mach/regs-iic.h - * - * Copyright (c) 2004 Simtec Electronics <linux@simtec.co.uk> - * http://www.simtec.co.uk/products/SWLINUX/ - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * S3C2410 I2C Controller -*/ - -#ifndef __ASM_ARCH_REGS_IIC_H -#define __ASM_ARCH_REGS_IIC_H __FILE__ - -/* see s3c2410x user guide, v1.1, section 9 (p447) for more info */ - -#define S3C2410_IICREG(x) (x) - -#define S3C2410_IICCON S3C2410_IICREG(0x00) -#define S3C2410_IICSTAT S3C2410_IICREG(0x04) -#define S3C2410_IICADD S3C2410_IICREG(0x08) -#define S3C2410_IICDS S3C2410_IICREG(0x0C) -#define S3C2440_IICLC S3C2410_IICREG(0x10) - -#define S3C2410_IICCON_ACKEN (1<<7) -#define S3C2410_IICCON_TXDIV_16 (0<<6) -#define S3C2410_IICCON_TXDIV_512 (1<<6) -#define S3C2410_IICCON_IRQEN (1<<5) -#define S3C2410_IICCON_IRQPEND (1<<4) -#define S3C2410_IICCON_SCALE(x) ((x)&15) -#define S3C2410_IICCON_SCALEMASK (0xf) - -#define S3C2410_IICSTAT_MASTER_RX (2<<6) -#define S3C2410_IICSTAT_MASTER_TX (3<<6) -#define S3C2410_IICSTAT_SLAVE_RX (0<<6) -#define S3C2410_IICSTAT_SLAVE_TX (1<<6) -#define S3C2410_IICSTAT_MODEMASK (3<<6) - -#define S3C2410_IICSTAT_START (1<<5) -#define S3C2410_IICSTAT_BUSBUSY (1<<5) -#define S3C2410_IICSTAT_TXRXEN (1<<4) -#define S3C2410_IICSTAT_ARBITR (1<<3) -#define S3C2410_IICSTAT_ASSLAVE (1<<2) -#define S3C2410_IICSTAT_ADDR0 (1<<1) -#define S3C2410_IICSTAT_LASTBIT (1<<0) - -#define S3C2410_IICLC_SDA_DELAY0 (0 << 0) -#define S3C2410_IICLC_SDA_DELAY5 (1 << 0) -#define S3C2410_IICLC_SDA_DELAY10 (2 << 0) -#define S3C2410_IICLC_SDA_DELAY15 (3 << 0) -#define S3C2410_IICLC_SDA_DELAY_MASK (3 << 0) - -#define S3C2410_IICLC_FILTER_ON (1<<2) - -#endif /* __ASM_ARCH_REGS_IIC_H */ diff --git a/arch/x86/include/asm/lguest.h b/arch/x86/include/asm/lguest.h index 0d97deba1e35..e2d4a4afa8c3 100644 --- a/arch/x86/include/asm/lguest.h +++ b/arch/x86/include/asm/lguest.h @@ -11,18 +11,11 @@ #define GUEST_PL 1 -/* Every guest maps the core switcher code. */ -#define SHARED_SWITCHER_PAGES \ - DIV_ROUND_UP(end_switcher_text - start_switcher_text, PAGE_SIZE) -/* Pages for switcher itself, then two pages per cpu */ -#define TOTAL_SWITCHER_PAGES (SHARED_SWITCHER_PAGES + 2 * nr_cpu_ids) - -/* We map at -4M (-2M for PAE) for ease of mapping (one PTE page). */ -#ifdef CONFIG_X86_PAE -#define SWITCHER_ADDR 0xFFE00000 -#else -#define SWITCHER_ADDR 0xFFC00000 -#endif +/* Page for Switcher text itself, then two pages per cpu */ +#define TOTAL_SWITCHER_PAGES (1 + 2 * nr_cpu_ids) + +/* Where we map the Switcher, in both Host and Guest. */ +extern unsigned long switcher_addr; /* Found in switcher.S */ extern unsigned long default_idt_entries[]; diff --git a/block/blk-integrity.c b/block/blk-integrity.c index dabd221857e1..03cf7179e8ef 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -110,7 +110,7 @@ new_segment: if (!sg) sg = sglist; else { - sg->page_link &= ~0x02; + sg_unmark_end(sg); sg = sg_next(sg); } diff --git a/block/blk-merge.c b/block/blk-merge.c index 936a110de0b9..5f2448253797 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -143,7 +143,7 @@ new_segment: * termination bit to avoid doing a full * sg_init_table() in drivers for each command. */ - (*sg)->page_link &= ~0x02; + sg_unmark_end(*sg); *sg = sg_next(*sg); } diff --git a/drivers/Makefile b/drivers/Makefile index 33360de63650..8e57688ebd95 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -124,7 +124,7 @@ obj-$(CONFIG_PPC_PS3) += ps3/ obj-$(CONFIG_OF) += of/ obj-$(CONFIG_SSB) += ssb/ obj-$(CONFIG_BCMA) += bcma/ -obj-$(CONFIG_VHOST_NET) += vhost/ +obj-$(CONFIG_VHOST_RING) += vhost/ obj-$(CONFIG_VLYNQ) += vlynq/ obj-$(CONFIG_STAGING) += staging/ obj-y += platform/ diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 8ad21a25bc0d..64723953e1c9 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -100,96 +100,103 @@ static inline struct virtblk_req *virtblk_alloc_req(struct virtio_blk *vblk, return vbr; } -static void virtblk_add_buf_wait(struct virtio_blk *vblk, - struct virtblk_req *vbr, - unsigned long out, - unsigned long in) +static int __virtblk_add_req(struct virtqueue *vq, + struct virtblk_req *vbr, + struct scatterlist *data_sg, + bool have_data) { - DEFINE_WAIT(wait); + struct scatterlist hdr, status, cmd, sense, inhdr, *sgs[6]; + unsigned int num_out = 0, num_in = 0; + int type = vbr->out_hdr.type & ~VIRTIO_BLK_T_OUT; - for (;;) { - prepare_to_wait_exclusive(&vblk->queue_wait, &wait, - TASK_UNINTERRUPTIBLE); + sg_init_one(&hdr, &vbr->out_hdr, sizeof(vbr->out_hdr)); + sgs[num_out++] = &hdr; - spin_lock_irq(vblk->disk->queue->queue_lock); - if (virtqueue_add_buf(vblk->vq, vbr->sg, out, in, vbr, - GFP_ATOMIC) < 0) { - spin_unlock_irq(vblk->disk->queue->queue_lock); - io_schedule(); - } else { - virtqueue_kick(vblk->vq); - spin_unlock_irq(vblk->disk->queue->queue_lock); - break; - } + /* + * If this is a packet command we need a couple of additional headers. + * Behind the normal outhdr we put a segment with the scsi command + * block, and before the normal inhdr we put the sense data and the + * inhdr with additional status information. + */ + if (type == VIRTIO_BLK_T_SCSI_CMD) { + sg_init_one(&cmd, vbr->req->cmd, vbr->req->cmd_len); + sgs[num_out++] = &cmd; + } + if (have_data) { + if (vbr->out_hdr.type & VIRTIO_BLK_T_OUT) + sgs[num_out++] = data_sg; + else + sgs[num_out + num_in++] = data_sg; } - finish_wait(&vblk->queue_wait, &wait); + if (type == VIRTIO_BLK_T_SCSI_CMD) { + sg_init_one(&sense, vbr->req->sense, SCSI_SENSE_BUFFERSIZE); + sgs[num_out + num_in++] = &sense; + sg_init_one(&inhdr, &vbr->in_hdr, sizeof(vbr->in_hdr)); + sgs[num_out + num_in++] = &inhdr; + } + + sg_init_one(&status, &vbr->status, sizeof(vbr->status)); + sgs[num_out + num_in++] = &status; + + return virtqueue_add_sgs(vq, sgs, num_out, num_in, vbr, GFP_ATOMIC); } -static inline void virtblk_add_req(struct virtblk_req *vbr, - unsigned int out, unsigned int in) +static void virtblk_add_req(struct virtblk_req *vbr, bool have_data) { struct virtio_blk *vblk = vbr->vblk; + DEFINE_WAIT(wait); + int ret; spin_lock_irq(vblk->disk->queue->queue_lock); - if (unlikely(virtqueue_add_buf(vblk->vq, vbr->sg, out, in, vbr, - GFP_ATOMIC) < 0)) { + while (unlikely((ret = __virtblk_add_req(vblk->vq, vbr, vbr->sg, + have_data)) < 0)) { + prepare_to_wait_exclusive(&vblk->queue_wait, &wait, + TASK_UNINTERRUPTIBLE); + spin_unlock_irq(vblk->disk->queue->queue_lock); - virtblk_add_buf_wait(vblk, vbr, out, in); - return; + io_schedule(); + spin_lock_irq(vblk->disk->queue->queue_lock); + + finish_wait(&vblk->queue_wait, &wait); } + virtqueue_kick(vblk->vq); spin_unlock_irq(vblk->disk->queue->queue_lock); } -static int virtblk_bio_send_flush(struct virtblk_req *vbr) +static void virtblk_bio_send_flush(struct virtblk_req *vbr) { - unsigned int out = 0, in = 0; - vbr->flags |= VBLK_IS_FLUSH; vbr->out_hdr.type = VIRTIO_BLK_T_FLUSH; vbr->out_hdr.sector = 0; vbr->out_hdr.ioprio = 0; - sg_set_buf(&vbr->sg[out++], &vbr->out_hdr, sizeof(vbr->out_hdr)); - sg_set_buf(&vbr->sg[out + in++], &vbr->status, sizeof(vbr->status)); - - virtblk_add_req(vbr, out, in); - return 0; + virtblk_add_req(vbr, false); } -static int virtblk_bio_send_data(struct virtblk_req *vbr) +static void virtblk_bio_send_data(struct virtblk_req *vbr) { struct virtio_blk *vblk = vbr->vblk; - unsigned int num, out = 0, in = 0; struct bio *bio = vbr->bio; + bool have_data; vbr->flags &= ~VBLK_IS_FLUSH; vbr->out_hdr.type = 0; vbr->out_hdr.sector = bio->bi_sector; vbr->out_hdr.ioprio = bio_prio(bio); - sg_set_buf(&vbr->sg[out++], &vbr->out_hdr, sizeof(vbr->out_hdr)); - - num = blk_bio_map_sg(vblk->disk->queue, bio, vbr->sg + out); - - sg_set_buf(&vbr->sg[num + out + in++], &vbr->status, - sizeof(vbr->status)); - - if (num) { - if (bio->bi_rw & REQ_WRITE) { + if (blk_bio_map_sg(vblk->disk->queue, bio, vbr->sg)) { + have_data = true; + if (bio->bi_rw & REQ_WRITE) vbr->out_hdr.type |= VIRTIO_BLK_T_OUT; - out += num; - } else { + else vbr->out_hdr.type |= VIRTIO_BLK_T_IN; - in += num; - } - } + } else + have_data = false; - virtblk_add_req(vbr, out, in); - - return 0; + virtblk_add_req(vbr, have_data); } static void virtblk_bio_send_data_work(struct work_struct *work) @@ -298,7 +305,7 @@ static void virtblk_done(struct virtqueue *vq) static bool do_req(struct request_queue *q, struct virtio_blk *vblk, struct request *req) { - unsigned long num, out = 0, in = 0; + unsigned int num; struct virtblk_req *vbr; vbr = virtblk_alloc_req(vblk, GFP_ATOMIC); @@ -335,40 +342,15 @@ static bool do_req(struct request_queue *q, struct virtio_blk *vblk, } } - sg_set_buf(&vblk->sg[out++], &vbr->out_hdr, sizeof(vbr->out_hdr)); - - /* - * If this is a packet command we need a couple of additional headers. - * Behind the normal outhdr we put a segment with the scsi command - * block, and before the normal inhdr we put the sense data and the - * inhdr with additional status information before the normal inhdr. - */ - if (vbr->req->cmd_type == REQ_TYPE_BLOCK_PC) - sg_set_buf(&vblk->sg[out++], vbr->req->cmd, vbr->req->cmd_len); - - num = blk_rq_map_sg(q, vbr->req, vblk->sg + out); - - if (vbr->req->cmd_type == REQ_TYPE_BLOCK_PC) { - sg_set_buf(&vblk->sg[num + out + in++], vbr->req->sense, SCSI_SENSE_BUFFERSIZE); - sg_set_buf(&vblk->sg[num + out + in++], &vbr->in_hdr, - sizeof(vbr->in_hdr)); - } - - sg_set_buf(&vblk->sg[num + out + in++], &vbr->status, - sizeof(vbr->status)); - + num = blk_rq_map_sg(q, vbr->req, vblk->sg); if (num) { - if (rq_data_dir(vbr->req) == WRITE) { + if (rq_data_dir(vbr->req) == WRITE) vbr->out_hdr.type |= VIRTIO_BLK_T_OUT; - out += num; - } else { + else vbr->out_hdr.type |= VIRTIO_BLK_T_IN; - in += num; - } } - if (virtqueue_add_buf(vblk->vq, vblk->sg, out, in, vbr, - GFP_ATOMIC) < 0) { + if (__virtblk_add_req(vblk->vq, vbr, vblk->sg, num) < 0) { mempool_free(vbr, vblk->pool); return false; } @@ -539,6 +521,7 @@ static void virtblk_config_changed_work(struct work_struct *work) struct virtio_device *vdev = vblk->vdev; struct request_queue *q = vblk->disk->queue; char cap_str_2[10], cap_str_10[10]; + char *envp[] = { "RESIZE=1", NULL }; u64 capacity, size; mutex_lock(&vblk->config_lock); @@ -568,6 +551,7 @@ static void virtblk_config_changed_work(struct work_struct *work) set_capacity(vblk->disk, capacity); revalidate_disk(vblk->disk); + kobject_uevent_env(&disk_to_dev(vblk->disk)->kobj, KOBJ_CHANGE, envp); done: mutex_unlock(&vblk->config_lock); } diff --git a/drivers/char/hw_random/virtio-rng.c b/drivers/char/hw_random/virtio-rng.c index 6bf4d47324eb..ef46a9cfd832 100644 --- a/drivers/char/hw_random/virtio-rng.c +++ b/drivers/char/hw_random/virtio-rng.c @@ -47,7 +47,7 @@ static void register_buffer(u8 *buf, size_t size) sg_init_one(&sg, buf, size); /* There should always be room for one buffer. */ - if (virtqueue_add_buf(vq, &sg, 0, 1, buf, GFP_KERNEL) < 0) + if (virtqueue_add_inbuf(vq, &sg, 1, buf, GFP_KERNEL) < 0) BUG(); virtqueue_kick(vq); diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c index ce5f3fc25d6d..1b456fe9b87a 100644 --- a/drivers/char/virtio_console.c +++ b/drivers/char/virtio_console.c @@ -78,8 +78,8 @@ struct ports_driver_data { }; static struct ports_driver_data pdrvdata; -DEFINE_SPINLOCK(pdrvdata_lock); -DECLARE_COMPLETION(early_console_added); +static DEFINE_SPINLOCK(pdrvdata_lock); +static DECLARE_COMPLETION(early_console_added); /* This struct holds information that's relevant only for console ports */ struct console { @@ -503,7 +503,7 @@ static int add_inbuf(struct virtqueue *vq, struct port_buffer *buf) sg_init_one(sg, buf->buf, buf->size); - ret = virtqueue_add_buf(vq, sg, 0, 1, buf, GFP_ATOMIC); + ret = virtqueue_add_inbuf(vq, sg, 1, buf, GFP_ATOMIC); virtqueue_kick(vq); if (!ret) ret = vq->num_free; @@ -572,7 +572,7 @@ static ssize_t __send_control_msg(struct ports_device *portdev, u32 port_id, sg_init_one(sg, &cpkt, sizeof(cpkt)); spin_lock(&portdev->c_ovq_lock); - if (virtqueue_add_buf(vq, sg, 1, 0, &cpkt, GFP_ATOMIC) == 0) { + if (virtqueue_add_outbuf(vq, sg, 1, &cpkt, GFP_ATOMIC) == 0) { virtqueue_kick(vq); while (!virtqueue_get_buf(vq, &len)) cpu_relax(); @@ -622,7 +622,7 @@ static ssize_t __send_to_port(struct port *port, struct scatterlist *sg, reclaim_consumed_buffers(port); - err = virtqueue_add_buf(out_vq, sg, nents, 0, data, GFP_ATOMIC); + err = virtqueue_add_outbuf(out_vq, sg, nents, data, GFP_ATOMIC); /* Tell Host to go! */ virtqueue_kick(out_vq); @@ -1040,7 +1040,7 @@ static int port_fops_open(struct inode *inode, struct file *filp) spin_lock_irq(&port->inbuf_lock); if (port->guest_connected) { spin_unlock_irq(&port->inbuf_lock); - ret = -EMFILE; + ret = -EBUSY; goto out; } @@ -1202,7 +1202,7 @@ int __init virtio_cons_early_init(int (*put_chars)(u32, const char *, int)) return hvc_instantiate(0, 0, &hv_ops); } -int init_port_console(struct port *port) +static int init_port_console(struct port *port) { int ret; diff --git a/drivers/gpu/drm/gma500/oaktrail_hdmi_i2c.c b/drivers/gpu/drm/gma500/oaktrail_hdmi_i2c.c index 88627e3ba1e3..1eb86c79523e 100644 --- a/drivers/gpu/drm/gma500/oaktrail_hdmi_i2c.c +++ b/drivers/gpu/drm/gma500/oaktrail_hdmi_i2c.c @@ -319,8 +319,7 @@ void oaktrail_hdmi_i2c_exit(struct pci_dev *dev) struct hdmi_i2c_dev *i2c_dev; hdmi_dev = pci_get_drvdata(dev); - if (i2c_del_adapter(&oaktrail_hdmi_i2c_adapter)) - DRM_DEBUG_DRIVER("Failed to delete hdmi-i2c adapter\n"); + i2c_del_adapter(&oaktrail_hdmi_i2c_adapter); i2c_dev = hdmi_dev->i2c_dev; kfree(i2c_dev); diff --git a/drivers/i2c/busses/i2c-amd756-s4882.c b/drivers/i2c/busses/i2c-amd756-s4882.c index 378fcb5d5783..07f01ac853ff 100644 --- a/drivers/i2c/busses/i2c-amd756-s4882.c +++ b/drivers/i2c/busses/i2c-amd756-s4882.c @@ -169,11 +169,7 @@ static int __init amd756_s4882_init(void) } /* Unregister physical bus */ - error = i2c_del_adapter(&amd756_smbus); - if (error) { - dev_err(&amd756_smbus.dev, "Physical bus removal failed\n"); - goto ERROR0; - } + i2c_del_adapter(&amd756_smbus); printk(KERN_INFO "Enabling SMBus multiplexing for Tyan S4882\n"); /* Define the 5 virtual adapters and algorithms structures */ diff --git a/drivers/i2c/busses/i2c-at91.c b/drivers/i2c/busses/i2c-at91.c index 75195e3f5ddb..6bb839b688be 100644 --- a/drivers/i2c/busses/i2c-at91.c +++ b/drivers/i2c/busses/i2c-at91.c @@ -603,15 +603,18 @@ static const struct of_device_id atmel_twi_dt_ids[] = { } }; MODULE_DEVICE_TABLE(of, atmel_twi_dt_ids); -#else -#define atmel_twi_dt_ids NULL #endif -static bool filter(struct dma_chan *chan, void *slave) +static bool filter(struct dma_chan *chan, void *pdata) { - struct at_dma_slave *sl = slave; + struct at91_twi_pdata *sl_pdata = pdata; + struct at_dma_slave *sl; + + if (!sl_pdata) + return false; - if (sl->dma_dev == chan->device->dev) { + sl = &sl_pdata->dma_slave; + if (sl && (sl->dma_dev == chan->device->dev)) { chan->private = sl; return true; } else { @@ -622,11 +625,10 @@ static bool filter(struct dma_chan *chan, void *slave) static int at91_twi_configure_dma(struct at91_twi_dev *dev, u32 phy_addr) { int ret = 0; - struct at_dma_slave *sdata; + struct at91_twi_pdata *pdata = dev->pdata; struct dma_slave_config slave_config; struct at91_twi_dma *dma = &dev->dma; - - sdata = &dev->pdata->dma_slave; + dma_cap_mask_t mask; memset(&slave_config, 0, sizeof(slave_config)); slave_config.src_addr = (dma_addr_t)phy_addr + AT91_TWI_RHR; @@ -637,25 +639,22 @@ static int at91_twi_configure_dma(struct at91_twi_dev *dev, u32 phy_addr) slave_config.dst_maxburst = 1; slave_config.device_fc = false; - if (sdata && sdata->dma_dev) { - dma_cap_mask_t mask; + dma_cap_zero(mask); + dma_cap_set(DMA_SLAVE, mask); - dma_cap_zero(mask); - dma_cap_set(DMA_SLAVE, mask); - dma->chan_tx = dma_request_channel(mask, filter, sdata); - if (!dma->chan_tx) { - dev_err(dev->dev, "no DMA channel available for tx\n"); - ret = -EBUSY; - goto error; - } - dma->chan_rx = dma_request_channel(mask, filter, sdata); - if (!dma->chan_rx) { - dev_err(dev->dev, "no DMA channel available for rx\n"); - ret = -EBUSY; - goto error; - } - } else { - ret = -EINVAL; + dma->chan_tx = dma_request_slave_channel_compat(mask, filter, pdata, + dev->dev, "tx"); + if (!dma->chan_tx) { + dev_err(dev->dev, "can't get a DMA channel for tx\n"); + ret = -EBUSY; + goto error; + } + + dma->chan_rx = dma_request_slave_channel_compat(mask, filter, pdata, + dev->dev, "rx"); + if (!dma->chan_rx) { + dev_err(dev->dev, "can't get a DMA channel for rx\n"); + ret = -EBUSY; goto error; } @@ -785,12 +784,11 @@ static int at91_twi_probe(struct platform_device *pdev) static int at91_twi_remove(struct platform_device *pdev) { struct at91_twi_dev *dev = platform_get_drvdata(pdev); - int rc; - rc = i2c_del_adapter(&dev->adapter); + i2c_del_adapter(&dev->adapter); clk_disable_unprepare(dev->clk); - return rc; + return 0; } #ifdef CONFIG_PM @@ -828,7 +826,7 @@ static struct platform_driver at91_twi_driver = { .driver = { .name = "at91_i2c", .owner = THIS_MODULE, - .of_match_table = atmel_twi_dt_ids, + .of_match_table = of_match_ptr(atmel_twi_dt_ids), .pm = at91_twi_pm_ops, }, }; diff --git a/drivers/i2c/busses/i2c-cbus-gpio.c b/drivers/i2c/busses/i2c-cbus-gpio.c index 98386d659318..1be13ac11dc5 100644 --- a/drivers/i2c/busses/i2c-cbus-gpio.c +++ b/drivers/i2c/busses/i2c-cbus-gpio.c @@ -206,7 +206,9 @@ static int cbus_i2c_remove(struct platform_device *pdev) { struct i2c_adapter *adapter = platform_get_drvdata(pdev); - return i2c_del_adapter(adapter); + i2c_del_adapter(adapter); + + return 0; } static int cbus_i2c_probe(struct platform_device *pdev) diff --git a/drivers/i2c/busses/i2c-davinci.c b/drivers/i2c/busses/i2c-davinci.c index 7d1e590a7bb6..cf20e06a88e1 100644 --- a/drivers/i2c/busses/i2c-davinci.c +++ b/drivers/i2c/busses/i2c-davinci.c @@ -137,7 +137,7 @@ static inline u16 davinci_i2c_read_reg(struct davinci_i2c_dev *i2c_dev, int reg) } /* Generate a pulse on the i2c clock pin. */ -static void generic_i2c_clock_pulse(unsigned int scl_pin) +static void davinci_i2c_clock_pulse(unsigned int scl_pin) { u16 i; @@ -155,7 +155,7 @@ static void generic_i2c_clock_pulse(unsigned int scl_pin) /* This routine does i2c bus recovery as specified in the * i2c protocol Rev. 03 section 3.16 titled "Bus clear" */ -static void i2c_recover_bus(struct davinci_i2c_dev *dev) +static void davinci_i2c_recover_bus(struct davinci_i2c_dev *dev) { u32 flag = 0; struct davinci_i2c_platform_data *pdata = dev->pdata; @@ -166,7 +166,7 @@ static void i2c_recover_bus(struct davinci_i2c_dev *dev) flag |= DAVINCI_I2C_MDR_NACK; /* write the data into mode register */ davinci_i2c_write_reg(dev, DAVINCI_I2C_MDR_REG, flag); - generic_i2c_clock_pulse(pdata->scl_pin); + davinci_i2c_clock_pulse(pdata->scl_pin); /* Send STOP */ flag = davinci_i2c_read_reg(dev, DAVINCI_I2C_MDR_REG); flag |= DAVINCI_I2C_MDR_STP; @@ -289,7 +289,7 @@ static int i2c_davinci_wait_bus_not_busy(struct davinci_i2c_dev *dev, return -ETIMEDOUT; } else { to_cnt = 0; - i2c_recover_bus(dev); + davinci_i2c_recover_bus(dev); i2c_davinci_init(dev); } } @@ -379,7 +379,7 @@ i2c_davinci_xfer_msg(struct i2c_adapter *adap, struct i2c_msg *msg, int stop) dev->adapter.timeout); if (r == 0) { dev_err(dev->dev, "controller timed out\n"); - i2c_recover_bus(dev); + davinci_i2c_recover_bus(dev); i2c_davinci_init(dev); dev->buf_len = 0; return -ETIMEDOUT; @@ -643,7 +643,7 @@ static int davinci_i2c_probe(struct platform_device *pdev) { struct davinci_i2c_dev *dev; struct i2c_adapter *adap; - struct resource *mem, *irq, *ioarea; + struct resource *mem, *irq; int r; /* NOTE: driver uses the static register mapping */ @@ -659,24 +659,18 @@ static int davinci_i2c_probe(struct platform_device *pdev) return -ENODEV; } - ioarea = request_mem_region(mem->start, resource_size(mem), - pdev->name); - if (!ioarea) { - dev_err(&pdev->dev, "I2C region already claimed\n"); - return -EBUSY; - } - - dev = kzalloc(sizeof(struct davinci_i2c_dev), GFP_KERNEL); + dev = devm_kzalloc(&pdev->dev, sizeof(struct davinci_i2c_dev), + GFP_KERNEL); if (!dev) { - r = -ENOMEM; - goto err_release_region; + dev_err(&pdev->dev, "Memory allocation failed\n"); + return -ENOMEM; } init_completion(&dev->cmd_complete); #ifdef CONFIG_CPU_FREQ init_completion(&dev->xfr_complete); #endif - dev->dev = get_device(&pdev->dev); + dev->dev = &pdev->dev; dev->irq = irq->start; dev->pdata = dev->dev->platform_data; platform_set_drvdata(pdev, dev); @@ -686,10 +680,9 @@ static int davinci_i2c_probe(struct platform_device *pdev) dev->pdata = devm_kzalloc(&pdev->dev, sizeof(struct davinci_i2c_platform_data), GFP_KERNEL); - if (!dev->pdata) { - r = -ENOMEM; - goto err_free_mem; - } + if (!dev->pdata) + return -ENOMEM; + memcpy(dev->pdata, &davinci_i2c_platform_data_default, sizeof(struct davinci_i2c_platform_data)); if (!of_property_read_u32(pdev->dev.of_node, "clock-frequency", @@ -699,22 +692,21 @@ static int davinci_i2c_probe(struct platform_device *pdev) dev->pdata = &davinci_i2c_platform_data_default; } - dev->clk = clk_get(&pdev->dev, NULL); - if (IS_ERR(dev->clk)) { - r = -ENODEV; - goto err_free_mem; - } + dev->clk = devm_clk_get(&pdev->dev, NULL); + if (IS_ERR(dev->clk)) + return -ENODEV; clk_prepare_enable(dev->clk); - dev->base = ioremap(mem->start, resource_size(mem)); - if (!dev->base) { - r = -EBUSY; - goto err_mem_ioremap; + dev->base = devm_ioremap_resource(&pdev->dev, mem); + if (IS_ERR(dev->base)) { + r = PTR_ERR(dev->base); + goto err_unuse_clocks; } i2c_davinci_init(dev); - r = request_irq(dev->irq, i2c_davinci_isr, 0, pdev->name, dev); + r = devm_request_irq(&pdev->dev, dev->irq, i2c_davinci_isr, 0, + pdev->name, dev); if (r) { dev_err(&pdev->dev, "failure requesting irq %i\n", dev->irq); goto err_unuse_clocks; @@ -723,7 +715,7 @@ static int davinci_i2c_probe(struct platform_device *pdev) r = i2c_davinci_cpufreq_register(dev); if (r) { dev_err(&pdev->dev, "failed to register cpufreq\n"); - goto err_free_irq; + goto err_unuse_clocks; } adap = &dev->adapter; @@ -740,50 +732,31 @@ static int davinci_i2c_probe(struct platform_device *pdev) r = i2c_add_numbered_adapter(adap); if (r) { dev_err(&pdev->dev, "failure adding adapter\n"); - goto err_free_irq; + goto err_unuse_clocks; } of_i2c_register_devices(adap); return 0; -err_free_irq: - free_irq(dev->irq, dev); err_unuse_clocks: - iounmap(dev->base); -err_mem_ioremap: clk_disable_unprepare(dev->clk); - clk_put(dev->clk); dev->clk = NULL; -err_free_mem: - put_device(&pdev->dev); - kfree(dev); -err_release_region: - release_mem_region(mem->start, resource_size(mem)); - return r; } static int davinci_i2c_remove(struct platform_device *pdev) { struct davinci_i2c_dev *dev = platform_get_drvdata(pdev); - struct resource *mem; i2c_davinci_cpufreq_deregister(dev); i2c_del_adapter(&dev->adapter); - put_device(&pdev->dev); clk_disable_unprepare(dev->clk); - clk_put(dev->clk); dev->clk = NULL; davinci_i2c_write_reg(dev, DAVINCI_I2C_MDR_REG, 0); - free_irq(dev->irq, dev); - iounmap(dev->base); - kfree(dev); - mem = platform_get_resource(pdev, IORESOURCE_MEM, 0); - release_mem_region(mem->start, resource_size(mem)); return 0; } diff --git a/drivers/i2c/busses/i2c-designware-core.c b/drivers/i2c/busses/i2c-designware-core.c index 94fd81875409..21fbb340ad66 100644 --- a/drivers/i2c/busses/i2c-designware-core.c +++ b/drivers/i2c/busses/i2c-designware-core.c @@ -68,6 +68,7 @@ #define DW_IC_TXFLR 0x74 #define DW_IC_RXFLR 0x78 #define DW_IC_TX_ABRT_SOURCE 0x80 +#define DW_IC_ENABLE_STATUS 0x9c #define DW_IC_COMP_PARAM_1 0xf4 #define DW_IC_COMP_TYPE 0xfc #define DW_IC_COMP_TYPE_VALUE 0x44570140 @@ -248,6 +249,27 @@ static u32 i2c_dw_scl_lcnt(u32 ic_clk, u32 tLOW, u32 tf, int offset) return ((ic_clk * (tLOW + tf) + 5000) / 10000) - 1 + offset; } +static void __i2c_dw_enable(struct dw_i2c_dev *dev, bool enable) +{ + int timeout = 100; + + do { + dw_writel(dev, enable, DW_IC_ENABLE); + if ((dw_readl(dev, DW_IC_ENABLE_STATUS) & 1) == enable) + return; + + /* + * Wait 10 times the signaling period of the highest I2C + * transfer supported by the driver (for 400KHz this is + * 25us) as described in the DesignWare I2C databook. + */ + usleep_range(25, 250); + } while (timeout--); + + dev_warn(dev->dev, "timeout in %sabling adapter\n", + enable ? "en" : "dis"); +} + /** * i2c_dw_init() - initialize the designware i2c master hardware * @dev: device private data @@ -278,7 +300,7 @@ int i2c_dw_init(struct dw_i2c_dev *dev) } /* Disable the adapter */ - dw_writel(dev, 0, DW_IC_ENABLE); + __i2c_dw_enable(dev, false); /* set standard and fast speed deviders for high/low periods */ @@ -333,7 +355,7 @@ static int i2c_dw_wait_bus_not_busy(struct dw_i2c_dev *dev) return -ETIMEDOUT; } timeout--; - mdelay(1); + usleep_range(1000, 1100); } return 0; @@ -345,7 +367,7 @@ static void i2c_dw_xfer_init(struct dw_i2c_dev *dev) u32 ic_con; /* Disable the adapter */ - dw_writel(dev, 0, DW_IC_ENABLE); + __i2c_dw_enable(dev, false); /* set the slave (target) address */ dw_writel(dev, msgs[dev->msg_write_idx].addr, DW_IC_TAR); @@ -359,7 +381,7 @@ static void i2c_dw_xfer_init(struct dw_i2c_dev *dev) dw_writel(dev, ic_con, DW_IC_CON); /* Enable the adapter */ - dw_writel(dev, 1, DW_IC_ENABLE); + __i2c_dw_enable(dev, true); /* Enable interrupts */ dw_writel(dev, DW_IC_INTR_DEFAULT_MASK, DW_IC_INTR_MASK); @@ -565,7 +587,7 @@ i2c_dw_xfer(struct i2c_adapter *adap, struct i2c_msg msgs[], int num) /* no error */ if (likely(!dev->cmd_err)) { /* Disable the adapter */ - dw_writel(dev, 0, DW_IC_ENABLE); + __i2c_dw_enable(dev, false); ret = num; goto done; } @@ -578,7 +600,8 @@ i2c_dw_xfer(struct i2c_adapter *adap, struct i2c_msg msgs[], int num) ret = -EIO; done: - pm_runtime_put(dev->dev); + pm_runtime_mark_last_busy(dev->dev); + pm_runtime_put_autosuspend(dev->dev); mutex_unlock(&dev->lock); return ret; @@ -700,7 +723,7 @@ EXPORT_SYMBOL_GPL(i2c_dw_isr); void i2c_dw_enable(struct dw_i2c_dev *dev) { /* Enable the adapter */ - dw_writel(dev, 1, DW_IC_ENABLE); + __i2c_dw_enable(dev, true); } EXPORT_SYMBOL_GPL(i2c_dw_enable); @@ -713,7 +736,7 @@ EXPORT_SYMBOL_GPL(i2c_dw_is_enabled); void i2c_dw_disable(struct dw_i2c_dev *dev) { /* Disable controller */ - dw_writel(dev, 0, DW_IC_ENABLE); + __i2c_dw_enable(dev, false); /* Disable all interupts */ dw_writel(dev, 0, DW_IC_INTR_MASK); diff --git a/drivers/i2c/busses/i2c-designware-pcidrv.c b/drivers/i2c/busses/i2c-designware-pcidrv.c index 7c5e383c350d..f6ed06c966ee 100644 --- a/drivers/i2c/busses/i2c-designware-pcidrv.c +++ b/drivers/i2c/busses/i2c-designware-pcidrv.c @@ -208,68 +208,45 @@ static u32 i2c_dw_get_clk_rate_khz(struct dw_i2c_dev *dev) } static int i2c_dw_pci_probe(struct pci_dev *pdev, -const struct pci_device_id *id) + const struct pci_device_id *id) { struct dw_i2c_dev *dev; struct i2c_adapter *adap; - unsigned long start, len; - void __iomem *base; int r; struct dw_pci_controller *controller; if (id->driver_data >= ARRAY_SIZE(dw_pci_controllers)) { - printk(KERN_ERR "dw_i2c_pci_probe: invalid driver data %ld\n", + dev_err(&pdev->dev, "%s: invalid driver data %ld\n", __func__, id->driver_data); return -EINVAL; } controller = &dw_pci_controllers[id->driver_data]; - r = pci_enable_device(pdev); + r = pcim_enable_device(pdev); if (r) { dev_err(&pdev->dev, "Failed to enable I2C PCI device (%d)\n", r); - goto exit; + return r; } - /* Determine the address of the I2C area */ - start = pci_resource_start(pdev, 0); - len = pci_resource_len(pdev, 0); - if (!start || len == 0) { - dev_err(&pdev->dev, "base address not set\n"); - r = -ENODEV; - goto exit; - } - - r = pci_request_region(pdev, 0, DRIVER_NAME); + r = pcim_iomap_regions(pdev, 1 << 0, pci_name(pdev)); if (r) { - dev_err(&pdev->dev, "failed to request I2C region " - "0x%lx-0x%lx\n", start, - (unsigned long)pci_resource_end(pdev, 0)); - goto exit; - } - - base = ioremap_nocache(start, len); - if (!base) { dev_err(&pdev->dev, "I/O memory remapping failed\n"); - r = -ENOMEM; - goto err_release_region; + return r; } - - dev = kzalloc(sizeof(struct dw_i2c_dev), GFP_KERNEL); - if (!dev) { - r = -ENOMEM; - goto err_release_region; - } + dev = devm_kzalloc(&pdev->dev, sizeof(struct dw_i2c_dev), GFP_KERNEL); + if (!dev) + return -ENOMEM; init_completion(&dev->cmd_complete); mutex_init(&dev->lock); dev->clk = NULL; dev->controller = controller; dev->get_clk_rate_khz = i2c_dw_get_clk_rate_khz; - dev->base = base; - dev->dev = get_device(&pdev->dev); + dev->base = pcim_iomap_table(pdev)[0]; + dev->dev = &pdev->dev; dev->functionality = I2C_FUNC_I2C | I2C_FUNC_SMBUS_BYTE | @@ -284,7 +261,7 @@ const struct pci_device_id *id) dev->rx_fifo_depth = controller->rx_fifo_depth; r = i2c_dw_init(dev); if (r) - goto err_iounmap; + return r; adap = &dev->adapter; i2c_set_adapdata(adap, dev); @@ -296,10 +273,11 @@ const struct pci_device_id *id) snprintf(adap->name, sizeof(adap->name), "i2c-designware-pci-%d", adap->nr); - r = request_irq(pdev->irq, i2c_dw_isr, IRQF_SHARED, adap->name, dev); + r = devm_request_irq(&pdev->dev, pdev->irq, i2c_dw_isr, IRQF_SHARED, + adap->name, dev); if (r) { dev_err(&pdev->dev, "failure requesting irq %i\n", dev->irq); - goto err_iounmap; + return r; } i2c_dw_disable_int(dev); @@ -307,24 +285,14 @@ const struct pci_device_id *id) r = i2c_add_numbered_adapter(adap); if (r) { dev_err(&pdev->dev, "failure adding adapter\n"); - goto err_free_irq; + return r; } - pm_runtime_put_noidle(&pdev->dev); + pm_runtime_set_autosuspend_delay(&pdev->dev, 1000); + pm_runtime_use_autosuspend(&pdev->dev); pm_runtime_allow(&pdev->dev); return 0; - -err_free_irq: - free_irq(pdev->irq, dev); -err_iounmap: - iounmap(dev->base); - put_device(&pdev->dev); - kfree(dev); -err_release_region: - pci_release_region(pdev, 0); -exit: - return r; } static void i2c_dw_pci_remove(struct pci_dev *pdev) @@ -336,11 +304,6 @@ static void i2c_dw_pci_remove(struct pci_dev *pdev) pm_runtime_get_noresume(&pdev->dev); i2c_del_adapter(&dev->adapter); - put_device(&pdev->dev); - - free_irq(dev->irq, dev); - kfree(dev); - pci_release_region(pdev, 0); } /* work with hotplug and coldplug */ diff --git a/drivers/i2c/busses/i2c-designware-platdrv.c b/drivers/i2c/busses/i2c-designware-platdrv.c index e3085c487ace..8ec91335d95a 100644 --- a/drivers/i2c/busses/i2c-designware-platdrv.c +++ b/drivers/i2c/busses/i2c-designware-platdrv.c @@ -56,20 +56,11 @@ static u32 i2c_dw_get_clk_rate_khz(struct dw_i2c_dev *dev) static int dw_i2c_acpi_configure(struct platform_device *pdev) { struct dw_i2c_dev *dev = platform_get_drvdata(pdev); - struct acpi_device *adev; - int busno, ret; if (!ACPI_HANDLE(&pdev->dev)) return -ENODEV; - ret = acpi_bus_get_device(ACPI_HANDLE(&pdev->dev), &adev); - if (ret) - return -ENODEV; - dev->adapter.nr = -1; - if (adev->pnp.unique_id && !kstrtoint(adev->pnp.unique_id, 0, &busno)) - dev->adapter.nr = busno; - dev->tx_fifo_depth = 32; dev->rx_fifo_depth = 32; return 0; @@ -92,7 +83,7 @@ static int dw_i2c_probe(struct platform_device *pdev) { struct dw_i2c_dev *dev; struct i2c_adapter *adap; - struct resource *mem, *ioarea; + struct resource *mem; int irq, r; /* NOTE: driver uses the static register mapping */ @@ -108,32 +99,25 @@ static int dw_i2c_probe(struct platform_device *pdev) return irq; /* -ENXIO */ } - ioarea = request_mem_region(mem->start, resource_size(mem), - pdev->name); - if (!ioarea) { - dev_err(&pdev->dev, "I2C region already claimed\n"); - return -EBUSY; - } + dev = devm_kzalloc(&pdev->dev, sizeof(struct dw_i2c_dev), GFP_KERNEL); + if (!dev) + return -ENOMEM; - dev = kzalloc(sizeof(struct dw_i2c_dev), GFP_KERNEL); - if (!dev) { - r = -ENOMEM; - goto err_release_region; - } + dev->base = devm_ioremap_resource(&pdev->dev, mem); + if (IS_ERR(dev->base)) + return PTR_ERR(dev->base); init_completion(&dev->cmd_complete); mutex_init(&dev->lock); - dev->dev = get_device(&pdev->dev); + dev->dev = &pdev->dev; dev->irq = irq; platform_set_drvdata(pdev, dev); - dev->clk = clk_get(&pdev->dev, NULL); + dev->clk = devm_clk_get(&pdev->dev, NULL); dev->get_clk_rate_khz = i2c_dw_get_clk_rate_khz; - if (IS_ERR(dev->clk)) { - r = -ENODEV; - goto err_free_mem; - } + if (IS_ERR(dev->clk)) + return PTR_ERR(dev->clk); clk_prepare_enable(dev->clk); dev->functionality = @@ -146,13 +130,6 @@ static int dw_i2c_probe(struct platform_device *pdev) dev->master_cfg = DW_IC_CON_MASTER | DW_IC_CON_SLAVE_DISABLE | DW_IC_CON_RESTART_EN | DW_IC_CON_SPEED_FAST; - dev->base = ioremap(mem->start, resource_size(mem)); - if (dev->base == NULL) { - dev_err(&pdev->dev, "failure mapping io resources\n"); - r = -EBUSY; - goto err_unuse_clocks; - } - /* Try first if we can configure the device from ACPI */ r = dw_i2c_acpi_configure(pdev); if (r) { @@ -164,13 +141,14 @@ static int dw_i2c_probe(struct platform_device *pdev) } r = i2c_dw_init(dev); if (r) - goto err_iounmap; + return r; i2c_dw_disable_int(dev); - r = request_irq(dev->irq, i2c_dw_isr, IRQF_SHARED, pdev->name, dev); + r = devm_request_irq(&pdev->dev, dev->irq, i2c_dw_isr, IRQF_SHARED, + pdev->name, dev); if (r) { dev_err(&pdev->dev, "failure requesting irq %i\n", dev->irq); - goto err_iounmap; + return r; } adap = &dev->adapter; @@ -186,57 +164,32 @@ static int dw_i2c_probe(struct platform_device *pdev) r = i2c_add_numbered_adapter(adap); if (r) { dev_err(&pdev->dev, "failure adding adapter\n"); - goto err_free_irq; + return r; } of_i2c_register_devices(adap); acpi_i2c_register_devices(adap); + pm_runtime_set_autosuspend_delay(&pdev->dev, 1000); + pm_runtime_use_autosuspend(&pdev->dev); pm_runtime_set_active(&pdev->dev); pm_runtime_enable(&pdev->dev); - pm_runtime_put(&pdev->dev); return 0; - -err_free_irq: - free_irq(dev->irq, dev); -err_iounmap: - iounmap(dev->base); -err_unuse_clocks: - clk_disable_unprepare(dev->clk); - clk_put(dev->clk); - dev->clk = NULL; -err_free_mem: - put_device(&pdev->dev); - kfree(dev); -err_release_region: - release_mem_region(mem->start, resource_size(mem)); - - return r; } static int dw_i2c_remove(struct platform_device *pdev) { struct dw_i2c_dev *dev = platform_get_drvdata(pdev); - struct resource *mem; pm_runtime_get_sync(&pdev->dev); i2c_del_adapter(&dev->adapter); - put_device(&pdev->dev); - - clk_disable_unprepare(dev->clk); - clk_put(dev->clk); - dev->clk = NULL; i2c_dw_disable(dev); - free_irq(dev->irq, dev); - kfree(dev); pm_runtime_put(&pdev->dev); pm_runtime_disable(&pdev->dev); - mem = platform_get_resource(pdev, IORESOURCE_MEM, 0); - release_mem_region(mem->start, resource_size(mem)); return 0; } diff --git a/drivers/i2c/busses/i2c-gpio.c b/drivers/i2c/busses/i2c-gpio.c index f3fa4332bbdf..bc6e139c6e7f 100644 --- a/drivers/i2c/busses/i2c-gpio.c +++ b/drivers/i2c/busses/i2c-gpio.c @@ -85,23 +85,29 @@ static int i2c_gpio_getscl(void *data) return gpio_get_value(pdata->scl_pin); } -static int of_i2c_gpio_probe(struct device_node *np, - struct i2c_gpio_platform_data *pdata) +static int of_i2c_gpio_get_pins(struct device_node *np, + unsigned int *sda_pin, unsigned int *scl_pin) { - u32 reg; - if (of_gpio_count(np) < 2) return -ENODEV; - pdata->sda_pin = of_get_gpio(np, 0); - pdata->scl_pin = of_get_gpio(np, 1); + *sda_pin = of_get_gpio(np, 0); + *scl_pin = of_get_gpio(np, 1); - if (!gpio_is_valid(pdata->sda_pin) || !gpio_is_valid(pdata->scl_pin)) { + if (!gpio_is_valid(*sda_pin) || !gpio_is_valid(*scl_pin)) { pr_err("%s: invalid GPIO pins, sda=%d/scl=%d\n", - np->full_name, pdata->sda_pin, pdata->scl_pin); + np->full_name, *sda_pin, *scl_pin); return -ENODEV; } + return 0; +} + +static void of_i2c_gpio_get_props(struct device_node *np, + struct i2c_gpio_platform_data *pdata) +{ + u32 reg; + of_property_read_u32(np, "i2c-gpio,delay-us", &pdata->udelay); if (!of_property_read_u32(np, "i2c-gpio,timeout-ms", ®)) @@ -113,8 +119,6 @@ static int of_i2c_gpio_probe(struct device_node *np, of_property_read_bool(np, "i2c-gpio,scl-open-drain"); pdata->scl_is_output_only = of_property_read_bool(np, "i2c-gpio,scl-output-only"); - - return 0; } static int i2c_gpio_probe(struct platform_device *pdev) @@ -123,31 +127,52 @@ static int i2c_gpio_probe(struct platform_device *pdev) struct i2c_gpio_platform_data *pdata; struct i2c_algo_bit_data *bit_data; struct i2c_adapter *adap; + unsigned int sda_pin, scl_pin; int ret; - priv = devm_kzalloc(&pdev->dev, sizeof(*priv), GFP_KERNEL); - if (!priv) - return -ENOMEM; - adap = &priv->adap; - bit_data = &priv->bit_data; - pdata = &priv->pdata; - + /* First get the GPIO pins; if it fails, we'll defer the probe. */ if (pdev->dev.of_node) { - ret = of_i2c_gpio_probe(pdev->dev.of_node, pdata); + ret = of_i2c_gpio_get_pins(pdev->dev.of_node, + &sda_pin, &scl_pin); if (ret) return ret; } else { if (!pdev->dev.platform_data) return -ENXIO; - memcpy(pdata, pdev->dev.platform_data, sizeof(*pdata)); + pdata = pdev->dev.platform_data; + sda_pin = pdata->sda_pin; + scl_pin = pdata->scl_pin; } - ret = gpio_request(pdata->sda_pin, "sda"); - if (ret) + ret = gpio_request(sda_pin, "sda"); + if (ret) { + if (ret == -EINVAL) + ret = -EPROBE_DEFER; /* Try again later */ goto err_request_sda; - ret = gpio_request(pdata->scl_pin, "scl"); - if (ret) + } + ret = gpio_request(scl_pin, "scl"); + if (ret) { + if (ret == -EINVAL) + ret = -EPROBE_DEFER; /* Try again later */ goto err_request_scl; + } + + priv = devm_kzalloc(&pdev->dev, sizeof(*priv), GFP_KERNEL); + if (!priv) { + ret = -ENOMEM; + goto err_add_bus; + } + adap = &priv->adap; + bit_data = &priv->bit_data; + pdata = &priv->pdata; + + if (pdev->dev.of_node) { + pdata->sda_pin = sda_pin; + pdata->scl_pin = scl_pin; + of_i2c_gpio_get_props(pdev->dev.of_node, pdata); + } else { + memcpy(pdata, pdev->dev.platform_data, sizeof(*pdata)); + } if (pdata->sda_is_open_drain) { gpio_direction_output(pdata->sda_pin, 1); @@ -211,9 +236,9 @@ static int i2c_gpio_probe(struct platform_device *pdev) return 0; err_add_bus: - gpio_free(pdata->scl_pin); + gpio_free(scl_pin); err_request_scl: - gpio_free(pdata->sda_pin); + gpio_free(sda_pin); err_request_sda: return ret; } diff --git a/drivers/i2c/busses/i2c-intel-mid.c b/drivers/i2c/busses/i2c-intel-mid.c index 323fa018ffd5..0fb659726ffc 100644 --- a/drivers/i2c/busses/i2c-intel-mid.c +++ b/drivers/i2c/busses/i2c-intel-mid.c @@ -1082,8 +1082,7 @@ static void intel_mid_i2c_remove(struct pci_dev *dev) { struct intel_mid_i2c_private *mrst = pci_get_drvdata(dev); intel_mid_i2c_disable(&mrst->adap); - if (i2c_del_adapter(&mrst->adap)) - dev_err(&dev->dev, "Failed to delete i2c adapter"); + i2c_del_adapter(&mrst->adap); free_irq(dev->irq, mrst); iounmap(mrst->base); diff --git a/drivers/i2c/busses/i2c-ismt.c b/drivers/i2c/busses/i2c-ismt.c index 130f02cc9d94..cd82eb44e4c4 100644 --- a/drivers/i2c/busses/i2c-ismt.c +++ b/drivers/i2c/busses/i2c-ismt.c @@ -183,7 +183,7 @@ struct ismt_priv { /** * ismt_ids - PCI device IDs supported by this driver */ -static const DEFINE_PCI_DEVICE_TABLE(ismt_ids) = { +static DEFINE_PCI_DEVICE_TABLE(ismt_ids) = { { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_S1200_SMT0) }, { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_S1200_SMT1) }, { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_AVOTON_SMT) }, diff --git a/drivers/i2c/busses/i2c-mv64xxx.c b/drivers/i2c/busses/i2c-mv64xxx.c index 8b20ef8524ac..3bbd65d35a5e 100644 --- a/drivers/i2c/busses/i2c-mv64xxx.c +++ b/drivers/i2c/busses/i2c-mv64xxx.c @@ -701,9 +701,8 @@ static int mv64xxx_i2c_remove(struct platform_device *dev) { struct mv64xxx_i2c_data *drv_data = platform_get_drvdata(dev); - int rc; - rc = i2c_del_adapter(&drv_data->adapter); + i2c_del_adapter(&drv_data->adapter); free_irq(drv_data->irq, drv_data); mv64xxx_i2c_unmap_regs(drv_data); #if defined(CONFIG_HAVE_CLK) @@ -715,7 +714,7 @@ mv64xxx_i2c_remove(struct platform_device *dev) #endif kfree(drv_data); - return rc; + return 0; } static const struct of_device_id mv64xxx_i2c_of_match_table[] = { diff --git a/drivers/i2c/busses/i2c-mxs.c b/drivers/i2c/busses/i2c-mxs.c index 120f24646696..c67d89fc6254 100644 --- a/drivers/i2c/busses/i2c-mxs.c +++ b/drivers/i2c/busses/i2c-mxs.c @@ -56,6 +56,7 @@ #define MXS_I2C_CTRL1_SET (0x44) #define MXS_I2C_CTRL1_CLR (0x48) +#define MXS_I2C_CTRL1_CLR_GOT_A_NAK 0x10000000 #define MXS_I2C_CTRL1_BUS_FREE_IRQ 0x80 #define MXS_I2C_CTRL1_DATA_ENGINE_CMPLT_IRQ 0x40 #define MXS_I2C_CTRL1_NO_SLAVE_ACK_IRQ 0x20 @@ -65,6 +66,10 @@ #define MXS_I2C_CTRL1_SLAVE_STOP_IRQ 0x02 #define MXS_I2C_CTRL1_SLAVE_IRQ 0x01 +#define MXS_I2C_STAT (0x50) +#define MXS_I2C_STAT_BUS_BUSY 0x00000800 +#define MXS_I2C_STAT_CLK_GEN_BUSY 0x00000400 + #define MXS_I2C_DATA (0xa0) #define MXS_I2C_DEBUG0 (0xb0) @@ -297,12 +302,10 @@ static int mxs_i2c_pio_wait_dmareq(struct mxs_i2c_dev *i2c) cond_resched(); } - writel(MXS_I2C_DEBUG0_DMAREQ, i2c->regs + MXS_I2C_DEBUG0_CLR); - return 0; } -static int mxs_i2c_pio_wait_cplt(struct mxs_i2c_dev *i2c) +static int mxs_i2c_pio_wait_cplt(struct mxs_i2c_dev *i2c, int last) { unsigned long timeout = jiffies + msecs_to_jiffies(1000); @@ -323,9 +326,50 @@ static int mxs_i2c_pio_wait_cplt(struct mxs_i2c_dev *i2c) writel(MXS_I2C_CTRL1_DATA_ENGINE_CMPLT_IRQ, i2c->regs + MXS_I2C_CTRL1_CLR); + /* + * When ending a transfer with a stop, we have to wait for the bus to + * go idle before we report the transfer as completed. Otherwise the + * start of the next transfer may race with the end of the current one. + */ + while (last && (readl(i2c->regs + MXS_I2C_STAT) & + (MXS_I2C_STAT_BUS_BUSY | MXS_I2C_STAT_CLK_GEN_BUSY))) { + if (time_after(jiffies, timeout)) + return -ETIMEDOUT; + cond_resched(); + } + return 0; } +static int mxs_i2c_pio_check_error_state(struct mxs_i2c_dev *i2c) +{ + u32 state; + + state = readl(i2c->regs + MXS_I2C_CTRL1_CLR) & MXS_I2C_IRQ_MASK; + + if (state & MXS_I2C_CTRL1_NO_SLAVE_ACK_IRQ) + i2c->cmd_err = -ENXIO; + else if (state & (MXS_I2C_CTRL1_EARLY_TERM_IRQ | + MXS_I2C_CTRL1_MASTER_LOSS_IRQ | + MXS_I2C_CTRL1_SLAVE_STOP_IRQ | + MXS_I2C_CTRL1_SLAVE_IRQ)) + i2c->cmd_err = -EIO; + + return i2c->cmd_err; +} + +static void mxs_i2c_pio_trigger_cmd(struct mxs_i2c_dev *i2c, u32 cmd) +{ + u32 reg; + + writel(cmd, i2c->regs + MXS_I2C_CTRL0); + + /* readback makes sure the write is latched into hardware */ + reg = readl(i2c->regs + MXS_I2C_CTRL0); + reg |= MXS_I2C_CTRL0_RUN; + writel(reg, i2c->regs + MXS_I2C_CTRL0); +} + static int mxs_i2c_pio_setup_xfer(struct i2c_adapter *adap, struct i2c_msg *msg, uint32_t flags) { @@ -341,23 +385,26 @@ static int mxs_i2c_pio_setup_xfer(struct i2c_adapter *adap, addr_data |= I2C_SMBUS_READ; /* SELECT command. */ - writel(MXS_I2C_CTRL0_RUN | MXS_CMD_I2C_SELECT, - i2c->regs + MXS_I2C_CTRL0); + mxs_i2c_pio_trigger_cmd(i2c, MXS_CMD_I2C_SELECT); ret = mxs_i2c_pio_wait_dmareq(i2c); if (ret) return ret; writel(addr_data, i2c->regs + MXS_I2C_DATA); + writel(MXS_I2C_DEBUG0_DMAREQ, i2c->regs + MXS_I2C_DEBUG0_CLR); - ret = mxs_i2c_pio_wait_cplt(i2c); + ret = mxs_i2c_pio_wait_cplt(i2c, 0); if (ret) return ret; + if (mxs_i2c_pio_check_error_state(i2c)) + goto cleanup; + /* READ command. */ - writel(MXS_I2C_CTRL0_RUN | MXS_CMD_I2C_READ | flags | - MXS_I2C_CTRL0_XFER_COUNT(msg->len), - i2c->regs + MXS_I2C_CTRL0); + mxs_i2c_pio_trigger_cmd(i2c, + MXS_CMD_I2C_READ | flags | + MXS_I2C_CTRL0_XFER_COUNT(msg->len)); for (i = 0; i < msg->len; i++) { if ((i & 3) == 0) { @@ -365,6 +412,8 @@ static int mxs_i2c_pio_setup_xfer(struct i2c_adapter *adap, if (ret) return ret; data = readl(i2c->regs + MXS_I2C_DATA); + writel(MXS_I2C_DEBUG0_DMAREQ, + i2c->regs + MXS_I2C_DEBUG0_CLR); } msg->buf[i] = data & 0xff; data >>= 8; @@ -373,9 +422,9 @@ static int mxs_i2c_pio_setup_xfer(struct i2c_adapter *adap, addr_data |= I2C_SMBUS_WRITE; /* WRITE command. */ - writel(MXS_I2C_CTRL0_RUN | MXS_CMD_I2C_WRITE | flags | - MXS_I2C_CTRL0_XFER_COUNT(msg->len + 1), - i2c->regs + MXS_I2C_CTRL0); + mxs_i2c_pio_trigger_cmd(i2c, + MXS_CMD_I2C_WRITE | flags | + MXS_I2C_CTRL0_XFER_COUNT(msg->len + 1)); /* * The LSB of data buffer is the first byte blasted across @@ -391,6 +440,8 @@ static int mxs_i2c_pio_setup_xfer(struct i2c_adapter *adap, if (ret) return ret; writel(data, i2c->regs + MXS_I2C_DATA); + writel(MXS_I2C_DEBUG0_DMAREQ, + i2c->regs + MXS_I2C_DEBUG0_CLR); } } @@ -401,13 +452,19 @@ static int mxs_i2c_pio_setup_xfer(struct i2c_adapter *adap, if (ret) return ret; writel(data, i2c->regs + MXS_I2C_DATA); + writel(MXS_I2C_DEBUG0_DMAREQ, + i2c->regs + MXS_I2C_DEBUG0_CLR); } } - ret = mxs_i2c_pio_wait_cplt(i2c); + ret = mxs_i2c_pio_wait_cplt(i2c, flags & MXS_I2C_CTRL0_POST_SEND_STOP); if (ret) return ret; + /* make sure we capture any occurred error into cmd_err */ + mxs_i2c_pio_check_error_state(i2c); + +cleanup: /* Clear any dangling IRQs and re-enable interrupts. */ writel(MXS_I2C_IRQ_MASK, i2c->regs + MXS_I2C_CTRL1_CLR); writel(MXS_I2C_IRQ_MASK << 8, i2c->regs + MXS_I2C_CTRL1_SET); @@ -439,12 +496,12 @@ static int mxs_i2c_xfer_msg(struct i2c_adapter *adap, struct i2c_msg *msg, * using PIO mode while longer transfers use DMA. The 8 byte border is * based on this empirical measurement and a lot of previous frobbing. */ + i2c->cmd_err = 0; if (msg->len < 8) { ret = mxs_i2c_pio_setup_xfer(adap, msg, flags); if (ret) mxs_i2c_reset(i2c); } else { - i2c->cmd_err = 0; INIT_COMPLETION(i2c->cmd_complete); ret = mxs_i2c_dma_setup_xfer(adap, msg, flags); if (ret) @@ -454,13 +511,19 @@ static int mxs_i2c_xfer_msg(struct i2c_adapter *adap, struct i2c_msg *msg, msecs_to_jiffies(1000)); if (ret == 0) goto timeout; + } - if (i2c->cmd_err == -ENXIO) - mxs_i2c_reset(i2c); - - ret = i2c->cmd_err; + if (i2c->cmd_err == -ENXIO) { + /* + * If the transfer fails with a NAK from the slave the + * controller halts until it gets told to return to idle state. + */ + writel(MXS_I2C_CTRL1_CLR_GOT_A_NAK, + i2c->regs + MXS_I2C_CTRL1_SET); } + ret = i2c->cmd_err; + dev_dbg(i2c->dev, "Done with err=%d\n", ret); return ret; @@ -686,11 +749,8 @@ static int mxs_i2c_probe(struct platform_device *pdev) static int mxs_i2c_remove(struct platform_device *pdev) { struct mxs_i2c_dev *i2c = platform_get_drvdata(pdev); - int ret; - ret = i2c_del_adapter(&i2c->adapter); - if (ret) - return -EBUSY; + i2c_del_adapter(&i2c->adapter); if (i2c->dmach) dma_release_channel(i2c->dmach); diff --git a/drivers/i2c/busses/i2c-nforce2-s4985.c b/drivers/i2c/busses/i2c-nforce2-s4985.c index 29015eb9ca46..2ca268d6140b 100644 --- a/drivers/i2c/busses/i2c-nforce2-s4985.c +++ b/drivers/i2c/busses/i2c-nforce2-s4985.c @@ -164,11 +164,7 @@ static int __init nforce2_s4985_init(void) } /* Unregister physical bus */ - error = i2c_del_adapter(nforce2_smbus); - if (error) { - dev_err(&nforce2_smbus->dev, "Physical bus removal failed\n"); - goto ERROR0; - } + i2c_del_adapter(nforce2_smbus); printk(KERN_INFO "Enabling SMBus multiplexing for Tyan S4985\n"); /* Define the 5 virtual adapters and algorithms structures */ diff --git a/drivers/i2c/busses/i2c-octeon.c b/drivers/i2c/busses/i2c-octeon.c index 935585ef4d39..956fe320f313 100644 --- a/drivers/i2c/busses/i2c-octeon.c +++ b/drivers/i2c/busses/i2c-octeon.c @@ -183,7 +183,7 @@ static irqreturn_t octeon_i2c_isr(int irq, void *dev_id) struct octeon_i2c *i2c = dev_id; octeon_i2c_int_disable(i2c); - wake_up_interruptible(&i2c->queue); + wake_up(&i2c->queue); return IRQ_HANDLED; } @@ -206,9 +206,9 @@ static int octeon_i2c_wait(struct octeon_i2c *i2c) octeon_i2c_int_enable(i2c); - result = wait_event_interruptible_timeout(i2c->queue, - octeon_i2c_test_iflg(i2c), - i2c->adap.timeout); + result = wait_event_timeout(i2c->queue, + octeon_i2c_test_iflg(i2c), + i2c->adap.timeout); octeon_i2c_int_disable(i2c); @@ -440,7 +440,7 @@ static struct i2c_adapter octeon_i2c_ops = { .owner = THIS_MODULE, .name = "OCTEON adapter", .algo = &octeon_i2c_algo, - .timeout = 2, + .timeout = HZ / 50, }; /** diff --git a/drivers/i2c/busses/i2c-powermac.c b/drivers/i2c/busses/i2c-powermac.c index da54e673449d..8dc90da1e6e6 100644 --- a/drivers/i2c/busses/i2c-powermac.c +++ b/drivers/i2c/busses/i2c-powermac.c @@ -213,14 +213,8 @@ static const struct i2c_algorithm i2c_powermac_algorithm = { static int i2c_powermac_remove(struct platform_device *dev) { struct i2c_adapter *adapter = platform_get_drvdata(dev); - int rc; - - rc = i2c_del_adapter(adapter); - /* We aren't that prepared to deal with this... */ - if (rc) - printk(KERN_WARNING - "i2c-powermac.c: Failed to remove bus %s !\n", - adapter->name); + + i2c_del_adapter(adapter); memset(adapter, 0, sizeof(*adapter)); return 0; diff --git a/drivers/i2c/busses/i2c-puv3.c b/drivers/i2c/busses/i2c-puv3.c index 8acef657abf3..37a84c87c5fd 100644 --- a/drivers/i2c/busses/i2c-puv3.c +++ b/drivers/i2c/busses/i2c-puv3.c @@ -234,21 +234,15 @@ static int puv3_i2c_remove(struct platform_device *pdev) { struct i2c_adapter *adapter = platform_get_drvdata(pdev); struct resource *mem; - int rc; - rc = i2c_del_adapter(adapter); - if (rc) { - dev_err(&pdev->dev, "Adapter '%s' delete fail\n", - adapter->name); - return rc; - } + i2c_del_adapter(adapter); put_device(&pdev->dev); mem = platform_get_resource(pdev, IORESOURCE_MEM, 0); release_mem_region(mem->start, resource_size(mem)); - return rc; + return 0; } #ifdef CONFIG_PM diff --git a/drivers/i2c/busses/i2c-pxa.c b/drivers/i2c/busses/i2c-pxa.c index 1e88e8d66c55..ea6d45d1dcd6 100644 --- a/drivers/i2c/busses/i2c-pxa.c +++ b/drivers/i2c/busses/i2c-pxa.c @@ -1053,16 +1053,13 @@ static int i2c_pxa_probe_dt(struct platform_device *pdev, struct pxa_i2c *i2c, struct device_node *np = pdev->dev.of_node; const struct of_device_id *of_id = of_match_device(i2c_pxa_dt_ids, &pdev->dev); - int ret; if (!of_id) return 1; - ret = of_alias_get_id(np, "i2c"); - if (ret < 0) { - dev_err(&pdev->dev, "failed to get alias id, errno %d\n", ret); - return ret; - } - pdev->id = ret; + + /* For device tree we always use the dynamic or alias-assigned ID */ + i2c->adap.nr = -1; + if (of_get_property(np, "mrvl,i2c-polling", NULL)) i2c->use_pio = 1; if (of_get_property(np, "mrvl,i2c-fast-mode", NULL)) @@ -1100,6 +1097,9 @@ static int i2c_pxa_probe(struct platform_device *dev) goto emalloc; } + /* Default adapter num to device id; i2c_pxa_probe_dt can override. */ + i2c->adap.nr = dev->id; + ret = i2c_pxa_probe_dt(dev, i2c, &i2c_type); if (ret > 0) ret = i2c_pxa_probe_pdata(dev, i2c, &i2c_type); @@ -1124,9 +1124,7 @@ static int i2c_pxa_probe(struct platform_device *dev) spin_lock_init(&i2c->lock); init_waitqueue_head(&i2c->wait); - i2c->adap.nr = dev->id; - snprintf(i2c->adap.name, sizeof(i2c->adap.name), "pxa_i2c-i2c.%u", - i2c->adap.nr); + strlcpy(i2c->adap.name, "pxa_i2c-i2c", sizeof(i2c->adap.name)); i2c->clk = clk_get(&dev->dev, NULL); if (IS_ERR(i2c->clk)) { @@ -1169,7 +1167,7 @@ static int i2c_pxa_probe(struct platform_device *dev) } else { i2c->adap.algo = &i2c_pxa_algorithm; ret = request_irq(irq, i2c_pxa_handler, IRQF_SHARED, - i2c->adap.name, i2c); + dev_name(&dev->dev), i2c); if (ret) goto ereqirq; } diff --git a/drivers/i2c/busses/i2c-s3c2410.c b/drivers/i2c/busses/i2c-s3c2410.c index f6b880ba1932..6e8ee92ab553 100644 --- a/drivers/i2c/busses/i2c-s3c2410.c +++ b/drivers/i2c/busses/i2c-s3c2410.c @@ -42,9 +42,46 @@ #include <asm/irq.h> -#include <plat/regs-iic.h> #include <linux/platform_data/i2c-s3c2410.h> +/* see s3c2410x user guide, v1.1, section 9 (p447) for more info */ + +#define S3C2410_IICCON 0x00 +#define S3C2410_IICSTAT 0x04 +#define S3C2410_IICADD 0x08 +#define S3C2410_IICDS 0x0C +#define S3C2440_IICLC 0x10 + +#define S3C2410_IICCON_ACKEN (1 << 7) +#define S3C2410_IICCON_TXDIV_16 (0 << 6) +#define S3C2410_IICCON_TXDIV_512 (1 << 6) +#define S3C2410_IICCON_IRQEN (1 << 5) +#define S3C2410_IICCON_IRQPEND (1 << 4) +#define S3C2410_IICCON_SCALE(x) ((x) & 0xf) +#define S3C2410_IICCON_SCALEMASK (0xf) + +#define S3C2410_IICSTAT_MASTER_RX (2 << 6) +#define S3C2410_IICSTAT_MASTER_TX (3 << 6) +#define S3C2410_IICSTAT_SLAVE_RX (0 << 6) +#define S3C2410_IICSTAT_SLAVE_TX (1 << 6) +#define S3C2410_IICSTAT_MODEMASK (3 << 6) + +#define S3C2410_IICSTAT_START (1 << 5) +#define S3C2410_IICSTAT_BUSBUSY (1 << 5) +#define S3C2410_IICSTAT_TXRXEN (1 << 4) +#define S3C2410_IICSTAT_ARBITR (1 << 3) +#define S3C2410_IICSTAT_ASSLAVE (1 << 2) +#define S3C2410_IICSTAT_ADDR0 (1 << 1) +#define S3C2410_IICSTAT_LASTBIT (1 << 0) + +#define S3C2410_IICLC_SDA_DELAY0 (0 << 0) +#define S3C2410_IICLC_SDA_DELAY5 (1 << 0) +#define S3C2410_IICLC_SDA_DELAY10 (2 << 0) +#define S3C2410_IICLC_SDA_DELAY15 (3 << 0) +#define S3C2410_IICLC_SDA_DELAY_MASK (3 << 0) + +#define S3C2410_IICLC_FILTER_ON (1 << 2) + /* Treat S3C2410 as baseline hardware, anything else is supported via quirks */ #define QUIRK_S3C2440 (1 << 0) #define QUIRK_HDMIPHY (1 << 1) @@ -309,6 +346,12 @@ static inline int is_lastmsg(struct s3c24xx_i2c *i2c) static inline int is_msglast(struct s3c24xx_i2c *i2c) { + /* msg->len is always 1 for the first byte of smbus block read. + * Actual length will be read from slave. More bytes will be + * read according to the length then. */ + if (i2c->msg->flags & I2C_M_RECV_LEN && i2c->msg->len == 1) + return 0; + return i2c->msg_ptr == i2c->msg->len-1; } @@ -448,6 +491,9 @@ static int i2c_s3c_irq_nextbyte(struct s3c24xx_i2c *i2c, unsigned long iicstat) byte = readb(i2c->regs + S3C2410_IICDS); i2c->msg->buf[i2c->msg_ptr++] = byte; + /* Add actual length to read for smbus block read */ + if (i2c->msg->flags & I2C_M_RECV_LEN && i2c->msg->len == 1) + i2c->msg->len += byte; prepare_read: if (is_msglast(i2c)) { /* last byte of buffer */ diff --git a/drivers/i2c/busses/i2c-tegra.c b/drivers/i2c/busses/i2c-tegra.c index b714776b6ddd..b60ff90adc39 100644 --- a/drivers/i2c/busses/i2c-tegra.c +++ b/drivers/i2c/busses/i2c-tegra.c @@ -25,7 +25,6 @@ #include <linux/interrupt.h> #include <linux/delay.h> #include <linux/slab.h> -#include <linux/i2c-tegra.h> #include <linux/of_i2c.h> #include <linux/of_device.h> #include <linux/module.h> @@ -172,7 +171,7 @@ struct tegra_i2c_dev { u8 *msg_buf; size_t msg_buf_remaining; int msg_read; - unsigned long bus_clk_rate; + u32 bus_clk_rate; bool is_suspended; }; @@ -694,7 +693,6 @@ static const struct tegra_i2c_hw_feature tegra114_i2c_hw = { .clk_divisor_std_fast_mode = 0x19, }; -#if defined(CONFIG_OF) /* Match table for of_platform binding */ static const struct of_device_id tegra_i2c_of_match[] = { { .compatible = "nvidia,tegra114-i2c", .data = &tegra114_i2c_hw, }, @@ -704,16 +702,13 @@ static const struct of_device_id tegra_i2c_of_match[] = { {}, }; MODULE_DEVICE_TABLE(of, tegra_i2c_of_match); -#endif static int tegra_i2c_probe(struct platform_device *pdev) { struct tegra_i2c_dev *i2c_dev; - struct tegra_i2c_platform_data *pdata = pdev->dev.platform_data; struct resource *res; struct clk *div_clk; struct clk *fast_clk; - const unsigned int *prop; void __iomem *base; int irq; int ret = 0; @@ -754,23 +749,16 @@ static int tegra_i2c_probe(struct platform_device *pdev) i2c_dev->cont_id = pdev->id; i2c_dev->dev = &pdev->dev; - i2c_dev->bus_clk_rate = 100000; /* default clock rate */ - if (pdata) { - i2c_dev->bus_clk_rate = pdata->bus_clk_rate; - - } else if (i2c_dev->dev->of_node) { /* if there is a device tree node ... */ - prop = of_get_property(i2c_dev->dev->of_node, - "clock-frequency", NULL); - if (prop) - i2c_dev->bus_clk_rate = be32_to_cpup(prop); - } + ret = of_property_read_u32(i2c_dev->dev->of_node, "clock-frequency", + &i2c_dev->bus_clk_rate); + if (ret) + i2c_dev->bus_clk_rate = 100000; /* default clock rate */ i2c_dev->hw = &tegra20_i2c_hw; if (pdev->dev.of_node) { const struct of_device_id *match; - match = of_match_device(of_match_ptr(tegra_i2c_of_match), - &pdev->dev); + match = of_match_device(tegra_i2c_of_match, &pdev->dev); i2c_dev->hw = match->data; i2c_dev->is_dvc = of_device_is_compatible(pdev->dev.of_node, "nvidia,tegra20-i2c-dvc"); @@ -876,7 +864,7 @@ static struct platform_driver tegra_i2c_driver = { .driver = { .name = "tegra-i2c", .owner = THIS_MODULE, - .of_match_table = of_match_ptr(tegra_i2c_of_match), + .of_match_table = tegra_i2c_of_match, .pm = TEGRA_I2C_PM, }, }; diff --git a/drivers/i2c/busses/i2c-viperboard.c b/drivers/i2c/busses/i2c-viperboard.c index f45c32c1ace6..c68450cd8d5f 100644 --- a/drivers/i2c/busses/i2c-viperboard.c +++ b/drivers/i2c/busses/i2c-viperboard.c @@ -421,11 +421,10 @@ error: static int vprbrd_i2c_remove(struct platform_device *pdev) { struct vprbrd_i2c *vb_i2c = platform_get_drvdata(pdev); - int ret; - ret = i2c_del_adapter(&vb_i2c->i2c); + i2c_del_adapter(&vb_i2c->i2c); - return ret; + return 0; } static struct platform_driver vprbrd_i2c_driver = { diff --git a/drivers/i2c/busses/i2c-xiic.c b/drivers/i2c/busses/i2c-xiic.c index 332c720fb3fe..3d0f0520c1b4 100644 --- a/drivers/i2c/busses/i2c-xiic.c +++ b/drivers/i2c/busses/i2c-xiic.c @@ -312,10 +312,8 @@ static void xiic_fill_tx_fifo(struct xiic_i2c *i2c) /* last message in transfer -> STOP */ data |= XIIC_TX_DYN_STOP_MASK; dev_dbg(i2c->adap.dev.parent, "%s TX STOP\n", __func__); - - xiic_setreg16(i2c, XIIC_DTR_REG_OFFSET, data); - } else - xiic_setreg8(i2c, XIIC_DTR_REG_OFFSET, data); + } + xiic_setreg16(i2c, XIIC_DTR_REG_OFFSET, data); } } diff --git a/drivers/i2c/i2c-core.c b/drivers/i2c/i2c-core.c index 991d38daa87d..6b63cc7eb71e 100644 --- a/drivers/i2c/i2c-core.c +++ b/drivers/i2c/i2c-core.c @@ -27,7 +27,9 @@ #include <linux/module.h> #include <linux/kernel.h> +#include <linux/delay.h> #include <linux/errno.h> +#include <linux/gpio.h> #include <linux/slab.h> #include <linux/i2c.h> #include <linux/init.h> @@ -47,7 +49,7 @@ /* core_lock protects i2c_adapter_idr, and guarantees that device detection, deletion of detected devices, and attach_adapter - and detach_adapter calls are serialized */ + calls are serialized */ static DEFINE_MUTEX(core_lock); static DEFINE_IDR(i2c_adapter_idr); @@ -91,7 +93,6 @@ static int i2c_device_match(struct device *dev, struct device_driver *drv) return 0; } -#ifdef CONFIG_HOTPLUG /* uevent helps with hotplug: modprobe -q $(MODALIAS) */ static int i2c_device_uevent(struct device *dev, struct kobj_uevent_env *env) @@ -105,9 +106,129 @@ static int i2c_device_uevent(struct device *dev, struct kobj_uevent_env *env) return 0; } -#else -#define i2c_device_uevent NULL -#endif /* CONFIG_HOTPLUG */ +/* i2c bus recovery routines */ +static int get_scl_gpio_value(struct i2c_adapter *adap) +{ + return gpio_get_value(adap->bus_recovery_info->scl_gpio); +} + +static void set_scl_gpio_value(struct i2c_adapter *adap, int val) +{ + gpio_set_value(adap->bus_recovery_info->scl_gpio, val); +} + +static int get_sda_gpio_value(struct i2c_adapter *adap) +{ + return gpio_get_value(adap->bus_recovery_info->sda_gpio); +} + +static int i2c_get_gpios_for_recovery(struct i2c_adapter *adap) +{ + struct i2c_bus_recovery_info *bri = adap->bus_recovery_info; + struct device *dev = &adap->dev; + int ret = 0; + + ret = gpio_request_one(bri->scl_gpio, GPIOF_OPEN_DRAIN | + GPIOF_OUT_INIT_HIGH, "i2c-scl"); + if (ret) { + dev_warn(dev, "Can't get SCL gpio: %d\n", bri->scl_gpio); + return ret; + } + + if (bri->get_sda) { + if (gpio_request_one(bri->sda_gpio, GPIOF_IN, "i2c-sda")) { + /* work without SDA polling */ + dev_warn(dev, "Can't get SDA gpio: %d. Not using SDA polling\n", + bri->sda_gpio); + bri->get_sda = NULL; + } + } + + return ret; +} + +static void i2c_put_gpios_for_recovery(struct i2c_adapter *adap) +{ + struct i2c_bus_recovery_info *bri = adap->bus_recovery_info; + + if (bri->get_sda) + gpio_free(bri->sda_gpio); + + gpio_free(bri->scl_gpio); +} + +/* + * We are generating clock pulses. ndelay() determines durating of clk pulses. + * We will generate clock with rate 100 KHz and so duration of both clock levels + * is: delay in ns = (10^6 / 100) / 2 + */ +#define RECOVERY_NDELAY 5000 +#define RECOVERY_CLK_CNT 9 + +static int i2c_generic_recovery(struct i2c_adapter *adap) +{ + struct i2c_bus_recovery_info *bri = adap->bus_recovery_info; + int i = 0, val = 1, ret = 0; + + if (bri->prepare_recovery) + bri->prepare_recovery(bri); + + /* + * By this time SCL is high, as we need to give 9 falling-rising edges + */ + while (i++ < RECOVERY_CLK_CNT * 2) { + if (val) { + /* Break if SDA is high */ + if (bri->get_sda && bri->get_sda(adap)) + break; + /* SCL shouldn't be low here */ + if (!bri->get_scl(adap)) { + dev_err(&adap->dev, + "SCL is stuck low, exit recovery\n"); + ret = -EBUSY; + break; + } + } + + val = !val; + bri->set_scl(adap, val); + ndelay(RECOVERY_NDELAY); + } + + if (bri->unprepare_recovery) + bri->unprepare_recovery(bri); + + return ret; +} + +int i2c_generic_scl_recovery(struct i2c_adapter *adap) +{ + adap->bus_recovery_info->set_scl(adap, 1); + return i2c_generic_recovery(adap); +} + +int i2c_generic_gpio_recovery(struct i2c_adapter *adap) +{ + int ret; + + ret = i2c_get_gpios_for_recovery(adap); + if (ret) + return ret; + + ret = i2c_generic_recovery(adap); + i2c_put_gpios_for_recovery(adap); + + return ret; +} + +int i2c_recover_bus(struct i2c_adapter *adap) +{ + if (!adap->bus_recovery_info) + return -EOPNOTSUPP; + + dev_dbg(&adap->dev, "Trying i2c bus recovery\n"); + return adap->bus_recovery_info->recover_bus(adap); +} static int i2c_device_probe(struct device *dev) { @@ -902,6 +1023,39 @@ static int i2c_register_adapter(struct i2c_adapter *adap) "Failed to create compatibility class link\n"); #endif + /* bus recovery specific initialization */ + if (adap->bus_recovery_info) { + struct i2c_bus_recovery_info *bri = adap->bus_recovery_info; + + if (!bri->recover_bus) { + dev_err(&adap->dev, "No recover_bus() found, not using recovery\n"); + adap->bus_recovery_info = NULL; + goto exit_recovery; + } + + /* Generic GPIO recovery */ + if (bri->recover_bus == i2c_generic_gpio_recovery) { + if (!gpio_is_valid(bri->scl_gpio)) { + dev_err(&adap->dev, "Invalid SCL gpio, not using recovery\n"); + adap->bus_recovery_info = NULL; + goto exit_recovery; + } + + if (gpio_is_valid(bri->sda_gpio)) + bri->get_sda = get_sda_gpio_value; + else + bri->get_sda = NULL; + + bri->get_scl = get_scl_gpio_value; + bri->set_scl = set_scl_gpio_value; + } else if (!bri->set_scl || !bri->get_scl) { + /* Generic SCL recovery */ + dev_err(&adap->dev, "No {get|set}_gpio() found, not using recovery\n"); + adap->bus_recovery_info = NULL; + } + } + +exit_recovery: /* create pre-declared device nodes */ if (adap->nr < __i2c_first_dynamic_bus_num) i2c_scan_static_board_info(adap); @@ -921,13 +1075,35 @@ out_list: } /** + * __i2c_add_numbered_adapter - i2c_add_numbered_adapter where nr is never -1 + * @adap: the adapter to register (with adap->nr initialized) + * Context: can sleep + * + * See i2c_add_numbered_adapter() for details. + */ +static int __i2c_add_numbered_adapter(struct i2c_adapter *adap) +{ + int id; + + mutex_lock(&core_lock); + id = idr_alloc(&i2c_adapter_idr, adap, adap->nr, adap->nr + 1, + GFP_KERNEL); + mutex_unlock(&core_lock); + if (id < 0) + return id == -ENOSPC ? -EBUSY : id; + + return i2c_register_adapter(adap); +} + +/** * i2c_add_adapter - declare i2c adapter, use dynamic bus number * @adapter: the adapter to add * Context: can sleep * * This routine is used to declare an I2C adapter when its bus number - * doesn't matter. Examples: for I2C adapters dynamically added by - * USB links or PCI plugin cards. + * doesn't matter or when its bus number is specified by an dt alias. + * Examples of bases when the bus number doesn't matter: I2C adapters + * dynamically added by USB links or PCI plugin cards. * * When this returns zero, a new bus number was allocated and stored * in adap->nr, and the specified adapter became available for clients. @@ -935,8 +1111,17 @@ out_list: */ int i2c_add_adapter(struct i2c_adapter *adapter) { + struct device *dev = &adapter->dev; int id; + if (dev->of_node) { + id = of_alias_get_id(dev->of_node, "i2c"); + if (id >= 0) { + adapter->nr = id; + return __i2c_add_numbered_adapter(adapter); + } + } + mutex_lock(&core_lock); id = idr_alloc(&i2c_adapter_idr, adapter, __i2c_first_dynamic_bus_num, 0, GFP_KERNEL); @@ -975,26 +1160,17 @@ EXPORT_SYMBOL(i2c_add_adapter); */ int i2c_add_numbered_adapter(struct i2c_adapter *adap) { - int id; - if (adap->nr == -1) /* -1 means dynamically assign bus id */ return i2c_add_adapter(adap); - mutex_lock(&core_lock); - id = idr_alloc(&i2c_adapter_idr, adap, adap->nr, adap->nr + 1, - GFP_KERNEL); - mutex_unlock(&core_lock); - if (id < 0) - return id == -ENOSPC ? -EBUSY : id; - return i2c_register_adapter(adap); + return __i2c_add_numbered_adapter(adap); } EXPORT_SYMBOL_GPL(i2c_add_numbered_adapter); -static int i2c_do_del_adapter(struct i2c_driver *driver, +static void i2c_do_del_adapter(struct i2c_driver *driver, struct i2c_adapter *adapter) { struct i2c_client *client, *_n; - int res; /* Remove the devices we created ourselves as the result of hardware * probing (using a driver's detect method) */ @@ -1006,16 +1182,6 @@ static int i2c_do_del_adapter(struct i2c_driver *driver, i2c_unregister_device(client); } } - - if (!driver->detach_adapter) - return 0; - dev_warn(&adapter->dev, "%s: detach_adapter method is deprecated\n", - driver->driver.name); - res = driver->detach_adapter(adapter); - if (res) - dev_err(&adapter->dev, "detach_adapter failed (%d) " - "for driver [%s]\n", res, driver->driver.name); - return res; } static int __unregister_client(struct device *dev, void *dummy) @@ -1036,7 +1202,8 @@ static int __unregister_dummy(struct device *dev, void *dummy) static int __process_removed_adapter(struct device_driver *d, void *data) { - return i2c_do_del_adapter(to_i2c_driver(d), data); + i2c_do_del_adapter(to_i2c_driver(d), data); + return 0; } /** @@ -1047,9 +1214,8 @@ static int __process_removed_adapter(struct device_driver *d, void *data) * This unregisters an I2C adapter which was previously registered * by @i2c_add_adapter or @i2c_add_numbered_adapter. */ -int i2c_del_adapter(struct i2c_adapter *adap) +void i2c_del_adapter(struct i2c_adapter *adap) { - int res = 0; struct i2c_adapter *found; struct i2c_client *client, *next; @@ -1060,16 +1226,14 @@ int i2c_del_adapter(struct i2c_adapter *adap) if (found != adap) { pr_debug("i2c-core: attempting to delete unregistered " "adapter [%s]\n", adap->name); - return -EINVAL; + return; } /* Tell drivers about this removal */ mutex_lock(&core_lock); - res = bus_for_each_drv(&i2c_bus_type, NULL, adap, + bus_for_each_drv(&i2c_bus_type, NULL, adap, __process_removed_adapter); mutex_unlock(&core_lock); - if (res) - return res; /* Remove devices instantiated from sysfs */ mutex_lock_nested(&adap->userspace_clients_lock, @@ -1088,8 +1252,8 @@ int i2c_del_adapter(struct i2c_adapter *adap) * we can't remove the dummy devices during the first pass: they * could have been instantiated by real devices wishing to clean * them up properly, so we give them a chance to do that first. */ - res = device_for_each_child(&adap->dev, NULL, __unregister_client); - res = device_for_each_child(&adap->dev, NULL, __unregister_dummy); + device_for_each_child(&adap->dev, NULL, __unregister_client); + device_for_each_child(&adap->dev, NULL, __unregister_dummy); #ifdef CONFIG_I2C_COMPAT class_compat_remove_link(i2c_adapter_compat_class, &adap->dev, @@ -1114,8 +1278,6 @@ int i2c_del_adapter(struct i2c_adapter *adap) /* Clear the device structure in case this adapter is ever going to be added again */ memset(&adap->dev, 0, sizeof(adap->dev)); - - return 0; } EXPORT_SYMBOL(i2c_del_adapter); @@ -1185,9 +1347,9 @@ EXPORT_SYMBOL(i2c_register_driver); static int __process_removed_driver(struct device *dev, void *data) { - if (dev->type != &i2c_adapter_type) - return 0; - return i2c_do_del_adapter(data, to_i2c_adapter(dev)); + if (dev->type == &i2c_adapter_type) + i2c_do_del_adapter(data, to_i2c_adapter(dev)); + return 0; } /** diff --git a/drivers/i2c/i2c-mux.c b/drivers/i2c/i2c-mux.c index d94e0ce78277..7409ebb33c47 100644 --- a/drivers/i2c/i2c-mux.c +++ b/drivers/i2c/i2c-mux.c @@ -191,17 +191,12 @@ struct i2c_adapter *i2c_add_mux_adapter(struct i2c_adapter *parent, } EXPORT_SYMBOL_GPL(i2c_add_mux_adapter); -int i2c_del_mux_adapter(struct i2c_adapter *adap) +void i2c_del_mux_adapter(struct i2c_adapter *adap) { struct i2c_mux_priv *priv = adap->algo_data; - int ret; - ret = i2c_del_adapter(adap); - if (ret < 0) - return ret; + i2c_del_adapter(adap); kfree(priv); - - return 0; } EXPORT_SYMBOL_GPL(i2c_del_mux_adapter); diff --git a/drivers/i2c/muxes/Kconfig b/drivers/i2c/muxes/Kconfig index 0be5b83c08fa..5faf244d2476 100644 --- a/drivers/i2c/muxes/Kconfig +++ b/drivers/i2c/muxes/Kconfig @@ -5,6 +5,18 @@ menu "Multiplexer I2C Chip support" depends on I2C_MUX +config I2C_ARB_GPIO_CHALLENGE + tristate "GPIO-based I2C arbitration" + depends on GENERIC_GPIO && OF + help + If you say yes to this option, support will be included for an + I2C multimaster arbitration scheme using GPIOs and a challenge & + response mechanism where masters have to claim the bus by asserting + a GPIO. + + This driver can also be built as a module. If so, the module + will be called i2c-arb-gpio-challenge. + config I2C_MUX_GPIO tristate "GPIO-based I2C multiplexer" depends on GENERIC_GPIO diff --git a/drivers/i2c/muxes/Makefile b/drivers/i2c/muxes/Makefile index 76da8692afff..465778b5d5dc 100644 --- a/drivers/i2c/muxes/Makefile +++ b/drivers/i2c/muxes/Makefile @@ -1,6 +1,8 @@ # # Makefile for multiplexer I2C chip drivers. +obj-$(CONFIG_I2C_ARB_GPIO_CHALLENGE) += i2c-arb-gpio-challenge.o + obj-$(CONFIG_I2C_MUX_GPIO) += i2c-mux-gpio.o obj-$(CONFIG_I2C_MUX_PCA9541) += i2c-mux-pca9541.o obj-$(CONFIG_I2C_MUX_PCA954x) += i2c-mux-pca954x.o diff --git a/drivers/i2c/muxes/i2c-arb-gpio-challenge.c b/drivers/i2c/muxes/i2c-arb-gpio-challenge.c new file mode 100644 index 000000000000..210b6f7b9028 --- /dev/null +++ b/drivers/i2c/muxes/i2c-arb-gpio-challenge.c @@ -0,0 +1,251 @@ +/* + * GPIO-based I2C Arbitration Using a Challenge & Response Mechanism + * + * Copyright (C) 2012 Google, Inc + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#include <linux/delay.h> +#include <linux/gpio.h> +#include <linux/kernel.h> +#include <linux/i2c.h> +#include <linux/i2c-mux.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/of_i2c.h> +#include <linux/of_gpio.h> +#include <linux/platform_device.h> +#include <linux/slab.h> + + +/** + * struct i2c_arbitrator_data - Driver data for I2C arbitrator + * + * @parent: Parent adapter + * @child: Child bus + * @our_gpio: GPIO we'll use to claim. + * @our_gpio_release: 0 if active high; 1 if active low; AKA if the GPIO == + * this then consider it released. + * @their_gpio: GPIO that the other side will use to claim. + * @their_gpio_release: 0 if active high; 1 if active low; AKA if the GPIO == + * this then consider it released. + * @slew_delay_us: microseconds to wait for a GPIO to go high. + * @wait_retry_us: we'll attempt another claim after this many microseconds. + * @wait_free_us: we'll give up after this many microseconds. + */ + +struct i2c_arbitrator_data { + struct i2c_adapter *parent; + struct i2c_adapter *child; + int our_gpio; + int our_gpio_release; + int their_gpio; + int their_gpio_release; + unsigned int slew_delay_us; + unsigned int wait_retry_us; + unsigned int wait_free_us; +}; + + +/** + * i2c_arbitrator_select - claim the I2C bus + * + * Use the GPIO-based signalling protocol; return -EBUSY if we fail. + */ +static int i2c_arbitrator_select(struct i2c_adapter *adap, void *data, u32 chan) +{ + const struct i2c_arbitrator_data *arb = data; + unsigned long stop_retry, stop_time; + + /* Start a round of trying to claim the bus */ + stop_time = jiffies + usecs_to_jiffies(arb->wait_free_us) + 1; + do { + /* Indicate that we want to claim the bus */ + gpio_set_value(arb->our_gpio, !arb->our_gpio_release); + udelay(arb->slew_delay_us); + + /* Wait for the other master to release it */ + stop_retry = jiffies + usecs_to_jiffies(arb->wait_retry_us) + 1; + while (time_before(jiffies, stop_retry)) { + int gpio_val = !!gpio_get_value(arb->their_gpio); + + if (gpio_val == arb->their_gpio_release) { + /* We got it, so return */ + return 0; + } + + usleep_range(50, 200); + } + + /* It didn't release, so give up, wait, and try again */ + gpio_set_value(arb->our_gpio, arb->our_gpio_release); + + usleep_range(arb->wait_retry_us, arb->wait_retry_us * 2); + } while (time_before(jiffies, stop_time)); + + /* Give up, release our claim */ + gpio_set_value(arb->our_gpio, arb->our_gpio_release); + udelay(arb->slew_delay_us); + dev_err(&adap->dev, "Could not claim bus, timeout\n"); + return -EBUSY; +} + +/** + * i2c_arbitrator_deselect - release the I2C bus + * + * Release the I2C bus using the GPIO-based signalling protocol. + */ +static int i2c_arbitrator_deselect(struct i2c_adapter *adap, void *data, + u32 chan) +{ + const struct i2c_arbitrator_data *arb = data; + + /* Release the bus and wait for the other master to notice */ + gpio_set_value(arb->our_gpio, arb->our_gpio_release); + udelay(arb->slew_delay_us); + + return 0; +} + +static int i2c_arbitrator_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct device_node *np = dev->of_node; + struct device_node *parent_np; + struct i2c_arbitrator_data *arb; + enum of_gpio_flags gpio_flags; + unsigned long out_init; + int ret; + + /* We only support probing from device tree; no platform_data */ + if (!np) { + dev_err(dev, "Cannot find device tree node\n"); + return -ENODEV; + } + if (dev->platform_data) { + dev_err(dev, "Platform data is not supported\n"); + return -EINVAL; + } + + arb = devm_kzalloc(dev, sizeof(*arb), GFP_KERNEL); + if (!arb) { + dev_err(dev, "Cannot allocate i2c_arbitrator_data\n"); + return -ENOMEM; + } + platform_set_drvdata(pdev, arb); + + /* Request GPIOs */ + ret = of_get_named_gpio_flags(np, "our-claim-gpio", 0, &gpio_flags); + if (!gpio_is_valid(ret)) { + if (ret != -EPROBE_DEFER) + dev_err(dev, "Error getting our-claim-gpio\n"); + return ret; + } + arb->our_gpio = ret; + arb->our_gpio_release = !!(gpio_flags & OF_GPIO_ACTIVE_LOW); + out_init = (gpio_flags & OF_GPIO_ACTIVE_LOW) ? + GPIOF_OUT_INIT_HIGH : GPIOF_OUT_INIT_LOW; + ret = devm_gpio_request_one(dev, arb->our_gpio, out_init, + "our-claim-gpio"); + if (ret) { + if (ret != -EPROBE_DEFER) + dev_err(dev, "Error requesting our-claim-gpio\n"); + return ret; + } + + ret = of_get_named_gpio_flags(np, "their-claim-gpios", 0, &gpio_flags); + if (!gpio_is_valid(ret)) { + if (ret != -EPROBE_DEFER) + dev_err(dev, "Error getting their-claim-gpio\n"); + return ret; + } + arb->their_gpio = ret; + arb->their_gpio_release = !!(gpio_flags & OF_GPIO_ACTIVE_LOW); + ret = devm_gpio_request_one(dev, arb->their_gpio, GPIOF_IN, + "their-claim-gpio"); + if (ret) { + if (ret != -EPROBE_DEFER) + dev_err(dev, "Error requesting their-claim-gpio\n"); + return ret; + } + + /* At the moment we only support a single two master (us + 1 other) */ + if (gpio_is_valid(of_get_named_gpio(np, "their-claim-gpios", 1))) { + dev_err(dev, "Only one other master is supported\n"); + return -EINVAL; + } + + /* Arbitration parameters */ + if (of_property_read_u32(np, "slew-delay-us", &arb->slew_delay_us)) + arb->slew_delay_us = 10; + if (of_property_read_u32(np, "wait-retry-us", &arb->wait_retry_us)) + arb->wait_retry_us = 3000; + if (of_property_read_u32(np, "wait-free-us", &arb->wait_free_us)) + arb->wait_free_us = 50000; + + /* Find our parent */ + parent_np = of_parse_phandle(np, "i2c-parent", 0); + if (!parent_np) { + dev_err(dev, "Cannot parse i2c-parent\n"); + return -EINVAL; + } + arb->parent = of_find_i2c_adapter_by_node(parent_np); + if (!arb->parent) { + dev_err(dev, "Cannot find parent bus\n"); + return -EINVAL; + } + + /* Actually add the mux adapter */ + arb->child = i2c_add_mux_adapter(arb->parent, dev, arb, 0, 0, 0, + i2c_arbitrator_select, + i2c_arbitrator_deselect); + if (!arb->child) { + dev_err(dev, "Failed to add adapter\n"); + ret = -ENODEV; + i2c_put_adapter(arb->parent); + } + + return ret; +} + +static int i2c_arbitrator_remove(struct platform_device *pdev) +{ + struct i2c_arbitrator_data *arb = platform_get_drvdata(pdev); + + i2c_del_mux_adapter(arb->child); + i2c_put_adapter(arb->parent); + + return 0; +} + +static const struct of_device_id i2c_arbitrator_of_match[] = { + { .compatible = "i2c-arb-gpio-challenge", }, + {}, +}; +MODULE_DEVICE_TABLE(of, i2c_arbitrator_of_match); + +static struct platform_driver i2c_arbitrator_driver = { + .probe = i2c_arbitrator_probe, + .remove = i2c_arbitrator_remove, + .driver = { + .owner = THIS_MODULE, + .name = "i2c-arb-gpio-challenge", + .of_match_table = of_match_ptr(i2c_arbitrator_of_match), + }, +}; + +module_platform_driver(i2c_arbitrator_driver); + +MODULE_DESCRIPTION("GPIO-based I2C Arbitration"); +MODULE_AUTHOR("Doug Anderson <dianders@chromium.org>"); +MODULE_LICENSE("GPL v2"); +MODULE_ALIAS("platform:i2c-arb-gpio-challenge"); diff --git a/drivers/i2c/muxes/i2c-mux-gpio.c b/drivers/i2c/muxes/i2c-mux-gpio.c index abc2e55aa243..5a0ce0081dce 100644 --- a/drivers/i2c/muxes/i2c-mux-gpio.c +++ b/drivers/i2c/muxes/i2c-mux-gpio.c @@ -201,10 +201,21 @@ static int i2c_mux_gpio_probe(struct platform_device *pdev) for (i = 0; i < mux->data.n_gpios; i++) { ret = gpio_request(gpio_base + mux->data.gpios[i], "i2c-mux-gpio"); - if (ret) + if (ret) { + dev_err(&pdev->dev, "Failed to request GPIO %d\n", + mux->data.gpios[i]); goto err_request_gpio; - gpio_direction_output(gpio_base + mux->data.gpios[i], - initial_state & (1 << i)); + } + + ret = gpio_direction_output(gpio_base + mux->data.gpios[i], + initial_state & (1 << i)); + if (ret) { + dev_err(&pdev->dev, + "Failed to set direction of GPIO %d to output\n", + mux->data.gpios[i]); + i++; /* gpio_request above succeeded, so must free */ + goto err_request_gpio; + } } for (i = 0; i < mux->data.n_values; i++) { diff --git a/drivers/i2c/muxes/i2c-mux-pca954x.c b/drivers/i2c/muxes/i2c-mux-pca954x.c index 8e4387235b69..a531d801dbe4 100644 --- a/drivers/i2c/muxes/i2c-mux-pca954x.c +++ b/drivers/i2c/muxes/i2c-mux-pca954x.c @@ -262,13 +262,11 @@ static int pca954x_remove(struct i2c_client *client) { struct pca954x *data = i2c_get_clientdata(client); const struct chip_desc *chip = &chips[data->type]; - int i, err; + int i; for (i = 0; i < chip->nchans; ++i) if (data->virt_adaps[i]) { - err = i2c_del_mux_adapter(data->virt_adaps[i]); - if (err) - return err; + i2c_del_mux_adapter(data->virt_adaps[i]); data->virt_adaps[i] = NULL; } diff --git a/drivers/lguest/Kconfig b/drivers/lguest/Kconfig index 89875ea19ade..ee035ec4526b 100644 --- a/drivers/lguest/Kconfig +++ b/drivers/lguest/Kconfig @@ -5,10 +5,9 @@ config LGUEST ---help--- This is a very simple module which allows you to run multiple instances of the same Linux kernel, using the - "lguest" command found in the Documentation/virtual/lguest - directory. + "lguest" command found in the tools/lguest directory. Note that "lguest" is pronounced to rhyme with "fell quest", - not "rustyvisor". See Documentation/virtual/lguest/lguest.txt. + not "rustyvisor". See tools/lguest/lguest.txt. If unsure, say N. If curious, say M. If masochistic, say Y. diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c index a5ebc0083d87..0bf1e4edf04d 100644 --- a/drivers/lguest/core.c +++ b/drivers/lguest/core.c @@ -20,9 +20,9 @@ #include <asm/asm-offsets.h> #include "lg.h" - +unsigned long switcher_addr; +struct page **lg_switcher_pages; static struct vm_struct *switcher_vma; -static struct page **switcher_page; /* This One Big lock protects all inter-guest data structures. */ DEFINE_MUTEX(lguest_lock); @@ -52,13 +52,21 @@ static __init int map_switcher(void) * easy. */ + /* We assume Switcher text fits into a single page. */ + if (end_switcher_text - start_switcher_text > PAGE_SIZE) { + printk(KERN_ERR "lguest: switcher text too large (%zu)\n", + end_switcher_text - start_switcher_text); + return -EINVAL; + } + /* * We allocate an array of struct page pointers. map_vm_area() wants * this, rather than just an array of pages. */ - switcher_page = kmalloc(sizeof(switcher_page[0])*TOTAL_SWITCHER_PAGES, - GFP_KERNEL); - if (!switcher_page) { + lg_switcher_pages = kmalloc(sizeof(lg_switcher_pages[0]) + * TOTAL_SWITCHER_PAGES, + GFP_KERNEL); + if (!lg_switcher_pages) { err = -ENOMEM; goto out; } @@ -68,32 +76,29 @@ static __init int map_switcher(void) * so we make sure they're zeroed. */ for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) { - switcher_page[i] = alloc_page(GFP_KERNEL|__GFP_ZERO); - if (!switcher_page[i]) { + lg_switcher_pages[i] = alloc_page(GFP_KERNEL|__GFP_ZERO); + if (!lg_switcher_pages[i]) { err = -ENOMEM; goto free_some_pages; } } /* - * First we check that the Switcher won't overlap the fixmap area at - * the top of memory. It's currently nowhere near, but it could have - * very strange effects if it ever happened. + * We place the Switcher underneath the fixmap area, which is the + * highest virtual address we can get. This is important, since we + * tell the Guest it can't access this memory, so we want its ceiling + * as high as possible. */ - if (SWITCHER_ADDR + (TOTAL_SWITCHER_PAGES+1)*PAGE_SIZE > FIXADDR_START){ - err = -ENOMEM; - printk("lguest: mapping switcher would thwack fixmap\n"); - goto free_pages; - } + switcher_addr = FIXADDR_START - (TOTAL_SWITCHER_PAGES+1)*PAGE_SIZE; /* - * Now we reserve the "virtual memory area" we want: 0xFFC00000 - * (SWITCHER_ADDR). We might not get it in theory, but in practice - * it's worked so far. The end address needs +1 because __get_vm_area - * allocates an extra guard page, so we need space for that. + * Now we reserve the "virtual memory area" we want. We might + * not get it in theory, but in practice it's worked so far. + * The end address needs +1 because __get_vm_area allocates an + * extra guard page, so we need space for that. */ switcher_vma = __get_vm_area(TOTAL_SWITCHER_PAGES * PAGE_SIZE, - VM_ALLOC, SWITCHER_ADDR, SWITCHER_ADDR + VM_ALLOC, switcher_addr, switcher_addr + (TOTAL_SWITCHER_PAGES+1) * PAGE_SIZE); if (!switcher_vma) { err = -ENOMEM; @@ -103,12 +108,12 @@ static __init int map_switcher(void) /* * This code actually sets up the pages we've allocated to appear at - * SWITCHER_ADDR. map_vm_area() takes the vma we allocated above, the + * switcher_addr. map_vm_area() takes the vma we allocated above, the * kind of pages we're mapping (kernel pages), and a pointer to our * array of struct pages. It increments that pointer, but we don't * care. */ - pagep = switcher_page; + pagep = lg_switcher_pages; err = map_vm_area(switcher_vma, PAGE_KERNEL_EXEC, &pagep); if (err) { printk("lguest: map_vm_area failed: %i\n", err); @@ -133,8 +138,8 @@ free_pages: i = TOTAL_SWITCHER_PAGES; free_some_pages: for (--i; i >= 0; i--) - __free_pages(switcher_page[i], 0); - kfree(switcher_page); + __free_pages(lg_switcher_pages[i], 0); + kfree(lg_switcher_pages); out: return err; } @@ -149,8 +154,8 @@ static void unmap_switcher(void) vunmap(switcher_vma->addr); /* Now we just need to free the pages we copied the switcher into */ for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) - __free_pages(switcher_page[i], 0); - kfree(switcher_page); + __free_pages(lg_switcher_pages[i], 0); + kfree(lg_switcher_pages); } /*H:032 @@ -323,15 +328,10 @@ static int __init init(void) if (err) goto out; - /* Now we set up the pagetable implementation for the Guests. */ - err = init_pagetables(switcher_page, SHARED_SWITCHER_PAGES); - if (err) - goto unmap; - /* We might need to reserve an interrupt vector. */ err = init_interrupts(); if (err) - goto free_pgtables; + goto unmap; /* /dev/lguest needs to be registered. */ err = lguest_device_init(); @@ -346,8 +346,6 @@ static int __init init(void) free_interrupts: free_interrupts(); -free_pgtables: - free_pagetables(); unmap: unmap_switcher(); out: @@ -359,7 +357,6 @@ static void __exit fini(void) { lguest_device_remove(); free_interrupts(); - free_pagetables(); unmap_switcher(); lguest_arch_host_fini(); diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h index 295df06e6590..2eef40be4c04 100644 --- a/drivers/lguest/lg.h +++ b/drivers/lguest/lg.h @@ -14,11 +14,10 @@ #include <asm/lguest.h> -void free_pagetables(void); -int init_pagetables(struct page **switcher_page, unsigned int pages); - struct pgdir { unsigned long gpgdir; + bool switcher_mapped; + int last_host_cpu; pgd_t *pgdir; }; @@ -124,6 +123,7 @@ bool lguest_address_ok(const struct lguest *lg, unsigned long addr, unsigned long len); void __lgread(struct lg_cpu *, void *, unsigned long, unsigned); void __lgwrite(struct lg_cpu *, unsigned long, const void *, unsigned); +extern struct page **lg_switcher_pages; /*H:035 * Using memory-copy operations like that is usually inconvient, so we diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c index ff4a0bc9904d..4263f4cc8c55 100644 --- a/drivers/lguest/lguest_user.c +++ b/drivers/lguest/lguest_user.c @@ -250,13 +250,13 @@ static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o) */ static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip) { - /* We have a limited number the number of CPUs in the lguest struct. */ + /* We have a limited number of CPUs in the lguest struct. */ if (id >= ARRAY_SIZE(cpu->lg->cpus)) return -EINVAL; /* Set up this CPU's id, and pointer back to the lguest struct. */ cpu->id = id; - cpu->lg = container_of((cpu - id), struct lguest, cpus[0]); + cpu->lg = container_of(cpu, struct lguest, cpus[id]); cpu->lg->nr_cpus++; /* Each CPU has a timer it can set. */ @@ -270,7 +270,7 @@ static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip) if (!cpu->regs_page) return -ENOMEM; - /* We actually put the registers at the bottom of the page. */ + /* We actually put the registers at the end of the page. */ cpu->regs = (void *)cpu->regs_page + PAGE_SIZE - sizeof(*cpu->regs); /* diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c index 864baabaee25..699187ab3800 100644 --- a/drivers/lguest/page_tables.c +++ b/drivers/lguest/page_tables.c @@ -7,7 +7,7 @@ * converted Guest pages when running the Guest. :*/ -/* Copyright (C) Rusty Russell IBM Corporation 2006. +/* Copyright (C) Rusty Russell IBM Corporation 2013. * GPL v2 and any later version */ #include <linux/mm.h> #include <linux/gfp.h> @@ -62,22 +62,11 @@ * will need the last pmd entry of the last pmd page. */ #ifdef CONFIG_X86_PAE -#define SWITCHER_PMD_INDEX (PTRS_PER_PMD - 1) -#define RESERVE_MEM 2U #define CHECK_GPGD_MASK _PAGE_PRESENT #else -#define RESERVE_MEM 4U #define CHECK_GPGD_MASK _PAGE_TABLE #endif -/* - * We actually need a separate PTE page for each CPU. Remember that after the - * Switcher code itself comes two pages for each CPU, and we don't want this - * CPU's guest to see the pages of any other CPU. - */ -static DEFINE_PER_CPU(pte_t *, switcher_pte_pages); -#define switcher_pte_page(cpu) per_cpu(switcher_pte_pages, cpu) - /*H:320 * The page table code is curly enough to need helper functions to keep it * clear and clean. The kernel itself provides many of them; one advantage @@ -95,13 +84,6 @@ static pgd_t *spgd_addr(struct lg_cpu *cpu, u32 i, unsigned long vaddr) { unsigned int index = pgd_index(vaddr); -#ifndef CONFIG_X86_PAE - /* We kill any Guest trying to touch the Switcher addresses. */ - if (index >= SWITCHER_PGD_INDEX) { - kill_guest(cpu, "attempt to access switcher pages"); - index = 0; - } -#endif /* Return a pointer index'th pgd entry for the i'th page table. */ return &cpu->lg->pgdirs[i].pgdir[index]; } @@ -117,13 +99,6 @@ static pmd_t *spmd_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr) unsigned int index = pmd_index(vaddr); pmd_t *page; - /* We kill any Guest trying to touch the Switcher addresses. */ - if (pgd_index(vaddr) == SWITCHER_PGD_INDEX && - index >= SWITCHER_PMD_INDEX) { - kill_guest(cpu, "attempt to access switcher pages"); - index = 0; - } - /* You should never call this if the PGD entry wasn't valid */ BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT)); page = __va(pgd_pfn(spgd) << PAGE_SHIFT); @@ -275,122 +250,177 @@ static void release_pte(pte_t pte) } /*:*/ -static void check_gpte(struct lg_cpu *cpu, pte_t gpte) +static bool check_gpte(struct lg_cpu *cpu, pte_t gpte) { if ((pte_flags(gpte) & _PAGE_PSE) || - pte_pfn(gpte) >= cpu->lg->pfn_limit) + pte_pfn(gpte) >= cpu->lg->pfn_limit) { kill_guest(cpu, "bad page table entry"); + return false; + } + return true; } -static void check_gpgd(struct lg_cpu *cpu, pgd_t gpgd) +static bool check_gpgd(struct lg_cpu *cpu, pgd_t gpgd) { if ((pgd_flags(gpgd) & ~CHECK_GPGD_MASK) || - (pgd_pfn(gpgd) >= cpu->lg->pfn_limit)) + (pgd_pfn(gpgd) >= cpu->lg->pfn_limit)) { kill_guest(cpu, "bad page directory entry"); + return false; + } + return true; } #ifdef CONFIG_X86_PAE -static void check_gpmd(struct lg_cpu *cpu, pmd_t gpmd) +static bool check_gpmd(struct lg_cpu *cpu, pmd_t gpmd) { if ((pmd_flags(gpmd) & ~_PAGE_TABLE) || - (pmd_pfn(gpmd) >= cpu->lg->pfn_limit)) + (pmd_pfn(gpmd) >= cpu->lg->pfn_limit)) { kill_guest(cpu, "bad page middle directory entry"); + return false; + } + return true; } #endif -/*H:330 - * (i) Looking up a page table entry when the Guest faults. - * - * We saw this call in run_guest(): when we see a page fault in the Guest, we - * come here. That's because we only set up the shadow page tables lazily as - * they're needed, so we get page faults all the time and quietly fix them up - * and return to the Guest without it knowing. +/*H:331 + * This is the core routine to walk the shadow page tables and find the page + * table entry for a specific address. * - * If we fixed up the fault (ie. we mapped the address), this routine returns - * true. Otherwise, it was a real fault and we need to tell the Guest. + * If allocate is set, then we allocate any missing levels, setting the flags + * on the new page directory and mid-level directories using the arguments + * (which are copied from the Guest's page table entries). */ -bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) +static pte_t *find_spte(struct lg_cpu *cpu, unsigned long vaddr, bool allocate, + int pgd_flags, int pmd_flags) { - pgd_t gpgd; pgd_t *spgd; - unsigned long gpte_ptr; - pte_t gpte; - pte_t *spte; - /* Mid level for PAE. */ #ifdef CONFIG_X86_PAE pmd_t *spmd; - pmd_t gpmd; #endif - /* First step: get the top-level Guest page table entry. */ - if (unlikely(cpu->linear_pages)) { - /* Faking up a linear mapping. */ - gpgd = __pgd(CHECK_GPGD_MASK); - } else { - gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t); - /* Toplevel not present? We can't map it in. */ - if (!(pgd_flags(gpgd) & _PAGE_PRESENT)) - return false; - } - - /* Now look at the matching shadow entry. */ + /* Get top level entry. */ spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr); if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) { /* No shadow entry: allocate a new shadow PTE page. */ - unsigned long ptepage = get_zeroed_page(GFP_KERNEL); + unsigned long ptepage; + + /* If they didn't want us to allocate anything, stop. */ + if (!allocate) + return NULL; + + ptepage = get_zeroed_page(GFP_KERNEL); /* * This is not really the Guest's fault, but killing it is * simple for this corner case. */ if (!ptepage) { kill_guest(cpu, "out of memory allocating pte page"); - return false; + return NULL; } - /* We check that the Guest pgd is OK. */ - check_gpgd(cpu, gpgd); /* * And we copy the flags to the shadow PGD entry. The page * number in the shadow PGD is the page we just allocated. */ - set_pgd(spgd, __pgd(__pa(ptepage) | pgd_flags(gpgd))); + set_pgd(spgd, __pgd(__pa(ptepage) | pgd_flags)); } + /* + * Intel's Physical Address Extension actually uses three levels of + * page tables, so we need to look in the mid-level. + */ #ifdef CONFIG_X86_PAE - if (unlikely(cpu->linear_pages)) { - /* Faking up a linear mapping. */ - gpmd = __pmd(_PAGE_TABLE); - } else { - gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t); - /* Middle level not present? We can't map it in. */ - if (!(pmd_flags(gpmd) & _PAGE_PRESENT)) - return false; - } - - /* Now look at the matching shadow entry. */ + /* Now look at the mid-level shadow entry. */ spmd = spmd_addr(cpu, *spgd, vaddr); if (!(pmd_flags(*spmd) & _PAGE_PRESENT)) { /* No shadow entry: allocate a new shadow PTE page. */ - unsigned long ptepage = get_zeroed_page(GFP_KERNEL); + unsigned long ptepage; + + /* If they didn't want us to allocate anything, stop. */ + if (!allocate) + return NULL; + + ptepage = get_zeroed_page(GFP_KERNEL); /* * This is not really the Guest's fault, but killing it is * simple for this corner case. */ if (!ptepage) { - kill_guest(cpu, "out of memory allocating pte page"); - return false; + kill_guest(cpu, "out of memory allocating pmd page"); + return NULL; } - /* We check that the Guest pmd is OK. */ - check_gpmd(cpu, gpmd); - /* * And we copy the flags to the shadow PMD entry. The page * number in the shadow PMD is the page we just allocated. */ - set_pmd(spmd, __pmd(__pa(ptepage) | pmd_flags(gpmd))); + set_pmd(spmd, __pmd(__pa(ptepage) | pmd_flags)); + } +#endif + + /* Get the pointer to the shadow PTE entry we're going to set. */ + return spte_addr(cpu, *spgd, vaddr); +} + +/*H:330 + * (i) Looking up a page table entry when the Guest faults. + * + * We saw this call in run_guest(): when we see a page fault in the Guest, we + * come here. That's because we only set up the shadow page tables lazily as + * they're needed, so we get page faults all the time and quietly fix them up + * and return to the Guest without it knowing. + * + * If we fixed up the fault (ie. we mapped the address), this routine returns + * true. Otherwise, it was a real fault and we need to tell the Guest. + */ +bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) +{ + unsigned long gpte_ptr; + pte_t gpte; + pte_t *spte; + pmd_t gpmd; + pgd_t gpgd; + + /* We never demand page the Switcher, so trying is a mistake. */ + if (vaddr >= switcher_addr) + return false; + + /* First step: get the top-level Guest page table entry. */ + if (unlikely(cpu->linear_pages)) { + /* Faking up a linear mapping. */ + gpgd = __pgd(CHECK_GPGD_MASK); + } else { + gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t); + /* Toplevel not present? We can't map it in. */ + if (!(pgd_flags(gpgd) & _PAGE_PRESENT)) + return false; + + /* + * This kills the Guest if it has weird flags or tries to + * refer to a "physical" address outside the bounds. + */ + if (!check_gpgd(cpu, gpgd)) + return false; + } + + /* This "mid-level" entry is only used for non-linear, PAE mode. */ + gpmd = __pmd(_PAGE_TABLE); + +#ifdef CONFIG_X86_PAE + if (likely(!cpu->linear_pages)) { + gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t); + /* Middle level not present? We can't map it in. */ + if (!(pmd_flags(gpmd) & _PAGE_PRESENT)) + return false; + + /* + * This kills the Guest if it has weird flags or tries to + * refer to a "physical" address outside the bounds. + */ + if (!check_gpmd(cpu, gpmd)) + return false; } /* @@ -433,7 +463,8 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) * Check that the Guest PTE flags are OK, and the page number is below * the pfn_limit (ie. not mapping the Launcher binary). */ - check_gpte(cpu, gpte); + if (!check_gpte(cpu, gpte)) + return false; /* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */ gpte = pte_mkyoung(gpte); @@ -441,7 +472,9 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) gpte = pte_mkdirty(gpte); /* Get the pointer to the shadow PTE entry we're going to set. */ - spte = spte_addr(cpu, *spgd, vaddr); + spte = find_spte(cpu, vaddr, true, pgd_flags(gpgd), pmd_flags(gpmd)); + if (!spte) + return false; /* * If there was a valid shadow PTE entry here before, we release it. @@ -493,29 +526,23 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) */ static bool page_writable(struct lg_cpu *cpu, unsigned long vaddr) { - pgd_t *spgd; + pte_t *spte; unsigned long flags; -#ifdef CONFIG_X86_PAE - pmd_t *spmd; -#endif - /* Look at the current top level entry: is it present? */ - spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr); - if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) + /* You can't put your stack in the Switcher! */ + if (vaddr >= switcher_addr) return false; -#ifdef CONFIG_X86_PAE - spmd = spmd_addr(cpu, *spgd, vaddr); - if (!(pmd_flags(*spmd) & _PAGE_PRESENT)) + /* If there's no shadow PTE, it's not writable. */ + spte = find_spte(cpu, vaddr, false, 0, 0); + if (!spte) return false; -#endif /* * Check the flags on the pte entry itself: it must be present and * writable. */ - flags = pte_flags(*(spte_addr(cpu, *spgd, vaddr))); - + flags = pte_flags(*spte); return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW); } @@ -678,9 +705,6 @@ static unsigned int new_pgdir(struct lg_cpu *cpu, int *blank_pgdir) { unsigned int next; -#ifdef CONFIG_X86_PAE - pmd_t *pmd_table; -#endif /* * We pick one entry at random to throw out. Choosing the Least @@ -695,29 +719,11 @@ static unsigned int new_pgdir(struct lg_cpu *cpu, if (!cpu->lg->pgdirs[next].pgdir) next = cpu->cpu_pgd; else { -#ifdef CONFIG_X86_PAE /* - * In PAE mode, allocate a pmd page and populate the - * last pgd entry. + * This is a blank page, so there are no kernel + * mappings: caller must map the stack! */ - pmd_table = (pmd_t *)get_zeroed_page(GFP_KERNEL); - if (!pmd_table) { - free_page((long)cpu->lg->pgdirs[next].pgdir); - set_pgd(cpu->lg->pgdirs[next].pgdir, __pgd(0)); - next = cpu->cpu_pgd; - } else { - set_pgd(cpu->lg->pgdirs[next].pgdir + - SWITCHER_PGD_INDEX, - __pgd(__pa(pmd_table) | _PAGE_PRESENT)); - /* - * This is a blank page, so there are no kernel - * mappings: caller must map the stack! - */ - *blank_pgdir = 1; - } -#else *blank_pgdir = 1; -#endif } } /* Record which Guest toplevel this shadows. */ @@ -725,9 +731,50 @@ static unsigned int new_pgdir(struct lg_cpu *cpu, /* Release all the non-kernel mappings. */ flush_user_mappings(cpu->lg, next); + /* This hasn't run on any CPU at all. */ + cpu->lg->pgdirs[next].last_host_cpu = -1; + return next; } +/*H:501 + * We do need the Switcher code mapped at all times, so we allocate that + * part of the Guest page table here. We map the Switcher code immediately, + * but defer mapping of the guest register page and IDT/LDT etc page until + * just before we run the guest in map_switcher_in_guest(). + * + * We *could* do this setup in map_switcher_in_guest(), but at that point + * we've interrupts disabled, and allocating pages like that is fraught: we + * can't sleep if we need to free up some memory. + */ +static bool allocate_switcher_mapping(struct lg_cpu *cpu) +{ + int i; + + for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) { + pte_t *pte = find_spte(cpu, switcher_addr + i * PAGE_SIZE, true, + CHECK_GPGD_MASK, _PAGE_TABLE); + if (!pte) + return false; + + /* + * Map the switcher page if not already there. It might + * already be there because we call allocate_switcher_mapping() + * in guest_set_pgd() just in case it did discard our Switcher + * mapping, but it probably didn't. + */ + if (i == 0 && !(pte_flags(*pte) & _PAGE_PRESENT)) { + /* Get a reference to the Switcher page. */ + get_page(lg_switcher_pages[0]); + /* Create a read-only, exectuable, kernel-style PTE */ + set_pte(pte, + mk_pte(lg_switcher_pages[0], PAGE_KERNEL_RX)); + } + } + cpu->lg->pgdirs[cpu->cpu_pgd].switcher_mapped = true; + return true; +} + /*H:470 * Finally, a routine which throws away everything: all PGD entries in all * the shadow page tables, including the Guest's kernel mappings. This is used @@ -738,28 +785,16 @@ static void release_all_pagetables(struct lguest *lg) unsigned int i, j; /* Every shadow pagetable this Guest has */ - for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) - if (lg->pgdirs[i].pgdir) { -#ifdef CONFIG_X86_PAE - pgd_t *spgd; - pmd_t *pmdpage; - unsigned int k; - - /* Get the last pmd page. */ - spgd = lg->pgdirs[i].pgdir + SWITCHER_PGD_INDEX; - pmdpage = __va(pgd_pfn(*spgd) << PAGE_SHIFT); - - /* - * And release the pmd entries of that pmd page, - * except for the switcher pmd. - */ - for (k = 0; k < SWITCHER_PMD_INDEX; k++) - release_pmd(&pmdpage[k]); -#endif - /* Every PGD entry except the Switcher at the top */ - for (j = 0; j < SWITCHER_PGD_INDEX; j++) - release_pgd(lg->pgdirs[i].pgdir + j); - } + for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) { + if (!lg->pgdirs[i].pgdir) + continue; + + /* Every PGD entry. */ + for (j = 0; j < PTRS_PER_PGD; j++) + release_pgd(lg->pgdirs[i].pgdir + j); + lg->pgdirs[i].switcher_mapped = false; + lg->pgdirs[i].last_host_cpu = -1; + } } /* @@ -773,6 +808,9 @@ void guest_pagetable_clear_all(struct lg_cpu *cpu) release_all_pagetables(cpu->lg); /* We need the Guest kernel stack mapped again. */ pin_stack_pages(cpu); + /* And we need Switcher allocated. */ + if (!allocate_switcher_mapping(cpu)) + kill_guest(cpu, "Cannot populate switcher mapping"); } /*H:430 @@ -808,9 +846,17 @@ void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable) newpgdir = new_pgdir(cpu, pgtable, &repin); /* Change the current pgd index to the new one. */ cpu->cpu_pgd = newpgdir; - /* If it was completely blank, we map in the Guest kernel stack */ + /* + * If it was completely blank, we map in the Guest kernel stack and + * the Switcher. + */ if (repin) pin_stack_pages(cpu); + + if (!cpu->lg->pgdirs[cpu->cpu_pgd].switcher_mapped) { + if (!allocate_switcher_mapping(cpu)) + kill_guest(cpu, "Cannot populate switcher mapping"); + } } /*:*/ @@ -865,7 +911,8 @@ static void do_set_pte(struct lg_cpu *cpu, int idx, * micro-benchmark. */ if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) { - check_gpte(cpu, gpte); + if (!check_gpte(cpu, gpte)) + return; set_pte(spte, gpte_to_spte(cpu, gpte, pte_flags(gpte) & _PAGE_DIRTY)); @@ -897,6 +944,12 @@ static void do_set_pte(struct lg_cpu *cpu, int idx, void guest_set_pte(struct lg_cpu *cpu, unsigned long gpgdir, unsigned long vaddr, pte_t gpte) { + /* We don't let you remap the Switcher; we need it to get back! */ + if (vaddr >= switcher_addr) { + kill_guest(cpu, "attempt to set pte into Switcher pages"); + return; + } + /* * Kernel mappings must be changed on all top levels. Slow, but doesn't * happen often. @@ -933,14 +986,23 @@ void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 idx) { int pgdir; - if (idx >= SWITCHER_PGD_INDEX) + if (idx > PTRS_PER_PGD) { + kill_guest(&lg->cpus[0], "Attempt to set pgd %u/%u", + idx, PTRS_PER_PGD); return; + } /* If they're talking about a page table we have a shadow for... */ pgdir = find_pgdir(lg, gpgdir); - if (pgdir < ARRAY_SIZE(lg->pgdirs)) + if (pgdir < ARRAY_SIZE(lg->pgdirs)) { /* ... throw it away. */ release_pgd(lg->pgdirs[pgdir].pgdir + idx); + /* That might have been the Switcher mapping, remap it. */ + if (!allocate_switcher_mapping(&lg->cpus[0])) { + kill_guest(&lg->cpus[0], + "Cannot populate switcher mapping"); + } + } } #ifdef CONFIG_X86_PAE @@ -958,6 +1020,9 @@ void guest_set_pmd(struct lguest *lg, unsigned long pmdp, u32 idx) * we will populate on future faults. The Guest doesn't have any actual * pagetables yet, so we set linear_pages to tell demand_page() to fake it * for the moment. + * + * We do need the Switcher to be mapped at all times, so we allocate that + * part of the Guest page table here. */ int init_guest_pagetable(struct lguest *lg) { @@ -971,21 +1036,34 @@ int init_guest_pagetable(struct lguest *lg) /* We start with a linear mapping until the initialize. */ cpu->linear_pages = true; + + /* Allocate the page tables for the Switcher. */ + if (!allocate_switcher_mapping(cpu)) { + release_all_pagetables(lg); + return -ENOMEM; + } + return 0; } /*H:508 When the Guest calls LHCALL_LGUEST_INIT we do more setup. */ void page_table_guest_data_init(struct lg_cpu *cpu) { + /* + * We tell the Guest that it can't use the virtual addresses + * used by the Switcher. This trick is equivalent to 4GB - + * switcher_addr. + */ + u32 top = ~switcher_addr + 1; + /* We get the kernel address: above this is all kernel memory. */ if (get_user(cpu->lg->kernel_address, - &cpu->lg->lguest_data->kernel_address) + &cpu->lg->lguest_data->kernel_address) /* - * We tell the Guest that it can't use the top 2 or 4 MB - * of virtual addresses used by the Switcher. + * We tell the Guest that it can't use the top virtual + * addresses (used by the Switcher). */ - || put_user(RESERVE_MEM * 1024 * 1024, - &cpu->lg->lguest_data->reserve_mem)) { + || put_user(top, &cpu->lg->lguest_data->reserve_mem)) { kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data); return; } @@ -995,12 +1073,7 @@ void page_table_guest_data_init(struct lg_cpu *cpu) * "pgd_index(lg->kernel_address)". This assumes it won't hit the * Switcher mappings, so check that now. */ -#ifdef CONFIG_X86_PAE - if (pgd_index(cpu->lg->kernel_address) == SWITCHER_PGD_INDEX && - pmd_index(cpu->lg->kernel_address) == SWITCHER_PMD_INDEX) -#else - if (pgd_index(cpu->lg->kernel_address) >= SWITCHER_PGD_INDEX) -#endif + if (cpu->lg->kernel_address >= switcher_addr) kill_guest(cpu, "bad kernel address %#lx", cpu->lg->kernel_address); } @@ -1017,102 +1090,96 @@ void free_guest_pagetable(struct lguest *lg) free_page((long)lg->pgdirs[i].pgdir); } -/*H:480 - * (vi) Mapping the Switcher when the Guest is about to run. - * - * The Switcher and the two pages for this CPU need to be visible in the - * Guest (and not the pages for other CPUs). We have the appropriate PTE pages - * for each CPU already set up, we just need to hook them in now we know which - * Guest is about to run on this CPU. +/*H:481 + * This clears the Switcher mappings for cpu #i. */ -void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages) +static void remove_switcher_percpu_map(struct lg_cpu *cpu, unsigned int i) { - pte_t *switcher_pte_page = __this_cpu_read(switcher_pte_pages); - pte_t regs_pte; + unsigned long base = switcher_addr + PAGE_SIZE + i * PAGE_SIZE*2; + pte_t *pte; -#ifdef CONFIG_X86_PAE - pmd_t switcher_pmd; - pmd_t *pmd_table; - - switcher_pmd = pfn_pmd(__pa(switcher_pte_page) >> PAGE_SHIFT, - PAGE_KERNEL_EXEC); - - /* Figure out where the pmd page is, by reading the PGD, and converting - * it to a virtual address. */ - pmd_table = __va(pgd_pfn(cpu->lg-> - pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX]) - << PAGE_SHIFT); - /* Now write it into the shadow page table. */ - set_pmd(&pmd_table[SWITCHER_PMD_INDEX], switcher_pmd); -#else - pgd_t switcher_pgd; + /* Clear the mappings for both pages. */ + pte = find_spte(cpu, base, false, 0, 0); + release_pte(*pte); + set_pte(pte, __pte(0)); - /* - * Make the last PGD entry for this Guest point to the Switcher's PTE - * page for this CPU (with appropriate flags). - */ - switcher_pgd = __pgd(__pa(switcher_pte_page) | __PAGE_KERNEL_EXEC); - - cpu->lg->pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd; - -#endif - /* - * We also change the Switcher PTE page. When we're running the Guest, - * we want the Guest's "regs" page to appear where the first Switcher - * page for this CPU is. This is an optimization: when the Switcher - * saves the Guest registers, it saves them into the first page of this - * CPU's "struct lguest_pages": if we make sure the Guest's register - * page is already mapped there, we don't have to copy them out - * again. - */ - regs_pte = pfn_pte(__pa(cpu->regs_page) >> PAGE_SHIFT, PAGE_KERNEL); - set_pte(&switcher_pte_page[pte_index((unsigned long)pages)], regs_pte); + pte = find_spte(cpu, base + PAGE_SIZE, false, 0, 0); + release_pte(*pte); + set_pte(pte, __pte(0)); } -/*:*/ -static void free_switcher_pte_pages(void) -{ - unsigned int i; - - for_each_possible_cpu(i) - free_page((long)switcher_pte_page(i)); -} - -/*H:520 - * Setting up the Switcher PTE page for given CPU is fairly easy, given - * the CPU number and the "struct page"s for the Switcher code itself. +/*H:480 + * (vi) Mapping the Switcher when the Guest is about to run. + * + * The Switcher and the two pages for this CPU need to be visible in the Guest + * (and not the pages for other CPUs). * - * Currently the Switcher is less than a page long, so "pages" is always 1. + * The pages for the pagetables have all been allocated before: we just need + * to make sure the actual PTEs are up-to-date for the CPU we're about to run + * on. */ -static __init void populate_switcher_pte_page(unsigned int cpu, - struct page *switcher_page[], - unsigned int pages) +void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages) { - unsigned int i; - pte_t *pte = switcher_pte_page(cpu); + unsigned long base; + struct page *percpu_switcher_page, *regs_page; + pte_t *pte; + struct pgdir *pgdir = &cpu->lg->pgdirs[cpu->cpu_pgd]; + + /* Switcher page should always be mapped by now! */ + BUG_ON(!pgdir->switcher_mapped); + + /* + * Remember that we have two pages for each Host CPU, so we can run a + * Guest on each CPU without them interfering. We need to make sure + * those pages are mapped correctly in the Guest, but since we usually + * run on the same CPU, we cache that, and only update the mappings + * when we move. + */ + if (pgdir->last_host_cpu == raw_smp_processor_id()) + return; - /* The first entries are easy: they map the Switcher code. */ - for (i = 0; i < pages; i++) { - set_pte(&pte[i], mk_pte(switcher_page[i], - __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED))); + /* -1 means unknown so we remove everything. */ + if (pgdir->last_host_cpu == -1) { + unsigned int i; + for_each_possible_cpu(i) + remove_switcher_percpu_map(cpu, i); + } else { + /* We know exactly what CPU mapping to remove. */ + remove_switcher_percpu_map(cpu, pgdir->last_host_cpu); } - /* The only other thing we map is this CPU's pair of pages. */ - i = pages + cpu*2; - - /* First page (Guest registers) is writable from the Guest */ - set_pte(&pte[i], pfn_pte(page_to_pfn(switcher_page[i]), - __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW))); + /* + * When we're running the Guest, we want the Guest's "regs" page to + * appear where the first Switcher page for this CPU is. This is an + * optimization: when the Switcher saves the Guest registers, it saves + * them into the first page of this CPU's "struct lguest_pages": if we + * make sure the Guest's register page is already mapped there, we + * don't have to copy them out again. + */ + /* Find the shadow PTE for this regs page. */ + base = switcher_addr + PAGE_SIZE + + raw_smp_processor_id() * sizeof(struct lguest_pages); + pte = find_spte(cpu, base, false, 0, 0); + regs_page = pfn_to_page(__pa(cpu->regs_page) >> PAGE_SHIFT); + get_page(regs_page); + set_pte(pte, mk_pte(regs_page, __pgprot(__PAGE_KERNEL & ~_PAGE_GLOBAL))); /* - * The second page contains the "struct lguest_ro_state", and is - * read-only. + * We map the second page of the struct lguest_pages read-only in + * the Guest: the IDT, GDT and other things it's not supposed to + * change. */ - set_pte(&pte[i+1], pfn_pte(page_to_pfn(switcher_page[i+1]), - __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED))); + pte = find_spte(cpu, base + PAGE_SIZE, false, 0, 0); + percpu_switcher_page + = lg_switcher_pages[1 + raw_smp_processor_id()*2 + 1]; + get_page(percpu_switcher_page); + set_pte(pte, mk_pte(percpu_switcher_page, + __pgprot(__PAGE_KERNEL_RO & ~_PAGE_GLOBAL))); + + pgdir->last_host_cpu = raw_smp_processor_id(); } -/* +/*H:490 * We've made it through the page table code. Perhaps our tired brains are * still processing the details, or perhaps we're simply glad it's over. * @@ -1124,29 +1191,3 @@ static __init void populate_switcher_pte_page(unsigned int cpu, * * There is just one file remaining in the Host. */ - -/*H:510 - * At boot or module load time, init_pagetables() allocates and populates - * the Switcher PTE page for each CPU. - */ -__init int init_pagetables(struct page **switcher_page, unsigned int pages) -{ - unsigned int i; - - for_each_possible_cpu(i) { - switcher_pte_page(i) = (pte_t *)get_zeroed_page(GFP_KERNEL); - if (!switcher_pte_page(i)) { - free_switcher_pte_pages(); - return -ENOMEM; - } - populate_switcher_pte_page(i, switcher_page, pages); - } - return 0; -} -/*:*/ - -/* Cleaning up simply involves freeing the PTE page for each CPU. */ -void free_pagetables(void) -{ - free_switcher_pte_pages(); -} diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c index 4af12e1844d5..f0a3347b6441 100644 --- a/drivers/lguest/x86/core.c +++ b/drivers/lguest/x86/core.c @@ -59,14 +59,13 @@ static struct { /* Offset from where switcher.S was compiled to where we've copied it */ static unsigned long switcher_offset(void) { - return SWITCHER_ADDR - (unsigned long)start_switcher_text; + return switcher_addr - (unsigned long)start_switcher_text; } -/* This cpu's struct lguest_pages. */ +/* This cpu's struct lguest_pages (after the Switcher text page) */ static struct lguest_pages *lguest_pages(unsigned int cpu) { - return &(((struct lguest_pages *) - (SWITCHER_ADDR + SHARED_SWITCHER_PAGES*PAGE_SIZE))[cpu]); + return &(((struct lguest_pages *)(switcher_addr + PAGE_SIZE))[cpu]); } static DEFINE_PER_CPU(struct lg_cpu *, lg_last_cpu); diff --git a/drivers/media/pci/bt8xx/bttv-i2c.c b/drivers/media/pci/bt8xx/bttv-i2c.c index b7c52dc8999c..d43911deb617 100644 --- a/drivers/media/pci/bt8xx/bttv-i2c.c +++ b/drivers/media/pci/bt8xx/bttv-i2c.c @@ -397,8 +397,8 @@ int init_bttv_i2c(struct bttv *btv) int fini_bttv_i2c(struct bttv *btv) { - if (0 != btv->i2c_rc) - return 0; + if (btv->i2c_rc == 0) + i2c_del_adapter(&btv->c.i2c_adap); - return i2c_del_adapter(&btv->c.i2c_adap); + return 0; } diff --git a/drivers/media/pci/mantis/mantis_i2c.c b/drivers/media/pci/mantis/mantis_i2c.c index 937fb9d50213..895ddba3c0fb 100644 --- a/drivers/media/pci/mantis/mantis_i2c.c +++ b/drivers/media/pci/mantis/mantis_i2c.c @@ -261,6 +261,8 @@ int mantis_i2c_exit(struct mantis_pci *mantis) mmwrite((intmask & ~MANTIS_INT_I2CDONE), MANTIS_INT_MASK); dprintk(MANTIS_DEBUG, 1, "Removing I2C adapter"); - return i2c_del_adapter(&mantis->adapter); + i2c_del_adapter(&mantis->adapter); + + return 0; } EXPORT_SYMBOL_GPL(mantis_i2c_exit); diff --git a/drivers/net/caif/Kconfig b/drivers/net/caif/Kconfig index a966128c2a7a..7ffc756131a2 100644 --- a/drivers/net/caif/Kconfig +++ b/drivers/net/caif/Kconfig @@ -40,3 +40,17 @@ config CAIF_HSI The caif low level driver for CAIF over HSI. Be aware that if you enable this then you also need to enable a low-level HSI driver. + +config CAIF_VIRTIO + tristate "CAIF virtio transport driver" + depends on CAIF + select VHOST_RING + select VIRTIO + select GENERIC_ALLOCATOR + default n + ---help--- + The caif driver for CAIF over Virtio. + +if CAIF_VIRTIO +source "drivers/vhost/Kconfig" +endif diff --git a/drivers/net/caif/Makefile b/drivers/net/caif/Makefile index 15a9d2fc753d..9bbd45391f6c 100644 --- a/drivers/net/caif/Makefile +++ b/drivers/net/caif/Makefile @@ -9,3 +9,6 @@ obj-$(CONFIG_CAIF_SPI_SLAVE) += cfspi_slave.o # HSI interface obj-$(CONFIG_CAIF_HSI) += caif_hsi.o + +# Virtio interface +obj-$(CONFIG_CAIF_VIRTIO) += caif_virtio.o diff --git a/drivers/net/caif/caif_virtio.c b/drivers/net/caif/caif_virtio.c new file mode 100644 index 000000000000..b9ed1288ce2d --- /dev/null +++ b/drivers/net/caif/caif_virtio.c @@ -0,0 +1,790 @@ +/* + * Copyright (C) ST-Ericsson AB 2013 + * Authors: Vicram Arv + * Dmitry Tarnyagin <dmitry.tarnyagin@lockless.no> + * Sjur Brendeland + * License terms: GNU General Public License (GPL) version 2 + */ +#include <linux/module.h> +#include <linux/if_arp.h> +#include <linux/virtio.h> +#include <linux/vringh.h> +#include <linux/debugfs.h> +#include <linux/spinlock.h> +#include <linux/genalloc.h> +#include <linux/interrupt.h> +#include <linux/netdevice.h> +#include <linux/rtnetlink.h> +#include <linux/virtio_ids.h> +#include <linux/virtio_caif.h> +#include <linux/virtio_ring.h> +#include <linux/dma-mapping.h> +#include <net/caif/caif_dev.h> +#include <linux/virtio_config.h> + +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Vicram Arv"); +MODULE_AUTHOR("Sjur Brendeland"); +MODULE_DESCRIPTION("Virtio CAIF Driver"); + +/* NAPI schedule quota */ +#define CFV_DEFAULT_QUOTA 32 + +/* Defaults used if virtio config space is unavailable */ +#define CFV_DEF_MTU_SIZE 4096 +#define CFV_DEF_HEADROOM 32 +#define CFV_DEF_TAILROOM 32 + +/* Required IP header alignment */ +#define IP_HDR_ALIGN 4 + +/* struct cfv_napi_contxt - NAPI context info + * @riov: IOV holding data read from the ring. Note that riov may + * still hold data when cfv_rx_poll() returns. + * @head: Last descriptor ID we received from vringh_getdesc_kern. + * We use this to put descriptor back on the used ring. USHRT_MAX is + * used to indicate invalid head-id. + */ +struct cfv_napi_context { + struct vringh_kiov riov; + unsigned short head; +}; + +/* struct cfv_stats - statistics for debugfs + * @rx_napi_complete: Number of NAPI completions (RX) + * @rx_napi_resched: Number of calls where the full quota was used (RX) + * @rx_nomem: Number of SKB alloc failures (RX) + * @rx_kicks: Number of RX kicks + * @tx_full_ring: Number times TX ring was full + * @tx_no_mem: Number of times TX went out of memory + * @tx_flow_on: Number of flow on (TX) + * @tx_kicks: Number of TX kicks + */ +struct cfv_stats { + u32 rx_napi_complete; + u32 rx_napi_resched; + u32 rx_nomem; + u32 rx_kicks; + u32 tx_full_ring; + u32 tx_no_mem; + u32 tx_flow_on; + u32 tx_kicks; +}; + +/* struct cfv_info - Caif Virtio control structure + * @cfdev: caif common header + * @vdev: Associated virtio device + * @vr_rx: rx/downlink host vring + * @vq_tx: tx/uplink virtqueue + * @ndev: CAIF link layer device + * @watermark_tx: indicates number of free descriptors we need + * to reopen the tx-queues after overload. + * @tx_lock: protects vq_tx from concurrent use + * @tx_release_tasklet: Tasklet for freeing consumed TX buffers + * @napi: Napi context used in cfv_rx_poll() + * @ctx: Context data used in cfv_rx_poll() + * @tx_hr: transmit headroom + * @rx_hr: receive headroom + * @tx_tr: transmit tail room + * @rx_tr: receive tail room + * @mtu: transmit max size + * @mru: receive max size + * @allocsz: size of dma memory reserved for TX buffers + * @alloc_addr: virtual address to dma memory for TX buffers + * @alloc_dma: dma address to dma memory for TX buffers + * @genpool: Gen Pool used for allocating TX buffers + * @reserved_mem: Pointer to memory reserve allocated from genpool + * @reserved_size: Size of memory reserve allocated from genpool + * @stats: Statistics exposed in sysfs + * @debugfs: Debugfs dentry for statistic counters + */ +struct cfv_info { + struct caif_dev_common cfdev; + struct virtio_device *vdev; + struct vringh *vr_rx; + struct virtqueue *vq_tx; + struct net_device *ndev; + unsigned int watermark_tx; + /* Protect access to vq_tx */ + spinlock_t tx_lock; + struct tasklet_struct tx_release_tasklet; + struct napi_struct napi; + struct cfv_napi_context ctx; + u16 tx_hr; + u16 rx_hr; + u16 tx_tr; + u16 rx_tr; + u32 mtu; + u32 mru; + size_t allocsz; + void *alloc_addr; + dma_addr_t alloc_dma; + struct gen_pool *genpool; + unsigned long reserved_mem; + size_t reserved_size; + struct cfv_stats stats; + struct dentry *debugfs; +}; + +/* struct buf_info - maintains transmit buffer data handle + * @size: size of transmit buffer + * @dma_handle: handle to allocated dma device memory area + * @vaddr: virtual address mapping to allocated memory area + */ +struct buf_info { + size_t size; + u8 *vaddr; +}; + +/* Called from virtio device, in IRQ context */ +static void cfv_release_cb(struct virtqueue *vq_tx) +{ + struct cfv_info *cfv = vq_tx->vdev->priv; + + ++cfv->stats.tx_kicks; + tasklet_schedule(&cfv->tx_release_tasklet); +} + +static void free_buf_info(struct cfv_info *cfv, struct buf_info *buf_info) +{ + if (!buf_info) + return; + gen_pool_free(cfv->genpool, (unsigned long) buf_info->vaddr, + buf_info->size); + kfree(buf_info); +} + +/* This is invoked whenever the remote processor completed processing + * a TX msg we just sent, and the buffer is put back to the used ring. + */ +static void cfv_release_used_buf(struct virtqueue *vq_tx) +{ + struct cfv_info *cfv = vq_tx->vdev->priv; + unsigned long flags; + + BUG_ON(vq_tx != cfv->vq_tx); + + for (;;) { + unsigned int len; + struct buf_info *buf_info; + + /* Get used buffer from used ring to recycle used descriptors */ + spin_lock_irqsave(&cfv->tx_lock, flags); + buf_info = virtqueue_get_buf(vq_tx, &len); + spin_unlock_irqrestore(&cfv->tx_lock, flags); + + /* Stop looping if there are no more buffers to free */ + if (!buf_info) + break; + + free_buf_info(cfv, buf_info); + + /* watermark_tx indicates if we previously stopped the tx + * queues. If we have enough free stots in the virtio ring, + * re-establish memory reserved and open up tx queues. + */ + if (cfv->vq_tx->num_free <= cfv->watermark_tx) + continue; + + /* Re-establish memory reserve */ + if (cfv->reserved_mem == 0 && cfv->genpool) + cfv->reserved_mem = + gen_pool_alloc(cfv->genpool, + cfv->reserved_size); + + /* Open up the tx queues */ + if (cfv->reserved_mem) { + cfv->watermark_tx = + virtqueue_get_vring_size(cfv->vq_tx); + netif_tx_wake_all_queues(cfv->ndev); + /* Buffers are recycled in cfv_netdev_tx, so + * disable notifications when queues are opened. + */ + virtqueue_disable_cb(cfv->vq_tx); + ++cfv->stats.tx_flow_on; + } else { + /* if no memory reserve, wait for more free slots */ + WARN_ON(cfv->watermark_tx > + virtqueue_get_vring_size(cfv->vq_tx)); + cfv->watermark_tx += + virtqueue_get_vring_size(cfv->vq_tx) / 4; + } + } +} + +/* Allocate a SKB and copy packet data to it */ +static struct sk_buff *cfv_alloc_and_copy_skb(int *err, + struct cfv_info *cfv, + u8 *frm, u32 frm_len) +{ + struct sk_buff *skb; + u32 cfpkt_len, pad_len; + + *err = 0; + /* Verify that packet size with down-link header and mtu size */ + if (frm_len > cfv->mru || frm_len <= cfv->rx_hr + cfv->rx_tr) { + netdev_err(cfv->ndev, + "Invalid frmlen:%u mtu:%u hr:%d tr:%d\n", + frm_len, cfv->mru, cfv->rx_hr, + cfv->rx_tr); + *err = -EPROTO; + return NULL; + } + + cfpkt_len = frm_len - (cfv->rx_hr + cfv->rx_tr); + pad_len = (unsigned long)(frm + cfv->rx_hr) & (IP_HDR_ALIGN - 1); + + skb = netdev_alloc_skb(cfv->ndev, frm_len + pad_len); + if (!skb) { + *err = -ENOMEM; + return NULL; + } + + skb_reserve(skb, cfv->rx_hr + pad_len); + + memcpy(skb_put(skb, cfpkt_len), frm + cfv->rx_hr, cfpkt_len); + return skb; +} + +/* Get packets from the host vring */ +static int cfv_rx_poll(struct napi_struct *napi, int quota) +{ + struct cfv_info *cfv = container_of(napi, struct cfv_info, napi); + int rxcnt = 0; + int err = 0; + void *buf; + struct sk_buff *skb; + struct vringh_kiov *riov = &cfv->ctx.riov; + unsigned int skb_len; + +again: + do { + skb = NULL; + + /* Put the previous iovec back on the used ring and + * fetch a new iovec if we have processed all elements. + */ + if (riov->i == riov->used) { + if (cfv->ctx.head != USHRT_MAX) { + vringh_complete_kern(cfv->vr_rx, + cfv->ctx.head, + 0); + cfv->ctx.head = USHRT_MAX; + } + + err = vringh_getdesc_kern( + cfv->vr_rx, + riov, + NULL, + &cfv->ctx.head, + GFP_ATOMIC); + + if (err <= 0) + goto exit; + } + + buf = phys_to_virt((unsigned long) riov->iov[riov->i].iov_base); + /* TODO: Add check on valid buffer address */ + + skb = cfv_alloc_and_copy_skb(&err, cfv, buf, + riov->iov[riov->i].iov_len); + if (unlikely(err)) + goto exit; + + /* Push received packet up the stack. */ + skb_len = skb->len; + skb->protocol = htons(ETH_P_CAIF); + skb_reset_mac_header(skb); + skb->dev = cfv->ndev; + err = netif_receive_skb(skb); + if (unlikely(err)) { + ++cfv->ndev->stats.rx_dropped; + } else { + ++cfv->ndev->stats.rx_packets; + cfv->ndev->stats.rx_bytes += skb_len; + } + + ++riov->i; + ++rxcnt; + } while (rxcnt < quota); + + ++cfv->stats.rx_napi_resched; + goto out; + +exit: + switch (err) { + case 0: + ++cfv->stats.rx_napi_complete; + + /* Really out of patckets? (stolen from virtio_net)*/ + napi_complete(napi); + if (unlikely(!vringh_notify_enable_kern(cfv->vr_rx)) && + napi_schedule_prep(napi)) { + vringh_notify_disable_kern(cfv->vr_rx); + __napi_schedule(napi); + goto again; + } + break; + + case -ENOMEM: + ++cfv->stats.rx_nomem; + dev_kfree_skb(skb); + /* Stop NAPI poll on OOM, we hope to be polled later */ + napi_complete(napi); + vringh_notify_enable_kern(cfv->vr_rx); + break; + + default: + /* We're doomed, any modem fault is fatal */ + netdev_warn(cfv->ndev, "Bad ring, disable device\n"); + cfv->ndev->stats.rx_dropped = riov->used - riov->i; + napi_complete(napi); + vringh_notify_disable_kern(cfv->vr_rx); + netif_carrier_off(cfv->ndev); + break; + } +out: + if (rxcnt && vringh_need_notify_kern(cfv->vr_rx) > 0) + vringh_notify(cfv->vr_rx); + return rxcnt; +} + +static void cfv_recv(struct virtio_device *vdev, struct vringh *vr_rx) +{ + struct cfv_info *cfv = vdev->priv; + + ++cfv->stats.rx_kicks; + vringh_notify_disable_kern(cfv->vr_rx); + napi_schedule(&cfv->napi); +} + +static void cfv_destroy_genpool(struct cfv_info *cfv) +{ + if (cfv->alloc_addr) + dma_free_coherent(cfv->vdev->dev.parent->parent, + cfv->allocsz, cfv->alloc_addr, + cfv->alloc_dma); + + if (!cfv->genpool) + return; + gen_pool_free(cfv->genpool, cfv->reserved_mem, + cfv->reserved_size); + gen_pool_destroy(cfv->genpool); + cfv->genpool = NULL; +} + +static int cfv_create_genpool(struct cfv_info *cfv) +{ + int err; + + /* dma_alloc can only allocate whole pages, and we need a more + * fine graned allocation so we use genpool. We ask for space needed + * by IP and a full ring. If the dma allcoation fails we retry with a + * smaller allocation size. + */ + err = -ENOMEM; + cfv->allocsz = (virtqueue_get_vring_size(cfv->vq_tx) * + (ETH_DATA_LEN + cfv->tx_hr + cfv->tx_tr) * 11)/10; + if (cfv->allocsz <= (num_possible_cpus() + 1) * cfv->ndev->mtu) + return -EINVAL; + + for (;;) { + if (cfv->allocsz <= num_possible_cpus() * cfv->ndev->mtu) { + netdev_info(cfv->ndev, "Not enough device memory\n"); + return -ENOMEM; + } + + cfv->alloc_addr = dma_alloc_coherent( + cfv->vdev->dev.parent->parent, + cfv->allocsz, &cfv->alloc_dma, + GFP_ATOMIC); + if (cfv->alloc_addr) + break; + + cfv->allocsz = (cfv->allocsz * 3) >> 2; + } + + netdev_dbg(cfv->ndev, "Allocated %zd bytes from dma-memory\n", + cfv->allocsz); + + /* Allocate on 128 bytes boundaries (1 << 7)*/ + cfv->genpool = gen_pool_create(7, -1); + if (!cfv->genpool) + goto err; + + err = gen_pool_add_virt(cfv->genpool, (unsigned long)cfv->alloc_addr, + (phys_addr_t)virt_to_phys(cfv->alloc_addr), + cfv->allocsz, -1); + if (err) + goto err; + + /* Reserve some memory for low memory situations. If we hit the roof + * in the memory pool, we stop TX flow and release the reserve. + */ + cfv->reserved_size = num_possible_cpus() * cfv->ndev->mtu; + cfv->reserved_mem = gen_pool_alloc(cfv->genpool, + cfv->reserved_size); + if (!cfv->reserved_mem) { + err = -ENOMEM; + goto err; + } + + cfv->watermark_tx = virtqueue_get_vring_size(cfv->vq_tx); + return 0; +err: + cfv_destroy_genpool(cfv); + return err; +} + +/* Enable the CAIF interface and allocate the memory-pool */ +static int cfv_netdev_open(struct net_device *netdev) +{ + struct cfv_info *cfv = netdev_priv(netdev); + + if (cfv_create_genpool(cfv)) + return -ENOMEM; + + netif_carrier_on(netdev); + napi_enable(&cfv->napi); + + /* Schedule NAPI to read any pending packets */ + napi_schedule(&cfv->napi); + return 0; +} + +/* Disable the CAIF interface and free the memory-pool */ +static int cfv_netdev_close(struct net_device *netdev) +{ + struct cfv_info *cfv = netdev_priv(netdev); + unsigned long flags; + struct buf_info *buf_info; + + /* Disable interrupts, queues and NAPI polling */ + netif_carrier_off(netdev); + virtqueue_disable_cb(cfv->vq_tx); + vringh_notify_disable_kern(cfv->vr_rx); + napi_disable(&cfv->napi); + + /* Release any TX buffers on both used and avilable rings */ + cfv_release_used_buf(cfv->vq_tx); + spin_lock_irqsave(&cfv->tx_lock, flags); + while ((buf_info = virtqueue_detach_unused_buf(cfv->vq_tx))) + free_buf_info(cfv, buf_info); + spin_unlock_irqrestore(&cfv->tx_lock, flags); + + /* Release all dma allocated memory and destroy the pool */ + cfv_destroy_genpool(cfv); + return 0; +} + +/* Allocate a buffer in dma-memory and copy skb to it */ +static struct buf_info *cfv_alloc_and_copy_to_shm(struct cfv_info *cfv, + struct sk_buff *skb, + struct scatterlist *sg) +{ + struct caif_payload_info *info = (void *)&skb->cb; + struct buf_info *buf_info = NULL; + u8 pad_len, hdr_ofs; + + if (!cfv->genpool) + goto err; + + if (unlikely(cfv->tx_hr + skb->len + cfv->tx_tr > cfv->mtu)) { + netdev_warn(cfv->ndev, "Invalid packet len (%d > %d)\n", + cfv->tx_hr + skb->len + cfv->tx_tr, cfv->mtu); + goto err; + } + + buf_info = kmalloc(sizeof(struct buf_info), GFP_ATOMIC); + if (unlikely(!buf_info)) + goto err; + + /* Make the IP header aligned in tbe buffer */ + hdr_ofs = cfv->tx_hr + info->hdr_len; + pad_len = hdr_ofs & (IP_HDR_ALIGN - 1); + buf_info->size = cfv->tx_hr + skb->len + cfv->tx_tr + pad_len; + + /* allocate dma memory buffer */ + buf_info->vaddr = (void *)gen_pool_alloc(cfv->genpool, buf_info->size); + if (unlikely(!buf_info->vaddr)) + goto err; + + /* copy skbuf contents to send buffer */ + skb_copy_bits(skb, 0, buf_info->vaddr + cfv->tx_hr + pad_len, skb->len); + sg_init_one(sg, buf_info->vaddr + pad_len, + skb->len + cfv->tx_hr + cfv->rx_hr); + + return buf_info; +err: + kfree(buf_info); + return NULL; +} + +/* Put the CAIF packet on the virtio ring and kick the receiver */ +static int cfv_netdev_tx(struct sk_buff *skb, struct net_device *netdev) +{ + struct cfv_info *cfv = netdev_priv(netdev); + struct buf_info *buf_info; + struct scatterlist sg; + unsigned long flags; + bool flow_off = false; + int ret; + + /* garbage collect released buffers */ + cfv_release_used_buf(cfv->vq_tx); + spin_lock_irqsave(&cfv->tx_lock, flags); + + /* Flow-off check takes into account number of cpus to make sure + * virtqueue will not be overfilled in any possible smp conditions. + * + * Flow-on is triggered when sufficient buffers are freed + */ + if (unlikely(cfv->vq_tx->num_free <= num_present_cpus())) { + flow_off = true; + cfv->stats.tx_full_ring++; + } + + /* If we run out of memory, we release the memory reserve and retry + * allocation. + */ + buf_info = cfv_alloc_and_copy_to_shm(cfv, skb, &sg); + if (unlikely(!buf_info)) { + cfv->stats.tx_no_mem++; + flow_off = true; + + if (cfv->reserved_mem && cfv->genpool) { + gen_pool_free(cfv->genpool, cfv->reserved_mem, + cfv->reserved_size); + cfv->reserved_mem = 0; + buf_info = cfv_alloc_and_copy_to_shm(cfv, skb, &sg); + } + } + + if (unlikely(flow_off)) { + /* Turn flow on when a 1/4 of the descriptors are released */ + cfv->watermark_tx = virtqueue_get_vring_size(cfv->vq_tx) / 4; + /* Enable notifications of recycled TX buffers */ + virtqueue_enable_cb(cfv->vq_tx); + netif_tx_stop_all_queues(netdev); + } + + if (unlikely(!buf_info)) { + /* If the memory reserve does it's job, this shouldn't happen */ + netdev_warn(cfv->ndev, "Out of gen_pool memory\n"); + goto err; + } + + ret = virtqueue_add_outbuf(cfv->vq_tx, &sg, 1, buf_info, GFP_ATOMIC); + if (unlikely((ret < 0))) { + /* If flow control works, this shouldn't happen */ + netdev_warn(cfv->ndev, "Failed adding buffer to TX vring:%d\n", + ret); + goto err; + } + + /* update netdev statistics */ + cfv->ndev->stats.tx_packets++; + cfv->ndev->stats.tx_bytes += skb->len; + spin_unlock_irqrestore(&cfv->tx_lock, flags); + + /* tell the remote processor it has a pending message to read */ + virtqueue_kick(cfv->vq_tx); + + dev_kfree_skb(skb); + return NETDEV_TX_OK; +err: + spin_unlock_irqrestore(&cfv->tx_lock, flags); + cfv->ndev->stats.tx_dropped++; + free_buf_info(cfv, buf_info); + dev_kfree_skb(skb); + return NETDEV_TX_OK; +} + +static void cfv_tx_release_tasklet(unsigned long drv) +{ + struct cfv_info *cfv = (struct cfv_info *)drv; + cfv_release_used_buf(cfv->vq_tx); +} + +static const struct net_device_ops cfv_netdev_ops = { + .ndo_open = cfv_netdev_open, + .ndo_stop = cfv_netdev_close, + .ndo_start_xmit = cfv_netdev_tx, +}; + +static void cfv_netdev_setup(struct net_device *netdev) +{ + netdev->netdev_ops = &cfv_netdev_ops; + netdev->type = ARPHRD_CAIF; + netdev->tx_queue_len = 100; + netdev->flags = IFF_POINTOPOINT | IFF_NOARP; + netdev->mtu = CFV_DEF_MTU_SIZE; + netdev->destructor = free_netdev; +} + +/* Create debugfs counters for the device */ +static inline void debugfs_init(struct cfv_info *cfv) +{ + cfv->debugfs = + debugfs_create_dir(netdev_name(cfv->ndev), NULL); + + if (IS_ERR(cfv->debugfs)) + return; + + debugfs_create_u32("rx-napi-complete", S_IRUSR, cfv->debugfs, + &cfv->stats.rx_napi_complete); + debugfs_create_u32("rx-napi-resched", S_IRUSR, cfv->debugfs, + &cfv->stats.rx_napi_resched); + debugfs_create_u32("rx-nomem", S_IRUSR, cfv->debugfs, + &cfv->stats.rx_nomem); + debugfs_create_u32("rx-kicks", S_IRUSR, cfv->debugfs, + &cfv->stats.rx_kicks); + debugfs_create_u32("tx-full-ring", S_IRUSR, cfv->debugfs, + &cfv->stats.tx_full_ring); + debugfs_create_u32("tx-no-mem", S_IRUSR, cfv->debugfs, + &cfv->stats.tx_no_mem); + debugfs_create_u32("tx-kicks", S_IRUSR, cfv->debugfs, + &cfv->stats.tx_kicks); + debugfs_create_u32("tx-flow-on", S_IRUSR, cfv->debugfs, + &cfv->stats.tx_flow_on); +} + +/* Setup CAIF for the a virtio device */ +static int cfv_probe(struct virtio_device *vdev) +{ + vq_callback_t *vq_cbs = cfv_release_cb; + vrh_callback_t *vrh_cbs = cfv_recv; + const char *names = "output"; + const char *cfv_netdev_name = "cfvrt"; + struct net_device *netdev; + struct cfv_info *cfv; + int err = -EINVAL; + + netdev = alloc_netdev(sizeof(struct cfv_info), cfv_netdev_name, + cfv_netdev_setup); + if (!netdev) + return -ENOMEM; + + cfv = netdev_priv(netdev); + cfv->vdev = vdev; + cfv->ndev = netdev; + + spin_lock_init(&cfv->tx_lock); + + /* Get the RX virtio ring. This is a "host side vring". */ + err = -ENODEV; + if (!vdev->vringh_config || !vdev->vringh_config->find_vrhs) + goto err; + + err = vdev->vringh_config->find_vrhs(vdev, 1, &cfv->vr_rx, &vrh_cbs); + if (err) + goto err; + + /* Get the TX virtio ring. This is a "guest side vring". */ + err = vdev->config->find_vqs(vdev, 1, &cfv->vq_tx, &vq_cbs, &names); + if (err) + goto err; + + /* Get the CAIF configuration from virtio config space, if available */ +#define GET_VIRTIO_CONFIG_OPS(_v, _var, _f) \ + ((_v)->config->get(_v, offsetof(struct virtio_caif_transf_config, _f), \ + &_var, \ + FIELD_SIZEOF(struct virtio_caif_transf_config, _f))) + + if (vdev->config->get) { + GET_VIRTIO_CONFIG_OPS(vdev, cfv->tx_hr, headroom); + GET_VIRTIO_CONFIG_OPS(vdev, cfv->rx_hr, headroom); + GET_VIRTIO_CONFIG_OPS(vdev, cfv->tx_tr, tailroom); + GET_VIRTIO_CONFIG_OPS(vdev, cfv->rx_tr, tailroom); + GET_VIRTIO_CONFIG_OPS(vdev, cfv->mtu, mtu); + GET_VIRTIO_CONFIG_OPS(vdev, cfv->mru, mtu); + } else { + cfv->tx_hr = CFV_DEF_HEADROOM; + cfv->rx_hr = CFV_DEF_HEADROOM; + cfv->tx_tr = CFV_DEF_TAILROOM; + cfv->rx_tr = CFV_DEF_TAILROOM; + cfv->mtu = CFV_DEF_MTU_SIZE; + cfv->mru = CFV_DEF_MTU_SIZE; + } + + netdev->needed_headroom = cfv->tx_hr; + netdev->needed_tailroom = cfv->tx_tr; + + /* Disable buffer release interrupts unless we have stopped TX queues */ + virtqueue_disable_cb(cfv->vq_tx); + + netdev->mtu = cfv->mtu - cfv->tx_tr; + vdev->priv = cfv; + + /* Initialize NAPI poll context data */ + vringh_kiov_init(&cfv->ctx.riov, NULL, 0); + cfv->ctx.head = USHRT_MAX; + netif_napi_add(netdev, &cfv->napi, cfv_rx_poll, CFV_DEFAULT_QUOTA); + + tasklet_init(&cfv->tx_release_tasklet, + cfv_tx_release_tasklet, + (unsigned long)cfv); + + /* Carrier is off until netdevice is opened */ + netif_carrier_off(netdev); + + /* register Netdev */ + err = register_netdev(netdev); + if (err) { + dev_err(&vdev->dev, "Unable to register netdev (%d)\n", err); + goto err; + } + + debugfs_init(cfv); + + return 0; +err: + netdev_warn(cfv->ndev, "CAIF Virtio probe failed:%d\n", err); + + if (cfv->vr_rx) + vdev->vringh_config->del_vrhs(cfv->vdev); + if (cfv->vdev) + vdev->config->del_vqs(cfv->vdev); + free_netdev(netdev); + return err; +} + +static void cfv_remove(struct virtio_device *vdev) +{ + struct cfv_info *cfv = vdev->priv; + + rtnl_lock(); + dev_close(cfv->ndev); + rtnl_unlock(); + + tasklet_kill(&cfv->tx_release_tasklet); + debugfs_remove_recursive(cfv->debugfs); + + vringh_kiov_cleanup(&cfv->ctx.riov); + vdev->config->reset(vdev); + vdev->vringh_config->del_vrhs(cfv->vdev); + cfv->vr_rx = NULL; + vdev->config->del_vqs(cfv->vdev); + unregister_netdev(cfv->ndev); +} + +static struct virtio_device_id id_table[] = { + { VIRTIO_ID_CAIF, VIRTIO_DEV_ANY_ID }, + { 0 }, +}; + +static unsigned int features[] = { +}; + +static struct virtio_driver caif_virtio_driver = { + .feature_table = features, + .feature_table_size = ARRAY_SIZE(features), + .driver.name = KBUILD_MODNAME, + .driver.owner = THIS_MODULE, + .id_table = id_table, + .probe = cfv_probe, + .remove = cfv_remove, +}; + +module_virtio_driver(caif_virtio_driver); +MODULE_DEVICE_TABLE(virtio, id_table); diff --git a/drivers/net/ethernet/sfc/falcon.c b/drivers/net/ethernet/sfc/falcon.c index 4486102fa9b3..71998e7995d9 100644 --- a/drivers/net/ethernet/sfc/falcon.c +++ b/drivers/net/ethernet/sfc/falcon.c @@ -1528,7 +1528,7 @@ static int falcon_probe_nic(struct efx_nic *efx) return 0; fail6: - BUG_ON(i2c_del_adapter(&board->i2c_adap)); + i2c_del_adapter(&board->i2c_adap); memset(&board->i2c_adap, 0, sizeof(board->i2c_adap)); fail5: efx_nic_free_buffer(efx, &efx->irq_status); @@ -1666,13 +1666,11 @@ static void falcon_remove_nic(struct efx_nic *efx) { struct falcon_nic_data *nic_data = efx->nic_data; struct falcon_board *board = falcon_board(efx); - int rc; board->type->fini(efx); /* Remove I2C adapter and clear it in preparation for a retry */ - rc = i2c_del_adapter(&board->i2c_adap); - BUG_ON(rc); + i2c_del_adapter(&board->i2c_adap); memset(&board->i2c_adap, 0, sizeof(board->i2c_adap)); efx_nic_free_buffer(efx, &efx->irq_status); diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 50077753a0e5..3c23fdc27bf0 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -39,7 +39,6 @@ module_param(gso, bool, 0444); #define MAX_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN) #define GOOD_COPY_LEN 128 -#define VIRTNET_SEND_COMMAND_SG_MAX 2 #define VIRTNET_DRIVER_VERSION "1.0.0" struct virtnet_stats { @@ -444,7 +443,7 @@ static int add_recvbuf_small(struct receive_queue *rq, gfp_t gfp) skb_to_sgvec(skb, rq->sg + 1, 0, skb->len); - err = virtqueue_add_buf(rq->vq, rq->sg, 0, 2, skb, gfp); + err = virtqueue_add_inbuf(rq->vq, rq->sg, 2, skb, gfp); if (err < 0) dev_kfree_skb(skb); @@ -489,8 +488,8 @@ static int add_recvbuf_big(struct receive_queue *rq, gfp_t gfp) /* chain first in list head */ first->private = (unsigned long)list; - err = virtqueue_add_buf(rq->vq, rq->sg, 0, MAX_SKB_FRAGS + 2, - first, gfp); + err = virtqueue_add_inbuf(rq->vq, rq->sg, MAX_SKB_FRAGS + 2, + first, gfp); if (err < 0) give_pages(rq, first); @@ -508,7 +507,7 @@ static int add_recvbuf_mergeable(struct receive_queue *rq, gfp_t gfp) sg_init_one(rq->sg, page_address(page), PAGE_SIZE); - err = virtqueue_add_buf(rq->vq, rq->sg, 0, 1, page, gfp); + err = virtqueue_add_inbuf(rq->vq, rq->sg, 1, page, gfp); if (err < 0) give_pages(rq, page); @@ -582,7 +581,7 @@ static void refill_work(struct work_struct *work) bool still_empty; int i; - for (i = 0; i < vi->max_queue_pairs; i++) { + for (i = 0; i < vi->curr_queue_pairs; i++) { struct receive_queue *rq = &vi->rq[i]; napi_disable(&rq->napi); @@ -637,7 +636,7 @@ static int virtnet_open(struct net_device *dev) struct virtnet_info *vi = netdev_priv(dev); int i; - for (i = 0; i < vi->max_queue_pairs; i++) { + for (i = 0; i < vi->curr_queue_pairs; i++) { /* Make sure we have some buffers: if oom use wq. */ if (!try_fill_recv(&vi->rq[i], GFP_KERNEL)) schedule_delayed_work(&vi->refill, 0); @@ -711,8 +710,7 @@ static int xmit_skb(struct send_queue *sq, struct sk_buff *skb) sg_set_buf(sq->sg, &hdr->hdr, sizeof hdr->hdr); num_sg = skb_to_sgvec(skb, sq->sg + 1, 0, skb->len) + 1; - return virtqueue_add_buf(sq->vq, sq->sg, num_sg, - 0, skb, GFP_ATOMIC); + return virtqueue_add_outbuf(sq->vq, sq->sg, num_sg, skb, GFP_ATOMIC); } static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev) @@ -767,32 +765,35 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev) * never fail unless improperly formated. */ static bool virtnet_send_command(struct virtnet_info *vi, u8 class, u8 cmd, - struct scatterlist *data, int out, int in) + struct scatterlist *out, + struct scatterlist *in) { - struct scatterlist *s, sg[VIRTNET_SEND_COMMAND_SG_MAX + 2]; + struct scatterlist *sgs[4], hdr, stat; struct virtio_net_ctrl_hdr ctrl; virtio_net_ctrl_ack status = ~0; - unsigned int tmp; - int i; + unsigned out_num = 0, in_num = 0, tmp; /* Caller should know better */ - BUG_ON(!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ) || - (out + in > VIRTNET_SEND_COMMAND_SG_MAX)); - - out++; /* Add header */ - in++; /* Add return status */ + BUG_ON(!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ)); ctrl.class = class; ctrl.cmd = cmd; + /* Add header */ + sg_init_one(&hdr, &ctrl, sizeof(ctrl)); + sgs[out_num++] = &hdr; - sg_init_table(sg, out + in); + if (out) + sgs[out_num++] = out; + if (in) + sgs[out_num + in_num++] = in; - sg_set_buf(&sg[0], &ctrl, sizeof(ctrl)); - for_each_sg(data, s, out + in - 2, i) - sg_set_buf(&sg[i + 1], sg_virt(s), s->length); - sg_set_buf(&sg[out + in - 1], &status, sizeof(status)); + /* Add return status. */ + sg_init_one(&stat, &status, sizeof(status)); + sgs[out_num + in_num++] = &stat; - BUG_ON(virtqueue_add_buf(vi->cvq, sg, out, in, vi, GFP_ATOMIC) < 0); + BUG_ON(out_num + in_num > ARRAY_SIZE(sgs)); + BUG_ON(virtqueue_add_sgs(vi->cvq, sgs, out_num, in_num, vi, GFP_ATOMIC) + < 0); virtqueue_kick(vi->cvq); @@ -821,7 +822,7 @@ static int virtnet_set_mac_address(struct net_device *dev, void *p) sg_init_one(&sg, addr->sa_data, dev->addr_len); if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC, VIRTIO_NET_CTRL_MAC_ADDR_SET, - &sg, 1, 0)) { + &sg, NULL)) { dev_warn(&vdev->dev, "Failed to set mac address by vq command.\n"); return -EINVAL; @@ -889,8 +890,7 @@ static void virtnet_ack_link_announce(struct virtnet_info *vi) { rtnl_lock(); if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_ANNOUNCE, - VIRTIO_NET_CTRL_ANNOUNCE_ACK, NULL, - 0, 0)) + VIRTIO_NET_CTRL_ANNOUNCE_ACK, NULL, NULL)) dev_warn(&vi->dev->dev, "Failed to ack link announce.\n"); rtnl_unlock(); } @@ -900,6 +900,7 @@ static int virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs) struct scatterlist sg; struct virtio_net_ctrl_mq s; struct net_device *dev = vi->dev; + int i; if (!vi->has_cvq || !virtio_has_feature(vi->vdev, VIRTIO_NET_F_MQ)) return 0; @@ -908,12 +909,16 @@ static int virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs) sg_init_one(&sg, &s, sizeof(s)); if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MQ, - VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET, &sg, 1, 0)){ + VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET, &sg, NULL)) { dev_warn(&dev->dev, "Fail to set num of queue pairs to %d\n", queue_pairs); return -EINVAL; - } else + } else { + for (i = vi->curr_queue_pairs; i < queue_pairs; i++) + if (!try_fill_recv(&vi->rq[i], GFP_KERNEL)) + schedule_delayed_work(&vi->refill, 0); vi->curr_queue_pairs = queue_pairs; + } return 0; } @@ -955,7 +960,7 @@ static void virtnet_set_rx_mode(struct net_device *dev) if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX, VIRTIO_NET_CTRL_RX_PROMISC, - sg, 1, 0)) + sg, NULL)) dev_warn(&dev->dev, "Failed to %sable promisc mode.\n", promisc ? "en" : "dis"); @@ -963,7 +968,7 @@ static void virtnet_set_rx_mode(struct net_device *dev) if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX, VIRTIO_NET_CTRL_RX_ALLMULTI, - sg, 1, 0)) + sg, NULL)) dev_warn(&dev->dev, "Failed to %sable allmulti mode.\n", allmulti ? "en" : "dis"); @@ -1000,7 +1005,7 @@ static void virtnet_set_rx_mode(struct net_device *dev) if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC, VIRTIO_NET_CTRL_MAC_TABLE_SET, - sg, 2, 0)) + sg, NULL)) dev_warn(&dev->dev, "Failed to set MAC fitler table.\n"); kfree(buf); @@ -1015,7 +1020,7 @@ static int virtnet_vlan_rx_add_vid(struct net_device *dev, sg_init_one(&sg, &vid, sizeof(vid)); if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN, - VIRTIO_NET_CTRL_VLAN_ADD, &sg, 1, 0)) + VIRTIO_NET_CTRL_VLAN_ADD, &sg, NULL)) dev_warn(&dev->dev, "Failed to add VLAN ID %d.\n", vid); return 0; } @@ -1029,7 +1034,7 @@ static int virtnet_vlan_rx_kill_vid(struct net_device *dev, sg_init_one(&sg, &vid, sizeof(vid)); if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN, - VIRTIO_NET_CTRL_VLAN_DEL, &sg, 1, 0)) + VIRTIO_NET_CTRL_VLAN_DEL, &sg, NULL)) dev_warn(&dev->dev, "Failed to kill VLAN ID %d.\n", vid); return 0; } @@ -1570,7 +1575,7 @@ static int virtnet_probe(struct virtio_device *vdev) } /* Last of all, set up some receive buffers. */ - for (i = 0; i < vi->max_queue_pairs; i++) { + for (i = 0; i < vi->curr_queue_pairs; i++) { try_fill_recv(&vi->rq[i], GFP_KERNEL); /* If we didn't even get one input buffer, we're useless. */ @@ -1694,7 +1699,7 @@ static int virtnet_restore(struct virtio_device *vdev) netif_device_attach(vi->dev); - for (i = 0; i < vi->max_queue_pairs; i++) + for (i = 0; i < vi->curr_queue_pairs; i++) if (!try_fill_recv(&vi->rq[i], GFP_KERNEL)) schedule_delayed_work(&vi->refill, 0); diff --git a/drivers/pinctrl/core.c b/drivers/pinctrl/core.c index c3d222ed39a2..5327f35d9b5c 100644 --- a/drivers/pinctrl/core.c +++ b/drivers/pinctrl/core.c @@ -129,7 +129,7 @@ struct pinctrl_dev *get_pinctrl_dev_from_of_node(struct device_node *np) return pctldev; } - mutex_lock(&pinctrldev_list_mutex); + mutex_unlock(&pinctrldev_list_mutex); return NULL; } diff --git a/drivers/pinctrl/spear/pinctrl-plgpio.c b/drivers/pinctrl/spear/pinctrl-plgpio.c index a4908ecd74fb..3e5a887216b8 100644 --- a/drivers/pinctrl/spear/pinctrl-plgpio.c +++ b/drivers/pinctrl/spear/pinctrl-plgpio.c @@ -75,7 +75,7 @@ struct plgpio { int (*o2p)(int offset); /* offset_to_pin */ u32 p2o_regs; struct plgpio_regs regs; -#ifdef CONFIG_PM +#ifdef CONFIG_PM_SLEEP struct plgpio_regs *csave_regs; #endif }; @@ -554,7 +554,7 @@ static int plgpio_probe(struct platform_device *pdev) if (IS_ERR(plgpio->clk)) dev_warn(&pdev->dev, "clk_get() failed, work without it\n"); -#ifdef CONFIG_PM +#ifdef CONFIG_PM_SLEEP plgpio->csave_regs = devm_kzalloc(&pdev->dev, sizeof(*plgpio->csave_regs) * DIV_ROUND_UP(plgpio->chip.ngpio, MAX_GPIO_PER_REG), @@ -641,7 +641,7 @@ unprepare_clk: return ret; } -#ifdef CONFIG_PM +#ifdef CONFIG_PM_SLEEP static int plgpio_suspend(struct device *dev) { struct plgpio *plgpio = dev_get_drvdata(dev); diff --git a/drivers/rpmsg/virtio_rpmsg_bus.c b/drivers/rpmsg/virtio_rpmsg_bus.c index 7861f1119b7d..56fceafec9ec 100644 --- a/drivers/rpmsg/virtio_rpmsg_bus.c +++ b/drivers/rpmsg/virtio_rpmsg_bus.c @@ -757,14 +757,14 @@ int rpmsg_send_offchannel_raw(struct rpmsg_channel *rpdev, u32 src, u32 dst, mutex_lock(&vrp->tx_lock); /* add message to the remote processor's virtqueue */ - err = virtqueue_add_buf(vrp->svq, &sg, 1, 0, msg, GFP_KERNEL); + err = virtqueue_add_outbuf(vrp->svq, &sg, 1, msg, GFP_KERNEL); if (err) { /* * need to reclaim the buffer here, otherwise it's lost * (memory won't leak, but rpmsg won't use it again for TX). * this will wait for a buffer management overhaul. */ - dev_err(dev, "virtqueue_add_buf failed: %d\n", err); + dev_err(dev, "virtqueue_add_outbuf failed: %d\n", err); goto out; } @@ -839,7 +839,7 @@ static void rpmsg_recv_done(struct virtqueue *rvq) sg_init_one(&sg, msg, RPMSG_BUF_SIZE); /* add the buffer back to the remote processor's virtqueue */ - err = virtqueue_add_buf(vrp->rvq, &sg, 0, 1, msg, GFP_KERNEL); + err = virtqueue_add_inbuf(vrp->rvq, &sg, 1, msg, GFP_KERNEL); if (err < 0) { dev_err(dev, "failed to add a virtqueue buffer: %d\n", err); return; @@ -972,7 +972,7 @@ static int rpmsg_probe(struct virtio_device *vdev) sg_init_one(&sg, cpu_addr, RPMSG_BUF_SIZE); - err = virtqueue_add_buf(vrp->rvq, &sg, 0, 1, cpu_addr, + err = virtqueue_add_inbuf(vrp->rvq, &sg, 1, cpu_addr, GFP_KERNEL); WARN_ON(err); /* sanity check; this can't really happen */ } diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c index 3449a1f8c656..2168258fb2c3 100644 --- a/drivers/scsi/virtio_scsi.c +++ b/drivers/scsi/virtio_scsi.c @@ -13,6 +13,8 @@ * */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/module.h> #include <linux/slab.h> #include <linux/mempool.h> @@ -20,12 +22,14 @@ #include <linux/virtio_ids.h> #include <linux/virtio_config.h> #include <linux/virtio_scsi.h> +#include <linux/cpu.h> #include <scsi/scsi_host.h> #include <scsi/scsi_device.h> #include <scsi/scsi_cmnd.h> #define VIRTIO_SCSI_MEMPOOL_SZ 64 #define VIRTIO_SCSI_EVENT_LEN 8 +#define VIRTIO_SCSI_VQ_BASE 2 /* Command queue element */ struct virtio_scsi_cmd { @@ -57,27 +61,61 @@ struct virtio_scsi_vq { struct virtqueue *vq; }; -/* Per-target queue state */ +/* + * Per-target queue state. + * + * This struct holds the data needed by the queue steering policy. When a + * target is sent multiple requests, we need to drive them to the same queue so + * that FIFO processing order is kept. However, if a target was idle, we can + * choose a queue arbitrarily. In this case the queue is chosen according to + * the current VCPU, so the driver expects the number of request queues to be + * equal to the number of VCPUs. This makes it easy and fast to select the + * queue, and also lets the driver optimize the IRQ affinity for the virtqueues + * (each virtqueue's affinity is set to the CPU that "owns" the queue). + * + * An interesting effect of this policy is that only writes to req_vq need to + * take the tgt_lock. Read can be done outside the lock because: + * + * - writes of req_vq only occur when atomic_inc_return(&tgt->reqs) returns 1. + * In that case, no other CPU is reading req_vq: even if they were in + * virtscsi_queuecommand_multi, they would be spinning on tgt_lock. + * + * - reads of req_vq only occur when the target is not idle (reqs != 0). + * A CPU that enters virtscsi_queuecommand_multi will not modify req_vq. + * + * Similarly, decrements of reqs are never concurrent with writes of req_vq. + * Thus they can happen outside the tgt_lock, provided of course we make reqs + * an atomic_t. + */ struct virtio_scsi_target_state { - /* Protects sg. Lock hierarchy is tgt_lock -> vq_lock. */ + /* This spinlock never held at the same time as vq_lock. */ spinlock_t tgt_lock; - /* For sglist construction when adding commands to the virtqueue. */ - struct scatterlist sg[]; + /* Count of outstanding requests. */ + atomic_t reqs; + + /* Currently active virtqueue for requests sent to this target. */ + struct virtio_scsi_vq *req_vq; }; /* Driver instance state */ struct virtio_scsi { struct virtio_device *vdev; - struct virtio_scsi_vq ctrl_vq; - struct virtio_scsi_vq event_vq; - struct virtio_scsi_vq req_vq; - /* Get some buffers ready for event vq */ struct virtio_scsi_event_node event_list[VIRTIO_SCSI_EVENT_LEN]; - struct virtio_scsi_target_state *tgt[]; + u32 num_queues; + + /* If the affinity hint is set for virtqueues */ + bool affinity_hint_set; + + /* CPU hotplug notifier */ + struct notifier_block nb; + + struct virtio_scsi_vq ctrl_vq; + struct virtio_scsi_vq event_vq; + struct virtio_scsi_vq req_vqs[]; }; static struct kmem_cache *virtscsi_cmd_cache; @@ -107,11 +145,13 @@ static void virtscsi_compute_resid(struct scsi_cmnd *sc, u32 resid) * * Called with vq_lock held. */ -static void virtscsi_complete_cmd(void *buf) +static void virtscsi_complete_cmd(struct virtio_scsi *vscsi, void *buf) { struct virtio_scsi_cmd *cmd = buf; struct scsi_cmnd *sc = cmd->sc; struct virtio_scsi_cmd_resp *resp = &cmd->resp.cmd; + struct virtio_scsi_target_state *tgt = + scsi_target(sc->device)->hostdata; dev_dbg(&sc->device->sdev_gendev, "cmd %p response %u status %#02x sense_len %u\n", @@ -166,32 +206,71 @@ static void virtscsi_complete_cmd(void *buf) mempool_free(cmd, virtscsi_cmd_pool); sc->scsi_done(sc); + + atomic_dec(&tgt->reqs); } -static void virtscsi_vq_done(struct virtqueue *vq, void (*fn)(void *buf)) +static void virtscsi_vq_done(struct virtio_scsi *vscsi, + struct virtio_scsi_vq *virtscsi_vq, + void (*fn)(struct virtio_scsi *vscsi, void *buf)) { void *buf; unsigned int len; + unsigned long flags; + struct virtqueue *vq = virtscsi_vq->vq; + spin_lock_irqsave(&virtscsi_vq->vq_lock, flags); do { virtqueue_disable_cb(vq); while ((buf = virtqueue_get_buf(vq, &len)) != NULL) - fn(buf); + fn(vscsi, buf); } while (!virtqueue_enable_cb(vq)); + spin_unlock_irqrestore(&virtscsi_vq->vq_lock, flags); } static void virtscsi_req_done(struct virtqueue *vq) { struct Scsi_Host *sh = virtio_scsi_host(vq->vdev); struct virtio_scsi *vscsi = shost_priv(sh); - unsigned long flags; + int index = vq->index - VIRTIO_SCSI_VQ_BASE; + struct virtio_scsi_vq *req_vq = &vscsi->req_vqs[index]; - spin_lock_irqsave(&vscsi->req_vq.vq_lock, flags); - virtscsi_vq_done(vq, virtscsi_complete_cmd); - spin_unlock_irqrestore(&vscsi->req_vq.vq_lock, flags); + /* + * Read req_vq before decrementing the reqs field in + * virtscsi_complete_cmd. + * + * With barriers: + * + * CPU #0 virtscsi_queuecommand_multi (CPU #1) + * ------------------------------------------------------------ + * lock vq_lock + * read req_vq + * read reqs (reqs = 1) + * write reqs (reqs = 0) + * increment reqs (reqs = 1) + * write req_vq + * + * Possible reordering without barriers: + * + * CPU #0 virtscsi_queuecommand_multi (CPU #1) + * ------------------------------------------------------------ + * lock vq_lock + * read reqs (reqs = 1) + * write reqs (reqs = 0) + * increment reqs (reqs = 1) + * write req_vq + * read (wrong) req_vq + * + * We do not need a full smp_rmb, because req_vq is required to get + * to tgt->reqs: tgt is &vscsi->tgt[sc->device->id], where sc is stored + * in the virtqueue as the user token. + */ + smp_read_barrier_depends(); + + virtscsi_vq_done(vscsi, req_vq, virtscsi_complete_cmd); }; -static void virtscsi_complete_free(void *buf) +static void virtscsi_complete_free(struct virtio_scsi *vscsi, void *buf) { struct virtio_scsi_cmd *cmd = buf; @@ -205,11 +284,8 @@ static void virtscsi_ctrl_done(struct virtqueue *vq) { struct Scsi_Host *sh = virtio_scsi_host(vq->vdev); struct virtio_scsi *vscsi = shost_priv(sh); - unsigned long flags; - spin_lock_irqsave(&vscsi->ctrl_vq.vq_lock, flags); - virtscsi_vq_done(vq, virtscsi_complete_free); - spin_unlock_irqrestore(&vscsi->ctrl_vq.vq_lock, flags); + virtscsi_vq_done(vscsi, &vscsi->ctrl_vq, virtscsi_complete_free); }; static int virtscsi_kick_event(struct virtio_scsi *vscsi, @@ -223,8 +299,8 @@ static int virtscsi_kick_event(struct virtio_scsi *vscsi, spin_lock_irqsave(&vscsi->event_vq.vq_lock, flags); - err = virtqueue_add_buf(vscsi->event_vq.vq, &sg, 0, 1, event_node, - GFP_ATOMIC); + err = virtqueue_add_inbuf(vscsi->event_vq.vq, &sg, 1, event_node, + GFP_ATOMIC); if (!err) virtqueue_kick(vscsi->event_vq.vq); @@ -254,7 +330,7 @@ static void virtscsi_cancel_event_work(struct virtio_scsi *vscsi) } static void virtscsi_handle_transport_reset(struct virtio_scsi *vscsi, - struct virtio_scsi_event *event) + struct virtio_scsi_event *event) { struct scsi_device *sdev; struct Scsi_Host *shost = virtio_scsi_host(vscsi->vdev); @@ -332,7 +408,7 @@ static void virtscsi_handle_event(struct work_struct *work) virtscsi_kick_event(vscsi, event_node); } -static void virtscsi_complete_event(void *buf) +static void virtscsi_complete_event(struct virtio_scsi *vscsi, void *buf) { struct virtio_scsi_event_node *event_node = buf; @@ -344,82 +420,65 @@ static void virtscsi_event_done(struct virtqueue *vq) { struct Scsi_Host *sh = virtio_scsi_host(vq->vdev); struct virtio_scsi *vscsi = shost_priv(sh); - unsigned long flags; - spin_lock_irqsave(&vscsi->event_vq.vq_lock, flags); - virtscsi_vq_done(vq, virtscsi_complete_event); - spin_unlock_irqrestore(&vscsi->event_vq.vq_lock, flags); + virtscsi_vq_done(vscsi, &vscsi->event_vq, virtscsi_complete_event); }; -static void virtscsi_map_sgl(struct scatterlist *sg, unsigned int *p_idx, - struct scsi_data_buffer *sdb) -{ - struct sg_table *table = &sdb->table; - struct scatterlist *sg_elem; - unsigned int idx = *p_idx; - int i; - - for_each_sg(table->sgl, sg_elem, table->nents, i) - sg[idx++] = *sg_elem; - - *p_idx = idx; -} - /** - * virtscsi_map_cmd - map a scsi_cmd to a virtqueue scatterlist - * @vscsi : virtio_scsi state + * virtscsi_add_cmd - add a virtio_scsi_cmd to a virtqueue + * @vq : the struct virtqueue we're talking about * @cmd : command structure - * @out_num : number of read-only elements - * @in_num : number of write-only elements * @req_size : size of the request buffer * @resp_size : size of the response buffer - * - * Called with tgt_lock held. + * @gfp : flags to use for memory allocations */ -static void virtscsi_map_cmd(struct virtio_scsi_target_state *tgt, - struct virtio_scsi_cmd *cmd, - unsigned *out_num, unsigned *in_num, - size_t req_size, size_t resp_size) +static int virtscsi_add_cmd(struct virtqueue *vq, + struct virtio_scsi_cmd *cmd, + size_t req_size, size_t resp_size, gfp_t gfp) { struct scsi_cmnd *sc = cmd->sc; - struct scatterlist *sg = tgt->sg; - unsigned int idx = 0; + struct scatterlist *sgs[4], req, resp; + struct sg_table *out, *in; + unsigned out_num = 0, in_num = 0; + + out = in = NULL; + + if (sc && sc->sc_data_direction != DMA_NONE) { + if (sc->sc_data_direction != DMA_FROM_DEVICE) + out = &scsi_out(sc)->table; + if (sc->sc_data_direction != DMA_TO_DEVICE) + in = &scsi_in(sc)->table; + } /* Request header. */ - sg_set_buf(&sg[idx++], &cmd->req, req_size); + sg_init_one(&req, &cmd->req, req_size); + sgs[out_num++] = &req; /* Data-out buffer. */ - if (sc && sc->sc_data_direction != DMA_FROM_DEVICE) - virtscsi_map_sgl(sg, &idx, scsi_out(sc)); - - *out_num = idx; + if (out) + sgs[out_num++] = out->sgl; /* Response header. */ - sg_set_buf(&sg[idx++], &cmd->resp, resp_size); + sg_init_one(&resp, &cmd->resp, resp_size); + sgs[out_num + in_num++] = &resp; /* Data-in buffer */ - if (sc && sc->sc_data_direction != DMA_TO_DEVICE) - virtscsi_map_sgl(sg, &idx, scsi_in(sc)); + if (in) + sgs[out_num + in_num++] = in->sgl; - *in_num = idx - *out_num; + return virtqueue_add_sgs(vq, sgs, out_num, in_num, cmd, gfp); } -static int virtscsi_kick_cmd(struct virtio_scsi_target_state *tgt, - struct virtio_scsi_vq *vq, +static int virtscsi_kick_cmd(struct virtio_scsi_vq *vq, struct virtio_scsi_cmd *cmd, size_t req_size, size_t resp_size, gfp_t gfp) { - unsigned int out_num, in_num; unsigned long flags; int err; bool needs_kick = false; - spin_lock_irqsave(&tgt->tgt_lock, flags); - virtscsi_map_cmd(tgt, cmd, &out_num, &in_num, req_size, resp_size); - - spin_lock(&vq->vq_lock); - err = virtqueue_add_buf(vq->vq, tgt->sg, out_num, in_num, cmd, gfp); - spin_unlock(&tgt->tgt_lock); + spin_lock_irqsave(&vq->vq_lock, flags); + err = virtscsi_add_cmd(vq->vq, cmd, req_size, resp_size, gfp); if (!err) needs_kick = virtqueue_kick_prepare(vq->vq); @@ -430,10 +489,10 @@ static int virtscsi_kick_cmd(struct virtio_scsi_target_state *tgt, return err; } -static int virtscsi_queuecommand(struct Scsi_Host *sh, struct scsi_cmnd *sc) +static int virtscsi_queuecommand(struct virtio_scsi *vscsi, + struct virtio_scsi_vq *req_vq, + struct scsi_cmnd *sc) { - struct virtio_scsi *vscsi = shost_priv(sh); - struct virtio_scsi_target_state *tgt = vscsi->tgt[sc->device->id]; struct virtio_scsi_cmd *cmd; int ret; @@ -467,7 +526,7 @@ static int virtscsi_queuecommand(struct Scsi_Host *sh, struct scsi_cmnd *sc) BUG_ON(sc->cmd_len > VIRTIO_SCSI_CDB_SIZE); memcpy(cmd->req.cmd.cdb, sc->cmnd, sc->cmd_len); - if (virtscsi_kick_cmd(tgt, &vscsi->req_vq, cmd, + if (virtscsi_kick_cmd(req_vq, cmd, sizeof cmd->req.cmd, sizeof cmd->resp.cmd, GFP_ATOMIC) == 0) ret = 0; @@ -478,14 +537,62 @@ out: return ret; } +static int virtscsi_queuecommand_single(struct Scsi_Host *sh, + struct scsi_cmnd *sc) +{ + struct virtio_scsi *vscsi = shost_priv(sh); + struct virtio_scsi_target_state *tgt = + scsi_target(sc->device)->hostdata; + + atomic_inc(&tgt->reqs); + return virtscsi_queuecommand(vscsi, &vscsi->req_vqs[0], sc); +} + +static struct virtio_scsi_vq *virtscsi_pick_vq(struct virtio_scsi *vscsi, + struct virtio_scsi_target_state *tgt) +{ + struct virtio_scsi_vq *vq; + unsigned long flags; + u32 queue_num; + + spin_lock_irqsave(&tgt->tgt_lock, flags); + + /* + * The memory barrier after atomic_inc_return matches + * the smp_read_barrier_depends() in virtscsi_req_done. + */ + if (atomic_inc_return(&tgt->reqs) > 1) + vq = ACCESS_ONCE(tgt->req_vq); + else { + queue_num = smp_processor_id(); + while (unlikely(queue_num >= vscsi->num_queues)) + queue_num -= vscsi->num_queues; + + tgt->req_vq = vq = &vscsi->req_vqs[queue_num]; + } + + spin_unlock_irqrestore(&tgt->tgt_lock, flags); + return vq; +} + +static int virtscsi_queuecommand_multi(struct Scsi_Host *sh, + struct scsi_cmnd *sc) +{ + struct virtio_scsi *vscsi = shost_priv(sh); + struct virtio_scsi_target_state *tgt = + scsi_target(sc->device)->hostdata; + struct virtio_scsi_vq *req_vq = virtscsi_pick_vq(vscsi, tgt); + + return virtscsi_queuecommand(vscsi, req_vq, sc); +} + static int virtscsi_tmf(struct virtio_scsi *vscsi, struct virtio_scsi_cmd *cmd) { DECLARE_COMPLETION_ONSTACK(comp); - struct virtio_scsi_target_state *tgt = vscsi->tgt[cmd->sc->device->id]; int ret = FAILED; cmd->comp = ∁ - if (virtscsi_kick_cmd(tgt, &vscsi->ctrl_vq, cmd, + if (virtscsi_kick_cmd(&vscsi->ctrl_vq, cmd, sizeof cmd->req.tmf, sizeof cmd->resp.tmf, GFP_NOIO) < 0) goto out; @@ -547,18 +654,57 @@ static int virtscsi_abort(struct scsi_cmnd *sc) return virtscsi_tmf(vscsi, cmd); } -static struct scsi_host_template virtscsi_host_template = { +static int virtscsi_target_alloc(struct scsi_target *starget) +{ + struct virtio_scsi_target_state *tgt = + kmalloc(sizeof(*tgt), GFP_KERNEL); + if (!tgt) + return -ENOMEM; + + spin_lock_init(&tgt->tgt_lock); + atomic_set(&tgt->reqs, 0); + tgt->req_vq = NULL; + + starget->hostdata = tgt; + return 0; +} + +static void virtscsi_target_destroy(struct scsi_target *starget) +{ + struct virtio_scsi_target_state *tgt = starget->hostdata; + kfree(tgt); +} + +static struct scsi_host_template virtscsi_host_template_single = { + .module = THIS_MODULE, + .name = "Virtio SCSI HBA", + .proc_name = "virtio_scsi", + .this_id = -1, + .queuecommand = virtscsi_queuecommand_single, + .eh_abort_handler = virtscsi_abort, + .eh_device_reset_handler = virtscsi_device_reset, + + .can_queue = 1024, + .dma_boundary = UINT_MAX, + .use_clustering = ENABLE_CLUSTERING, + .target_alloc = virtscsi_target_alloc, + .target_destroy = virtscsi_target_destroy, +}; + +static struct scsi_host_template virtscsi_host_template_multi = { .module = THIS_MODULE, .name = "Virtio SCSI HBA", .proc_name = "virtio_scsi", - .queuecommand = virtscsi_queuecommand, .this_id = -1, + .queuecommand = virtscsi_queuecommand_multi, .eh_abort_handler = virtscsi_abort, .eh_device_reset_handler = virtscsi_device_reset, .can_queue = 1024, .dma_boundary = UINT_MAX, .use_clustering = ENABLE_CLUSTERING, + .target_alloc = virtscsi_target_alloc, + .target_destroy = virtscsi_target_destroy, }; #define virtscsi_config_get(vdev, fld) \ @@ -578,29 +724,69 @@ static struct scsi_host_template virtscsi_host_template = { &__val, sizeof(__val)); \ }) -static void virtscsi_init_vq(struct virtio_scsi_vq *virtscsi_vq, - struct virtqueue *vq) +static void __virtscsi_set_affinity(struct virtio_scsi *vscsi, bool affinity) { - spin_lock_init(&virtscsi_vq->vq_lock); - virtscsi_vq->vq = vq; + int i; + int cpu; + + /* In multiqueue mode, when the number of cpu is equal + * to the number of request queues, we let the qeueues + * to be private to one cpu by setting the affinity hint + * to eliminate the contention. + */ + if ((vscsi->num_queues == 1 || + vscsi->num_queues != num_online_cpus()) && affinity) { + if (vscsi->affinity_hint_set) + affinity = false; + else + return; + } + + if (affinity) { + i = 0; + for_each_online_cpu(cpu) { + virtqueue_set_affinity(vscsi->req_vqs[i].vq, cpu); + i++; + } + + vscsi->affinity_hint_set = true; + } else { + for (i = 0; i < vscsi->num_queues - VIRTIO_SCSI_VQ_BASE; i++) + virtqueue_set_affinity(vscsi->req_vqs[i].vq, -1); + + vscsi->affinity_hint_set = false; + } } -static struct virtio_scsi_target_state *virtscsi_alloc_tgt( - struct virtio_device *vdev, int sg_elems) +static void virtscsi_set_affinity(struct virtio_scsi *vscsi, bool affinity) { - struct virtio_scsi_target_state *tgt; - gfp_t gfp_mask = GFP_KERNEL; - - /* We need extra sg elements at head and tail. */ - tgt = kmalloc(sizeof(*tgt) + sizeof(tgt->sg[0]) * (sg_elems + 2), - gfp_mask); + get_online_cpus(); + __virtscsi_set_affinity(vscsi, affinity); + put_online_cpus(); +} - if (!tgt) - return NULL; +static int virtscsi_cpu_callback(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + struct virtio_scsi *vscsi = container_of(nfb, struct virtio_scsi, nb); + switch(action) { + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + case CPU_DEAD: + case CPU_DEAD_FROZEN: + __virtscsi_set_affinity(vscsi, true); + break; + default: + break; + } + return NOTIFY_OK; +} - spin_lock_init(&tgt->tgt_lock); - sg_init_table(tgt->sg, sg_elems + 2); - return tgt; +static void virtscsi_init_vq(struct virtio_scsi_vq *virtscsi_vq, + struct virtqueue *vq) +{ + spin_lock_init(&virtscsi_vq->vq_lock); + virtscsi_vq->vq = vq; } static void virtscsi_scan(struct virtio_device *vdev) @@ -614,46 +800,56 @@ static void virtscsi_remove_vqs(struct virtio_device *vdev) { struct Scsi_Host *sh = virtio_scsi_host(vdev); struct virtio_scsi *vscsi = shost_priv(sh); - u32 i, num_targets; + + virtscsi_set_affinity(vscsi, false); /* Stop all the virtqueues. */ vdev->config->reset(vdev); - num_targets = sh->max_id; - for (i = 0; i < num_targets; i++) { - kfree(vscsi->tgt[i]); - vscsi->tgt[i] = NULL; - } - vdev->config->del_vqs(vdev); } static int virtscsi_init(struct virtio_device *vdev, - struct virtio_scsi *vscsi, int num_targets) + struct virtio_scsi *vscsi) { int err; - struct virtqueue *vqs[3]; - u32 i, sg_elems; + u32 i; + u32 num_vqs; + vq_callback_t **callbacks; + const char **names; + struct virtqueue **vqs; + + num_vqs = vscsi->num_queues + VIRTIO_SCSI_VQ_BASE; + vqs = kmalloc(num_vqs * sizeof(struct virtqueue *), GFP_KERNEL); + callbacks = kmalloc(num_vqs * sizeof(vq_callback_t *), GFP_KERNEL); + names = kmalloc(num_vqs * sizeof(char *), GFP_KERNEL); + + if (!callbacks || !vqs || !names) { + err = -ENOMEM; + goto out; + } - vq_callback_t *callbacks[] = { - virtscsi_ctrl_done, - virtscsi_event_done, - virtscsi_req_done - }; - const char *names[] = { - "control", - "event", - "request" - }; + callbacks[0] = virtscsi_ctrl_done; + callbacks[1] = virtscsi_event_done; + names[0] = "control"; + names[1] = "event"; + for (i = VIRTIO_SCSI_VQ_BASE; i < num_vqs; i++) { + callbacks[i] = virtscsi_req_done; + names[i] = "request"; + } /* Discover virtqueues and write information to configuration. */ - err = vdev->config->find_vqs(vdev, 3, vqs, callbacks, names); + err = vdev->config->find_vqs(vdev, num_vqs, vqs, callbacks, names); if (err) - return err; + goto out; virtscsi_init_vq(&vscsi->ctrl_vq, vqs[0]); virtscsi_init_vq(&vscsi->event_vq, vqs[1]); - virtscsi_init_vq(&vscsi->req_vq, vqs[2]); + for (i = VIRTIO_SCSI_VQ_BASE; i < num_vqs; i++) + virtscsi_init_vq(&vscsi->req_vqs[i - VIRTIO_SCSI_VQ_BASE], + vqs[i]); + + virtscsi_set_affinity(vscsi, true); virtscsi_config_set(vdev, cdb_size, VIRTIO_SCSI_CDB_SIZE); virtscsi_config_set(vdev, sense_size, VIRTIO_SCSI_SENSE_SIZE); @@ -661,19 +857,12 @@ static int virtscsi_init(struct virtio_device *vdev, if (virtio_has_feature(vdev, VIRTIO_SCSI_F_HOTPLUG)) virtscsi_kick_event_all(vscsi); - /* We need to know how many segments before we allocate. */ - sg_elems = virtscsi_config_get(vdev, seg_max) ?: 1; - - for (i = 0; i < num_targets; i++) { - vscsi->tgt[i] = virtscsi_alloc_tgt(vdev, sg_elems); - if (!vscsi->tgt[i]) { - err = -ENOMEM; - goto out; - } - } err = 0; out: + kfree(names); + kfree(callbacks); + kfree(vqs); if (err) virtscsi_remove_vqs(vdev); return err; @@ -686,13 +875,21 @@ static int virtscsi_probe(struct virtio_device *vdev) int err; u32 sg_elems, num_targets; u32 cmd_per_lun; + u32 num_queues; + struct scsi_host_template *hostt; + + /* We need to know how many queues before we allocate. */ + num_queues = virtscsi_config_get(vdev, num_queues) ? : 1; - /* Allocate memory and link the structs together. */ num_targets = virtscsi_config_get(vdev, max_target) + 1; - shost = scsi_host_alloc(&virtscsi_host_template, - sizeof(*vscsi) - + num_targets * sizeof(struct virtio_scsi_target_state)); + if (num_queues == 1) + hostt = &virtscsi_host_template_single; + else + hostt = &virtscsi_host_template_multi; + + shost = scsi_host_alloc(hostt, + sizeof(*vscsi) + sizeof(vscsi->req_vqs[0]) * num_queues); if (!shost) return -ENOMEM; @@ -700,12 +897,20 @@ static int virtscsi_probe(struct virtio_device *vdev) shost->sg_tablesize = sg_elems; vscsi = shost_priv(shost); vscsi->vdev = vdev; + vscsi->num_queues = num_queues; vdev->priv = shost; - err = virtscsi_init(vdev, vscsi, num_targets); + err = virtscsi_init(vdev, vscsi); if (err) goto virtscsi_init_failed; + vscsi->nb.notifier_call = &virtscsi_cpu_callback; + err = register_hotcpu_notifier(&vscsi->nb); + if (err) { + pr_err("registering cpu notifier failed\n"); + goto scsi_add_host_failed; + } + cmd_per_lun = virtscsi_config_get(vdev, cmd_per_lun) ?: 1; shost->cmd_per_lun = min_t(u32, cmd_per_lun, shost->can_queue); shost->max_sectors = virtscsi_config_get(vdev, max_sectors) ?: 0xFFFF; @@ -743,6 +948,8 @@ static void virtscsi_remove(struct virtio_device *vdev) scsi_remove_host(shost); + unregister_hotcpu_notifier(&vscsi->nb); + virtscsi_remove_vqs(vdev); scsi_host_put(shost); } @@ -759,7 +966,7 @@ static int virtscsi_restore(struct virtio_device *vdev) struct Scsi_Host *sh = virtio_scsi_host(vdev); struct virtio_scsi *vscsi = shost_priv(sh); - return virtscsi_init(vdev, vscsi, sh->max_id); + return virtscsi_init(vdev, vscsi); } #endif @@ -794,8 +1001,7 @@ static int __init init(void) virtscsi_cmd_cache = KMEM_CACHE(virtio_scsi_cmd, 0); if (!virtscsi_cmd_cache) { - printk(KERN_ERR "kmem_cache_create() for " - "virtscsi_cmd_cache failed\n"); + pr_err("kmem_cache_create() for virtscsi_cmd_cache failed\n"); goto error; } @@ -804,8 +1010,7 @@ static int __init init(void) mempool_create_slab_pool(VIRTIO_SCSI_MEMPOOL_SZ, virtscsi_cmd_cache); if (!virtscsi_cmd_pool) { - printk(KERN_ERR "mempool_create() for" - "virtscsi_cmd_pool failed\n"); + pr_err("mempool_create() for virtscsi_cmd_pool failed\n"); goto error; } ret = register_virtio_driver(&virtio_scsi_driver); diff --git a/drivers/staging/media/go7007/go7007-driver.c b/drivers/staging/media/go7007/go7007-driver.c index a5ca99d7c0b2..3640df0aa0c1 100644 --- a/drivers/staging/media/go7007/go7007-driver.c +++ b/drivers/staging/media/go7007/go7007-driver.c @@ -242,11 +242,8 @@ static void go7007_remove(struct v4l2_device *v4l2_dev) if (go->hpi_ops->release) go->hpi_ops->release(go); if (go->i2c_adapter_online) { - if (i2c_del_adapter(&go->i2c_adapter) == 0) - go->i2c_adapter_online = 0; - else - v4l2_err(&go->v4l2_dev, - "error removing I2C adapter!\n"); + i2c_del_adapter(&go->i2c_adapter); + go->i2c_adapter_online = 0; } kfree(go->boot_fw); diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 09d2e3ffd6fc..ac3725440d64 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -201,7 +201,9 @@ static int vfio_pci_get_irq_count(struct vfio_pci_device *vdev, int irq_type) return (flags & PCI_MSIX_FLAGS_QSIZE) + 1; } - } + } else if (irq_type == VFIO_PCI_ERR_IRQ_INDEX) + if (pci_is_pcie(vdev->pdev)) + return 1; return 0; } @@ -317,6 +319,17 @@ static long vfio_pci_ioctl(void *device_data, if (info.argsz < minsz || info.index >= VFIO_PCI_NUM_IRQS) return -EINVAL; + switch (info.index) { + case VFIO_PCI_INTX_IRQ_INDEX ... VFIO_PCI_MSIX_IRQ_INDEX: + break; + case VFIO_PCI_ERR_IRQ_INDEX: + if (pci_is_pcie(vdev->pdev)) + break; + /* pass thru to return error */ + default: + return -EINVAL; + } + info.flags = VFIO_IRQ_INFO_EVENTFD; info.count = vfio_pci_get_irq_count(vdev, info.index); @@ -552,11 +565,40 @@ static void vfio_pci_remove(struct pci_dev *pdev) kfree(vdev); } +static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev, + pci_channel_state_t state) +{ + struct vfio_pci_device *vdev; + struct vfio_device *device; + + device = vfio_device_get_from_dev(&pdev->dev); + if (device == NULL) + return PCI_ERS_RESULT_DISCONNECT; + + vdev = vfio_device_data(device); + if (vdev == NULL) { + vfio_device_put(device); + return PCI_ERS_RESULT_DISCONNECT; + } + + if (vdev->err_trigger) + eventfd_signal(vdev->err_trigger, 1); + + vfio_device_put(device); + + return PCI_ERS_RESULT_CAN_RECOVER; +} + +static struct pci_error_handlers vfio_err_handlers = { + .error_detected = vfio_pci_aer_err_detected, +}; + static struct pci_driver vfio_pci_driver = { .name = "vfio-pci", .id_table = NULL, /* only dynamic ids */ .probe = vfio_pci_probe, .remove = vfio_pci_remove, + .err_handler = &vfio_err_handlers, }; static void __exit vfio_pci_cleanup(void) diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c index aeb00fc2d3be..affa34745be9 100644 --- a/drivers/vfio/pci/vfio_pci_config.c +++ b/drivers/vfio/pci/vfio_pci_config.c @@ -274,9 +274,10 @@ static int vfio_direct_config_read(struct vfio_pci_device *vdev, int pos, return count; } -static int vfio_direct_config_write(struct vfio_pci_device *vdev, int pos, - int count, struct perm_bits *perm, - int offset, __le32 val) +/* Raw access skips any kind of virtualization */ +static int vfio_raw_config_write(struct vfio_pci_device *vdev, int pos, + int count, struct perm_bits *perm, + int offset, __le32 val) { int ret; @@ -287,13 +288,36 @@ static int vfio_direct_config_write(struct vfio_pci_device *vdev, int pos, return count; } -/* Default all regions to read-only, no-virtualization */ +static int vfio_raw_config_read(struct vfio_pci_device *vdev, int pos, + int count, struct perm_bits *perm, + int offset, __le32 *val) +{ + int ret; + + ret = vfio_user_config_read(vdev->pdev, pos, val, count); + if (ret) + return pcibios_err_to_errno(ret); + + return count; +} + +/* Default capability regions to read-only, no-virtualization */ static struct perm_bits cap_perms[PCI_CAP_ID_MAX + 1] = { [0 ... PCI_CAP_ID_MAX] = { .readfn = vfio_direct_config_read } }; static struct perm_bits ecap_perms[PCI_EXT_CAP_ID_MAX + 1] = { [0 ... PCI_EXT_CAP_ID_MAX] = { .readfn = vfio_direct_config_read } }; +/* + * Default unassigned regions to raw read-write access. Some devices + * require this to function as they hide registers between the gaps in + * config space (be2net). Like MMIO and I/O port registers, we have + * to trust the hardware isolation. + */ +static struct perm_bits unassigned_perms = { + .readfn = vfio_raw_config_read, + .writefn = vfio_raw_config_write +}; static void free_perm_bits(struct perm_bits *perm) { @@ -779,16 +803,16 @@ int __init vfio_pci_init_perm_bits(void) /* Capabilities */ ret |= init_pci_cap_pm_perm(&cap_perms[PCI_CAP_ID_PM]); - cap_perms[PCI_CAP_ID_VPD].writefn = vfio_direct_config_write; + cap_perms[PCI_CAP_ID_VPD].writefn = vfio_raw_config_write; ret |= init_pci_cap_pcix_perm(&cap_perms[PCI_CAP_ID_PCIX]); - cap_perms[PCI_CAP_ID_VNDR].writefn = vfio_direct_config_write; + cap_perms[PCI_CAP_ID_VNDR].writefn = vfio_raw_config_write; ret |= init_pci_cap_exp_perm(&cap_perms[PCI_CAP_ID_EXP]); ret |= init_pci_cap_af_perm(&cap_perms[PCI_CAP_ID_AF]); /* Extended capabilities */ ret |= init_pci_ext_cap_err_perm(&ecap_perms[PCI_EXT_CAP_ID_ERR]); ret |= init_pci_ext_cap_pwr_perm(&ecap_perms[PCI_EXT_CAP_ID_PWR]); - ecap_perms[PCI_EXT_CAP_ID_VNDR].writefn = vfio_direct_config_write; + ecap_perms[PCI_EXT_CAP_ID_VNDR].writefn = vfio_raw_config_write; if (ret) vfio_pci_uninit_perm_bits(); @@ -801,9 +825,6 @@ static int vfio_find_cap_start(struct vfio_pci_device *vdev, int pos) u8 cap; int base = (pos >= PCI_CFG_SPACE_SIZE) ? PCI_CFG_SPACE_SIZE : PCI_STD_HEADER_SIZEOF; - base /= 4; - pos /= 4; - cap = vdev->pci_config_map[pos]; if (cap == PCI_CAP_ID_BASIC) @@ -813,7 +834,7 @@ static int vfio_find_cap_start(struct vfio_pci_device *vdev, int pos) while (pos - 1 >= base && vdev->pci_config_map[pos - 1] == cap) pos--; - return pos * 4; + return pos; } static int vfio_msi_config_read(struct vfio_pci_device *vdev, int pos, @@ -1017,13 +1038,9 @@ static int vfio_cap_len(struct vfio_pci_device *vdev, u8 cap, u8 pos) return byte; case PCI_CAP_ID_EXP: /* length based on version */ - ret = pci_read_config_word(pdev, pos + PCI_EXP_FLAGS, &word); - if (ret) - return pcibios_err_to_errno(ret); - vdev->extended_caps = true; - if ((word & PCI_EXP_FLAGS_VERS) == 1) + if ((pcie_caps_reg(pdev) & PCI_EXP_FLAGS_VERS) == 1) return PCI_CAP_EXP_ENDPOINT_SIZEOF_V1; else return PCI_CAP_EXP_ENDPOINT_SIZEOF_V2; @@ -1230,8 +1247,8 @@ static int vfio_cap_init(struct vfio_pci_device *vdev) } /* Sanity check, do we overlap other capabilities? */ - for (i = 0; i < len; i += 4) { - if (likely(map[(pos + i) / 4] == PCI_CAP_ID_INVALID)) + for (i = 0; i < len; i++) { + if (likely(map[pos + i] == PCI_CAP_ID_INVALID)) continue; pr_warn("%s: %s pci config conflict @0x%x, was cap 0x%x now cap 0x%x\n", @@ -1239,7 +1256,7 @@ static int vfio_cap_init(struct vfio_pci_device *vdev) pos + i, map[pos + i], cap); } - memset(map + (pos / 4), cap, len / 4); + memset(map + pos, cap, len); ret = vfio_fill_vconfig_bytes(vdev, pos, len); if (ret) return ret; @@ -1314,8 +1331,8 @@ static int vfio_ecap_init(struct vfio_pci_device *vdev) hidden = true; } - for (i = 0; i < len; i += 4) { - if (likely(map[(epos + i) / 4] == PCI_CAP_ID_INVALID)) + for (i = 0; i < len; i++) { + if (likely(map[epos + i] == PCI_CAP_ID_INVALID)) continue; pr_warn("%s: %s pci config conflict @0x%x, was ecap 0x%x now ecap 0x%x\n", @@ -1330,7 +1347,7 @@ static int vfio_ecap_init(struct vfio_pci_device *vdev) */ BUILD_BUG_ON(PCI_EXT_CAP_ID_MAX >= PCI_CAP_ID_INVALID); - memset(map + (epos / 4), ecap, len / 4); + memset(map + epos, ecap, len); ret = vfio_fill_vconfig_bytes(vdev, epos, len); if (ret) return ret; @@ -1377,10 +1394,12 @@ int vfio_config_init(struct vfio_pci_device *vdev) int ret; /* - * Config space, caps and ecaps are all dword aligned, so we can - * use one byte per dword to record the type. + * Config space, caps and ecaps are all dword aligned, so we could + * use one byte per dword to record the type. However, there are + * no requiremenst on the length of a capability, so the gap between + * capabilities needs byte granularity. */ - map = kmalloc(pdev->cfg_size / 4, GFP_KERNEL); + map = kmalloc(pdev->cfg_size, GFP_KERNEL); if (!map) return -ENOMEM; @@ -1393,9 +1412,9 @@ int vfio_config_init(struct vfio_pci_device *vdev) vdev->pci_config_map = map; vdev->vconfig = vconfig; - memset(map, PCI_CAP_ID_BASIC, PCI_STD_HEADER_SIZEOF / 4); - memset(map + (PCI_STD_HEADER_SIZEOF / 4), PCI_CAP_ID_INVALID, - (pdev->cfg_size - PCI_STD_HEADER_SIZEOF) / 4); + memset(map, PCI_CAP_ID_BASIC, PCI_STD_HEADER_SIZEOF); + memset(map + PCI_STD_HEADER_SIZEOF, PCI_CAP_ID_INVALID, + pdev->cfg_size - PCI_STD_HEADER_SIZEOF); ret = vfio_fill_vconfig_bytes(vdev, 0, PCI_STD_HEADER_SIZEOF); if (ret) @@ -1450,6 +1469,22 @@ void vfio_config_free(struct vfio_pci_device *vdev) vdev->msi_perm = NULL; } +/* + * Find the remaining number of bytes in a dword that match the given + * position. Stop at either the end of the capability or the dword boundary. + */ +static size_t vfio_pci_cap_remaining_dword(struct vfio_pci_device *vdev, + loff_t pos) +{ + u8 cap = vdev->pci_config_map[pos]; + size_t i; + + for (i = 1; (pos + i) % 4 && vdev->pci_config_map[pos + i] == cap; i++) + /* nop */; + + return i; +} + static ssize_t vfio_config_do_rw(struct vfio_pci_device *vdev, char __user *buf, size_t count, loff_t *ppos, bool iswrite) { @@ -1458,55 +1493,48 @@ static ssize_t vfio_config_do_rw(struct vfio_pci_device *vdev, char __user *buf, __le32 val = 0; int cap_start = 0, offset; u8 cap_id; - ssize_t ret = count; + ssize_t ret; - if (*ppos < 0 || *ppos + count > pdev->cfg_size) + if (*ppos < 0 || *ppos >= pdev->cfg_size || + *ppos + count > pdev->cfg_size) return -EFAULT; /* - * gcc can't seem to figure out we're a static function, only called - * with count of 1/2/4 and hits copy_from_user_overflow without this. + * Chop accesses into aligned chunks containing no more than a + * single capability. Caller increments to the next chunk. */ - if (count > sizeof(val)) - return -EINVAL; - - cap_id = vdev->pci_config_map[*ppos / 4]; - - if (cap_id == PCI_CAP_ID_INVALID) { - if (iswrite) - return ret; /* drop */ - - /* - * Per PCI spec 3.0, section 6.1, reads from reserved and - * unimplemented registers return 0 - */ - if (copy_to_user(buf, &val, count)) - return -EFAULT; - - return ret; - } + count = min(count, vfio_pci_cap_remaining_dword(vdev, *ppos)); + if (count >= 4 && !(*ppos % 4)) + count = 4; + else if (count >= 2 && !(*ppos % 2)) + count = 2; + else + count = 1; - /* - * All capabilities are minimum 4 bytes and aligned on dword - * boundaries. Since we don't support unaligned accesses, we're - * only ever accessing a single capability. - */ - if (*ppos >= PCI_CFG_SPACE_SIZE) { - WARN_ON(cap_id > PCI_EXT_CAP_ID_MAX); + ret = count; - perm = &ecap_perms[cap_id]; - cap_start = vfio_find_cap_start(vdev, *ppos); + cap_id = vdev->pci_config_map[*ppos]; + if (cap_id == PCI_CAP_ID_INVALID) { + perm = &unassigned_perms; + cap_start = *ppos; } else { - WARN_ON(cap_id > PCI_CAP_ID_MAX); + if (*ppos >= PCI_CFG_SPACE_SIZE) { + WARN_ON(cap_id > PCI_EXT_CAP_ID_MAX); - perm = &cap_perms[cap_id]; + perm = &ecap_perms[cap_id]; + cap_start = vfio_find_cap_start(vdev, *ppos); + } else { + WARN_ON(cap_id > PCI_CAP_ID_MAX); - if (cap_id == PCI_CAP_ID_MSI) - perm = vdev->msi_perm; + perm = &cap_perms[cap_id]; - if (cap_id > PCI_CAP_ID_BASIC) - cap_start = vfio_find_cap_start(vdev, *ppos); + if (cap_id == PCI_CAP_ID_MSI) + perm = vdev->msi_perm; + + if (cap_id > PCI_CAP_ID_BASIC) + cap_start = vfio_find_cap_start(vdev, *ppos); + } } WARN_ON(!cap_start && cap_id != PCI_CAP_ID_BASIC); @@ -1546,20 +1574,8 @@ ssize_t vfio_pci_config_rw(struct vfio_pci_device *vdev, char __user *buf, pos &= VFIO_PCI_OFFSET_MASK; - /* - * We want to both keep the access size the caller users as well as - * support reading large chunks of config space in a single call. - * PCI doesn't support unaligned accesses, so we can safely break - * those apart. - */ while (count) { - if (count >= 4 && !(pos % 4)) - ret = vfio_config_do_rw(vdev, buf, 4, &pos, iswrite); - else if (count >= 2 && !(pos % 2)) - ret = vfio_config_do_rw(vdev, buf, 2, &pos, iswrite); - else - ret = vfio_config_do_rw(vdev, buf, 1, &pos, iswrite); - + ret = vfio_config_do_rw(vdev, buf, count, &pos, iswrite); if (ret < 0) return ret; diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c index a96509187deb..4bc704e1b7c7 100644 --- a/drivers/vfio/pci/vfio_pci_intrs.c +++ b/drivers/vfio/pci/vfio_pci_intrs.c @@ -287,7 +287,8 @@ void vfio_pci_intx_mask(struct vfio_pci_device *vdev) * a signal is necessary, which can then be handled via a work queue * or directly depending on the caller. */ -int vfio_pci_intx_unmask_handler(struct vfio_pci_device *vdev, void *unused) +static int vfio_pci_intx_unmask_handler(struct vfio_pci_device *vdev, + void *unused) { struct pci_dev *pdev = vdev->pdev; unsigned long flags; @@ -746,6 +747,63 @@ static int vfio_pci_set_msi_trigger(struct vfio_pci_device *vdev, return 0; } +static int vfio_pci_set_err_trigger(struct vfio_pci_device *vdev, + unsigned index, unsigned start, + unsigned count, uint32_t flags, void *data) +{ + int32_t fd = *(int32_t *)data; + struct pci_dev *pdev = vdev->pdev; + + if ((index != VFIO_PCI_ERR_IRQ_INDEX) || + !(flags & VFIO_IRQ_SET_DATA_TYPE_MASK)) + return -EINVAL; + + /* + * device_lock synchronizes setting and checking of + * err_trigger. The vfio_pci_aer_err_detected() is also + * called with device_lock held. + */ + + /* DATA_NONE/DATA_BOOL enables loopback testing */ + + if (flags & VFIO_IRQ_SET_DATA_NONE) { + device_lock(&pdev->dev); + if (vdev->err_trigger) + eventfd_signal(vdev->err_trigger, 1); + device_unlock(&pdev->dev); + return 0; + } else if (flags & VFIO_IRQ_SET_DATA_BOOL) { + uint8_t trigger = *(uint8_t *)data; + device_lock(&pdev->dev); + if (trigger && vdev->err_trigger) + eventfd_signal(vdev->err_trigger, 1); + device_unlock(&pdev->dev); + return 0; + } + + /* Handle SET_DATA_EVENTFD */ + + if (fd == -1) { + device_lock(&pdev->dev); + if (vdev->err_trigger) + eventfd_ctx_put(vdev->err_trigger); + vdev->err_trigger = NULL; + device_unlock(&pdev->dev); + return 0; + } else if (fd >= 0) { + struct eventfd_ctx *efdctx; + efdctx = eventfd_ctx_fdget(fd); + if (IS_ERR(efdctx)) + return PTR_ERR(efdctx); + device_lock(&pdev->dev); + if (vdev->err_trigger) + eventfd_ctx_put(vdev->err_trigger); + vdev->err_trigger = efdctx; + device_unlock(&pdev->dev); + return 0; + } else + return -EINVAL; +} int vfio_pci_set_irqs_ioctl(struct vfio_pci_device *vdev, uint32_t flags, unsigned index, unsigned start, unsigned count, void *data) @@ -780,6 +838,13 @@ int vfio_pci_set_irqs_ioctl(struct vfio_pci_device *vdev, uint32_t flags, break; } break; + case VFIO_PCI_ERR_IRQ_INDEX: + switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) { + case VFIO_IRQ_SET_ACTION_TRIGGER: + if (pci_is_pcie(vdev->pdev)) + func = vfio_pci_set_err_trigger; + break; + } } if (!func) diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h index d7e55d03f49e..9c6d5d0f3b02 100644 --- a/drivers/vfio/pci/vfio_pci_private.h +++ b/drivers/vfio/pci/vfio_pci_private.h @@ -56,6 +56,7 @@ struct vfio_pci_device { bool has_vga; struct pci_saved_state *pci_saved_state; atomic_t refcnt; + struct eventfd_ctx *err_trigger; }; #define is_intx(vdev) (vdev->irq_type == VFIO_PCI_INTX_IRQ_INDEX) diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index fcc12f3e60a3..acb7121a9316 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -24,8 +24,10 @@ #include <linux/list.h> #include <linux/module.h> #include <linux/mutex.h> +#include <linux/rwsem.h> #include <linux/sched.h> #include <linux/slab.h> +#include <linux/stat.h> #include <linux/string.h> #include <linux/uaccess.h> #include <linux/vfio.h> @@ -57,7 +59,7 @@ struct vfio_iommu_driver { struct vfio_container { struct kref kref; struct list_head group_list; - struct mutex group_lock; + struct rw_semaphore group_lock; struct vfio_iommu_driver *iommu_driver; void *iommu_data; }; @@ -392,12 +394,13 @@ static void vfio_device_release(struct kref *kref) } /* Device reference always implies a group reference */ -static void vfio_device_put(struct vfio_device *device) +void vfio_device_put(struct vfio_device *device) { struct vfio_group *group = device->group; kref_put_mutex(&device->kref, vfio_device_release, &group->device_lock); vfio_group_put(group); } +EXPORT_SYMBOL_GPL(vfio_device_put); static void vfio_device_get(struct vfio_device *device) { @@ -627,6 +630,33 @@ int vfio_add_group_dev(struct device *dev, } EXPORT_SYMBOL_GPL(vfio_add_group_dev); +/** + * Get a reference to the vfio_device for a device that is known to + * be bound to a vfio driver. The driver implicitly holds a + * vfio_device reference between vfio_add_group_dev and + * vfio_del_group_dev. We can therefore use drvdata to increment + * that reference from the struct device. This additional + * reference must be released by calling vfio_device_put. + */ +struct vfio_device *vfio_device_get_from_dev(struct device *dev) +{ + struct vfio_device *device = dev_get_drvdata(dev); + + vfio_device_get(device); + + return device; +} +EXPORT_SYMBOL_GPL(vfio_device_get_from_dev); + +/* + * Caller must hold a reference to the vfio_device + */ +void *vfio_device_data(struct vfio_device *device) +{ + return device->device_data; +} +EXPORT_SYMBOL_GPL(vfio_device_data); + /* Given a referenced group, check if it contains the device */ static bool vfio_dev_present(struct vfio_group *group, struct device *dev) { @@ -675,9 +705,13 @@ EXPORT_SYMBOL_GPL(vfio_del_group_dev); static long vfio_ioctl_check_extension(struct vfio_container *container, unsigned long arg) { - struct vfio_iommu_driver *driver = container->iommu_driver; + struct vfio_iommu_driver *driver; long ret = 0; + down_read(&container->group_lock); + + driver = container->iommu_driver; + switch (arg) { /* No base extensions yet */ default: @@ -707,10 +741,12 @@ static long vfio_ioctl_check_extension(struct vfio_container *container, VFIO_CHECK_EXTENSION, arg); } + up_read(&container->group_lock); + return ret; } -/* hold container->group_lock */ +/* hold write lock on container->group_lock */ static int __vfio_container_attach_groups(struct vfio_container *container, struct vfio_iommu_driver *driver, void *data) @@ -741,7 +777,7 @@ static long vfio_ioctl_set_iommu(struct vfio_container *container, struct vfio_iommu_driver *driver; long ret = -ENODEV; - mutex_lock(&container->group_lock); + down_write(&container->group_lock); /* * The container is designed to be an unprivileged interface while @@ -752,7 +788,7 @@ static long vfio_ioctl_set_iommu(struct vfio_container *container, * the container is deprivileged and returns to an unset state. */ if (list_empty(&container->group_list) || container->iommu_driver) { - mutex_unlock(&container->group_lock); + up_write(&container->group_lock); return -EINVAL; } @@ -799,7 +835,7 @@ static long vfio_ioctl_set_iommu(struct vfio_container *container, mutex_unlock(&vfio.iommu_drivers_lock); skip_drivers_unlock: - mutex_unlock(&container->group_lock); + up_write(&container->group_lock); return ret; } @@ -815,9 +851,6 @@ static long vfio_fops_unl_ioctl(struct file *filep, if (!container) return ret; - driver = container->iommu_driver; - data = container->iommu_data; - switch (cmd) { case VFIO_GET_API_VERSION: ret = VFIO_API_VERSION; @@ -829,8 +862,15 @@ static long vfio_fops_unl_ioctl(struct file *filep, ret = vfio_ioctl_set_iommu(container, arg); break; default: + down_read(&container->group_lock); + + driver = container->iommu_driver; + data = container->iommu_data; + if (driver) /* passthrough all unrecognized ioctls */ ret = driver->ops->ioctl(data, cmd, arg); + + up_read(&container->group_lock); } return ret; @@ -854,7 +894,7 @@ static int vfio_fops_open(struct inode *inode, struct file *filep) return -ENOMEM; INIT_LIST_HEAD(&container->group_list); - mutex_init(&container->group_lock); + init_rwsem(&container->group_lock); kref_init(&container->kref); filep->private_data = container; @@ -881,35 +921,55 @@ static ssize_t vfio_fops_read(struct file *filep, char __user *buf, size_t count, loff_t *ppos) { struct vfio_container *container = filep->private_data; - struct vfio_iommu_driver *driver = container->iommu_driver; + struct vfio_iommu_driver *driver; + ssize_t ret = -EINVAL; - if (unlikely(!driver || !driver->ops->read)) - return -EINVAL; + down_read(&container->group_lock); - return driver->ops->read(container->iommu_data, buf, count, ppos); + driver = container->iommu_driver; + if (likely(driver && driver->ops->read)) + ret = driver->ops->read(container->iommu_data, + buf, count, ppos); + + up_read(&container->group_lock); + + return ret; } static ssize_t vfio_fops_write(struct file *filep, const char __user *buf, size_t count, loff_t *ppos) { struct vfio_container *container = filep->private_data; - struct vfio_iommu_driver *driver = container->iommu_driver; + struct vfio_iommu_driver *driver; + ssize_t ret = -EINVAL; - if (unlikely(!driver || !driver->ops->write)) - return -EINVAL; + down_read(&container->group_lock); - return driver->ops->write(container->iommu_data, buf, count, ppos); + driver = container->iommu_driver; + if (likely(driver && driver->ops->write)) + ret = driver->ops->write(container->iommu_data, + buf, count, ppos); + + up_read(&container->group_lock); + + return ret; } static int vfio_fops_mmap(struct file *filep, struct vm_area_struct *vma) { struct vfio_container *container = filep->private_data; - struct vfio_iommu_driver *driver = container->iommu_driver; + struct vfio_iommu_driver *driver; + int ret = -EINVAL; - if (unlikely(!driver || !driver->ops->mmap)) - return -EINVAL; + down_read(&container->group_lock); - return driver->ops->mmap(container->iommu_data, vma); + driver = container->iommu_driver; + if (likely(driver && driver->ops->mmap)) + ret = driver->ops->mmap(container->iommu_data, vma); + + up_read(&container->group_lock); + + return ret; } static const struct file_operations vfio_fops = { @@ -933,7 +993,7 @@ static void __vfio_group_unset_container(struct vfio_group *group) struct vfio_container *container = group->container; struct vfio_iommu_driver *driver; - mutex_lock(&container->group_lock); + down_write(&container->group_lock); driver = container->iommu_driver; if (driver) @@ -951,7 +1011,7 @@ static void __vfio_group_unset_container(struct vfio_group *group) container->iommu_data = NULL; } - mutex_unlock(&container->group_lock); + up_write(&container->group_lock); vfio_container_put(container); } @@ -1011,7 +1071,7 @@ static int vfio_group_set_container(struct vfio_group *group, int container_fd) container = f.file->private_data; WARN_ON(!container); /* fget ensures we don't race vfio_release */ - mutex_lock(&container->group_lock); + down_write(&container->group_lock); driver = container->iommu_driver; if (driver) { @@ -1029,7 +1089,7 @@ static int vfio_group_set_container(struct vfio_group *group, int container_fd) atomic_inc(&group->container_users); unlock_out: - mutex_unlock(&container->group_lock); + up_write(&container->group_lock); fdput(f); return ret; } @@ -1300,6 +1360,9 @@ static const struct file_operations vfio_device_fops = { */ static char *vfio_devnode(struct device *dev, umode_t *mode) { + if (MINOR(dev->devt) == 0) + *mode = S_IRUGO | S_IWUGO; + return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev)); } diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig index bf243177ffe1..8b9226da3f54 100644 --- a/drivers/vhost/Kconfig +++ b/drivers/vhost/Kconfig @@ -1,6 +1,7 @@ config VHOST_NET tristate "Host kernel accelerator for virtio net" depends on NET && EVENTFD && (TUN || !TUN) && (MACVTAP || !MACVTAP) + select VHOST_RING ---help--- This kernel module can be loaded in host kernel to accelerate guest networking with virtio_net. Not to be confused with virtio_net @@ -9,6 +10,17 @@ config VHOST_NET To compile this driver as a module, choose M here: the module will be called vhost_net. -if STAGING -source "drivers/vhost/Kconfig.tcm" -endif +config VHOST_SCSI + tristate "VHOST_SCSI TCM fabric driver" + depends on TARGET_CORE && EVENTFD && m + select VHOST_RING + default n + ---help--- + Say M here to enable the vhost_scsi TCM fabric module + for use with virtio-scsi guests + +config VHOST_RING + tristate + ---help--- + This option is selected by any driver which needs to access + the host side of a virtio ring. diff --git a/drivers/vhost/Kconfig.tcm b/drivers/vhost/Kconfig.tcm deleted file mode 100644 index 7e3aa28d999e..000000000000 --- a/drivers/vhost/Kconfig.tcm +++ /dev/null @@ -1,6 +0,0 @@ -config TCM_VHOST - tristate "TCM_VHOST fabric module" - depends on TARGET_CORE && EVENTFD && m - default n - ---help--- - Say M here to enable the TCM_VHOST fabric module for use with virtio-scsi guests diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile index a27b053bc9ab..654e9afb11f5 100644 --- a/drivers/vhost/Makefile +++ b/drivers/vhost/Makefile @@ -1,4 +1,7 @@ obj-$(CONFIG_VHOST_NET) += vhost_net.o vhost_net-y := vhost.o net.o -obj-$(CONFIG_TCM_VHOST) += tcm_vhost.o +obj-$(CONFIG_VHOST_SCSI) += vhost_scsi.o +vhost_scsi-y := scsi.o + +obj-$(CONFIG_VHOST_RING) += vringh.o diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 87c216c1e54e..a3645bd163d8 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -64,9 +64,35 @@ enum { VHOST_NET_VQ_MAX = 2, }; +struct vhost_ubuf_ref { + struct kref kref; + wait_queue_head_t wait; + struct vhost_virtqueue *vq; +}; + +struct vhost_net_virtqueue { + struct vhost_virtqueue vq; + /* hdr is used to store the virtio header. + * Since each iovec has >= 1 byte length, we never need more than + * header length entries to store the header. */ + struct iovec hdr[sizeof(struct virtio_net_hdr_mrg_rxbuf)]; + size_t vhost_hlen; + size_t sock_hlen; + /* vhost zerocopy support fields below: */ + /* last used idx for outstanding DMA zerocopy buffers */ + int upend_idx; + /* first used idx for DMA done zerocopy buffers */ + int done_idx; + /* an array of userspace buffers info */ + struct ubuf_info *ubuf_info; + /* Reference counting for outstanding ubufs. + * Protected by vq mutex. Writers must also take device mutex. */ + struct vhost_ubuf_ref *ubufs; +}; + struct vhost_net { struct vhost_dev dev; - struct vhost_virtqueue vqs[VHOST_NET_VQ_MAX]; + struct vhost_net_virtqueue vqs[VHOST_NET_VQ_MAX]; struct vhost_poll poll[VHOST_NET_VQ_MAX]; /* Number of TX recently submitted. * Protected by tx vq lock. */ @@ -78,6 +104,90 @@ struct vhost_net { bool tx_flush; }; +static unsigned vhost_zcopy_mask __read_mostly; + +void vhost_enable_zcopy(int vq) +{ + vhost_zcopy_mask |= 0x1 << vq; +} + +static void vhost_zerocopy_done_signal(struct kref *kref) +{ + struct vhost_ubuf_ref *ubufs = container_of(kref, struct vhost_ubuf_ref, + kref); + wake_up(&ubufs->wait); +} + +struct vhost_ubuf_ref *vhost_ubuf_alloc(struct vhost_virtqueue *vq, + bool zcopy) +{ + struct vhost_ubuf_ref *ubufs; + /* No zero copy backend? Nothing to count. */ + if (!zcopy) + return NULL; + ubufs = kmalloc(sizeof(*ubufs), GFP_KERNEL); + if (!ubufs) + return ERR_PTR(-ENOMEM); + kref_init(&ubufs->kref); + init_waitqueue_head(&ubufs->wait); + ubufs->vq = vq; + return ubufs; +} + +void vhost_ubuf_put(struct vhost_ubuf_ref *ubufs) +{ + kref_put(&ubufs->kref, vhost_zerocopy_done_signal); +} + +void vhost_ubuf_put_and_wait(struct vhost_ubuf_ref *ubufs) +{ + kref_put(&ubufs->kref, vhost_zerocopy_done_signal); + wait_event(ubufs->wait, !atomic_read(&ubufs->kref.refcount)); + kfree(ubufs); +} + +int vhost_net_set_ubuf_info(struct vhost_net *n) +{ + bool zcopy; + int i; + + for (i = 0; i < n->dev.nvqs; ++i) { + zcopy = vhost_zcopy_mask & (0x1 << i); + if (!zcopy) + continue; + n->vqs[i].ubuf_info = kmalloc(sizeof(*n->vqs[i].ubuf_info) * + UIO_MAXIOV, GFP_KERNEL); + if (!n->vqs[i].ubuf_info) + goto err; + } + return 0; + +err: + while (i--) { + zcopy = vhost_zcopy_mask & (0x1 << i); + if (!zcopy) + continue; + kfree(n->vqs[i].ubuf_info); + } + return -ENOMEM; +} + +void vhost_net_vq_reset(struct vhost_net *n) +{ + int i; + + for (i = 0; i < VHOST_NET_VQ_MAX; i++) { + n->vqs[i].done_idx = 0; + n->vqs[i].upend_idx = 0; + n->vqs[i].ubufs = NULL; + kfree(n->vqs[i].ubuf_info); + n->vqs[i].ubuf_info = NULL; + n->vqs[i].vhost_hlen = 0; + n->vqs[i].sock_hlen = 0; + } + +} + static void vhost_net_tx_packet(struct vhost_net *net) { ++net->tx_packets; @@ -153,10 +263,12 @@ static void copy_iovec_hdr(const struct iovec *from, struct iovec *to, static int vhost_zerocopy_signal_used(struct vhost_net *net, struct vhost_virtqueue *vq) { + struct vhost_net_virtqueue *nvq = + container_of(vq, struct vhost_net_virtqueue, vq); int i; int j = 0; - for (i = vq->done_idx; i != vq->upend_idx; i = (i + 1) % UIO_MAXIOV) { + for (i = nvq->done_idx; i != nvq->upend_idx; i = (i + 1) % UIO_MAXIOV) { if (vq->heads[i].len == VHOST_DMA_FAILED_LEN) vhost_net_tx_err(net); if (VHOST_DMA_IS_DONE(vq->heads[i].len)) { @@ -168,7 +280,7 @@ static int vhost_zerocopy_signal_used(struct vhost_net *net, break; } if (j) - vq->done_idx = i; + nvq->done_idx = i; return j; } @@ -198,7 +310,8 @@ static void vhost_zerocopy_callback(struct ubuf_info *ubuf, bool success) * read-size critical section for our kind of RCU. */ static void handle_tx(struct vhost_net *net) { - struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_TX]; + struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX]; + struct vhost_virtqueue *vq = &nvq->vq; unsigned out, in, s; int head; struct msghdr msg = { @@ -224,8 +337,8 @@ static void handle_tx(struct vhost_net *net) mutex_lock(&vq->mutex); vhost_disable_notify(&net->dev, vq); - hdr_size = vq->vhost_hlen; - zcopy = vq->ubufs; + hdr_size = nvq->vhost_hlen; + zcopy = nvq->ubufs; for (;;) { /* Release DMAs done buffers first */ @@ -246,9 +359,10 @@ static void handle_tx(struct vhost_net *net) /* If more outstanding DMAs, queue the work. * Handle upend_idx wrap around */ - num_pends = likely(vq->upend_idx >= vq->done_idx) ? - (vq->upend_idx - vq->done_idx) : - (vq->upend_idx + UIO_MAXIOV - vq->done_idx); + num_pends = likely(nvq->upend_idx >= nvq->done_idx) ? + (nvq->upend_idx - nvq->done_idx) : + (nvq->upend_idx + UIO_MAXIOV - + nvq->done_idx); if (unlikely(num_pends > VHOST_MAX_PEND)) break; if (unlikely(vhost_enable_notify(&net->dev, vq))) { @@ -263,45 +377,45 @@ static void handle_tx(struct vhost_net *net) break; } /* Skip header. TODO: support TSO. */ - s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, out); + s = move_iovec_hdr(vq->iov, nvq->hdr, hdr_size, out); msg.msg_iovlen = out; len = iov_length(vq->iov, out); /* Sanity check */ if (!len) { vq_err(vq, "Unexpected header len for TX: " "%zd expected %zd\n", - iov_length(vq->hdr, s), hdr_size); + iov_length(nvq->hdr, s), hdr_size); break; } zcopy_used = zcopy && (len >= VHOST_GOODCOPY_LEN || - vq->upend_idx != vq->done_idx); + nvq->upend_idx != nvq->done_idx); /* use msg_control to pass vhost zerocopy ubuf info to skb */ if (zcopy_used) { - vq->heads[vq->upend_idx].id = head; + vq->heads[nvq->upend_idx].id = head; if (!vhost_net_tx_select_zcopy(net) || len < VHOST_GOODCOPY_LEN) { /* copy don't need to wait for DMA done */ - vq->heads[vq->upend_idx].len = + vq->heads[nvq->upend_idx].len = VHOST_DMA_DONE_LEN; msg.msg_control = NULL; msg.msg_controllen = 0; ubufs = NULL; } else { struct ubuf_info *ubuf; - ubuf = vq->ubuf_info + vq->upend_idx; + ubuf = nvq->ubuf_info + nvq->upend_idx; - vq->heads[vq->upend_idx].len = + vq->heads[nvq->upend_idx].len = VHOST_DMA_IN_PROGRESS; ubuf->callback = vhost_zerocopy_callback; - ubuf->ctx = vq->ubufs; - ubuf->desc = vq->upend_idx; + ubuf->ctx = nvq->ubufs; + ubuf->desc = nvq->upend_idx; msg.msg_control = ubuf; msg.msg_controllen = sizeof(ubuf); - ubufs = vq->ubufs; + ubufs = nvq->ubufs; kref_get(&ubufs->kref); } - vq->upend_idx = (vq->upend_idx + 1) % UIO_MAXIOV; + nvq->upend_idx = (nvq->upend_idx + 1) % UIO_MAXIOV; } /* TODO: Check specific error and bomb out unless ENOBUFS? */ err = sock->ops->sendmsg(NULL, sock, &msg, len); @@ -309,8 +423,8 @@ static void handle_tx(struct vhost_net *net) if (zcopy_used) { if (ubufs) vhost_ubuf_put(ubufs); - vq->upend_idx = ((unsigned)vq->upend_idx - 1) % - UIO_MAXIOV; + nvq->upend_idx = ((unsigned)nvq->upend_idx - 1) + % UIO_MAXIOV; } vhost_discard_vq_desc(vq, 1); break; @@ -417,7 +531,8 @@ err: * read-size critical section for our kind of RCU. */ static void handle_rx(struct vhost_net *net) { - struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX]; + struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_RX]; + struct vhost_virtqueue *vq = &nvq->vq; unsigned uninitialized_var(in), log; struct vhost_log *vq_log; struct msghdr msg = { @@ -445,8 +560,8 @@ static void handle_rx(struct vhost_net *net) mutex_lock(&vq->mutex); vhost_disable_notify(&net->dev, vq); - vhost_hlen = vq->vhost_hlen; - sock_hlen = vq->sock_hlen; + vhost_hlen = nvq->vhost_hlen; + sock_hlen = nvq->sock_hlen; vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ? vq->log : NULL; @@ -476,11 +591,11 @@ static void handle_rx(struct vhost_net *net) /* We don't need to be notified again. */ if (unlikely((vhost_hlen))) /* Skip header. TODO: support TSO. */ - move_iovec_hdr(vq->iov, vq->hdr, vhost_hlen, in); + move_iovec_hdr(vq->iov, nvq->hdr, vhost_hlen, in); else /* Copy the header for use in VIRTIO_NET_F_MRG_RXBUF: * needed because recvmsg can modify msg_iov. */ - copy_iovec_hdr(vq->iov, vq->hdr, sock_hlen, in); + copy_iovec_hdr(vq->iov, nvq->hdr, sock_hlen, in); msg.msg_iovlen = in; err = sock->ops->recvmsg(NULL, sock, &msg, sock_len, MSG_DONTWAIT | MSG_TRUNC); @@ -494,7 +609,7 @@ static void handle_rx(struct vhost_net *net) continue; } if (unlikely(vhost_hlen) && - memcpy_toiovecend(vq->hdr, (unsigned char *)&hdr, 0, + memcpy_toiovecend(nvq->hdr, (unsigned char *)&hdr, 0, vhost_hlen)) { vq_err(vq, "Unable to write vnet_hdr at addr %p\n", vq->iov->iov_base); @@ -502,7 +617,7 @@ static void handle_rx(struct vhost_net *net) } /* TODO: Should check and handle checksum. */ if (likely(mergeable) && - memcpy_toiovecend(vq->hdr, (unsigned char *)&headcount, + memcpy_toiovecend(nvq->hdr, (unsigned char *)&headcount, offsetof(typeof(hdr), num_buffers), sizeof hdr.num_buffers)) { vq_err(vq, "Failed num_buffers write"); @@ -559,17 +674,34 @@ static int vhost_net_open(struct inode *inode, struct file *f) { struct vhost_net *n = kmalloc(sizeof *n, GFP_KERNEL); struct vhost_dev *dev; - int r; + struct vhost_virtqueue **vqs; + int r, i; if (!n) return -ENOMEM; + vqs = kmalloc(VHOST_NET_VQ_MAX * sizeof(*vqs), GFP_KERNEL); + if (!vqs) { + kfree(n); + return -ENOMEM; + } dev = &n->dev; - n->vqs[VHOST_NET_VQ_TX].handle_kick = handle_tx_kick; - n->vqs[VHOST_NET_VQ_RX].handle_kick = handle_rx_kick; - r = vhost_dev_init(dev, n->vqs, VHOST_NET_VQ_MAX); + vqs[VHOST_NET_VQ_TX] = &n->vqs[VHOST_NET_VQ_TX].vq; + vqs[VHOST_NET_VQ_RX] = &n->vqs[VHOST_NET_VQ_RX].vq; + n->vqs[VHOST_NET_VQ_TX].vq.handle_kick = handle_tx_kick; + n->vqs[VHOST_NET_VQ_RX].vq.handle_kick = handle_rx_kick; + for (i = 0; i < VHOST_NET_VQ_MAX; i++) { + n->vqs[i].ubufs = NULL; + n->vqs[i].ubuf_info = NULL; + n->vqs[i].upend_idx = 0; + n->vqs[i].done_idx = 0; + n->vqs[i].vhost_hlen = 0; + n->vqs[i].sock_hlen = 0; + } + r = vhost_dev_init(dev, vqs, VHOST_NET_VQ_MAX); if (r < 0) { kfree(n); + kfree(vqs); return r; } @@ -584,7 +716,9 @@ static int vhost_net_open(struct inode *inode, struct file *f) static void vhost_net_disable_vq(struct vhost_net *n, struct vhost_virtqueue *vq) { - struct vhost_poll *poll = n->poll + (vq - n->vqs); + struct vhost_net_virtqueue *nvq = + container_of(vq, struct vhost_net_virtqueue, vq); + struct vhost_poll *poll = n->poll + (nvq - n->vqs); if (!vq->private_data) return; vhost_poll_stop(poll); @@ -593,7 +727,9 @@ static void vhost_net_disable_vq(struct vhost_net *n, static int vhost_net_enable_vq(struct vhost_net *n, struct vhost_virtqueue *vq) { - struct vhost_poll *poll = n->poll + (vq - n->vqs); + struct vhost_net_virtqueue *nvq = + container_of(vq, struct vhost_net_virtqueue, vq); + struct vhost_poll *poll = n->poll + (nvq - n->vqs); struct socket *sock; sock = rcu_dereference_protected(vq->private_data, @@ -621,30 +757,30 @@ static struct socket *vhost_net_stop_vq(struct vhost_net *n, static void vhost_net_stop(struct vhost_net *n, struct socket **tx_sock, struct socket **rx_sock) { - *tx_sock = vhost_net_stop_vq(n, n->vqs + VHOST_NET_VQ_TX); - *rx_sock = vhost_net_stop_vq(n, n->vqs + VHOST_NET_VQ_RX); + *tx_sock = vhost_net_stop_vq(n, &n->vqs[VHOST_NET_VQ_TX].vq); + *rx_sock = vhost_net_stop_vq(n, &n->vqs[VHOST_NET_VQ_RX].vq); } static void vhost_net_flush_vq(struct vhost_net *n, int index) { vhost_poll_flush(n->poll + index); - vhost_poll_flush(&n->dev.vqs[index].poll); + vhost_poll_flush(&n->vqs[index].vq.poll); } static void vhost_net_flush(struct vhost_net *n) { vhost_net_flush_vq(n, VHOST_NET_VQ_TX); vhost_net_flush_vq(n, VHOST_NET_VQ_RX); - if (n->dev.vqs[VHOST_NET_VQ_TX].ubufs) { - mutex_lock(&n->dev.vqs[VHOST_NET_VQ_TX].mutex); + if (n->vqs[VHOST_NET_VQ_TX].ubufs) { + mutex_lock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex); n->tx_flush = true; - mutex_unlock(&n->dev.vqs[VHOST_NET_VQ_TX].mutex); + mutex_unlock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex); /* Wait for all lower device DMAs done. */ - vhost_ubuf_put_and_wait(n->dev.vqs[VHOST_NET_VQ_TX].ubufs); - mutex_lock(&n->dev.vqs[VHOST_NET_VQ_TX].mutex); + vhost_ubuf_put_and_wait(n->vqs[VHOST_NET_VQ_TX].ubufs); + mutex_lock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex); n->tx_flush = false; - kref_init(&n->dev.vqs[VHOST_NET_VQ_TX].ubufs->kref); - mutex_unlock(&n->dev.vqs[VHOST_NET_VQ_TX].mutex); + kref_init(&n->vqs[VHOST_NET_VQ_TX].ubufs->kref); + mutex_unlock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex); } } @@ -658,6 +794,7 @@ static int vhost_net_release(struct inode *inode, struct file *f) vhost_net_flush(n); vhost_dev_stop(&n->dev); vhost_dev_cleanup(&n->dev, false); + vhost_net_vq_reset(n); if (tx_sock) fput(tx_sock->file); if (rx_sock) @@ -665,6 +802,7 @@ static int vhost_net_release(struct inode *inode, struct file *f) /* We do an extra flush before freeing memory, * since jobs can re-queue themselves. */ vhost_net_flush(n); + kfree(n->dev.vqs); kfree(n); return 0; } @@ -738,6 +876,7 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd) { struct socket *sock, *oldsock; struct vhost_virtqueue *vq; + struct vhost_net_virtqueue *nvq; struct vhost_ubuf_ref *ubufs, *oldubufs = NULL; int r; @@ -750,7 +889,8 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd) r = -ENOBUFS; goto err; } - vq = n->vqs + index; + vq = &n->vqs[index].vq; + nvq = &n->vqs[index]; mutex_lock(&vq->mutex); /* Verify that ring has been setup correctly. */ @@ -783,8 +923,8 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd) if (r) goto err_used; - oldubufs = vq->ubufs; - vq->ubufs = ubufs; + oldubufs = nvq->ubufs; + nvq->ubufs = ubufs; n->tx_packets = 0; n->tx_zcopy_err = 0; @@ -827,14 +967,21 @@ static long vhost_net_reset_owner(struct vhost_net *n) struct socket *tx_sock = NULL; struct socket *rx_sock = NULL; long err; + struct vhost_memory *memory; mutex_lock(&n->dev.mutex); err = vhost_dev_check_owner(&n->dev); if (err) goto done; + memory = vhost_dev_reset_owner_prepare(); + if (!memory) { + err = -ENOMEM; + goto done; + } vhost_net_stop(n, &tx_sock, &rx_sock); vhost_net_flush(n); - err = vhost_dev_reset_owner(&n->dev); + vhost_dev_reset_owner(&n->dev, memory); + vhost_net_vq_reset(n); done: mutex_unlock(&n->dev.mutex); if (tx_sock) @@ -870,10 +1017,10 @@ static int vhost_net_set_features(struct vhost_net *n, u64 features) n->dev.acked_features = features; smp_wmb(); for (i = 0; i < VHOST_NET_VQ_MAX; ++i) { - mutex_lock(&n->vqs[i].mutex); + mutex_lock(&n->vqs[i].vq.mutex); n->vqs[i].vhost_hlen = vhost_hlen; n->vqs[i].sock_hlen = sock_hlen; - mutex_unlock(&n->vqs[i].mutex); + mutex_unlock(&n->vqs[i].vq.mutex); } vhost_net_flush(n); mutex_unlock(&n->dev.mutex); @@ -910,11 +1057,17 @@ static long vhost_net_ioctl(struct file *f, unsigned int ioctl, return vhost_net_reset_owner(n); default: mutex_lock(&n->dev.mutex); + if (ioctl == VHOST_SET_OWNER) { + r = vhost_net_set_ubuf_info(n); + if (r) + goto out; + } r = vhost_dev_ioctl(&n->dev, ioctl, argp); if (r == -ENOIOCTLCMD) r = vhost_vring_ioctl(&n->dev, ioctl, argp); else vhost_net_flush(n); +out: mutex_unlock(&n->dev.mutex); return r; } diff --git a/drivers/vhost/tcm_vhost.c b/drivers/vhost/scsi.c index 1677238d281f..5179f7aa1b0b 100644 --- a/drivers/vhost/tcm_vhost.c +++ b/drivers/vhost/scsi.c @@ -45,14 +45,116 @@ #include <target/target_core_configfs.h> #include <target/configfs_macros.h> #include <linux/vhost.h> -#include <linux/virtio_net.h> /* TODO vhost.h currently depends on this */ #include <linux/virtio_scsi.h> #include <linux/llist.h> #include <linux/bitmap.h> #include "vhost.c" #include "vhost.h" -#include "tcm_vhost.h" + +#define TCM_VHOST_VERSION "v0.1" +#define TCM_VHOST_NAMELEN 256 +#define TCM_VHOST_MAX_CDB_SIZE 32 + +struct vhost_scsi_inflight { + /* Wait for the flush operation to finish */ + struct completion comp; + /* Refcount for the inflight reqs */ + struct kref kref; +}; + +struct tcm_vhost_cmd { + /* Descriptor from vhost_get_vq_desc() for virt_queue segment */ + int tvc_vq_desc; + /* virtio-scsi initiator task attribute */ + int tvc_task_attr; + /* virtio-scsi initiator data direction */ + enum dma_data_direction tvc_data_direction; + /* Expected data transfer length from virtio-scsi header */ + u32 tvc_exp_data_len; + /* The Tag from include/linux/virtio_scsi.h:struct virtio_scsi_cmd_req */ + u64 tvc_tag; + /* The number of scatterlists associated with this cmd */ + u32 tvc_sgl_count; + /* Saved unpacked SCSI LUN for tcm_vhost_submission_work() */ + u32 tvc_lun; + /* Pointer to the SGL formatted memory from virtio-scsi */ + struct scatterlist *tvc_sgl; + /* Pointer to response */ + struct virtio_scsi_cmd_resp __user *tvc_resp; + /* Pointer to vhost_scsi for our device */ + struct vhost_scsi *tvc_vhost; + /* Pointer to vhost_virtqueue for the cmd */ + struct vhost_virtqueue *tvc_vq; + /* Pointer to vhost nexus memory */ + struct tcm_vhost_nexus *tvc_nexus; + /* The TCM I/O descriptor that is accessed via container_of() */ + struct se_cmd tvc_se_cmd; + /* work item used for cmwq dispatch to tcm_vhost_submission_work() */ + struct work_struct work; + /* Copy of the incoming SCSI command descriptor block (CDB) */ + unsigned char tvc_cdb[TCM_VHOST_MAX_CDB_SIZE]; + /* Sense buffer that will be mapped into outgoing status */ + unsigned char tvc_sense_buf[TRANSPORT_SENSE_BUFFER]; + /* Completed commands list, serviced from vhost worker thread */ + struct llist_node tvc_completion_list; + /* Used to track inflight cmd */ + struct vhost_scsi_inflight *inflight; +}; + +struct tcm_vhost_nexus { + /* Pointer to TCM session for I_T Nexus */ + struct se_session *tvn_se_sess; +}; + +struct tcm_vhost_nacl { + /* Binary World Wide unique Port Name for Vhost Initiator port */ + u64 iport_wwpn; + /* ASCII formatted WWPN for Sas Initiator port */ + char iport_name[TCM_VHOST_NAMELEN]; + /* Returned by tcm_vhost_make_nodeacl() */ + struct se_node_acl se_node_acl; +}; + +struct vhost_scsi; +struct tcm_vhost_tpg { + /* Vhost port target portal group tag for TCM */ + u16 tport_tpgt; + /* Used to track number of TPG Port/Lun Links wrt to explict I_T Nexus shutdown */ + int tv_tpg_port_count; + /* Used for vhost_scsi device reference to tpg_nexus, protected by tv_tpg_mutex */ + int tv_tpg_vhost_count; + /* list for tcm_vhost_list */ + struct list_head tv_tpg_list; + /* Used to protect access for tpg_nexus */ + struct mutex tv_tpg_mutex; + /* Pointer to the TCM VHost I_T Nexus for this TPG endpoint */ + struct tcm_vhost_nexus *tpg_nexus; + /* Pointer back to tcm_vhost_tport */ + struct tcm_vhost_tport *tport; + /* Returned by tcm_vhost_make_tpg() */ + struct se_portal_group se_tpg; + /* Pointer back to vhost_scsi, protected by tv_tpg_mutex */ + struct vhost_scsi *vhost_scsi; +}; + +struct tcm_vhost_tport { + /* SCSI protocol the tport is providing */ + u8 tport_proto_id; + /* Binary World Wide unique Port Name for Vhost Target port */ + u64 tport_wwpn; + /* ASCII formatted WWPN for Vhost Target port */ + char tport_name[TCM_VHOST_NAMELEN]; + /* Returned by tcm_vhost_make_tport() */ + struct se_wwn tport_wwn; +}; + +struct tcm_vhost_evt { + /* event to be sent to guest */ + struct virtio_scsi_event event; + /* event list, serviced from vhost worker thread */ + struct llist_node list; +}; enum { VHOST_SCSI_VQ_CTL = 0, @@ -74,13 +176,28 @@ enum { #define VHOST_SCSI_MAX_VQ 128 #define VHOST_SCSI_MAX_EVENT 128 +struct vhost_scsi_virtqueue { + struct vhost_virtqueue vq; + /* + * Reference counting for inflight reqs, used for flush operation. At + * each time, one reference tracks new commands submitted, while we + * wait for another one to reach 0. + */ + struct vhost_scsi_inflight inflights[2]; + /* + * Indicate current inflight in use, protected by vq->mutex. + * Writers must also take dev mutex and flush under it. + */ + int inflight_idx; +}; + struct vhost_scsi { /* Protected by vhost_scsi->dev.mutex */ struct tcm_vhost_tpg **vs_tpg; char vs_vhost_wwpn[TRANSPORT_IQN_LEN]; struct vhost_dev dev; - struct vhost_virtqueue vqs[VHOST_SCSI_MAX_VQ]; + struct vhost_scsi_virtqueue vqs[VHOST_SCSI_MAX_VQ]; struct vhost_work vs_completion_work; /* cmd completion work item */ struct llist_head vs_completion_list; /* cmd completion queue */ @@ -107,6 +224,59 @@ static int iov_num_pages(struct iovec *iov) ((unsigned long)iov->iov_base & PAGE_MASK)) >> PAGE_SHIFT; } +void tcm_vhost_done_inflight(struct kref *kref) +{ + struct vhost_scsi_inflight *inflight; + + inflight = container_of(kref, struct vhost_scsi_inflight, kref); + complete(&inflight->comp); +} + +static void tcm_vhost_init_inflight(struct vhost_scsi *vs, + struct vhost_scsi_inflight *old_inflight[]) +{ + struct vhost_scsi_inflight *new_inflight; + struct vhost_virtqueue *vq; + int idx, i; + + for (i = 0; i < VHOST_SCSI_MAX_VQ; i++) { + vq = &vs->vqs[i].vq; + + mutex_lock(&vq->mutex); + + /* store old infight */ + idx = vs->vqs[i].inflight_idx; + if (old_inflight) + old_inflight[i] = &vs->vqs[i].inflights[idx]; + + /* setup new infight */ + vs->vqs[i].inflight_idx = idx ^ 1; + new_inflight = &vs->vqs[i].inflights[idx ^ 1]; + kref_init(&new_inflight->kref); + init_completion(&new_inflight->comp); + + mutex_unlock(&vq->mutex); + } +} + +static struct vhost_scsi_inflight * +tcm_vhost_get_inflight(struct vhost_virtqueue *vq) +{ + struct vhost_scsi_inflight *inflight; + struct vhost_scsi_virtqueue *svq; + + svq = container_of(vq, struct vhost_scsi_virtqueue, vq); + inflight = &svq->inflights[svq->inflight_idx]; + kref_get(&inflight->kref); + + return inflight; +} + +static void tcm_vhost_put_inflight(struct vhost_scsi_inflight *inflight) +{ + kref_put(&inflight->kref, tcm_vhost_done_inflight); +} + static int tcm_vhost_check_true(struct se_portal_group *se_tpg) { return 1; @@ -366,7 +536,7 @@ static void tcm_vhost_free_evt(struct vhost_scsi *vs, struct tcm_vhost_evt *evt) static struct tcm_vhost_evt *tcm_vhost_allocate_evt(struct vhost_scsi *vs, u32 event, u32 reason) { - struct vhost_virtqueue *vq = &vs->vqs[VHOST_SCSI_VQ_EVT]; + struct vhost_virtqueue *vq = &vs->vqs[VHOST_SCSI_VQ_EVT].vq; struct tcm_vhost_evt *evt; if (vs->vs_events_nr > VHOST_SCSI_MAX_EVENT) { @@ -403,13 +573,15 @@ static void vhost_scsi_free_cmd(struct tcm_vhost_cmd *tv_cmd) kfree(tv_cmd->tvc_sgl); } + tcm_vhost_put_inflight(tv_cmd->inflight); + kfree(tv_cmd); } static void tcm_vhost_do_evt_work(struct vhost_scsi *vs, struct tcm_vhost_evt *evt) { - struct vhost_virtqueue *vq = &vs->vqs[VHOST_SCSI_VQ_EVT]; + struct vhost_virtqueue *vq = &vs->vqs[VHOST_SCSI_VQ_EVT].vq; struct virtio_scsi_event *event = &evt->event; struct virtio_scsi_event __user *eventp; unsigned out, in; @@ -460,7 +632,7 @@ static void tcm_vhost_evt_work(struct vhost_work *work) { struct vhost_scsi *vs = container_of(work, struct vhost_scsi, vs_event_work); - struct vhost_virtqueue *vq = &vs->vqs[VHOST_SCSI_VQ_EVT]; + struct vhost_virtqueue *vq = &vs->vqs[VHOST_SCSI_VQ_EVT].vq; struct tcm_vhost_evt *evt; struct llist_node *llnode; @@ -511,8 +683,10 @@ static void vhost_scsi_complete_cmd_work(struct vhost_work *work) v_rsp.sense_len); ret = copy_to_user(tv_cmd->tvc_resp, &v_rsp, sizeof(v_rsp)); if (likely(ret == 0)) { + struct vhost_scsi_virtqueue *q; vhost_add_used(tv_cmd->tvc_vq, tv_cmd->tvc_vq_desc, 0); - vq = tv_cmd->tvc_vq - vs->vqs; + q = container_of(tv_cmd->tvc_vq, struct vhost_scsi_virtqueue, vq); + vq = q - vs->vqs; __set_bit(vq, signal); } else pr_err("Faulted on virtio_scsi_cmd_resp\n"); @@ -523,10 +697,11 @@ static void vhost_scsi_complete_cmd_work(struct vhost_work *work) vq = -1; while ((vq = find_next_bit(signal, VHOST_SCSI_MAX_VQ, vq + 1)) < VHOST_SCSI_MAX_VQ) - vhost_signal(&vs->dev, &vs->vqs[vq]); + vhost_signal(&vs->dev, &vs->vqs[vq].vq); } static struct tcm_vhost_cmd *vhost_scsi_allocate_cmd( + struct vhost_virtqueue *vq, struct tcm_vhost_tpg *tv_tpg, struct virtio_scsi_cmd_req *v_req, u32 exp_data_len, @@ -551,6 +726,7 @@ static struct tcm_vhost_cmd *vhost_scsi_allocate_cmd( tv_cmd->tvc_exp_data_len = exp_data_len; tv_cmd->tvc_data_direction = data_direction; tv_cmd->tvc_nexus = tv_nexus; + tv_cmd->inflight = tcm_vhost_get_inflight(vq); return tv_cmd; } @@ -806,7 +982,7 @@ static void vhost_scsi_handle_vq(struct vhost_scsi *vs, for (i = 0; i < data_num; i++) exp_data_len += vq->iov[data_first + i].iov_len; - tv_cmd = vhost_scsi_allocate_cmd(tv_tpg, &v_req, + tv_cmd = vhost_scsi_allocate_cmd(vq, tv_tpg, &v_req, exp_data_len, data_direction); if (IS_ERR(tv_cmd)) { vq_err(vq, "vhost_scsi_allocate_cmd failed %ld\n", @@ -938,17 +1114,35 @@ static void vhost_scsi_handle_kick(struct vhost_work *work) static void vhost_scsi_flush_vq(struct vhost_scsi *vs, int index) { - vhost_poll_flush(&vs->dev.vqs[index].poll); + vhost_poll_flush(&vs->vqs[index].vq.poll); } +/* Callers must hold dev mutex */ static void vhost_scsi_flush(struct vhost_scsi *vs) { + struct vhost_scsi_inflight *old_inflight[VHOST_SCSI_MAX_VQ]; int i; + /* Init new inflight and remember the old inflight */ + tcm_vhost_init_inflight(vs, old_inflight); + + /* + * The inflight->kref was initialized to 1. We decrement it here to + * indicate the start of the flush operation so that it will reach 0 + * when all the reqs are finished. + */ + for (i = 0; i < VHOST_SCSI_MAX_VQ; i++) + kref_put(&old_inflight[i]->kref, tcm_vhost_done_inflight); + + /* Flush both the vhost poll and vhost work */ for (i = 0; i < VHOST_SCSI_MAX_VQ; i++) vhost_scsi_flush_vq(vs, i); vhost_work_flush(&vs->dev, &vs->vs_completion_work); vhost_work_flush(&vs->dev, &vs->vs_event_work); + + /* Wait for all reqs issued before the flush to be finished */ + for (i = 0; i < VHOST_SCSI_MAX_VQ; i++) + wait_for_completion(&old_inflight[i]->comp); } /* @@ -975,7 +1169,7 @@ static int vhost_scsi_set_endpoint( /* Verify that ring has been setup correctly. */ for (index = 0; index < vs->dev.nvqs; ++index) { /* Verify that ring has been setup correctly. */ - if (!vhost_vq_access_ok(&vs->vqs[index])) { + if (!vhost_vq_access_ok(&vs->vqs[index].vq)) { ret = -EFAULT; goto out; } @@ -1022,7 +1216,7 @@ static int vhost_scsi_set_endpoint( memcpy(vs->vs_vhost_wwpn, t->vhost_wwpn, sizeof(vs->vs_vhost_wwpn)); for (i = 0; i < VHOST_SCSI_MAX_VQ; i++) { - vq = &vs->vqs[i]; + vq = &vs->vqs[i].vq; /* Flushing the vhost_work acts as synchronize_rcu */ mutex_lock(&vq->mutex); rcu_assign_pointer(vq->private_data, vs_tpg); @@ -1063,7 +1257,7 @@ static int vhost_scsi_clear_endpoint( mutex_lock(&vs->dev.mutex); /* Verify that ring has been setup correctly. */ for (index = 0; index < vs->dev.nvqs; ++index) { - if (!vhost_vq_access_ok(&vs->vqs[index])) { + if (!vhost_vq_access_ok(&vs->vqs[index].vq)) { ret = -EFAULT; goto err_dev; } @@ -1103,7 +1297,7 @@ static int vhost_scsi_clear_endpoint( } if (match) { for (i = 0; i < VHOST_SCSI_MAX_VQ; i++) { - vq = &vs->vqs[i]; + vq = &vs->vqs[i].vq; /* Flushing the vhost_work acts as synchronize_rcu */ mutex_lock(&vq->mutex); rcu_assign_pointer(vq->private_data, NULL); @@ -1151,24 +1345,39 @@ static int vhost_scsi_set_features(struct vhost_scsi *vs, u64 features) static int vhost_scsi_open(struct inode *inode, struct file *f) { struct vhost_scsi *s; + struct vhost_virtqueue **vqs; int r, i; s = kzalloc(sizeof(*s), GFP_KERNEL); if (!s) return -ENOMEM; + vqs = kmalloc(VHOST_SCSI_MAX_VQ * sizeof(*vqs), GFP_KERNEL); + if (!vqs) { + kfree(s); + return -ENOMEM; + } + vhost_work_init(&s->vs_completion_work, vhost_scsi_complete_cmd_work); vhost_work_init(&s->vs_event_work, tcm_vhost_evt_work); s->vs_events_nr = 0; s->vs_events_missed = false; - s->vqs[VHOST_SCSI_VQ_CTL].handle_kick = vhost_scsi_ctl_handle_kick; - s->vqs[VHOST_SCSI_VQ_EVT].handle_kick = vhost_scsi_evt_handle_kick; - for (i = VHOST_SCSI_VQ_IO; i < VHOST_SCSI_MAX_VQ; i++) - s->vqs[i].handle_kick = vhost_scsi_handle_kick; - r = vhost_dev_init(&s->dev, s->vqs, VHOST_SCSI_MAX_VQ); + vqs[VHOST_SCSI_VQ_CTL] = &s->vqs[VHOST_SCSI_VQ_CTL].vq; + vqs[VHOST_SCSI_VQ_EVT] = &s->vqs[VHOST_SCSI_VQ_EVT].vq; + s->vqs[VHOST_SCSI_VQ_CTL].vq.handle_kick = vhost_scsi_ctl_handle_kick; + s->vqs[VHOST_SCSI_VQ_EVT].vq.handle_kick = vhost_scsi_evt_handle_kick; + for (i = VHOST_SCSI_VQ_IO; i < VHOST_SCSI_MAX_VQ; i++) { + vqs[i] = &s->vqs[i].vq; + s->vqs[i].vq.handle_kick = vhost_scsi_handle_kick; + } + r = vhost_dev_init(&s->dev, vqs, VHOST_SCSI_MAX_VQ); + + tcm_vhost_init_inflight(s, NULL); + if (r < 0) { + kfree(vqs); kfree(s); return r; } @@ -1190,6 +1399,7 @@ static int vhost_scsi_release(struct inode *inode, struct file *f) vhost_dev_cleanup(&s->dev, false); /* Jobs can re-queue themselves in evt kick handler. Do extra flush. */ vhost_scsi_flush(s); + kfree(s->dev.vqs); kfree(s); return 0; } @@ -1205,7 +1415,7 @@ static long vhost_scsi_ioctl(struct file *f, unsigned int ioctl, u32 events_missed; u64 features; int r, abi_version = VHOST_SCSI_ABI_VERSION; - struct vhost_virtqueue *vq = &vs->vqs[VHOST_SCSI_VQ_EVT]; + struct vhost_virtqueue *vq = &vs->vqs[VHOST_SCSI_VQ_EVT].vq; switch (ioctl) { case VHOST_SCSI_SET_ENDPOINT: @@ -1333,7 +1543,7 @@ static void tcm_vhost_do_plug(struct tcm_vhost_tpg *tpg, else reason = VIRTIO_SCSI_EVT_RESET_REMOVED; - vq = &vs->vqs[VHOST_SCSI_VQ_EVT]; + vq = &vs->vqs[VHOST_SCSI_VQ_EVT].vq; mutex_lock(&vq->mutex); tcm_vhost_send_evt(vs, tpg, lun, VIRTIO_SCSI_T_TRANSPORT_RESET, reason); @@ -1926,7 +2136,8 @@ static void tcm_vhost_exit(void) destroy_workqueue(tcm_vhost_workqueue); }; -MODULE_DESCRIPTION("TCM_VHOST series fabric driver"); +MODULE_DESCRIPTION("VHOST_SCSI series fabric driver"); +MODULE_ALIAS("tcm_vhost"); MODULE_LICENSE("GPL"); module_init(tcm_vhost_init); module_exit(tcm_vhost_exit); diff --git a/drivers/vhost/tcm_vhost.h b/drivers/vhost/tcm_vhost.h deleted file mode 100644 index 514b9fda230e..000000000000 --- a/drivers/vhost/tcm_vhost.h +++ /dev/null @@ -1,128 +0,0 @@ -#define TCM_VHOST_VERSION "v0.1" -#define TCM_VHOST_NAMELEN 256 -#define TCM_VHOST_MAX_CDB_SIZE 32 - -struct tcm_vhost_cmd { - /* Descriptor from vhost_get_vq_desc() for virt_queue segment */ - int tvc_vq_desc; - /* virtio-scsi initiator task attribute */ - int tvc_task_attr; - /* virtio-scsi initiator data direction */ - enum dma_data_direction tvc_data_direction; - /* Expected data transfer length from virtio-scsi header */ - u32 tvc_exp_data_len; - /* The Tag from include/linux/virtio_scsi.h:struct virtio_scsi_cmd_req */ - u64 tvc_tag; - /* The number of scatterlists associated with this cmd */ - u32 tvc_sgl_count; - /* Saved unpacked SCSI LUN for tcm_vhost_submission_work() */ - u32 tvc_lun; - /* Pointer to the SGL formatted memory from virtio-scsi */ - struct scatterlist *tvc_sgl; - /* Pointer to response */ - struct virtio_scsi_cmd_resp __user *tvc_resp; - /* Pointer to vhost_scsi for our device */ - struct vhost_scsi *tvc_vhost; - /* Pointer to vhost_virtqueue for the cmd */ - struct vhost_virtqueue *tvc_vq; - /* Pointer to vhost nexus memory */ - struct tcm_vhost_nexus *tvc_nexus; - /* The TCM I/O descriptor that is accessed via container_of() */ - struct se_cmd tvc_se_cmd; - /* work item used for cmwq dispatch to tcm_vhost_submission_work() */ - struct work_struct work; - /* Copy of the incoming SCSI command descriptor block (CDB) */ - unsigned char tvc_cdb[TCM_VHOST_MAX_CDB_SIZE]; - /* Sense buffer that will be mapped into outgoing status */ - unsigned char tvc_sense_buf[TRANSPORT_SENSE_BUFFER]; - /* Completed commands list, serviced from vhost worker thread */ - struct llist_node tvc_completion_list; -}; - -struct tcm_vhost_nexus { - /* Pointer to TCM session for I_T Nexus */ - struct se_session *tvn_se_sess; -}; - -struct tcm_vhost_nacl { - /* Binary World Wide unique Port Name for Vhost Initiator port */ - u64 iport_wwpn; - /* ASCII formatted WWPN for Sas Initiator port */ - char iport_name[TCM_VHOST_NAMELEN]; - /* Returned by tcm_vhost_make_nodeacl() */ - struct se_node_acl se_node_acl; -}; - -struct vhost_scsi; -struct tcm_vhost_tpg { - /* Vhost port target portal group tag for TCM */ - u16 tport_tpgt; - /* Used to track number of TPG Port/Lun Links wrt to explict I_T Nexus shutdown */ - int tv_tpg_port_count; - /* Used for vhost_scsi device reference to tpg_nexus, protected by tv_tpg_mutex */ - int tv_tpg_vhost_count; - /* list for tcm_vhost_list */ - struct list_head tv_tpg_list; - /* Used to protect access for tpg_nexus */ - struct mutex tv_tpg_mutex; - /* Pointer to the TCM VHost I_T Nexus for this TPG endpoint */ - struct tcm_vhost_nexus *tpg_nexus; - /* Pointer back to tcm_vhost_tport */ - struct tcm_vhost_tport *tport; - /* Returned by tcm_vhost_make_tpg() */ - struct se_portal_group se_tpg; - /* Pointer back to vhost_scsi, protected by tv_tpg_mutex */ - struct vhost_scsi *vhost_scsi; -}; - -struct tcm_vhost_tport { - /* SCSI protocol the tport is providing */ - u8 tport_proto_id; - /* Binary World Wide unique Port Name for Vhost Target port */ - u64 tport_wwpn; - /* ASCII formatted WWPN for Vhost Target port */ - char tport_name[TCM_VHOST_NAMELEN]; - /* Returned by tcm_vhost_make_tport() */ - struct se_wwn tport_wwn; -}; - -struct tcm_vhost_evt { - /* event to be sent to guest */ - struct virtio_scsi_event event; - /* event list, serviced from vhost worker thread */ - struct llist_node list; -}; - -/* - * As per request from MST, keep TCM_VHOST related ioctl defines out of - * linux/vhost.h (user-space) for now.. - */ - -#include <linux/vhost.h> - -/* - * Used by QEMU userspace to ensure a consistent vhost-scsi ABI. - * - * ABI Rev 0: July 2012 version starting point for v3.6-rc merge candidate + - * RFC-v2 vhost-scsi userspace. Add GET_ABI_VERSION ioctl usage - * ABI Rev 1: January 2013. Ignore vhost_tpgt filed in struct vhost_scsi_target. - * All the targets under vhost_wwpn can be seen and used by guset. - */ - -#define VHOST_SCSI_ABI_VERSION 1 - -struct vhost_scsi_target { - int abi_version; - char vhost_wwpn[TRANSPORT_IQN_LEN]; - unsigned short vhost_tpgt; - unsigned short reserved; -}; - -/* VHOST_SCSI specific defines */ -#define VHOST_SCSI_SET_ENDPOINT _IOW(VHOST_VIRTIO, 0x40, struct vhost_scsi_target) -#define VHOST_SCSI_CLEAR_ENDPOINT _IOW(VHOST_VIRTIO, 0x41, struct vhost_scsi_target) -/* Changing this breaks userspace. */ -#define VHOST_SCSI_GET_ABI_VERSION _IOW(VHOST_VIRTIO, 0x42, int) -/* Set and get the events missed flag */ -#define VHOST_SCSI_SET_EVENTS_MISSED _IOW(VHOST_VIRTIO, 0x43, __u32) -#define VHOST_SCSI_GET_EVENTS_MISSED _IOW(VHOST_VIRTIO, 0x44, __u32) diff --git a/drivers/vhost/test.c b/drivers/vhost/test.c index 91d6f060aade..1ee45bc85f67 100644 --- a/drivers/vhost/test.c +++ b/drivers/vhost/test.c @@ -219,13 +219,20 @@ static long vhost_test_reset_owner(struct vhost_test *n) { void *priv = NULL; long err; + struct vhost_memory *memory; + mutex_lock(&n->dev.mutex); err = vhost_dev_check_owner(&n->dev); if (err) goto done; + memory = vhost_dev_reset_owner_prepare(); + if (!memory) { + err = -ENOMEM; + goto done; + } vhost_test_stop(n, &priv); vhost_test_flush(n); - err = vhost_dev_reset_owner(&n->dev); + vhost_dev_reset_owner(&n->dev, memory); done: mutex_unlock(&n->dev.mutex); return err; @@ -275,7 +282,9 @@ static long vhost_test_ioctl(struct file *f, unsigned int ioctl, return vhost_test_reset_owner(n); default: mutex_lock(&n->dev.mutex); - r = vhost_dev_ioctl(&n->dev, ioctl, arg); + r = vhost_dev_ioctl(&n->dev, ioctl, argp); + if (r == -ENOIOCTLCMD) + r = vhost_vring_ioctl(&n->dev, ioctl, argp); vhost_test_flush(n); mutex_unlock(&n->dev.mutex); return r; diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 4eecdb867d53..749b5ab5bfbb 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -33,8 +33,6 @@ enum { VHOST_MEMORY_F_LOG = 0x1, }; -static unsigned vhost_zcopy_mask __read_mostly; - #define vhost_used_event(vq) ((u16 __user *)&vq->avail->ring[vq->num]) #define vhost_avail_event(vq) ((u16 __user *)&vq->used->ring[vq->num]) @@ -181,8 +179,6 @@ static void vhost_vq_reset(struct vhost_dev *dev, vq->used_flags = 0; vq->log_used = false; vq->log_addr = -1ull; - vq->vhost_hlen = 0; - vq->sock_hlen = 0; vq->private_data = NULL; vq->log_base = NULL; vq->error_ctx = NULL; @@ -191,9 +187,6 @@ static void vhost_vq_reset(struct vhost_dev *dev, vq->call_ctx = NULL; vq->call = NULL; vq->log_ctx = NULL; - vq->upend_idx = 0; - vq->done_idx = 0; - vq->ubufs = NULL; } static int vhost_worker(void *data) @@ -253,43 +246,29 @@ static void vhost_vq_free_iovecs(struct vhost_virtqueue *vq) vq->log = NULL; kfree(vq->heads); vq->heads = NULL; - kfree(vq->ubuf_info); - vq->ubuf_info = NULL; -} - -void vhost_enable_zcopy(int vq) -{ - vhost_zcopy_mask |= 0x1 << vq; } /* Helper to allocate iovec buffers for all vqs. */ static long vhost_dev_alloc_iovecs(struct vhost_dev *dev) { int i; - bool zcopy; for (i = 0; i < dev->nvqs; ++i) { - dev->vqs[i].indirect = kmalloc(sizeof *dev->vqs[i].indirect * + dev->vqs[i]->indirect = kmalloc(sizeof *dev->vqs[i]->indirect * UIO_MAXIOV, GFP_KERNEL); - dev->vqs[i].log = kmalloc(sizeof *dev->vqs[i].log * UIO_MAXIOV, + dev->vqs[i]->log = kmalloc(sizeof *dev->vqs[i]->log * UIO_MAXIOV, GFP_KERNEL); - dev->vqs[i].heads = kmalloc(sizeof *dev->vqs[i].heads * + dev->vqs[i]->heads = kmalloc(sizeof *dev->vqs[i]->heads * UIO_MAXIOV, GFP_KERNEL); - zcopy = vhost_zcopy_mask & (0x1 << i); - if (zcopy) - dev->vqs[i].ubuf_info = - kmalloc(sizeof *dev->vqs[i].ubuf_info * - UIO_MAXIOV, GFP_KERNEL); - if (!dev->vqs[i].indirect || !dev->vqs[i].log || - !dev->vqs[i].heads || - (zcopy && !dev->vqs[i].ubuf_info)) + if (!dev->vqs[i]->indirect || !dev->vqs[i]->log || + !dev->vqs[i]->heads) goto err_nomem; } return 0; err_nomem: for (; i >= 0; --i) - vhost_vq_free_iovecs(&dev->vqs[i]); + vhost_vq_free_iovecs(dev->vqs[i]); return -ENOMEM; } @@ -298,11 +277,11 @@ static void vhost_dev_free_iovecs(struct vhost_dev *dev) int i; for (i = 0; i < dev->nvqs; ++i) - vhost_vq_free_iovecs(&dev->vqs[i]); + vhost_vq_free_iovecs(dev->vqs[i]); } long vhost_dev_init(struct vhost_dev *dev, - struct vhost_virtqueue *vqs, int nvqs) + struct vhost_virtqueue **vqs, int nvqs) { int i; @@ -318,16 +297,15 @@ long vhost_dev_init(struct vhost_dev *dev, dev->worker = NULL; for (i = 0; i < dev->nvqs; ++i) { - dev->vqs[i].log = NULL; - dev->vqs[i].indirect = NULL; - dev->vqs[i].heads = NULL; - dev->vqs[i].ubuf_info = NULL; - dev->vqs[i].dev = dev; - mutex_init(&dev->vqs[i].mutex); - vhost_vq_reset(dev, dev->vqs + i); - if (dev->vqs[i].handle_kick) - vhost_poll_init(&dev->vqs[i].poll, - dev->vqs[i].handle_kick, POLLIN, dev); + dev->vqs[i]->log = NULL; + dev->vqs[i]->indirect = NULL; + dev->vqs[i]->heads = NULL; + dev->vqs[i]->dev = dev; + mutex_init(&dev->vqs[i]->mutex); + vhost_vq_reset(dev, dev->vqs[i]); + if (dev->vqs[i]->handle_kick) + vhost_poll_init(&dev->vqs[i]->poll, + dev->vqs[i]->handle_kick, POLLIN, dev); } return 0; @@ -408,21 +386,19 @@ err_mm: return err; } -/* Caller should have device mutex */ -long vhost_dev_reset_owner(struct vhost_dev *dev) +struct vhost_memory *vhost_dev_reset_owner_prepare(void) { - struct vhost_memory *memory; - - /* Restore memory to default empty mapping. */ - memory = kmalloc(offsetof(struct vhost_memory, regions), GFP_KERNEL); - if (!memory) - return -ENOMEM; + return kmalloc(offsetof(struct vhost_memory, regions), GFP_KERNEL); +} +/* Caller should have device mutex */ +void vhost_dev_reset_owner(struct vhost_dev *dev, struct vhost_memory *memory) +{ vhost_dev_cleanup(dev, true); + /* Restore memory to default empty mapping. */ memory->nregions = 0; RCU_INIT_POINTER(dev->memory, memory); - return 0; } void vhost_dev_stop(struct vhost_dev *dev) @@ -430,9 +406,9 @@ void vhost_dev_stop(struct vhost_dev *dev) int i; for (i = 0; i < dev->nvqs; ++i) { - if (dev->vqs[i].kick && dev->vqs[i].handle_kick) { - vhost_poll_stop(&dev->vqs[i].poll); - vhost_poll_flush(&dev->vqs[i].poll); + if (dev->vqs[i]->kick && dev->vqs[i]->handle_kick) { + vhost_poll_stop(&dev->vqs[i]->poll); + vhost_poll_flush(&dev->vqs[i]->poll); } } } @@ -443,17 +419,17 @@ void vhost_dev_cleanup(struct vhost_dev *dev, bool locked) int i; for (i = 0; i < dev->nvqs; ++i) { - if (dev->vqs[i].error_ctx) - eventfd_ctx_put(dev->vqs[i].error_ctx); - if (dev->vqs[i].error) - fput(dev->vqs[i].error); - if (dev->vqs[i].kick) - fput(dev->vqs[i].kick); - if (dev->vqs[i].call_ctx) - eventfd_ctx_put(dev->vqs[i].call_ctx); - if (dev->vqs[i].call) - fput(dev->vqs[i].call); - vhost_vq_reset(dev, dev->vqs + i); + if (dev->vqs[i]->error_ctx) + eventfd_ctx_put(dev->vqs[i]->error_ctx); + if (dev->vqs[i]->error) + fput(dev->vqs[i]->error); + if (dev->vqs[i]->kick) + fput(dev->vqs[i]->kick); + if (dev->vqs[i]->call_ctx) + eventfd_ctx_put(dev->vqs[i]->call_ctx); + if (dev->vqs[i]->call) + fput(dev->vqs[i]->call); + vhost_vq_reset(dev, dev->vqs[i]); } vhost_dev_free_iovecs(dev); if (dev->log_ctx) @@ -524,14 +500,14 @@ static int memory_access_ok(struct vhost_dev *d, struct vhost_memory *mem, for (i = 0; i < d->nvqs; ++i) { int ok; - mutex_lock(&d->vqs[i].mutex); + mutex_lock(&d->vqs[i]->mutex); /* If ring is inactive, will check when it's enabled. */ - if (d->vqs[i].private_data) - ok = vq_memory_access_ok(d->vqs[i].log_base, mem, + if (d->vqs[i]->private_data) + ok = vq_memory_access_ok(d->vqs[i]->log_base, mem, log_all); else ok = 1; - mutex_unlock(&d->vqs[i].mutex); + mutex_unlock(&d->vqs[i]->mutex); if (!ok) return 0; } @@ -641,7 +617,7 @@ long vhost_vring_ioctl(struct vhost_dev *d, int ioctl, void __user *argp) if (idx >= d->nvqs) return -ENOBUFS; - vq = d->vqs + idx; + vq = d->vqs[idx]; mutex_lock(&vq->mutex); @@ -852,7 +828,7 @@ long vhost_dev_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp) for (i = 0; i < d->nvqs; ++i) { struct vhost_virtqueue *vq; void __user *base = (void __user *)(unsigned long)p; - vq = d->vqs + i; + vq = d->vqs[i]; mutex_lock(&vq->mutex); /* If ring is inactive, will check when it's enabled. */ if (vq->private_data && !vq_log_access_ok(d, vq, base)) @@ -879,9 +855,9 @@ long vhost_dev_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp) } else filep = eventfp; for (i = 0; i < d->nvqs; ++i) { - mutex_lock(&d->vqs[i].mutex); - d->vqs[i].log_ctx = d->log_ctx; - mutex_unlock(&d->vqs[i].mutex); + mutex_lock(&d->vqs[i]->mutex); + d->vqs[i]->log_ctx = d->log_ctx; + mutex_unlock(&d->vqs[i]->mutex); } if (ctx) eventfd_ctx_put(ctx); @@ -1551,38 +1527,3 @@ void vhost_disable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq) &vq->used->flags, r); } } - -static void vhost_zerocopy_done_signal(struct kref *kref) -{ - struct vhost_ubuf_ref *ubufs = container_of(kref, struct vhost_ubuf_ref, - kref); - wake_up(&ubufs->wait); -} - -struct vhost_ubuf_ref *vhost_ubuf_alloc(struct vhost_virtqueue *vq, - bool zcopy) -{ - struct vhost_ubuf_ref *ubufs; - /* No zero copy backend? Nothing to count. */ - if (!zcopy) - return NULL; - ubufs = kmalloc(sizeof *ubufs, GFP_KERNEL); - if (!ubufs) - return ERR_PTR(-ENOMEM); - kref_init(&ubufs->kref); - init_waitqueue_head(&ubufs->wait); - ubufs->vq = vq; - return ubufs; -} - -void vhost_ubuf_put(struct vhost_ubuf_ref *ubufs) -{ - kref_put(&ubufs->kref, vhost_zerocopy_done_signal); -} - -void vhost_ubuf_put_and_wait(struct vhost_ubuf_ref *ubufs) -{ - kref_put(&ubufs->kref, vhost_zerocopy_done_signal); - wait_event(ubufs->wait, !atomic_read(&ubufs->kref.refcount)); - kfree(ubufs); -} diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h index 17261e277c02..b58f4ae82cb8 100644 --- a/drivers/vhost/vhost.h +++ b/drivers/vhost/vhost.h @@ -54,18 +54,6 @@ struct vhost_log { struct vhost_virtqueue; -struct vhost_ubuf_ref { - struct kref kref; - wait_queue_head_t wait; - struct vhost_virtqueue *vq; -}; - -struct vhost_ubuf_ref *vhost_ubuf_alloc(struct vhost_virtqueue *, bool zcopy); -void vhost_ubuf_put(struct vhost_ubuf_ref *); -void vhost_ubuf_put_and_wait(struct vhost_ubuf_ref *); - -struct ubuf_info; - /* The virtqueue structure describes a queue attached to a device. */ struct vhost_virtqueue { struct vhost_dev *dev; @@ -114,10 +102,7 @@ struct vhost_virtqueue { /* hdr is used to store the virtio header. * Since each iovec has >= 1 byte length, we never need more than * header length entries to store the header. */ - struct iovec hdr[sizeof(struct virtio_net_hdr_mrg_rxbuf)]; struct iovec *indirect; - size_t vhost_hlen; - size_t sock_hlen; struct vring_used_elem *heads; /* We use a kind of RCU to access private pointer. * All readers access it from worker, which makes it possible to @@ -130,16 +115,6 @@ struct vhost_virtqueue { /* Log write descriptors */ void __user *log_base; struct vhost_log *log; - /* vhost zerocopy support fields below: */ - /* last used idx for outstanding DMA zerocopy buffers */ - int upend_idx; - /* first used idx for DMA done zerocopy buffers */ - int done_idx; - /* an array of userspace buffers info */ - struct ubuf_info *ubuf_info; - /* Reference counting for outstanding ubufs. - * Protected by vq mutex. Writers must also take device mutex. */ - struct vhost_ubuf_ref *ubufs; }; struct vhost_dev { @@ -150,7 +125,7 @@ struct vhost_dev { struct mm_struct *mm; struct mutex mutex; unsigned acked_features; - struct vhost_virtqueue *vqs; + struct vhost_virtqueue **vqs; int nvqs; struct file *log_file; struct eventfd_ctx *log_ctx; @@ -159,9 +134,10 @@ struct vhost_dev { struct task_struct *worker; }; -long vhost_dev_init(struct vhost_dev *, struct vhost_virtqueue *vqs, int nvqs); +long vhost_dev_init(struct vhost_dev *, struct vhost_virtqueue **vqs, int nvqs); long vhost_dev_check_owner(struct vhost_dev *); -long vhost_dev_reset_owner(struct vhost_dev *); +struct vhost_memory *vhost_dev_reset_owner_prepare(void); +void vhost_dev_reset_owner(struct vhost_dev *, struct vhost_memory *); void vhost_dev_cleanup(struct vhost_dev *, bool locked); void vhost_dev_stop(struct vhost_dev *); long vhost_dev_ioctl(struct vhost_dev *, unsigned int ioctl, void __user *argp); diff --git a/drivers/vhost/vringh.c b/drivers/vhost/vringh.c new file mode 100644 index 000000000000..bff0775e258c --- /dev/null +++ b/drivers/vhost/vringh.c @@ -0,0 +1,1007 @@ +/* + * Helpers for the host side of a virtio ring. + * + * Since these may be in userspace, we use (inline) accessors. + */ +#include <linux/vringh.h> +#include <linux/virtio_ring.h> +#include <linux/kernel.h> +#include <linux/ratelimit.h> +#include <linux/uaccess.h> +#include <linux/slab.h> +#include <linux/export.h> + +static __printf(1,2) __cold void vringh_bad(const char *fmt, ...) +{ + static DEFINE_RATELIMIT_STATE(vringh_rs, + DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + if (__ratelimit(&vringh_rs)) { + va_list ap; + va_start(ap, fmt); + printk(KERN_NOTICE "vringh:"); + vprintk(fmt, ap); + va_end(ap); + } +} + +/* Returns vring->num if empty, -ve on error. */ +static inline int __vringh_get_head(const struct vringh *vrh, + int (*getu16)(u16 *val, const u16 *p), + u16 *last_avail_idx) +{ + u16 avail_idx, i, head; + int err; + + err = getu16(&avail_idx, &vrh->vring.avail->idx); + if (err) { + vringh_bad("Failed to access avail idx at %p", + &vrh->vring.avail->idx); + return err; + } + + if (*last_avail_idx == avail_idx) + return vrh->vring.num; + + /* Only get avail ring entries after they have been exposed by guest. */ + virtio_rmb(vrh->weak_barriers); + + i = *last_avail_idx & (vrh->vring.num - 1); + + err = getu16(&head, &vrh->vring.avail->ring[i]); + if (err) { + vringh_bad("Failed to read head: idx %d address %p", + *last_avail_idx, &vrh->vring.avail->ring[i]); + return err; + } + + if (head >= vrh->vring.num) { + vringh_bad("Guest says index %u > %u is available", + head, vrh->vring.num); + return -EINVAL; + } + + (*last_avail_idx)++; + return head; +} + +/* Copy some bytes to/from the iovec. Returns num copied. */ +static inline ssize_t vringh_iov_xfer(struct vringh_kiov *iov, + void *ptr, size_t len, + int (*xfer)(void *addr, void *ptr, + size_t len)) +{ + int err, done = 0; + + while (len && iov->i < iov->used) { + size_t partlen; + + partlen = min(iov->iov[iov->i].iov_len, len); + err = xfer(iov->iov[iov->i].iov_base, ptr, partlen); + if (err) + return err; + done += partlen; + len -= partlen; + ptr += partlen; + iov->consumed += partlen; + iov->iov[iov->i].iov_len -= partlen; + iov->iov[iov->i].iov_base += partlen; + + if (!iov->iov[iov->i].iov_len) { + /* Fix up old iov element then increment. */ + iov->iov[iov->i].iov_len = iov->consumed; + iov->iov[iov->i].iov_base -= iov->consumed; + + iov->consumed = 0; + iov->i++; + } + } + return done; +} + +/* May reduce *len if range is shorter. */ +static inline bool range_check(struct vringh *vrh, u64 addr, size_t *len, + struct vringh_range *range, + bool (*getrange)(struct vringh *, + u64, struct vringh_range *)) +{ + if (addr < range->start || addr > range->end_incl) { + if (!getrange(vrh, addr, range)) + return false; + } + BUG_ON(addr < range->start || addr > range->end_incl); + + /* To end of memory? */ + if (unlikely(addr + *len == 0)) { + if (range->end_incl == -1ULL) + return true; + goto truncate; + } + + /* Otherwise, don't wrap. */ + if (addr + *len < addr) { + vringh_bad("Wrapping descriptor %zu@0x%llx", + *len, (unsigned long long)addr); + return false; + } + + if (unlikely(addr + *len - 1 > range->end_incl)) + goto truncate; + return true; + +truncate: + *len = range->end_incl + 1 - addr; + return true; +} + +static inline bool no_range_check(struct vringh *vrh, u64 addr, size_t *len, + struct vringh_range *range, + bool (*getrange)(struct vringh *, + u64, struct vringh_range *)) +{ + return true; +} + +/* No reason for this code to be inline. */ +static int move_to_indirect(int *up_next, u16 *i, void *addr, + const struct vring_desc *desc, + struct vring_desc **descs, int *desc_max) +{ + /* Indirect tables can't have indirect. */ + if (*up_next != -1) { + vringh_bad("Multilevel indirect %u->%u", *up_next, *i); + return -EINVAL; + } + + if (unlikely(desc->len % sizeof(struct vring_desc))) { + vringh_bad("Strange indirect len %u", desc->len); + return -EINVAL; + } + + /* We will check this when we follow it! */ + if (desc->flags & VRING_DESC_F_NEXT) + *up_next = desc->next; + else + *up_next = -2; + *descs = addr; + *desc_max = desc->len / sizeof(struct vring_desc); + + /* Now, start at the first indirect. */ + *i = 0; + return 0; +} + +static int resize_iovec(struct vringh_kiov *iov, gfp_t gfp) +{ + struct kvec *new; + unsigned int flag, new_num = (iov->max_num & ~VRINGH_IOV_ALLOCATED) * 2; + + if (new_num < 8) + new_num = 8; + + flag = (iov->max_num & VRINGH_IOV_ALLOCATED); + if (flag) + new = krealloc(iov->iov, new_num * sizeof(struct iovec), gfp); + else { + new = kmalloc(new_num * sizeof(struct iovec), gfp); + if (new) { + memcpy(new, iov->iov, + iov->max_num * sizeof(struct iovec)); + flag = VRINGH_IOV_ALLOCATED; + } + } + if (!new) + return -ENOMEM; + iov->iov = new; + iov->max_num = (new_num | flag); + return 0; +} + +static u16 __cold return_from_indirect(const struct vringh *vrh, int *up_next, + struct vring_desc **descs, int *desc_max) +{ + u16 i = *up_next; + + *up_next = -1; + *descs = vrh->vring.desc; + *desc_max = vrh->vring.num; + return i; +} + +static int slow_copy(struct vringh *vrh, void *dst, const void *src, + bool (*rcheck)(struct vringh *vrh, u64 addr, size_t *len, + struct vringh_range *range, + bool (*getrange)(struct vringh *vrh, + u64, + struct vringh_range *)), + bool (*getrange)(struct vringh *vrh, + u64 addr, + struct vringh_range *r), + struct vringh_range *range, + int (*copy)(void *dst, const void *src, size_t len)) +{ + size_t part, len = sizeof(struct vring_desc); + + do { + u64 addr; + int err; + + part = len; + addr = (u64)(unsigned long)src - range->offset; + + if (!rcheck(vrh, addr, &part, range, getrange)) + return -EINVAL; + + err = copy(dst, src, part); + if (err) + return err; + + dst += part; + src += part; + len -= part; + } while (len); + return 0; +} + +static inline int +__vringh_iov(struct vringh *vrh, u16 i, + struct vringh_kiov *riov, + struct vringh_kiov *wiov, + bool (*rcheck)(struct vringh *vrh, u64 addr, size_t *len, + struct vringh_range *range, + bool (*getrange)(struct vringh *, u64, + struct vringh_range *)), + bool (*getrange)(struct vringh *, u64, struct vringh_range *), + gfp_t gfp, + int (*copy)(void *dst, const void *src, size_t len)) +{ + int err, count = 0, up_next, desc_max; + struct vring_desc desc, *descs; + struct vringh_range range = { -1ULL, 0 }, slowrange; + bool slow = false; + + /* We start traversing vring's descriptor table. */ + descs = vrh->vring.desc; + desc_max = vrh->vring.num; + up_next = -1; + + if (riov) + riov->i = riov->used = 0; + else if (wiov) + wiov->i = wiov->used = 0; + else + /* You must want something! */ + BUG(); + + for (;;) { + void *addr; + struct vringh_kiov *iov; + size_t len; + + if (unlikely(slow)) + err = slow_copy(vrh, &desc, &descs[i], rcheck, getrange, + &slowrange, copy); + else + err = copy(&desc, &descs[i], sizeof(desc)); + if (unlikely(err)) + goto fail; + + if (unlikely(desc.flags & VRING_DESC_F_INDIRECT)) { + /* Make sure it's OK, and get offset. */ + len = desc.len; + if (!rcheck(vrh, desc.addr, &len, &range, getrange)) { + err = -EINVAL; + goto fail; + } + + if (unlikely(len != desc.len)) { + slow = true; + /* We need to save this range to use offset */ + slowrange = range; + } + + addr = (void *)(long)(desc.addr + range.offset); + err = move_to_indirect(&up_next, &i, addr, &desc, + &descs, &desc_max); + if (err) + goto fail; + continue; + } + + if (count++ == vrh->vring.num) { + vringh_bad("Descriptor loop in %p", descs); + err = -ELOOP; + goto fail; + } + + if (desc.flags & VRING_DESC_F_WRITE) + iov = wiov; + else { + iov = riov; + if (unlikely(wiov && wiov->i)) { + vringh_bad("Readable desc %p after writable", + &descs[i]); + err = -EINVAL; + goto fail; + } + } + + if (!iov) { + vringh_bad("Unexpected %s desc", + !wiov ? "writable" : "readable"); + err = -EPROTO; + goto fail; + } + + again: + /* Make sure it's OK, and get offset. */ + len = desc.len; + if (!rcheck(vrh, desc.addr, &len, &range, getrange)) { + err = -EINVAL; + goto fail; + } + addr = (void *)(unsigned long)(desc.addr + range.offset); + + if (unlikely(iov->used == (iov->max_num & ~VRINGH_IOV_ALLOCATED))) { + err = resize_iovec(iov, gfp); + if (err) + goto fail; + } + + iov->iov[iov->used].iov_base = addr; + iov->iov[iov->used].iov_len = len; + iov->used++; + + if (unlikely(len != desc.len)) { + desc.len -= len; + desc.addr += len; + goto again; + } + + if (desc.flags & VRING_DESC_F_NEXT) { + i = desc.next; + } else { + /* Just in case we need to finish traversing above. */ + if (unlikely(up_next > 0)) { + i = return_from_indirect(vrh, &up_next, + &descs, &desc_max); + slow = false; + } else + break; + } + + if (i >= desc_max) { + vringh_bad("Chained index %u > %u", i, desc_max); + err = -EINVAL; + goto fail; + } + } + + return 0; + +fail: + return err; +} + +static inline int __vringh_complete(struct vringh *vrh, + const struct vring_used_elem *used, + unsigned int num_used, + int (*putu16)(u16 *p, u16 val), + int (*putused)(struct vring_used_elem *dst, + const struct vring_used_elem + *src, unsigned num)) +{ + struct vring_used *used_ring; + int err; + u16 used_idx, off; + + used_ring = vrh->vring.used; + used_idx = vrh->last_used_idx + vrh->completed; + + off = used_idx % vrh->vring.num; + + /* Compiler knows num_used == 1 sometimes, hence extra check */ + if (num_used > 1 && unlikely(off + num_used >= vrh->vring.num)) { + u16 part = vrh->vring.num - off; + err = putused(&used_ring->ring[off], used, part); + if (!err) + err = putused(&used_ring->ring[0], used + part, + num_used - part); + } else + err = putused(&used_ring->ring[off], used, num_used); + + if (err) { + vringh_bad("Failed to write %u used entries %u at %p", + num_used, off, &used_ring->ring[off]); + return err; + } + + /* Make sure buffer is written before we update index. */ + virtio_wmb(vrh->weak_barriers); + + err = putu16(&vrh->vring.used->idx, used_idx + num_used); + if (err) { + vringh_bad("Failed to update used index at %p", + &vrh->vring.used->idx); + return err; + } + + vrh->completed += num_used; + return 0; +} + + +static inline int __vringh_need_notify(struct vringh *vrh, + int (*getu16)(u16 *val, const u16 *p)) +{ + bool notify; + u16 used_event; + int err; + + /* Flush out used index update. This is paired with the + * barrier that the Guest executes when enabling + * interrupts. */ + virtio_mb(vrh->weak_barriers); + + /* Old-style, without event indices. */ + if (!vrh->event_indices) { + u16 flags; + err = getu16(&flags, &vrh->vring.avail->flags); + if (err) { + vringh_bad("Failed to get flags at %p", + &vrh->vring.avail->flags); + return err; + } + return (!(flags & VRING_AVAIL_F_NO_INTERRUPT)); + } + + /* Modern: we know when other side wants to know. */ + err = getu16(&used_event, &vring_used_event(&vrh->vring)); + if (err) { + vringh_bad("Failed to get used event idx at %p", + &vring_used_event(&vrh->vring)); + return err; + } + + /* Just in case we added so many that we wrap. */ + if (unlikely(vrh->completed > 0xffff)) + notify = true; + else + notify = vring_need_event(used_event, + vrh->last_used_idx + vrh->completed, + vrh->last_used_idx); + + vrh->last_used_idx += vrh->completed; + vrh->completed = 0; + return notify; +} + +static inline bool __vringh_notify_enable(struct vringh *vrh, + int (*getu16)(u16 *val, const u16 *p), + int (*putu16)(u16 *p, u16 val)) +{ + u16 avail; + + if (!vrh->event_indices) { + /* Old-school; update flags. */ + if (putu16(&vrh->vring.used->flags, 0) != 0) { + vringh_bad("Clearing used flags %p", + &vrh->vring.used->flags); + return true; + } + } else { + if (putu16(&vring_avail_event(&vrh->vring), + vrh->last_avail_idx) != 0) { + vringh_bad("Updating avail event index %p", + &vring_avail_event(&vrh->vring)); + return true; + } + } + + /* They could have slipped one in as we were doing that: make + * sure it's written, then check again. */ + virtio_mb(vrh->weak_barriers); + + if (getu16(&avail, &vrh->vring.avail->idx) != 0) { + vringh_bad("Failed to check avail idx at %p", + &vrh->vring.avail->idx); + return true; + } + + /* This is unlikely, so we just leave notifications enabled + * (if we're using event_indices, we'll only get one + * notification anyway). */ + return avail == vrh->last_avail_idx; +} + +static inline void __vringh_notify_disable(struct vringh *vrh, + int (*putu16)(u16 *p, u16 val)) +{ + if (!vrh->event_indices) { + /* Old-school; update flags. */ + if (putu16(&vrh->vring.used->flags, VRING_USED_F_NO_NOTIFY)) { + vringh_bad("Setting used flags %p", + &vrh->vring.used->flags); + } + } +} + +/* Userspace access helpers: in this case, addresses are really userspace. */ +static inline int getu16_user(u16 *val, const u16 *p) +{ + return get_user(*val, (__force u16 __user *)p); +} + +static inline int putu16_user(u16 *p, u16 val) +{ + return put_user(val, (__force u16 __user *)p); +} + +static inline int copydesc_user(void *dst, const void *src, size_t len) +{ + return copy_from_user(dst, (__force void __user *)src, len) ? + -EFAULT : 0; +} + +static inline int putused_user(struct vring_used_elem *dst, + const struct vring_used_elem *src, + unsigned int num) +{ + return copy_to_user((__force void __user *)dst, src, + sizeof(*dst) * num) ? -EFAULT : 0; +} + +static inline int xfer_from_user(void *src, void *dst, size_t len) +{ + return copy_from_user(dst, (__force void __user *)src, len) ? + -EFAULT : 0; +} + +static inline int xfer_to_user(void *dst, void *src, size_t len) +{ + return copy_to_user((__force void __user *)dst, src, len) ? + -EFAULT : 0; +} + +/** + * vringh_init_user - initialize a vringh for a userspace vring. + * @vrh: the vringh to initialize. + * @features: the feature bits for this ring. + * @num: the number of elements. + * @weak_barriers: true if we only need memory barriers, not I/O. + * @desc: the userpace descriptor pointer. + * @avail: the userpace avail pointer. + * @used: the userpace used pointer. + * + * Returns an error if num is invalid: you should check pointers + * yourself! + */ +int vringh_init_user(struct vringh *vrh, u32 features, + unsigned int num, bool weak_barriers, + struct vring_desc __user *desc, + struct vring_avail __user *avail, + struct vring_used __user *used) +{ + /* Sane power of 2 please! */ + if (!num || num > 0xffff || (num & (num - 1))) { + vringh_bad("Bad ring size %u", num); + return -EINVAL; + } + + vrh->event_indices = (features & (1 << VIRTIO_RING_F_EVENT_IDX)); + vrh->weak_barriers = weak_barriers; + vrh->completed = 0; + vrh->last_avail_idx = 0; + vrh->last_used_idx = 0; + vrh->vring.num = num; + /* vring expects kernel addresses, but only used via accessors. */ + vrh->vring.desc = (__force struct vring_desc *)desc; + vrh->vring.avail = (__force struct vring_avail *)avail; + vrh->vring.used = (__force struct vring_used *)used; + return 0; +} +EXPORT_SYMBOL(vringh_init_user); + +/** + * vringh_getdesc_user - get next available descriptor from userspace ring. + * @vrh: the userspace vring. + * @riov: where to put the readable descriptors (or NULL) + * @wiov: where to put the writable descriptors (or NULL) + * @getrange: function to call to check ranges. + * @head: head index we received, for passing to vringh_complete_user(). + * + * Returns 0 if there was no descriptor, 1 if there was, or -errno. + * + * Note that on error return, you can tell the difference between an + * invalid ring and a single invalid descriptor: in the former case, + * *head will be vrh->vring.num. You may be able to ignore an invalid + * descriptor, but there's not much you can do with an invalid ring. + * + * Note that you may need to clean up riov and wiov, even on error! + */ +int vringh_getdesc_user(struct vringh *vrh, + struct vringh_iov *riov, + struct vringh_iov *wiov, + bool (*getrange)(struct vringh *vrh, + u64 addr, struct vringh_range *r), + u16 *head) +{ + int err; + + *head = vrh->vring.num; + err = __vringh_get_head(vrh, getu16_user, &vrh->last_avail_idx); + if (err < 0) + return err; + + /* Empty... */ + if (err == vrh->vring.num) + return 0; + + /* We need the layouts to be the identical for this to work */ + BUILD_BUG_ON(sizeof(struct vringh_kiov) != sizeof(struct vringh_iov)); + BUILD_BUG_ON(offsetof(struct vringh_kiov, iov) != + offsetof(struct vringh_iov, iov)); + BUILD_BUG_ON(offsetof(struct vringh_kiov, i) != + offsetof(struct vringh_iov, i)); + BUILD_BUG_ON(offsetof(struct vringh_kiov, used) != + offsetof(struct vringh_iov, used)); + BUILD_BUG_ON(offsetof(struct vringh_kiov, max_num) != + offsetof(struct vringh_iov, max_num)); + BUILD_BUG_ON(sizeof(struct iovec) != sizeof(struct kvec)); + BUILD_BUG_ON(offsetof(struct iovec, iov_base) != + offsetof(struct kvec, iov_base)); + BUILD_BUG_ON(offsetof(struct iovec, iov_len) != + offsetof(struct kvec, iov_len)); + BUILD_BUG_ON(sizeof(((struct iovec *)NULL)->iov_base) + != sizeof(((struct kvec *)NULL)->iov_base)); + BUILD_BUG_ON(sizeof(((struct iovec *)NULL)->iov_len) + != sizeof(((struct kvec *)NULL)->iov_len)); + + *head = err; + err = __vringh_iov(vrh, *head, (struct vringh_kiov *)riov, + (struct vringh_kiov *)wiov, + range_check, getrange, GFP_KERNEL, copydesc_user); + if (err) + return err; + + return 1; +} +EXPORT_SYMBOL(vringh_getdesc_user); + +/** + * vringh_iov_pull_user - copy bytes from vring_iov. + * @riov: the riov as passed to vringh_getdesc_user() (updated as we consume) + * @dst: the place to copy. + * @len: the maximum length to copy. + * + * Returns the bytes copied <= len or a negative errno. + */ +ssize_t vringh_iov_pull_user(struct vringh_iov *riov, void *dst, size_t len) +{ + return vringh_iov_xfer((struct vringh_kiov *)riov, + dst, len, xfer_from_user); +} +EXPORT_SYMBOL(vringh_iov_pull_user); + +/** + * vringh_iov_push_user - copy bytes into vring_iov. + * @wiov: the wiov as passed to vringh_getdesc_user() (updated as we consume) + * @dst: the place to copy. + * @len: the maximum length to copy. + * + * Returns the bytes copied <= len or a negative errno. + */ +ssize_t vringh_iov_push_user(struct vringh_iov *wiov, + const void *src, size_t len) +{ + return vringh_iov_xfer((struct vringh_kiov *)wiov, + (void *)src, len, xfer_to_user); +} +EXPORT_SYMBOL(vringh_iov_push_user); + +/** + * vringh_abandon_user - we've decided not to handle the descriptor(s). + * @vrh: the vring. + * @num: the number of descriptors to put back (ie. num + * vringh_get_user() to undo). + * + * The next vringh_get_user() will return the old descriptor(s) again. + */ +void vringh_abandon_user(struct vringh *vrh, unsigned int num) +{ + /* We only update vring_avail_event(vr) when we want to be notified, + * so we haven't changed that yet. */ + vrh->last_avail_idx -= num; +} +EXPORT_SYMBOL(vringh_abandon_user); + +/** + * vringh_complete_user - we've finished with descriptor, publish it. + * @vrh: the vring. + * @head: the head as filled in by vringh_getdesc_user. + * @len: the length of data we have written. + * + * You should check vringh_need_notify_user() after one or more calls + * to this function. + */ +int vringh_complete_user(struct vringh *vrh, u16 head, u32 len) +{ + struct vring_used_elem used; + + used.id = head; + used.len = len; + return __vringh_complete(vrh, &used, 1, putu16_user, putused_user); +} +EXPORT_SYMBOL(vringh_complete_user); + +/** + * vringh_complete_multi_user - we've finished with many descriptors. + * @vrh: the vring. + * @used: the head, length pairs. + * @num_used: the number of used elements. + * + * You should check vringh_need_notify_user() after one or more calls + * to this function. + */ +int vringh_complete_multi_user(struct vringh *vrh, + const struct vring_used_elem used[], + unsigned num_used) +{ + return __vringh_complete(vrh, used, num_used, + putu16_user, putused_user); +} +EXPORT_SYMBOL(vringh_complete_multi_user); + +/** + * vringh_notify_enable_user - we want to know if something changes. + * @vrh: the vring. + * + * This always enables notifications, but returns false if there are + * now more buffers available in the vring. + */ +bool vringh_notify_enable_user(struct vringh *vrh) +{ + return __vringh_notify_enable(vrh, getu16_user, putu16_user); +} +EXPORT_SYMBOL(vringh_notify_enable_user); + +/** + * vringh_notify_disable_user - don't tell us if something changes. + * @vrh: the vring. + * + * This is our normal running state: we disable and then only enable when + * we're going to sleep. + */ +void vringh_notify_disable_user(struct vringh *vrh) +{ + __vringh_notify_disable(vrh, putu16_user); +} +EXPORT_SYMBOL(vringh_notify_disable_user); + +/** + * vringh_need_notify_user - must we tell the other side about used buffers? + * @vrh: the vring we've called vringh_complete_user() on. + * + * Returns -errno or 0 if we don't need to tell the other side, 1 if we do. + */ +int vringh_need_notify_user(struct vringh *vrh) +{ + return __vringh_need_notify(vrh, getu16_user); +} +EXPORT_SYMBOL(vringh_need_notify_user); + +/* Kernelspace access helpers. */ +static inline int getu16_kern(u16 *val, const u16 *p) +{ + *val = ACCESS_ONCE(*p); + return 0; +} + +static inline int putu16_kern(u16 *p, u16 val) +{ + ACCESS_ONCE(*p) = val; + return 0; +} + +static inline int copydesc_kern(void *dst, const void *src, size_t len) +{ + memcpy(dst, src, len); + return 0; +} + +static inline int putused_kern(struct vring_used_elem *dst, + const struct vring_used_elem *src, + unsigned int num) +{ + memcpy(dst, src, num * sizeof(*dst)); + return 0; +} + +static inline int xfer_kern(void *src, void *dst, size_t len) +{ + memcpy(dst, src, len); + return 0; +} + +/** + * vringh_init_kern - initialize a vringh for a kernelspace vring. + * @vrh: the vringh to initialize. + * @features: the feature bits for this ring. + * @num: the number of elements. + * @weak_barriers: true if we only need memory barriers, not I/O. + * @desc: the userpace descriptor pointer. + * @avail: the userpace avail pointer. + * @used: the userpace used pointer. + * + * Returns an error if num is invalid. + */ +int vringh_init_kern(struct vringh *vrh, u32 features, + unsigned int num, bool weak_barriers, + struct vring_desc *desc, + struct vring_avail *avail, + struct vring_used *used) +{ + /* Sane power of 2 please! */ + if (!num || num > 0xffff || (num & (num - 1))) { + vringh_bad("Bad ring size %u", num); + return -EINVAL; + } + + vrh->event_indices = (features & (1 << VIRTIO_RING_F_EVENT_IDX)); + vrh->weak_barriers = weak_barriers; + vrh->completed = 0; + vrh->last_avail_idx = 0; + vrh->last_used_idx = 0; + vrh->vring.num = num; + vrh->vring.desc = desc; + vrh->vring.avail = avail; + vrh->vring.used = used; + return 0; +} +EXPORT_SYMBOL(vringh_init_kern); + +/** + * vringh_getdesc_kern - get next available descriptor from kernelspace ring. + * @vrh: the kernelspace vring. + * @riov: where to put the readable descriptors (or NULL) + * @wiov: where to put the writable descriptors (or NULL) + * @head: head index we received, for passing to vringh_complete_kern(). + * @gfp: flags for allocating larger riov/wiov. + * + * Returns 0 if there was no descriptor, 1 if there was, or -errno. + * + * Note that on error return, you can tell the difference between an + * invalid ring and a single invalid descriptor: in the former case, + * *head will be vrh->vring.num. You may be able to ignore an invalid + * descriptor, but there's not much you can do with an invalid ring. + * + * Note that you may need to clean up riov and wiov, even on error! + */ +int vringh_getdesc_kern(struct vringh *vrh, + struct vringh_kiov *riov, + struct vringh_kiov *wiov, + u16 *head, + gfp_t gfp) +{ + int err; + + err = __vringh_get_head(vrh, getu16_kern, &vrh->last_avail_idx); + if (err < 0) + return err; + + /* Empty... */ + if (err == vrh->vring.num) + return 0; + + *head = err; + err = __vringh_iov(vrh, *head, riov, wiov, no_range_check, NULL, + gfp, copydesc_kern); + if (err) + return err; + + return 1; +} +EXPORT_SYMBOL(vringh_getdesc_kern); + +/** + * vringh_iov_pull_kern - copy bytes from vring_iov. + * @riov: the riov as passed to vringh_getdesc_kern() (updated as we consume) + * @dst: the place to copy. + * @len: the maximum length to copy. + * + * Returns the bytes copied <= len or a negative errno. + */ +ssize_t vringh_iov_pull_kern(struct vringh_kiov *riov, void *dst, size_t len) +{ + return vringh_iov_xfer(riov, dst, len, xfer_kern); +} +EXPORT_SYMBOL(vringh_iov_pull_kern); + +/** + * vringh_iov_push_kern - copy bytes into vring_iov. + * @wiov: the wiov as passed to vringh_getdesc_kern() (updated as we consume) + * @dst: the place to copy. + * @len: the maximum length to copy. + * + * Returns the bytes copied <= len or a negative errno. + */ +ssize_t vringh_iov_push_kern(struct vringh_kiov *wiov, + const void *src, size_t len) +{ + return vringh_iov_xfer(wiov, (void *)src, len, xfer_kern); +} +EXPORT_SYMBOL(vringh_iov_push_kern); + +/** + * vringh_abandon_kern - we've decided not to handle the descriptor(s). + * @vrh: the vring. + * @num: the number of descriptors to put back (ie. num + * vringh_get_kern() to undo). + * + * The next vringh_get_kern() will return the old descriptor(s) again. + */ +void vringh_abandon_kern(struct vringh *vrh, unsigned int num) +{ + /* We only update vring_avail_event(vr) when we want to be notified, + * so we haven't changed that yet. */ + vrh->last_avail_idx -= num; +} +EXPORT_SYMBOL(vringh_abandon_kern); + +/** + * vringh_complete_kern - we've finished with descriptor, publish it. + * @vrh: the vring. + * @head: the head as filled in by vringh_getdesc_kern. + * @len: the length of data we have written. + * + * You should check vringh_need_notify_kern() after one or more calls + * to this function. + */ +int vringh_complete_kern(struct vringh *vrh, u16 head, u32 len) +{ + struct vring_used_elem used; + + used.id = head; + used.len = len; + + return __vringh_complete(vrh, &used, 1, putu16_kern, putused_kern); +} +EXPORT_SYMBOL(vringh_complete_kern); + +/** + * vringh_notify_enable_kern - we want to know if something changes. + * @vrh: the vring. + * + * This always enables notifications, but returns false if there are + * now more buffers available in the vring. + */ +bool vringh_notify_enable_kern(struct vringh *vrh) +{ + return __vringh_notify_enable(vrh, getu16_kern, putu16_kern); +} +EXPORT_SYMBOL(vringh_notify_enable_kern); + +/** + * vringh_notify_disable_kern - don't tell us if something changes. + * @vrh: the vring. + * + * This is our normal running state: we disable and then only enable when + * we're going to sleep. + */ +void vringh_notify_disable_kern(struct vringh *vrh) +{ + __vringh_notify_disable(vrh, putu16_kern); +} +EXPORT_SYMBOL(vringh_notify_disable_kern); + +/** + * vringh_need_notify_kern - must we tell the other side about used buffers? + * @vrh: the vring we've called vringh_complete_kern() on. + * + * Returns -errno or 0 if we don't need to tell the other side, 1 if we do. + */ +int vringh_need_notify_kern(struct vringh *vrh) +{ + return __vringh_need_notify(vrh, getu16_kern); +} +EXPORT_SYMBOL(vringh_need_notify_kern); diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index 8dab163c5ef0..bd3ae324a1a2 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -108,7 +108,7 @@ static void tell_host(struct virtio_balloon *vb, struct virtqueue *vq) sg_init_one(&sg, vb->pfns, sizeof(vb->pfns[0]) * vb->num_pfns); /* We should always be able to add one buffer to an empty queue. */ - if (virtqueue_add_buf(vq, &sg, 1, 0, vb, GFP_KERNEL) < 0) + if (virtqueue_add_outbuf(vq, &sg, 1, vb, GFP_KERNEL) < 0) BUG(); virtqueue_kick(vq); @@ -256,7 +256,7 @@ static void stats_handle_request(struct virtio_balloon *vb) if (!virtqueue_get_buf(vq, &len)) return; sg_init_one(&sg, vb->stats, sizeof(vb->stats)); - if (virtqueue_add_buf(vq, &sg, 1, 0, vb, GFP_KERNEL) < 0) + if (virtqueue_add_outbuf(vq, &sg, 1, vb, GFP_KERNEL) < 0) BUG(); virtqueue_kick(vq); } @@ -341,7 +341,7 @@ static int init_vqs(struct virtio_balloon *vb) * use it to signal us later. */ sg_init_one(&sg, vb->stats, sizeof vb->stats); - if (virtqueue_add_buf(vb->stats_vq, &sg, 1, 0, vb, GFP_KERNEL) + if (virtqueue_add_outbuf(vb->stats_vq, &sg, 1, vb, GFP_KERNEL) < 0) BUG(); virtqueue_kick(vb->stats_vq); diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index ffd7e7da5d3b..5217baf5528c 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -24,27 +24,6 @@ #include <linux/module.h> #include <linux/hrtimer.h> -/* virtio guest is communicating with a virtual "device" that actually runs on - * a host processor. Memory barriers are used to control SMP effects. */ -#ifdef CONFIG_SMP -/* Where possible, use SMP barriers which are more lightweight than mandatory - * barriers, because mandatory barriers control MMIO effects on accesses - * through relaxed memory I/O windows (which virtio-pci does not use). */ -#define virtio_mb(vq) \ - do { if ((vq)->weak_barriers) smp_mb(); else mb(); } while(0) -#define virtio_rmb(vq) \ - do { if ((vq)->weak_barriers) smp_rmb(); else rmb(); } while(0) -#define virtio_wmb(vq) \ - do { if ((vq)->weak_barriers) smp_wmb(); else wmb(); } while(0) -#else -/* We must force memory ordering even if guest is UP since host could be - * running on another CPU, but SMP barriers are defined to barrier() in that - * configuration. So fall back to mandatory barriers instead. */ -#define virtio_mb(vq) mb() -#define virtio_rmb(vq) rmb() -#define virtio_wmb(vq) wmb() -#endif - #ifdef DEBUG /* For development, we want to crash whenever the ring is screwed. */ #define BAD_RING(_vq, fmt, args...) \ @@ -119,16 +98,36 @@ struct vring_virtqueue #define to_vvq(_vq) container_of(_vq, struct vring_virtqueue, vq) +static inline struct scatterlist *sg_next_chained(struct scatterlist *sg, + unsigned int *count) +{ + return sg_next(sg); +} + +static inline struct scatterlist *sg_next_arr(struct scatterlist *sg, + unsigned int *count) +{ + if (--(*count) == 0) + return NULL; + return sg + 1; +} + /* Set up an indirect table of descriptors and add it to the queue. */ -static int vring_add_indirect(struct vring_virtqueue *vq, - struct scatterlist sg[], - unsigned int out, - unsigned int in, - gfp_t gfp) +static inline int vring_add_indirect(struct vring_virtqueue *vq, + struct scatterlist *sgs[], + struct scatterlist *(*next) + (struct scatterlist *, unsigned int *), + unsigned int total_sg, + unsigned int total_out, + unsigned int total_in, + unsigned int out_sgs, + unsigned int in_sgs, + gfp_t gfp) { struct vring_desc *desc; unsigned head; - int i; + struct scatterlist *sg; + int i, n; /* * We require lowmem mappings for the descriptors because @@ -137,25 +136,31 @@ static int vring_add_indirect(struct vring_virtqueue *vq, */ gfp &= ~(__GFP_HIGHMEM | __GFP_HIGH); - desc = kmalloc((out + in) * sizeof(struct vring_desc), gfp); + desc = kmalloc(total_sg * sizeof(struct vring_desc), gfp); if (!desc) return -ENOMEM; - /* Transfer entries from the sg list into the indirect page */ - for (i = 0; i < out; i++) { - desc[i].flags = VRING_DESC_F_NEXT; - desc[i].addr = sg_phys(sg); - desc[i].len = sg->length; - desc[i].next = i+1; - sg++; + /* Transfer entries from the sg lists into the indirect page */ + i = 0; + for (n = 0; n < out_sgs; n++) { + for (sg = sgs[n]; sg; sg = next(sg, &total_out)) { + desc[i].flags = VRING_DESC_F_NEXT; + desc[i].addr = sg_phys(sg); + desc[i].len = sg->length; + desc[i].next = i+1; + i++; + } } - for (; i < (out + in); i++) { - desc[i].flags = VRING_DESC_F_NEXT|VRING_DESC_F_WRITE; - desc[i].addr = sg_phys(sg); - desc[i].len = sg->length; - desc[i].next = i+1; - sg++; + for (; n < (out_sgs + in_sgs); n++) { + for (sg = sgs[n]; sg; sg = next(sg, &total_in)) { + desc[i].flags = VRING_DESC_F_NEXT|VRING_DESC_F_WRITE; + desc[i].addr = sg_phys(sg); + desc[i].len = sg->length; + desc[i].next = i+1; + i++; + } } + BUG_ON(i != total_sg); /* Last one doesn't continue. */ desc[i-1].flags &= ~VRING_DESC_F_NEXT; @@ -176,29 +181,20 @@ static int vring_add_indirect(struct vring_virtqueue *vq, return head; } -/** - * virtqueue_add_buf - expose buffer to other end - * @vq: the struct virtqueue we're talking about. - * @sg: the description of the buffer(s). - * @out_num: the number of sg readable by other side - * @in_num: the number of sg which are writable (after readable ones) - * @data: the token identifying the buffer. - * @gfp: how to do memory allocations (if necessary). - * - * Caller must ensure we don't call this with other virtqueue operations - * at the same time (except where noted). - * - * Returns zero or a negative error (ie. ENOSPC, ENOMEM). - */ -int virtqueue_add_buf(struct virtqueue *_vq, - struct scatterlist sg[], - unsigned int out, - unsigned int in, - void *data, - gfp_t gfp) +static inline int virtqueue_add(struct virtqueue *_vq, + struct scatterlist *sgs[], + struct scatterlist *(*next) + (struct scatterlist *, unsigned int *), + unsigned int total_out, + unsigned int total_in, + unsigned int out_sgs, + unsigned int in_sgs, + void *data, + gfp_t gfp) { struct vring_virtqueue *vq = to_vvq(_vq); - unsigned int i, avail, uninitialized_var(prev); + struct scatterlist *sg; + unsigned int i, n, avail, uninitialized_var(prev), total_sg; int head; START_USE(vq); @@ -218,46 +214,54 @@ int virtqueue_add_buf(struct virtqueue *_vq, } #endif + total_sg = total_in + total_out; + /* If the host supports indirect descriptor tables, and we have multiple * buffers, then go indirect. FIXME: tune this threshold */ - if (vq->indirect && (out + in) > 1 && vq->vq.num_free) { - head = vring_add_indirect(vq, sg, out, in, gfp); + if (vq->indirect && total_sg > 1 && vq->vq.num_free) { + head = vring_add_indirect(vq, sgs, next, total_sg, total_out, + total_in, + out_sgs, in_sgs, gfp); if (likely(head >= 0)) goto add_head; } - BUG_ON(out + in > vq->vring.num); - BUG_ON(out + in == 0); + BUG_ON(total_sg > vq->vring.num); + BUG_ON(total_sg == 0); - if (vq->vq.num_free < out + in) { + if (vq->vq.num_free < total_sg) { pr_debug("Can't add buf len %i - avail = %i\n", - out + in, vq->vq.num_free); + total_sg, vq->vq.num_free); /* FIXME: for historical reasons, we force a notify here if * there are outgoing parts to the buffer. Presumably the * host should service the ring ASAP. */ - if (out) + if (out_sgs) vq->notify(&vq->vq); END_USE(vq); return -ENOSPC; } /* We're about to use some buffers from the free list. */ - vq->vq.num_free -= out + in; - - head = vq->free_head; - for (i = vq->free_head; out; i = vq->vring.desc[i].next, out--) { - vq->vring.desc[i].flags = VRING_DESC_F_NEXT; - vq->vring.desc[i].addr = sg_phys(sg); - vq->vring.desc[i].len = sg->length; - prev = i; - sg++; + vq->vq.num_free -= total_sg; + + head = i = vq->free_head; + for (n = 0; n < out_sgs; n++) { + for (sg = sgs[n]; sg; sg = next(sg, &total_out)) { + vq->vring.desc[i].flags = VRING_DESC_F_NEXT; + vq->vring.desc[i].addr = sg_phys(sg); + vq->vring.desc[i].len = sg->length; + prev = i; + i = vq->vring.desc[i].next; + } } - for (; in; i = vq->vring.desc[i].next, in--) { - vq->vring.desc[i].flags = VRING_DESC_F_NEXT|VRING_DESC_F_WRITE; - vq->vring.desc[i].addr = sg_phys(sg); - vq->vring.desc[i].len = sg->length; - prev = i; - sg++; + for (; n < (out_sgs + in_sgs); n++) { + for (sg = sgs[n]; sg; sg = next(sg, &total_in)) { + vq->vring.desc[i].flags = VRING_DESC_F_NEXT|VRING_DESC_F_WRITE; + vq->vring.desc[i].addr = sg_phys(sg); + vq->vring.desc[i].len = sg->length; + prev = i; + i = vq->vring.desc[i].next; + } } /* Last one doesn't continue. */ vq->vring.desc[prev].flags &= ~VRING_DESC_F_NEXT; @@ -276,7 +280,7 @@ add_head: /* Descriptors and available array need to be set before we expose the * new available array entries. */ - virtio_wmb(vq); + virtio_wmb(vq->weak_barriers); vq->vring.avail->idx++; vq->num_added++; @@ -290,9 +294,122 @@ add_head: return 0; } + +/** + * virtqueue_add_buf - expose buffer to other end + * @vq: the struct virtqueue we're talking about. + * @sg: the description of the buffer(s). + * @out_num: the number of sg readable by other side + * @in_num: the number of sg which are writable (after readable ones) + * @data: the token identifying the buffer. + * @gfp: how to do memory allocations (if necessary). + * + * Caller must ensure we don't call this with other virtqueue operations + * at the same time (except where noted). + * + * Returns zero or a negative error (ie. ENOSPC, ENOMEM). + */ +int virtqueue_add_buf(struct virtqueue *_vq, + struct scatterlist sg[], + unsigned int out, + unsigned int in, + void *data, + gfp_t gfp) +{ + struct scatterlist *sgs[2]; + + sgs[0] = sg; + sgs[1] = sg + out; + + return virtqueue_add(_vq, sgs, sg_next_arr, + out, in, out ? 1 : 0, in ? 1 : 0, data, gfp); +} EXPORT_SYMBOL_GPL(virtqueue_add_buf); /** + * virtqueue_add_sgs - expose buffers to other end + * @vq: the struct virtqueue we're talking about. + * @sgs: array of terminated scatterlists. + * @out_num: the number of scatterlists readable by other side + * @in_num: the number of scatterlists which are writable (after readable ones) + * @data: the token identifying the buffer. + * @gfp: how to do memory allocations (if necessary). + * + * Caller must ensure we don't call this with other virtqueue operations + * at the same time (except where noted). + * + * Returns zero or a negative error (ie. ENOSPC, ENOMEM). + */ +int virtqueue_add_sgs(struct virtqueue *_vq, + struct scatterlist *sgs[], + unsigned int out_sgs, + unsigned int in_sgs, + void *data, + gfp_t gfp) +{ + unsigned int i, total_out, total_in; + + /* Count them first. */ + for (i = total_out = total_in = 0; i < out_sgs; i++) { + struct scatterlist *sg; + for (sg = sgs[i]; sg; sg = sg_next(sg)) + total_out++; + } + for (; i < out_sgs + in_sgs; i++) { + struct scatterlist *sg; + for (sg = sgs[i]; sg; sg = sg_next(sg)) + total_in++; + } + return virtqueue_add(_vq, sgs, sg_next_chained, + total_out, total_in, out_sgs, in_sgs, data, gfp); +} +EXPORT_SYMBOL_GPL(virtqueue_add_sgs); + +/** + * virtqueue_add_outbuf - expose output buffers to other end + * @vq: the struct virtqueue we're talking about. + * @sgs: array of scatterlists (need not be terminated!) + * @num: the number of scatterlists readable by other side + * @data: the token identifying the buffer. + * @gfp: how to do memory allocations (if necessary). + * + * Caller must ensure we don't call this with other virtqueue operations + * at the same time (except where noted). + * + * Returns zero or a negative error (ie. ENOSPC, ENOMEM). + */ +int virtqueue_add_outbuf(struct virtqueue *vq, + struct scatterlist sg[], unsigned int num, + void *data, + gfp_t gfp) +{ + return virtqueue_add(vq, &sg, sg_next_arr, num, 0, 1, 0, data, gfp); +} +EXPORT_SYMBOL_GPL(virtqueue_add_outbuf); + +/** + * virtqueue_add_inbuf - expose input buffers to other end + * @vq: the struct virtqueue we're talking about. + * @sgs: array of scatterlists (need not be terminated!) + * @num: the number of scatterlists writable by other side + * @data: the token identifying the buffer. + * @gfp: how to do memory allocations (if necessary). + * + * Caller must ensure we don't call this with other virtqueue operations + * at the same time (except where noted). + * + * Returns zero or a negative error (ie. ENOSPC, ENOMEM). + */ +int virtqueue_add_inbuf(struct virtqueue *vq, + struct scatterlist sg[], unsigned int num, + void *data, + gfp_t gfp) +{ + return virtqueue_add(vq, &sg, sg_next_arr, 0, num, 0, 1, data, gfp); +} +EXPORT_SYMBOL_GPL(virtqueue_add_inbuf); + +/** * virtqueue_kick_prepare - first half of split virtqueue_kick call. * @vq: the struct virtqueue * @@ -312,7 +429,7 @@ bool virtqueue_kick_prepare(struct virtqueue *_vq) START_USE(vq); /* We need to expose available array entries before checking avail * event. */ - virtio_mb(vq); + virtio_mb(vq->weak_barriers); old = vq->vring.avail->idx - vq->num_added; new = vq->vring.avail->idx; @@ -436,7 +553,7 @@ void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len) } /* Only get used array entries after they have been exposed by host. */ - virtio_rmb(vq); + virtio_rmb(vq->weak_barriers); last_used = (vq->last_used_idx & (vq->vring.num - 1)); i = vq->vring.used->ring[last_used].id; @@ -460,7 +577,7 @@ void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len) * the read in the next get_buf call. */ if (!(vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) { vring_used_event(&vq->vring) = vq->last_used_idx; - virtio_mb(vq); + virtio_mb(vq->weak_barriers); } #ifdef DEBUG @@ -513,7 +630,7 @@ bool virtqueue_enable_cb(struct virtqueue *_vq) * entry. Always do both to keep code simple. */ vq->vring.avail->flags &= ~VRING_AVAIL_F_NO_INTERRUPT; vring_used_event(&vq->vring) = vq->last_used_idx; - virtio_mb(vq); + virtio_mb(vq->weak_barriers); if (unlikely(more_used(vq))) { END_USE(vq); return false; @@ -553,7 +670,7 @@ bool virtqueue_enable_cb_delayed(struct virtqueue *_vq) /* TODO: tune this threshold */ bufs = (u16)(vq->vring.avail->idx - vq->last_used_idx) * 3 / 4; vring_used_event(&vq->vring) = vq->last_used_idx + bufs; - virtio_mb(vq); + virtio_mb(vq->weak_barriers); if (unlikely((u16)(vq->vring.used->idx - vq->last_used_idx) > bufs)) { END_USE(vq); return false; diff --git a/include/linux/i2c-mux.h b/include/linux/i2c-mux.h index 40cb05a97b46..b5f9a007a3ab 100644 --- a/include/linux/i2c-mux.h +++ b/include/linux/i2c-mux.h @@ -42,7 +42,7 @@ struct i2c_adapter *i2c_add_mux_adapter(struct i2c_adapter *parent, int (*deselect) (struct i2c_adapter *, void *mux_dev, u32 chan_id)); -int i2c_del_mux_adapter(struct i2c_adapter *adap); +void i2c_del_mux_adapter(struct i2c_adapter *adap); #endif /* __KERNEL__ */ diff --git a/include/linux/i2c-tegra.h b/include/linux/i2c-tegra.h deleted file mode 100644 index 9c85da49857a..000000000000 --- a/include/linux/i2c-tegra.h +++ /dev/null @@ -1,25 +0,0 @@ -/* - * drivers/i2c/busses/i2c-tegra.c - * - * Copyright (C) 2010 Google, Inc. - * Author: Colin Cross <ccross@android.com> - * - * This software is licensed under the terms of the GNU General Public - * License version 2, as published by the Free Software Foundation, and - * may be copied, distributed, and modified under those terms. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - */ - -#ifndef _LINUX_I2C_TEGRA_H -#define _LINUX_I2C_TEGRA_H - -struct tegra_i2c_platform_data { - unsigned long bus_clk_rate; -}; - -#endif /* _LINUX_I2C_TEGRA_H */ diff --git a/include/linux/i2c.h b/include/linux/i2c.h index d0c4db7b4872..e988fa935b3c 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -125,7 +125,6 @@ extern s32 i2c_smbus_write_i2c_block_data(const struct i2c_client *client, * struct i2c_driver - represent an I2C device driver * @class: What kind of i2c device we instantiate (for detect) * @attach_adapter: Callback for bus addition (deprecated) - * @detach_adapter: Callback for bus removal (deprecated) * @probe: Callback for device binding * @remove: Callback for device unbinding * @shutdown: Callback for device shutdown @@ -162,12 +161,10 @@ extern s32 i2c_smbus_write_i2c_block_data(const struct i2c_client *client, struct i2c_driver { unsigned int class; - /* Notifies the driver that a new bus has appeared or is about to be - * removed. You should avoid using this, it will be removed in a - * near future. + /* Notifies the driver that a new bus has appeared. You should avoid + * using this, it will be removed in a near future. */ int (*attach_adapter)(struct i2c_adapter *) __deprecated; - int (*detach_adapter)(struct i2c_adapter *) __deprecated; /* Standard driver model interfaces */ int (*probe)(struct i2c_client *, const struct i2c_device_id *); @@ -370,6 +367,45 @@ struct i2c_algorithm { u32 (*functionality) (struct i2c_adapter *); }; +/** + * struct i2c_bus_recovery_info - I2C bus recovery information + * @recover_bus: Recover routine. Either pass driver's recover_bus() routine, or + * i2c_generic_scl_recovery() or i2c_generic_gpio_recovery(). + * @get_scl: This gets current value of SCL line. Mandatory for generic SCL + * recovery. Used internally for generic GPIO recovery. + * @set_scl: This sets/clears SCL line. Mandatory for generic SCL recovery. Used + * internally for generic GPIO recovery. + * @get_sda: This gets current value of SDA line. Optional for generic SCL + * recovery. Used internally, if sda_gpio is a valid GPIO, for generic GPIO + * recovery. + * @prepare_recovery: This will be called before starting recovery. Platform may + * configure padmux here for SDA/SCL line or something else they want. + * @unprepare_recovery: This will be called after completing recovery. Platform + * may configure padmux here for SDA/SCL line or something else they want. + * @scl_gpio: gpio number of the SCL line. Only required for GPIO recovery. + * @sda_gpio: gpio number of the SDA line. Only required for GPIO recovery. + */ +struct i2c_bus_recovery_info { + int (*recover_bus)(struct i2c_adapter *); + + int (*get_scl)(struct i2c_adapter *); + void (*set_scl)(struct i2c_adapter *, int val); + int (*get_sda)(struct i2c_adapter *); + + void (*prepare_recovery)(struct i2c_bus_recovery_info *bri); + void (*unprepare_recovery)(struct i2c_bus_recovery_info *bri); + + /* gpio recovery */ + int scl_gpio; + int sda_gpio; +}; + +int i2c_recover_bus(struct i2c_adapter *adap); + +/* Generic recovery routines */ +int i2c_generic_gpio_recovery(struct i2c_adapter *adap); +int i2c_generic_scl_recovery(struct i2c_adapter *adap); + /* * i2c_adapter is the structure used to identify a physical i2c bus along * with the access algorithms necessary to access it. @@ -393,6 +429,8 @@ struct i2c_adapter { struct mutex userspace_clients_lock; struct list_head userspace_clients; + + struct i2c_bus_recovery_info *bus_recovery_info; }; #define to_i2c_adapter(d) container_of(d, struct i2c_adapter, dev) @@ -450,7 +488,7 @@ void i2c_unlock_adapter(struct i2c_adapter *); */ #if defined(CONFIG_I2C) || defined(CONFIG_I2C_MODULE) extern int i2c_add_adapter(struct i2c_adapter *); -extern int i2c_del_adapter(struct i2c_adapter *); +extern void i2c_del_adapter(struct i2c_adapter *); extern int i2c_add_numbered_adapter(struct i2c_adapter *); extern int i2c_register_driver(struct module *, struct i2c_driver *); diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h index 2d8bdaef9611..bfc47e0de81c 100644 --- a/include/linux/scatterlist.h +++ b/include/linux/scatterlist.h @@ -172,6 +172,22 @@ static inline void sg_mark_end(struct scatterlist *sg) } /** + * sg_unmark_end - Undo setting the end of the scatterlist + * @sg: SG entryScatterlist + * + * Description: + * Removes the termination marker from the given entry of the scatterlist. + * + **/ +static inline void sg_unmark_end(struct scatterlist *sg) +{ +#ifdef CONFIG_DEBUG_SG + BUG_ON(sg->sg_magic != SG_MAGIC); +#endif + sg->page_link &= ~0x02; +} + +/** * sg_phys - Return physical address of an sg entry * @sg: SG entry * diff --git a/include/linux/vfio.h b/include/linux/vfio.h index ab9e86224c54..ac8d488e4372 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -45,6 +45,9 @@ extern int vfio_add_group_dev(struct device *dev, void *device_data); extern void *vfio_del_group_dev(struct device *dev); +extern struct vfio_device *vfio_device_get_from_dev(struct device *dev); +extern void vfio_device_put(struct vfio_device *device); +extern void *vfio_device_data(struct vfio_device *device); /** * struct vfio_iommu_driver_ops - VFIO IOMMU driver callbacks diff --git a/include/linux/virtio.h b/include/linux/virtio.h index 2d7a5e045908..9ff8645b7e0b 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -8,6 +8,7 @@ #include <linux/device.h> #include <linux/mod_devicetable.h> #include <linux/gfp.h> +#include <linux/vringh.h> /** * virtqueue - a queue to register buffers for sending or receiving. @@ -40,6 +41,23 @@ int virtqueue_add_buf(struct virtqueue *vq, void *data, gfp_t gfp); +int virtqueue_add_outbuf(struct virtqueue *vq, + struct scatterlist sg[], unsigned int num, + void *data, + gfp_t gfp); + +int virtqueue_add_inbuf(struct virtqueue *vq, + struct scatterlist sg[], unsigned int num, + void *data, + gfp_t gfp); + +int virtqueue_add_sgs(struct virtqueue *vq, + struct scatterlist *sgs[], + unsigned int out_sgs, + unsigned int in_sgs, + void *data, + gfp_t gfp); + void virtqueue_kick(struct virtqueue *vq); bool virtqueue_kick_prepare(struct virtqueue *vq); @@ -64,6 +82,7 @@ unsigned int virtqueue_get_vring_size(struct virtqueue *vq); * @dev: underlying device. * @id: the device type identification (used to match it with a driver). * @config: the configuration ops for this device. + * @vringh_config: configuration ops for host vrings. * @vqs: the list of virtqueues for this device. * @features: the features supported by both driver and device. * @priv: private pointer for the driver's use. @@ -73,6 +92,7 @@ struct virtio_device { struct device dev; struct virtio_device_id id; const struct virtio_config_ops *config; + const struct vringh_config_ops *vringh_config; struct list_head vqs; /* Note that this is a Linux set_bit-style bitmap. */ unsigned long features[1]; diff --git a/include/linux/virtio_caif.h b/include/linux/virtio_caif.h new file mode 100644 index 000000000000..5d2d3124ca3d --- /dev/null +++ b/include/linux/virtio_caif.h @@ -0,0 +1,24 @@ +/* + * Copyright (C) ST-Ericsson AB 2012 + * Author: Sjur Brændeland <sjur.brandeland@stericsson.com> + * + * This header is BSD licensed so + * anyone can use the definitions to implement compatible remote processors + */ + +#ifndef VIRTIO_CAIF_H +#define VIRTIO_CAIF_H + +#include <linux/types.h> +struct virtio_caif_transf_config { + u16 headroom; + u16 tailroom; + u32 mtu; + u8 reserved[4]; +}; + +struct virtio_caif_config { + struct virtio_caif_transf_config uplink, downlink; + u8 reserved[8]; +}; +#endif diff --git a/include/linux/virtio_ring.h b/include/linux/virtio_ring.h index 63c6ea199519..ca3ad41c2c82 100644 --- a/include/linux/virtio_ring.h +++ b/include/linux/virtio_ring.h @@ -4,6 +4,63 @@ #include <linux/irqreturn.h> #include <uapi/linux/virtio_ring.h> +/* + * Barriers in virtio are tricky. Non-SMP virtio guests can't assume + * they're not on an SMP host system, so they need to assume real + * barriers. Non-SMP virtio hosts could skip the barriers, but does + * anyone care? + * + * For virtio_pci on SMP, we don't need to order with respect to MMIO + * accesses through relaxed memory I/O windows, so smp_mb() et al are + * sufficient. + * + * For using virtio to talk to real devices (eg. other heterogeneous + * CPUs) we do need real barriers. In theory, we could be using both + * kinds of virtio, so it's a runtime decision, and the branch is + * actually quite cheap. + */ + +#ifdef CONFIG_SMP +static inline void virtio_mb(bool weak_barriers) +{ + if (weak_barriers) + smp_mb(); + else + mb(); +} + +static inline void virtio_rmb(bool weak_barriers) +{ + if (weak_barriers) + smp_rmb(); + else + rmb(); +} + +static inline void virtio_wmb(bool weak_barriers) +{ + if (weak_barriers) + smp_wmb(); + else + wmb(); +} +#else +static inline void virtio_mb(bool weak_barriers) +{ + mb(); +} + +static inline void virtio_rmb(bool weak_barriers) +{ + rmb(); +} + +static inline void virtio_wmb(bool weak_barriers) +{ + wmb(); +} +#endif + struct virtio_device; struct virtqueue; diff --git a/include/linux/vringh.h b/include/linux/vringh.h new file mode 100644 index 000000000000..749cde28728b --- /dev/null +++ b/include/linux/vringh.h @@ -0,0 +1,225 @@ +/* + * Linux host-side vring helpers; for when the kernel needs to access + * someone else's vring. + * + * Copyright IBM Corporation, 2013. + * Parts taken from drivers/vhost/vhost.c Copyright 2009 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Written by: Rusty Russell <rusty@rustcorp.com.au> + */ +#ifndef _LINUX_VRINGH_H +#define _LINUX_VRINGH_H +#include <uapi/linux/virtio_ring.h> +#include <linux/uio.h> +#include <linux/slab.h> +#include <asm/barrier.h> + +/* virtio_ring with information needed for host access. */ +struct vringh { + /* Guest publishes used event idx (note: we always do). */ + bool event_indices; + + /* Can we get away with weak barriers? */ + bool weak_barriers; + + /* Last available index we saw (ie. where we're up to). */ + u16 last_avail_idx; + + /* Last index we used. */ + u16 last_used_idx; + + /* How many descriptors we've completed since last need_notify(). */ + u32 completed; + + /* The vring (note: it may contain user pointers!) */ + struct vring vring; + + /* The function to call to notify the guest about added buffers */ + void (*notify)(struct vringh *); +}; + +/** + * struct vringh_config_ops - ops for creating a host vring from a virtio driver + * @find_vrhs: find the host vrings and instantiate them + * vdev: the virtio_device + * nhvrs: the number of host vrings to find + * hvrs: on success, includes new host vrings + * callbacks: array of driver callbacks, for each host vring + * include a NULL entry for vqs that do not need a callback + * Returns 0 on success or error status + * @del_vrhs: free the host vrings found by find_vrhs(). + */ +struct virtio_device; +typedef void vrh_callback_t(struct virtio_device *, struct vringh *); +struct vringh_config_ops { + int (*find_vrhs)(struct virtio_device *vdev, unsigned nhvrs, + struct vringh *vrhs[], vrh_callback_t *callbacks[]); + void (*del_vrhs)(struct virtio_device *vdev); +}; + +/* The memory the vring can access, and what offset to apply. */ +struct vringh_range { + u64 start, end_incl; + u64 offset; +}; + +/** + * struct vringh_iov - iovec mangler. + * + * Mangles iovec in place, and restores it. + * Remaining data is iov + i, of used - i elements. + */ +struct vringh_iov { + struct iovec *iov; + size_t consumed; /* Within iov[i] */ + unsigned i, used, max_num; +}; + +/** + * struct vringh_iov - kvec mangler. + * + * Mangles kvec in place, and restores it. + * Remaining data is iov + i, of used - i elements. + */ +struct vringh_kiov { + struct kvec *iov; + size_t consumed; /* Within iov[i] */ + unsigned i, used, max_num; +}; + +/* Flag on max_num to indicate we're kmalloced. */ +#define VRINGH_IOV_ALLOCATED 0x8000000 + +/* Helpers for userspace vrings. */ +int vringh_init_user(struct vringh *vrh, u32 features, + unsigned int num, bool weak_barriers, + struct vring_desc __user *desc, + struct vring_avail __user *avail, + struct vring_used __user *used); + +static inline void vringh_iov_init(struct vringh_iov *iov, + struct iovec *iovec, unsigned num) +{ + iov->used = iov->i = 0; + iov->consumed = 0; + iov->max_num = num; + iov->iov = iovec; +} + +static inline void vringh_iov_reset(struct vringh_iov *iov) +{ + iov->iov[iov->i].iov_len += iov->consumed; + iov->iov[iov->i].iov_base -= iov->consumed; + iov->consumed = 0; + iov->i = 0; +} + +static inline void vringh_iov_cleanup(struct vringh_iov *iov) +{ + if (iov->max_num & VRINGH_IOV_ALLOCATED) + kfree(iov->iov); + iov->max_num = iov->used = iov->i = iov->consumed = 0; + iov->iov = NULL; +} + +/* Convert a descriptor into iovecs. */ +int vringh_getdesc_user(struct vringh *vrh, + struct vringh_iov *riov, + struct vringh_iov *wiov, + bool (*getrange)(struct vringh *vrh, + u64 addr, struct vringh_range *r), + u16 *head); + +/* Copy bytes from readable vsg, consuming it (and incrementing wiov->i). */ +ssize_t vringh_iov_pull_user(struct vringh_iov *riov, void *dst, size_t len); + +/* Copy bytes into writable vsg, consuming it (and incrementing wiov->i). */ +ssize_t vringh_iov_push_user(struct vringh_iov *wiov, + const void *src, size_t len); + +/* Mark a descriptor as used. */ +int vringh_complete_user(struct vringh *vrh, u16 head, u32 len); +int vringh_complete_multi_user(struct vringh *vrh, + const struct vring_used_elem used[], + unsigned num_used); + +/* Pretend we've never seen descriptor (for easy error handling). */ +void vringh_abandon_user(struct vringh *vrh, unsigned int num); + +/* Do we need to fire the eventfd to notify the other side? */ +int vringh_need_notify_user(struct vringh *vrh); + +bool vringh_notify_enable_user(struct vringh *vrh); +void vringh_notify_disable_user(struct vringh *vrh); + +/* Helpers for kernelspace vrings. */ +int vringh_init_kern(struct vringh *vrh, u32 features, + unsigned int num, bool weak_barriers, + struct vring_desc *desc, + struct vring_avail *avail, + struct vring_used *used); + +static inline void vringh_kiov_init(struct vringh_kiov *kiov, + struct kvec *kvec, unsigned num) +{ + kiov->used = kiov->i = 0; + kiov->consumed = 0; + kiov->max_num = num; + kiov->iov = kvec; +} + +static inline void vringh_kiov_reset(struct vringh_kiov *kiov) +{ + kiov->iov[kiov->i].iov_len += kiov->consumed; + kiov->iov[kiov->i].iov_base -= kiov->consumed; + kiov->consumed = 0; + kiov->i = 0; +} + +static inline void vringh_kiov_cleanup(struct vringh_kiov *kiov) +{ + if (kiov->max_num & VRINGH_IOV_ALLOCATED) + kfree(kiov->iov); + kiov->max_num = kiov->used = kiov->i = kiov->consumed = 0; + kiov->iov = NULL; +} + +int vringh_getdesc_kern(struct vringh *vrh, + struct vringh_kiov *riov, + struct vringh_kiov *wiov, + u16 *head, + gfp_t gfp); + +ssize_t vringh_iov_pull_kern(struct vringh_kiov *riov, void *dst, size_t len); +ssize_t vringh_iov_push_kern(struct vringh_kiov *wiov, + const void *src, size_t len); +void vringh_abandon_kern(struct vringh *vrh, unsigned int num); +int vringh_complete_kern(struct vringh *vrh, u16 head, u32 len); + +bool vringh_notify_enable_kern(struct vringh *vrh); +void vringh_notify_disable_kern(struct vringh *vrh); + +int vringh_need_notify_kern(struct vringh *vrh); + +/* Notify the guest about buffers added to the used ring */ +static inline void vringh_notify(struct vringh *vrh) +{ + if (vrh->notify) + vrh->notify(vrh); +} + +#endif /* _LINUX_VRINGH_H */ diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 4f41f309911e..284ff2436829 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -319,6 +319,7 @@ enum { VFIO_PCI_INTX_IRQ_INDEX, VFIO_PCI_MSI_IRQ_INDEX, VFIO_PCI_MSIX_IRQ_INDEX, + VFIO_PCI_ERR_IRQ_INDEX, VFIO_PCI_NUM_IRQS }; diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h index e847f1e30756..bb6a5b4cb3c5 100644 --- a/include/uapi/linux/vhost.h +++ b/include/uapi/linux/vhost.h @@ -127,4 +127,32 @@ struct vhost_memory { /* vhost-net should add virtio_net_hdr for RX, and strip for TX packets. */ #define VHOST_NET_F_VIRTIO_NET_HDR 27 +/* VHOST_SCSI specific definitions */ + +/* + * Used by QEMU userspace to ensure a consistent vhost-scsi ABI. + * + * ABI Rev 0: July 2012 version starting point for v3.6-rc merge candidate + + * RFC-v2 vhost-scsi userspace. Add GET_ABI_VERSION ioctl usage + * ABI Rev 1: January 2013. Ignore vhost_tpgt filed in struct vhost_scsi_target. + * All the targets under vhost_wwpn can be seen and used by guset. + */ + +#define VHOST_SCSI_ABI_VERSION 1 + +struct vhost_scsi_target { + int abi_version; + char vhost_wwpn[224]; /* TRANSPORT_IQN_LEN */ + unsigned short vhost_tpgt; + unsigned short reserved; +}; + +#define VHOST_SCSI_SET_ENDPOINT _IOW(VHOST_VIRTIO, 0x40, struct vhost_scsi_target) +#define VHOST_SCSI_CLEAR_ENDPOINT _IOW(VHOST_VIRTIO, 0x41, struct vhost_scsi_target) +/* Changing this breaks userspace. */ +#define VHOST_SCSI_GET_ABI_VERSION _IOW(VHOST_VIRTIO, 0x42, int) +/* Set and get the events missed flag */ +#define VHOST_SCSI_SET_EVENTS_MISSED _IOW(VHOST_VIRTIO, 0x43, __u32) +#define VHOST_SCSI_GET_EVENTS_MISSED _IOW(VHOST_VIRTIO, 0x44, __u32) + #endif diff --git a/include/uapi/linux/virtio_balloon.h b/include/uapi/linux/virtio_balloon.h index 652dc8bea921..5e26f61b5df5 100644 --- a/include/uapi/linux/virtio_balloon.h +++ b/include/uapi/linux/virtio_balloon.h @@ -52,8 +52,8 @@ struct virtio_balloon_config #define VIRTIO_BALLOON_S_NR 6 struct virtio_balloon_stat { - u16 tag; - u64 val; + __u16 tag; + __u64 val; } __attribute__((packed)); #endif /* _LINUX_VIRTIO_BALLOON_H */ diff --git a/include/uapi/linux/virtio_ids.h b/include/uapi/linux/virtio_ids.h index a7630d04029f..284fc3a05f7b 100644 --- a/include/uapi/linux/virtio_ids.h +++ b/include/uapi/linux/virtio_ids.h @@ -38,5 +38,6 @@ #define VIRTIO_ID_SCSI 8 /* virtio scsi */ #define VIRTIO_ID_9P 9 /* 9p virtio console */ #define VIRTIO_ID_RPROC_SERIAL 11 /* virtio remoteproc serial link */ +#define VIRTIO_ID_CAIF 12 /* Virtio caif */ #endif /* _LINUX_VIRTIO_IDS_H */ diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c index de2e950a0a7a..e1c26b101830 100644 --- a/net/9p/trans_virtio.c +++ b/net/9p/trans_virtio.c @@ -194,11 +194,14 @@ static int pack_sg_list(struct scatterlist *sg, int start, if (s > count) s = count; BUG_ON(index > limit); + /* Make sure we don't terminate early. */ + sg_unmark_end(&sg[index]); sg_set_buf(&sg[index++], data, s); count -= s; data += s; } - + if (index-start) + sg_mark_end(&sg[index - 1]); return index-start; } @@ -236,12 +239,17 @@ pack_sg_list_p(struct scatterlist *sg, int start, int limit, s = rest_of_page(data); if (s > count) s = count; + /* Make sure we don't terminate early. */ + sg_unmark_end(&sg[index]); sg_set_page(&sg[index++], pdata[i++], s, data_off); data_off = 0; data += s; count -= s; nr_pages--; } + + if (index-start) + sg_mark_end(&sg[index - 1]); return index - start; } @@ -256,9 +264,10 @@ static int p9_virtio_request(struct p9_client *client, struct p9_req_t *req) { int err; - int in, out; + int in, out, out_sgs, in_sgs; unsigned long flags; struct virtio_chan *chan = client->trans; + struct scatterlist *sgs[2]; p9_debug(P9_DEBUG_TRANS, "9p debug: virtio request\n"); @@ -266,14 +275,19 @@ p9_virtio_request(struct p9_client *client, struct p9_req_t *req) req_retry: spin_lock_irqsave(&chan->lock, flags); + out_sgs = in_sgs = 0; /* Handle out VirtIO ring buffers */ out = pack_sg_list(chan->sg, 0, VIRTQUEUE_NUM, req->tc->sdata, req->tc->size); + if (out) + sgs[out_sgs++] = chan->sg; in = pack_sg_list(chan->sg, out, VIRTQUEUE_NUM, req->rc->sdata, req->rc->capacity); + if (in) + sgs[out_sgs + in_sgs++] = chan->sg + out; - err = virtqueue_add_buf(chan->vq, chan->sg, out, in, req->tc, + err = virtqueue_add_sgs(chan->vq, sgs, out_sgs, in_sgs, req->tc, GFP_ATOMIC); if (err < 0) { if (err == -ENOSPC) { @@ -289,7 +303,7 @@ req_retry: } else { spin_unlock_irqrestore(&chan->lock, flags); p9_debug(P9_DEBUG_TRANS, - "virtio rpc add_buf returned failure\n"); + "virtio rpc add_sgs returned failure\n"); return -EIO; } } @@ -351,11 +365,12 @@ p9_virtio_zc_request(struct p9_client *client, struct p9_req_t *req, char *uidata, char *uodata, int inlen, int outlen, int in_hdr_len, int kern_buf) { - int in, out, err; + int in, out, err, out_sgs, in_sgs; unsigned long flags; int in_nr_pages = 0, out_nr_pages = 0; struct page **in_pages = NULL, **out_pages = NULL; struct virtio_chan *chan = client->trans; + struct scatterlist *sgs[4]; p9_debug(P9_DEBUG_TRANS, "virtio request\n"); @@ -396,13 +411,22 @@ p9_virtio_zc_request(struct p9_client *client, struct p9_req_t *req, req->status = REQ_STATUS_SENT; req_retry_pinned: spin_lock_irqsave(&chan->lock, flags); + + out_sgs = in_sgs = 0; + /* out data */ out = pack_sg_list(chan->sg, 0, VIRTQUEUE_NUM, req->tc->sdata, req->tc->size); - if (out_pages) + if (out) + sgs[out_sgs++] = chan->sg; + + if (out_pages) { + sgs[out_sgs++] = chan->sg + out; out += pack_sg_list_p(chan->sg, out, VIRTQUEUE_NUM, out_pages, out_nr_pages, uodata, outlen); + } + /* * Take care of in data * For example TREAD have 11. @@ -412,11 +436,17 @@ req_retry_pinned: */ in = pack_sg_list(chan->sg, out, VIRTQUEUE_NUM, req->rc->sdata, in_hdr_len); - if (in_pages) + if (in) + sgs[out_sgs + in_sgs++] = chan->sg + out; + + if (in_pages) { + sgs[out_sgs + in_sgs++] = chan->sg + out + in; in += pack_sg_list_p(chan->sg, out + in, VIRTQUEUE_NUM, in_pages, in_nr_pages, uidata, inlen); + } - err = virtqueue_add_buf(chan->vq, chan->sg, out, in, req->tc, + BUG_ON(out_sgs + in_sgs > ARRAY_SIZE(sgs)); + err = virtqueue_add_sgs(chan->vq, sgs, out_sgs, in_sgs, req->tc, GFP_ATOMIC); if (err < 0) { if (err == -ENOSPC) { @@ -432,7 +462,7 @@ req_retry_pinned: } else { spin_unlock_irqrestore(&chan->lock, flags); p9_debug(P9_DEBUG_TRANS, - "virtio rpc add_buf returned failure\n"); + "virtio rpc add_sgs returned failure\n"); err = -EIO; goto err_out; } diff --git a/tools/lguest/lguest.txt b/tools/lguest/lguest.txt index 7203ace65e83..06e1f4649511 100644 --- a/tools/lguest/lguest.txt +++ b/tools/lguest/lguest.txt @@ -70,7 +70,7 @@ Running Lguest: - Run an lguest as root: - Documentation/virtual/lguest/lguest 64 vmlinux --tunnet=192.168.19.1 \ + tools/lguest/lguest 64 vmlinux --tunnet=192.168.19.1 \ --block=rootfile root=/dev/vda Explanation: diff --git a/tools/virtio/Makefile b/tools/virtio/Makefile index d1d442ed106a..3187c62d9814 100644 --- a/tools/virtio/Makefile +++ b/tools/virtio/Makefile @@ -1,12 +1,14 @@ all: test mod -test: virtio_test +test: virtio_test vringh_test virtio_test: virtio_ring.o virtio_test.o -CFLAGS += -g -O2 -Wall -I. -I ../../usr/include/ -Wno-pointer-sign -fno-strict-overflow -MMD -vpath %.c ../../drivers/virtio +vringh_test: vringh_test.o vringh.o virtio_ring.o + +CFLAGS += -g -O2 -Wall -I. -I ../../usr/include/ -Wno-pointer-sign -fno-strict-overflow -fno-strict-aliasing -fno-common -MMD -U_FORTIFY_SOURCE +vpath %.c ../../drivers/virtio ../../drivers/vhost mod: ${MAKE} -C `pwd`/../.. M=`pwd`/vhost_test .PHONY: all test mod clean clean: - ${RM} *.o vhost_test/*.o vhost_test/.*.cmd \ + ${RM} *.o vringh_test virtio_test vhost_test/*.o vhost_test/.*.cmd \ vhost_test/Module.symvers vhost_test/modules.order *.d -include *.d diff --git a/tools/virtio/asm/barrier.h b/tools/virtio/asm/barrier.h new file mode 100644 index 000000000000..aff61e13306c --- /dev/null +++ b/tools/virtio/asm/barrier.h @@ -0,0 +1,14 @@ +#if defined(__i386__) || defined(__x86_64__) +#define barrier() asm volatile("" ::: "memory") +#define mb() __sync_synchronize() + +#define smp_mb() mb() +# define smp_rmb() barrier() +# define smp_wmb() barrier() +/* Weak barriers should be used. If not - it's a bug */ +# define rmb() abort() +# define wmb() abort() +#else +#error Please fill in barrier macros +#endif + diff --git a/tools/virtio/linux/bug.h b/tools/virtio/linux/bug.h new file mode 100644 index 000000000000..fb94f0787c47 --- /dev/null +++ b/tools/virtio/linux/bug.h @@ -0,0 +1,10 @@ +#ifndef BUG_H +#define BUG_H + +#define BUG_ON(__BUG_ON_cond) assert(!(__BUG_ON_cond)) + +#define BUILD_BUG_ON(x) + +#define BUG() abort() + +#endif /* BUG_H */ diff --git a/tools/virtio/linux/err.h b/tools/virtio/linux/err.h new file mode 100644 index 000000000000..e32eff8b2a14 --- /dev/null +++ b/tools/virtio/linux/err.h @@ -0,0 +1,26 @@ +#ifndef ERR_H +#define ERR_H +#define MAX_ERRNO 4095 + +#define IS_ERR_VALUE(x) unlikely((x) >= (unsigned long)-MAX_ERRNO) + +static inline void * __must_check ERR_PTR(long error) +{ + return (void *) error; +} + +static inline long __must_check PTR_ERR(const void *ptr) +{ + return (long) ptr; +} + +static inline long __must_check IS_ERR(const void *ptr) +{ + return IS_ERR_VALUE((unsigned long)ptr); +} + +static inline long __must_check IS_ERR_OR_NULL(const void *ptr) +{ + return !ptr || IS_ERR_VALUE((unsigned long)ptr); +} +#endif /* ERR_H */ diff --git a/tools/virtio/linux/export.h b/tools/virtio/linux/export.h new file mode 100644 index 000000000000..7311d326894a --- /dev/null +++ b/tools/virtio/linux/export.h @@ -0,0 +1,5 @@ +#define EXPORT_SYMBOL(sym) +#define EXPORT_SYMBOL_GPL(sym) +#define EXPORT_SYMBOL_GPL_FUTURE(sym) +#define EXPORT_UNUSED_SYMBOL(sym) +#define EXPORT_UNUSED_SYMBOL_GPL(sym) diff --git a/tools/virtio/linux/irqreturn.h b/tools/virtio/linux/irqreturn.h new file mode 100644 index 000000000000..a3c4e7be7089 --- /dev/null +++ b/tools/virtio/linux/irqreturn.h @@ -0,0 +1 @@ +#include "../../../include/linux/irqreturn.h" diff --git a/tools/virtio/linux/kernel.h b/tools/virtio/linux/kernel.h new file mode 100644 index 000000000000..fba705963968 --- /dev/null +++ b/tools/virtio/linux/kernel.h @@ -0,0 +1,112 @@ +#ifndef KERNEL_H +#define KERNEL_H +#include <stdbool.h> +#include <stdlib.h> +#include <stddef.h> +#include <stdio.h> +#include <string.h> +#include <assert.h> +#include <stdarg.h> + +#include <linux/types.h> +#include <linux/printk.h> +#include <linux/bug.h> +#include <errno.h> +#include <unistd.h> +#include <asm/barrier.h> + +#define CONFIG_SMP + +#define PAGE_SIZE getpagesize() +#define PAGE_MASK (~(PAGE_SIZE-1)) + +typedef unsigned long long dma_addr_t; +typedef size_t __kernel_size_t; + +struct page { + unsigned long long dummy; +}; + +/* Physical == Virtual */ +#define virt_to_phys(p) ((unsigned long)p) +#define phys_to_virt(a) ((void *)(unsigned long)(a)) +/* Page address: Virtual / 4K */ +#define page_to_phys(p) ((dma_addr_t)(unsigned long)(p)) +#define virt_to_page(p) ((struct page *)((unsigned long)p & PAGE_MASK)) + +#define offset_in_page(p) (((unsigned long)p) % PAGE_SIZE) + +#define __printf(a,b) __attribute__((format(printf,a,b))) + +typedef enum { + GFP_KERNEL, + GFP_ATOMIC, + __GFP_HIGHMEM, + __GFP_HIGH +} gfp_t; + +#define ARRAY_SIZE(x) (sizeof(x)/sizeof(x[0])) + +extern void *__kmalloc_fake, *__kfree_ignore_start, *__kfree_ignore_end; +static inline void *kmalloc(size_t s, gfp_t gfp) +{ + if (__kmalloc_fake) + return __kmalloc_fake; + return malloc(s); +} + +static inline void kfree(void *p) +{ + if (p >= __kfree_ignore_start && p < __kfree_ignore_end) + return; + free(p); +} + +static inline void *krealloc(void *p, size_t s, gfp_t gfp) +{ + return realloc(p, s); +} + + +static inline unsigned long __get_free_page(gfp_t gfp) +{ + void *p; + + posix_memalign(&p, PAGE_SIZE, PAGE_SIZE); + return (unsigned long)p; +} + +static inline void free_page(unsigned long addr) +{ + free((void *)addr); +} + +#define container_of(ptr, type, member) ({ \ + const typeof( ((type *)0)->member ) *__mptr = (ptr); \ + (type *)( (char *)__mptr - offsetof(type,member) );}) + +#define uninitialized_var(x) x = x + +# ifndef likely +# define likely(x) (__builtin_expect(!!(x), 1)) +# endif +# ifndef unlikely +# define unlikely(x) (__builtin_expect(!!(x), 0)) +# endif + +#define pr_err(format, ...) fprintf (stderr, format, ## __VA_ARGS__) +#ifdef DEBUG +#define pr_debug(format, ...) fprintf (stderr, format, ## __VA_ARGS__) +#else +#define pr_debug(format, ...) do {} while (0) +#endif +#define dev_err(dev, format, ...) fprintf (stderr, format, ## __VA_ARGS__) +#define dev_warn(dev, format, ...) fprintf (stderr, format, ## __VA_ARGS__) + +#define min(x, y) ({ \ + typeof(x) _min1 = (x); \ + typeof(y) _min2 = (y); \ + (void) (&_min1 == &_min2); \ + _min1 < _min2 ? _min1 : _min2; }) + +#endif /* KERNEL_H */ diff --git a/tools/virtio/linux/module.h b/tools/virtio/linux/module.h index e69de29bb2d1..3039a7e972b6 100644 --- a/tools/virtio/linux/module.h +++ b/tools/virtio/linux/module.h @@ -0,0 +1 @@ +#include <linux/export.h> diff --git a/tools/virtio/linux/printk.h b/tools/virtio/linux/printk.h new file mode 100644 index 000000000000..9f2423bd89c2 --- /dev/null +++ b/tools/virtio/linux/printk.h @@ -0,0 +1,4 @@ +#include "../../../include/linux/kern_levels.h" + +#define printk printf +#define vprintk vprintf diff --git a/tools/virtio/linux/ratelimit.h b/tools/virtio/linux/ratelimit.h new file mode 100644 index 000000000000..dcce1725f90d --- /dev/null +++ b/tools/virtio/linux/ratelimit.h @@ -0,0 +1,4 @@ +#define DEFINE_RATELIMIT_STATE(name, interval_init, burst_init) int name = 0 + +#define __ratelimit(x) (*(x)) + diff --git a/tools/virtio/linux/scatterlist.h b/tools/virtio/linux/scatterlist.h new file mode 100644 index 000000000000..68c9e2adc996 --- /dev/null +++ b/tools/virtio/linux/scatterlist.h @@ -0,0 +1,189 @@ +#ifndef SCATTERLIST_H +#define SCATTERLIST_H +#include <linux/kernel.h> + +struct scatterlist { + unsigned long page_link; + unsigned int offset; + unsigned int length; + dma_addr_t dma_address; +}; + +/* Scatterlist helpers, stolen from linux/scatterlist.h */ +#define sg_is_chain(sg) ((sg)->page_link & 0x01) +#define sg_is_last(sg) ((sg)->page_link & 0x02) +#define sg_chain_ptr(sg) \ + ((struct scatterlist *) ((sg)->page_link & ~0x03)) + +/** + * sg_assign_page - Assign a given page to an SG entry + * @sg: SG entry + * @page: The page + * + * Description: + * Assign page to sg entry. Also see sg_set_page(), the most commonly used + * variant. + * + **/ +static inline void sg_assign_page(struct scatterlist *sg, struct page *page) +{ + unsigned long page_link = sg->page_link & 0x3; + + /* + * In order for the low bit stealing approach to work, pages + * must be aligned at a 32-bit boundary as a minimum. + */ + BUG_ON((unsigned long) page & 0x03); +#ifdef CONFIG_DEBUG_SG + BUG_ON(sg->sg_magic != SG_MAGIC); + BUG_ON(sg_is_chain(sg)); +#endif + sg->page_link = page_link | (unsigned long) page; +} + +/** + * sg_set_page - Set sg entry to point at given page + * @sg: SG entry + * @page: The page + * @len: Length of data + * @offset: Offset into page + * + * Description: + * Use this function to set an sg entry pointing at a page, never assign + * the page directly. We encode sg table information in the lower bits + * of the page pointer. See sg_page() for looking up the page belonging + * to an sg entry. + * + **/ +static inline void sg_set_page(struct scatterlist *sg, struct page *page, + unsigned int len, unsigned int offset) +{ + sg_assign_page(sg, page); + sg->offset = offset; + sg->length = len; +} + +static inline struct page *sg_page(struct scatterlist *sg) +{ +#ifdef CONFIG_DEBUG_SG + BUG_ON(sg->sg_magic != SG_MAGIC); + BUG_ON(sg_is_chain(sg)); +#endif + return (struct page *)((sg)->page_link & ~0x3); +} + +/* + * Loop over each sg element, following the pointer to a new list if necessary + */ +#define for_each_sg(sglist, sg, nr, __i) \ + for (__i = 0, sg = (sglist); __i < (nr); __i++, sg = sg_next(sg)) + +/** + * sg_chain - Chain two sglists together + * @prv: First scatterlist + * @prv_nents: Number of entries in prv + * @sgl: Second scatterlist + * + * Description: + * Links @prv@ and @sgl@ together, to form a longer scatterlist. + * + **/ +static inline void sg_chain(struct scatterlist *prv, unsigned int prv_nents, + struct scatterlist *sgl) +{ + /* + * offset and length are unused for chain entry. Clear them. + */ + prv[prv_nents - 1].offset = 0; + prv[prv_nents - 1].length = 0; + + /* + * Set lowest bit to indicate a link pointer, and make sure to clear + * the termination bit if it happens to be set. + */ + prv[prv_nents - 1].page_link = ((unsigned long) sgl | 0x01) & ~0x02; +} + +/** + * sg_mark_end - Mark the end of the scatterlist + * @sg: SG entryScatterlist + * + * Description: + * Marks the passed in sg entry as the termination point for the sg + * table. A call to sg_next() on this entry will return NULL. + * + **/ +static inline void sg_mark_end(struct scatterlist *sg) +{ +#ifdef CONFIG_DEBUG_SG + BUG_ON(sg->sg_magic != SG_MAGIC); +#endif + /* + * Set termination bit, clear potential chain bit + */ + sg->page_link |= 0x02; + sg->page_link &= ~0x01; +} + +/** + * sg_unmark_end - Undo setting the end of the scatterlist + * @sg: SG entryScatterlist + * + * Description: + * Removes the termination marker from the given entry of the scatterlist. + * + **/ +static inline void sg_unmark_end(struct scatterlist *sg) +{ +#ifdef CONFIG_DEBUG_SG + BUG_ON(sg->sg_magic != SG_MAGIC); +#endif + sg->page_link &= ~0x02; +} + +static inline struct scatterlist *sg_next(struct scatterlist *sg) +{ +#ifdef CONFIG_DEBUG_SG + BUG_ON(sg->sg_magic != SG_MAGIC); +#endif + if (sg_is_last(sg)) + return NULL; + + sg++; + if (unlikely(sg_is_chain(sg))) + sg = sg_chain_ptr(sg); + + return sg; +} + +static inline void sg_init_table(struct scatterlist *sgl, unsigned int nents) +{ + memset(sgl, 0, sizeof(*sgl) * nents); +#ifdef CONFIG_DEBUG_SG + { + unsigned int i; + for (i = 0; i < nents; i++) + sgl[i].sg_magic = SG_MAGIC; + } +#endif + sg_mark_end(&sgl[nents - 1]); +} + +static inline dma_addr_t sg_phys(struct scatterlist *sg) +{ + return page_to_phys(sg_page(sg)) + sg->offset; +} + +static inline void sg_set_buf(struct scatterlist *sg, const void *buf, + unsigned int buflen) +{ + sg_set_page(sg, virt_to_page(buf), buflen, offset_in_page(buf)); +} + +static inline void sg_init_one(struct scatterlist *sg, + const void *buf, unsigned int buflen) +{ + sg_init_table(sg, 1); + sg_set_buf(sg, buf, buflen); +} +#endif /* SCATTERLIST_H */ diff --git a/tools/virtio/linux/types.h b/tools/virtio/linux/types.h new file mode 100644 index 000000000000..f8ebb9a2b3d6 --- /dev/null +++ b/tools/virtio/linux/types.h @@ -0,0 +1,28 @@ +#ifndef TYPES_H +#define TYPES_H +#include <stdint.h> + +#define __force +#define __user +#define __must_check +#define __cold + +typedef uint64_t u64; +typedef int64_t s64; +typedef uint32_t u32; +typedef int32_t s32; +typedef uint16_t u16; +typedef int16_t s16; +typedef uint8_t u8; +typedef int8_t s8; + +typedef uint64_t __u64; +typedef int64_t __s64; +typedef uint32_t __u32; +typedef int32_t __s32; +typedef uint16_t __u16; +typedef int16_t __s16; +typedef uint8_t __u8; +typedef int8_t __s8; + +#endif /* TYPES_H */ diff --git a/tools/virtio/linux/uaccess.h b/tools/virtio/linux/uaccess.h new file mode 100644 index 000000000000..0a578fe18653 --- /dev/null +++ b/tools/virtio/linux/uaccess.h @@ -0,0 +1,50 @@ +#ifndef UACCESS_H +#define UACCESS_H +extern void *__user_addr_min, *__user_addr_max; + +#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x)) + +static inline void __chk_user_ptr(const volatile void *p, size_t size) +{ + assert(p >= __user_addr_min && p + size <= __user_addr_max); +} + +#define put_user(x, ptr) \ +({ \ + typeof(ptr) __pu_ptr = (ptr); \ + __chk_user_ptr(__pu_ptr, sizeof(*__pu_ptr)); \ + ACCESS_ONCE(*(__pu_ptr)) = x; \ + 0; \ +}) + +#define get_user(x, ptr) \ +({ \ + typeof(ptr) __pu_ptr = (ptr); \ + __chk_user_ptr(__pu_ptr, sizeof(*__pu_ptr)); \ + x = ACCESS_ONCE(*(__pu_ptr)); \ + 0; \ +}) + +static void volatile_memcpy(volatile char *to, const volatile char *from, + unsigned long n) +{ + while (n--) + *(to++) = *(from++); +} + +static inline int copy_from_user(void *to, const void __user volatile *from, + unsigned long n) +{ + __chk_user_ptr(from, n); + volatile_memcpy(to, from, n); + return 0; +} + +static inline int copy_to_user(void __user volatile *to, const void *from, + unsigned long n) +{ + __chk_user_ptr(to, n); + volatile_memcpy(to, from, n); + return 0; +} +#endif /* UACCESS_H */ diff --git a/tools/virtio/linux/uio.h b/tools/virtio/linux/uio.h new file mode 100644 index 000000000000..cd20f0ba3081 --- /dev/null +++ b/tools/virtio/linux/uio.h @@ -0,0 +1,3 @@ +#include <linux/kernel.h> + +#include "../../../include/linux/uio.h" diff --git a/tools/virtio/linux/virtio.h b/tools/virtio/linux/virtio.h index 81847dd08bd0..cd801838156f 100644 --- a/tools/virtio/linux/virtio.h +++ b/tools/virtio/linux/virtio.h @@ -1,127 +1,7 @@ #ifndef LINUX_VIRTIO_H #define LINUX_VIRTIO_H - -#include <stdbool.h> -#include <stdlib.h> -#include <stddef.h> -#include <stdio.h> -#include <string.h> -#include <assert.h> - -#include <linux/types.h> -#include <errno.h> - -typedef unsigned long long dma_addr_t; - -struct scatterlist { - unsigned long page_link; - unsigned int offset; - unsigned int length; - dma_addr_t dma_address; -}; - -struct page { - unsigned long long dummy; -}; - -#define BUG_ON(__BUG_ON_cond) assert(!(__BUG_ON_cond)) - -/* Physical == Virtual */ -#define virt_to_phys(p) ((unsigned long)p) -#define phys_to_virt(a) ((void *)(unsigned long)(a)) -/* Page address: Virtual / 4K */ -#define virt_to_page(p) ((struct page*)((virt_to_phys(p) / 4096) * \ - sizeof(struct page))) -#define offset_in_page(p) (((unsigned long)p) % 4096) -#define sg_phys(sg) ((sg->page_link & ~0x3) / sizeof(struct page) * 4096 + \ - sg->offset) -static inline void sg_mark_end(struct scatterlist *sg) -{ - /* - * Set termination bit, clear potential chain bit - */ - sg->page_link |= 0x02; - sg->page_link &= ~0x01; -} -static inline void sg_init_table(struct scatterlist *sgl, unsigned int nents) -{ - memset(sgl, 0, sizeof(*sgl) * nents); - sg_mark_end(&sgl[nents - 1]); -} -static inline void sg_assign_page(struct scatterlist *sg, struct page *page) -{ - unsigned long page_link = sg->page_link & 0x3; - - /* - * In order for the low bit stealing approach to work, pages - * must be aligned at a 32-bit boundary as a minimum. - */ - BUG_ON((unsigned long) page & 0x03); - sg->page_link = page_link | (unsigned long) page; -} - -static inline void sg_set_page(struct scatterlist *sg, struct page *page, - unsigned int len, unsigned int offset) -{ - sg_assign_page(sg, page); - sg->offset = offset; - sg->length = len; -} - -static inline void sg_set_buf(struct scatterlist *sg, const void *buf, - unsigned int buflen) -{ - sg_set_page(sg, virt_to_page(buf), buflen, offset_in_page(buf)); -} - -static inline void sg_init_one(struct scatterlist *sg, const void *buf, unsigned int buflen) -{ - sg_init_table(sg, 1); - sg_set_buf(sg, buf, buflen); -} - -typedef __u16 u16; - -typedef enum { - GFP_KERNEL, - GFP_ATOMIC, -} gfp_t; -typedef enum { - IRQ_NONE, - IRQ_HANDLED -} irqreturn_t; - -static inline void *kmalloc(size_t s, gfp_t gfp) -{ - return malloc(s); -} - -static inline void kfree(void *p) -{ - free(p); -} - -#define container_of(ptr, type, member) ({ \ - const typeof( ((type *)0)->member ) *__mptr = (ptr); \ - (type *)( (char *)__mptr - offsetof(type,member) );}) - -#define uninitialized_var(x) x = x - -# ifndef likely -# define likely(x) (__builtin_expect(!!(x), 1)) -# endif -# ifndef unlikely -# define unlikely(x) (__builtin_expect(!!(x), 0)) -# endif - -#define pr_err(format, ...) fprintf (stderr, format, ## __VA_ARGS__) -#ifdef DEBUG -#define pr_debug(format, ...) fprintf (stderr, format, ## __VA_ARGS__) -#else -#define pr_debug(format, ...) do {} while (0) -#endif -#define dev_err(dev, format, ...) fprintf (stderr, format, ## __VA_ARGS__) -#define dev_warn(dev, format, ...) fprintf (stderr, format, ## __VA_ARGS__) +#include <linux/scatterlist.h> +#include <linux/kernel.h> /* TODO: empty stubs for now. Broken but enough for virtio_ring.c */ #define list_add_tail(a, b) do {} while (0) @@ -131,6 +11,7 @@ static inline void kfree(void *p) #define BITS_PER_BYTE 8 #define BITS_PER_LONG (sizeof(long) * BITS_PER_BYTE) #define BIT_MASK(nr) (1UL << ((nr) % BITS_PER_LONG)) + /* TODO: Not atomic as it should be: * we don't use this for anything important. */ static inline void clear_bit(int nr, volatile unsigned long *addr) @@ -145,10 +26,6 @@ static inline int test_bit(int nr, const volatile unsigned long *addr) { return 1UL & (addr[BIT_WORD(nr)] >> (nr & (BITS_PER_LONG-1))); } - -/* The only feature we care to support */ -#define virtio_has_feature(dev, feature) \ - test_bit((feature), (dev)->features) /* end of stubs */ struct virtio_device { @@ -163,39 +40,32 @@ struct virtqueue { void (*callback)(struct virtqueue *vq); const char *name; struct virtio_device *vdev; + unsigned int index; + unsigned int num_free; void *priv; }; -#define EXPORT_SYMBOL_GPL(__EXPORT_SYMBOL_GPL_name) \ - void __EXPORT_SYMBOL_GPL##__EXPORT_SYMBOL_GPL_name() { \ -} #define MODULE_LICENSE(__MODULE_LICENSE_value) \ const char *__MODULE_LICENSE_name = __MODULE_LICENSE_value -#define CONFIG_SMP - -#if defined(__i386__) || defined(__x86_64__) -#define barrier() asm volatile("" ::: "memory") -#define mb() __sync_synchronize() - -#define smp_mb() mb() -# define smp_rmb() barrier() -# define smp_wmb() barrier() -/* Weak barriers should be used. If not - it's a bug */ -# define rmb() abort() -# define wmb() abort() -#else -#error Please fill in barrier macros -#endif - /* Interfaces exported by virtio_ring. */ -int virtqueue_add_buf(struct virtqueue *vq, - struct scatterlist sg[], - unsigned int out_num, - unsigned int in_num, +int virtqueue_add_sgs(struct virtqueue *vq, + struct scatterlist *sgs[], + unsigned int out_sgs, + unsigned int in_sgs, void *data, gfp_t gfp); +int virtqueue_add_outbuf(struct virtqueue *vq, + struct scatterlist sg[], unsigned int num, + void *data, + gfp_t gfp); + +int virtqueue_add_inbuf(struct virtqueue *vq, + struct scatterlist sg[], unsigned int num, + void *data, + gfp_t gfp); + void virtqueue_kick(struct virtqueue *vq); void *virtqueue_get_buf(struct virtqueue *vq, unsigned int *len); @@ -206,7 +76,8 @@ bool virtqueue_enable_cb(struct virtqueue *vq); bool virtqueue_enable_cb_delayed(struct virtqueue *vq); void *virtqueue_detach_unused_buf(struct virtqueue *vq); -struct virtqueue *vring_new_virtqueue(unsigned int num, +struct virtqueue *vring_new_virtqueue(unsigned int index, + unsigned int num, unsigned int vring_align, struct virtio_device *vdev, bool weak_barriers, diff --git a/tools/virtio/linux/virtio_config.h b/tools/virtio/linux/virtio_config.h new file mode 100644 index 000000000000..5049967f99f7 --- /dev/null +++ b/tools/virtio/linux/virtio_config.h @@ -0,0 +1,6 @@ +#define VIRTIO_TRANSPORT_F_START 28 +#define VIRTIO_TRANSPORT_F_END 32 + +#define virtio_has_feature(dev, feature) \ + test_bit((feature), (dev)->features) + diff --git a/tools/virtio/linux/virtio_ring.h b/tools/virtio/linux/virtio_ring.h new file mode 100644 index 000000000000..8949c4e2772c --- /dev/null +++ b/tools/virtio/linux/virtio_ring.h @@ -0,0 +1 @@ +#include "../../../include/linux/virtio_ring.h" diff --git a/tools/virtio/linux/vringh.h b/tools/virtio/linux/vringh.h new file mode 100644 index 000000000000..9348957be56e --- /dev/null +++ b/tools/virtio/linux/vringh.h @@ -0,0 +1 @@ +#include "../../../include/linux/vringh.h" diff --git a/tools/virtio/uapi/linux/uio.h b/tools/virtio/uapi/linux/uio.h new file mode 100644 index 000000000000..7230e9002207 --- /dev/null +++ b/tools/virtio/uapi/linux/uio.h @@ -0,0 +1 @@ +#include <sys/uio.h> diff --git a/tools/virtio/uapi/linux/virtio_config.h b/tools/virtio/uapi/linux/virtio_config.h new file mode 100644 index 000000000000..4c86675f0159 --- /dev/null +++ b/tools/virtio/uapi/linux/virtio_config.h @@ -0,0 +1 @@ +#include "../../../../include/uapi/linux/virtio_config.h" diff --git a/tools/virtio/uapi/linux/virtio_ring.h b/tools/virtio/uapi/linux/virtio_ring.h new file mode 100644 index 000000000000..4d99c78234d3 --- /dev/null +++ b/tools/virtio/uapi/linux/virtio_ring.h @@ -0,0 +1,4 @@ +#ifndef VIRTIO_RING_H +#define VIRTIO_RING_H +#include "../../../../include/uapi/linux/virtio_ring.h" +#endif /* VIRTIO_RING_H */ diff --git a/tools/virtio/virtio_test.c b/tools/virtio/virtio_test.c index fcc9aa25fd08..da7a19558281 100644 --- a/tools/virtio/virtio_test.c +++ b/tools/virtio/virtio_test.c @@ -10,11 +10,15 @@ #include <sys/stat.h> #include <sys/types.h> #include <fcntl.h> +#include <stdbool.h> #include <linux/vhost.h> #include <linux/virtio.h> #include <linux/virtio_ring.h> #include "../../drivers/vhost/test.h" +/* Unused */ +void *__kmalloc_fake, *__kfree_ignore_start, *__kfree_ignore_end; + struct vq_info { int kick; int call; @@ -92,7 +96,8 @@ static void vq_info_add(struct vdev_info *dev, int num) assert(r >= 0); memset(info->ring, 0, vring_size(num, 4096)); vring_init(&info->vring, num, info->ring, 4096); - info->vq = vring_new_virtqueue(info->vring.num, 4096, &dev->vdev, + info->vq = vring_new_virtqueue(info->idx, + info->vring.num, 4096, &dev->vdev, true, info->ring, vq_notify, vq_callback, "test"); assert(info->vq); @@ -161,9 +166,9 @@ static void run_test(struct vdev_info *dev, struct vq_info *vq, do { if (started < bufs) { sg_init_one(&sl, dev->buf, dev->buf_size); - r = virtqueue_add_buf(vq->vq, &sl, 1, 0, - dev->buf + started, - GFP_ATOMIC); + r = virtqueue_add_outbuf(vq->vq, &sl, 1, + dev->buf + started, + GFP_ATOMIC); if (likely(r == 0)) { ++started; virtqueue_kick(vq->vq); diff --git a/tools/virtio/vringh_test.c b/tools/virtio/vringh_test.c new file mode 100644 index 000000000000..d053ea40c001 --- /dev/null +++ b/tools/virtio/vringh_test.c @@ -0,0 +1,741 @@ +/* Simple test of virtio code, entirely in userpsace. */ +#define _GNU_SOURCE +#include <sched.h> +#include <err.h> +#include <linux/kernel.h> +#include <linux/err.h> +#include <linux/virtio.h> +#include <linux/vringh.h> +#include <linux/virtio_ring.h> +#include <linux/uaccess.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/mman.h> +#include <sys/wait.h> +#include <fcntl.h> + +#define USER_MEM (1024*1024) +void *__user_addr_min, *__user_addr_max; +void *__kmalloc_fake, *__kfree_ignore_start, *__kfree_ignore_end; +static u64 user_addr_offset; + +#define RINGSIZE 256 +#define ALIGN 4096 + +static void never_notify_host(struct virtqueue *vq) +{ + abort(); +} + +static void never_callback_guest(struct virtqueue *vq) +{ + abort(); +} + +static bool getrange_iov(struct vringh *vrh, u64 addr, struct vringh_range *r) +{ + if (addr < (u64)(unsigned long)__user_addr_min - user_addr_offset) + return false; + if (addr >= (u64)(unsigned long)__user_addr_max - user_addr_offset) + return false; + + r->start = (u64)(unsigned long)__user_addr_min - user_addr_offset; + r->end_incl = (u64)(unsigned long)__user_addr_max - 1 - user_addr_offset; + r->offset = user_addr_offset; + return true; +} + +/* We return single byte ranges. */ +static bool getrange_slow(struct vringh *vrh, u64 addr, struct vringh_range *r) +{ + if (addr < (u64)(unsigned long)__user_addr_min - user_addr_offset) + return false; + if (addr >= (u64)(unsigned long)__user_addr_max - user_addr_offset) + return false; + + r->start = addr; + r->end_incl = r->start; + r->offset = user_addr_offset; + return true; +} + +struct guest_virtio_device { + struct virtio_device vdev; + int to_host_fd; + unsigned long notifies; +}; + +static void parallel_notify_host(struct virtqueue *vq) +{ + struct guest_virtio_device *gvdev; + + gvdev = container_of(vq->vdev, struct guest_virtio_device, vdev); + write(gvdev->to_host_fd, "", 1); + gvdev->notifies++; +} + +static void no_notify_host(struct virtqueue *vq) +{ +} + +#define NUM_XFERS (10000000) + +/* We aim for two "distant" cpus. */ +static void find_cpus(unsigned int *first, unsigned int *last) +{ + unsigned int i; + + *first = -1U; + *last = 0; + for (i = 0; i < 4096; i++) { + cpu_set_t set; + CPU_ZERO(&set); + CPU_SET(i, &set); + if (sched_setaffinity(getpid(), sizeof(set), &set) == 0) { + if (i < *first) + *first = i; + if (i > *last) + *last = i; + } + } +} + +/* Opencoded version for fast mode */ +static inline int vringh_get_head(struct vringh *vrh, u16 *head) +{ + u16 avail_idx, i; + int err; + + err = get_user(avail_idx, &vrh->vring.avail->idx); + if (err) + return err; + + if (vrh->last_avail_idx == avail_idx) + return 0; + + /* Only get avail ring entries after they have been exposed by guest. */ + virtio_rmb(vrh->weak_barriers); + + i = vrh->last_avail_idx & (vrh->vring.num - 1); + + err = get_user(*head, &vrh->vring.avail->ring[i]); + if (err) + return err; + + vrh->last_avail_idx++; + return 1; +} + +static int parallel_test(unsigned long features, + bool (*getrange)(struct vringh *vrh, + u64 addr, struct vringh_range *r), + bool fast_vringh) +{ + void *host_map, *guest_map; + int fd, mapsize, to_guest[2], to_host[2]; + unsigned long xfers = 0, notifies = 0, receives = 0; + unsigned int first_cpu, last_cpu; + cpu_set_t cpu_set; + char buf[128]; + + /* Create real file to mmap. */ + fd = open("/tmp/vringh_test-file", O_RDWR|O_CREAT|O_TRUNC, 0600); + if (fd < 0) + err(1, "Opening /tmp/vringh_test-file"); + + /* Extra room at the end for some data, and indirects */ + mapsize = vring_size(RINGSIZE, ALIGN) + + RINGSIZE * 2 * sizeof(int) + + RINGSIZE * 6 * sizeof(struct vring_desc); + mapsize = (mapsize + getpagesize() - 1) & ~(getpagesize() - 1); + ftruncate(fd, mapsize); + + /* Parent and child use separate addresses, to check our mapping logic! */ + host_map = mmap(NULL, mapsize, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); + guest_map = mmap(NULL, mapsize, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); + + pipe(to_guest); + pipe(to_host); + + CPU_ZERO(&cpu_set); + find_cpus(&first_cpu, &last_cpu); + printf("Using CPUS %u and %u\n", first_cpu, last_cpu); + fflush(stdout); + + if (fork() != 0) { + struct vringh vrh; + int status, err, rlen = 0; + char rbuf[5]; + + /* We are the host: never access guest addresses! */ + munmap(guest_map, mapsize); + + __user_addr_min = host_map; + __user_addr_max = __user_addr_min + mapsize; + user_addr_offset = host_map - guest_map; + assert(user_addr_offset); + + close(to_guest[0]); + close(to_host[1]); + + vring_init(&vrh.vring, RINGSIZE, host_map, ALIGN); + vringh_init_user(&vrh, features, RINGSIZE, true, + vrh.vring.desc, vrh.vring.avail, vrh.vring.used); + CPU_SET(first_cpu, &cpu_set); + if (sched_setaffinity(getpid(), sizeof(cpu_set), &cpu_set)) + errx(1, "Could not set affinity to cpu %u", first_cpu); + + while (xfers < NUM_XFERS) { + struct iovec host_riov[2], host_wiov[2]; + struct vringh_iov riov, wiov; + u16 head, written; + + if (fast_vringh) { + for (;;) { + err = vringh_get_head(&vrh, &head); + if (err != 0) + break; + err = vringh_need_notify_user(&vrh); + if (err < 0) + errx(1, "vringh_need_notify_user: %i", + err); + if (err) { + write(to_guest[1], "", 1); + notifies++; + } + } + if (err != 1) + errx(1, "vringh_get_head"); + written = 0; + goto complete; + } else { + vringh_iov_init(&riov, + host_riov, + ARRAY_SIZE(host_riov)); + vringh_iov_init(&wiov, + host_wiov, + ARRAY_SIZE(host_wiov)); + + err = vringh_getdesc_user(&vrh, &riov, &wiov, + getrange, &head); + } + if (err == 0) { + err = vringh_need_notify_user(&vrh); + if (err < 0) + errx(1, "vringh_need_notify_user: %i", + err); + if (err) { + write(to_guest[1], "", 1); + notifies++; + } + + if (!vringh_notify_enable_user(&vrh)) + continue; + + /* Swallow all notifies at once. */ + if (read(to_host[0], buf, sizeof(buf)) < 1) + break; + + vringh_notify_disable_user(&vrh); + receives++; + continue; + } + if (err != 1) + errx(1, "vringh_getdesc_user: %i", err); + + /* We simply copy bytes. */ + if (riov.used) { + rlen = vringh_iov_pull_user(&riov, rbuf, + sizeof(rbuf)); + if (rlen != 4) + errx(1, "vringh_iov_pull_user: %i", + rlen); + assert(riov.i == riov.used); + written = 0; + } else { + err = vringh_iov_push_user(&wiov, rbuf, rlen); + if (err != rlen) + errx(1, "vringh_iov_push_user: %i", + err); + assert(wiov.i == wiov.used); + written = err; + } + complete: + xfers++; + + err = vringh_complete_user(&vrh, head, written); + if (err != 0) + errx(1, "vringh_complete_user: %i", err); + } + + err = vringh_need_notify_user(&vrh); + if (err < 0) + errx(1, "vringh_need_notify_user: %i", err); + if (err) { + write(to_guest[1], "", 1); + notifies++; + } + wait(&status); + if (!WIFEXITED(status)) + errx(1, "Child died with signal %i?", WTERMSIG(status)); + if (WEXITSTATUS(status) != 0) + errx(1, "Child exited %i?", WEXITSTATUS(status)); + printf("Host: notified %lu, pinged %lu\n", notifies, receives); + return 0; + } else { + struct guest_virtio_device gvdev; + struct virtqueue *vq; + unsigned int *data; + struct vring_desc *indirects; + unsigned int finished = 0; + + /* We pass sg[]s pointing into here, but we need RINGSIZE+1 */ + data = guest_map + vring_size(RINGSIZE, ALIGN); + indirects = (void *)data + (RINGSIZE + 1) * 2 * sizeof(int); + + /* We are the guest. */ + munmap(host_map, mapsize); + + close(to_guest[1]); + close(to_host[0]); + + gvdev.vdev.features[0] = features; + gvdev.to_host_fd = to_host[1]; + gvdev.notifies = 0; + + CPU_SET(first_cpu, &cpu_set); + if (sched_setaffinity(getpid(), sizeof(cpu_set), &cpu_set)) + err(1, "Could not set affinity to cpu %u", first_cpu); + + vq = vring_new_virtqueue(0, RINGSIZE, ALIGN, &gvdev.vdev, true, + guest_map, fast_vringh ? no_notify_host + : parallel_notify_host, + never_callback_guest, "guest vq"); + + /* Don't kfree indirects. */ + __kfree_ignore_start = indirects; + __kfree_ignore_end = indirects + RINGSIZE * 6; + + while (xfers < NUM_XFERS) { + struct scatterlist sg[4]; + unsigned int num_sg, len; + int *dbuf, err; + bool output = !(xfers % 2); + + /* Consume bufs. */ + while ((dbuf = virtqueue_get_buf(vq, &len)) != NULL) { + if (len == 4) + assert(*dbuf == finished - 1); + else if (!fast_vringh) + assert(*dbuf == finished); + finished++; + } + + /* Produce a buffer. */ + dbuf = data + (xfers % (RINGSIZE + 1)); + + if (output) + *dbuf = xfers; + else + *dbuf = -1; + + switch ((xfers / sizeof(*dbuf)) % 4) { + case 0: + /* Nasty three-element sg list. */ + sg_init_table(sg, num_sg = 3); + sg_set_buf(&sg[0], (void *)dbuf, 1); + sg_set_buf(&sg[1], (void *)dbuf + 1, 2); + sg_set_buf(&sg[2], (void *)dbuf + 3, 1); + break; + case 1: + sg_init_table(sg, num_sg = 2); + sg_set_buf(&sg[0], (void *)dbuf, 1); + sg_set_buf(&sg[1], (void *)dbuf + 1, 3); + break; + case 2: + sg_init_table(sg, num_sg = 1); + sg_set_buf(&sg[0], (void *)dbuf, 4); + break; + case 3: + sg_init_table(sg, num_sg = 4); + sg_set_buf(&sg[0], (void *)dbuf, 1); + sg_set_buf(&sg[1], (void *)dbuf + 1, 1); + sg_set_buf(&sg[2], (void *)dbuf + 2, 1); + sg_set_buf(&sg[3], (void *)dbuf + 3, 1); + break; + } + + /* May allocate an indirect, so force it to allocate + * user addr */ + __kmalloc_fake = indirects + (xfers % RINGSIZE) * 4; + if (output) + err = virtqueue_add_outbuf(vq, sg, num_sg, dbuf, + GFP_KERNEL); + else + err = virtqueue_add_inbuf(vq, sg, num_sg, + dbuf, GFP_KERNEL); + + if (err == -ENOSPC) { + if (!virtqueue_enable_cb_delayed(vq)) + continue; + /* Swallow all notifies at once. */ + if (read(to_guest[0], buf, sizeof(buf)) < 1) + break; + + receives++; + virtqueue_disable_cb(vq); + continue; + } + + if (err) + errx(1, "virtqueue_add_in/outbuf: %i", err); + + xfers++; + virtqueue_kick(vq); + } + + /* Any extra? */ + while (finished != xfers) { + int *dbuf; + unsigned int len; + + /* Consume bufs. */ + dbuf = virtqueue_get_buf(vq, &len); + if (dbuf) { + if (len == 4) + assert(*dbuf == finished - 1); + else + assert(len == 0); + finished++; + continue; + } + + if (!virtqueue_enable_cb_delayed(vq)) + continue; + if (read(to_guest[0], buf, sizeof(buf)) < 1) + break; + + receives++; + virtqueue_disable_cb(vq); + } + + printf("Guest: notified %lu, pinged %lu\n", + gvdev.notifies, receives); + vring_del_virtqueue(vq); + return 0; + } +} + +int main(int argc, char *argv[]) +{ + struct virtio_device vdev; + struct virtqueue *vq; + struct vringh vrh; + struct scatterlist guest_sg[RINGSIZE], *sgs[2]; + struct iovec host_riov[2], host_wiov[2]; + struct vringh_iov riov, wiov; + struct vring_used_elem used[RINGSIZE]; + char buf[28]; + u16 head; + int err; + unsigned i; + void *ret; + bool (*getrange)(struct vringh *vrh, u64 addr, struct vringh_range *r); + bool fast_vringh = false, parallel = false; + + getrange = getrange_iov; + vdev.features[0] = 0; + + while (argv[1]) { + if (strcmp(argv[1], "--indirect") == 0) + vdev.features[0] |= (1 << VIRTIO_RING_F_INDIRECT_DESC); + else if (strcmp(argv[1], "--eventidx") == 0) + vdev.features[0] |= (1 << VIRTIO_RING_F_EVENT_IDX); + else if (strcmp(argv[1], "--slow-range") == 0) + getrange = getrange_slow; + else if (strcmp(argv[1], "--fast-vringh") == 0) + fast_vringh = true; + else if (strcmp(argv[1], "--parallel") == 0) + parallel = true; + else + errx(1, "Unknown arg %s", argv[1]); + argv++; + } + + if (parallel) + return parallel_test(vdev.features[0], getrange, fast_vringh); + + if (posix_memalign(&__user_addr_min, PAGE_SIZE, USER_MEM) != 0) + abort(); + __user_addr_max = __user_addr_min + USER_MEM; + memset(__user_addr_min, 0, vring_size(RINGSIZE, ALIGN)); + + /* Set up guest side. */ + vq = vring_new_virtqueue(0, RINGSIZE, ALIGN, &vdev, true, + __user_addr_min, + never_notify_host, never_callback_guest, + "guest vq"); + + /* Set up host side. */ + vring_init(&vrh.vring, RINGSIZE, __user_addr_min, ALIGN); + vringh_init_user(&vrh, vdev.features[0], RINGSIZE, true, + vrh.vring.desc, vrh.vring.avail, vrh.vring.used); + + /* No descriptor to get yet... */ + err = vringh_getdesc_user(&vrh, &riov, &wiov, getrange, &head); + if (err != 0) + errx(1, "vringh_getdesc_user: %i", err); + + /* Guest puts in a descriptor. */ + memcpy(__user_addr_max - 1, "a", 1); + sg_init_table(guest_sg, 1); + sg_set_buf(&guest_sg[0], __user_addr_max - 1, 1); + sg_init_table(guest_sg+1, 1); + sg_set_buf(&guest_sg[1], __user_addr_max - 3, 2); + sgs[0] = &guest_sg[0]; + sgs[1] = &guest_sg[1]; + + /* May allocate an indirect, so force it to allocate user addr */ + __kmalloc_fake = __user_addr_min + vring_size(RINGSIZE, ALIGN); + err = virtqueue_add_sgs(vq, sgs, 1, 1, &err, GFP_KERNEL); + if (err) + errx(1, "virtqueue_add_sgs: %i", err); + __kmalloc_fake = NULL; + + /* Host retreives it. */ + vringh_iov_init(&riov, host_riov, ARRAY_SIZE(host_riov)); + vringh_iov_init(&wiov, host_wiov, ARRAY_SIZE(host_wiov)); + + err = vringh_getdesc_user(&vrh, &riov, &wiov, getrange, &head); + if (err != 1) + errx(1, "vringh_getdesc_user: %i", err); + + assert(riov.used == 1); + assert(riov.iov[0].iov_base == __user_addr_max - 1); + assert(riov.iov[0].iov_len == 1); + if (getrange != getrange_slow) { + assert(wiov.used == 1); + assert(wiov.iov[0].iov_base == __user_addr_max - 3); + assert(wiov.iov[0].iov_len == 2); + } else { + assert(wiov.used == 2); + assert(wiov.iov[0].iov_base == __user_addr_max - 3); + assert(wiov.iov[0].iov_len == 1); + assert(wiov.iov[1].iov_base == __user_addr_max - 2); + assert(wiov.iov[1].iov_len == 1); + } + + err = vringh_iov_pull_user(&riov, buf, 5); + if (err != 1) + errx(1, "vringh_iov_pull_user: %i", err); + assert(buf[0] == 'a'); + assert(riov.i == 1); + assert(vringh_iov_pull_user(&riov, buf, 5) == 0); + + memcpy(buf, "bcdef", 5); + err = vringh_iov_push_user(&wiov, buf, 5); + if (err != 2) + errx(1, "vringh_iov_push_user: %i", err); + assert(memcmp(__user_addr_max - 3, "bc", 2) == 0); + assert(wiov.i == wiov.used); + assert(vringh_iov_push_user(&wiov, buf, 5) == 0); + + /* Host is done. */ + err = vringh_complete_user(&vrh, head, err); + if (err != 0) + errx(1, "vringh_complete_user: %i", err); + + /* Guest should see used token now. */ + __kfree_ignore_start = __user_addr_min + vring_size(RINGSIZE, ALIGN); + __kfree_ignore_end = __kfree_ignore_start + 1; + ret = virtqueue_get_buf(vq, &i); + if (ret != &err) + errx(1, "virtqueue_get_buf: %p", ret); + assert(i == 2); + + /* Guest puts in a huge descriptor. */ + sg_init_table(guest_sg, RINGSIZE); + for (i = 0; i < RINGSIZE; i++) { + sg_set_buf(&guest_sg[i], + __user_addr_max - USER_MEM/4, USER_MEM/4); + } + + /* Fill contents with recognisable garbage. */ + for (i = 0; i < USER_MEM/4; i++) + ((char *)__user_addr_max - USER_MEM/4)[i] = i; + + /* This will allocate an indirect, so force it to allocate user addr */ + __kmalloc_fake = __user_addr_min + vring_size(RINGSIZE, ALIGN); + err = virtqueue_add_outbuf(vq, guest_sg, RINGSIZE, &err, GFP_KERNEL); + if (err) + errx(1, "virtqueue_add_outbuf (large): %i", err); + __kmalloc_fake = NULL; + + /* Host picks it up (allocates new iov). */ + vringh_iov_init(&riov, host_riov, ARRAY_SIZE(host_riov)); + vringh_iov_init(&wiov, host_wiov, ARRAY_SIZE(host_wiov)); + + err = vringh_getdesc_user(&vrh, &riov, &wiov, getrange, &head); + if (err != 1) + errx(1, "vringh_getdesc_user: %i", err); + + assert(riov.max_num & VRINGH_IOV_ALLOCATED); + assert(riov.iov != host_riov); + if (getrange != getrange_slow) + assert(riov.used == RINGSIZE); + else + assert(riov.used == RINGSIZE * USER_MEM/4); + + assert(!(wiov.max_num & VRINGH_IOV_ALLOCATED)); + assert(wiov.used == 0); + + /* Pull data back out (in odd chunks), should be as expected. */ + for (i = 0; i < RINGSIZE * USER_MEM/4; i += 3) { + err = vringh_iov_pull_user(&riov, buf, 3); + if (err != 3 && i + err != RINGSIZE * USER_MEM/4) + errx(1, "vringh_iov_pull_user large: %i", err); + assert(buf[0] == (char)i); + assert(err < 2 || buf[1] == (char)(i + 1)); + assert(err < 3 || buf[2] == (char)(i + 2)); + } + assert(riov.i == riov.used); + vringh_iov_cleanup(&riov); + vringh_iov_cleanup(&wiov); + + /* Complete using multi interface, just because we can. */ + used[0].id = head; + used[0].len = 0; + err = vringh_complete_multi_user(&vrh, used, 1); + if (err) + errx(1, "vringh_complete_multi_user(1): %i", err); + + /* Free up those descriptors. */ + ret = virtqueue_get_buf(vq, &i); + if (ret != &err) + errx(1, "virtqueue_get_buf: %p", ret); + + /* Add lots of descriptors. */ + sg_init_table(guest_sg, 1); + sg_set_buf(&guest_sg[0], __user_addr_max - 1, 1); + for (i = 0; i < RINGSIZE; i++) { + err = virtqueue_add_outbuf(vq, guest_sg, 1, &err, GFP_KERNEL); + if (err) + errx(1, "virtqueue_add_outbuf (multiple): %i", err); + } + + /* Now get many, and consume them all at once. */ + vringh_iov_init(&riov, host_riov, ARRAY_SIZE(host_riov)); + vringh_iov_init(&wiov, host_wiov, ARRAY_SIZE(host_wiov)); + + for (i = 0; i < RINGSIZE; i++) { + err = vringh_getdesc_user(&vrh, &riov, &wiov, getrange, &head); + if (err != 1) + errx(1, "vringh_getdesc_user: %i", err); + used[i].id = head; + used[i].len = 0; + } + /* Make sure it wraps around ring, to test! */ + assert(vrh.vring.used->idx % RINGSIZE != 0); + err = vringh_complete_multi_user(&vrh, used, RINGSIZE); + if (err) + errx(1, "vringh_complete_multi_user: %i", err); + + /* Free those buffers. */ + for (i = 0; i < RINGSIZE; i++) { + unsigned len; + assert(virtqueue_get_buf(vq, &len) != NULL); + } + + /* Test weird (but legal!) indirect. */ + if (vdev.features[0] & (1 << VIRTIO_RING_F_INDIRECT_DESC)) { + char *data = __user_addr_max - USER_MEM/4; + struct vring_desc *d = __user_addr_max - USER_MEM/2; + struct vring vring; + + /* Force creation of direct, which we modify. */ + vdev.features[0] &= ~(1 << VIRTIO_RING_F_INDIRECT_DESC); + vq = vring_new_virtqueue(0, RINGSIZE, ALIGN, &vdev, true, + __user_addr_min, + never_notify_host, + never_callback_guest, + "guest vq"); + + sg_init_table(guest_sg, 4); + sg_set_buf(&guest_sg[0], d, sizeof(*d)*2); + sg_set_buf(&guest_sg[1], d + 2, sizeof(*d)*1); + sg_set_buf(&guest_sg[2], data + 6, 4); + sg_set_buf(&guest_sg[3], d + 3, sizeof(*d)*3); + + err = virtqueue_add_outbuf(vq, guest_sg, 4, &err, GFP_KERNEL); + if (err) + errx(1, "virtqueue_add_outbuf (indirect): %i", err); + + vring_init(&vring, RINGSIZE, __user_addr_min, ALIGN); + + /* They're used in order, but double-check... */ + assert(vring.desc[0].addr == (unsigned long)d); + assert(vring.desc[1].addr == (unsigned long)(d+2)); + assert(vring.desc[2].addr == (unsigned long)data + 6); + assert(vring.desc[3].addr == (unsigned long)(d+3)); + vring.desc[0].flags |= VRING_DESC_F_INDIRECT; + vring.desc[1].flags |= VRING_DESC_F_INDIRECT; + vring.desc[3].flags |= VRING_DESC_F_INDIRECT; + + /* First indirect */ + d[0].addr = (unsigned long)data; + d[0].len = 1; + d[0].flags = VRING_DESC_F_NEXT; + d[0].next = 1; + d[1].addr = (unsigned long)data + 1; + d[1].len = 2; + d[1].flags = 0; + + /* Second indirect */ + d[2].addr = (unsigned long)data + 3; + d[2].len = 3; + d[2].flags = 0; + + /* Third indirect */ + d[3].addr = (unsigned long)data + 10; + d[3].len = 5; + d[3].flags = VRING_DESC_F_NEXT; + d[3].next = 1; + d[4].addr = (unsigned long)data + 15; + d[4].len = 6; + d[4].flags = VRING_DESC_F_NEXT; + d[4].next = 2; + d[5].addr = (unsigned long)data + 21; + d[5].len = 7; + d[5].flags = 0; + + /* Host picks it up (allocates new iov). */ + vringh_iov_init(&riov, host_riov, ARRAY_SIZE(host_riov)); + vringh_iov_init(&wiov, host_wiov, ARRAY_SIZE(host_wiov)); + + err = vringh_getdesc_user(&vrh, &riov, &wiov, getrange, &head); + if (err != 1) + errx(1, "vringh_getdesc_user: %i", err); + + if (head != 0) + errx(1, "vringh_getdesc_user: head %i not 0", head); + + assert(riov.max_num & VRINGH_IOV_ALLOCATED); + if (getrange != getrange_slow) + assert(riov.used == 7); + else + assert(riov.used == 28); + err = vringh_iov_pull_user(&riov, buf, 29); + assert(err == 28); + + /* Data should be linear. */ + for (i = 0; i < err; i++) + assert(buf[i] == i); + vringh_iov_cleanup(&riov); + } + + /* Don't leak memory... */ + vring_del_virtqueue(vq); + free(__user_addr_min); + + return 0; +} |