diff options
| author | Paolo Abeni <pabeni@redhat.com> | 2026-05-19 13:07:52 +0300 |
|---|---|---|
| committer | Paolo Abeni <pabeni@redhat.com> | 2026-05-19 13:07:52 +0300 |
| commit | f2dfcc4b4bc28ba8ad45bce43ad76fa9575e27f5 (patch) | |
| tree | 6952169e227d89e7937f6145a0a63f8089256045 | |
| parent | 7af2a94f4dcf53a45f3be5870ebeb195402866d1 (diff) | |
| parent | 4e88fb3234c864b67338ca8d48ca515cf9992ab6 (diff) | |
| download | linux-f2dfcc4b4bc28ba8ad45bce43ad76fa9575e27f5.tar.xz | |
Merge branch 'eea-add-basic-driver-framework-for-alibaba-elastic-ethernet-adaptor'
Xuan Zhuo says:
====================
eea: Add basic driver framework for Alibaba Elastic Ethernet Adapter
Add a driver framework for EEA that will be available in the future.
This driver is currently quite minimal, implementing only fundamental
core functionalities. Key features include: I/O queue management via
adminq, basic PCI-layer operations, and essential RX/TX data
communication capabilities. It also supports the creation,
initialization, and management of network devices (netdev). Furthermore,
the ring structures for both I/O queues and adminq have been abstracted
into a simple, unified, and reusable library implementation,
facilitating future extension and maintenance.
====================
Link: https://patch.msgid.link/20260514095138.80680-1-xuanzhuo@linux.alibaba.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
| -rw-r--r-- | MAINTAINERS | 8 | ||||
| -rw-r--r-- | drivers/net/ethernet/Kconfig | 1 | ||||
| -rw-r--r-- | drivers/net/ethernet/Makefile | 1 | ||||
| -rw-r--r-- | drivers/net/ethernet/alibaba/Kconfig | 28 | ||||
| -rw-r--r-- | drivers/net/ethernet/alibaba/Makefile | 5 | ||||
| -rw-r--r-- | drivers/net/ethernet/alibaba/eea/Makefile | 9 | ||||
| -rw-r--r-- | drivers/net/ethernet/alibaba/eea/eea_adminq.c | 542 | ||||
| -rw-r--r-- | drivers/net/ethernet/alibaba/eea/eea_adminq.h | 83 | ||||
| -rw-r--r-- | drivers/net/ethernet/alibaba/eea/eea_desc.h | 138 | ||||
| -rw-r--r-- | drivers/net/ethernet/alibaba/eea/eea_ethtool.c | 273 | ||||
| -rw-r--r-- | drivers/net/ethernet/alibaba/eea/eea_ethtool.h | 48 | ||||
| -rw-r--r-- | drivers/net/ethernet/alibaba/eea/eea_net.c | 887 | ||||
| -rw-r--r-- | drivers/net/ethernet/alibaba/eea/eea_net.h | 198 | ||||
| -rw-r--r-- | drivers/net/ethernet/alibaba/eea/eea_pci.c | 744 | ||||
| -rw-r--r-- | drivers/net/ethernet/alibaba/eea/eea_pci.h | 73 | ||||
| -rw-r--r-- | drivers/net/ethernet/alibaba/eea/eea_ring.c | 249 | ||||
| -rw-r--r-- | drivers/net/ethernet/alibaba/eea/eea_ring.h | 99 | ||||
| -rw-r--r-- | drivers/net/ethernet/alibaba/eea/eea_rx.c | 814 | ||||
| -rw-r--r-- | drivers/net/ethernet/alibaba/eea/eea_tx.c | 500 |
19 files changed, 4700 insertions, 0 deletions
diff --git a/MAINTAINERS b/MAINTAINERS index edd161f2c62d..5db1a2923dd2 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -808,6 +808,14 @@ S: Maintained F: Documentation/i2c/busses/i2c-ali1563.rst F: drivers/i2c/busses/i2c-ali1563.c +ALIBABA ELASTIC ETHERNET ADAPTER DRIVER +M: Xuan Zhuo <xuanzhuo@linux.alibaba.com> +M: Wen Gu <guwen@linux.alibaba.com> +R: Philo Lu <lulie@linux.alibaba.com> +L: netdev@vger.kernel.org +S: Maintained +F: drivers/net/ethernet/alibaba/eea + ALIBABA ELASTIC RDMA DRIVER M: Cheng Xu <chengyou@linux.alibaba.com> M: Kai Shen <kaishen@linux.alibaba.com> diff --git a/drivers/net/ethernet/Kconfig b/drivers/net/ethernet/Kconfig index b8f70e2a1763..78c79ad7bba5 100644 --- a/drivers/net/ethernet/Kconfig +++ b/drivers/net/ethernet/Kconfig @@ -22,6 +22,7 @@ source "drivers/net/ethernet/aeroflex/Kconfig" source "drivers/net/ethernet/agere/Kconfig" source "drivers/net/ethernet/airoha/Kconfig" source "drivers/net/ethernet/alacritech/Kconfig" +source "drivers/net/ethernet/alibaba/Kconfig" source "drivers/net/ethernet/allwinner/Kconfig" source "drivers/net/ethernet/altera/Kconfig" source "drivers/net/ethernet/amazon/Kconfig" diff --git a/drivers/net/ethernet/Makefile b/drivers/net/ethernet/Makefile index 57344fec6ce0..bba55d9af387 100644 --- a/drivers/net/ethernet/Makefile +++ b/drivers/net/ethernet/Makefile @@ -12,6 +12,7 @@ obj-$(CONFIG_NET_VENDOR_ADI) += adi/ obj-$(CONFIG_NET_VENDOR_AGERE) += agere/ obj-$(CONFIG_NET_VENDOR_AIROHA) += airoha/ obj-$(CONFIG_NET_VENDOR_ALACRITECH) += alacritech/ +obj-$(CONFIG_NET_VENDOR_ALIBABA) += alibaba/ obj-$(CONFIG_NET_VENDOR_ALLWINNER) += allwinner/ obj-$(CONFIG_ALTERA_TSE) += altera/ obj-$(CONFIG_NET_VENDOR_AMAZON) += amazon/ diff --git a/drivers/net/ethernet/alibaba/Kconfig b/drivers/net/ethernet/alibaba/Kconfig new file mode 100644 index 000000000000..b8fd3c92fa4c --- /dev/null +++ b/drivers/net/ethernet/alibaba/Kconfig @@ -0,0 +1,28 @@ +# +# Alibaba network device configuration +# + +config NET_VENDOR_ALIBABA + 
bool "Alibaba Devices" + default y + help + If you have a network (Ethernet) device belonging to this class, say Y. + + Note that the answer to this question doesn't directly affect the + kernel: saying N will just cause the configurator to skip all + the questions about Alibaba devices. If you say Y, you will be asked + for your specific device in the following questions. + +if NET_VENDOR_ALIBABA + +config ALIBABA_EEA + tristate "Alibaba Elastic Ethernet Adapter support" + depends on PCI_MSI + depends on 64BIT + select PAGE_POOL + help + This driver supports Alibaba Elastic Ethernet Adapter. + + To compile this driver as a module, choose M here. + +endif #NET_VENDOR_ALIBABA diff --git a/drivers/net/ethernet/alibaba/Makefile b/drivers/net/ethernet/alibaba/Makefile new file mode 100644 index 000000000000..63a527e79ba7 --- /dev/null +++ b/drivers/net/ethernet/alibaba/Makefile @@ -0,0 +1,5 @@ +# +# Makefile for the Alibaba network device drivers. +# + +obj-$(CONFIG_ALIBABA_EEA) += eea/ diff --git a/drivers/net/ethernet/alibaba/eea/Makefile b/drivers/net/ethernet/alibaba/eea/Makefile new file mode 100644 index 000000000000..c38db22cca34 --- /dev/null +++ b/drivers/net/ethernet/alibaba/eea/Makefile @@ -0,0 +1,9 @@ + +obj-$(CONFIG_ALIBABA_EEA) += eea.o +eea-y := eea_ring.o \ + eea_net.o \ + eea_pci.o \ + eea_adminq.o \ + eea_ethtool.o \ + eea_tx.o \ + eea_rx.o diff --git a/drivers/net/ethernet/alibaba/eea/eea_adminq.c b/drivers/net/ethernet/alibaba/eea/eea_adminq.c new file mode 100644 index 000000000000..dfad1bdbc44d --- /dev/null +++ b/drivers/net/ethernet/alibaba/eea/eea_adminq.c @@ -0,0 +1,542 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Driver for Alibaba Elastic Ethernet Adapter. + * + * Copyright (C) 2025 Alibaba Inc. 
+ */ + +#include <linux/etherdevice.h> +#include <linux/iopoll.h> +#include <linux/utsname.h> +#include <linux/version.h> + +#include "eea_adminq.h" +#include "eea_net.h" +#include "eea_pci.h" +#include "eea_ring.h" + +#define EEA_AQ_CMD_CFG_QUERY ((0 << 8) | 0) + +#define EEA_AQ_CMD_QUEUE_CREATE ((1 << 8) | 0) +#define EEA_AQ_CMD_QUEUE_DESTROY_ALL ((1 << 8) | 1) + +#define EEA_AQ_CMD_HOST_INFO ((2 << 8) | 0) + +#define EEA_AQ_CMD_DEV_STATUS ((3 << 8) | 0) + +#define EEA_RING_DESC_F_AQ_PHASE (BIT(15) | BIT(7)) + +#define EEA_QUEUE_FLAGS_HW_SPLIT_HDR BIT(0) +#define EEA_QUEUE_FLAGS_SQCQ BIT(1) +#define EEA_QUEUE_FLAGS_HWTS BIT(2) + +struct eea_aq_create { + __le32 flags; + /* queue index. + * rx: 0 == qidx % 2 + * tx: 1 == qidx % 2 + */ + __le16 qidx; + /* the depth of the queue */ + __le16 depth; + /* 0: without SPLIT HDR + * 1: 128B + * 2: 256B + * 3: 512B + */ + u8 hdr_buf_size; + u8 sq_desc_size; + u8 cq_desc_size; + u8 reserve0; + /* The vector for the irq. rx,tx share the same vector */ + __le16 msix_vector; + __le16 reserve; + /* sq ring cfg. */ + __le32 sq_addr_low; + __le32 sq_addr_high; + /* cq ring cfg. Just valid when flags include EEA_QUEUE_FLAGS_SQCQ. 
*/ + __le32 cq_addr_low; + __le32 cq_addr_high; +}; + +struct eea_aq_queue_drv_status { + __le16 qidx; + + __le16 sq_head; + __le16 cq_head; + __le16 reserved; +}; + +#define EEA_OS_DISTRO 0 +#define EEA_DRV_TYPE 0 +#define EEA_OS_LINUX 1 +#define EEA_SPEC_VER_MAJOR 1 +#define EEA_SPEC_VER_MINOR 0 + +struct eea_aq_host_info_cfg { + __le16 os_type; + __le16 os_dist; + __le16 drv_type; + + __le16 kern_ver_major; + __le16 kern_ver_minor; + __le16 kern_ver_sub_minor; + + __le16 drv_ver_major; + __le16 drv_ver_minor; + __le16 drv_ver_sub_minor; + + __le16 spec_ver_major; + __le16 spec_ver_minor; + __le16 pci_bdf; + __le32 pci_domain; + + u8 os_ver_str[64]; + u8 isa_str[64]; +}; + +#define EEA_HINFO_MAX_REP_LEN 1024 +#define EEA_HINFO_REP_BAD 2 + +struct eea_aq_host_info_rep { + u8 op_code; + u8 has_reply; + u8 reply_str[EEA_HINFO_MAX_REP_LEN]; +}; + +static struct eea_ring *qid_to_ering(struct eea_net *enet, u32 qid) +{ + struct eea_ring *ering; + + if (qid % 2 == 0) + ering = enet->rx[qid / 2]->ering; + else + ering = enet->tx[qid / 2].ering; + + return ering; +} + +#define EEA_AQ_TIMEOUT_US (60 * 1000 * 1000) + +static void eea_device_broken(struct eea_net *enet) +{ + if (enet->adminq.broken) + return; + + eea_device_reset(enet->edev); + enet->adminq.broken = true; +} + +static int eea_adminq_submit(struct eea_net *enet, u16 cmd, + dma_addr_t req_addr, dma_addr_t res_addr, + u32 req_size, u32 res_size, u32 *reply_len) +{ + struct eea_aq_cdesc *cdesc; + struct eea_aq_desc *desc; + int ret; + + if (enet->adminq.broken) + return -EIO; + + desc = eea_ering_aq_alloc_desc(enet->adminq.ring); + + desc->classid = cmd >> 8; + desc->command = cmd & 0xff; + + desc->data_addr = cpu_to_le64(req_addr); + desc->data_len = cpu_to_le32(req_size); + + desc->reply_addr = cpu_to_le64(res_addr); + desc->reply_len = cpu_to_le32(res_size); + + /* for update flags */ + dma_wmb(); + + desc->flags = cpu_to_le16(enet->adminq.phase); + + eea_ering_sq_commit_desc(enet->adminq.ring); + + 
eea_ering_kick(enet->adminq.ring); + + ++enet->adminq.num; + + if ((enet->adminq.num % enet->adminq.ring->num) == 0) + enet->adminq.phase ^= EEA_RING_DESC_F_AQ_PHASE; + + ret = read_poll_timeout(eea_ering_cq_get_desc, cdesc, cdesc, 10, + EEA_AQ_TIMEOUT_US, false, enet->adminq.ring); + if (ret) { + netdev_err(enet->netdev, + "adminq exec timeout. cmd: %d reset device.\n", + cmd); + /* The device must be reset before unmapping buffers to avoid + * potential DMA writes after the memory is freed. + */ + eea_device_broken(enet); + return ret; + } + + /* Returns 0 on success, or a negative error code on failure. */ + ret = le32_to_cpu(cdesc->status); + + eea_ering_cq_ack_desc(enet->adminq.ring, 1); + + if (ret) + netdev_err(enet->netdev, + "adminq exec failed. cmd: %d ret %d\n", cmd, ret); + else + *reply_len = le32_to_cpu(cdesc->reply_len); + + return ret; +} + +static int eea_adminq_exec(struct eea_net *enet, u16 cmd, + void *req, u32 req_size, + void *res, u32 res_size, + u32 *reply) +{ + dma_addr_t req_addr = 0, res_addr = 0; + struct device *dma; + u32 reply_len = 0; + int ret; + + if (reply) + *reply = 0; + + dma = enet->edev->dma_dev; + + if (req) { + req_addr = dma_map_single(dma, req, req_size, DMA_TO_DEVICE); + if (unlikely(dma_mapping_error(dma, req_addr))) + return -ENOMEM; + } + + if (res) { + res_addr = dma_map_single(dma, res, res_size, DMA_FROM_DEVICE); + if (unlikely(dma_mapping_error(dma, res_addr))) { + ret = -ENOMEM; + goto err_unmap_req; + } + } + + mutex_lock(&enet->adminq.lock); + ret = eea_adminq_submit(enet, cmd, req_addr, res_addr, + req_size, res_size, &reply_len); + mutex_unlock(&enet->adminq.lock); + if (res) { + dma_unmap_single(dma, res_addr, res_size, DMA_FROM_DEVICE); + + if (ret) + memset(res, 0, res_size); + else if (res_size > reply_len) + memset(res + reply_len, 0, res_size - reply_len); + + if (reply) + *reply = reply_len; + } + +err_unmap_req: + if (req) + dma_unmap_single(dma, req_addr, req_size, DMA_TO_DEVICE); + + return ret; +} 
+ +void eea_destroy_adminq(struct eea_net *enet) +{ + struct eea_aq *aq; + + aq = &enet->adminq; + + if (aq->ring) { + eea_ering_free(aq->ring); + aq->ring = NULL; + aq->phase = 0; + } + + kfree(aq->q_req_buf); + kfree(aq->q_res_buf); + + aq->q_req_buf = NULL; + aq->q_res_buf = NULL; +} + +int eea_create_adminq(struct eea_net *enet, u32 qid) +{ + u32 db_size, q_size, num; + struct eea_ring *ering; + struct eea_aq *aq; + int err = -ENOMEM; + + num = enet->edev->rx_num + enet->edev->tx_num; + aq = &enet->adminq; + + ering = eea_ering_alloc(qid, 64, enet->edev, sizeof(struct eea_aq_desc), + sizeof(struct eea_aq_cdesc), "adminq"); + if (!ering) + return -ENOMEM; + + aq->ring = ering; + + err = eea_pci_active_aq(ering, qid / 2 + 1); + if (err) + goto err; + + aq->phase = BIT(7); + aq->num = 0; + + q_size = sizeof(*aq->q_req_buf) * num; + db_size = sizeof(*aq->q_res_buf) * num; + + aq->q_req_size = q_size; + aq->q_res_size = db_size; + + err = -ENOMEM; + + aq->q_req_buf = kzalloc(q_size, GFP_KERNEL); + if (!aq->q_req_buf) + goto err; + + aq->q_res_buf = kzalloc(db_size, GFP_KERNEL); + if (!aq->q_res_buf) + goto err; + + /* Before we set up the AQ, the device remains in an inactive state, so + * there will be no DMA operations. If the 'set up AQ' process fails, we + * can safely free the DMA-related memory. + */ + err = eea_pci_set_aq_up(enet->edev); + if (err) + goto err; + + aq->broken = false; + + mutex_init(&aq->lock); + + return 0; + +err: + eea_destroy_adminq(enet); + return err; +} + +int eea_adminq_query_cfg(struct eea_net *enet, struct eea_aq_cfg *cfg) +{ + return eea_adminq_exec(enet, EEA_AQ_CMD_CFG_QUERY, NULL, 0, cfg, + sizeof(*cfg), NULL); +} + +static void qcfg_fill(struct eea_aq_create *qcfg, struct eea_ring *ering, + u32 flags) +{ + qcfg->flags = cpu_to_le32(flags); + qcfg->qidx = cpu_to_le16(ering->index); + qcfg->depth = cpu_to_le16(ering->num); + + qcfg->hdr_buf_size = flags & EEA_QUEUE_FLAGS_HW_SPLIT_HDR ? 
1 : 0; + qcfg->sq_desc_size = ering->sq.desc_size; + qcfg->cq_desc_size = ering->cq.desc_size; + qcfg->msix_vector = cpu_to_le16(ering->msix_vec); + + qcfg->sq_addr_low = cpu_to_le32(lower_32_bits(ering->sq.dma_addr)); + qcfg->sq_addr_high = cpu_to_le32(upper_32_bits(ering->sq.dma_addr)); + + qcfg->cq_addr_low = cpu_to_le32(lower_32_bits(ering->cq.dma_addr)); + qcfg->cq_addr_high = cpu_to_le32(upper_32_bits(ering->cq.dma_addr)); +} + +int eea_adminq_create_q(struct eea_net *enet, u32 num, u32 flags) +{ + int i, db_size, q_size, err = -ENOMEM; + struct eea_net_cfg *cfg; + struct eea_ring *ering; + struct eea_aq *aq; + u32 reply_len; + + cfg = &enet->cfg; + aq = &enet->adminq; + + if (cfg->split_hdr) + flags |= EEA_QUEUE_FLAGS_HW_SPLIT_HDR; + + flags |= EEA_QUEUE_FLAGS_SQCQ; + flags |= EEA_QUEUE_FLAGS_HWTS; + + q_size = sizeof(*aq->q_req_buf) * num; + db_size = sizeof(*aq->q_res_buf) * num; + + for (i = 0; i < num; i++) { + ering = qid_to_ering(enet, i); + qcfg_fill(aq->q_req_buf + i, ering, flags); + } + + err = eea_adminq_exec(enet, EEA_AQ_CMD_QUEUE_CREATE, + aq->q_req_buf, q_size, + aq->q_res_buf, db_size, + &reply_len); + if (err) + return err; + + if (reply_len != db_size) { + eea_adminq_destroy_all_q(enet); + netdev_err(enet->netdev, "invalid reply len %u\n", reply_len); + return -EINVAL; + } + + for (i = 0; i < num; i++) { + ering = qid_to_ering(enet, i); + ering->db = eea_pci_db_addr(ering->edev, + le32_to_cpu(aq->q_res_buf[i])); + if (!ering->db) { + netdev_err(enet->netdev, "invalid db off %u\n", + le32_to_cpu(aq->q_res_buf[i])); + goto err; + } + } + + return err; + +err: + eea_adminq_destroy_all_q(enet); + for (i = 0; i < num; i++) { + ering = qid_to_ering(enet, i); + ering->db = NULL; + } + + return -EIO; +} + +int eea_adminq_destroy_all_q(struct eea_net *enet) +{ + int err; + + err = eea_adminq_exec(enet, EEA_AQ_CMD_QUEUE_DESTROY_ALL, NULL, 0, + NULL, 0, NULL); + if (err) { + /* The device must be reset before unmapping buffers to avoid + * potential 
DMA writes after the memory is freed. + */ + mutex_lock(&enet->adminq.lock); + eea_device_broken(enet); + mutex_unlock(&enet->adminq.lock); + + netdev_err(enet->netdev, "QUEUE_DESTROY fail: reset device.\n"); + } + + return err; +} + +/* The caller must ensure that both the 'rx' and 'tx' arrays are valid. */ +int eea_adminq_dev_status(struct eea_net *enet, + struct eea_aq_dev_status *dstatus) +{ + struct eea_aq_queue_drv_status *drv_status; + struct __eea_aq_dev_status *dev_status; + int err, i, io_num, size, q_num; + struct eea_ring *ering; + void *rep, *req; + + q_num = enet->cfg.rx_ring_num + enet->cfg.tx_ring_num + 1; + io_num = enet->cfg.rx_ring_num + enet->cfg.tx_ring_num; + + req = kcalloc(q_num, sizeof(struct eea_aq_queue_drv_status), + GFP_KERNEL); + if (!req) + return -ENOMEM; + + size = struct_size(dev_status, q_status, q_num); + + rep = kzalloc(size, GFP_KERNEL); + if (!rep) { + kfree(req); + return -ENOMEM; + } + + drv_status = req; + for (i = 0; i < io_num; ++i, ++drv_status) { + ering = qid_to_ering(enet, i); + drv_status->qidx = cpu_to_le16(i); + drv_status->cq_head = cpu_to_le16(ering->cq.head); + drv_status->sq_head = cpu_to_le16(ering->sq.head); + } + + drv_status->qidx = cpu_to_le16(i); + drv_status->cq_head = cpu_to_le16(enet->adminq.ring->cq.head); + drv_status->sq_head = cpu_to_le16(enet->adminq.ring->sq.head); + + err = eea_adminq_exec(enet, EEA_AQ_CMD_DEV_STATUS, req, + q_num * sizeof(struct eea_aq_queue_drv_status), + rep, size, NULL); + kfree(req); + if (err) { + kfree(rep); + return err; + } + + dstatus->num = q_num; + dstatus->status = rep; + + return 0; +} + +void eea_adminq_config_host_info(struct eea_net *enet) +{ + struct device *dev = enet->edev->dma_dev; + struct eea_aq_host_info_cfg *cfg; + struct eea_aq_host_info_rep *rep; + int rc = -ENOMEM; + + cfg = kzalloc(sizeof(*cfg), GFP_KERNEL); + if (!cfg) + return; + + rep = kzalloc(sizeof(*rep), GFP_KERNEL); + if (!rep) + goto err_free_cfg; + + cfg->os_type = 
cpu_to_le16(EEA_OS_LINUX); + cfg->os_dist = cpu_to_le16(EEA_OS_DISTRO); + cfg->drv_type = cpu_to_le16(EEA_DRV_TYPE); + + cfg->kern_ver_major = cpu_to_le16(LINUX_VERSION_MAJOR); + cfg->kern_ver_minor = cpu_to_le16(LINUX_VERSION_PATCHLEVEL); + cfg->kern_ver_sub_minor = cpu_to_le16(LINUX_VERSION_SUBLEVEL); + + cfg->drv_ver_major = cpu_to_le16(EEA_VER_MAJOR); + cfg->drv_ver_minor = cpu_to_le16(EEA_VER_MINOR); + cfg->drv_ver_sub_minor = cpu_to_le16(EEA_VER_SUB_MINOR); + + cfg->spec_ver_major = cpu_to_le16(EEA_SPEC_VER_MAJOR); + cfg->spec_ver_minor = cpu_to_le16(EEA_SPEC_VER_MINOR); + + cfg->pci_bdf = cpu_to_le16(eea_pci_bdf(enet->edev)); + cfg->pci_domain = cpu_to_le32(eea_pci_domain_nr(enet->edev)); + + strscpy(cfg->os_ver_str, utsname()->release, sizeof(cfg->os_ver_str)); + strscpy(cfg->isa_str, utsname()->machine, sizeof(cfg->isa_str)); + + rc = eea_adminq_exec(enet, EEA_AQ_CMD_HOST_INFO, + cfg, sizeof(*cfg), rep, sizeof(*rep), NULL); + + if (!rc) { + if (rep->op_code == EEA_HINFO_REP_BAD) + dev_warn(dev, "The hardware-driven state validation may be abnormal.\n"); + + if (rep->has_reply) { + char buf[EEA_HINFO_MAX_REP_LEN] = {0}; + + rep->reply_str[EEA_HINFO_MAX_REP_LEN - 1] = '\0'; + + string_escape_str(rep->reply_str, buf, sizeof(buf), + ESCAPE_NP, NULL); + + buf[EEA_HINFO_MAX_REP_LEN - 1] = '\0'; + + dev_warn(dev, "Device replied: %s\n", buf); + } + } + + kfree(rep); +err_free_cfg: + kfree(cfg); +} diff --git a/drivers/net/ethernet/alibaba/eea/eea_adminq.h b/drivers/net/ethernet/alibaba/eea/eea_adminq.h new file mode 100644 index 000000000000..0182f5641fcf --- /dev/null +++ b/drivers/net/ethernet/alibaba/eea/eea_adminq.h @@ -0,0 +1,83 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Driver for Alibaba Elastic Ethernet Adapter. + * + * Copyright (C) 2025 Alibaba Inc. 
+ */ + +#ifndef __EEA_ADMINQ_H__ +#define __EEA_ADMINQ_H__ + +struct eea_aq_cfg { + __le32 rx_depth_max; + __le32 rx_depth_def; + + __le32 tx_depth_max; + __le32 tx_depth_def; + + __le32 max_tso_size; + __le32 max_tso_segs; + + u8 mac[ETH_ALEN]; + __le16 status; + + __le16 mtu; + __le16 reserved0; + __le16 reserved1; + u8 reserved2; + u8 reserved3; + + __le16 reserved4; + __le16 reserved5; + __le16 reserved6; +}; + +struct eea_aq_queue_status { + __le16 qidx; +#define EEA_QUEUE_STATUS_OK 0 +#define EEA_QUEUE_STATUS_NEED_RESET 1 + __le16 status; +}; + +struct __eea_aq_dev_status { +#define EEA_LINK_DOWN_STATUS 0 +#define EEA_LINK_UP_STATUS 1 + __le16 link_status; + __le16 reserved; + + struct eea_aq_queue_status q_status[]; +}; + +struct eea_aq_dev_status { + u32 num; + struct __eea_aq_dev_status *status; +}; + +struct eea_aq { + struct eea_ring *ring; + u32 num; + bool broken; + u16 phase; + + /* lock for adminq exec */ + struct mutex lock; + + u32 q_req_size; + u32 q_res_size; + struct eea_aq_create *q_req_buf; + __le32 *q_res_buf; +}; + +struct eea_net; + +int eea_create_adminq(struct eea_net *enet, u32 qid); +void eea_destroy_adminq(struct eea_net *enet); + +int eea_adminq_query_cfg(struct eea_net *enet, struct eea_aq_cfg *cfg); + +int eea_adminq_create_q(struct eea_net *enet, u32 num, u32 flags); +int eea_adminq_destroy_all_q(struct eea_net *enet); +int eea_adminq_dev_status(struct eea_net *enet, + struct eea_aq_dev_status *dstatus); +void eea_adminq_config_host_info(struct eea_net *enet); +#endif diff --git a/drivers/net/ethernet/alibaba/eea/eea_desc.h b/drivers/net/ethernet/alibaba/eea/eea_desc.h new file mode 100644 index 000000000000..8d94a0f0f237 --- /dev/null +++ b/drivers/net/ethernet/alibaba/eea/eea_desc.h @@ -0,0 +1,138 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Driver for Alibaba Elastic Ethernet Adapter. + * + * Copyright (C) 2025 Alibaba Inc. 
+ */ + +#ifndef __EEA_DESC_H__ +#define __EEA_DESC_H__ + +#define EEA_DESC_TS_MASK GENMASK_ULL(47, 0) +#define EEA_DESC_TS(desc) (le64_to_cpu((desc)->ts) & EEA_DESC_TS_MASK) + +struct eea_aq_desc { + __le16 flags; + __le16 id; + __le16 reserved; + u8 classid; + u8 command; + __le64 data_addr; + __le64 reply_addr; + __le32 data_len; + __le32 reply_len; +}; + +struct eea_aq_cdesc { + __le16 flags; + __le16 id; +#define EEA_OK 0 +#define EEA_ERR 0xffffffff + __le32 status; + __le32 reply_len; + __le32 reserved1; + + __le64 reserved2; + __le64 reserved3; +}; + +struct eea_rx_desc_no_hdr { + __le16 flags; + __le16 id; + __le16 len; + __le16 reserved1; + + __le64 addr; +}; + +struct eea_rx_desc { + __le16 flags; + __le16 id; + __le16 len; + __le16 reserved1; + + __le64 addr; + + __le64 hdr_addr; + __le32 reserved2; + __le32 reserved3; +}; + +#define EEA_RX_CDESC_HDR_LEN_MASK GENMASK_ULL(9, 0) + +struct eea_rx_cdesc { +#define EEA_DESC_F_DATA_VALID BIT(6) +#define EEA_DESC_F_SPLIT_HDR BIT(5) + __le16 flags; + __le16 id; + __le16 len; +#define EEA_NET_PT_NONE 0 +#define EEA_NET_PT_IPv4 1 +#define EEA_NET_PT_TCPv4 2 +#define EEA_NET_PT_UDPv4 3 +#define EEA_NET_PT_IPv6 4 +#define EEA_NET_PT_TCPv6 5 +#define EEA_NET_PT_UDPv6 6 +#define EEA_NET_PT_IPv6_EX 7 +#define EEA_NET_PT_TCPv6_EX 8 +#define EEA_NET_PT_UDPv6_EX 9 + /* [9:0] is packet type. 
*/ + __le16 type; + + /* hw timestamp [0:47]: ts */ + __le64 ts; + + __le32 hash; + + /* 0-9: hdr_len split header + * 10-15: reserved1 + */ + __le16 len_ex; + __le16 reserved2; + + __le32 reserved3; + __le32 reserved4; +}; + +#define EEA_TX_GSO_NONE 0 +#define EEA_TX_GSO_TCPV4 1 +#define EEA_TX_GSO_TCPV6 4 +#define EEA_TX_GSO_UDP_L4 5 +#define EEA_TX_GSO_ECN 0x80 + +struct eea_tx_desc { +#define EEA_DESC_F_DO_CSUM BIT(6) + __le16 flags; + __le16 id; + __le16 len; + __le16 reserved1; + + __le64 addr; + + __le16 csum_start; + __le16 csum_offset; + u8 gso_type; + u8 reserved2; + __le16 gso_size; + __le64 reserved3; +}; + +struct eea_tx_cdesc { + __le16 flags; + __le16 id; + __le16 len; + __le16 reserved1; + + /* hw timestamp [0:47]: ts */ + __le64 ts; +}; + +#define EEA_DB_FLAGS_OFF 0 +#define EEA_DB_IDX_OFF (2 * 8) +#define EEA_DB_TX_CQ_HEAD_OFF (4 * 8) +#define EEA_DB_RX_CQ_HEAD_OFF (6 * 8) + +#define EEA_IDX_PRESENT BIT(0) +#define EEA_IRQ_MASK BIT(1) +#define EEA_IRQ_UNMASK BIT(2) +#endif diff --git a/drivers/net/ethernet/alibaba/eea/eea_ethtool.c b/drivers/net/ethernet/alibaba/eea/eea_ethtool.c new file mode 100644 index 000000000000..479779aa2dd8 --- /dev/null +++ b/drivers/net/ethernet/alibaba/eea/eea_ethtool.c @@ -0,0 +1,273 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Driver for Alibaba Elastic Ethernet Adapter. + * + * Copyright (C) 2025 Alibaba Inc. 
+ */ + +#include <linux/ethtool.h> +#include <linux/ethtool_netlink.h> +#include <linux/rtnetlink.h> + +#include "eea_adminq.h" +#include "eea_net.h" +#include "eea_pci.h" + +struct eea_stat_desc { + char desc[ETH_GSTRING_LEN]; + size_t offset; +}; + +#define EEA_TX_STAT(m) {#m, offsetof(struct eea_tx_stats, m)} +#define EEA_RX_STAT(m) {#m, offsetof(struct eea_rx_stats, m)} + +static const struct eea_stat_desc eea_rx_stats_desc[] = { + EEA_RX_STAT(descs), + EEA_RX_STAT(kicks), +}; + +static const struct eea_stat_desc eea_tx_stats_desc[] = { + EEA_TX_STAT(descs), + EEA_TX_STAT(kicks), +}; + +#define EEA_TX_STATS_LEN ARRAY_SIZE(eea_tx_stats_desc) +#define EEA_RX_STATS_LEN ARRAY_SIZE(eea_rx_stats_desc) + +static void eea_get_drvinfo(struct net_device *netdev, + struct ethtool_drvinfo *info) +{ + struct eea_net *enet = netdev_priv(netdev); + struct eea_device *edev = enet->edev; + + strscpy(info->driver, KBUILD_MODNAME, sizeof(info->driver)); + strscpy(info->bus_info, eea_pci_name(edev), sizeof(info->bus_info)); +} + +static void eea_get_ringparam(struct net_device *netdev, + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) +{ + struct eea_net *enet = netdev_priv(netdev); + + ring->rx_max_pending = enet->cfg_hw.rx_ring_depth; + ring->tx_max_pending = enet->cfg_hw.tx_ring_depth; + ring->rx_pending = enet->cfg.rx_ring_depth; + ring->tx_pending = enet->cfg.tx_ring_depth; + + kernel_ring->tcp_data_split = enet->cfg.split_hdr ? 
+ ETHTOOL_TCP_DATA_SPLIT_ENABLED : + ETHTOOL_TCP_DATA_SPLIT_DISABLED; +} + +static int eea_set_ringparam(struct net_device *netdev, + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) +{ + struct eea_net *enet = netdev_priv(netdev); + struct eea_net_init_ctx ctx; + bool need_update = false; + struct eea_net_cfg *cfg; + bool sh; + + if (ring->rx_pending < EEA_NET_IO_RING_DEPTH_MIN || + ring->tx_pending < EEA_NET_IO_RING_DEPTH_MIN) + return -EINVAL; + + if (!is_power_of_2(ring->rx_pending) || + !is_power_of_2(ring->tx_pending)) + return -EINVAL; + + eea_init_ctx(enet, &ctx); + + cfg = &ctx.cfg; + + if (ring->rx_pending != cfg->rx_ring_depth) + need_update = true; + + if (ring->tx_pending != cfg->tx_ring_depth) + need_update = true; + + sh = false; + + switch (kernel_ring->tcp_data_split) { + case ETHTOOL_TCP_DATA_SPLIT_ENABLED: + sh = true; + break; + + case ETHTOOL_TCP_DATA_SPLIT_DISABLED: + sh = false; + break; + + case ETHTOOL_TCP_DATA_SPLIT_UNKNOWN: + sh = !!cfg->split_hdr; + break; + } + + if (sh != !!(cfg->split_hdr)) + need_update = true; + + if (!need_update) + return 0; + + cfg->rx_ring_depth = ring->rx_pending; + cfg->tx_ring_depth = ring->tx_pending; + + /* By default, enet->cfg_hw.split_hdr is 128. */ + cfg->split_hdr = sh ? 
enet->cfg_hw.split_hdr : 0; + + return eea_reset_hw_resources(enet, &ctx); +} + +static int eea_set_channels(struct net_device *netdev, + struct ethtool_channels *channels) +{ + struct eea_net *enet = netdev_priv(netdev); + u16 queue_pairs = channels->combined_count; + struct eea_net_init_ctx ctx; + struct eea_net_cfg *cfg; + + eea_init_ctx(enet, &ctx); + + cfg = &ctx.cfg; + + cfg->rx_ring_num = queue_pairs; + cfg->tx_ring_num = queue_pairs; + + return eea_reset_hw_resources(enet, &ctx); +} + +static void eea_get_channels(struct net_device *netdev, + struct ethtool_channels *channels) +{ + struct eea_net *enet = netdev_priv(netdev); + + channels->combined_count = enet->cfg.rx_ring_num; + channels->max_combined = enet->cfg_hw.rx_ring_num; +} + +static void eea_get_strings(struct net_device *netdev, u32 stringset, u8 *data) +{ + struct eea_net *enet = netdev_priv(netdev); + u8 *p = data; + u32 i, j; + + if (stringset != ETH_SS_STATS) + return; + + for (i = 0; i < enet->cfg.rx_ring_num; i++) { + for (j = 0; j < EEA_RX_STATS_LEN; j++) + ethtool_sprintf(&p, "rx%u_%s", i, + eea_rx_stats_desc[j].desc); + } + + for (i = 0; i < enet->cfg.tx_ring_num; i++) { + for (j = 0; j < EEA_TX_STATS_LEN; j++) + ethtool_sprintf(&p, "tx%u_%s", i, + eea_tx_stats_desc[j].desc); + } +} + +static int eea_get_sset_count(struct net_device *netdev, int sset) +{ + struct eea_net *enet = netdev_priv(netdev); + + if (sset != ETH_SS_STATS) + return -EOPNOTSUPP; + + return enet->cfg.rx_ring_num * EEA_RX_STATS_LEN + + enet->cfg.tx_ring_num * EEA_TX_STATS_LEN; +} + +static void eea_stats_fill_for_q(struct u64_stats_sync *syncp, u32 num, + const struct eea_stat_desc *desc, + u64 *data, u32 idx) +{ + void *stats_base = syncp; + u32 start, i; + + do { + start = u64_stats_fetch_begin(syncp); + for (i = 0; i < num; i++) + data[idx + i] = + u64_stats_read(stats_base + desc[i].offset); + + } while (u64_stats_fetch_retry(syncp, start)); + + BUILD_BUG_ON(offsetof(struct eea_tx_stats, syncp)); + 
BUILD_BUG_ON(offsetof(struct eea_rx_stats, syncp)); +} + +static void eea_get_ethtool_stats(struct net_device *netdev, + struct ethtool_stats *stats, u64 *data) +{ + struct eea_net *enet = netdev_priv(netdev); + u32 i, idx = 0; + + ASSERT_RTNL(); + + if (enet->rx) { + for (i = 0; i < enet->cfg.rx_ring_num; i++) { + struct eea_net_rx *rx = enet->rx[i]; + + eea_stats_fill_for_q(&rx->stats.syncp, EEA_RX_STATS_LEN, + eea_rx_stats_desc, data, idx); + + idx += EEA_RX_STATS_LEN; + } + } + + if (enet->tx) { + for (i = 0; i < enet->cfg.tx_ring_num; i++) { + struct eea_net_tx *tx = &enet->tx[i]; + + eea_stats_fill_for_q(&tx->stats.syncp, EEA_TX_STATS_LEN, + eea_tx_stats_desc, data, idx); + + idx += EEA_TX_STATS_LEN; + } + } +} + +void eea_update_rx_stats(struct eea_rx_stats *rx_stats, + struct eea_rx_ctx_stats *stats) +{ + u64_stats_update_begin(&rx_stats->syncp); + u64_stats_add(&rx_stats->descs, stats->descs); + u64_stats_add(&rx_stats->packets, stats->packets); + u64_stats_add(&rx_stats->bytes, stats->bytes); + u64_stats_add(&rx_stats->drops, stats->drops); + u64_stats_add(&rx_stats->split_hdr_bytes, stats->split_hdr_bytes); + u64_stats_add(&rx_stats->split_hdr_packets, stats->split_hdr_packets); + u64_stats_add(&rx_stats->length_errors, stats->length_errors); + u64_stats_add(&rx_stats->kicks, stats->kicks); + u64_stats_update_end(&rx_stats->syncp); +} + +static int eea_get_link_ksettings(struct net_device *netdev, + struct ethtool_link_ksettings *cmd) +{ + struct eea_net *enet = netdev_priv(netdev); + + cmd->base.speed = enet->speed; + cmd->base.duplex = enet->duplex; + cmd->base.port = PORT_OTHER; + + return 0; +} + +const struct ethtool_ops eea_ethtool_ops = { + .supported_ring_params = ETHTOOL_RING_USE_TCP_DATA_SPLIT, + .get_drvinfo = eea_get_drvinfo, + .get_link = ethtool_op_get_link, + .get_ringparam = eea_get_ringparam, + .set_ringparam = eea_set_ringparam, + .set_channels = eea_set_channels, + .get_channels = eea_get_channels, + .get_strings = eea_get_strings, + 
.get_sset_count = eea_get_sset_count, + .get_ethtool_stats = eea_get_ethtool_stats, + .get_link_ksettings = eea_get_link_ksettings, +}; diff --git a/drivers/net/ethernet/alibaba/eea/eea_ethtool.h b/drivers/net/ethernet/alibaba/eea/eea_ethtool.h new file mode 100644 index 000000000000..08c824a116de --- /dev/null +++ b/drivers/net/ethernet/alibaba/eea/eea_ethtool.h @@ -0,0 +1,48 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Driver for Alibaba Elastic Ethernet Adapter. + * + * Copyright (C) 2025 Alibaba Inc. + */ + +#ifndef __EEA_ETHTOOL_H__ +#define __EEA_ETHTOOL_H__ + +struct eea_tx_stats { + struct u64_stats_sync syncp; + u64_stats_t descs; + u64_stats_t packets; + u64_stats_t bytes; + u64_stats_t drops; + u64_stats_t kicks; +}; + +struct eea_rx_ctx_stats { + u64 descs; + u64 packets; + u64 bytes; + u64 drops; + u64 split_hdr_bytes; + u64 split_hdr_packets; + u64 kicks; + u64 length_errors; +}; + +struct eea_rx_stats { + struct u64_stats_sync syncp; + u64_stats_t descs; + u64_stats_t packets; + u64_stats_t bytes; + u64_stats_t drops; + u64_stats_t kicks; + u64_stats_t split_hdr_bytes; + u64_stats_t split_hdr_packets; + + u64_stats_t length_errors; +}; + +void eea_update_rx_stats(struct eea_rx_stats *rx_stats, + struct eea_rx_ctx_stats *stats); + +extern const struct ethtool_ops eea_ethtool_ops; +#endif diff --git a/drivers/net/ethernet/alibaba/eea/eea_net.c b/drivers/net/ethernet/alibaba/eea/eea_net.c new file mode 100644 index 000000000000..63e68580de94 --- /dev/null +++ b/drivers/net/ethernet/alibaba/eea/eea_net.c @@ -0,0 +1,887 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Driver for Alibaba Elastic Ethernet Adapter. + * + * Copyright (C) 2025 Alibaba Inc. 
+ */ + +#include <linux/etherdevice.h> +#include <linux/module.h> +#include <linux/netdevice.h> +#include <linux/rtnetlink.h> +#include <net/netdev_queues.h> + +#include "eea_adminq.h" +#include "eea_net.h" +#include "eea_pci.h" +#include "eea_ring.h" + +#define EEA_SPLIT_HDR_SIZE ALIGN(128, L1_CACHE_BYTES) + +static irqreturn_t eea_irq_handler(int irq, void *data) +{ + struct eea_irq_blk *blk = data; + + napi_schedule_irqoff(&blk->napi); + + return IRQ_HANDLED; +} + +static void eea_free_irq_blk(struct eea_net *enet) +{ + struct eea_irq_blk *blk; + u32 num; + int i; + + if (!enet->irq_blks) + return; + + num = enet->edev->rx_num; + + for (i = 0; i < num; i++) { + blk = &enet->irq_blks[i]; + + if (blk->ready) + eea_pci_free_irq(blk); + + blk->ready = false; + } + + kvfree(enet->irq_blks); + enet->irq_blks = NULL; +} + +/* The driver will always attempt to allocate IRQ blocks based on the maximum + * possible queue num. + */ +static int eea_alloc_irq_blks(struct eea_net *enet) +{ + struct eea_device *edev = enet->edev; + struct eea_irq_blk *blk, *irq_blks; + int i, err, num; + + num = enet->edev->rx_num; + + irq_blks = kvcalloc(num, sizeof(*blk), GFP_KERNEL); + if (!irq_blks) + return -ENOMEM; + + enet->irq_blks = irq_blks; + + for (i = 0; i < num; i++) { + blk = &irq_blks[i]; + blk->idx = i; + + /* vec 0 is for error notify. 
*/ + blk->msix_vec = i + 1; + + err = eea_pci_request_irq(edev, blk, eea_irq_handler); + if (err) + goto err_free_irq_blk; + + blk->ready = true; + } + + return 0; + +err_free_irq_blk: + eea_free_irq_blk(enet); + return err; +} + +static int eea_update_queues(struct eea_net *enet) +{ + return netif_set_real_num_queues(enet->netdev, enet->cfg.tx_ring_num, + enet->cfg.rx_ring_num); +} + +void eea_init_ctx(struct eea_net *enet, struct eea_net_init_ctx *ctx) +{ + memset(ctx, 0, sizeof(*ctx)); + + ctx->netdev = enet->netdev; + ctx->edev = enet->edev; + ctx->cfg = enet->cfg; +} + +static void eea_bind_q_and_cfg(struct eea_net *enet, + struct eea_net_init_ctx *ctx) +{ + struct eea_irq_blk *blk; + struct eea_net_rx *rx; + struct eea_net_tx *tx; + int i; + + /* Since 'ndo_get_stats64' is not called in softirq context, there is no + * need to use 'spin_lock_bh'. + */ + spin_lock(&enet->stats_lock); + + enet->cfg = ctx->cfg; + enet->rx = ctx->rx; + enet->tx = ctx->tx; + + for (i = 0; i < ctx->cfg.rx_ring_num; i++) { + blk = &enet->irq_blks[i]; + + rx = ctx->rx[i]; + tx = &ctx->tx[i]; + + rx->enet = enet; + rx->napi = &blk->napi; + rx->ering->msix_vec = blk->msix_vec; + + tx->enet = enet; + tx->ering->msix_vec = blk->msix_vec; + + blk->rx = rx; + } + + spin_unlock(&enet->stats_lock); +} + +static void eea_unbind_q_and_cfg(struct eea_net *enet, + struct eea_net_init_ctx *ctx) +{ + struct eea_irq_blk *blk; + struct eea_net_rx *rx; + int i; + + spin_lock(&enet->stats_lock); + + ctx->cfg = enet->cfg; + ctx->rx = enet->rx; + ctx->tx = enet->tx; + + enet->rx = NULL; + enet->tx = NULL; + + for (i = 0; i < ctx->cfg.rx_ring_num; i++) { + blk = &enet->irq_blks[i]; + + rx = ctx->rx[i]; + + rx->napi = NULL; + + blk->rx = NULL; + } + + spin_unlock(&enet->stats_lock); +} + +static void eea_free_rxtx_q_mem(struct eea_net_init_ctx *ctx) +{ + struct eea_net_rx *rx; + struct eea_net_tx *tx; + int i; + + for (i = 0; i < ctx->cfg.rx_ring_num; i++) { + rx = ctx->rx[i]; + tx = &ctx->tx[i]; + + 
eea_free_rx(rx, &ctx->cfg); + eea_free_tx(tx, &ctx->cfg); + } + + kvfree(ctx->rx); + kvfree(ctx->tx); +} + +/* alloc tx/rx: struct, ring, meta, pp, napi */ +static int eea_alloc_rxtx_q_mem(struct eea_net_init_ctx *ctx) +{ + struct eea_net_rx *rx; + struct eea_net_tx *tx; + int err, i; + + ctx->tx = kvcalloc(ctx->cfg.tx_ring_num, sizeof(*ctx->tx), GFP_KERNEL); + if (!ctx->tx) + return -ENOMEM; + + ctx->rx = kvcalloc(ctx->cfg.rx_ring_num, sizeof(*ctx->rx), GFP_KERNEL); + if (!ctx->rx) + goto err_free_tx; + + ctx->cfg.rx_sq_desc_size = sizeof(struct eea_rx_desc); + ctx->cfg.rx_cq_desc_size = sizeof(struct eea_rx_cdesc); + ctx->cfg.tx_sq_desc_size = sizeof(struct eea_tx_desc); + ctx->cfg.tx_cq_desc_size = sizeof(struct eea_tx_cdesc); + + /* ethtool may config this. */ + if (!ctx->cfg.split_hdr) + ctx->cfg.rx_sq_desc_size = sizeof(struct eea_rx_desc_no_hdr); + + for (i = 0; i < ctx->cfg.rx_ring_num; i++) { + rx = eea_alloc_rx(ctx, i); + if (!rx) + goto err_free; + + ctx->rx[i] = rx; + + tx = ctx->tx + i; + err = eea_alloc_tx(ctx, tx, i); + if (err) + goto err_free; + } + + return 0; + +err_free: + for (i = 0; i < ctx->cfg.rx_ring_num; i++) { + rx = ctx->rx[i]; + tx = ctx->tx + i; + + eea_free_rx(rx, &ctx->cfg); + eea_free_tx(tx, &ctx->cfg); + } + + kvfree(ctx->rx); + +err_free_tx: + kvfree(ctx->tx); + return -ENOMEM; +} + +static int eea_hw_active_ring(struct eea_net *enet) +{ + return eea_adminq_create_q(enet, enet->cfg.rx_ring_num + + enet->cfg.tx_ring_num, 0); +} + +static int eea_hw_unactive_ring(struct eea_net *enet) +{ + int err; + + err = eea_adminq_destroy_all_q(enet); + if (err) + netdev_warn(enet->netdev, "unactive rxtx ring failed.\n"); + + return err; +} + +/* stop rx napi, stop tx queue. 
*/ +static void eea_stop_rxtx(struct net_device *netdev) +{ + struct eea_net *enet = netdev_priv(netdev); + int i; + + netif_tx_disable(netdev); + + for (i = 0; i < enet->cfg.rx_ring_num; i++) + enet_rx_stop(enet->rx[i]); + + netif_carrier_off(netdev); +} + +static void eea_start_rxtx(struct eea_net *enet) +{ + int i; + + for (i = 0; i < enet->cfg.rx_ring_num; i++) + enet_rx_start(enet->rx[i]); + + netif_tx_start_all_queues(enet->netdev); + netif_carrier_on(enet->netdev); + + enet->started = true; +} + +static int eea_netdev_stop(struct net_device *netdev) +{ + struct eea_net *enet = netdev_priv(netdev); + struct eea_net_init_ctx ctx; + + /* This function can be called during device anomaly recovery. To + * prevent duplicate stop operations, the `started` flag is introduced + * for checking. + */ + + if (!enet->started) { + netdev_warn(netdev, "eea netdev stop: but dev is not started.\n"); + return 0; + } + + eea_init_ctx(enet, &ctx); + + eea_stop_rxtx(netdev); + eea_hw_unactive_ring(enet); + eea_unbind_q_and_cfg(enet, &ctx); + eea_free_rxtx_q_mem(&ctx); + + enet->started = false; + + return 0; +} + +static int eea_netdev_open(struct net_device *netdev) +{ + struct eea_net *enet = netdev_priv(netdev); + struct eea_net_init_ctx ctx; + int err; + + if (enet->link_err) { + netdev_err(netdev, "netdev open err, because link error: %d\n", + enet->link_err); + return -EBUSY; + } + + eea_init_ctx(enet, &ctx); + + err = eea_alloc_rxtx_q_mem(&ctx); + if (err) + goto err_done; + + eea_bind_q_and_cfg(enet, &ctx); + + err = eea_update_queues(enet); + if (err) + goto err_free_q; + + err = eea_hw_active_ring(enet); + if (err) + goto err_free_q; + + eea_start_rxtx(enet); + + return 0; + +err_free_q: + eea_unbind_q_and_cfg(enet, &ctx); + eea_free_rxtx_q_mem(&ctx); + +err_done: + return err; +} + +/* Statistics may be reset to zero upon device reset. This is expected behavior + * for now and will be addressed in the future. 
+ */ +static void eea_stats(struct net_device *netdev, struct rtnl_link_stats64 *tot) +{ + struct eea_net *enet = netdev_priv(netdev); + u64 packets, bytes, drop, lerr; + u32 start; + int i; + + spin_lock(&enet->stats_lock); + + if (enet->rx) { + for (i = 0; i < enet->cfg.rx_ring_num; i++) { + struct eea_net_rx *rx = enet->rx[i]; + + do { + start = u64_stats_fetch_begin(&rx->stats.syncp); + packets = u64_stats_read(&rx->stats.packets); + bytes = u64_stats_read(&rx->stats.bytes); + drop = u64_stats_read(&rx->stats.drops); + lerr = u64_stats_read(&rx->stats.length_errors); + } while (u64_stats_fetch_retry(&rx->stats.syncp, + start)); + + tot->rx_packets += packets; + tot->rx_bytes += bytes; + tot->rx_dropped += drop; + tot->rx_length_errors += lerr; + tot->rx_errors += lerr; + } + } + + if (enet->tx) { + for (i = 0; i < enet->cfg.tx_ring_num; i++) { + struct eea_net_tx *tx = &enet->tx[i]; + + do { + start = u64_stats_fetch_begin(&tx->stats.syncp); + packets = u64_stats_read(&tx->stats.packets); + bytes = u64_stats_read(&tx->stats.bytes); + drop = u64_stats_read(&tx->stats.drops); + } while (u64_stats_fetch_retry(&tx->stats.syncp, + start)); + + tot->tx_packets += packets; + tot->tx_bytes += bytes; + tot->tx_dropped += drop; + } + } + + spin_unlock(&enet->stats_lock); +} + +/* resources: ring, buffers, irq */ +int eea_reset_hw_resources(struct eea_net *enet, struct eea_net_init_ctx *ctx) +{ + struct eea_net_init_ctx ctx_old = {0}; + int err, error; + + if (!netif_running(enet->netdev) || !enet->started) { + spin_lock(&enet->stats_lock); + enet->cfg = ctx->cfg; + spin_unlock(&enet->stats_lock); + return 0; + } + + err = eea_alloc_rxtx_q_mem(ctx); + if (err) { + netdev_warn(enet->netdev, + "eea reset: alloc q failed. stop reset. 
err %d\n", + err); + return err; + } + + eea_stop_rxtx(enet->netdev); + eea_hw_unactive_ring(enet); + + eea_unbind_q_and_cfg(enet, &ctx_old); + eea_bind_q_and_cfg(enet, ctx); + + err = eea_update_queues(enet); + if (err) { + netdev_err(enet->netdev, + "eea reset: set real num queues failed. err %d\n", + err); + goto err_bind_old; + } + + err = eea_hw_active_ring(enet); + if (err) { + netdev_err(enet->netdev, "eea reset: active new ring. err %d\n", + err); + eea_unbind_q_and_cfg(enet, ctx); + goto err_free_q; + } + + eea_start_rxtx(enet); + eea_free_rxtx_q_mem(&ctx_old); + return 0; + +err_bind_old: + eea_unbind_q_and_cfg(enet, ctx); + eea_bind_q_and_cfg(enet, &ctx_old); + error = eea_hw_active_ring(enet); + if (error) { + netdev_err(enet->netdev, "eea reset: active old ring. err %d\n", + error); + eea_unbind_q_and_cfg(enet, &ctx_old); + err = error; + goto err_free_q; + } + + eea_start_rxtx(enet); + eea_free_rxtx_q_mem(ctx); + return err; + +err_free_q: + + /* An exception occurred at the hardware level, and there's not much we + * can do about it -- we can only release the resources first. + */ + eea_free_rxtx_q_mem(ctx); + eea_free_rxtx_q_mem(&ctx_old); + enet->started = false; + return err; +} + +int eea_queues_check_and_reset(struct eea_device *edev) +{ + struct eea_aq_dev_status dstatus = {0}; + struct eea_aq_queue_status *qstatus; + struct eea_aq_queue_status *qs; + struct eea_net_init_ctx ctx; + bool need_reset = false; + int i, err = 0; + + rtnl_lock(); + + if (!netif_running(edev->enet->netdev)) + goto err_unlock; + + /* Maybe stopped by ha. */ + if (!edev->enet->started || edev->enet->link_err) + goto err_unlock; + + err = eea_adminq_dev_status(edev->enet, &dstatus); + if (err) { + netdev_warn(edev->enet->netdev, "query queue status failed.\n"); + goto err_unlock; + } + + if (le16_to_cpu(dstatus.status->link_status) == EEA_LINK_DOWN_STATUS) { + /* The device is broken, can not be up. 
*/ + eea_netdev_stop(edev->enet->netdev); + edev->enet->link_err = EEA_LINK_ERR_LINK_DOWN; + netdev_warn(edev->enet->netdev, "device link is down. stop device.\n"); + goto err_free; + } + + qstatus = dstatus.status->q_status; + + for (i = 0; i < dstatus.num; ++i) { + qs = &qstatus[i]; + + if (le16_to_cpu(qs->status) == EEA_QUEUE_STATUS_NEED_RESET) { + netdev_warn(edev->enet->netdev, + "queue status: queue %u needs to reset\n", + le16_to_cpu(qs->qidx)); + need_reset = true; + } + } + + if (need_reset) { + eea_init_ctx(edev->enet, &ctx); + err = eea_reset_hw_resources(edev->enet, &ctx); + } + +err_free: + kfree(dstatus.status); + +err_unlock: + rtnl_unlock(); + return err; +} + +static int eea_update_cfg(struct eea_net *enet, + struct eea_device *edev, + struct eea_aq_cfg *hwcfg) +{ + u32 rx_max = le32_to_cpu(hwcfg->rx_depth_max); + u32 tx_max = le32_to_cpu(hwcfg->tx_depth_max); + u32 rx_def = le32_to_cpu(hwcfg->rx_depth_def); + u32 tx_def = le32_to_cpu(hwcfg->tx_depth_def); + + /* Now, we assert that the rx ring num is equal to the tx ring num. 
*/ + if (edev->rx_num != edev->tx_num) { + dev_err(edev->dma_dev, "Inconsistent ring num: RX %u, TX %u\n", + edev->rx_num, edev->tx_num); + return -EINVAL; + } + + if (rx_max > EEA_NET_IO_HW_RING_DEPTH_MAX || + rx_max < EEA_NET_IO_HW_RING_DEPTH_MIN || + tx_max > EEA_NET_IO_HW_RING_DEPTH_MAX || + tx_max < EEA_NET_IO_HW_RING_DEPTH_MIN) { + dev_err(edev->dma_dev, "Invalid HW max depth: RX %u, TX %u\n", + rx_max, tx_max); + return -EINVAL; + } + + if (rx_def > rx_max || + tx_def > tx_max || + rx_def < EEA_NET_IO_HW_RING_DEPTH_MIN || + tx_def < EEA_NET_IO_HW_RING_DEPTH_MIN) { + dev_err(edev->dma_dev, "Invalid default depth: RX %u (max %u), TX %u (max %u)\n", + rx_def, rx_max, tx_def, tx_max); + return -EINVAL; + } + + if (!is_power_of_2(rx_max) || !is_power_of_2(tx_max) || + !is_power_of_2(rx_def) || !is_power_of_2(tx_def)) { + dev_err(edev->dma_dev, "Ring depth must be power of 2\n"); + return -EINVAL; + } + + enet->cfg_hw.rx_ring_depth = rx_max; + enet->cfg_hw.tx_ring_depth = tx_max; + enet->cfg_hw.rx_ring_num = edev->rx_num; + enet->cfg_hw.tx_ring_num = edev->tx_num; + enet->cfg_hw.split_hdr = EEA_SPLIT_HDR_SIZE; + + enet->cfg.rx_ring_depth = rx_def; + enet->cfg.tx_ring_depth = tx_def; + enet->cfg.rx_ring_num = edev->rx_num; + enet->cfg.tx_ring_num = edev->tx_num; + + return 0; +} + +static int eea_netdev_init_features(struct net_device *netdev, + struct eea_net *enet, + struct eea_device *edev) +{ + struct eea_aq_cfg *cfg; + int err; + u32 mtu; + + cfg = kzalloc(sizeof(*cfg), GFP_KERNEL); + if (!cfg) + return -ENOMEM; + + err = eea_adminq_query_cfg(enet, cfg); + if (err) + goto err_free; + + mtu = le16_to_cpu(cfg->mtu); + if (mtu < ETH_MIN_MTU) { + dev_err(edev->dma_dev, "The device gave us an invalid MTU. Here we can only exit the initialization. 
%u < %u\n", + mtu, ETH_MIN_MTU); + err = -EINVAL; + goto err_free; + } + + err = eea_update_cfg(enet, edev, cfg); + if (err) + goto err_free; + + netdev->priv_flags |= IFF_UNICAST_FLT; + netdev->priv_flags |= IFF_LIVE_ADDR_CHANGE; + + netdev->hw_features |= NETIF_F_HW_CSUM; + netdev->hw_features |= NETIF_F_GRO_HW; + netdev->hw_features |= NETIF_F_SG; + netdev->hw_features |= NETIF_F_TSO; + netdev->hw_features |= NETIF_F_TSO_ECN; + netdev->hw_features |= NETIF_F_TSO6; + netdev->hw_features |= NETIF_F_GSO_UDP_L4; + + netdev->features |= NETIF_F_HIGHDMA; + netdev->features |= NETIF_F_HW_CSUM; + netdev->features |= NETIF_F_SG; + netdev->features |= NETIF_F_GSO_ROBUST; + netdev->features |= netdev->hw_features & NETIF_F_ALL_TSO; + netdev->features |= NETIF_F_RXCSUM; + netdev->features |= NETIF_F_GRO_HW; + + netdev->vlan_features = netdev->features; + + if (!is_valid_ether_addr(cfg->mac)) { + dev_err(edev->dma_dev, "The device gave invalid mac %pM\n", + cfg->mac); + err = -EINVAL; + goto err_free; + } + + eth_hw_addr_set(netdev, cfg->mac); + + enet->speed = SPEED_UNKNOWN; + enet->duplex = DUPLEX_UNKNOWN; + + netdev->min_mtu = ETH_MIN_MTU; + + netdev->mtu = mtu; + + /* If jumbo frames are already enabled, then the returned MTU will be a + * jumbo MTU, and the driver will automatically enable jumbo frame + * support by default. 
+ */ + netdev->max_mtu = mtu; + +err_free: + kfree(cfg); + return err; +} + +static const struct net_device_ops eea_netdev = { + .ndo_open = eea_netdev_open, + .ndo_stop = eea_netdev_stop, + .ndo_start_xmit = eea_tx_xmit, + .ndo_validate_addr = eth_validate_addr, + .ndo_get_stats64 = eea_stats, + .ndo_features_check = passthru_features_check, +}; + +static struct eea_net *eea_netdev_alloc(struct eea_device *edev, u32 pairs) +{ + struct net_device *netdev; + struct eea_net *enet; + int err; + + netdev = alloc_etherdev_mq(sizeof(struct eea_net), pairs); + if (!netdev) { + dev_err(edev->dma_dev, + "alloc_etherdev_mq failed with pairs %d\n", pairs); + return NULL; + } + + netdev->netdev_ops = &eea_netdev; + netdev->ethtool_ops = &eea_ethtool_ops; + SET_NETDEV_DEV(netdev, edev->dma_dev); + + enet = netdev_priv(netdev); + enet->netdev = netdev; + enet->edev = edev; + edev->enet = enet; + + err = eea_alloc_irq_blks(enet); + if (err) { + dev_err(edev->dma_dev, + "eea_alloc_irq_blks failed with pairs %d\n", pairs); + free_netdev(netdev); + return NULL; + } + + spin_lock_init(&enet->stats_lock); + + return enet; +} + +static void eea_update_ts_off(struct eea_device *edev, struct eea_net *enet) +{ + u64 ts; + + ts = eea_pci_device_ts(edev); + + enet->hw_ts_offset = ktime_get_real() - ts; +} + +static int eea_net_reprobe(struct eea_device *edev) +{ + struct eea_net *enet = edev->enet; + int err = 0; + + enet->edev = edev; + + if (!enet->adminq.ring) { + err = eea_create_adminq(enet, edev->rx_num + edev->tx_num); + if (err) + return err; + } + + err = eea_alloc_irq_blks(enet); + if (err) + goto err_destroy_aq; + + eea_update_ts_off(edev, enet); + + rtnl_lock(); + + enet->link_err = 0; + if (edev->ha_reset_netdev_running && + netif_running(edev->enet->netdev)) { + err = eea_netdev_open(enet->netdev); + if (err) { + enet->link_err = EEA_LINK_ERR_HA_RESET_DEV; + rtnl_unlock(); + goto err_free_irq_blks; + } + } + + rtnl_unlock(); + + enet->wait_pci_ready = false; + return 0; + 
+err_free_irq_blks: + eea_free_irq_blk(enet); + +err_destroy_aq: + eea_destroy_adminq(enet); + + return err; +} + +int eea_net_probe(struct eea_device *edev) +{ + struct eea_net *enet; + int err = -ENOMEM; + + /* If edev->enet is not null, then this is called from ha reset worker. + * Call eea_net_reprobe() directly. + */ + if (edev->enet) + return eea_net_reprobe(edev); + + enet = eea_netdev_alloc(edev, edev->rx_num); + if (!enet) + return -ENOMEM; + + err = eea_create_adminq(enet, edev->rx_num + edev->tx_num); + if (err) + goto err_free_netdev; + + eea_adminq_config_host_info(enet); + + err = eea_netdev_init_features(enet->netdev, enet, edev); + if (err) + goto err_reset_dev; + + eea_update_ts_off(edev, enet); + + netif_carrier_off(enet->netdev); + + err = register_netdev(enet->netdev); + if (err) + goto err_reset_dev; + + netdev_dbg(enet->netdev, "eea probe success.\n"); + + return 0; + +err_reset_dev: + eea_device_reset(edev); + eea_destroy_adminq(enet); + +err_free_netdev: + eea_free_irq_blk(enet); + free_netdev(enet->netdev); + return err; +} + +static void eea_net_ha_reset_remove(struct eea_net *enet, + struct eea_device *edev) +{ + rtnl_lock(); + edev->ha_reset_netdev_running = false; + if (netif_running(enet->netdev)) { + eea_netdev_stop(enet->netdev); + edev->ha_reset_netdev_running = true; + } + + /* Prevent that the user set up the net device. 
*/ + enet->link_err = EEA_LINK_ERR_HA_RESET_DEV; + + rtnl_unlock(); + + eea_device_reset(edev); + eea_destroy_adminq(enet); + eea_free_irq_blk(enet); + + enet->wait_pci_ready = true; +} + +void eea_net_remove(struct eea_device *edev, bool ha) +{ + struct net_device *netdev; + struct eea_net *enet; + + enet = edev->enet; + netdev = enet->netdev; + + if (ha) { + if (enet->wait_pci_ready) + return; + + eea_net_ha_reset_remove(enet, edev); + return; + } + + unregister_netdev(netdev); + + if (!enet->wait_pci_ready) { + eea_device_reset(edev); + eea_destroy_adminq(enet); + eea_free_irq_blk(enet); + } + + free_netdev(netdev); +} + +void eea_net_shutdown(struct eea_device *edev) +{ + struct net_device *netdev; + struct eea_net *enet; + + enet = edev->enet; + netdev = enet->netdev; + + rtnl_lock(); + + netif_device_detach(netdev); + dev_close(netdev); + + if (!enet->wait_pci_ready) { + eea_device_reset(edev); + eea_destroy_adminq(enet); + eea_free_irq_blk(enet); + } + + rtnl_unlock(); +} diff --git a/drivers/net/ethernet/alibaba/eea/eea_net.h b/drivers/net/ethernet/alibaba/eea/eea_net.h new file mode 100644 index 000000000000..848bb90bccf8 --- /dev/null +++ b/drivers/net/ethernet/alibaba/eea/eea_net.h @@ -0,0 +1,198 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Driver for Alibaba Elastic Ethernet Adapter. + * + * Copyright (C) 2025 Alibaba Inc. 
+ */ + +#ifndef __EEA_NET_H__ +#define __EEA_NET_H__ + +#include <linux/ethtool.h> +#include <linux/netdevice.h> + +#include "eea_adminq.h" +#include "eea_ethtool.h" +#include "eea_ring.h" + +#define EEA_VER_MAJOR 1 +#define EEA_VER_MINOR 0 +#define EEA_VER_SUB_MINOR 0 + +struct eea_tx_meta; + +struct eea_net_tx { + struct eea_net *enet; + + struct eea_ring *ering; + + struct eea_tx_meta *meta; + struct eea_tx_meta *free; + + struct device *dma_dev; + + u32 index; + + char name[16]; + + struct eea_tx_stats stats; +}; + +struct eea_rx_meta { + struct eea_rx_meta *next; + + struct page *page; + dma_addr_t dma; + u32 offset; + u32 sync_for_cpu; + u32 frags; + + struct page *hdr_page; + void *hdr_addr; + dma_addr_t hdr_dma; + + u32 id; + + u32 truesize; + u32 headroom; + u32 tailroom; + + u32 len; + + bool in_use; +}; + +struct eea_net_rx_pkt_ctx { + u16 idx; + + bool data_valid; + bool do_drop; + + u32 recv_len; + struct sk_buff *head_skb; +}; + +struct eea_net_rx { + struct eea_net *enet; + + struct eea_ring *ering; + + struct eea_rx_meta *meta; + struct eea_rx_meta *free; + + struct device *dma_dev; + + u32 index; + + u32 flags; + + u32 headroom; + + struct napi_struct *napi; + + struct eea_rx_stats stats; + + char name[16]; + + struct eea_net_rx_pkt_ctx pkt; + + struct page_pool *pp; +}; + +struct eea_net_cfg { + u32 rx_ring_depth; + u32 tx_ring_depth; + u32 rx_ring_num; + u32 tx_ring_num; + + u8 rx_sq_desc_size; + u8 rx_cq_desc_size; + u8 tx_sq_desc_size; + u8 tx_cq_desc_size; + + u32 split_hdr; + + struct hwtstamp_config ts_cfg; +}; + +struct eea_net_init_ctx { + struct eea_net_cfg cfg; + + struct eea_net_tx *tx; + struct eea_net_rx **rx; + + struct net_device *netdev; + struct eea_device *edev; +}; + +enum { + EEA_LINK_ERR_NONE, + EEA_LINK_ERR_HA_RESET_DEV, + EEA_LINK_ERR_LINK_DOWN, +}; + +struct eea_irq_blk { + struct napi_struct napi; + u16 msix_vec; + bool ready; + struct eea_net_rx *rx; + char irq_name[32]; + int irq; + int idx; + +}; + +struct eea_net { + 
struct eea_device *edev; + struct net_device *netdev; + + struct eea_aq adminq; + + struct eea_net_tx *tx; + struct eea_net_rx **rx; + + struct eea_net_cfg cfg; + struct eea_net_cfg cfg_hw; + + struct eea_irq_blk *irq_blks; + + u32 link_err; + + bool started; + bool wait_pci_ready; + + u8 duplex; + u32 speed; + + u64 hw_ts_offset; + + /* Protect the tx and rx of struct eea_net, when eea_stats accesses the + * stats from rx and tx queues. + */ + spinlock_t stats_lock; +}; + +int eea_net_probe(struct eea_device *edev); +void eea_net_remove(struct eea_device *edev, bool ha); +void eea_net_shutdown(struct eea_device *edev); + +int eea_reset_hw_resources(struct eea_net *enet, struct eea_net_init_ctx *ctx); +void eea_init_ctx(struct eea_net *enet, struct eea_net_init_ctx *ctx); +int eea_queues_check_and_reset(struct eea_device *edev); + +/* rx apis */ + +void enet_rx_stop(struct eea_net_rx *rx); +void enet_rx_start(struct eea_net_rx *rx); + +void eea_free_rx(struct eea_net_rx *rx, struct eea_net_cfg *cfg); +struct eea_net_rx *eea_alloc_rx(struct eea_net_init_ctx *ctx, u32 idx); + +/* tx apis */ +int eea_poll_tx(struct eea_net_tx *tx, int budget); +netdev_tx_t eea_tx_xmit(struct sk_buff *skb, struct net_device *netdev); + +void eea_free_tx(struct eea_net_tx *tx, struct eea_net_cfg *cfg); +int eea_alloc_tx(struct eea_net_init_ctx *ctx, struct eea_net_tx *tx, u32 idx); + +#endif diff --git a/drivers/net/ethernet/alibaba/eea/eea_pci.c b/drivers/net/ethernet/alibaba/eea/eea_pci.c new file mode 100644 index 000000000000..9872e360dd5d --- /dev/null +++ b/drivers/net/ethernet/alibaba/eea/eea_pci.c @@ -0,0 +1,744 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Driver for Alibaba Elastic Ethernet Adapter. + * + * Copyright (C) 2025 Alibaba Inc. 
+ */ + +#include <linux/io-64-nonatomic-lo-hi.h> +#include <linux/iopoll.h> + +#include "eea_net.h" +#include "eea_pci.h" + +#define EEA_PCI_DB_OFFSET 4096 +#define EEA_PCI_DB_MIN_SIZE 8 +#define EEA_PCI_DB_MAX_SIZE 512 +#define EEA_PCI_Q_MAX_NUM 1000 + +#define EEA_PCI_CAP_RESET_DEVICE 0xFA +#define EEA_PCI_CAP_RESET_FLAG BIT(1) + +struct eea_pci_cfg { + __le32 reserve0; + __le32 reserve1; + __le32 drv_f_idx; + __le32 drv_f; + +#define EEA_S_INIT (BIT(0) | BIT(1)) +#define EEA_S_OK BIT(2) +#define EEA_S_FEATURE_DONE BIT(3) +#define EEA_S_FAILED BIT(7) + u8 device_status; + u8 reserved[7]; + + __le32 rx_num_max; + __le32 tx_num_max; + __le32 db_blk_size; + + /* admin queue cfg */ + __le16 aq_size; + __le16 aq_msix_vector; + __le32 aq_db_off; + + __le32 aq_sq_addr; + __le32 aq_sq_addr_hi; + __le32 aq_cq_addr; + __le32 aq_cq_addr_hi; + + __le32 reserved1; + __le64 hw_ts; +}; + +struct eea_pci_device { + struct eea_device edev; + struct pci_dev *pci_dev; + + u32 msix_vec_n; + u32 db_len; + + void __iomem *reg; + void __iomem *db_base; + void __iomem *db_end; + + int ha_irq; + + struct work_struct ha_handle_work; + char ha_irq_name[32]; + int reset_pos; + bool ha_ready; + + bool shutdown; +}; + +#define cfg_pointer(reg, item) \ + ((void __iomem *)((reg) + offsetof(struct eea_pci_cfg, item))) + +#define cfg_write8(reg, item, val) iowrite8(val, cfg_pointer(reg, item)) +#define cfg_write16(reg, item, val) iowrite16(val, cfg_pointer(reg, item)) +#define cfg_write32(reg, item, val) iowrite32(val, cfg_pointer(reg, item)) +#define cfg_write64(reg, item, val) iowrite64_lo_hi(val, cfg_pointer(reg, item)) + +#define cfg_read8(reg, item) ioread8(cfg_pointer(reg, item)) +#define cfg_read32(reg, item) ioread32(cfg_pointer(reg, item)) +#define cfg_read64(reg, item) ioread64(cfg_pointer(reg, item)) + +/* Due to circular references, we have to add function definitions here. 
*/ +static int __eea_pci_probe(struct pci_dev *pci_dev, + struct eea_pci_device *ep_dev, bool pci_probe); +static void __eea_pci_remove(struct pci_dev *pci_dev, bool pci_remove); + +const char *eea_pci_name(struct eea_device *edev) +{ + return pci_name(edev->ep_dev->pci_dev); +} + +int eea_pci_domain_nr(struct eea_device *edev) +{ + return pci_domain_nr(edev->ep_dev->pci_dev->bus); +} + +u16 eea_pci_bdf(struct eea_device *edev) +{ + return pci_dev_id(edev->ep_dev->pci_dev); +} + +static void eea_pci_io_set_status(struct eea_device *edev, u8 status) +{ + struct eea_pci_device *ep_dev = edev->ep_dev; + + cfg_write8(ep_dev->reg, device_status, status); +} + +static u8 eea_pci_io_get_status(struct eea_device *edev) +{ + struct eea_pci_device *ep_dev = edev->ep_dev; + + return cfg_read8(ep_dev->reg, device_status); +} + +static void eea_add_status(struct eea_device *dev, u32 status) +{ + eea_pci_io_set_status(dev, eea_pci_io_get_status(dev) | status); +} + +#define EEA_RESET_TIMEOUT_US (60 * 1000 * 1000) + +int eea_device_reset(struct eea_device *edev) +{ + struct eea_pci_device *ep_dev = edev->ep_dev; + int err; + u8 val; + + eea_pci_io_set_status(edev, 0); + + /* We are no longer waiting for device ack during the shutdown flow. */ + if (ep_dev->shutdown) + return 0; + + /* A longer timeout is set here to handle edge cases, though it should + * return promptly in most scenarios. + * + * In our case, all replies are handled by the DPU software, so there is + * no race condition between the hardware processes and the register. 
+ */ + err = read_poll_timeout(cfg_read8, val, (!val || val == 0xFF), 20, + EEA_RESET_TIMEOUT_US, + false, ep_dev->reg, device_status); + + /* Surprise PCIe Removal */ + if (val == 0xFF) + return -EINVAL; + + return err; +} + +int eea_pci_set_aq_up(struct eea_device *edev) +{ + struct eea_pci_device *ep_dev = edev->ep_dev; + u8 status = eea_pci_io_get_status(edev); + int err; + u8 val; + + eea_pci_io_set_status(edev, status | EEA_S_OK); + + /* A longer timeout is set here to handle edge cases, though it should + * return promptly in most scenarios. + * + * In our case, all replies are handled by the DPU software, so there is + * no race condition between the hardware processes and the register. + */ + err = read_poll_timeout(cfg_read8, val, + val & (EEA_S_OK | EEA_S_FAILED), + 20, EEA_RESET_TIMEOUT_US, + false, ep_dev->reg, device_status); + + /* Surprise PCIe Removal */ + if (val == 0xFF) + return -EINVAL; + + /* device fail */ + if (val & EEA_S_FAILED) + return -EINVAL; + + return err; +} + +static int eea_negotiate(struct eea_device *edev) +{ + struct eea_pci_device *ep_dev; + u32 status; + + ep_dev = edev->ep_dev; + + edev->features = 0; + + cfg_write32(ep_dev->reg, drv_f_idx, 0); + cfg_write32(ep_dev->reg, drv_f, lower_32_bits(edev->features)); + cfg_write32(ep_dev->reg, drv_f_idx, 1); + cfg_write32(ep_dev->reg, drv_f, upper_32_bits(edev->features)); + + eea_add_status(edev, EEA_S_FEATURE_DONE); + status = eea_pci_io_get_status(edev); + + /* Surprise PCIe Removal */ + if (status == 0xFF) + return -EINVAL; + + if (!(status & EEA_S_FEATURE_DONE)) + return -ENODEV; + + return 0; +} + +static void eea_pci_release_resource(struct eea_pci_device *ep_dev) +{ + struct pci_dev *pci_dev = ep_dev->pci_dev; + struct eea_device *edev; + + edev = &ep_dev->edev; + + if (edev->status < EEA_PCI_STATUS_READY) + return; + + if (ep_dev->reg) { + pci_iounmap(pci_dev, ep_dev->reg); + ep_dev->reg = NULL; + } + + if (ep_dev->msix_vec_n) { + ep_dev->msix_vec_n = 0; + 
pci_free_irq_vectors(ep_dev->pci_dev); + } + + pci_clear_master(pci_dev); + pci_release_regions(pci_dev); + pci_disable_device(pci_dev); + + edev->status = EEA_PCI_STATUS_NONE; +} + +static int eea_pci_setup(struct pci_dev *pci_dev, struct eea_pci_device *ep_dev) +{ + int err, n, ret, len; + + ep_dev->edev.status = EEA_PCI_STATUS_ERR; + + ep_dev->pci_dev = pci_dev; + + err = pci_enable_device(pci_dev); + if (err) + return err; + + err = pci_request_regions(pci_dev, "EEA"); + if (err) + goto err_disable_dev; + + if (pci_resource_len(pci_dev, 0) < EEA_PCI_DB_OFFSET) { + dev_err(&pci_dev->dev, "Bar 0 is too small %llu\n", + (u64)pci_resource_len(pci_dev, 0)); + err = -EINVAL; + goto err_release_regions; + } + + ep_dev->reg = pci_iomap(pci_dev, 0, 0); + if (!ep_dev->reg) { + dev_err(&pci_dev->dev, "Failed to map pci bar!\n"); + err = -ENOMEM; + goto err_release_regions; + } + + err = eea_device_reset(&ep_dev->edev); + if (err) { + dev_err(&pci_dev->dev, "Failed to reset device for setup!\n"); + goto err_unmap_reg; + } + + err = dma_set_mask_and_coherent(&pci_dev->dev, DMA_BIT_MASK(64)); + if (err) { + dev_warn(&pci_dev->dev, "Failed to enable 64-bit DMA.\n"); + goto err_unmap_reg; + } + + pci_set_master(pci_dev); + + ep_dev->edev.rx_num = cfg_read32(ep_dev->reg, rx_num_max); + ep_dev->edev.tx_num = cfg_read32(ep_dev->reg, tx_num_max); + + if (ep_dev->edev.rx_num > EEA_PCI_Q_MAX_NUM || + ep_dev->edev.tx_num > EEA_PCI_Q_MAX_NUM) { + dev_err(&pci_dev->dev, "Invalid queue num %u %u\n", + ep_dev->edev.rx_num, + ep_dev->edev.tx_num); + err = -EINVAL; + goto err_clear_master; + } + + ep_dev->edev.db_blk_size = cfg_read32(ep_dev->reg, db_blk_size); + if (!IS_ALIGNED(ep_dev->edev.db_blk_size, 8) || + ep_dev->edev.db_blk_size > EEA_PCI_DB_MAX_SIZE || + ep_dev->edev.db_blk_size < EEA_PCI_DB_MIN_SIZE) { + dev_err(&pci_dev->dev, "Invalid db size %u\n", + ep_dev->edev.db_blk_size); + err = -EINVAL; + goto err_clear_master; + } + + ep_dev->db_len = ep_dev->edev.db_blk_size * 
(ep_dev->edev.rx_num + + ep_dev->edev.tx_num + 1); + ep_dev->db_base = ep_dev->reg + EEA_PCI_DB_OFFSET; + ep_dev->db_end = ep_dev->db_base + ep_dev->db_len; + + len = ep_dev->db_end - ep_dev->reg; + + if (pci_resource_len(pci_dev, 0) < len) { + dev_err(&pci_dev->dev, "Bar 0 is too small %llu\n", + (u64)pci_resource_len(pci_dev, 0)); + err = -EINVAL; + goto err_clear_master; + } + + /* In our design, the number of hardware interrupts matches the maximum + * number of queues. If pci_alloc_irq_vectors failed, return directly. + * + * 2: adminq, error handle + */ + n = ep_dev->edev.rx_num + 2; + ret = pci_alloc_irq_vectors(ep_dev->pci_dev, n, n, PCI_IRQ_MSIX); + if (ret != n) { + err = ret; + goto err_clear_master; + } + + ep_dev->msix_vec_n = ret; + + ep_dev->edev.status = EEA_PCI_STATUS_READY; + + return 0; + +err_clear_master: + pci_clear_master(pci_dev); + +err_unmap_reg: + pci_iounmap(pci_dev, ep_dev->reg); + ep_dev->reg = NULL; + +err_release_regions: + pci_release_regions(pci_dev); + +err_disable_dev: + pci_disable_device(pci_dev); + + return err; +} + +void __iomem *eea_pci_db_addr(struct eea_device *edev, u32 off) +{ + u32 max_off; + + if (!IS_ALIGNED(off, 8)) + return NULL; + + max_off = edev->ep_dev->db_len - edev->db_blk_size; + + if (off > max_off) + return NULL; + + return edev->ep_dev->db_base + off; +} + +int eea_pci_active_aq(struct eea_ring *ering, int msix_vec) +{ + struct eea_pci_device *ep_dev = ering->edev->ep_dev; + + cfg_write16(ep_dev->reg, aq_size, ering->num); + cfg_write16(ep_dev->reg, aq_msix_vector, msix_vec); + + cfg_write64(ep_dev->reg, aq_sq_addr, ering->sq.dma_addr); + cfg_write64(ep_dev->reg, aq_cq_addr, ering->cq.dma_addr); + + ering->db = eea_pci_db_addr(ering->edev, + cfg_read32(ep_dev->reg, aq_db_off)); + + if (!ering->db) + return -EIO; + + return 0; +} + +void eea_pci_free_irq(struct eea_irq_blk *blk) +{ + irq_update_affinity_hint(blk->irq, NULL); + free_irq(blk->irq, blk); +} + +int eea_pci_request_irq(struct eea_device *edev, 
/* Handle an HA event for @ep_dev.
 *
 * If no reset capability was recorded (reset_pos == 0), only the per-queue
 * check/reset runs. Otherwise the reset word is read from PCI config space
 * and then cleared. When EEA_PCI_CAP_RESET_FLAG is set in the value read,
 * the device is torn down and re-probed under ha_lock; when it is not set,
 * the per-queue check/reset runs instead.
 */
static void eea_ha_handle_reset(struct eea_pci_device *ep_dev)
{
	struct eea_device *edev;
	struct pci_dev *pci_dev;
	u16 reset;
	int err;

	/* No reset capability: the interrupt can only signal queue events. */
	if (!ep_dev->reset_pos) {
		eea_queues_check_and_reset(&ep_dev->edev);
		return;
	}

	edev = &ep_dev->edev;

	pci_read_config_word(ep_dev->pci_dev, ep_dev->reset_pos, &reset);

	/* Clear bits using 0xFFFF and ignore all previous messages. */
	pci_write_config_word(ep_dev->pci_dev, ep_dev->reset_pos, 0xFFFF);

	if (reset & EEA_PCI_CAP_RESET_FLAG) {
		dev_warn(&ep_dev->pci_dev->dev, "recv device reset request.\n");

		pci_dev = ep_dev->pci_dev;

		/* The PCI remove callback takes this lock. trylock is used so
		 * that, if removal already holds it, this work item does not
		 * block and the HA interrupt is simply ignored.
		 */
		if (mutex_trylock(&edev->ha_lock)) {
			/* Only a fully probed device (status DONE) is reset. */
			if (edev->status != EEA_PCI_STATUS_DONE) {
				dev_err(&ep_dev->pci_dev->dev, "ha: reset device: pci status is %d. skip it.\n",
					edev->status);

				mutex_unlock(&edev->ha_lock);
				return;
			}

			/* Full teardown followed by a re-probe; "false" marks
			 * both calls as not coming from the PCI core callbacks.
			 */
			__eea_pci_remove(pci_dev, false);
			err = __eea_pci_probe(pci_dev, ep_dev, false);
			if (err)
				/* PCI initialization or network device
				 * re-probing has failed. The remaining
				 * resources are left for the PCI subsystem's
				 * remove callback to release.
				 */
				dev_err(&ep_dev->pci_dev->dev,
					"ha: re-setup failed.\n");

			mutex_unlock(&edev->ha_lock);
		} else {
			/* Device removal is in progress, so return directly. */
			dev_warn(&ep_dev->pci_dev->dev,
				 "ha device reset: trylock failed.\n");
		}
		return;
	}

	/* No device reset requested: check the individual queues instead. */
	eea_queues_check_and_reset(&ep_dev->edev);
}
*/ + err = request_irq(irq, eea_pci_ha_handle, IRQF_NO_AUTOEN, + ep_dev->ha_irq_name, edev); + if (err) + return err; + + ep_dev->ha_irq = irq; + + ep_dev->ha_ready = true; + ep_dev->reset_pos = 0; + + cfg_type_off = offsetof(struct eea_pci_cap, cfg_type); + cfg_drv_off = offsetof(struct eea_pci_reset_reg, driver); + cfg_dev_off = offsetof(struct eea_pci_reset_reg, device); + + for (pos = pci_find_capability(pci_dev, PCI_CAP_ID_VNDR); + pos > 0; + pos = pci_find_next_capability(pci_dev, pos, PCI_CAP_ID_VNDR)) { + pci_read_config_byte(pci_dev, pos + cfg_type_off, &type); + + if (type == EEA_PCI_CAP_RESET_DEVICE) { + /* notify device, driver support this feature. */ + pci_write_config_word(pci_dev, pos + cfg_drv_off, + EEA_PCI_CAP_RESET_FLAG); + pci_write_config_word(pci_dev, pos + cfg_dev_off, + 0xFFFF); + + edev->ep_dev->reset_pos = pos + cfg_dev_off; + return 0; + } + } + + /* irq just for event notify */ + dev_warn(&edev->ep_dev->pci_dev->dev, "Not Found reset cap.\n"); + return 0; +} + +u64 eea_pci_device_ts(struct eea_device *edev) +{ + struct eea_pci_device *ep_dev = edev->ep_dev; + + return cfg_read64(ep_dev->reg, hw_ts); +} + +static int eea_init_device(struct eea_device *edev) +{ + int err; + + err = eea_device_reset(edev); + if (err) + return err; + + eea_pci_io_set_status(edev, EEA_S_INIT); + + err = eea_negotiate(edev); + if (err) + goto err; + + err = eea_net_probe(edev); + if (err) + goto err; + + return 0; +err: + eea_add_status(edev, EEA_S_FAILED); + return err; +} + +static int __eea_pci_probe(struct pci_dev *pci_dev, + struct eea_pci_device *ep_dev, + bool pci_probe) +{ + struct eea_device *edev; + int err; + + pci_set_drvdata(pci_dev, ep_dev); + + edev = &ep_dev->edev; + + err = eea_pci_setup(pci_dev, ep_dev); + if (err) + return err; + + err = eea_init_device(&ep_dev->edev); + if (err) + goto err_pci_rel; + + err = eea_pci_ha_init(edev, pci_dev, pci_probe); + if (err) + goto err_net_rm; + + edev->status = EEA_PCI_STATUS_DONE; + + 
enable_irq(ep_dev->ha_irq); + + return 0; + +err_net_rm: + eea_net_remove(edev, !pci_probe); + +err_pci_rel: + eea_pci_release_resource(ep_dev); + return err; +} + +static void __eea_pci_remove(struct pci_dev *pci_dev, bool pci_remove) +{ + struct eea_pci_device *ep_dev = pci_get_drvdata(pci_dev); + struct device *dev = get_device(&ep_dev->pci_dev->dev); + struct eea_device *edev = &ep_dev->edev; + + eea_pci_free_ha_irq(edev); + + if (pci_remove) + flush_work(&ep_dev->ha_handle_work); + + eea_net_remove(edev, !pci_remove); + + eea_pci_release_resource(ep_dev); + + put_device(dev); +} + +static int eea_pci_probe(struct pci_dev *pci_dev, + const struct pci_device_id *id) +{ + struct eea_pci_device *ep_dev; + struct eea_device *edev; + int err; + + ep_dev = kzalloc(sizeof(*ep_dev), GFP_KERNEL); + if (!ep_dev) + return -ENOMEM; + + edev = &ep_dev->edev; + + edev->ep_dev = ep_dev; + edev->dma_dev = &pci_dev->dev; + + ep_dev->pci_dev = pci_dev; + + mutex_init(&edev->ha_lock); + + err = __eea_pci_probe(pci_dev, ep_dev, true); + if (err) { + mutex_destroy(&edev->ha_lock); + pci_set_drvdata(pci_dev, NULL); + kfree(ep_dev); + } + + return err; +} + +static void eea_pci_remove(struct pci_dev *pci_dev) +{ + struct eea_pci_device *ep_dev = pci_get_drvdata(pci_dev); + struct eea_device *edev; + + edev = &ep_dev->edev; + + mutex_lock(&edev->ha_lock); + __eea_pci_remove(pci_dev, true); + mutex_unlock(&edev->ha_lock); + + pci_set_drvdata(pci_dev, NULL); + + mutex_destroy(&edev->ha_lock); + kfree(ep_dev); +} + +static void eea_pci_shutdown(struct pci_dev *pci_dev) +{ + struct eea_pci_device *ep_dev = pci_get_drvdata(pci_dev); + struct eea_device *edev; + + edev = &ep_dev->edev; + + ep_dev->shutdown = true; + + mutex_lock(&edev->ha_lock); + eea_pci_free_ha_irq(edev); + flush_work(&ep_dev->ha_handle_work); + mutex_unlock(&edev->ha_lock); + + eea_net_shutdown(edev); + + pci_clear_master(pci_dev); +} + +static const struct pci_device_id eea_pci_id_table[] = { + { 
/* Register eea_pci_driver on module load and unregister it on unload.
 *
 * module_pci_driver() expands to the module_init()/module_exit() pair that
 * calls pci_register_driver()/pci_unregister_driver(), which is all the
 * hand-written eea_pci_init()/eea_pci_exit() functions did.
 */
module_pci_driver(eea_pci_driver);
/* Device state embedded in struct eea_pci_device and handed to the net,
 * ring and adminq code.
 */
struct eea_device {
	/* Back-pointer to the eea_pci_device that embeds this structure. */
	struct eea_pci_device *ep_dev;
	/* Device used for DMA allocation and sync (the PCI function's dev). */
	struct device *dma_dev;
	/* Net-layer state; presumably set by the net probe - TODO confirm. */
	struct eea_net *enet;

	/* Presumably the negotiated feature bits - TODO confirm. */
	u64 features;

	/* READY after PCI setup, DONE once probing has fully completed.
	 * The HA reset path only proceeds when this is DONE.
	 */
	enum eea_pci_status status;
	bool ha_reset_netdev_running;

	/* ha lock for the race between ha work and pci remove */
	struct mutex ha_lock;

	/* Queue counts; PCI setup requests rx_num + 2 MSI-X vectors. */
	u32 rx_num;
	u32 tx_num;
	/* Doorbell block size, used to bound offsets in eea_pci_db_addr(). */
	u32 db_blk_size;
};
+ */ + +#include "eea_pci.h" +#include "eea_ring.h" + +void eea_ering_irq_active(struct eea_ring *ering, struct eea_ring *tx_ering) +{ + u64 value = 0, rx_idx, tx_idx; + + tx_idx = (u64)tx_ering->cq.hw_idx; + rx_idx = (u64)ering->cq.hw_idx; + + value |= EEA_IRQ_UNMASK << EEA_DB_FLAGS_OFF; + value |= tx_idx << EEA_DB_TX_CQ_HEAD_OFF; + value |= rx_idx << EEA_DB_RX_CQ_HEAD_OFF; + + writeq(value, ering->db); +} + +void *eea_ering_cq_get_desc(const struct eea_ring *ering) +{ + u8 phase; + u8 *desc; + + desc = ering->cq.desc + (ering->cq.head << ering->cq.desc_size_shift); + + phase = READ_ONCE(*(u8 *)(desc + ering->cq.desc_size - 1)); + + if ((phase & EEA_RING_DESC_F_CQ_PHASE) == ering->cq.phase) { + dma_rmb(); + return desc; + } + + return NULL; +} + +/* sq api */ +void *eea_ering_sq_alloc_desc(struct eea_ring *ering, u16 id, bool is_last, + u16 flags) +{ + struct eea_ring_sq *sq = &ering->sq; + struct eea_common_desc *desc; + + if (!sq->shadow_num) { + sq->shadow_idx = sq->head; + sq->shadow_id = cpu_to_le16(id); + } + + if (!is_last) + flags |= EEA_RING_DESC_F_MORE; + + desc = sq->desc + (sq->shadow_idx << sq->desc_size_shift); + + desc->flags = cpu_to_le16(flags); + desc->id = sq->shadow_id; + + if (unlikely(++sq->shadow_idx >= ering->num)) + sq->shadow_idx = 0; + + ++sq->shadow_num; + + return desc; +} + +/* This is an allocation API for admin Q. For each call to admin Q, only one + * desc will be allocated. 
+ */ +void *eea_ering_aq_alloc_desc(struct eea_ring *ering) +{ + struct eea_ring_sq *sq = &ering->sq; + struct eea_common_desc *desc; + + if (!sq->shadow_num) + sq->shadow_idx = sq->head; + + desc = sq->desc + (sq->shadow_idx << sq->desc_size_shift); + + if (unlikely(++sq->shadow_idx >= ering->num)) + sq->shadow_idx = 0; + + ++sq->shadow_num; + + return desc; +} + +void eea_ering_sq_commit_desc(struct eea_ring *ering) +{ + struct eea_ring_sq *sq = &ering->sq; + int num; + + num = sq->shadow_num; + + ering->num_free -= num; + + sq->head = sq->shadow_idx; + sq->hw_idx += num; + sq->shadow_num = 0; +} + +void eea_ering_sq_cancel(struct eea_ring *ering) +{ + ering->sq.shadow_num = 0; +} + +/* cq api */ +void eea_ering_cq_ack_desc(struct eea_ring *ering, u32 num) +{ + struct eea_ring_cq *cq = &ering->cq; + + cq->head += num; + cq->hw_idx += num; + + if (unlikely(cq->head >= ering->num)) { + cq->head -= ering->num; + cq->phase ^= EEA_RING_DESC_F_CQ_PHASE; + } + + ering->num_free += num; +} + +/* notify */ +void eea_ering_kick(struct eea_ring *ering) +{ + u64 value = 0, idx; + + idx = (u64)ering->sq.hw_idx; + + value |= EEA_IDX_PRESENT << EEA_DB_FLAGS_OFF; + value |= idx << EEA_DB_IDX_OFF; + + writeq(value, ering->db); +} + +/* ering alloc/free */ +static void ering_free_queue(struct eea_device *edev, size_t size, + void *queue, dma_addr_t dma_handle) +{ + dma_free_coherent(edev->dma_dev, size, queue, dma_handle); +} + +static void *ering_alloc_queue(struct eea_device *edev, size_t size, + dma_addr_t *dma_handle) +{ + gfp_t flags = GFP_KERNEL | __GFP_NOWARN; + + return dma_alloc_coherent(edev->dma_dev, size, dma_handle, flags); +} + +static int ering_alloc_queues(struct eea_ring *ering, struct eea_device *edev, + size_t num, u8 sq_desc_size, u8 cq_desc_size) +{ + dma_addr_t addr; + size_t size; + void *ring; + + size = num * sq_desc_size; + + ring = ering_alloc_queue(edev, size, &addr); + if (!ring) + return -ENOMEM; + + ering->sq.desc = ring; + ering->sq.dma_addr = addr; 
+ ering->sq.dma_size = size; + ering->sq.desc_size = sq_desc_size; + ering->sq.desc_size_shift = fls(sq_desc_size) - 1; + + size = num * cq_desc_size; + + ring = ering_alloc_queue(edev, size, &addr); + if (!ring) + goto err_free_sq; + + ering->cq.desc = ring; + ering->cq.dma_addr = addr; + ering->cq.dma_size = size; + ering->cq.desc_size = cq_desc_size; + ering->cq.desc_size_shift = fls(cq_desc_size) - 1; + + ering->num = num; + + return 0; + +err_free_sq: + ering_free_queue(ering->edev, ering->sq.dma_size, + ering->sq.desc, ering->sq.dma_addr); + return -ENOMEM; +} + +static void ering_init(struct eea_ring *ering) +{ + ering->cq.phase = EEA_RING_DESC_F_CQ_PHASE; + ering->num_free = ering->num; +} + +struct eea_ring *eea_ering_alloc(u32 index, u32 num, struct eea_device *edev, + u8 sq_desc_size, u8 cq_desc_size, + const char *name) +{ + struct eea_ring *ering; + + if (num > EEA_NET_IO_HW_RING_DEPTH_MAX || + num < EEA_NET_IO_RING_DEPTH_MIN) + return NULL; + + if (!is_power_of_2(num)) + return NULL; + + if (!sq_desc_size || !is_power_of_2(sq_desc_size)) + return NULL; + + if (!cq_desc_size || !is_power_of_2(cq_desc_size)) + return NULL; + + ering = kzalloc(sizeof(*ering), GFP_KERNEL); + if (!ering) + return NULL; + + ering->edev = edev; + ering->name = name; + ering->index = index; + + if (ering_alloc_queues(ering, edev, num, sq_desc_size, cq_desc_size)) + goto err_free; + + ering_init(ering); + + return ering; + +err_free: + kfree(ering); + return NULL; +} + +void eea_ering_free(struct eea_ring *ering) +{ + ering_free_queue(ering->edev, ering->cq.dma_size, + ering->cq.desc, ering->cq.dma_addr); + + ering_free_queue(ering->edev, ering->sq.dma_size, + ering->sq.desc, ering->sq.dma_addr); + + kfree(ering); +} diff --git a/drivers/net/ethernet/alibaba/eea/eea_ring.h b/drivers/net/ethernet/alibaba/eea/eea_ring.h new file mode 100644 index 000000000000..a7ce465943a5 --- /dev/null +++ b/drivers/net/ethernet/alibaba/eea/eea_ring.h @@ -0,0 +1,99 @@ +/* 
/* Submission-queue half of an eea_ring. */
struct eea_ring_sq {
	/* Base of the DMA-coherent descriptor array. */
	void *desc;

	/* Index of the next slot after the last committed descriptor. */
	u16 head;
	/* Running count of committed descriptors; written to the doorbell
	 * by eea_ering_kick().
	 */
	u16 hw_idx;

	/* Staging state for descriptors allocated but not yet committed:
	 * shadow_idx is the next slot to hand out, shadow_id the id stamped
	 * on every descriptor of the current batch, and shadow_num the
	 * number staged so far. Commit folds them into head/hw_idx; cancel
	 * resets shadow_num to zero.
	 */
	u16 shadow_idx;
	__le16 shadow_id;
	u16 shadow_num;

	/* Descriptor size in bytes and its log2 (sizes are powers of two). */
	u8 desc_size;
	u8 desc_size_shift;

	/* DMA address and byte size of the descriptor array. */
	dma_addr_t dma_addr;
	u32 dma_size;
};
eea_ering_cq_ack_desc(struct eea_ring *ering, u32 num); + +void eea_ering_irq_active(struct eea_ring *ering, struct eea_ring *tx_ering); +void *eea_ering_cq_get_desc(const struct eea_ring *ering); +#endif diff --git a/drivers/net/ethernet/alibaba/eea/eea_rx.c b/drivers/net/ethernet/alibaba/eea/eea_rx.c new file mode 100644 index 000000000000..a3f0d2a79ad8 --- /dev/null +++ b/drivers/net/ethernet/alibaba/eea/eea_rx.c @@ -0,0 +1,814 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Driver for Alibaba Elastic Ethernet Adapter. + * + * Copyright (C) 2025 Alibaba Inc. + */ + +#include <net/netdev_rx_queue.h> +#include <net/page_pool/helpers.h> + +#include "eea_adminq.h" +#include "eea_net.h" +#include "eea_pci.h" +#include "eea_ring.h" + +#define EEA_ENABLE_F_NAPI BIT(0) + +#define EEA_PAGE_FRAGS_NUM 1024 + +#define EEA_RX_BUF_ALIGN 128 + +#define EEA_RX_BUF_MAX_LEN (10 * 1024) + +struct eea_rx_ctx { + u32 len; + u32 hdr_len; + + u16 flags; + bool more; + + struct eea_rx_meta *meta; + + struct eea_rx_ctx_stats stats; +}; + +static struct eea_rx_meta *eea_rx_meta_get(struct eea_net_rx *rx) +{ + struct eea_rx_meta *meta; + + if (!rx->free) + return NULL; + + meta = rx->free; + rx->free = meta->next; + + return meta; +} + +static void eea_rx_meta_put(struct eea_net_rx *rx, struct eea_rx_meta *meta) +{ + meta->next = rx->free; + rx->free = meta; +} + +static void eea_free_rx_buffer(struct eea_net_rx *rx, struct eea_rx_meta *meta, + bool allow_direct) +{ + u32 drain_count; + + drain_count = EEA_PAGE_FRAGS_NUM - meta->frags; + + if (page_pool_unref_page(meta->page, drain_count) == 0) + page_pool_put_unrefed_page(rx->pp, meta->page, -1, + allow_direct); + + meta->page = NULL; +} + +static void eea_rx_meta_dma_sync_for_device(struct eea_net_rx *rx, + struct eea_rx_meta *meta) +{ + u32 len; + + if (meta->sync_for_cpu <= meta->offset + rx->headroom) + return; + + len = meta->sync_for_cpu - meta->offset - rx->headroom; + + dma_sync_single_for_device(rx->enet->edev->dma_dev, 
+ meta->dma + meta->offset + rx->headroom, + len, DMA_FROM_DEVICE); + meta->sync_for_cpu = 0; +} + +static void meta_align_offset(struct eea_net_rx *rx, struct eea_rx_meta *meta) +{ + int h, b; + + h = rx->headroom; + b = meta->offset + h; + + /* For better performance, we align the buffer address to + * EEA_RX_BUF_ALIGN, as required by the device design. + */ + b = ALIGN(b, EEA_RX_BUF_ALIGN); + + meta->offset = b - h; +} + +static int eea_alloc_rx_buffer(struct eea_net_rx *rx, struct eea_rx_meta *meta) +{ + struct page *page; + + if (meta->page) { + eea_rx_meta_dma_sync_for_device(rx, meta); + return 0; + } + + page = page_pool_dev_alloc_pages(rx->pp); + if (!page) + return -ENOMEM; + + page_pool_fragment_page(page, EEA_PAGE_FRAGS_NUM); + + meta->page = page; + meta->dma = page_pool_get_dma_addr(page); + meta->offset = 0; + meta->frags = 0; + meta->sync_for_cpu = 0; + + meta_align_offset(rx, meta); + + return 0; +} + +static u32 eea_consume_rx_buffer(struct eea_net_rx *rx, + struct eea_rx_meta *meta, + u32 consumed) +{ + u32 offset; + int min; + + offset = meta->offset; + + meta->offset += consumed; + ++meta->frags; + + min = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + min += rx->headroom; + min += SKB_DATA_ALIGN(ETH_DATA_LEN); + + meta_align_offset(rx, meta); + + if (min + meta->offset > PAGE_SIZE) { + eea_free_rx_buffer(rx, meta, true); + return PAGE_SIZE - offset; + } + + return meta->offset - offset; +} + +static void eea_free_rx_hdr(struct eea_net_rx *rx, struct eea_net_cfg *cfg) +{ + struct eea_rx_meta *meta; + int i; + + for (i = 0; i < cfg->rx_ring_depth; ++i) { + meta = &rx->meta[i]; + meta->hdr_addr = NULL; + + if (!meta->hdr_page) + continue; + + dma_unmap_page(rx->dma_dev, meta->hdr_dma, PAGE_SIZE, + DMA_FROM_DEVICE); + put_page(meta->hdr_page); + + meta->hdr_page = NULL; + } +} + +static int eea_alloc_rx_hdr(struct eea_net_init_ctx *ctx, struct eea_net_rx *rx) +{ + struct page *hdr_page = NULL; + struct eea_rx_meta *meta; + u32 offset = 0, 
hdrsize; + struct device *dmadev; + dma_addr_t dma; + int i; + + dmadev = ctx->edev->dma_dev; + hdrsize = ctx->cfg.split_hdr; + + for (i = 0; i < ctx->cfg.rx_ring_depth; ++i) { + meta = &rx->meta[i]; + meta->hdr_page = NULL; + + if (!hdr_page || offset + hdrsize > PAGE_SIZE) { + hdr_page = alloc_page(GFP_KERNEL); + if (!hdr_page) + goto err; + + dma = dma_map_page(dmadev, hdr_page, 0, PAGE_SIZE, + DMA_FROM_DEVICE); + + if (unlikely(dma_mapping_error(dmadev, dma))) { + put_page(hdr_page); + goto err; + } + + offset = 0; + meta->hdr_page = hdr_page; + } + + meta->hdr_dma = dma + offset; + meta->hdr_addr = page_address(hdr_page) + offset; + offset += hdrsize; + } + + return 0; + +err: + eea_free_rx_hdr(rx, &ctx->cfg); + return -ENOMEM; +} + +static void eea_rx_meta_dma_sync_for_cpu(struct eea_net_rx *rx, + struct eea_rx_meta *meta, u32 len) +{ + dma_sync_single_for_cpu(rx->enet->edev->dma_dev, + meta->dma + meta->offset + meta->headroom, + len, DMA_FROM_DEVICE); + meta->sync_for_cpu = meta->offset + meta->headroom + len; +} + +static int eea_harden_check_overflow(struct eea_rx_ctx *ctx, + struct eea_net *enet) +{ + u32 max_len; + + max_len = ctx->meta->truesize - ctx->meta->headroom - + ctx->meta->tailroom; + + if (unlikely(ctx->len > max_len)) { + pr_debug("%s: rx error: len %u exceeds truesize %u\n", + enet->netdev->name, ctx->len, max_len); + ++ctx->stats.length_errors; + return -EINVAL; + } + + return 0; +} + +static int eea_harden_check_size(struct eea_rx_ctx *ctx, struct eea_net *enet) +{ + int err; + + err = eea_harden_check_overflow(ctx, enet); + if (err) + return err; + + if (ctx->hdr_len) { + if (unlikely(ctx->hdr_len < ETH_HLEN)) { + pr_debug("%s: short hdr %u\n", enet->netdev->name, + ctx->hdr_len); + ++ctx->stats.length_errors; + return -EINVAL; + } + + if (unlikely(ctx->hdr_len > enet->cfg.split_hdr)) { + pr_debug("%s: rx error: hdr len %u exceeds hdr buffer size %u\n", + enet->netdev->name, ctx->hdr_len, + enet->cfg.split_hdr); + 
++ctx->stats.length_errors; + return -EINVAL; + } + + return 0; + } + + if (unlikely(ctx->len < ETH_HLEN)) { + pr_debug("%s: short packet %u\n", enet->netdev->name, ctx->len); + ++ctx->stats.length_errors; + return -EINVAL; + } + + return 0; +} + +static struct sk_buff *eea_build_skb(void *buf, u32 buflen, u32 headroom, + u32 len) +{ + struct sk_buff *skb; + + skb = build_skb(buf, buflen); + if (unlikely(!skb)) + return NULL; + + skb_reserve(skb, headroom); + skb_put(skb, len); + + return skb; +} + +static struct sk_buff *eea_rx_build_split_hdr_skb(struct eea_net_rx *rx, + struct eea_rx_ctx *ctx) +{ + struct eea_rx_meta *meta = ctx->meta; + u32 truesize, offset; + struct sk_buff *skb; + struct page *page; + + dma_sync_single_for_cpu(rx->enet->edev->dma_dev, meta->hdr_dma, + ctx->hdr_len, DMA_FROM_DEVICE); + + skb = napi_alloc_skb(rx->napi, ctx->hdr_len); + if (unlikely(!skb)) + return NULL; + + skb_put_data(skb, ctx->meta->hdr_addr, ctx->hdr_len); + + if (ctx->len) { + page = meta->page; + offset = meta->offset + meta->headroom; + + truesize = eea_consume_rx_buffer(rx, meta, + meta->headroom + ctx->len); + + skb_add_rx_frag(skb, 0, page, offset, ctx->len, truesize); + } + + skb_mark_for_recycle(skb); + + return skb; +} + +static struct sk_buff *eea_rx_build_skb(struct eea_net_rx *rx, + struct eea_rx_ctx *ctx) +{ + struct eea_rx_meta *meta = ctx->meta; + u32 shinfo_size, bufsize, truesize; + struct sk_buff *skb; + struct page *page; + void *buf; + + page = meta->page; + + shinfo_size = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + + buf = page_address(page) + meta->offset; + bufsize = meta->headroom + SKB_DATA_ALIGN(ctx->len) + shinfo_size; + + skb = eea_build_skb(buf, bufsize, meta->headroom, ctx->len); + if (unlikely(!skb)) + return NULL; + + truesize = eea_consume_rx_buffer(rx, meta, bufsize); + skb_mark_for_recycle(skb); + + skb->truesize += truesize - bufsize; + + return skb; +} + +static void process_remain_buf(struct eea_net_rx *rx, struct eea_rx_ctx 
*ctx) +{ + struct eea_net *enet = rx->enet; + struct sk_buff *head_skb; + u32 offset, truesize, nr_frags; + struct page *page; + + if (eea_harden_check_overflow(ctx, enet)) + goto err; + + head_skb = rx->pkt.head_skb; + + nr_frags = skb_shinfo(head_skb)->nr_frags; + if (unlikely(nr_frags >= MAX_SKB_FRAGS)) + goto err; + + offset = ctx->meta->offset + ctx->meta->headroom; + page = ctx->meta->page; + truesize = eea_consume_rx_buffer(rx, ctx->meta, + ctx->meta->headroom + ctx->len); + + skb_add_rx_frag(head_skb, nr_frags, page, offset, ctx->len, truesize); + + return; + +err: + dev_kfree_skb(rx->pkt.head_skb); + ++ctx->stats.drops; + rx->pkt.do_drop = true; + rx->pkt.head_skb = NULL; +} + +static void process_first_buf(struct eea_net_rx *rx, struct eea_rx_ctx *ctx) +{ + struct eea_net *enet = rx->enet; + struct sk_buff *skb = NULL; + + if (eea_harden_check_size(ctx, enet)) + goto err; + + rx->pkt.data_valid = ctx->flags & EEA_DESC_F_DATA_VALID; + + if (ctx->hdr_len) + skb = eea_rx_build_split_hdr_skb(rx, ctx); + else + skb = eea_rx_build_skb(rx, ctx); + + if (unlikely(!skb)) + goto err; + + rx->pkt.head_skb = skb; + + return; + +err: + ++ctx->stats.drops; + rx->pkt.do_drop = true; +} + +static void eea_submit_skb(struct eea_net_rx *rx, struct sk_buff *skb, + struct eea_rx_cdesc *desc) +{ + struct eea_net *enet = rx->enet; + + if (rx->pkt.data_valid) + skb->ip_summed = CHECKSUM_UNNECESSARY; + + if (enet->cfg.ts_cfg.rx_filter == HWTSTAMP_FILTER_ALL) + skb_hwtstamps(skb)->hwtstamp = EEA_DESC_TS(desc) + + enet->hw_ts_offset; + + skb_record_rx_queue(skb, rx->index); + skb->protocol = eth_type_trans(skb, enet->netdev); + + napi_gro_receive(rx->napi, skb); +} + +static int eea_rx_desc_to_ctx(struct eea_net_rx *rx, + struct eea_rx_ctx *ctx, + struct eea_rx_cdesc *desc) +{ + u16 id; + + ctx->meta = NULL; + + id = le16_to_cpu(desc->id); + if (unlikely(id >= rx->ering->num)) { + if (net_ratelimit()) + netdev_err(rx->enet->netdev, "rx invalid id %d\n", id); + return -EINVAL; + } 
/* Drain up to @budget packets from the rx completion queue.
 *
 * Every completion descriptor is acknowledged and counted in
 * ctx->stats.descs. @recv only advances when a descriptor without the MORE
 * flag is seen, so a multi-buffer packet counts once. Per-packet state
 * lives in rx->pkt and is cleared at the end of each packet.
 *
 * Returns the number of packets finished, including dropped ones.
 */
static int eea_cleanrx(struct eea_net_rx *rx, int budget,
		       struct eea_rx_ctx *ctx)
{
	struct eea_rx_cdesc *desc;
	struct eea_rx_meta *meta;
	int recv, err;

	for (recv = 0; recv < budget; ) {
		/* NULL means the device has not completed the head slot. */
		desc = eea_ering_cq_get_desc(rx->ering);
		if (!desc)
			break;

		err = eea_rx_desc_to_ctx(rx, ctx, desc);
		if (unlikely(err)) {
			/* ctx->meta is set only when the id was valid and in
			 * use; in that case the meta goes back on the free
			 * list.
			 */
			if (ctx->meta)
				eea_rx_meta_put(rx, ctx->meta);

			if (rx->pkt.head_skb) {
				dev_kfree_skb(rx->pkt.head_skb);
				++ctx->stats.drops;
			}

			/* A hardware error occurred; we are attempting to
			 * mitigate the impact. Subsequent packets may be
			 * corrupted.
			 */
			ctx->more = false;
			goto ack;
		}

		meta = ctx->meta;

		/* An earlier buffer of this packet failed: recycle the meta
		 * without touching the data.
		 */
		if (unlikely(rx->pkt.do_drop))
			goto skip;

		eea_rx_meta_dma_sync_for_cpu(rx, meta, ctx->len);

		rx->pkt.recv_len += ctx->len;
		rx->pkt.recv_len += ctx->hdr_len;

		/* pkt.idx is the buffer's position within the packet. */
		if (!rx->pkt.idx)
			process_first_buf(rx, ctx);
		else
			process_remain_buf(rx, ctx);

		++rx->pkt.idx;

		/* Last buffer and the skb survived: hand it to the stack. */
		if (!ctx->more && rx->pkt.head_skb) {
			eea_submit_skb(rx, rx->pkt.head_skb, desc);
			ctx->stats.bytes += rx->pkt.recv_len;
			++ctx->stats.packets;
		}

skip:
		eea_rx_meta_put(rx, meta);
ack:
		eea_ering_cq_ack_desc(rx->ering, 1);
		++ctx->stats.descs;

		/* End of packet: reset the per-packet state. */
		if (!ctx->more) {
			memset(&rx->pkt, 0, sizeof(rx->pkt));
			++recv;
		}
	}

	return recv;
}
meta->in_use = true; + ++num; + } + + if (num) { + eea_ering_kick(rx->ering); + ++ctx->stats.kicks; + } +} + +static int eea_poll(struct napi_struct *napi, int budget) +{ + struct eea_irq_blk *blk = container_of(napi, struct eea_irq_blk, napi); + struct eea_net_rx *rx = blk->rx; + struct eea_net_tx *tx = &rx->enet->tx[rx->index]; + struct eea_rx_ctx ctx = {}; + bool busy = false; + u32 received; + + busy |= eea_poll_tx(tx, budget) >= budget; + + received = eea_cleanrx(rx, budget, &ctx); + + if (rx->ering->num_free > budget) { + /* Due to the hardware design, there is no notification when + * buffers are exhausted. Therefore, we should proactively + * pre-fill the buffers to avoid starvation. + */ + eea_rx_post(rx, &ctx); + + if (rx->ering->num - rx->ering->num_free < budget) + busy = true; + } + + eea_update_rx_stats(&rx->stats, &ctx.stats); + + busy |= received >= budget; + + if (busy) + return budget; + + if (napi_complete_done(napi, received)) + eea_ering_irq_active(rx->ering, tx->ering); + + return received; +} + +static void eea_free_rx_buffers(struct eea_net_rx *rx, struct eea_net_cfg *cfg) +{ + struct eea_rx_meta *meta; + u32 i; + + if (rx->pkt.head_skb) { + dev_kfree_skb(rx->pkt.head_skb); + rx->pkt.head_skb = NULL; + } + + for (i = 0; i < cfg->rx_ring_depth; ++i) { + meta = &rx->meta[i]; + if (!meta->page) + continue; + + eea_free_rx_buffer(rx, meta, false); + } +} + +static struct page_pool *eea_create_pp(struct eea_net_init_ctx *ctx, u32 idx) +{ + struct page_pool_params pp_params = {0}; + + pp_params.order = 0; + pp_params.flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV; + pp_params.pool_size = ctx->cfg.rx_ring_depth; + pp_params.nid = dev_to_node(ctx->edev->dma_dev); + pp_params.dev = ctx->edev->dma_dev; + pp_params.netdev = ctx->netdev; + pp_params.dma_dir = DMA_FROM_DEVICE; + pp_params.max_len = PAGE_SIZE; + pp_params.queue_idx = idx; + + return page_pool_create(&pp_params); +} + +static void eea_destroy_page_pool(struct eea_net_rx *rx) +{ + if 
(rx->pp) + page_pool_destroy(rx->pp); +} + +void enet_rx_stop(struct eea_net_rx *rx) +{ + if (rx->flags & EEA_ENABLE_F_NAPI) { + rx->flags &= ~EEA_ENABLE_F_NAPI; + + disable_irq(rx->enet->irq_blks[rx->index].irq); + napi_disable(rx->napi); + + page_pool_disable_direct_recycling(rx->pp); + netif_napi_del(rx->napi); + } +} + +void enet_rx_start(struct eea_net_rx *rx) +{ + netif_napi_add(rx->enet->netdev, rx->napi, eea_poll); + + page_pool_enable_direct_recycling(rx->pp, rx->napi); + + napi_enable(rx->napi); + + rx->flags |= EEA_ENABLE_F_NAPI; + + local_bh_disable(); + napi_schedule(rx->napi); + local_bh_enable(); + + enable_irq(rx->enet->irq_blks[rx->index].irq); +} + +/* Maybe called before eea_bind_q_and_cfg. So the cfg must be passed. */ +void eea_free_rx(struct eea_net_rx *rx, struct eea_net_cfg *cfg) +{ + if (!rx) + return; + + if (rx->ering) { + eea_ering_free(rx->ering); + rx->ering = NULL; + } + + if (rx->meta) { + eea_free_rx_buffers(rx, cfg); + eea_free_rx_hdr(rx, cfg); + kvfree(rx->meta); + rx->meta = NULL; + } + + if (rx->pp) { + eea_destroy_page_pool(rx); + rx->pp = NULL; + } + + kfree(rx); +} + +static void eea_rx_meta_init(struct eea_net_rx *rx, u32 num) +{ + struct eea_rx_meta *meta; + int i; + + rx->free = NULL; + + for (i = 0; i < num; ++i) { + meta = &rx->meta[i]; + meta->id = i; + meta->next = rx->free; + rx->free = meta; + } +} + +struct eea_net_rx *eea_alloc_rx(struct eea_net_init_ctx *ctx, u32 idx) +{ + struct eea_ring *ering; + struct eea_net_rx *rx; + int err; + + rx = kzalloc(sizeof(*rx), GFP_KERNEL); + if (!rx) + return rx; + + rx->index = idx; + snprintf(rx->name, sizeof(rx->name), "rx.%u", idx); + + u64_stats_init(&rx->stats.syncp); + + /* ering */ + ering = eea_ering_alloc(idx * 2, ctx->cfg.rx_ring_depth, ctx->edev, + ctx->cfg.rx_sq_desc_size, + ctx->cfg.rx_cq_desc_size, + rx->name); + if (!ering) + goto err_free_rx; + + rx->ering = ering; + + rx->dma_dev = ctx->edev->dma_dev; + + /* meta */ + rx->meta = kvcalloc(ctx->cfg.rx_ring_depth, 
+ sizeof(*rx->meta), GFP_KERNEL); + if (!rx->meta) + goto err_free_rx; + + eea_rx_meta_init(rx, ctx->cfg.rx_ring_depth); + + if (ctx->cfg.split_hdr) { + err = eea_alloc_rx_hdr(ctx, rx); + if (err) + goto err_free_rx; + } + + rx->pp = eea_create_pp(ctx, idx); + if (IS_ERR(rx->pp)) { + err = PTR_ERR(rx->pp); + rx->pp = NULL; + goto err_free_rx; + } + + return rx; + +err_free_rx: + eea_free_rx(rx, &ctx->cfg); + return NULL; +} diff --git a/drivers/net/ethernet/alibaba/eea/eea_tx.c b/drivers/net/ethernet/alibaba/eea/eea_tx.c new file mode 100644 index 000000000000..85fb0e9ca5ba --- /dev/null +++ b/drivers/net/ethernet/alibaba/eea/eea_tx.c @@ -0,0 +1,500 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Driver for Alibaba Elastic Ethernet Adapter. + * + * Copyright (C) 2025 Alibaba Inc. + */ + +#include <net/netdev_queues.h> + +#include "eea_net.h" +#include "eea_pci.h" +#include "eea_ring.h" + +struct eea_sq_free_stats { + u64 packets; + u64 bytes; +}; + +struct eea_tx_meta { + struct eea_tx_meta *next; + + u32 id; + + union { + struct sk_buff *skb; + void *data; + }; + + u32 num; + + dma_addr_t dma_addr; + struct eea_tx_desc *desc; + u32 dma_len; + bool unmap; + bool unmap_single; +}; + +static struct eea_tx_meta *eea_tx_meta_get(struct eea_net_tx *tx) +{ + struct eea_tx_meta *meta; + + if (!tx->free) + return NULL; + + meta = tx->free; + tx->free = meta->next; + + return meta; +} + +static void eea_tx_meta_put_and_unmap(struct eea_net_tx *tx, + struct eea_tx_meta *meta) +{ + struct eea_tx_meta *head; + + head = meta; + + while (true) { + if (meta->unmap) { + if (meta->unmap_single) + dma_unmap_single(tx->dma_dev, meta->dma_addr, + meta->dma_len, DMA_TO_DEVICE); + else + dma_unmap_page(tx->dma_dev, meta->dma_addr, + meta->dma_len, DMA_TO_DEVICE); + } + + if (meta->next) { + meta = meta->next; + continue; + } + + break; + } + + meta->next = tx->free; + tx->free = head; +} + +static void eea_meta_free_xmit(struct eea_net_tx *tx, + struct eea_tx_meta *meta, + int 
budget, + struct eea_tx_cdesc *desc, + struct eea_sq_free_stats *stats) +{ + struct sk_buff *skb = meta->skb; + + if (unlikely((skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP) && desc)) { + struct skb_shared_hwtstamps ts = {}; + + ts.hwtstamp = EEA_DESC_TS(desc) + tx->enet->hw_ts_offset; + skb_tstamp_tx(skb, &ts); + } + + ++stats->packets; + stats->bytes += meta->skb->len; + napi_consume_skb(meta->skb, budget); + + meta->data = NULL; +} + +static int eea_clean_tx(struct eea_net_tx *tx, int budget) +{ + struct eea_sq_free_stats stats = {0}; + struct eea_tx_cdesc *desc; + struct eea_tx_meta *meta; + int desc_n; + u16 id; + + while (stats.packets < budget) { + desc = eea_ering_cq_get_desc(tx->ering); + if (!desc) + break; + + id = le16_to_cpu(desc->id); + if (unlikely(id >= tx->ering->num)) { + if (net_ratelimit()) + netdev_err(tx->enet->netdev, "tx invalid id %d\n", + id); + eea_ering_cq_ack_desc(tx->ering, 1); + continue; + } + + meta = &tx->meta[id]; + + if (meta->data) { + eea_tx_meta_put_and_unmap(tx, meta); + eea_meta_free_xmit(tx, meta, budget, desc, &stats); + desc_n = meta->num; + } else { + if (net_ratelimit()) + netdev_err(tx->enet->netdev, + "tx meta->data is null. 
id %d num: %d\n", + meta->id, meta->num); + desc_n = 1; + } + + eea_ering_cq_ack_desc(tx->ering, desc_n); + } + + if (stats.packets) { + u64_stats_update_begin(&tx->stats.syncp); + u64_stats_add(&tx->stats.bytes, stats.bytes); + u64_stats_add(&tx->stats.packets, stats.packets); + u64_stats_update_end(&tx->stats.syncp); + } + + return stats.packets; +} + +int eea_poll_tx(struct eea_net_tx *tx, int budget) +{ + struct eea_net *enet = tx->enet; + u32 index = tx - enet->tx; + struct netdev_queue *txq; + int num; + + txq = netdev_get_tx_queue(enet->netdev, index); + + __netif_tx_lock(txq, smp_processor_id()); + + num = eea_clean_tx(tx, budget); + + if (netif_tx_queue_stopped(txq) && + tx->ering->num_free >= MAX_SKB_FRAGS + 2) + netif_tx_wake_queue(txq); + + __netif_tx_unlock(txq); + + return num; +} + +static int eea_fill_desc_from_skb(const struct sk_buff *skb, + struct eea_tx_desc *desc) +{ + if (skb_is_gso(skb)) { + struct skb_shared_info *sinfo = skb_shinfo(skb); + + desc->gso_size = cpu_to_le16(sinfo->gso_size); + if (sinfo->gso_type & SKB_GSO_TCPV4) + desc->gso_type = EEA_TX_GSO_TCPV4; + + else if (sinfo->gso_type & SKB_GSO_TCPV6) + desc->gso_type = EEA_TX_GSO_TCPV6; + + else if (sinfo->gso_type & SKB_GSO_UDP_L4) + desc->gso_type = EEA_TX_GSO_UDP_L4; + + else + return -EINVAL; + + if (sinfo->gso_type & SKB_GSO_TCP_ECN) + desc->gso_type |= EEA_TX_GSO_ECN; + } else { + desc->gso_type = EEA_TX_GSO_NONE; + } + + if (skb->ip_summed == CHECKSUM_PARTIAL) { + desc->csum_start = cpu_to_le16(skb_checksum_start_offset(skb)); + desc->csum_offset = cpu_to_le16(skb->csum_offset); + } + + return 0; +} + +static struct eea_tx_meta *__eea_tx_desc_fill(struct eea_net_tx *tx, + struct eea_tx_meta *head_meta, + dma_addr_t addr, u32 data_len, + u32 dma_len, bool last, + void *data, u16 flags, + bool unmap) +{ + struct eea_tx_meta *meta; + struct eea_tx_desc *desc; + + meta = eea_tx_meta_get(tx); + + desc = eea_ering_sq_alloc_desc(tx->ering, meta->id, last, flags); + desc->addr = 
cpu_to_le64(addr); + desc->len = cpu_to_le16(data_len); + + meta->next = NULL; + meta->dma_len = dma_len; + meta->dma_addr = addr; + meta->data = data; + meta->num = 1; + meta->desc = desc; + meta->unmap = unmap; + meta->unmap_single = false; + + if (head_meta) { + meta->next = head_meta->next; + head_meta->next = meta; + ++head_meta->num; + } + + return meta; +} + +static struct eea_tx_meta *eea_tx_desc_fill(struct eea_net_tx *tx, + struct eea_tx_meta *head_meta, + dma_addr_t addr, u32 length, + bool is_last, void *data, u16 flags) +{ + struct eea_tx_meta *meta; + u16 len, last; + + WARN_ON_ONCE(length >= 2 * USHRT_MAX); + + /* Since eea does not support BIG TCP, the maximum GSO size is capped at + * 64KB. Consequently, a single skb buffer (head or fragment) will not + * require more than two descriptors + */ + if (length > USHRT_MAX) { + len = USHRT_MAX; + last = false; + } else { + len = length; + last = is_last; + } + + meta = __eea_tx_desc_fill(tx, head_meta, addr, len, length, + last, data, flags, true); + + if (length > USHRT_MAX) { + if (!head_meta) + head_meta = meta; + + addr += USHRT_MAX; + len = length - USHRT_MAX; + + __eea_tx_desc_fill(tx, head_meta, addr, len, 0, is_last, + NULL, 0, false); + } + + return meta; +} + +static int eea_tx_add_skb_frag(struct eea_net_tx *tx, + struct eea_tx_meta *head_meta, + const skb_frag_t *frag, bool is_last) +{ + u32 len = skb_frag_size(frag); + dma_addr_t addr; + + addr = skb_frag_dma_map(tx->dma_dev, frag, 0, len, DMA_TO_DEVICE); + if (unlikely(dma_mapping_error(tx->dma_dev, addr))) + return -ENOMEM; + + eea_tx_desc_fill(tx, head_meta, addr, len, is_last, NULL, 0); + + return 0; +} + +static int eea_tx_post_skb(struct eea_net_tx *tx, struct sk_buff *skb) +{ + const struct skb_shared_info *shinfo = skb_shinfo(skb); + u32 hlen = skb_headlen(skb); + struct eea_tx_meta *meta; + const skb_frag_t *frag; + dma_addr_t addr; + u32 len = hlen; + int i, err; + u16 flags; + bool last; + + if (len) { + addr = 
dma_map_single(tx->dma_dev, skb->data, len, + DMA_TO_DEVICE); + if (unlikely(dma_mapping_error(tx->dma_dev, addr))) + return -ENOMEM; + + last = !shinfo->nr_frags; + i = 0; + } else { + /* The net stack will never submit an skb with an skb->len of + * 0. If the head len is 0, the number of frags must be greater + * than 0. + */ + frag = &shinfo->frags[0]; + len = skb_frag_size(frag); + + addr = skb_frag_dma_map(tx->dma_dev, frag, 0, len, + DMA_TO_DEVICE); + if (unlikely(dma_mapping_error(tx->dma_dev, addr))) + return -ENOMEM; + + last = shinfo->nr_frags == 1; + i = 1; + } + + flags = skb->ip_summed == CHECKSUM_PARTIAL ? EEA_DESC_F_DO_CSUM : 0; + + meta = eea_tx_desc_fill(tx, NULL, addr, len, last, skb, flags); + meta->unmap_single = !!hlen; + + err = eea_fill_desc_from_skb(skb, meta->desc); + if (err) + goto err_cancel; + + for (; i < shinfo->nr_frags; i++) { + frag = &shinfo->frags[i]; + bool is_last = i == (shinfo->nr_frags - 1); + + err = eea_tx_add_skb_frag(tx, meta, frag, is_last); + if (err) + goto err_cancel; + } + + eea_ering_sq_commit_desc(tx->ering); + + u64_stats_update_begin(&tx->stats.syncp); + u64_stats_add(&tx->stats.descs, meta->num); + u64_stats_update_end(&tx->stats.syncp); + + return 0; + +err_cancel: + eea_ering_sq_cancel(tx->ering); + eea_tx_meta_put_and_unmap(tx, meta); + meta->data = NULL; + return err; +} + +static void eea_tx_kick(struct eea_net_tx *tx) +{ + eea_ering_kick(tx->ering); + + u64_stats_update_begin(&tx->stats.syncp); + u64_stats_inc(&tx->stats.kicks); + u64_stats_update_end(&tx->stats.syncp); +} + +static int eea_tx_check_free_num(struct eea_net_tx *tx, + struct netdev_queue *txq) +{ + int n; + + /* MAX_SKB_FRAGS + 1: Covers the skb linear head and all paged fragments + * 1: Extra slot for a head or fragment that exceeds 64KB. 
+ */ + n = MAX_SKB_FRAGS + 2; + return netif_txq_maybe_stop(txq, tx->ering->num_free, n, n); +} + +netdev_tx_t eea_tx_xmit(struct sk_buff *skb, struct net_device *netdev) +{ + struct eea_net *enet = netdev_priv(netdev); + int qnum = skb_get_queue_mapping(skb); + struct eea_net_tx *tx = &enet->tx[qnum]; + struct netdev_queue *txq; + int err, enable; + + txq = netdev_get_tx_queue(netdev, qnum); + + enable = eea_tx_check_free_num(tx, txq); + if (!enable) + return NETDEV_TX_BUSY; + + err = eea_tx_post_skb(tx, skb); + if (unlikely(err)) { + u64_stats_update_begin(&tx->stats.syncp); + u64_stats_inc(&tx->stats.drops); + u64_stats_update_end(&tx->stats.syncp); + + dev_kfree_skb_any(skb); + } else { + if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP)) + skb_shinfo(skb)->tx_flags |= SKBTX_IN_PROGRESS; + skb_tx_timestamp(skb); + } + + /* NETDEV_TX_BUSY is expensive. So stop advancing the TX queue. */ + eea_tx_check_free_num(tx, txq); + + if (!netdev_xmit_more() || netif_xmit_stopped(txq)) + eea_tx_kick(tx); + + return NETDEV_TX_OK; +} + +static void eea_free_meta(struct eea_net_tx *tx, struct eea_net_cfg *cfg) +{ + struct eea_sq_free_stats stats = {0}; + struct eea_tx_meta *meta; + int i; + + while ((meta = eea_tx_meta_get(tx))) + meta->skb = NULL; + + for (i = 0; i < cfg->tx_ring_depth; i++) { + meta = &tx->meta[i]; + + if (!meta->skb) + continue; + + eea_tx_meta_put_and_unmap(tx, meta); + + eea_meta_free_xmit(tx, meta, 0, NULL, &stats); + } + + kvfree(tx->meta); + tx->meta = NULL; +} + +/* Maybe called before eea_bind_q_and_cfg. So the cfg must be passed. 
*/ +void eea_free_tx(struct eea_net_tx *tx, struct eea_net_cfg *cfg) +{ + if (!tx) + return; + + if (tx->ering) { + eea_ering_free(tx->ering); + tx->ering = NULL; + } + + if (tx->meta) + eea_free_meta(tx, cfg); +} + +int eea_alloc_tx(struct eea_net_init_ctx *ctx, struct eea_net_tx *tx, u32 idx) +{ + struct eea_tx_meta *meta; + struct eea_ring *ering; + u32 i; + + u64_stats_init(&tx->stats.syncp); + + snprintf(tx->name, sizeof(tx->name), "tx.%u", idx); + + ering = eea_ering_alloc(idx * 2 + 1, ctx->cfg.tx_ring_depth, ctx->edev, + ctx->cfg.tx_sq_desc_size, + ctx->cfg.tx_cq_desc_size, + tx->name); + if (!ering) + goto err_free_tx; + + tx->ering = ering; + tx->index = idx; + tx->dma_dev = ctx->edev->dma_dev; + + /* meta */ + tx->meta = kvcalloc(ctx->cfg.tx_ring_depth, + sizeof(*tx->meta), GFP_KERNEL); + if (!tx->meta) + goto err_free_tx; + + for (i = 0; i < ctx->cfg.tx_ring_depth; ++i) { + meta = &tx->meta[i]; + meta->id = i; + meta->next = tx->free; + tx->free = meta; + } + + return 0; + +err_free_tx: + eea_free_tx(tx, &ctx->cfg); + return -ENOMEM; +} |
