summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Jakub Kicinski <kuba@kernel.org> 2026-06-04 03:42:33 +0300
committer Jakub Kicinski <kuba@kernel.org> 2026-06-04 03:42:34 +0300
commit c0192c7ec1fc5fa4ec9793bb460204715e2d9cd3 (patch)
tree a1622430fa26f0cbf3085cd6e4dd92801340ca85
parent dfcc2ff12925d99e858eaf539eaa4aaaf81fe2a6 (diff)
parent 0b1c4495aa007932e9cbd7b45a8037e7b4fe34b0 (diff)
download linux-c0192c7ec1fc5fa4ec9793bb460204715e2d9cd3.tar.xz
Merge branch 'net-mlx5-add-switchdev-mode-support-for-socket-direct-single-netdev-part-1-2'
Tariq Toukan says:

====================
net/mlx5: Add switchdev mode support for Socket Direct single netdev, part 1/2

This series enables Socket Direct single netdev to operate in switchdev mode with shared FDB.

SD single netdev combines multiple PCI functions behind a single netdev interface. To support switchdev offloads, these functions must participate in virtual LAG (shared FDB).

Design

Rather than introducing a separate LAG instance for SD, this series integrates SD secondary devices into the existing LAG structure (priv.lag) created at probe time. Each lag_func entry carries a group_id field that identifies its SD group membership (0 means not part of any SD group).

An xarray mark (XA_MARK_PORT) distinguishes physical port entries from SD secondaries, enabling a single unified iterator that filters by group:

- MLX5_LAG_FILTER_PORTS: iterate port-level entries only (existing behavior, used by bonding, FW LAG commands, v2p_map)
- MLX5_LAG_FILTER_ALL: iterate all devices including SD secondaries (used by MPESW shared FDB across all devices)
- specific group_id: iterate only devices in that SD group (used by per-group SD shared FDB operations)

Existing callers use mlx5_ldev_for_each(), which maps to MLX5_LAG_FILTER_PORTS, preserving current behavior for non-SD configurations.

Lifecycle and ownership

The SD LAG lifecycle is tied to the SD group, not to bonding events:

1. At PCI probe, mlx5_lag_add_mdev() creates the LAG structure (priv.lag) for each LAG-capable PF, e.g., SD primary devices.

2. During mlx5_sd_init(), after the SD group is fully formed (primary and secondaries paired), sd_lag_init() registers the secondary devices into the primary's existing priv.lag by calling mlx5_ldev_add_mdev() with the SD group_id. The primary's lag_func also gets its group_id set. No separate LAG instance is created.

3. After all the devices in the SD group transition to switchdev, mlx5_lag_shared_fdb_create() is invoked with the group_id to create a software-only shared FDB scoped to that SD group. This sets sd_fdb_active on all lag_func entries in the group. No FW LAG commands are issued since SD devices share the same physical port.

4. If MPESW (multi-port eswitch) is enabled on top of SD groups, the per-group SD shared FDB is torn down first, then MPESW shared FDB is created spanning all devices (ports + SD secondaries) using MLX5_LAG_FILTER_ALL. On MPESW disable, per-group SD shared FDB is restored.

5. On SD teardown (mlx5_sd_cleanup or device unbind), sd_lag_cleanup() removes secondaries from priv.lag and clears the primary's group_id. The LAG structure itself is not destroyed.

The sd_fdb_active flag is set on all lag_func entries in a group (not just the primary), so any device can detect the SD shared FDB state during lag_disable_change teardown without needing to look up peer entries.

SD shared FDB is a pure software construct -- unlike regular LAG modes (ROCE, SRIOV, MPESW), it does not issue FW create_lag/destroy_lag commands. The software vport LAG for SD is implemented via eswitch egress ACL bounce rules, managed by the IB layer through mlx5_eth_lag_init(). The software LAG demux is implemented via steering rules that utilize a new destination, VHCA_RX.

Patches

Infrastructure (patches 1, 5-6):
- Factor out shared FDB code into a dedicated file
- Extend lag_func with group_id and sd_fdb_active fields; add XA_MARK_PORT and unified iterator with group_id filter
- Extend shared FDB API with group_id parameter

E-Switch preparation (patches 2-3):
- Align eswitch disable sequence ordering
- Move devcom init from TC to eswitch layer

SD group management (patches 4, 7-9):
- Replace peer count check with direct peer lookup
- Register SD secondaries in the existing LAG at SD init time
- Block RoCE and VF LAG for SD devices
- Block multipath LAG for SD devices

Switchdev integration (patch 10):
- Keep netdev resources local in switchdev mode

Steering (patches 11-12):
- Track peer flow slots with bitmap for selective peer flow deletion
- Enable TC flow steering for SD LAG

Enablement (patch 13):
- Verify unique vhca_id count for cross-VHCA RQT
====================

Link: https://patch.msgid.link/20260531113954.395443-1-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
-rw-r--r-- drivers/net/ethernet/mellanox/mlx5/core/Makefile | 2
-rw-r--r-- drivers/net/ethernet/mellanox/mlx5/core/en/rqt.c | 27
-rw-r--r-- drivers/net/ethernet/mellanox/mlx5/core/en/tc_priv.h | 7
-rw-r--r-- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 83
-rw-r--r-- drivers/net/ethernet/mellanox/mlx5/core/eswitch.h | 11
-rw-r--r-- drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c | 26
-rw-r--r-- drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c | 429
-rw-r--r-- drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h | 100
-rw-r--r-- drivers/net/ethernet/mellanox/mlx5/core/lag/mp.c | 4
-rw-r--r-- drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.c | 28
-rw-r--r-- drivers/net/ethernet/mellanox/mlx5/core/lag/shared_fdb.c | 235
-rw-r--r-- drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c | 227
-rw-r--r-- drivers/net/ethernet/mellanox/mlx5/core/lib/sd.h | 23
-rw-r--r-- drivers/net/ethernet/mellanox/mlx5/core/main.c | 3
14 files changed, 916 insertions, 289 deletions
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
index d39fe9c4a87c..19e50f0d55af 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
@@ -41,7 +41,7 @@ mlx5_core-$(CONFIG_MLX5_CORE_EN_DCB) += en_dcbnl.o en/port_buffer.o
mlx5_core-$(CONFIG_PCI_HYPERV_INTERFACE) += en/hv_vhca_stats.o
mlx5_core-$(CONFIG_MLX5_ESWITCH) += lag/mp.o lag/port_sel.o lib/geneve.o lib/port_tun.o \
en_rep.o en/rep/bond.o en/mod_hdr.o \
- en/mapping.o lag/mpesw.o
+ en/mapping.o lag/mpesw.o lag/shared_fdb.o
mlx5_core-$(CONFIG_MLX5_CLS_ACT) += en_tc.o en/rep/tc.o en/rep/neigh.o \
lib/fs_chains.o en/tc_tun.o \
esw/indir_table.o en/tc_tun_encap.o \
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/rqt.c b/drivers/net/ethernet/mellanox/mlx5/core/en/rqt.c
index a3382f6a6b74..8511363f7bec 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/rqt.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/rqt.c
@@ -8,13 +8,28 @@ static bool verify_num_vhca_ids(struct mlx5_core_dev *mdev, u32 *vhca_ids,
unsigned int size)
{
unsigned int max_num_vhca_id = MLX5_CAP_GEN_2(mdev, max_rqt_vhca_id);
- int i;
+ unsigned int unique_count = 0;
+ int i, j;
+
+ /* Count unique vhca_ids */
+ for (i = 0; i < size; i++) {
+ bool is_unique = true;
+
+ /* Check if vhca_ids[i] was already seen */
+ for (j = 0; j < i; j++) {
+ if (vhca_ids[j] == vhca_ids[i]) {
+ is_unique = false;
+ break;
+ }
+ }
+ if (is_unique)
+ unique_count++;
+ }
- /* Verify that all vhca_ids are in range [0, max_num_vhca_ids - 1] */
- for (i = 0; i < size; i++)
- if (vhca_ids[i] >= max_num_vhca_id)
- return false;
- return true;
+ /* Verify that number of unique vhca_ids doesn't exceed
+ * max_num_vhca_id
+ */
+ return unique_count <= max_num_vhca_id;
}
static bool rqt_verify_vhca_ids(struct mlx5_core_dev *mdev, u32 *vhca_ids,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_priv.h b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_priv.h
index efb34de4cb7a..28cab4bf525c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_priv.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_priv.h
@@ -97,10 +97,17 @@ struct mlx5e_tc_flow {
struct mlx5e_hairpin_entry *hpe; /* attached hairpin instance */
struct list_head hairpin; /* flows sharing the same hairpin */
struct list_head peer[MLX5_MAX_PORTS]; /* flows with peer flow */
+ DECLARE_BITMAP(peer_used, MLX5_MAX_PORTS); /* tracks populated peer
+ * slots
+ */
struct list_head unready; /* flows not ready to be offloaded (e.g
* due to missing route)
*/
struct list_head peer_flows; /* flows on peer */
+ int peer_index; /* peer-flow index pinned at add time, used at del
+ * time so removal is independent of LAG state
+ * changes between add and del.
+ */
struct net_device *orig_dev; /* netdev adding flow first */
int tmp_entry_index;
struct list_head tmp_list; /* temporary flow list used by neigh update */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index a9001d1c902f..910492eb51f2 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -71,6 +71,7 @@
#include <asm/div64.h>
#include "lag/lag.h"
#include "lag/mp.h"
+#include "lib/sd.h"
#define MLX5E_TC_TABLE_NUM_GROUPS 4
#define MLX5E_TC_TABLE_MAX_GROUP_SIZE BIT(18)
@@ -2128,10 +2129,11 @@ static void mlx5e_tc_del_fdb_peer_flow(struct mlx5e_tc_flow *flow,
mutex_lock(&esw->offloads.peer_mutex);
list_del(&flow->peer[peer_index]);
+ clear_bit(peer_index, flow->peer_used);
mutex_unlock(&esw->offloads.peer_mutex);
list_for_each_entry_safe(peer_flow, tmp, &flow->peer_flows, peer_flows) {
- if (peer_index != mlx5_lag_get_dev_seq(peer_flow->priv->mdev))
+ if (peer_index != peer_flow->peer_index)
continue;
list_del(&peer_flow->peer_flows);
@@ -2147,16 +2149,10 @@ static void mlx5e_tc_del_fdb_peer_flow(struct mlx5e_tc_flow *flow,
static void mlx5e_tc_del_fdb_peers_flow(struct mlx5e_tc_flow *flow)
{
- struct mlx5_devcom_comp_dev *devcom;
- struct mlx5_devcom_comp_dev *pos;
- struct mlx5_eswitch *peer_esw;
int i;
- devcom = flow->priv->mdev->priv.eswitch->devcom;
- mlx5_devcom_for_each_peer_entry(devcom, peer_esw, pos) {
- i = mlx5_lag_get_dev_seq(peer_esw->dev);
+ for_each_set_bit(i, flow->peer_used, MLX5_MAX_PORTS)
mlx5e_tc_del_fdb_peer_flow(flow, i);
- }
}
static void mlx5e_tc_del_flow(struct mlx5e_priv *priv,
@@ -4201,9 +4197,26 @@ static bool is_lag_dev(struct mlx5e_priv *priv,
same_hw_reps(priv, peer_netdev));
}
+static bool is_sd_eligible(struct mlx5e_priv *priv,
+ struct net_device *peer_netdev)
+{
+ struct mlx5e_priv *peer_priv;
+
+ peer_priv = netdev_priv(peer_netdev);
+ return same_hw_reps(priv, peer_netdev) &&
+ mlx5_lag_is_sd(priv->mdev) &&
+ (mlx5_sd_get_primary(priv->mdev) ==
+ mlx5_sd_get_primary(peer_priv->mdev));
+}
+
static bool is_multiport_eligible(struct mlx5e_priv *priv, struct net_device *out_dev)
{
- return same_hw_reps(priv, out_dev) && mlx5_lag_is_mpesw(priv->mdev);
+ struct mlx5_core_dev *primary = mlx5_sd_get_primary(priv->mdev);
+
+ if (!primary)
+ return false;
+
+ return same_hw_reps(priv, out_dev) && mlx5_lag_is_mpesw(primary);
}
bool mlx5e_is_valid_eswitch_fwd_dev(struct mlx5e_priv *priv,
@@ -4212,6 +4225,9 @@ bool mlx5e_is_valid_eswitch_fwd_dev(struct mlx5e_priv *priv,
if (is_merged_eswitch_vfs(priv, out_dev))
return true;
+ if (is_sd_eligible(priv, out_dev))
+ return true;
+
if (is_multiport_eligible(priv, out_dev))
return true;
@@ -4356,7 +4372,7 @@ static struct rhashtable *get_tc_ht(struct mlx5e_priv *priv,
return &tc->ht;
}
-static bool is_peer_flow_needed(struct mlx5e_tc_flow *flow)
+static bool is_peer_flow_needed(struct mlx5e_tc_flow *flow, bool *is_sd)
{
struct mlx5_esw_flow_attr *esw_attr = flow->attr->esw_attr;
struct mlx5_flow_attr *attr = flow->attr;
@@ -4377,6 +4393,13 @@ static bool is_peer_flow_needed(struct mlx5e_tc_flow *flow)
if (mlx5_lag_is_mpesw(esw_attr->in_mdev))
return true;
+ if (mlx5_lag_is_sd(esw_attr->in_mdev) &&
+ !mlx5_sd_is_primary(esw_attr->in_mdev)) {
+ if (!mlx5_lag_is_mpesw(mlx5_sd_get_primary(esw_attr->in_mdev)))
+ *is_sd = true;
+ return true;
+ }
+
return false;
}
@@ -4614,10 +4637,12 @@ static int mlx5e_tc_add_fdb_peer_flow(struct flow_cls_offload *f,
goto out;
}
+ peer_flow->peer_index = i;
list_add_tail(&peer_flow->peer_flows, &flow->peer_flows);
flow_flag_set(flow, DUP);
mutex_lock(&esw->offloads.peer_mutex);
list_add_tail(&flow->peer[i], &esw->offloads.peer_flows[i]);
+ set_bit(i, flow->peer_used);
mutex_unlock(&esw->offloads.peer_mutex);
out:
@@ -4632,19 +4657,26 @@ mlx5e_add_fdb_flow(struct mlx5e_priv *priv,
struct mlx5e_tc_flow **__flow)
{
struct mlx5_devcom_comp_dev *devcom = priv->mdev->priv.eswitch->devcom, *pos;
+ struct netlink_ext_ack *extack = f->common.extack;
struct mlx5e_rep_priv *rpriv = priv->ppriv;
struct mlx5_eswitch_rep *in_rep = rpriv->rep;
struct mlx5_core_dev *in_mdev = priv->mdev;
struct mlx5_eswitch *peer_esw;
struct mlx5e_tc_flow *flow;
+ bool is_sd = false;
int err;
+ if (mlx5_lag_is_sd(in_mdev) && !mlx5_lag_is_active(in_mdev)) {
+ NL_SET_ERR_MSG_MOD(extack, "SD shared FDB not yet active");
+ return -EOPNOTSUPP;
+ }
+
flow = __mlx5e_add_fdb_flow(priv, f, flow_flags, filter_dev, in_rep,
in_mdev);
if (IS_ERR(flow))
return PTR_ERR(flow);
- if (!is_peer_flow_needed(flow)) {
+ if (!is_peer_flow_needed(flow, &is_sd)) {
*__flow = flow;
return 0;
}
@@ -4655,6 +4687,15 @@ mlx5e_add_fdb_flow(struct mlx5e_priv *priv,
}
mlx5_devcom_for_each_peer_entry(devcom, peer_esw, pos) {
+ if (is_sd) {
+ /* SD shared FDB: only the matching SD primary. */
+ if (mlx5_sd_get_primary(in_mdev) !=
+ mlx5_sd_get_primary(peer_esw->dev))
+ continue;
+ } else {
+ if (!mlx5_sd_is_primary(peer_esw->dev))
+ continue;
+ }
err = mlx5e_tc_add_fdb_peer_flow(f, flow, flow_flags, peer_esw);
if (err)
goto peer_clean;
@@ -5394,8 +5435,6 @@ int mlx5e_tc_esw_init(struct mlx5_rep_uplink_priv *uplink_priv)
{
const size_t sz_enc_opts = sizeof(struct tunnel_match_enc_opts);
u8 mapping_id[MLX5_SW_IMAGE_GUID_MAX_BYTES];
- struct mlx5_devcom_match_attr attr = {};
- struct netdev_phys_item_id ppid;
struct mlx5e_rep_priv *rpriv;
struct mapping_ctx *mapping;
struct mlx5_eswitch *esw;
@@ -5456,14 +5495,6 @@ int mlx5e_tc_esw_init(struct mlx5_rep_uplink_priv *uplink_priv)
goto err_action_counter;
}
- err = netif_get_port_parent_id(priv->netdev, &ppid, false);
- if (!err) {
- memcpy(&attr.key.buf, &ppid.id, ppid.id_len);
- attr.flags = MLX5_DEVCOM_MATCH_FLAGS_NS;
- attr.net = mlx5_core_net(esw->dev);
- mlx5_esw_offloads_devcom_init(esw, &attr);
- }
-
return 0;
err_action_counter:
@@ -5484,16 +5515,6 @@ err_tun_mapping:
void mlx5e_tc_esw_cleanup(struct mlx5_rep_uplink_priv *uplink_priv)
{
- struct mlx5e_rep_priv *rpriv;
- struct mlx5_eswitch *esw;
- struct mlx5e_priv *priv;
-
- rpriv = container_of(uplink_priv, struct mlx5e_rep_priv, uplink_priv);
- priv = netdev_priv(rpriv->netdev);
- esw = priv->mdev->priv.eswitch;
-
- mlx5_esw_offloads_devcom_cleanup(esw);
-
mlx5e_tc_tun_cleanup(uplink_priv->encap);
mapping_destroy(uplink_priv->tunnel_enc_opts_mapping);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index 8a94c38f8566..94a530d19828 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -955,6 +955,8 @@ int mlx5_eswitch_offloads_single_fdb_add_one(struct mlx5_eswitch *master_esw,
void mlx5_eswitch_offloads_single_fdb_del_one(struct mlx5_eswitch *master_esw,
struct mlx5_eswitch *slave_esw);
int mlx5_eswitch_reload_ib_reps(struct mlx5_eswitch *esw);
+bool mlx5_eswitch_is_peer(struct mlx5_eswitch *esw,
+ struct mlx5_eswitch *peer_esw);
bool mlx5_eswitch_block_encap(struct mlx5_core_dev *dev, bool from_fdb);
void mlx5_eswitch_unblock_encap(struct mlx5_core_dev *dev);
@@ -970,13 +972,6 @@ static inline int mlx5_eswitch_num_vfs(struct mlx5_eswitch *esw)
return 0;
}
-static inline int mlx5_eswitch_get_npeers(struct mlx5_eswitch *esw)
-{
- if (mlx5_esw_allowed(esw))
- return esw->num_peers;
- return 0;
-}
-
static inline struct mlx5_flow_table *
mlx5_eswitch_get_slow_fdb(struct mlx5_eswitch *esw)
{
@@ -1058,8 +1053,6 @@ static inline void
mlx5_eswitch_offloads_single_fdb_del_one(struct mlx5_eswitch *master_esw,
struct mlx5_eswitch *slave_esw) {}
-static inline int mlx5_eswitch_get_npeers(struct mlx5_eswitch *esw) { return 0; }
-
static inline int
mlx5_eswitch_reload_ib_reps(struct mlx5_eswitch *esw)
{
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index 189be11c4c39..830fc910a080 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -3296,6 +3296,18 @@ static int mlx5_esw_offloads_set_ns_peer(struct mlx5_eswitch *esw,
return 0;
}
+bool mlx5_eswitch_is_peer(struct mlx5_eswitch *esw,
+ struct mlx5_eswitch *peer_esw)
+{
+ u16 peer_esw_i;
+
+ if (!mlx5_esw_allowed(esw) || !mlx5_esw_allowed(peer_esw))
+ return false;
+
+ peer_esw_i = MLX5_CAP_GEN(peer_esw->dev, vhca_id);
+ return !!xa_load(&esw->paired, peer_esw_i);
+}
+
static int mlx5_esw_offloads_devcom_event(int event,
void *my_data,
void *event_data)
@@ -3866,6 +3878,7 @@ bool mlx5_esw_offloads_controller_valid(const struct mlx5_eswitch *esw, u32 cont
int esw_offloads_enable(struct mlx5_eswitch *esw)
{
u8 mapping_id[MLX5_SW_IMAGE_GUID_MAX_BYTES];
+ struct mlx5_devcom_match_attr attr = {};
struct mapping_ctx *reg_c0_obj_pool;
struct mlx5_vport *vport;
unsigned long i;
@@ -3926,6 +3939,10 @@ int esw_offloads_enable(struct mlx5_eswitch *esw)
if (err)
goto err_vports;
+ memcpy(attr.key.buf, mapping_id, id_len);
+ attr.flags = MLX5_DEVCOM_MATCH_FLAGS_NS;
+ attr.net = mlx5_core_net(esw->dev);
+ mlx5_esw_offloads_devcom_init(esw, &attr);
return 0;
err_vports:
@@ -3970,6 +3987,7 @@ static int esw_offloads_stop(struct mlx5_eswitch *esw,
void esw_offloads_disable(struct mlx5_eswitch *esw)
{
+ mlx5_esw_offloads_devcom_cleanup(esw);
mlx5_eswitch_disable_pf_vf_vports(esw);
mlx5_esw_offloads_rep_unload(esw, MLX5_VPORT_UPLINK);
esw_set_passing_vport_metadata(esw, false);
@@ -4672,8 +4690,11 @@ EXPORT_SYMBOL(mlx5_eswitch_unregister_vport_reps_nested);
void *mlx5_eswitch_get_uplink_priv(struct mlx5_eswitch *esw, u8 rep_type)
{
+ struct mlx5_core_dev *primary = mlx5_sd_get_primary(esw->dev);
struct mlx5_eswitch_rep *rep;
+ if (primary)
+ esw = primary->priv.eswitch;
rep = mlx5_eswitch_get_rep(esw, MLX5_VPORT_UPLINK);
return rep->rep_data[rep_type].priv;
}
@@ -4695,6 +4716,11 @@ EXPORT_SYMBOL(mlx5_eswitch_get_proto_dev);
void *mlx5_eswitch_uplink_get_proto_dev(struct mlx5_eswitch *esw, u8 rep_type)
{
+ struct mlx5_core_dev *primary = mlx5_sd_get_primary(esw->dev);
+
+ if (primary)
+ esw = primary->priv.eswitch;
+
return mlx5_eswitch_get_proto_dev(esw, MLX5_VPORT_UPLINK, rep_type);
}
EXPORT_SYMBOL(mlx5_eswitch_uplink_get_proto_dev);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c
index 22b7efea34b8..dd3f18f85466 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c
@@ -242,7 +242,7 @@ static void mlx5_ldev_free(struct kref *ref)
unregister_netdevice_notifier_net(net, &ldev->nb);
}
- mlx5_ldev_for_each(i, 0, ldev) {
+ mlx5_lag_for_each(i, 0, ldev, MLX5_LAG_FILTER_ALL) {
pf = mlx5_lag_pf(ldev, i);
if (pf->port_change_nb.nb.notifier_call) {
struct mlx5_nb *nb = &pf->port_change_nb;
@@ -293,11 +293,14 @@ static struct mlx5_lag *mlx5_lag_dev_alloc(struct mlx5_core_dev *dev)
INIT_DELAYED_WORK(&ldev->bond_work, mlx5_do_bond_work);
INIT_WORK(&ldev->speed_update_work, mlx5_mpesw_speed_update_work);
- ldev->nb.notifier_call = mlx5_lag_netdev_event;
- write_pnet(&ldev->net, mlx5_core_net(dev));
- if (register_netdevice_notifier_net(read_pnet(&ldev->net), &ldev->nb)) {
- ldev->nb.notifier_call = NULL;
- mlx5_core_err(dev, "Failed to register LAG netdev notifier\n");
+ if (!mlx5_sd_is_supported(dev)) {
+ ldev->nb.notifier_call = mlx5_lag_netdev_event;
+ write_pnet(&ldev->net, mlx5_core_net(dev));
+ if (register_netdevice_notifier_net(read_pnet(&ldev->net),
+ &ldev->nb)) {
+ ldev->nb.notifier_call = NULL;
+ mlx5_core_err(dev, "Failed to register LAG netdev notifier\n");
+ }
}
ldev->mode = MLX5_LAG_MODE_NONE;
@@ -370,6 +373,22 @@ int mlx5_lag_get_dev_index_by_seq(struct mlx5_lag *ldev, int seq)
return -ENOENT;
}
+/* Return the appropriate iterator filter for a device in LAG:
+ * - SD shared FDB active: iterate only the device's SD group
+ * - SD group exists but shared FDB not active: iterate all devices
+ * - No SD: iterate ports only
+ */
+static u32 mlx5_lag_get_filter(struct mlx5_lag *ldev, struct mlx5_core_dev *dev)
+{
+ struct lag_func *pf = mlx5_lag_pf_by_dev(ldev, dev);
+
+ if (pf && pf->sd_fdb_active)
+ return pf->group_id;
+ if (pf && pf->group_id)
+ return MLX5_LAG_FILTER_ALL;
+ return MLX5_LAG_FILTER_PORTS;
+}
+
/* Reverse of mlx5_lag_get_dev_index_by_seq: given a device, return its
* sequence number in the LAG. Master is always 0, others numbered
* sequentially starting from 1.
@@ -379,11 +398,13 @@ int mlx5_lag_get_dev_seq(struct mlx5_core_dev *dev)
struct mlx5_lag *ldev = mlx5_lag_dev(dev);
int master_idx, i, num = 1;
struct lag_func *pf;
+ u32 filter;
if (!ldev)
return -ENOENT;
- master_idx = mlx5_lag_get_master_idx(ldev);
+ filter = mlx5_lag_get_filter(ldev, dev);
+ master_idx = mlx5_lag_get_dev_index_by_seq_filter(ldev, 0, filter);
if (master_idx < 0)
return -ENOENT;
@@ -391,7 +412,7 @@ int mlx5_lag_get_dev_seq(struct mlx5_core_dev *dev)
if (pf && pf->dev == dev)
return 0;
- mlx5_ldev_for_each(i, 0, ldev) {
+ mlx5_lag_for_each(i, 0, ldev, filter) {
if (i == master_idx)
continue;
pf = mlx5_lag_pf(ldev, i);
@@ -403,6 +424,69 @@ int mlx5_lag_get_dev_seq(struct mlx5_core_dev *dev)
}
EXPORT_SYMBOL(mlx5_lag_get_dev_seq);
+/* seq 0 = master, then all remaining devices */
+static int mlx5_lag_get_dev_index_by_seq_all(struct mlx5_lag *ldev, int seq)
+{
+ int master_idx, i, num = 0;
+
+ master_idx = mlx5_lag_get_master_idx(ldev);
+
+ if (master_idx >= 0) {
+ if (seq == 0)
+ return master_idx;
+ num++;
+ }
+
+ mlx5_lag_for_each(i, 0, ldev, MLX5_LAG_FILTER_ALL) {
+ if (i == master_idx)
+ continue;
+ if (num == seq)
+ return i;
+ num++;
+ }
+ return -ENOENT;
+}
+
+/* From group POV, port-marked entry is the lag master */
+static int mlx5_lag_get_dev_index_by_seq_group(struct mlx5_lag *ldev, int seq,
+ u32 group_id)
+{
+ int i, num = 0;
+
+ mlx5_lag_for_each(i, 0, ldev, group_id) {
+ if (xa_get_mark(&ldev->pfs, i, MLX5_LAG_XA_MARK_PORT)) {
+ if (seq == 0)
+ return i;
+ num++;
+ break;
+ }
+ }
+
+ mlx5_lag_for_each(i, 0, ldev, group_id) {
+ if (xa_get_mark(&ldev->pfs, i, MLX5_LAG_XA_MARK_PORT))
+ continue;
+ if (num == seq)
+ return i;
+ num++;
+ }
+ return -ENOENT;
+}
+
+int mlx5_lag_get_dev_index_by_seq_filter(struct mlx5_lag *ldev, int seq,
+ u32 filter)
+{
+ if (!ldev)
+ return -ENOENT;
+
+ if (!filter || filter == MLX5_LAG_FILTER_PORTS)
+ return mlx5_lag_get_dev_index_by_seq(ldev, seq);
+
+ if (filter == MLX5_LAG_FILTER_ALL)
+ return mlx5_lag_get_dev_index_by_seq_all(ldev, seq);
+
+ return mlx5_lag_get_dev_index_by_seq_group(ldev, seq, filter);
+}
+
/* Devcom events for LAG master marking */
#define LAG_DEVCOM_PAIR (0)
#define LAG_DEVCOM_UNPAIR (1)
@@ -512,6 +596,14 @@ static bool __mlx5_lag_is_sriov(struct mlx5_lag *ldev)
return ldev->mode == MLX5_LAG_MODE_SRIOV;
}
+static bool __mlx5_lag_is_sd_active(struct mlx5_lag *ldev,
+ struct mlx5_core_dev *dev)
+{
+ struct lag_func *pf = mlx5_lag_pf_by_dev(ldev, dev);
+
+ return pf && pf->sd_fdb_active;
+}
+
/* Create a mapping between steering slots and active ports.
* As we have ldev->buckets slots per port first assume the native
* mapping should be used.
@@ -817,43 +909,6 @@ char *mlx5_get_str_port_sel_mode(enum mlx5_lag_mode mode, unsigned long flags)
}
}
-static int mlx5_lag_create_single_fdb(struct mlx5_lag *ldev)
-{
- int master_idx = mlx5_lag_get_dev_index_by_seq(ldev, MLX5_LAG_P1);
- struct mlx5_eswitch *master_esw;
- struct mlx5_core_dev *dev0;
- int i, j;
- int err;
-
- if (master_idx < 0)
- return -EINVAL;
-
- dev0 = mlx5_lag_pf(ldev, master_idx)->dev;
- master_esw = dev0->priv.eswitch;
- mlx5_ldev_for_each(i, 0, ldev) {
- struct mlx5_eswitch *slave_esw;
-
- if (i == master_idx)
- continue;
-
- slave_esw = mlx5_lag_pf(ldev, i)->dev->priv.eswitch;
-
- err = mlx5_eswitch_offloads_single_fdb_add_one(master_esw,
- slave_esw, ldev->ports);
- if (err)
- goto err;
- }
- return 0;
-err:
- mlx5_ldev_for_each_reverse(j, i, 0, ldev) {
- if (j == master_idx)
- continue;
- mlx5_eswitch_offloads_single_fdb_del_one(master_esw,
- mlx5_lag_pf(ldev, j)->dev->priv.eswitch);
- }
- return err;
-}
-
static int mlx5_create_lag(struct mlx5_lag *ldev,
struct lag_tracker *tracker,
enum mlx5_lag_mode mode,
@@ -964,27 +1019,19 @@ int mlx5_deactivate_lag(struct mlx5_lag *ldev)
u32 in[MLX5_ST_SZ_DW(destroy_lag_in)] = {};
bool roce_lag = __mlx5_lag_is_roce(ldev);
unsigned long flags = ldev->mode_flags;
- struct mlx5_eswitch *master_esw;
struct mlx5_core_dev *dev0;
int err;
- int i;
if (master_idx < 0)
return -EINVAL;
dev0 = mlx5_lag_pf(ldev, master_idx)->dev;
- master_esw = dev0->priv.eswitch;
ldev->mode = MLX5_LAG_MODE_NONE;
ldev->mode_flags = 0;
mlx5_lag_mp_reset(ldev);
if (test_bit(MLX5_LAG_MODE_FLAG_SHARED_FDB, &flags)) {
- mlx5_ldev_for_each(i, 0, ldev) {
- if (i == master_idx)
- continue;
- mlx5_eswitch_offloads_single_fdb_del_one(master_esw,
- mlx5_lag_pf(ldev, i)->dev->priv.eswitch);
- }
+ mlx5_lag_destroy_single_fdb(ldev);
clear_bit(MLX5_LAG_MODE_FLAG_SHARED_FDB, &flags);
}
@@ -1063,7 +1110,7 @@ bool mlx5_lag_check_prereq(struct mlx5_lag *ldev)
return true;
}
-static void mlx5_lag_assert_locked_transition(struct mlx5_lag *ldev)
+static void mlx5_lag_assert_locked_transition(struct mlx5_lag *ldev, u32 filter)
{
struct mlx5_devcom_comp_dev *devcom = NULL;
struct lag_func *pf;
@@ -1071,17 +1118,21 @@ static void mlx5_lag_assert_locked_transition(struct mlx5_lag *ldev)
lockdep_assert_held(&ldev->lock);
- i = mlx5_get_next_ldev_func(ldev, 0);
+ i = mlx5_get_next_lag_func(ldev, 0, filter);
if (i < MLX5_MAX_PORTS) {
pf = mlx5_lag_pf(ldev, i);
- devcom = pf->dev->priv.hca_devcom_comp;
+ if (filter == MLX5_LAG_FILTER_PORTS ||
+ filter == MLX5_LAG_FILTER_ALL)
+ devcom = pf->dev->priv.hca_devcom_comp;
+ else
+ devcom = mlx5_sd_get_devcom(pf->dev);
}
mlx5_devcom_comp_assert_locked(devcom);
}
-static void mlx5_lag_drop_lock_for_reps(struct mlx5_lag *ldev)
+static void mlx5_lag_drop_lock_for_reps(struct mlx5_lag *ldev, u32 filter)
{
- mlx5_lag_assert_locked_transition(ldev);
+ mlx5_lag_assert_locked_transition(ldev, filter);
/* Keep PF membership stable while ldev->lock is dropped. Device add
* and remove paths observe mode_changes_in_progress and retry.
@@ -1112,21 +1163,22 @@ void mlx5_lag_rescan_dev_locked(struct mlx5_lag *ldev,
* callbacks and take reps_lock. Drop ldev->lock so the only ordering
* remains reps_lock -> ldev->lock from representor callbacks.
*/
- mlx5_lag_drop_lock_for_reps(ldev);
+ mlx5_lag_drop_lock_for_reps(ldev, mlx5_lag_get_filter(ldev, dev));
mlx5_rescan_drivers_locked(dev);
mlx5_lag_retake_lock_after_reps(ldev);
}
-static void mlx5_lag_rescan_devices_locked(struct mlx5_lag *ldev, bool enable)
+static void mlx5_lag_rescan_devices_locked_filter(struct mlx5_lag *ldev,
+ bool enable, u32 filter)
{
struct mlx5_core_dev *devs[MLX5_MAX_PORTS];
struct lag_func *pf;
int num_devs = 0;
int i;
- mlx5_lag_assert_locked_transition(ldev);
+ mlx5_lag_assert_locked_transition(ldev, filter);
- mlx5_ldev_for_each(i, 0, ldev) {
+ mlx5_lag_for_each(i, 0, ldev, filter) {
pf = mlx5_lag_pf(ldev, i);
if (pf->dev->priv.flags & MLX5_PRIV_FLAGS_DISABLE_ALL_ADEV)
continue;
@@ -1138,30 +1190,40 @@ static void mlx5_lag_rescan_devices_locked(struct mlx5_lag *ldev, bool enable)
devs[num_devs++] = pf->dev;
}
- mlx5_lag_drop_lock_for_reps(ldev);
+ mlx5_lag_drop_lock_for_reps(ldev, filter);
for (i = 0; i < num_devs; i++)
mlx5_rescan_drivers_locked(devs[i]);
mlx5_lag_retake_lock_after_reps(ldev);
}
+void mlx5_lag_add_devices_filter(struct mlx5_lag *ldev, u32 filter)
+{
+ mlx5_lag_rescan_devices_locked_filter(ldev, true, filter);
+}
+
void mlx5_lag_add_devices(struct mlx5_lag *ldev)
{
- mlx5_lag_rescan_devices_locked(ldev, true);
+ mlx5_lag_add_devices_filter(ldev, MLX5_LAG_FILTER_PORTS);
+}
+
+void mlx5_lag_remove_devices_filter(struct mlx5_lag *ldev, u32 filter)
+{
+ mlx5_lag_rescan_devices_locked_filter(ldev, false, filter);
}
void mlx5_lag_remove_devices(struct mlx5_lag *ldev)
{
- mlx5_lag_rescan_devices_locked(ldev, false);
+ mlx5_lag_remove_devices_filter(ldev, MLX5_LAG_FILTER_PORTS);
}
static int mlx5_lag_reload_ib_reps_unlocked(struct mlx5_lag *ldev, u32 flags,
- bool cont_on_fail)
+ u32 filter, bool cont_on_fail)
{
struct lag_func *pf;
int ret;
int i;
- mlx5_ldev_for_each(i, 0, ldev) {
+ mlx5_lag_for_each(i, 0, ldev, filter) {
pf = mlx5_lag_pf(ldev, i);
if (!(pf->dev->priv.flags & flags)) {
struct mlx5_eswitch *esw;
@@ -1179,7 +1241,7 @@ static int mlx5_lag_reload_ib_reps_unlocked(struct mlx5_lag *ldev, u32 flags,
}
static int mlx5_lag_reload_ib_reps(struct mlx5_lag *ldev, u32 flags,
- bool cont_on_fail)
+ u32 filter, bool cont_on_fail)
{
int ret;
@@ -1189,21 +1251,18 @@ static int mlx5_lag_reload_ib_reps(struct mlx5_lag *ldev, u32 flags,
* load/unload callbacks can re-enter LAG netdev add/remove and take
* ldev->lock. Keep the ordering reps_lock -> ldev->lock.
*/
- mlx5_lag_drop_lock_for_reps(ldev);
- ret = mlx5_lag_reload_ib_reps_unlocked(ldev, flags, cont_on_fail);
+ mlx5_lag_drop_lock_for_reps(ldev, filter);
+ ret = mlx5_lag_reload_ib_reps_unlocked(ldev, flags, filter,
+ cont_on_fail);
mlx5_lag_retake_lock_after_reps(ldev);
return ret;
}
int mlx5_lag_reload_ib_reps_from_locked(struct mlx5_lag *ldev, u32 flags,
- bool cont_on_fail)
+ u32 filter, bool cont_on_fail)
{
- int ret;
-
- ret = mlx5_lag_reload_ib_reps(ldev, flags, cont_on_fail);
-
- return ret;
+ return mlx5_lag_reload_ib_reps(ldev, flags, filter, cont_on_fail);
}
void mlx5_disable_lag(struct mlx5_lag *ldev)
@@ -1218,12 +1277,15 @@ void mlx5_disable_lag(struct mlx5_lag *ldev)
if (idx < 0)
return;
+ if (shared_fdb) {
+ mlx5_lag_shared_fdb_destroy(ldev, 0);
+ return;
+ }
+
dev0 = mlx5_lag_pf(ldev, idx)->dev;
roce_lag = __mlx5_lag_is_roce(ldev);
- if (shared_fdb) {
- mlx5_lag_remove_devices(ldev);
- } else if (roce_lag) {
+ if (roce_lag) {
mlx5_lag_rescan_dev_locked(ldev, dev0, false);
mlx5_ldev_for_each(i, 0, ldev) {
if (i == idx)
@@ -1236,49 +1298,8 @@ void mlx5_disable_lag(struct mlx5_lag *ldev)
if (err)
return;
- if (shared_fdb || roce_lag)
+ if (roce_lag)
mlx5_lag_add_devices(ldev);
-
- if (shared_fdb)
- mlx5_lag_reload_ib_reps_from_locked(ldev,
- MLX5_PRIV_FLAGS_DISABLE_ALL_ADEV,
- true);
-}
-
-bool mlx5_lag_shared_fdb_supported(struct mlx5_lag *ldev)
-{
- struct mlx5_core_dev *dev;
- bool ret = false;
- int idx;
- int i;
-
- idx = mlx5_lag_get_dev_index_by_seq(ldev, MLX5_LAG_P1);
- if (idx < 0)
- return false;
-
- mlx5_ldev_for_each(i, 0, ldev) {
- if (i == idx)
- continue;
- dev = mlx5_lag_pf(ldev, i)->dev;
- if (is_mdev_switchdev_mode(dev) &&
- mlx5_eswitch_vport_match_metadata_enabled(dev->priv.eswitch) &&
- MLX5_CAP_GEN(dev, lag_native_fdb_selection) &&
- MLX5_CAP_ESW(dev, root_ft_on_other_esw) &&
- mlx5_eswitch_get_npeers(dev->priv.eswitch) ==
- MLX5_CAP_GEN(dev, num_lag_ports) - 1)
- continue;
- return false;
- }
-
- dev = mlx5_lag_pf(ldev, idx)->dev;
- if (is_mdev_switchdev_mode(dev) &&
- mlx5_eswitch_vport_match_metadata_enabled(dev->priv.eswitch) &&
- mlx5_esw_offloads_devcom_is_ready(dev->priv.eswitch) &&
- MLX5_CAP_ESW(dev, esw_shared_ingress_acl) &&
- mlx5_eswitch_get_npeers(dev->priv.eswitch) == MLX5_CAP_GEN(dev, num_lag_ports) - 1)
- ret = true;
-
- return ret;
}
static bool mlx5_lag_is_roce_lag(struct mlx5_lag *ldev)
@@ -1493,47 +1514,38 @@ static void mlx5_do_bond(struct mlx5_lag *ldev)
roce_lag = mlx5_lag_is_roce_lag(ldev);
- if (shared_fdb || roce_lag)
- mlx5_lag_remove_devices(ldev);
-
- err = mlx5_activate_lag(ldev, &tracker,
- roce_lag ? MLX5_LAG_MODE_ROCE :
- MLX5_LAG_MODE_SRIOV,
- shared_fdb);
- if (err) {
- if (shared_fdb || roce_lag)
- mlx5_lag_add_devices(ldev);
- if (shared_fdb)
- mlx5_lag_reload_ib_reps_from_locked(ldev, 0,
- true);
-
- return;
- }
+ if (shared_fdb) {
+ err = mlx5_lag_shared_fdb_create(ldev, &tracker,
+ MLX5_LAG_MODE_SRIOV,
+ 0);
+ if (err)
+ return;
+ } else {
+ if (roce_lag)
+ mlx5_lag_remove_devices(ldev);
- if (roce_lag) {
- struct mlx5_core_dev *dev;
-
- mlx5_lag_rescan_dev_locked(ldev, dev0, true);
- mlx5_ldev_for_each(i, 0, ldev) {
- if (i == idx)
- continue;
- dev = mlx5_lag_pf(ldev, i)->dev;
- if (mlx5_get_roce_state(dev))
- mlx5_nic_vport_enable_roce(dev);
- }
- } else if (shared_fdb) {
- mlx5_lag_rescan_dev_locked(ldev, dev0, true);
- err = mlx5_lag_reload_ib_reps_from_locked(ldev, 0,
- false);
+ err = mlx5_activate_lag(ldev, &tracker,
+ roce_lag ? MLX5_LAG_MODE_ROCE :
+ MLX5_LAG_MODE_SRIOV,
+ false);
if (err) {
- mlx5_lag_rescan_dev_locked(ldev, dev0, false);
- mlx5_deactivate_lag(ldev);
- mlx5_lag_add_devices(ldev);
- mlx5_lag_reload_ib_reps_from_locked(ldev, 0,
- true);
- mlx5_core_err(dev0, "Failed to enable lag\n");
+ if (roce_lag)
+ mlx5_lag_add_devices(ldev);
return;
}
+
+ if (roce_lag) {
+ struct mlx5_core_dev *dev;
+
+ mlx5_lag_rescan_dev_locked(ldev, dev0, true);
+ mlx5_ldev_for_each(i, 0, ldev) {
+ if (i == idx)
+ continue;
+ dev = mlx5_lag_pf(ldev, i)->dev;
+ if (mlx5_get_roce_state(dev))
+ mlx5_nic_vport_enable_roce(dev);
+ }
+ }
}
if (tracker.tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP) {
ndev = mlx5_lag_active_backup_get_netdev(dev0);
@@ -1545,7 +1557,8 @@ static void mlx5_do_bond(struct mlx5_lag *ldev)
ndev);
dev_put(ndev);
}
- mlx5_lag_set_vports_agg_speed(ldev);
+ if (!shared_fdb)
+ mlx5_lag_set_vports_agg_speed(ldev);
} else if (mlx5_lag_should_modify_lag(ldev, do_bond)) {
mlx5_modify_lag(ldev, &tracker);
mlx5_lag_set_vports_agg_speed(ldev);
@@ -1566,7 +1579,7 @@ struct mlx5_devcom_comp_dev *mlx5_lag_get_devcom_comp(struct mlx5_lag *ldev)
int i;
mutex_lock(&ldev->lock);
- i = mlx5_get_next_ldev_func(ldev, 0);
+ i = mlx5_get_next_lag_func(ldev, 0, MLX5_LAG_FILTER_PORTS);
if (i < MLX5_MAX_PORTS) {
pf = mlx5_lag_pf(ldev, i);
devcom = pf->dev->priv.hca_devcom_comp;
@@ -2049,8 +2062,9 @@ static void mlx5_ldev_remove_netdev(struct mlx5_lag *ldev,
spin_unlock_irqrestore(&lag_lock, flags);
}
-static int mlx5_ldev_add_mdev(struct mlx5_lag *ldev,
- struct mlx5_core_dev *dev)
+int mlx5_ldev_add_mdev(struct mlx5_lag *ldev,
+ struct mlx5_core_dev *dev,
+ u32 group_id)
{
struct lag_func *pf;
u32 idx;
@@ -2069,8 +2083,14 @@ static int mlx5_ldev_add_mdev(struct mlx5_lag *ldev,
pf->idx = idx;
pf->dev = dev;
+ pf->group_id = group_id;
dev->priv.lag = ldev;
+ if (group_id)
+ return 0;
+
+ xa_set_mark(&ldev->pfs, idx, MLX5_LAG_XA_MARK_PORT);
+
MLX5_NB_INIT(&pf->port_change_nb,
mlx5_lag_mpesw_port_change_event, PORT_CHANGE);
mlx5_eq_notifier_register(dev, &pf->port_change_nb);
@@ -2078,13 +2098,13 @@ static int mlx5_ldev_add_mdev(struct mlx5_lag *ldev,
return 0;
}
-static void mlx5_ldev_remove_mdev(struct mlx5_lag *ldev,
- struct mlx5_core_dev *dev)
+void mlx5_ldev_remove_mdev(struct mlx5_lag *ldev,
+ struct mlx5_core_dev *dev)
{
struct lag_func *pf;
int i;
- mlx5_ldev_for_each(i, 0, ldev) {
+ mlx5_lag_for_each(i, 0, ldev, MLX5_LAG_FILTER_ALL) {
pf = mlx5_lag_pf(ldev, i);
if (pf->dev == dev)
break;
@@ -2119,7 +2139,7 @@ static int __mlx5_lag_dev_add_mdev(struct mlx5_core_dev *dev)
mlx5_core_err(dev, "Failed to alloc lag dev\n");
return 0;
}
- err = mlx5_ldev_add_mdev(ldev, dev);
+ err = mlx5_ldev_add_mdev(ldev, dev, 0);
if (err) {
mlx5_core_err(dev, "Failed to add mdev to lag dev\n");
mlx5_ldev_put(ldev);
@@ -2134,7 +2154,7 @@ static int __mlx5_lag_dev_add_mdev(struct mlx5_core_dev *dev)
return -EAGAIN;
}
mlx5_ldev_get(ldev);
- err = mlx5_ldev_add_mdev(ldev, dev);
+ err = mlx5_ldev_add_mdev(ldev, dev, 0);
if (err) {
mlx5_ldev_put(ldev);
mutex_unlock(&ldev->lock);
@@ -2271,27 +2291,47 @@ void mlx5_lag_add_netdev(struct mlx5_core_dev *dev,
mlx5_queue_bond_work(ldev, 0);
}
-int mlx5_get_pre_ldev_func(struct mlx5_lag *ldev, int start_idx, int end_idx)
+int mlx5_get_pre_lag_func(struct mlx5_lag *ldev, int start_idx, int end_idx,
+ u32 filter)
{
struct lag_func *pf;
int i;
for (i = start_idx; i >= end_idx; i--) {
pf = xa_load(&ldev->pfs, i);
- if (pf && pf->dev)
+ if (!pf || !pf->dev)
+ continue;
+ if (filter == MLX5_LAG_FILTER_PORTS) {
+ if (xa_get_mark(&ldev->pfs, i, MLX5_LAG_XA_MARK_PORT))
+ return i;
+ } else if (filter == MLX5_LAG_FILTER_ALL ||
+ filter == pf->group_id) {
return i;
+ }
}
return -1;
}
-int mlx5_get_next_ldev_func(struct mlx5_lag *ldev, int start_idx)
+int mlx5_get_next_lag_func(struct mlx5_lag *ldev, int start_idx, u32 filter)
{
struct lag_func *pf;
unsigned long idx;
- xa_for_each_start(&ldev->pfs, idx, pf, start_idx)
- if (pf->dev)
+ if (filter == MLX5_LAG_FILTER_PORTS) {
+ xa_for_each_marked_start(&ldev->pfs, idx, pf,
+ MLX5_LAG_XA_MARK_PORT, start_idx)
+ if (pf->dev)
+ return idx;
+ return MLX5_MAX_PORTS;
+ }
+
+ xa_for_each_start(&ldev->pfs, idx, pf, start_idx) {
+ if (!pf->dev)
+ continue;
+ if (filter == MLX5_LAG_FILTER_ALL ||
+ filter == pf->group_id)
return idx;
+ }
return MLX5_MAX_PORTS;
}
@@ -2318,7 +2358,8 @@ bool mlx5_lag_is_active(struct mlx5_core_dev *dev)
spin_lock_irqsave(&lag_lock, flags);
ldev = mlx5_lag_dev(dev);
- res = ldev && __mlx5_lag_is_active(ldev);
+ res = ldev && (__mlx5_lag_is_active(ldev) ||
+ __mlx5_lag_is_sd_active(ldev, dev));
spin_unlock_irqrestore(&lag_lock, flags);
return res;
@@ -2351,10 +2392,17 @@ bool mlx5_lag_is_master(struct mlx5_core_dev *dev)
spin_lock_irqsave(&lag_lock, flags);
ldev = mlx5_lag_dev(dev);
- idx = mlx5_lag_get_dev_index_by_seq(ldev, MLX5_LAG_P1);
- if (ldev && __mlx5_lag_is_active(ldev) && idx >= 0) {
- pf = mlx5_lag_pf(ldev, idx);
- res = pf && dev == pf->dev;
+ if (ldev) {
+ u32 filter;
+
+ filter = mlx5_lag_get_filter(ldev, dev);
+ idx = mlx5_lag_get_dev_index_by_seq_filter(ldev, MLX5_LAG_P1,
+ filter);
+ if ((__mlx5_lag_is_active(ldev) ||
+ __mlx5_lag_is_sd_active(ldev, dev)) && idx >= 0) {
+ pf = mlx5_lag_pf(ldev, idx);
+ res = pf && dev == pf->dev;
+ }
}
spin_unlock_irqrestore(&lag_lock, flags);
@@ -2377,7 +2425,7 @@ bool mlx5_lag_is_sriov(struct mlx5_core_dev *dev)
}
EXPORT_SYMBOL(mlx5_lag_is_sriov);
-bool mlx5_lag_is_shared_fdb(struct mlx5_core_dev *dev)
+bool mlx5_lag_is_sd(struct mlx5_core_dev *dev)
{
struct mlx5_lag *ldev;
unsigned long flags;
@@ -2385,7 +2433,26 @@ bool mlx5_lag_is_shared_fdb(struct mlx5_core_dev *dev)
spin_lock_irqsave(&lag_lock, flags);
ldev = mlx5_lag_dev(dev);
- res = ldev && test_bit(MLX5_LAG_MODE_FLAG_SHARED_FDB, &ldev->mode_flags);
+ res = ldev && __mlx5_lag_is_sd(ldev, dev);
+ spin_unlock_irqrestore(&lag_lock, flags);
+
+ return res;
+}
+
+bool mlx5_lag_is_shared_fdb(struct mlx5_core_dev *dev)
+{
+ struct mlx5_lag *ldev;
+ unsigned long flags;
+ bool res = false;
+
+ spin_lock_irqsave(&lag_lock, flags);
+ ldev = mlx5_lag_dev(dev);
+ if (ldev) {
+ res = test_bit(MLX5_LAG_MODE_FLAG_SHARED_FDB,
+ &ldev->mode_flags);
+ if (__mlx5_lag_is_sd(ldev, dev) && !__mlx5_lag_is_active(ldev))
+ res = __mlx5_lag_is_sd_active(ldev, dev);
+ }
spin_unlock_irqrestore(&lag_lock, flags);
return res;
@@ -2486,7 +2553,7 @@ struct mlx5_core_dev *mlx5_lag_get_next_peer_mdev(struct mlx5_core_dev *dev, int
if (*i == MLX5_MAX_PORTS)
goto unlock;
- mlx5_ldev_for_each(idx, *i, ldev) {
+ mlx5_lag_for_each(idx, *i, ldev, mlx5_lag_get_filter(ldev, dev)) {
pf = mlx5_lag_pf(ldev, idx);
if (pf->dev != dev)
break;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h
index 6afe7707d076..0296f752bb4c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h
@@ -15,6 +15,13 @@
* Note: XA_MARK_0 is reserved by XA_FLAGS_ALLOC for free-slot tracking.
*/
#define MLX5_LAG_XA_MARK_MASTER XA_MARK_1
+/* XArray mark for port-level entries (excludes SD secondaries) */
+#define MLX5_LAG_XA_MARK_PORT XA_MARK_2
+
+/* Like xa_for_each_marked but starting from a given index */
+#define xa_for_each_marked_start(xa, index, entry, filter, start) \
+ for (index = start, entry = xa_find(xa, &index, ULONG_MAX, filter); \
+ entry; entry = xa_find_after(xa, &index, ULONG_MAX, filter))
#include "mlx5_core.h"
#include "mp.h"
@@ -50,6 +57,8 @@ struct lag_func {
bool has_drop;
unsigned int idx; /* xarray index assigned by LAG */
struct mlx5_nb port_change_nb;
+ u32 group_id; /* SD group ID, 0 = not SD */
+ bool sd_fdb_active; /* set on all SD group members */
};
/* Used for collection of netdev event info. */
@@ -125,6 +134,28 @@ mlx5_lag_pf_by_dev_idx(struct mlx5_lag *ldev, int dev_idx)
return NULL;
}
+/* Find lag_func by mlx5_core_dev pointer */
+static inline struct lag_func *
+mlx5_lag_pf_by_dev(struct mlx5_lag *ldev, struct mlx5_core_dev *dev)
+{
+ struct lag_func *pf;
+ unsigned long idx;
+
+ xa_for_each(&ldev->pfs, idx, pf) {
+ if (pf->dev == dev)
+ return pf;
+ }
+ return NULL;
+}
+
+static inline bool
+__mlx5_lag_is_sd(struct mlx5_lag *ldev, struct mlx5_core_dev *dev)
+{
+ struct lag_func *pf = mlx5_lag_pf_by_dev(ldev, dev);
+
+ return pf && pf->group_id != 0;
+}
+
static inline bool
__mlx5_lag_is_active(struct mlx5_lag *ldev)
{
@@ -137,8 +168,41 @@ mlx5_lag_is_ready(struct mlx5_lag *ldev)
return test_bit(MLX5_LAG_FLAG_NDEVS_READY, &ldev->state_flags);
}
+#ifdef CONFIG_MLX5_ESWITCH
+int mlx5_lag_shared_fdb_create(struct mlx5_lag *ldev,
+ struct lag_tracker *tracker,
+ enum mlx5_lag_mode mode,
+ u32 group_id);
+void mlx5_lag_shared_fdb_destroy(struct mlx5_lag *ldev, u32 group_id);
+int mlx5_lag_create_single_fdb(struct mlx5_lag *ldev);
+void mlx5_lag_destroy_single_fdb(struct mlx5_lag *ldev);
bool mlx5_lag_shared_fdb_supported(struct mlx5_lag *ldev);
+bool mlx5_lag_shared_fdb_supported_filter(struct mlx5_lag *ldev, u32 filter);
+#else
+static inline int mlx5_lag_shared_fdb_create(struct mlx5_lag *ldev,
+ struct lag_tracker *tracker,
+ enum mlx5_lag_mode mode,
+ u32 group_id)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline void mlx5_lag_shared_fdb_destroy(struct mlx5_lag *ldev,
+ u32 group_id) {}
+
+static inline int mlx5_lag_create_single_fdb(struct mlx5_lag *ldev)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline void mlx5_lag_destroy_single_fdb(struct mlx5_lag *ldev) {}
+static inline bool mlx5_lag_shared_fdb_supported(struct mlx5_lag *ldev)
+{
+ return false;
+}
+#endif
bool mlx5_lag_check_prereq(struct mlx5_lag *ldev);
+bool mlx5_lag_is_sd(struct mlx5_core_dev *dev);
int mlx5_lag_demux_init(struct mlx5_core_dev *dev,
struct mlx5_flow_table_attr *ft_attr);
void mlx5_lag_demux_cleanup(struct mlx5_core_dev *dev);
@@ -162,11 +226,13 @@ void mlx5_ldev_add_debugfs(struct mlx5_core_dev *dev);
void mlx5_ldev_remove_debugfs(struct dentry *dbg);
void mlx5_disable_lag(struct mlx5_lag *ldev);
void mlx5_lag_remove_devices(struct mlx5_lag *ldev);
+void mlx5_lag_remove_devices_filter(struct mlx5_lag *ldev, u32 filter);
int mlx5_deactivate_lag(struct mlx5_lag *ldev);
void mlx5_lag_add_devices(struct mlx5_lag *ldev);
void mlx5_lag_rescan_dev_locked(struct mlx5_lag *ldev,
struct mlx5_core_dev *dev,
bool enable);
+void mlx5_lag_add_devices_filter(struct mlx5_lag *ldev, u32 filter);
struct mlx5_devcom_comp_dev *mlx5_lag_get_devcom_comp(struct mlx5_lag *ldev);
#ifdef CONFIG_MLX5_ESWITCH
@@ -188,20 +254,40 @@ static inline bool mlx5_lag_is_supported(struct mlx5_core_dev *dev)
return true;
}
-#define mlx5_ldev_for_each(i, start_index, ldev) \
- for (int tmp = start_index; tmp = mlx5_get_next_ldev_func(ldev, tmp), \
+/* Iterator filter constants for mlx5_lag_for_each() */
+#define MLX5_LAG_FILTER_PORTS 0 /* iterate ports only (XA_MARK_PORT) */
+#define MLX5_LAG_FILTER_ALL U32_MAX /* iterate ALL devices */
+/* any other value = iterate devices with that specific group_id */
+
+#define mlx5_lag_for_each(i, start_index, ldev, filter) \
+ for (int tmp = start_index; \
+ tmp = mlx5_get_next_lag_func(ldev, tmp, filter), \
i = tmp, tmp < MLX5_MAX_PORTS; tmp++)
-#define mlx5_ldev_for_each_reverse(i, start_index, end_index, ldev) \
+#define mlx5_lag_for_each_reverse(i, start_index, end_index, ldev, filter) \
for (int tmp = start_index, tmp1 = end_index; \
- tmp = mlx5_get_pre_ldev_func(ldev, tmp, tmp1), \
+ tmp = mlx5_get_pre_lag_func(ldev, tmp, tmp1, filter), \
i = tmp, tmp >= tmp1; tmp--)
-int mlx5_get_pre_ldev_func(struct mlx5_lag *ldev, int start_idx, int end_idx);
-int mlx5_get_next_ldev_func(struct mlx5_lag *ldev, int start_idx);
+/* Convenience wrappers - keeps existing behavior */
+#define mlx5_ldev_for_each(i, start_index, ldev) \
+ mlx5_lag_for_each(i, start_index, ldev, MLX5_LAG_FILTER_PORTS)
+
+#define mlx5_ldev_for_each_reverse(i, start_index, end_index, ldev) \
+ mlx5_lag_for_each_reverse(i, start_index, end_index, ldev, \
+ MLX5_LAG_FILTER_PORTS)
+
+int mlx5_get_pre_lag_func(struct mlx5_lag *ldev, int start_idx, int end_idx,
+ u32 filter);
+int mlx5_get_next_lag_func(struct mlx5_lag *ldev, int start_idx, u32 filter);
int mlx5_lag_get_dev_index_by_seq(struct mlx5_lag *ldev, int seq);
+int mlx5_lag_get_dev_index_by_seq_filter(struct mlx5_lag *ldev, int seq,
+ u32 filter);
int mlx5_lag_num_devs(struct mlx5_lag *ldev);
int mlx5_lag_num_netdevs(struct mlx5_lag *ldev);
int mlx5_lag_reload_ib_reps_from_locked(struct mlx5_lag *ldev, u32 flags,
- bool cont_on_fail);
+ u32 filter, bool cont_on_fail);
+int mlx5_ldev_add_mdev(struct mlx5_lag *ldev, struct mlx5_core_dev *dev,
+ u32 group_id);
+void mlx5_ldev_remove_mdev(struct mlx5_lag *ldev, struct mlx5_core_dev *dev);
#endif /* __MLX5_LAG_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/mp.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/mp.c
index f42e051fa7e7..65c76bd748c6 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lag/mp.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/mp.c
@@ -26,6 +26,10 @@ static bool mlx5_lag_multipath_check_prereq(struct mlx5_lag *ldev)
if (__mlx5_lag_is_active(ldev) && !__mlx5_lag_is_multipath(ldev))
return false;
+ if (__mlx5_lag_is_sd(ldev, mlx5_lag_pf(ldev, idx0)->dev) ||
+ __mlx5_lag_is_sd(ldev, mlx5_lag_pf(ldev, idx1)->dev))
+ return false;
+
if (ldev->ports > MLX5_LAG_MULTIPATH_OFFLOADS_SUPPORTED_PORTS)
return false;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.c
index 8a349f8fd823..2cb44084e239 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.c
@@ -85,45 +85,29 @@ static int mlx5_lag_enable_mpesw(struct mlx5_lag *ldev)
!MLX5_CAP_PORT_SELECTION(dev0, port_select_flow_table) ||
!MLX5_CAP_GEN(dev0, create_lag_when_not_master_up) ||
!mlx5_lag_check_prereq(ldev) ||
- !mlx5_lag_shared_fdb_supported(ldev))
+ !mlx5_lag_shared_fdb_supported_filter(ldev, MLX5_LAG_FILTER_ALL))
return -EOPNOTSUPP;
err = mlx5_mpesw_metadata_set(ldev);
if (err)
return err;
- mlx5_lag_remove_devices(ldev);
-
- err = mlx5_activate_lag(ldev, NULL, MLX5_LAG_MODE_MPESW, true);
+ err = mlx5_lag_shared_fdb_create(ldev, NULL, MLX5_LAG_MODE_MPESW,
+ MLX5_LAG_FILTER_ALL);
if (err) {
mlx5_core_warn(dev0, "Failed to create LAG in MPESW mode (%d)\n", err);
- goto err_add_devices;
+ mlx5_mpesw_metadata_cleanup(ldev);
+ return err;
}
- mlx5_lag_rescan_dev_locked(ldev, dev0, true);
- err = mlx5_lag_reload_ib_reps_from_locked(ldev, 0, false);
- if (err)
- goto err_rescan_drivers;
-
- mlx5_lag_set_vports_agg_speed(ldev);
-
return 0;
-
-err_rescan_drivers:
- mlx5_lag_rescan_dev_locked(ldev, dev0, false);
- mlx5_deactivate_lag(ldev);
-err_add_devices:
- mlx5_lag_add_devices(ldev);
- mlx5_lag_reload_ib_reps_from_locked(ldev, 0, true);
- mlx5_mpesw_metadata_cleanup(ldev);
- return err;
}
void mlx5_lag_disable_mpesw(struct mlx5_lag *ldev)
{
if (ldev->mode == MLX5_LAG_MODE_MPESW) {
mlx5_mpesw_metadata_cleanup(ldev);
- mlx5_disable_lag(ldev);
+ mlx5_lag_shared_fdb_destroy(ldev, MLX5_LAG_FILTER_ALL);
}
}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/shared_fdb.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/shared_fdb.c
new file mode 100644
index 000000000000..1371e14c4c13
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/shared_fdb.c
@@ -0,0 +1,235 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/* Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */
+
+#include <linux/netdevice.h>
+#include <linux/mlx5/driver.h>
+#include <linux/mlx5/eswitch.h>
+#include "mlx5_core.h"
+#include "lag.h"
+#include "eswitch.h"
+
+bool mlx5_lag_shared_fdb_supported_filter(struct mlx5_lag *ldev, u32 filter)
+{
+ int idx = mlx5_lag_get_dev_index_by_seq_filter(ldev, MLX5_LAG_P1,
+ filter);
+ struct mlx5_core_dev *dev0, *dev;
+ bool ret = false;
+ int i;
+
+ if (idx < 0)
+ return false;
+
+ dev0 = mlx5_lag_pf(ldev, idx)->dev;
+ mlx5_lag_for_each(i, 0, ldev, filter) {
+ if (i == idx)
+ continue;
+ dev = mlx5_lag_pf(ldev, i)->dev;
+ if (is_mdev_switchdev_mode(dev) &&
+ mlx5_eswitch_vport_match_metadata_enabled(dev->priv.eswitch) &&
+ MLX5_CAP_GEN(dev, lag_native_fdb_selection) &&
+ MLX5_CAP_ESW(dev, root_ft_on_other_esw) &&
+ mlx5_eswitch_is_peer(dev0->priv.eswitch, dev->priv.eswitch))
+ continue;
+ return false;
+ }
+
+ if (is_mdev_switchdev_mode(dev0) &&
+ mlx5_eswitch_vport_match_metadata_enabled(dev0->priv.eswitch) &&
+ mlx5_esw_offloads_devcom_is_ready(dev0->priv.eswitch) &&
+ MLX5_CAP_ESW(dev0, esw_shared_ingress_acl))
+ ret = true;
+
+ return ret;
+}
+
+bool mlx5_lag_shared_fdb_supported(struct mlx5_lag *ldev)
+{
+ return mlx5_lag_shared_fdb_supported_filter(ldev,
+ MLX5_LAG_FILTER_PORTS);
+}
+
+static int mlx5_lag_create_single_fdb_filter(struct mlx5_lag *ldev, u32 filter)
+{
+ int master_idx = mlx5_lag_get_dev_index_by_seq_filter(ldev, MLX5_LAG_P1,
+ filter);
+ struct mlx5_eswitch *master_esw;
+ struct mlx5_core_dev *dev0;
+ int i, j;
+ int err;
+
+ if (master_idx < 0)
+ return -EINVAL;
+
+ dev0 = mlx5_lag_pf(ldev, master_idx)->dev;
+ master_esw = dev0->priv.eswitch;
+ mlx5_lag_for_each(i, 0, ldev, filter) {
+ struct mlx5_eswitch *slave_esw;
+
+ if (i == master_idx)
+ continue;
+
+ slave_esw = mlx5_lag_pf(ldev, i)->dev->priv.eswitch;
+
+ err = mlx5_eswitch_offloads_single_fdb_add_one(master_esw,
+ slave_esw,
+ ldev->ports);
+ if (err)
+ goto err;
+ }
+ return 0;
+err:
+ mlx5_lag_for_each_reverse(j, i, 0, ldev, filter) {
+ struct mlx5_eswitch *slave_esw;
+
+ if (j == master_idx)
+ continue;
+ slave_esw = mlx5_lag_pf(ldev, j)->dev->priv.eswitch;
+ mlx5_eswitch_offloads_single_fdb_del_one(master_esw, slave_esw);
+ }
+ return err;
+}
+
+static void mlx5_lag_destroy_single_fdb_filter(struct mlx5_lag *ldev,
+ u32 filter)
+{
+ int master_idx = mlx5_lag_get_dev_index_by_seq_filter(ldev, MLX5_LAG_P1,
+ filter);
+ struct mlx5_eswitch *master_esw;
+ struct mlx5_eswitch *peer_esw;
+ int i;
+
+ if (master_idx < 0)
+ return;
+
+ master_esw = mlx5_lag_pf(ldev, master_idx)->dev->priv.eswitch;
+ mlx5_lag_for_each(i, 0, ldev, filter) {
+ if (i == master_idx)
+ continue;
+
+ peer_esw = mlx5_lag_pf(ldev, i)->dev->priv.eswitch;
+ mlx5_eswitch_offloads_single_fdb_del_one(master_esw, peer_esw);
+ }
+}
+
+int mlx5_lag_create_single_fdb(struct mlx5_lag *ldev)
+{
+ return mlx5_lag_create_single_fdb_filter(ldev, MLX5_LAG_FILTER_ALL);
+}
+
+void mlx5_lag_destroy_single_fdb(struct mlx5_lag *ldev)
+{
+ mlx5_lag_destroy_single_fdb_filter(ldev, MLX5_LAG_FILTER_ALL);
+}
+
+/**
+ * mlx5_lag_shared_fdb_create - Create shared FDB LAG
+ * @ldev: LAG device
+ * @tracker: LAG tracker (NULL for SD)
+ * @mode: LAG mode (unused for SD)
+ * @group_id: SD group ID; 0 (MLX5_LAG_FILTER_PORTS) for ports LAG;
+ * MLX5_LAG_FILTER_ALL for all-device (mpesw) LAG
+ *
+ * When group_id is 0 (MLX5_LAG_FILTER_PORTS) or MLX5_LAG_FILTER_ALL,
+ * activates a FW LAG with shared FDB.
+ * When group_id is a specific SD group ID, creates a software-only shared
+ * FDB scoped to that group (no FW LAG commands).
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int mlx5_lag_shared_fdb_create(struct mlx5_lag *ldev,
+ struct lag_tracker *tracker,
+ enum mlx5_lag_mode mode,
+ u32 group_id)
+{
+ u32 filter = group_id ? group_id : MLX5_LAG_FILTER_PORTS;
+ int idx = mlx5_lag_get_dev_index_by_seq_filter(ldev, MLX5_LAG_P1,
+ filter);
+ struct mlx5_core_dev *dev0;
+ struct lag_func *pf;
+ int err;
+ int i;
+
+ if (idx < 0)
+ return -EINVAL;
+
+ dev0 = mlx5_lag_pf(ldev, idx)->dev;
+
+ mlx5_lag_remove_devices_filter(ldev, filter);
+
+ if (filter == MLX5_LAG_FILTER_PORTS || filter == MLX5_LAG_FILTER_ALL) {
+ err = mlx5_activate_lag(ldev, tracker, mode, true);
+ if (err) {
+ mlx5_core_warn(dev0,
+ "Failed to create LAG in shared FDB mode (%d)\n",
+ err);
+ goto err_add_devices;
+ }
+ } else {
+ err = mlx5_lag_create_single_fdb_filter(ldev, group_id);
+ if (err) {
+ mlx5_core_warn(dev0,
+ "Failed to create SD shared FDB (%d)\n",
+ err);
+ goto err_add_devices;
+ }
+ mlx5_lag_for_each(i, 0, ldev, filter) {
+ pf = mlx5_lag_pf(ldev, i);
+ pf->sd_fdb_active = true;
+ }
+ BLOCKING_INIT_NOTIFIER_HEAD(&dev0->priv.lag_nh);
+ }
+
+ mlx5_lag_rescan_dev_locked(ldev, dev0, true);
+ err = mlx5_lag_reload_ib_reps_from_locked(ldev, 0, filter, false);
+ if (err) {
+ mlx5_core_err(dev0, "Failed to enable lag\n");
+ goto err_rescan_drivers;
+ }
+
+ if (filter == MLX5_LAG_FILTER_PORTS || filter == MLX5_LAG_FILTER_ALL)
+ mlx5_lag_set_vports_agg_speed(ldev);
+ return 0;
+
+err_rescan_drivers:
+ mlx5_lag_rescan_dev_locked(ldev, dev0, false);
+ if (filter == MLX5_LAG_FILTER_PORTS || filter == MLX5_LAG_FILTER_ALL) {
+ mlx5_deactivate_lag(ldev);
+ } else {
+ mlx5_lag_for_each(i, 0, ldev, filter) {
+ pf = mlx5_lag_pf(ldev, i);
+ pf->sd_fdb_active = false;
+ }
+ mlx5_lag_destroy_single_fdb_filter(ldev, group_id);
+ }
+err_add_devices:
+ mlx5_lag_add_devices_filter(ldev, filter);
+ mlx5_lag_reload_ib_reps_from_locked(ldev, 0, filter, true);
+ return err;
+}
+
+void mlx5_lag_shared_fdb_destroy(struct mlx5_lag *ldev, u32 group_id)
+{
+ u32 filter = group_id ? group_id : MLX5_LAG_FILTER_PORTS;
+ struct lag_func *pf;
+ int err;
+ int i;
+
+ mlx5_lag_remove_devices_filter(ldev, filter);
+
+ if (filter == MLX5_LAG_FILTER_PORTS || filter == MLX5_LAG_FILTER_ALL) {
+ err = mlx5_deactivate_lag(ldev);
+ if (err)
+ return;
+ } else {
+ mlx5_lag_for_each(i, 0, ldev, filter) {
+ pf = mlx5_lag_pf(ldev, i);
+ pf->sd_fdb_active = false;
+ }
+ mlx5_lag_destroy_single_fdb_filter(ldev, group_id);
+ }
+
+ mlx5_lag_add_devices_filter(ldev, filter);
+ mlx5_lag_reload_ib_reps_from_locked(ldev,
+ MLX5_PRIV_FLAGS_DISABLE_ALL_ADEV,
+ filter, true);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c
index 6e199161b008..25286ecd724e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c
@@ -2,9 +2,11 @@
/* Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */
#include "lib/sd.h"
+#include "../lag/lag.h"
#include "mlx5_core.h"
#include "lib/mlx5.h"
#include "fs_cmd.h"
+#include <linux/mlx5/eswitch.h>
#include <linux/mlx5/vport.h>
#include <linux/debugfs.h>
@@ -47,16 +49,39 @@ static int mlx5_sd_get_host_buses(struct mlx5_core_dev *dev)
return sd->host_buses;
}
-static struct mlx5_core_dev *mlx5_sd_get_primary(struct mlx5_core_dev *dev)
+struct mlx5_core_dev *mlx5_sd_get_primary(struct mlx5_core_dev *dev)
{
struct mlx5_sd *sd = mlx5_get_sd(dev);
if (!sd)
return dev;
+ if (!mlx5_devcom_comp_is_ready(sd->devcom))
+ return NULL;
+
return sd->primary ? dev : sd->primary_dev;
}
+struct mlx5_devcom_comp_dev *mlx5_sd_get_devcom(struct mlx5_core_dev *dev)
+{
+ struct mlx5_sd *sd = mlx5_get_sd(dev);
+
+ if (!sd)
+ return NULL;
+
+ return sd->devcom;
+}
+
+bool mlx5_sd_is_primary(struct mlx5_core_dev *dev)
+{
+ struct mlx5_sd *sd = mlx5_get_sd(dev);
+
+ if (!sd)
+ return true;
+
+ return sd->primary;
+}
+
struct mlx5_core_dev *
mlx5_sd_primary_get_peer(struct mlx5_core_dev *primary, int idx)
{
@@ -74,11 +99,17 @@ mlx5_sd_primary_get_peer(struct mlx5_core_dev *primary, int idx)
int mlx5_sd_ch_ix_get_dev_ix(struct mlx5_core_dev *dev, int ch_ix)
{
+ if (is_mdev_switchdev_mode(dev))
+ return 0;
+
return ch_ix % mlx5_sd_get_host_buses(dev);
}
int mlx5_sd_ch_ix_get_vec_ix(struct mlx5_core_dev *dev, int ch_ix)
{
+ if (is_mdev_switchdev_mode(dev))
+ return ch_ix;
+
return ch_ix / mlx5_sd_get_host_buses(dev);
}
@@ -104,7 +135,28 @@ static bool ft_create_alias_supported(struct mlx5_core_dev *dev)
return true;
}
-static bool mlx5_sd_is_supported(struct mlx5_core_dev *dev, u8 host_buses)
+static int mlx5_query_sd(struct mlx5_core_dev *dev, bool *sdm,
+ u8 *host_buses)
+{
+ u32 out[MLX5_ST_SZ_DW(mpir_reg)];
+ int err;
+
+ err = mlx5_query_mpir_reg(dev, out);
+ if (err)
+ return err;
+
+ *sdm = MLX5_GET(mpir_reg, out, sdm);
+ *host_buses = MLX5_GET(mpir_reg, out, host_buses);
+
+ return 0;
+}
+
+static u32 mlx5_sd_group_id(struct mlx5_core_dev *dev, u8 sd_group)
+{
+ return (u32)((MLX5_CAP_GEN(dev, native_port_num) << 8) | sd_group);
+}
+
+static bool mlx5_sd_caps_supported(struct mlx5_core_dev *dev, u8 host_buses)
{
/* Honor the SW implementation limit */
if (host_buses > MLX5_SD_MAX_GROUP_SZ)
@@ -131,25 +183,32 @@ static bool mlx5_sd_is_supported(struct mlx5_core_dev *dev, u8 host_buses)
return true;
}
-static int mlx5_query_sd(struct mlx5_core_dev *dev, bool *sdm,
- u8 *host_buses)
+bool mlx5_sd_is_supported(struct mlx5_core_dev *dev)
{
- u32 out[MLX5_ST_SZ_DW(mpir_reg)];
+ u8 host_buses, sd_group;
+ bool sdm;
int err;
- err = mlx5_query_mpir_reg(dev, out);
- if (err)
- return err;
+ /* Feature is currently implemented for PFs only */
+ if (!mlx5_core_is_pf(dev))
+ return false;
- *sdm = MLX5_GET(mpir_reg, out, sdm);
- *host_buses = MLX5_GET(mpir_reg, out, host_buses);
+ /* Block on embedded CPU PFs */
+ if (mlx5_core_is_ecpf(dev))
+ return false;
- return 0;
-}
+ err = mlx5_query_nic_vport_sd_group(dev, &sd_group);
+ if (err || !sd_group)
+ return false;
-static u32 mlx5_sd_group_id(struct mlx5_core_dev *dev, u8 sd_group)
-{
- return (u32)((MLX5_CAP_GEN(dev, native_port_num) << 8) | sd_group);
+ if (!MLX5_CAP_MCAM_REG(dev, mpir))
+ return false;
+
+ err = mlx5_query_sd(dev, &sdm, &host_buses);
+ if (err || !sdm)
+ return false;
+
+ return mlx5_sd_caps_supported(dev, host_buses);
}
static int sd_init(struct mlx5_core_dev *dev)
@@ -187,8 +246,8 @@ static int sd_init(struct mlx5_core_dev *dev)
group_id = mlx5_sd_group_id(dev, sd_group);
- if (!mlx5_sd_is_supported(dev, host_buses)) {
- sd_warn(dev, "can't support requested netdev combining for group id 0x%x), skipping\n",
+ if (!mlx5_sd_caps_supported(dev, host_buses)) {
+ sd_warn(dev, "can't support requested netdev combining for group id 0x%x, skipping\n",
group_id);
return 0;
}
@@ -213,6 +272,108 @@ static void sd_cleanup(struct mlx5_core_dev *dev)
kfree(sd);
}
+static int sd_lag_state_show(struct seq_file *file, void *priv)
+{
+ struct mlx5_core_dev *dev = file->private;
+ struct mlx5_lag *ldev;
+ struct lag_func *pf;
+ bool active = false;
+ int i;
+
+ ldev = mlx5_lag_dev(dev);
+ if (!ldev)
+ return -EINVAL;
+
+ mutex_lock(&ldev->lock);
+ mlx5_ldev_for_each(i, 0, ldev) {
+ pf = mlx5_lag_pf(ldev, i);
+ if (pf->dev == dev) {
+ active = pf->sd_fdb_active;
+ break;
+ }
+ }
+ mutex_unlock(&ldev->lock);
+
+ seq_printf(file, "%s\n", active ? "active" : "disabled");
+ return 0;
+}
+
+DEFINE_SHOW_ATTRIBUTE(sd_lag_state);
+
+/* SD LAG integration is optional. If LAG isn't available on this device
+ * (e.g. lag caps are off), or registering secondaries fails, just warn
+ * and continue - SD can operate without the LAG-side bookkeeping.
+ */
+static void sd_lag_init(struct mlx5_core_dev *dev)
+{
+ struct mlx5_core_dev *primary = mlx5_sd_get_primary(dev);
+ struct mlx5_sd *sd = mlx5_get_sd(primary);
+ struct mlx5_core_dev *pos, *to;
+ struct mlx5_lag *ldev;
+ struct lag_func *pf;
+ int err;
+ int i;
+
+ ldev = mlx5_lag_dev(primary);
+ if (!ldev) {
+ sd_warn(primary, "%s: no ldev (LAG caps off?), skipping\n",
+ __func__);
+ return;
+ }
+
+ mutex_lock(&ldev->lock);
+ pf = mlx5_lag_pf_by_dev(ldev, primary);
+ if (!pf) {
+ sd_warn(primary, "%s: primary not registered in ldev, skipping\n",
+ __func__);
+ goto out;
+ }
+
+ pf->group_id = sd->group_id;
+
+ mlx5_sd_for_each_secondary(i, primary, pos) {
+ err = mlx5_ldev_add_mdev(ldev, pos, sd->group_id);
+ if (err) {
+ sd_warn(primary, "%s: failed to add secondary %s to ldev: %d\n",
+ __func__, dev_name(pos->device), err);
+ goto err;
+ }
+ }
+
+out:
+ mutex_unlock(&ldev->lock);
+ return;
+
+err:
+ to = pos;
+ mlx5_sd_for_each_secondary_to(i, primary, to, pos)
+ mlx5_ldev_remove_mdev(ldev, pos);
+ pf->group_id = 0;
+ mutex_unlock(&ldev->lock);
+}
+
+static void sd_lag_cleanup(struct mlx5_core_dev *dev)
+{
+ struct mlx5_core_dev *primary = mlx5_sd_get_primary(dev);
+ struct mlx5_core_dev *pos;
+ struct mlx5_lag *ldev;
+ struct lag_func *pf;
+ int i;
+
+ ldev = mlx5_lag_dev(primary);
+ if (!ldev)
+ return;
+
+ mutex_lock(&ldev->lock);
+ mlx5_sd_for_each_secondary(i, primary, pos)
+ mlx5_ldev_remove_mdev(ldev, pos);
+
+ pf = mlx5_lag_pf_by_dev(ldev, primary);
+ if (pf)
+ pf->group_id = 0;
+ mutex_unlock(&ldev->lock);
+}
+
static int sd_register(struct mlx5_core_dev *dev)
{
struct mlx5_devcom_comp_dev *devcom, *pos;
@@ -463,27 +624,32 @@ int mlx5_sd_init(struct mlx5_core_dev *dev)
if (err)
goto err_sd_unregister;
+ mlx5_sd_for_each_secondary(i, primary, pos) {
+ err = sd_cmd_set_secondary(pos, primary, alias_key);
+ if (err)
+ goto err_unset_secondaries;
+ }
+
+ sd_lag_init(primary);
+
primary_sd->dfs =
debugfs_create_dir("multi-pf",
mlx5_debugfs_get_dev_root(primary));
- debugfs_create_x32("group_id", 0400, primary_sd->dfs,
- &primary_sd->group_id);
- debugfs_create_file("primary", 0400, primary_sd->dfs, primary,
- &dev_fops);
-
mlx5_sd_for_each_secondary(i, primary, pos) {
char name[32];
- err = sd_cmd_set_secondary(pos, primary, alias_key);
- if (err)
- goto err_unset_secondaries;
-
snprintf(name, sizeof(name), "secondary_%d", i - 1);
debugfs_create_file(name, 0400, primary_sd->dfs, pos,
&dev_fops);
-
}
+ debugfs_create_file("sd_lag_state", 0400, primary_sd->dfs, primary,
+ &sd_lag_state_fops);
+ debugfs_create_x32("group_id", 0400, primary_sd->dfs,
+ &primary_sd->group_id);
+ debugfs_create_file("primary", 0400, primary_sd->dfs, primary,
+ &dev_fops);
+
sd_info(primary, "group id %#x, size %d, combined\n",
sd->group_id, mlx5_devcom_comp_get_size(sd->devcom));
sd_print_group(primary);
@@ -498,8 +664,6 @@ err_unset_secondaries:
mlx5_sd_for_each_secondary_to(i, primary, to, pos)
sd_cmd_unset_secondary(pos);
sd_cmd_unset_primary(primary);
- debugfs_remove_recursive(primary_sd->dfs);
- primary_sd->dfs = NULL;
err_sd_unregister:
mlx5_sd_for_each_secondary(i, primary, pos) {
struct mlx5_sd *peer_sd = mlx5_get_sd(pos);
@@ -538,11 +702,12 @@ void mlx5_sd_cleanup(struct mlx5_core_dev *dev)
if (primary_sd->state != MLX5_SD_STATE_UP)
goto out_clear_peers;
+ debugfs_remove_recursive(primary_sd->dfs);
+ primary_sd->dfs = NULL;
+ sd_lag_cleanup(primary);
mlx5_sd_for_each_secondary(i, primary, pos)
sd_cmd_unset_secondary(pos);
sd_cmd_unset_primary(primary);
- debugfs_remove_recursive(primary_sd->dfs);
- primary_sd->dfs = NULL;
sd_info(primary, "group id %#x, uncombined\n", sd->group_id);
primary_sd->state = MLX5_SD_STATE_DOWN;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.h
index 9bfd5b9756b5..011702ff6f02 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.h
@@ -4,10 +4,14 @@
#ifndef __MLX5_LIB_SD_H__
#define __MLX5_LIB_SD_H__
+#include <linux/types.h>
+
#define MLX5_SD_MAX_GROUP_SZ 2
struct mlx5_sd;
+struct mlx5_core_dev *mlx5_sd_get_primary(struct mlx5_core_dev *dev);
+bool mlx5_sd_is_primary(struct mlx5_core_dev *dev);
struct mlx5_core_dev *mlx5_sd_primary_get_peer(struct mlx5_core_dev *primary, int idx);
int mlx5_sd_ch_ix_get_dev_ix(struct mlx5_core_dev *dev, int ch_ix);
int mlx5_sd_ch_ix_get_vec_ix(struct mlx5_core_dev *dev, int ch_ix);
@@ -18,9 +22,28 @@ struct auxiliary_device *mlx5_sd_get_adev(struct mlx5_core_dev *dev,
void mlx5_sd_put_adev(struct auxiliary_device *actual_adev,
struct auxiliary_device *adev);
+#ifdef CONFIG_MLX5_CORE_EN
+bool mlx5_sd_is_supported(struct mlx5_core_dev *dev);
+#else
+static inline bool mlx5_sd_is_supported(struct mlx5_core_dev *dev)
+{
+ return false;
+}
+#endif
+
int mlx5_sd_init(struct mlx5_core_dev *dev);
void mlx5_sd_cleanup(struct mlx5_core_dev *dev);
+#ifdef CONFIG_MLX5_CORE_EN
+struct mlx5_devcom_comp_dev *mlx5_sd_get_devcom(struct mlx5_core_dev *dev);
+#else
+static inline struct mlx5_devcom_comp_dev *
+mlx5_sd_get_devcom(struct mlx5_core_dev *dev)
+{
+ return NULL;
+}
+#endif
+
#define mlx5_sd_for_each_dev_from_to(i, primary, ix_from, to, pos) \
for (i = ix_from; \
(pos = mlx5_sd_primary_get_peer(primary, i)) && pos != (to); i++)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index 0c6e4efe38c8..fd285aeb9630 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -1369,7 +1369,6 @@ err_irq_table:
static void mlx5_unload(struct mlx5_core_dev *dev)
{
- mlx5_eswitch_disable(dev->priv.eswitch);
mlx5_devlink_traps_unregister(priv_to_devlink(dev));
mlx5_vhca_event_stop(dev);
mlx5_sf_dev_table_destroy(dev);
@@ -1484,6 +1483,7 @@ void mlx5_uninit_one(struct mlx5_core_dev *dev)
mlx5_hwmon_dev_unregister(dev);
mlx5_crdump_disable(dev);
+ mlx5_eswitch_disable(dev->priv.eswitch);
mlx5_unregister_device(dev);
if (!test_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state)) {
@@ -1568,6 +1568,7 @@ void mlx5_unload_one_devl_locked(struct mlx5_core_dev *dev, bool suspend)
devl_assert_locked(priv_to_devlink(dev));
mutex_lock(&dev->intf_state_mutex);
+ mlx5_eswitch_disable(dev->priv.eswitch);
mlx5_detach_device(dev, suspend);
if (!test_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state)) {