summaryrefslogtreecommitdiff
path: root/net/core
diff options
context:
space:
mode:
authorPaolo Abeni <pabeni@redhat.com>2026-04-21 13:50:26 +0300
committerPaolo Abeni <pabeni@redhat.com>2026-04-21 13:50:26 +0300
commitedaa48dc2c071cf2ab0611ee504bbd4c544fc178 (patch)
tree20044ff1883003104534576496d228414979c4a0 /net/core
parent4c1367a2d7aad643a6f87c6931b13cc1a25e8ca7 (diff)
parentc4dde411bc366f568dbe33366253bbfea049e8ea (diff)
downloadlinux-edaa48dc2c071cf2ab0611ee504bbd4c544fc178.tar.xz
Merge branch 'net-sleepable-ndo_set_rx_mode'
Stanislav Fomichev says: ==================== net: sleepable ndo_set_rx_mode This series adds a new ndo_set_rx_mode_async callback that enables drivers to handle address list updates in a sleepable context. The current ndo_set_rx_mode is called under the netif_addr_lock spinlock with BHs disabled, which prevents drivers from sleeping. This is problematic for ops-locked drivers that need to sleep. The approach: 1. Add snapshot/reconcile infrastructure for address lists 2. Introduce dev_rx_mode_work that takes snapshots under the lock, drops the lock, calls the driver, then reconciles changes back 3. Move promiscuity handling into the scheduled work as well 4. Convert existing ops-locked drivers to ndo_set_rx_mode_async 5. Add a warning for ops-locked drivers still using ndo_set_rx_mode 6. Add a selftest exercising the team+bridge+macvlan topology that triggers the addr_lock -> ops_lock ordering issue ==================== Link: https://patch.msgid.link/20260416185712.2155425-1-sdf@fomichev.me Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Diffstat (limited to 'net/core')
-rw-r--r--net/core/dev.c67
-rw-r--r--net/core/dev.h4
-rw-r--r--net/core/dev_addr_lists.c385
-rw-r--r--net/core/dev_addr_lists_test.c387
-rw-r--r--net/core/dev_api.c3
-rw-r--r--net/core/dev_ioctl.c6
-rw-r--r--net/core/rtnetlink.c1
7 files changed, 795 insertions, 58 deletions
diff --git a/net/core/dev.c b/net/core/dev.c
index e59f6025067c..d426c1beeb76 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -9593,14 +9593,14 @@ static void dev_change_rx_flags(struct net_device *dev, int flags)
ops->ndo_change_rx_flags(dev, flags);
}
-static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
+int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
{
unsigned int old_flags = dev->flags;
unsigned int promiscuity, flags;
kuid_t uid;
kgid_t gid;
- ASSERT_RTNL();
+ netdev_ops_assert_locked(dev);
promiscuity = dev->promiscuity + inc;
if (promiscuity == 0) {
@@ -9636,16 +9636,8 @@ static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
dev_change_rx_flags(dev, IFF_PROMISC);
}
- if (notify) {
- /* The ops lock is only required to ensure consistent locking
- * for `NETDEV_CHANGE` notifiers. This function is sometimes
- * called without the lock, even for devices that are ops
- * locked, such as in `dev_uc_sync_multiple` when using
- * bonding or teaming.
- */
- netdev_ops_assert_locked(dev);
+ if (notify)
__dev_notify_flags(dev, old_flags, IFF_PROMISC, 0, NULL);
- }
return 0;
}
@@ -9667,7 +9659,7 @@ int netif_set_allmulti(struct net_device *dev, int inc, bool notify)
unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
unsigned int allmulti, flags;
- ASSERT_RTNL();
+ netdev_ops_assert_locked(dev);
allmulti = dev->allmulti + inc;
if (allmulti == 0) {
@@ -9697,46 +9689,6 @@ int netif_set_allmulti(struct net_device *dev, int inc, bool notify)
return 0;
}
-/*
- * Upload unicast and multicast address lists to device and
- * configure RX filtering. When the device doesn't support unicast
- * filtering it is put in promiscuous mode while unicast addresses
- * are present.
- */
-void __dev_set_rx_mode(struct net_device *dev)
-{
- const struct net_device_ops *ops = dev->netdev_ops;
-
- /* dev_open will call this function so the list will stay sane. */
- if (!(dev->flags&IFF_UP))
- return;
-
- if (!netif_device_present(dev))
- return;
-
- if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
- /* Unicast addresses changes may only happen under the rtnl,
- * therefore calling __dev_set_promiscuity here is safe.
- */
- if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
- __dev_set_promiscuity(dev, 1, false);
- dev->uc_promisc = true;
- } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
- __dev_set_promiscuity(dev, -1, false);
- dev->uc_promisc = false;
- }
- }
-
- if (ops->ndo_set_rx_mode)
- ops->ndo_set_rx_mode(dev);
-}
-
-void dev_set_rx_mode(struct net_device *dev)
-{
- netif_addr_lock_bh(dev);
- __dev_set_rx_mode(dev);
- netif_addr_unlock_bh(dev);
-}
/**
* netif_get_flags() - get flags reported to userspace
@@ -9775,7 +9727,7 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags,
unsigned int old_flags = dev->flags;
int ret;
- ASSERT_RTNL();
+ netdev_ops_assert_locked(dev);
/*
* Set the flags on our device.
@@ -11408,6 +11360,11 @@ int register_netdevice(struct net_device *dev)
goto err_uninit;
}
+ if (netdev_need_ops_lock(dev) &&
+ dev->netdev_ops->ndo_set_rx_mode &&
+ !dev->netdev_ops->ndo_set_rx_mode_async)
+ netdev_WARN(dev, "ops-locked drivers should use ndo_set_rx_mode_async\n");
+
ret = netdev_do_alloc_pcpu_stats(dev);
if (ret)
goto err_uninit;
@@ -12127,6 +12084,8 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
#endif
mutex_init(&dev->lock);
+ INIT_LIST_HEAD(&dev->rx_mode_node);
+ __hw_addr_init(&dev->rx_mode_addr_cache);
dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
setup(dev);
@@ -12231,6 +12190,8 @@ void free_netdev(struct net_device *dev)
kfree(rcu_dereference_protected(dev->ingress_queue, 1));
+ __hw_addr_flush(&dev->rx_mode_addr_cache);
+
/* Flush device addresses */
dev_addr_flush(dev);
diff --git a/net/core/dev.h b/net/core/dev.h
index 628bdaebf0ca..0cf24b8f5008 100644
--- a/net/core/dev.h
+++ b/net/core/dev.h
@@ -78,6 +78,7 @@ void linkwatch_run_queue(void);
void dev_addr_flush(struct net_device *dev);
int dev_addr_init(struct net_device *dev);
void dev_addr_check(struct net_device *dev);
+void __hw_addr_flush(struct netdev_hw_addr_list *list);
#if IS_ENABLED(CONFIG_NET_SHAPER)
void net_shaper_flush_netdev(struct net_device *dev);
@@ -164,6 +165,9 @@ int netif_change_carrier(struct net_device *dev, bool new_carrier);
int dev_change_carrier(struct net_device *dev, bool new_carrier);
void __dev_set_rx_mode(struct net_device *dev);
+int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify);
+bool netif_rx_mode_clean(struct net_device *dev);
+void netif_rx_mode_sync(struct net_device *dev);
void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
unsigned int gchanges, u32 portid,
diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c
index 76c91f224886..d73fcb0c6785 100644
--- a/net/core/dev_addr_lists.c
+++ b/net/core/dev_addr_lists.c
@@ -11,9 +11,18 @@
#include <linux/rtnetlink.h>
#include <linux/export.h>
#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/workqueue.h>
+#include <kunit/visibility.h>
#include "dev.h"
+static void netdev_rx_mode_work(struct work_struct *work);
+
+static LIST_HEAD(rx_mode_list);
+static DEFINE_SPINLOCK(rx_mode_lock);
+static DECLARE_WORK(rx_mode_work, netdev_rx_mode_work);
+
/*
* General list handling functions
*/
@@ -481,7 +490,7 @@ void __hw_addr_unsync_dev(struct netdev_hw_addr_list *list,
}
EXPORT_SYMBOL(__hw_addr_unsync_dev);
-static void __hw_addr_flush(struct netdev_hw_addr_list *list)
+void __hw_addr_flush(struct netdev_hw_addr_list *list)
{
struct netdev_hw_addr *ha, *tmp;
@@ -492,6 +501,7 @@ static void __hw_addr_flush(struct netdev_hw_addr_list *list)
}
list->count = 0;
}
+EXPORT_SYMBOL_IF_KUNIT(__hw_addr_flush);
void __hw_addr_init(struct netdev_hw_addr_list *list)
{
@@ -501,6 +511,133 @@ void __hw_addr_init(struct netdev_hw_addr_list *list)
}
EXPORT_SYMBOL(__hw_addr_init);
+static void __hw_addr_splice(struct netdev_hw_addr_list *dst,
+ struct netdev_hw_addr_list *src)
+{
+ src->tree = RB_ROOT;
+ list_splice_init(&src->list, &dst->list);
+ dst->count += src->count;
+ src->count = 0;
+}
+
+/**
+ * __hw_addr_list_snapshot - create a snapshot copy of an address list
+ * @snap: destination snapshot list (needs to be __hw_addr_init-initialized)
+ * @list: source address list to snapshot
+ * @addr_len: length of addresses
+ * @cache: entry cache to reuse entries from; falls back to GFP_ATOMIC
+ *
+ * Creates a copy of @list reusing entries from @cache when available.
+ * Caller must hold netif_addr_lock_bh; allocations use GFP_ATOMIC.
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+int __hw_addr_list_snapshot(struct netdev_hw_addr_list *snap,
+ const struct netdev_hw_addr_list *list,
+ int addr_len, struct netdev_hw_addr_list *cache)
+{
+ struct netdev_hw_addr *ha, *entry;
+
+ list_for_each_entry(ha, &list->list, list) {
+ if (cache->count) {
+ entry = list_first_entry(&cache->list,
+ struct netdev_hw_addr, list);
+ list_del(&entry->list);
+ cache->count--;
+ memcpy(entry->addr, ha->addr, addr_len);
+ entry->type = ha->type;
+ entry->global_use = false;
+ entry->synced = 0;
+ } else {
+ entry = __hw_addr_create(ha->addr, addr_len, ha->type,
+ false, false);
+ if (!entry) {
+ __hw_addr_flush(snap);
+ return -ENOMEM;
+ }
+ }
+ entry->sync_cnt = ha->sync_cnt;
+ entry->refcount = ha->refcount;
+
+ list_add_tail(&entry->list, &snap->list);
+ __hw_addr_insert(snap, entry, addr_len);
+ snap->count++;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_IF_KUNIT(__hw_addr_list_snapshot);
+
+/**
+ * __hw_addr_list_reconcile - sync snapshot changes back and free snapshots
+ * @real_list: the real address list to update
+ * @work: the working snapshot (modified by driver via __hw_addr_sync_dev)
+ * @ref: the reference snapshot (untouched copy of original state)
+ * @addr_len: length of addresses
+ * @cache: entry cache to return snapshot entries to for reuse
+ *
+ * Walks the reference snapshot and compares each entry against the work
+ * snapshot to compute sync_cnt deltas. Applies those deltas to @real_list.
+ * Returns snapshot entries to @cache for reuse; frees both snapshots.
+ * Caller must hold netif_addr_lock_bh.
+ */
+void __hw_addr_list_reconcile(struct netdev_hw_addr_list *real_list,
+ struct netdev_hw_addr_list *work,
+ struct netdev_hw_addr_list *ref, int addr_len,
+ struct netdev_hw_addr_list *cache)
+{
+ struct netdev_hw_addr *ref_ha, *tmp, *work_ha, *real_ha;
+ int delta;
+
+ list_for_each_entry_safe(ref_ha, tmp, &ref->list, list) {
+ work_ha = __hw_addr_lookup(work, ref_ha->addr, addr_len,
+ ref_ha->type);
+ if (work_ha)
+ delta = work_ha->sync_cnt - ref_ha->sync_cnt;
+ else
+ delta = -1;
+
+ if (delta == 0)
+ continue;
+
+ real_ha = __hw_addr_lookup(real_list, ref_ha->addr, addr_len,
+ ref_ha->type);
+ if (!real_ha) {
+ /* The real entry was concurrently removed. If the
+ * driver synced this addr to hardware (delta > 0),
+ * re-insert it as a stale entry so the next work
+ * run unsyncs it from hardware.
+ */
+ if (delta > 0) {
+ rb_erase(&ref_ha->node, &ref->tree);
+ list_del(&ref_ha->list);
+ ref->count--;
+ ref_ha->sync_cnt = delta;
+ ref_ha->refcount = delta;
+ list_add_tail_rcu(&ref_ha->list,
+ &real_list->list);
+ __hw_addr_insert(real_list, ref_ha,
+ addr_len);
+ real_list->count++;
+ }
+ continue;
+ }
+
+ real_ha->sync_cnt += delta;
+ real_ha->refcount += delta;
+ if (!real_ha->refcount) {
+ rb_erase(&real_ha->node, &real_list->tree);
+ list_del_rcu(&real_ha->list);
+ kfree_rcu(real_ha, rcu_head);
+ real_list->count--;
+ }
+ }
+
+ __hw_addr_splice(cache, work);
+ __hw_addr_splice(cache, ref);
+}
+EXPORT_SYMBOL_IF_KUNIT(__hw_addr_list_reconcile);
+
/*
* Device addresses handling functions
*/
@@ -1049,3 +1186,249 @@ void dev_mc_init(struct net_device *dev)
__hw_addr_init(&dev->mc);
}
EXPORT_SYMBOL(dev_mc_init);
+
+static int netif_addr_lists_snapshot(struct net_device *dev,
+ struct netdev_hw_addr_list *uc_snap,
+ struct netdev_hw_addr_list *mc_snap,
+ struct netdev_hw_addr_list *uc_ref,
+ struct netdev_hw_addr_list *mc_ref)
+{
+ int err;
+
+ err = __hw_addr_list_snapshot(uc_snap, &dev->uc, dev->addr_len,
+ &dev->rx_mode_addr_cache);
+ if (!err)
+ err = __hw_addr_list_snapshot(uc_ref, &dev->uc, dev->addr_len,
+ &dev->rx_mode_addr_cache);
+ if (!err)
+ err = __hw_addr_list_snapshot(mc_snap, &dev->mc,
+ dev->addr_len,
+ &dev->rx_mode_addr_cache);
+ if (!err)
+ err = __hw_addr_list_snapshot(mc_ref, &dev->mc, dev->addr_len,
+ &dev->rx_mode_addr_cache);
+
+ if (err) {
+ __hw_addr_flush(uc_snap);
+ __hw_addr_flush(uc_ref);
+ __hw_addr_flush(mc_snap);
+ }
+
+ return err;
+}
+
+static void netif_addr_lists_reconcile(struct net_device *dev,
+ struct netdev_hw_addr_list *uc_snap,
+ struct netdev_hw_addr_list *mc_snap,
+ struct netdev_hw_addr_list *uc_ref,
+ struct netdev_hw_addr_list *mc_ref)
+{
+ __hw_addr_list_reconcile(&dev->uc, uc_snap, uc_ref, dev->addr_len,
+ &dev->rx_mode_addr_cache);
+ __hw_addr_list_reconcile(&dev->mc, mc_snap, mc_ref, dev->addr_len,
+ &dev->rx_mode_addr_cache);
+}
+
+/**
+ * netif_uc_promisc_update() - evaluate whether uc_promisc should be toggled.
+ * @dev: device
+ *
+ * Must be called under netif_addr_lock_bh.
+ * Return: +1 to enter promisc, -1 to leave, 0 for no change.
+ */
+static int netif_uc_promisc_update(struct net_device *dev)
+{
+ if (dev->priv_flags & IFF_UNICAST_FLT)
+ return 0;
+
+ if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
+ dev->uc_promisc = true;
+ return 1;
+ }
+ if (netdev_uc_empty(dev) && dev->uc_promisc) {
+ dev->uc_promisc = false;
+ return -1;
+ }
+ return 0;
+}
+
+static void netif_rx_mode_run(struct net_device *dev)
+{
+ struct netdev_hw_addr_list uc_snap, mc_snap, uc_ref, mc_ref;
+ const struct net_device_ops *ops = dev->netdev_ops;
+ int promisc_inc;
+ int err;
+
+ might_sleep();
+ netdev_ops_assert_locked(dev);
+
+ __hw_addr_init(&uc_snap);
+ __hw_addr_init(&mc_snap);
+ __hw_addr_init(&uc_ref);
+ __hw_addr_init(&mc_ref);
+
+ if (!(dev->flags & IFF_UP) || !netif_device_present(dev))
+ return;
+
+ if (ops->ndo_set_rx_mode_async) {
+ netif_addr_lock_bh(dev);
+ err = netif_addr_lists_snapshot(dev, &uc_snap, &mc_snap,
+ &uc_ref, &mc_ref);
+ if (err) {
+ netdev_WARN(dev, "failed to sync uc/mc addresses\n");
+ netif_addr_unlock_bh(dev);
+ return;
+ }
+
+ promisc_inc = netif_uc_promisc_update(dev);
+ netif_addr_unlock_bh(dev);
+ } else {
+ netif_addr_lock_bh(dev);
+ promisc_inc = netif_uc_promisc_update(dev);
+ netif_addr_unlock_bh(dev);
+ }
+
+ if (promisc_inc)
+ __dev_set_promiscuity(dev, promisc_inc, false);
+
+ if (ops->ndo_set_rx_mode_async) {
+ ops->ndo_set_rx_mode_async(dev, &uc_snap, &mc_snap);
+
+ netif_addr_lock_bh(dev);
+ netif_addr_lists_reconcile(dev, &uc_snap, &mc_snap,
+ &uc_ref, &mc_ref);
+ netif_addr_unlock_bh(dev);
+ } else if (ops->ndo_set_rx_mode) {
+ netif_addr_lock_bh(dev);
+ ops->ndo_set_rx_mode(dev);
+ netif_addr_unlock_bh(dev);
+ }
+}
+
+static void netdev_rx_mode_work(struct work_struct *work)
+{
+ struct net_device *dev;
+
+ rtnl_lock();
+
+ while (true) {
+ spin_lock_bh(&rx_mode_lock);
+ if (list_empty(&rx_mode_list)) {
+ spin_unlock_bh(&rx_mode_lock);
+ break;
+ }
+ dev = list_first_entry(&rx_mode_list, struct net_device,
+ rx_mode_node);
+ list_del_init(&dev->rx_mode_node);
+ /* We must free netdev tracker under
+ * the spinlock protection.
+ */
+ netdev_tracker_free(dev, &dev->rx_mode_tracker);
+ spin_unlock_bh(&rx_mode_lock);
+
+ netdev_lock_ops(dev);
+ netif_rx_mode_run(dev);
+ netdev_unlock_ops(dev);
+ /* Use __dev_put() because netdev_tracker_free() was already
+ * called above. Must be after netdev_unlock_ops() to prevent
+ * netdev_run_todo() from freeing the device while still in use.
+ */
+ __dev_put(dev);
+ }
+
+ rtnl_unlock();
+}
+
+static void netif_rx_mode_queue(struct net_device *dev)
+{
+ spin_lock_bh(&rx_mode_lock);
+ if (list_empty(&dev->rx_mode_node)) {
+ list_add_tail(&dev->rx_mode_node, &rx_mode_list);
+ netdev_hold(dev, &dev->rx_mode_tracker, GFP_ATOMIC);
+ }
+ spin_unlock_bh(&rx_mode_lock);
+ schedule_work(&rx_mode_work);
+}
+
+/**
+ * __dev_set_rx_mode() - upload unicast and multicast address lists to device
+ * and configure RX filtering.
+ * @dev: device
+ *
+ * When the device doesn't support unicast filtering it is put in promiscuous
+ * mode while unicast addresses are present.
+ */
+void __dev_set_rx_mode(struct net_device *dev)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+ int promisc_inc;
+
+ /* dev_open will call this function so the list will stay sane. */
+ if (!(dev->flags & IFF_UP))
+ return;
+
+ if (!netif_device_present(dev))
+ return;
+
+ if (ops->ndo_set_rx_mode_async || ops->ndo_change_rx_flags ||
+ netdev_need_ops_lock(dev)) {
+ netif_rx_mode_queue(dev);
+ return;
+ }
+
+ /* Legacy path for non-ops-locked HW devices. */
+
+ promisc_inc = netif_uc_promisc_update(dev);
+ if (promisc_inc)
+ __dev_set_promiscuity(dev, promisc_inc, false);
+
+ if (ops->ndo_set_rx_mode)
+ ops->ndo_set_rx_mode(dev);
+}
+
+void dev_set_rx_mode(struct net_device *dev)
+{
+ netif_addr_lock_bh(dev);
+ __dev_set_rx_mode(dev);
+ netif_addr_unlock_bh(dev);
+}
+
+bool netif_rx_mode_clean(struct net_device *dev)
+{
+ bool clean = false;
+
+ spin_lock_bh(&rx_mode_lock);
+ if (!list_empty(&dev->rx_mode_node)) {
+ list_del_init(&dev->rx_mode_node);
+ clean = true;
+ /* We must release netdev tracker under
+ * the spinlock protection.
+ */
+ netdev_tracker_free(dev, &dev->rx_mode_tracker);
+ }
+ spin_unlock_bh(&rx_mode_lock);
+
+ return clean;
+}
+
+/**
+ * netif_rx_mode_sync() - sync rx mode inline
+ * @dev: network device
+ *
+ * Drivers implementing ndo_set_rx_mode_async() have their rx mode callback
+ * executed from a workqueue. This allows the callback to sleep, but means
+ * the hardware update is deferred and may not be visible to userspace
+ * by the time the initiating syscall returns. netif_rx_mode_sync() steals
+ * the pending workqueue update and executes it inline, preserving the
+ * atomicity of the operation as observed by userspace.
+ */
+void netif_rx_mode_sync(struct net_device *dev)
+{
+ if (netif_rx_mode_clean(dev)) {
+ netif_rx_mode_run(dev);
+ /* Use __dev_put() because netdev_tracker_free() was already
+ * called inside netif_rx_mode_clean().
+ */
+ __dev_put(dev);
+ }
+}
diff --git a/net/core/dev_addr_lists_test.c b/net/core/dev_addr_lists_test.c
index 8e1dba825e94..260e71a2399f 100644
--- a/net/core/dev_addr_lists_test.c
+++ b/net/core/dev_addr_lists_test.c
@@ -2,22 +2,31 @@
#include <kunit/test.h>
#include <linux/etherdevice.h>
+#include <linux/math64.h>
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
static const struct net_device_ops dummy_netdev_ops = {
};
+#define ADDR_A 1
+#define ADDR_B 2
+#define ADDR_C 3
+
struct dev_addr_test_priv {
u32 addr_seen;
+ u32 addr_synced;
+ u32 addr_unsynced;
};
static int dev_addr_test_sync(struct net_device *netdev, const unsigned char *a)
{
struct dev_addr_test_priv *datp = netdev_priv(netdev);
- if (a[0] < 31 && !memchr_inv(a, a[0], ETH_ALEN))
+ if (a[0] < 31 && !memchr_inv(a, a[0], ETH_ALEN)) {
datp->addr_seen |= 1 << a[0];
+ datp->addr_synced |= 1 << a[0];
+ }
return 0;
}
@@ -26,11 +35,22 @@ static int dev_addr_test_unsync(struct net_device *netdev,
{
struct dev_addr_test_priv *datp = netdev_priv(netdev);
- if (a[0] < 31 && !memchr_inv(a, a[0], ETH_ALEN))
+ if (a[0] < 31 && !memchr_inv(a, a[0], ETH_ALEN)) {
datp->addr_seen &= ~(1 << a[0]);
+ datp->addr_unsynced |= 1 << a[0];
+ }
return 0;
}
+static void dev_addr_test_reset(struct net_device *netdev)
+{
+ struct dev_addr_test_priv *datp = netdev_priv(netdev);
+
+ datp->addr_seen = 0;
+ datp->addr_synced = 0;
+ datp->addr_unsynced = 0;
+}
+
static int dev_addr_test_init(struct kunit *test)
{
struct dev_addr_test_priv *datp;
@@ -225,6 +245,363 @@ static void dev_addr_test_add_excl(struct kunit *test)
rtnl_unlock();
}
+/* Snapshot test: basic sync with no concurrent modifications.
+ * Add one address, snapshot, driver syncs it, reconcile propagates
+ * sync_cnt delta back to real list.
+ */
+static void dev_addr_test_snapshot_sync(struct kunit *test)
+{
+ struct netdev_hw_addr_list snap, ref, cache;
+ struct net_device *netdev = test->priv;
+ struct dev_addr_test_priv *datp;
+ struct netdev_hw_addr *ha;
+ u8 addr[ETH_ALEN];
+
+ datp = netdev_priv(netdev);
+
+ rtnl_lock();
+
+ memset(addr, ADDR_A, sizeof(addr));
+ KUNIT_EXPECT_EQ(test, 0, dev_uc_add(netdev, addr));
+
+ /* Snapshot: ADDR_A has sync_cnt=0, refcount=1 (new) */
+ netif_addr_lock_bh(netdev);
+ __hw_addr_init(&snap);
+ __hw_addr_init(&ref);
+ __hw_addr_init(&cache);
+ KUNIT_EXPECT_EQ(test, 0,
+ __hw_addr_list_snapshot(&snap, &netdev->uc, ETH_ALEN,
+ &cache));
+ KUNIT_EXPECT_EQ(test, 0,
+ __hw_addr_list_snapshot(&ref, &netdev->uc, ETH_ALEN,
+ &cache));
+ netif_addr_unlock_bh(netdev);
+
+ /* Driver syncs ADDR_A to hardware */
+ dev_addr_test_reset(netdev);
+ __hw_addr_sync_dev(&snap, netdev, dev_addr_test_sync,
+ dev_addr_test_unsync);
+ KUNIT_EXPECT_EQ(test, 1 << ADDR_A, datp->addr_synced);
+ KUNIT_EXPECT_EQ(test, 0, datp->addr_unsynced);
+
+ /* Reconcile: delta=+1 applied to real entry */
+ netif_addr_lock_bh(netdev);
+ __hw_addr_list_reconcile(&netdev->uc, &snap, &ref, ETH_ALEN,
+ &cache);
+ netif_addr_unlock_bh(netdev);
+
+ /* Real entry should now reflect the sync: sync_cnt=1, refcount=2 */
+ KUNIT_EXPECT_EQ(test, 1, netdev->uc.count);
+ ha = list_first_entry(&netdev->uc.list, struct netdev_hw_addr, list);
+ KUNIT_EXPECT_MEMEQ(test, ha->addr, addr, ETH_ALEN);
+ KUNIT_EXPECT_EQ(test, 1, ha->sync_cnt);
+ KUNIT_EXPECT_EQ(test, 2, ha->refcount);
+
+ /* Second work run: already synced, nothing to do */
+ dev_addr_test_reset(netdev);
+ __hw_addr_sync_dev(&netdev->uc, netdev, dev_addr_test_sync,
+ dev_addr_test_unsync);
+ KUNIT_EXPECT_EQ(test, 0, datp->addr_synced);
+ KUNIT_EXPECT_EQ(test, 0, datp->addr_unsynced);
+ KUNIT_EXPECT_EQ(test, 1, netdev->uc.count);
+
+ __hw_addr_flush(&cache);
+ rtnl_unlock();
+}
+
+/* Snapshot test: ADDR_A synced to hardware, then concurrently removed
+ * from the real list before reconcile runs. Reconcile re-inserts ADDR_A as
+ * a stale entry so the next work run unsyncs it from hardware.
+ */
+static void dev_addr_test_snapshot_remove_during_sync(struct kunit *test)
+{
+ struct netdev_hw_addr_list snap, ref, cache;
+ struct net_device *netdev = test->priv;
+ struct dev_addr_test_priv *datp;
+ struct netdev_hw_addr *ha;
+ u8 addr[ETH_ALEN];
+
+ datp = netdev_priv(netdev);
+
+ rtnl_lock();
+
+ memset(addr, ADDR_A, sizeof(addr));
+ KUNIT_EXPECT_EQ(test, 0, dev_uc_add(netdev, addr));
+
+ /* Snapshot: ADDR_A is new (sync_cnt=0, refcount=1) */
+ netif_addr_lock_bh(netdev);
+ __hw_addr_init(&snap);
+ __hw_addr_init(&ref);
+ __hw_addr_init(&cache);
+ KUNIT_EXPECT_EQ(test, 0,
+ __hw_addr_list_snapshot(&snap, &netdev->uc, ETH_ALEN,
+ &cache));
+ KUNIT_EXPECT_EQ(test, 0,
+ __hw_addr_list_snapshot(&ref, &netdev->uc, ETH_ALEN,
+ &cache));
+ netif_addr_unlock_bh(netdev);
+
+ /* Driver syncs ADDR_A to hardware */
+ dev_addr_test_reset(netdev);
+ __hw_addr_sync_dev(&snap, netdev, dev_addr_test_sync,
+ dev_addr_test_unsync);
+ KUNIT_EXPECT_EQ(test, 1 << ADDR_A, datp->addr_synced);
+ KUNIT_EXPECT_EQ(test, 0, datp->addr_unsynced);
+
+ /* Concurrent removal: user deletes ADDR_A while driver was working */
+ memset(addr, ADDR_A, sizeof(addr));
+ KUNIT_EXPECT_EQ(test, 0, dev_uc_del(netdev, addr));
+ KUNIT_EXPECT_EQ(test, 0, netdev->uc.count);
+
+ /* Reconcile: ADDR_A gone from real list but driver synced it,
+ * so it gets re-inserted as stale (sync_cnt=1, refcount=1).
+ */
+ netif_addr_lock_bh(netdev);
+ __hw_addr_list_reconcile(&netdev->uc, &snap, &ref, ETH_ALEN,
+ &cache);
+ netif_addr_unlock_bh(netdev);
+
+ KUNIT_EXPECT_EQ(test, 1, netdev->uc.count);
+ ha = list_first_entry(&netdev->uc.list, struct netdev_hw_addr, list);
+ KUNIT_EXPECT_MEMEQ(test, ha->addr, addr, ETH_ALEN);
+ KUNIT_EXPECT_EQ(test, 1, ha->sync_cnt);
+ KUNIT_EXPECT_EQ(test, 1, ha->refcount);
+
+ /* Second work run: stale entry gets unsynced from HW and removed */
+ dev_addr_test_reset(netdev);
+ __hw_addr_sync_dev(&netdev->uc, netdev, dev_addr_test_sync,
+ dev_addr_test_unsync);
+ KUNIT_EXPECT_EQ(test, 0, datp->addr_synced);
+ KUNIT_EXPECT_EQ(test, 1 << ADDR_A, datp->addr_unsynced);
+ KUNIT_EXPECT_EQ(test, 0, netdev->uc.count);
+
+ __hw_addr_flush(&cache);
+ rtnl_unlock();
+}
+
+/* Snapshot test: ADDR_A was stale (unsynced from hardware by driver),
+ * but concurrently re-added by the user. The re-add bumps refcount of
+ * the existing stale entry. Reconcile applies delta=-1, leaving ADDR_A
+ * as a fresh entry (sync_cnt=0, refcount=1) for the next work run.
+ */
+static void dev_addr_test_snapshot_readd_during_unsync(struct kunit *test)
+{
+ struct netdev_hw_addr_list snap, ref, cache;
+ struct net_device *netdev = test->priv;
+ struct dev_addr_test_priv *datp;
+ struct netdev_hw_addr *ha;
+ u8 addr[ETH_ALEN];
+
+ datp = netdev_priv(netdev);
+
+ rtnl_lock();
+
+ memset(addr, ADDR_A, sizeof(addr));
+ KUNIT_EXPECT_EQ(test, 0, dev_uc_add(netdev, addr));
+
+ /* Sync ADDR_A to hardware: sync_cnt=1, refcount=2 */
+ dev_addr_test_reset(netdev);
+ __hw_addr_sync_dev(&netdev->uc, netdev, dev_addr_test_sync,
+ dev_addr_test_unsync);
+ KUNIT_EXPECT_EQ(test, 1 << ADDR_A, datp->addr_synced);
+ KUNIT_EXPECT_EQ(test, 0, datp->addr_unsynced);
+
+ /* User removes ADDR_A: refcount=1, sync_cnt=1 -> stale */
+ KUNIT_EXPECT_EQ(test, 0, dev_uc_del(netdev, addr));
+
+ /* Snapshot: ADDR_A is stale (sync_cnt=1, refcount=1) */
+ netif_addr_lock_bh(netdev);
+ __hw_addr_init(&snap);
+ __hw_addr_init(&ref);
+ __hw_addr_init(&cache);
+ KUNIT_EXPECT_EQ(test, 0,
+ __hw_addr_list_snapshot(&snap, &netdev->uc, ETH_ALEN,
+ &cache));
+ KUNIT_EXPECT_EQ(test, 0,
+ __hw_addr_list_snapshot(&ref, &netdev->uc, ETH_ALEN,
+ &cache));
+ netif_addr_unlock_bh(netdev);
+
+ /* Driver unsyncs stale ADDR_A from hardware */
+ dev_addr_test_reset(netdev);
+ __hw_addr_sync_dev(&snap, netdev, dev_addr_test_sync,
+ dev_addr_test_unsync);
+ KUNIT_EXPECT_EQ(test, 0, datp->addr_synced);
+ KUNIT_EXPECT_EQ(test, 1 << ADDR_A, datp->addr_unsynced);
+
+ /* Concurrent: user re-adds ADDR_A. dev_uc_add finds the existing
+ * stale entry and bumps refcount from 1 -> 2. sync_cnt stays 1.
+ */
+ KUNIT_EXPECT_EQ(test, 0, dev_uc_add(netdev, addr));
+ KUNIT_EXPECT_EQ(test, 1, netdev->uc.count);
+
+ /* Reconcile: ref sync_cnt=1 matches real sync_cnt=1, delta=-1
+ * applied. Result: sync_cnt=0, refcount=1 (fresh).
+ */
+ netif_addr_lock_bh(netdev);
+ __hw_addr_list_reconcile(&netdev->uc, &snap, &ref, ETH_ALEN,
+ &cache);
+ netif_addr_unlock_bh(netdev);
+
+ /* Entry survives as fresh: needs re-sync to HW */
+ KUNIT_EXPECT_EQ(test, 1, netdev->uc.count);
+ ha = list_first_entry(&netdev->uc.list, struct netdev_hw_addr, list);
+ KUNIT_EXPECT_MEMEQ(test, ha->addr, addr, ETH_ALEN);
+ KUNIT_EXPECT_EQ(test, 0, ha->sync_cnt);
+ KUNIT_EXPECT_EQ(test, 1, ha->refcount);
+
+ /* Second work run: fresh entry gets synced to HW */
+ dev_addr_test_reset(netdev);
+ __hw_addr_sync_dev(&netdev->uc, netdev, dev_addr_test_sync,
+ dev_addr_test_unsync);
+ KUNIT_EXPECT_EQ(test, 1 << ADDR_A, datp->addr_synced);
+ KUNIT_EXPECT_EQ(test, 0, datp->addr_unsynced);
+
+ __hw_addr_flush(&cache);
+ rtnl_unlock();
+}
+
+/* Snapshot test: ADDR_A is new (synced by driver), and independent ADDR_B
+ * is concurrently removed from the real list. A's sync delta propagates
+ * normally; B's absence doesn't interfere.
+ */
+static void dev_addr_test_snapshot_add_and_remove(struct kunit *test)
+{
+ struct netdev_hw_addr_list snap, ref, cache;
+ struct net_device *netdev = test->priv;
+ struct dev_addr_test_priv *datp;
+ struct netdev_hw_addr *ha;
+ u8 addr[ETH_ALEN];
+
+ datp = netdev_priv(netdev);
+
+ rtnl_lock();
+
+ /* Add ADDR_A and ADDR_B (will be synced then removed) */
+ memset(addr, ADDR_A, sizeof(addr));
+ KUNIT_EXPECT_EQ(test, 0, dev_uc_add(netdev, addr));
+ memset(addr, ADDR_B, sizeof(addr));
+ KUNIT_EXPECT_EQ(test, 0, dev_uc_add(netdev, addr));
+
+ /* Sync both to hardware: sync_cnt=1, refcount=2 */
+ __hw_addr_sync_dev(&netdev->uc, netdev, dev_addr_test_sync,
+ dev_addr_test_unsync);
+
+ /* Add ADDR_C (new, will be synced by snapshot) */
+ memset(addr, ADDR_C, sizeof(addr));
+ KUNIT_EXPECT_EQ(test, 0, dev_uc_add(netdev, addr));
+
+ /* Snapshot: A,B synced (sync_cnt=1,refcount=2); C new (0,1) */
+ netif_addr_lock_bh(netdev);
+ __hw_addr_init(&snap);
+ __hw_addr_init(&ref);
+ __hw_addr_init(&cache);
+ KUNIT_EXPECT_EQ(test, 0,
+ __hw_addr_list_snapshot(&snap, &netdev->uc, ETH_ALEN,
+ &cache));
+ KUNIT_EXPECT_EQ(test, 0,
+ __hw_addr_list_snapshot(&ref, &netdev->uc, ETH_ALEN,
+ &cache));
+ netif_addr_unlock_bh(netdev);
+
+ /* Driver syncs snapshot: ADDR_C is new -> synced; A,B already synced */
+ dev_addr_test_reset(netdev);
+ __hw_addr_sync_dev(&snap, netdev, dev_addr_test_sync,
+ dev_addr_test_unsync);
+ KUNIT_EXPECT_EQ(test, 1 << ADDR_C, datp->addr_synced);
+ KUNIT_EXPECT_EQ(test, 0, datp->addr_unsynced);
+
+ /* Concurrent: user removes addr B while driver was working */
+ memset(addr, ADDR_B, sizeof(addr));
+ KUNIT_EXPECT_EQ(test, 0, dev_uc_del(netdev, addr));
+
+ /* Reconcile: ADDR_C's delta=+1 applied to real list.
+ * ADDR_B's delta=0 (unchanged in snapshot),
+ * so nothing to apply to ADDR_B.
+ */
+ netif_addr_lock_bh(netdev);
+ __hw_addr_list_reconcile(&netdev->uc, &snap, &ref, ETH_ALEN,
+ &cache);
+ netif_addr_unlock_bh(netdev);
+
+ /* ADDR_A: unchanged (sync_cnt=1, refcount=2)
+ * ADDR_B: refcount went from 2->1 via dev_uc_del (still present, stale)
+ * ADDR_C: sync propagated (sync_cnt=1, refcount=2)
+ */
+ KUNIT_EXPECT_EQ(test, 3, netdev->uc.count);
+ netdev_hw_addr_list_for_each(ha, &netdev->uc) {
+ u8 id = ha->addr[0];
+
+ if (!memchr_inv(ha->addr, id, ETH_ALEN)) {
+ if (id == ADDR_A) {
+ KUNIT_EXPECT_EQ(test, 1, ha->sync_cnt);
+ KUNIT_EXPECT_EQ(test, 2, ha->refcount);
+ } else if (id == ADDR_B) {
+ /* B: still present but now stale */
+ KUNIT_EXPECT_EQ(test, 1, ha->sync_cnt);
+ KUNIT_EXPECT_EQ(test, 1, ha->refcount);
+ } else if (id == ADDR_C) {
+ KUNIT_EXPECT_EQ(test, 1, ha->sync_cnt);
+ KUNIT_EXPECT_EQ(test, 2, ha->refcount);
+ }
+ }
+ }
+
+ /* Second work run: ADDR_B is stale, gets unsynced and removed */
+ dev_addr_test_reset(netdev);
+ __hw_addr_sync_dev(&netdev->uc, netdev, dev_addr_test_sync,
+ dev_addr_test_unsync);
+ KUNIT_EXPECT_EQ(test, 0, datp->addr_synced);
+ KUNIT_EXPECT_EQ(test, 1 << ADDR_B, datp->addr_unsynced);
+ KUNIT_EXPECT_EQ(test, 2, netdev->uc.count);
+
+ __hw_addr_flush(&cache);
+ rtnl_unlock();
+}
+
+static void dev_addr_test_snapshot_benchmark(struct kunit *test)
+{
+ struct net_device *netdev = test->priv;
+ struct netdev_hw_addr_list snap, cache;
+ u8 addr[ETH_ALEN];
+ s64 duration = 0;
+ ktime_t start;
+ int i, iter;
+
+ rtnl_lock();
+
+ for (i = 0; i < 1024; i++) {
+ memset(addr, 0, sizeof(addr));
+ addr[0] = (i >> 8) & 0xff;
+ addr[1] = i & 0xff;
+ KUNIT_EXPECT_EQ(test, 0, dev_uc_add(netdev, addr));
+ }
+
+ __hw_addr_init(&cache);
+
+ for (iter = 0; iter < 1000; iter++) {
+ netif_addr_lock_bh(netdev);
+ __hw_addr_init(&snap);
+
+ start = ktime_get();
+ KUNIT_EXPECT_EQ(test, 0,
+ __hw_addr_list_snapshot(&snap, &netdev->uc,
+ ETH_ALEN, &cache));
+ duration += ktime_to_ns(ktime_sub(ktime_get(), start));
+
+ netif_addr_unlock_bh(netdev);
+ __hw_addr_flush(&snap);
+ }
+
+ __hw_addr_flush(&cache);
+
+ kunit_info(test,
+ "1024 addrs x 1000 snapshots: %lld ns total, %lld ns/iter",
+ duration, div_s64(duration, 1000));
+
+ rtnl_unlock();
+}
+
static struct kunit_case dev_addr_test_cases[] = {
KUNIT_CASE(dev_addr_test_basic),
KUNIT_CASE(dev_addr_test_sync_one),
@@ -232,6 +609,11 @@ static struct kunit_case dev_addr_test_cases[] = {
KUNIT_CASE(dev_addr_test_del_main),
KUNIT_CASE(dev_addr_test_add_set),
KUNIT_CASE(dev_addr_test_add_excl),
+ KUNIT_CASE(dev_addr_test_snapshot_sync),
+ KUNIT_CASE(dev_addr_test_snapshot_remove_during_sync),
+ KUNIT_CASE(dev_addr_test_snapshot_readd_during_unsync),
+ KUNIT_CASE(dev_addr_test_snapshot_add_and_remove),
+ KUNIT_CASE_SLOW(dev_addr_test_snapshot_benchmark),
{}
};
@@ -243,5 +625,6 @@ static struct kunit_suite dev_addr_test_suite = {
};
kunit_test_suite(dev_addr_test_suite);
+MODULE_IMPORT_NS("EXPORTED_FOR_KUNIT_TESTING");
MODULE_DESCRIPTION("KUnit tests for struct netdev_hw_addr_list");
MODULE_LICENSE("GPL");
diff --git a/net/core/dev_api.c b/net/core/dev_api.c
index f28852078aa6..437947dd08ed 100644
--- a/net/core/dev_api.c
+++ b/net/core/dev_api.c
@@ -66,6 +66,7 @@ int dev_change_flags(struct net_device *dev, unsigned int flags,
netdev_lock_ops(dev);
ret = netif_change_flags(dev, flags, extack);
+ netif_rx_mode_sync(dev);
netdev_unlock_ops(dev);
return ret;
@@ -285,6 +286,7 @@ int dev_set_promiscuity(struct net_device *dev, int inc)
netdev_lock_ops(dev);
ret = netif_set_promiscuity(dev, inc);
+ netif_rx_mode_sync(dev);
netdev_unlock_ops(dev);
return ret;
@@ -311,6 +313,7 @@ int dev_set_allmulti(struct net_device *dev, int inc)
netdev_lock_ops(dev);
ret = netif_set_allmulti(dev, inc, true);
+ netif_rx_mode_sync(dev);
netdev_unlock_ops(dev);
return ret;
diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c
index 7a8966544c9d..f3979b276090 100644
--- a/net/core/dev_ioctl.c
+++ b/net/core/dev_ioctl.c
@@ -586,24 +586,26 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, void __user *data,
return err;
case SIOCADDMULTI:
- if (!ops->ndo_set_rx_mode ||
+ if ((!ops->ndo_set_rx_mode && !ops->ndo_set_rx_mode_async) ||
ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
return -EINVAL;
if (!netif_device_present(dev))
return -ENODEV;
netdev_lock_ops(dev);
err = dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
+ netif_rx_mode_sync(dev);
netdev_unlock_ops(dev);
return err;
case SIOCDELMULTI:
- if (!ops->ndo_set_rx_mode ||
+ if ((!ops->ndo_set_rx_mode && !ops->ndo_set_rx_mode_async) ||
ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
return -EINVAL;
if (!netif_device_present(dev))
return -ENODEV;
netdev_lock_ops(dev);
err = dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
+ netif_rx_mode_sync(dev);
netdev_unlock_ops(dev);
return err;
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 69daba3ddaf0..b613bb6e07df 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -3431,6 +3431,7 @@ errout:
dev->name);
}
+ netif_rx_mode_sync(dev);
netdev_unlock_ops(dev);
return err;