diff options
author | Yunsheng Lin <linyunsheng@huawei.com> | 2021-05-14 06:17:00 +0300 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2021-05-15 01:05:46 +0300 |
commit | 102b55ee92f9fda4dde7a45d2b20538e6e3e3d1e (patch) | |
tree | b99ff14fdb63b9a505bab2d45f01289a925424a5 | |
parent | a90c57f2cedd52a511f739fb55e6244e22e1a2fb (diff) | |
download | linux-102b55ee92f9fda4dde7a45d2b20538e6e3e3d1e.tar.xz |
net: sched: fix tx action rescheduling issue during deactivation
Currently qdisc_run() checks the STATE_DEACTIVATED of lockless
qdisc before calling __qdisc_run(), which ultimately clear the
STATE_MISSED when all the skb is dequeued. If STATE_DEACTIVATED
is set before clearing STATE_MISSED, there may be rescheduling
of net_tx_action() at the end of qdisc_run_end(), see below:
CPU0(net_tx_atcion) CPU1(__dev_xmit_skb) CPU2(dev_deactivate)
. . .
. set STATE_MISSED .
. __netif_schedule() .
. . set STATE_DEACTIVATED
. . qdisc_reset()
. . .
.<--------------- . synchronize_net()
clear __QDISC_STATE_SCHED | . .
. | . .
. | . some_qdisc_is_busy()
. | . return *false*
. | . .
test STATE_DEACTIVATED | . .
__qdisc_run() *not* called | . .
. | . .
test STATE_MISS | . .
__netif_schedule()--------| . .
. . .
. . .
__qdisc_run() is not called by net_tx_atcion() in CPU0 because
CPU2 has set STATE_DEACTIVATED flag during dev_deactivate(), and
STATE_MISSED is only cleared in __qdisc_run(), __netif_schedule
is called at the end of qdisc_run_end(), causing tx action
rescheduling problem.
qdisc_run() called by net_tx_action() runs in the softirq context,
which should has the same semantic as the qdisc_run() called by
__dev_xmit_skb() protected by rcu_read_lock_bh(). And there is a
synchronize_net() between STATE_DEACTIVATED flag being set and
qdisc_reset()/some_qdisc_is_busy in dev_deactivate(), we can safely
bail out for the deactived lockless qdisc in net_tx_action(), and
qdisc_reset() will reset all skb not dequeued yet.
So add the rcu_read_lock() explicitly to protect the qdisc_run()
and do the STATE_DEACTIVATED checking in net_tx_action() before
calling qdisc_run_begin(). Another option is to do the checking in
the qdisc_run_end(), but it will add unnecessary overhead for
non-tx_action case, because __dev_queue_xmit() will not see qdisc
with STATE_DEACTIVATED after synchronize_net(), the qdisc with
STATE_DEACTIVATED can only be seen by net_tx_action() because of
__netif_schedule().
The STATE_DEACTIVATED checking in qdisc_run() is to avoid race
between net_tx_action() and qdisc_reset(), see:
commit d518d2ed8640 ("net/sched: fix race between deactivation
and dequeue for NOLOCK qdisc"). As the bailout added above for
deactived lockless qdisc in net_tx_action() provides better
protection for the race without calling qdisc_run() at all, so
remove the STATE_DEACTIVATED checking in qdisc_run().
After qdisc_reset(), there is no skb in qdisc to be dequeued, so
clear the STATE_MISSED in dev_reset_queue() too.
Fixes: 6b3ba9146fe6 ("net: sched: allow qdiscs to handle locking")
Acked-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
V8: Clearing STATE_MISSED before calling __netif_schedule() has
avoid the endless rescheduling problem, but there may still
be a unnecessary rescheduling, so adjust the commit log.
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r-- | include/net/pkt_sched.h | 7 | ||||
-rw-r--r-- | net/core/dev.c | 26 | ||||
-rw-r--r-- | net/sched/sch_generic.c | 4 |
3 files changed, 26 insertions, 11 deletions
diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index f5c1bee0cd6a..6d7b12cba015 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -128,12 +128,7 @@ void __qdisc_run(struct Qdisc *q); static inline void qdisc_run(struct Qdisc *q) { if (qdisc_run_begin(q)) { - /* NOLOCK qdisc must check 'state' under the qdisc seqlock - * to avoid racing with dev_qdisc_reset() - */ - if (!(q->flags & TCQ_F_NOLOCK) || - likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) - __qdisc_run(q); + __qdisc_run(q); qdisc_run_end(q); } } diff --git a/net/core/dev.c b/net/core/dev.c index 222b1d322c96..d596cd746353 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5025,25 +5025,43 @@ static __latent_entropy void net_tx_action(struct softirq_action *h) sd->output_queue_tailp = &sd->output_queue; local_irq_enable(); + rcu_read_lock(); + while (head) { struct Qdisc *q = head; spinlock_t *root_lock = NULL; head = head->next_sched; - if (!(q->flags & TCQ_F_NOLOCK)) { - root_lock = qdisc_lock(q); - spin_lock(root_lock); - } /* We need to make sure head->next_sched is read * before clearing __QDISC_STATE_SCHED */ smp_mb__before_atomic(); + + if (!(q->flags & TCQ_F_NOLOCK)) { + root_lock = qdisc_lock(q); + spin_lock(root_lock); + } else if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, + &q->state))) { + /* There is a synchronize_net() between + * STATE_DEACTIVATED flag being set and + * qdisc_reset()/some_qdisc_is_busy() in + * dev_deactivate(), so we can safely bail out + * early here to avoid data race between + * qdisc_deactivate() and some_qdisc_is_busy() + * for lockless qdisc. + */ + clear_bit(__QDISC_STATE_SCHED, &q->state); + continue; + } + clear_bit(__QDISC_STATE_SCHED, &q->state); qdisc_run(q); if (root_lock) spin_unlock(root_lock); } + + rcu_read_unlock(); } xfrm_dev_backlog(sd); diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 795d986e7030..d86c4cca2cab 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -1177,8 +1177,10 @@ static void dev_reset_queue(struct net_device *dev, qdisc_reset(qdisc); spin_unlock_bh(qdisc_lock(qdisc)); - if (nolock) + if (nolock) { + clear_bit(__QDISC_STATE_MISSED, &qdisc->state); spin_unlock_bh(&qdisc->seqlock); + } } static bool some_qdisc_is_busy(struct net_device *dev) |