diff options
Diffstat (limited to 'net/sched/sch_generic.c')
-rw-r--r-- | net/sched/sch_generic.c | 440 |
1 files changed, 317 insertions, 123 deletions
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index cd1b200acae7..981c08fe810b 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -26,6 +26,7 @@ #include <linux/list.h> #include <linux/slab.h> #include <linux/if_vlan.h> +#include <linux/skb_array.h> #include <linux/if_macvlan.h> #include <net/sch_generic.h> #include <net/pkt_sched.h> @@ -47,9 +48,70 @@ EXPORT_SYMBOL(default_qdisc_ops); * - updates to tree and tree walking are only done under the rtnl mutex. */ -static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q) +static inline struct sk_buff *__skb_dequeue_bad_txq(struct Qdisc *q) +{ + const struct netdev_queue *txq = q->dev_queue; + spinlock_t *lock = NULL; + struct sk_buff *skb; + + if (q->flags & TCQ_F_NOLOCK) { + lock = qdisc_lock(q); + spin_lock(lock); + } + + skb = skb_peek(&q->skb_bad_txq); + if (skb) { + /* check the reason of requeuing without tx lock first */ + txq = skb_get_tx_queue(txq->dev, skb); + if (!netif_xmit_frozen_or_stopped(txq)) { + skb = __skb_dequeue(&q->skb_bad_txq); + if (qdisc_is_percpu_stats(q)) { + qdisc_qstats_cpu_backlog_dec(q, skb); + qdisc_qstats_cpu_qlen_dec(q); + } else { + qdisc_qstats_backlog_dec(q, skb); + q->q.qlen--; + } + } else { + skb = NULL; + } + } + + if (lock) + spin_unlock(lock); + + return skb; +} + +static inline struct sk_buff *qdisc_dequeue_skb_bad_txq(struct Qdisc *q) +{ + struct sk_buff *skb = skb_peek(&q->skb_bad_txq); + + if (unlikely(skb)) + skb = __skb_dequeue_bad_txq(q); + + return skb; +} + +static inline void qdisc_enqueue_skb_bad_txq(struct Qdisc *q, + struct sk_buff *skb) { - q->gso_skb = skb; + spinlock_t *lock = NULL; + + if (q->flags & TCQ_F_NOLOCK) { + lock = qdisc_lock(q); + spin_lock(lock); + } + + __skb_queue_tail(&q->skb_bad_txq, skb); + + if (lock) + spin_unlock(lock); +} + +static inline int __dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q) +{ + __skb_queue_head(&q->gso_skb, skb); q->qstats.requeues++; qdisc_qstats_backlog_inc(q, skb); q->q.qlen++; /* it's still part of the queue */ @@ -58,6 +120,30 @@ static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q) return 0; } +static inline int dev_requeue_skb_locked(struct sk_buff *skb, struct Qdisc *q) +{ + spinlock_t *lock = qdisc_lock(q); + + spin_lock(lock); + __skb_queue_tail(&q->gso_skb, skb); + spin_unlock(lock); + + qdisc_qstats_cpu_requeues_inc(q); + qdisc_qstats_cpu_backlog_inc(q, skb); + qdisc_qstats_cpu_qlen_inc(q); + __netif_schedule(q); + + return 0; +} + +static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q) +{ + if (q->flags & TCQ_F_NOLOCK) + return dev_requeue_skb_locked(skb, q); + else + return __dev_requeue_skb(skb, q); +} + static void try_bulk_dequeue_skb(struct Qdisc *q, struct sk_buff *skb, const struct netdev_queue *txq, @@ -95,9 +181,15 @@ static void try_bulk_dequeue_skb_slow(struct Qdisc *q, if (!nskb) break; if (unlikely(skb_get_queue_mapping(nskb) != mapping)) { - q->skb_bad_txq = nskb; - qdisc_qstats_backlog_inc(q, nskb); - q->q.qlen++; + qdisc_enqueue_skb_bad_txq(q, nskb); + + if (qdisc_is_percpu_stats(q)) { + qdisc_qstats_cpu_backlog_inc(q, nskb); + qdisc_qstats_cpu_qlen_inc(q); + } else { + qdisc_qstats_backlog_inc(q, nskb); + q->q.qlen++; + } break; } skb->next = nskb; @@ -113,40 +205,60 @@ static void try_bulk_dequeue_skb_slow(struct Qdisc *q, static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate, int *packets) { - struct sk_buff *skb = q->gso_skb; const struct netdev_queue *txq = q->dev_queue; + struct sk_buff *skb = NULL; *packets = 1; - if (unlikely(skb)) { + if (unlikely(!skb_queue_empty(&q->gso_skb))) { + spinlock_t *lock = NULL; + + if (q->flags & TCQ_F_NOLOCK) { + lock = qdisc_lock(q); + spin_lock(lock); + } + + skb = skb_peek(&q->gso_skb); + + /* skb may be null if another cpu pulls gso_skb off in between + * empty check and lock. + */ + if (!skb) { + if (lock) + spin_unlock(lock); + goto validate; + } + /* skb in gso_skb were already validated */ *validate = false; /* check the reason of requeuing without tx lock first */ txq = skb_get_tx_queue(txq->dev, skb); if (!netif_xmit_frozen_or_stopped(txq)) { - q->gso_skb = NULL; - qdisc_qstats_backlog_dec(q, skb); - q->q.qlen--; - } else + skb = __skb_dequeue(&q->gso_skb); + if (qdisc_is_percpu_stats(q)) { + qdisc_qstats_cpu_backlog_dec(q, skb); + qdisc_qstats_cpu_qlen_dec(q); + } else { + qdisc_qstats_backlog_dec(q, skb); + q->q.qlen--; + } + } else { skb = NULL; - goto trace; - } - *validate = true; - skb = q->skb_bad_txq; - if (unlikely(skb)) { - /* check the reason of requeuing without tx lock first */ - txq = skb_get_tx_queue(txq->dev, skb); - if (!netif_xmit_frozen_or_stopped(txq)) { - q->skb_bad_txq = NULL; - qdisc_qstats_backlog_dec(q, skb); - q->q.qlen--; - goto bulk; } - skb = NULL; + if (lock) + spin_unlock(lock); goto trace; } - if (!(q->flags & TCQ_F_ONETXQUEUE) || - !netif_xmit_frozen_or_stopped(txq)) - skb = q->dequeue(q); +validate: + *validate = true; + + if ((q->flags & TCQ_F_ONETXQUEUE) && + netif_xmit_frozen_or_stopped(txq)) + return skb; + + skb = qdisc_dequeue_skb_bad_txq(q); + if (unlikely(skb)) + goto bulk; + skb = q->dequeue(q); if (skb) { bulk: if (qdisc_may_bulk(q)) @@ -165,17 +277,18 @@ trace: * only one CPU can execute this function. * * Returns to the caller: - * 0 - queue is empty or throttled. - * >0 - queue is not empty. + * false - hardware queue frozen backoff + * true - feel free to send more pkts */ -int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, - struct net_device *dev, struct netdev_queue *txq, - spinlock_t *root_lock, bool validate) +bool sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, + struct net_device *dev, struct netdev_queue *txq, + spinlock_t *root_lock, bool validate) { int ret = NETDEV_TX_BUSY; /* And release qdisc */ - spin_unlock(root_lock); + if (root_lock) + spin_unlock(root_lock); /* Note that we validate skb (GSO, checksum, ...) outside of locks */ if (validate) @@ -188,27 +301,28 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, HARD_TX_UNLOCK(dev, txq); } else { - spin_lock(root_lock); - return qdisc_qlen(q); + if (root_lock) + spin_lock(root_lock); + return true; } - spin_lock(root_lock); - if (dev_xmit_complete(ret)) { - /* Driver sent out skb successfully or skb was consumed */ - ret = qdisc_qlen(q); - } else { + if (root_lock) + spin_lock(root_lock); + + if (!dev_xmit_complete(ret)) { /* Driver returned NETDEV_TX_BUSY - requeue skb */ if (unlikely(ret != NETDEV_TX_BUSY)) net_warn_ratelimited("BUG %s code %d qlen %d\n", dev->name, ret, q->q.qlen); - ret = dev_requeue_skb(skb, q); + dev_requeue_skb(skb, q); + return false; } if (ret && netif_xmit_frozen_or_stopped(txq)) - ret = 0; + return false; - return ret; + return true; } /* @@ -230,20 +344,22 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, * >0 - queue is not empty. * */ -static inline int qdisc_restart(struct Qdisc *q, int *packets) +static inline bool qdisc_restart(struct Qdisc *q, int *packets) { + spinlock_t *root_lock = NULL; struct netdev_queue *txq; struct net_device *dev; - spinlock_t *root_lock; struct sk_buff *skb; bool validate; /* Dequeue packet */ skb = dequeue_skb(q, &validate, packets); if (unlikely(!skb)) - return 0; + return false; + + if (!(q->flags & TCQ_F_NOLOCK)) + root_lock = qdisc_lock(q); - root_lock = qdisc_lock(q); dev = qdisc_dev(q); txq = skb_get_tx_queue(dev, skb); @@ -267,8 +383,6 @@ void __qdisc_run(struct Qdisc *q) break; } } - - qdisc_run_end(q); } unsigned long dev_trans_start(struct net_device *dev) @@ -468,93 +582,93 @@ static const u8 prio2band[TC_PRIO_MAX + 1] = { /* * Private data for a pfifo_fast scheduler containing: - * - queues for the three band - * - bitmap indicating which of the bands contain skbs + * - rings for priority bands */ struct pfifo_fast_priv { - u32 bitmap; - struct qdisc_skb_head q[PFIFO_FAST_BANDS]; + struct skb_array q[PFIFO_FAST_BANDS]; }; -/* - * Convert a bitmap to the first band number where an skb is queued, where: - * bitmap=0 means there are no skbs on any band. - * bitmap=1 means there is an skb on band 0. - * bitmap=7 means there are skbs on all 3 bands, etc. - */ -static const int bitmap2band[] = {-1, 0, 1, 0, 2, 0, 1, 0}; - -static inline struct qdisc_skb_head *band2list(struct pfifo_fast_priv *priv, - int band) +static inline struct skb_array *band2list(struct pfifo_fast_priv *priv, + int band) { - return priv->q + band; + return &priv->q[band]; } static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc, struct sk_buff **to_free) { - if (qdisc->q.qlen < qdisc_dev(qdisc)->tx_queue_len) { - int band = prio2band[skb->priority & TC_PRIO_MAX]; - struct pfifo_fast_priv *priv = qdisc_priv(qdisc); - struct qdisc_skb_head *list = band2list(priv, band); - - priv->bitmap |= (1 << band); - qdisc->q.qlen++; - return __qdisc_enqueue_tail(skb, qdisc, list); - } + int band = prio2band[skb->priority & TC_PRIO_MAX]; + struct pfifo_fast_priv *priv = qdisc_priv(qdisc); + struct skb_array *q = band2list(priv, band); + int err; + + err = skb_array_produce(q, skb); - return qdisc_drop(skb, qdisc, to_free); + if (unlikely(err)) + return qdisc_drop_cpu(skb, qdisc, to_free); + + qdisc_qstats_cpu_qlen_inc(qdisc); + qdisc_qstats_cpu_backlog_inc(qdisc, skb); + return NET_XMIT_SUCCESS; } static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc) { struct pfifo_fast_priv *priv = qdisc_priv(qdisc); - int band = bitmap2band[priv->bitmap]; - - if (likely(band >= 0)) { - struct qdisc_skb_head *qh = band2list(priv, band); - struct sk_buff *skb = __qdisc_dequeue_head(qh); + struct sk_buff *skb = NULL; + int band; - if (likely(skb != NULL)) { - qdisc_qstats_backlog_dec(qdisc, skb); - qdisc_bstats_update(qdisc, skb); - } + for (band = 0; band < PFIFO_FAST_BANDS && !skb; band++) { + struct skb_array *q = band2list(priv, band); - qdisc->q.qlen--; - if (qh->qlen == 0) - priv->bitmap &= ~(1 << band); + if (__skb_array_empty(q)) + continue; - return skb; + skb = skb_array_consume_bh(q); + } + if (likely(skb)) { + qdisc_qstats_cpu_backlog_dec(qdisc, skb); + qdisc_bstats_cpu_update(qdisc, skb); + qdisc_qstats_cpu_qlen_dec(qdisc); } - return NULL; + return skb; } static struct sk_buff *pfifo_fast_peek(struct Qdisc *qdisc) { struct pfifo_fast_priv *priv = qdisc_priv(qdisc); - int band = bitmap2band[priv->bitmap]; + struct sk_buff *skb = NULL; + int band; - if (band >= 0) { - struct qdisc_skb_head *qh = band2list(priv, band); + for (band = 0; band < PFIFO_FAST_BANDS && !skb; band++) { + struct skb_array *q = band2list(priv, band); - return qh->head; + skb = __skb_array_peek(q); } - return NULL; + return skb; } static void pfifo_fast_reset(struct Qdisc *qdisc) { - int prio; + int i, band; struct pfifo_fast_priv *priv = qdisc_priv(qdisc); - for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) - __qdisc_reset_queue(band2list(priv, prio)); + for (band = 0; band < PFIFO_FAST_BANDS; band++) { + struct skb_array *q = band2list(priv, band); + struct sk_buff *skb; - priv->bitmap = 0; - qdisc->qstats.backlog = 0; - qdisc->q.qlen = 0; + while ((skb = skb_array_consume_bh(q)) != NULL) + kfree_skb(skb); + } + + for_each_possible_cpu(i) { + struct gnet_stats_queue *q = per_cpu_ptr(qdisc->cpu_qstats, i); + + q->backlog = 0; + q->qlen = 0; + } } static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb) @@ -572,17 +686,48 @@ nla_put_failure: static int pfifo_fast_init(struct Qdisc *qdisc, struct nlattr *opt) { - int prio; + unsigned int qlen = qdisc_dev(qdisc)->tx_queue_len; struct pfifo_fast_priv *priv = qdisc_priv(qdisc); + int prio; - for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) - qdisc_skb_head_init(band2list(priv, prio)); + /* guard against zero length rings */ + if (!qlen) + return -EINVAL; + + for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) { + struct skb_array *q = band2list(priv, prio); + int err; + + err = skb_array_init(q, qlen, GFP_KERNEL); + if (err) + return -ENOMEM; + } /* Can by-pass the queue discipline */ qdisc->flags |= TCQ_F_CAN_BYPASS; return 0; } +static void pfifo_fast_destroy(struct Qdisc *sch) +{ + struct pfifo_fast_priv *priv = qdisc_priv(sch); + int prio; + + for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) { + struct skb_array *q = band2list(priv, prio); + + /* NULL ring is possible if destroy path is due to a failed + * skb_array_init() in pfifo_fast_init() case. + */ + if (!&q->ring.queue) + continue; + /* Destroy ring but no need to kfree_skb because a call to + * pfifo_fast_reset() has already done that work. + */ + ptr_ring_cleanup(&q->ring, NULL); + } +} + struct Qdisc_ops pfifo_fast_ops __read_mostly = { .id = "pfifo_fast", .priv_size = sizeof(struct pfifo_fast_priv), @@ -590,9 +735,11 @@ struct Qdisc_ops pfifo_fast_ops __read_mostly = { .dequeue = pfifo_fast_dequeue, .peek = pfifo_fast_peek, .init = pfifo_fast_init, + .destroy = pfifo_fast_destroy, .reset = pfifo_fast_reset, .dump = pfifo_fast_dump, .owner = THIS_MODULE, + .static_flags = TCQ_F_NOLOCK | TCQ_F_CPUSTATS, }; EXPORT_SYMBOL(pfifo_fast_ops); @@ -630,9 +777,24 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p); sch->padded = (char *) sch - (char *) p; } + __skb_queue_head_init(&sch->gso_skb); + __skb_queue_head_init(&sch->skb_bad_txq); qdisc_skb_head_init(&sch->q); spin_lock_init(&sch->q.lock); + if (ops->static_flags & TCQ_F_CPUSTATS) { + sch->cpu_bstats = + netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu); + if (!sch->cpu_bstats) + goto errout1; + + sch->cpu_qstats = alloc_percpu(struct gnet_stats_queue); + if (!sch->cpu_qstats) { + free_percpu(sch->cpu_bstats); + goto errout1; + } + } + spin_lock_init(&sch->busylock); lockdep_set_class(&sch->busylock, dev->qdisc_tx_busylock ?: &qdisc_tx_busylock); @@ -642,6 +804,7 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, dev->qdisc_running_key ?: &qdisc_running_key); sch->ops = ops; + sch->flags = ops->static_flags; sch->enqueue = ops->enqueue; sch->dequeue = ops->dequeue; sch->dev_queue = dev_queue; @@ -649,6 +812,8 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, refcount_set(&sch->refcnt, 1); return sch; +errout1: + kfree(p); errout: return ERR_PTR(err); } @@ -682,17 +847,21 @@ EXPORT_SYMBOL(qdisc_create_dflt); void qdisc_reset(struct Qdisc *qdisc) { const struct Qdisc_ops *ops = qdisc->ops; + struct sk_buff *skb, *tmp; if (ops->reset) ops->reset(qdisc); - kfree_skb(qdisc->skb_bad_txq); - qdisc->skb_bad_txq = NULL; + skb_queue_walk_safe(&qdisc->gso_skb, skb, tmp) { + __skb_unlink(skb, &qdisc->gso_skb); + kfree_skb_list(skb); + } - if (qdisc->gso_skb) { - kfree_skb_list(qdisc->gso_skb); - qdisc->gso_skb = NULL; + skb_queue_walk_safe(&qdisc->skb_bad_txq, skb, tmp) { + __skb_unlink(skb, &qdisc->skb_bad_txq); + kfree_skb_list(skb); } + qdisc->q.qlen = 0; qdisc->qstats.backlog = 0; } @@ -711,6 +880,7 @@ static void qdisc_free(struct Qdisc *qdisc) void qdisc_destroy(struct Qdisc *qdisc) { const struct Qdisc_ops *ops = qdisc->ops; + struct sk_buff *skb, *tmp; if (qdisc->flags & TCQ_F_BUILTIN || !refcount_dec_and_test(&qdisc->refcnt)) @@ -730,8 +900,16 @@ void qdisc_destroy(struct Qdisc *qdisc) module_put(ops->owner); dev_put(qdisc_dev(qdisc)); - kfree_skb_list(qdisc->gso_skb); - kfree_skb(qdisc->skb_bad_txq); + skb_queue_walk_safe(&qdisc->gso_skb, skb, tmp) { + __skb_unlink(skb, &qdisc->gso_skb); + kfree_skb_list(skb); + } + + skb_queue_walk_safe(&qdisc->skb_bad_txq, skb, tmp) { + __skb_unlink(skb, &qdisc->skb_bad_txq); + kfree_skb_list(skb); + } + qdisc_free(qdisc); } EXPORT_SYMBOL(qdisc_destroy); @@ -746,10 +924,6 @@ struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue, root_lock = qdisc_lock(oqdisc); spin_lock_bh(root_lock); - /* Prune old scheduler */ - if (oqdisc && refcount_read(&oqdisc->refcnt) <= 1) - qdisc_reset(oqdisc); - /* ... and graft new one */ if (qdisc == NULL) qdisc = &noop_qdisc; @@ -885,14 +1059,18 @@ static bool some_qdisc_is_busy(struct net_device *dev) dev_queue = netdev_get_tx_queue(dev, i); q = dev_queue->qdisc_sleeping; - root_lock = qdisc_lock(q); - spin_lock_bh(root_lock); + if (q->flags & TCQ_F_NOLOCK) { + val = test_bit(__QDISC_STATE_SCHED, &q->state); + } else { + root_lock = qdisc_lock(q); + spin_lock_bh(root_lock); - val = (qdisc_is_running(q) || - test_bit(__QDISC_STATE_SCHED, &q->state)); + val = (qdisc_is_running(q) || + test_bit(__QDISC_STATE_SCHED, &q->state)); - spin_unlock_bh(root_lock); + spin_unlock_bh(root_lock); + } if (val) return true; @@ -900,6 +1078,16 @@ static bool some_qdisc_is_busy(struct net_device *dev) return false; } +static void dev_qdisc_reset(struct net_device *dev, + struct netdev_queue *dev_queue, + void *none) +{ + struct Qdisc *qdisc = dev_queue->qdisc_sleeping; + + if (qdisc) + qdisc_reset(qdisc); +} + /** * dev_deactivate_many - deactivate transmissions on several devices * @head: list of devices to deactivate @@ -910,7 +1098,6 @@ static bool some_qdisc_is_busy(struct net_device *dev) void dev_deactivate_many(struct list_head *head) { struct net_device *dev; - bool sync_needed = false; list_for_each_entry(dev, head, close_list) { netdev_for_each_tx_queue(dev, dev_deactivate_queue, @@ -920,20 +1107,25 @@ void dev_deactivate_many(struct list_head *head) &noop_qdisc); dev_watchdog_down(dev); - sync_needed |= !dev->dismantle; } /* Wait for outstanding qdisc-less dev_queue_xmit calls. * This is avoided if all devices are in dismantle phase : * Caller will call synchronize_net() for us */ - if (sync_needed) - synchronize_net(); + synchronize_net(); /* Wait for outstanding qdisc_run calls. */ - list_for_each_entry(dev, head, close_list) + list_for_each_entry(dev, head, close_list) { while (some_qdisc_is_busy(dev)) yield(); + /* The new qdisc is assigned at this point so we can safely + * unwind stale skb lists and qdisc statistics + */ + netdev_for_each_tx_queue(dev, dev_qdisc_reset, NULL); + if (dev_ingress_queue(dev)) + dev_qdisc_reset(dev, dev_ingress_queue(dev), NULL); + } } void dev_deactivate(struct net_device *dev) @@ -954,6 +1146,8 @@ static void dev_init_scheduler_queue(struct net_device *dev, rcu_assign_pointer(dev_queue->qdisc, qdisc); dev_queue->qdisc_sleeping = qdisc; + __skb_queue_head_init(&qdisc->gso_skb); + __skb_queue_head_init(&qdisc->skb_bad_txq); } void dev_init_scheduler(struct net_device *dev) |