diff options
| author | Jakub Kicinski <kuba@kernel.org> | 2026-06-12 17:20:55 -0700 |
|---|---|---|
| committer | Jakub Kicinski <kuba@kernel.org> | 2026-06-12 17:20:56 -0700 |
| commit | 592b792026eaab89efb84bed71b05994645fa790 (patch) | |
| tree | 07d421ce66a74971b35299515136e5e573cd148a | |
| parent | 344873108ca7f342f1a7ffeb81ffca2347fe9535 (diff) | |
| parent | 101f1047c2f6261d252d68ca3f77e52ed05a8402 (diff) | |
Merge branch 'avoid-mistaken-parent-class-deactivation-during-peek'
Victor Nogueira says:
====================
Avoid mistaken parent class deactivation during peek
Several qdiscs (fq_codel, codel and dualpi2) may drop packets while
peeking at their queue. When that happens they call
qdisc_tree_reduce_backlog() to notify the parent of the backlog/qlen
change. The problem is that they do so *before* reincrementing the qlen
that peek had temporarily decremented.
If the qlen momentarily drops to zero while peek still has an skb to
return, qdisc_tree_reduce_backlog() ends up invoking the parent's
qlen_notify() callback even though the child is not actually empty. The
parent then deactivates the class, while the child still holds a packet.
For parents such as QFQ this desync corrupts the active class list and
leads to wild memory accesses and NULL pointer dereferences (see the
per-patch splats). For HFSC it might lead to stalls [1].
Fix all three qdiscs the same way: only call qdisc_tree_reduce_backlog()
once the qlen has been restored, so the parent never observes a
transient empty child during peek.
Patch 1 fixes this for fq_codel, patch 2 for codel, patch 3 for dualpi2
and patch 4 adds test cases for these 3 setups.
Note: Patch 1 is one of two fixes for the stall reported in [1]; the
companion fix is "net/sched: sch_hfsc: Don't make class passive twice",
sent separately.
Note2: A possible cleaner fix is to create a new helper function for peek
that only calls qdisc_tree_reduce_backlog after reincrementing the qlen.
This would be called from the 3 vulnerable qdiscs, however we thought this
might make it harder for backporting so, if people agree, we can submit
this cleaner version to net-next after this one is merged.
[1] https://lore.kernel.org/netdev/CAN2cbVe79oj0O9==m4+4x3v+O+qzRagA=2=wkrp9i9=CqYvyZA@mail.gmail.com/
====================
Link: https://patch.msgid.link/20260610192855.3121513-1-victor@mojatatu.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
| -rw-r--r-- | net/sched/sch_codel.c | 48 | ||||
| -rw-r--r-- | net/sched/sch_dualpi2.c | 41 | ||||
| -rw-r--r-- | net/sched/sch_fq_codel.c | 41 | ||||
| -rw-r--r-- | tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json | 184 |
4 files changed, 305 insertions, 9 deletions
diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c index 317aae0ec7bd..7d5076196aff 100644 --- a/net/sched/sch_codel.c +++ b/net/sched/sch_codel.c @@ -56,7 +56,7 @@ static void drop_func(struct sk_buff *skb, void *ctx) qdisc_qstats_drop(sch); } -static struct sk_buff *codel_qdisc_dequeue(struct Qdisc *sch) +static struct sk_buff *__codel_qdisc_dequeue(struct Qdisc *sch) { struct codel_sched_data *q = qdisc_priv(sch); struct sk_buff *skb; @@ -65,13 +65,51 @@ static struct sk_buff *codel_qdisc_dequeue(struct Qdisc *sch) &q->stats, qdisc_pkt_len, codel_get_enqueue_time, drop_func, dequeue_func); + if (skb) + qdisc_bstats_update(sch, skb); + return skb; +} + +static void codel_dequeue_drop(struct Qdisc *sch) +{ + struct codel_sched_data *q = qdisc_priv(sch); + if (q->stats.drop_count) { - qdisc_tree_reduce_backlog(sch, q->stats.drop_count, q->stats.drop_len); + qdisc_tree_reduce_backlog(sch, q->stats.drop_count, + q->stats.drop_len); q->stats.drop_count = 0; q->stats.drop_len = 0; } - if (skb) - qdisc_bstats_update(sch, skb); +} + +static struct sk_buff *codel_qdisc_dequeue(struct Qdisc *sch) +{ + struct sk_buff *skb; + + skb = __codel_qdisc_dequeue(sch); + + codel_dequeue_drop(sch); + + return skb; +} + +static struct sk_buff *codel_peek(struct Qdisc *sch) +{ + struct sk_buff *skb = skb_peek(&sch->gso_skb); + + if (!skb) { + skb = __codel_qdisc_dequeue(sch); + + if (skb) { + __skb_queue_head(&sch->gso_skb, skb); + /* it's still part of the queue */ + qdisc_qstats_backlog_inc(sch, skb); + sch->q.qlen++; + } + + codel_dequeue_drop(sch); + } + return skb; } @@ -257,7 +295,7 @@ static struct Qdisc_ops codel_qdisc_ops __read_mostly = { .enqueue = codel_qdisc_enqueue, .dequeue = codel_qdisc_dequeue, - .peek = qdisc_peek_dequeued, + .peek = codel_peek, .init = codel_init, .reset = codel_reset, .change = codel_change, diff --git a/net/sched/sch_dualpi2.c b/net/sched/sch_dualpi2.c index a22489c14458..05285775b454 100644 --- a/net/sched/sch_dualpi2.c +++ b/net/sched/sch_dualpi2.c @@ -579,7 +579,7 @@ static void drop_and_retry(struct dualpi2_sched_data *q, struct sk_buff *skb, qdisc_qstats_drop(sch); } -static struct sk_buff *dualpi2_qdisc_dequeue(struct Qdisc *sch) +static struct sk_buff *__dualpi2_qdisc_dequeue(struct Qdisc *sch) { struct dualpi2_sched_data *q = qdisc_priv(sch); struct sk_buff *skb; @@ -605,12 +605,49 @@ static struct sk_buff *dualpi2_qdisc_dequeue(struct Qdisc *sch) break; } + return skb; +} + +static void dualpi2_dequeue_drop(struct Qdisc *sch) +{ + struct dualpi2_sched_data *q = qdisc_priv(sch); + if (q->deferred_drops_cnt) { qdisc_tree_reduce_backlog(sch, q->deferred_drops_cnt, q->deferred_drops_len); q->deferred_drops_cnt = 0; q->deferred_drops_len = 0; } +} + +static struct sk_buff *dualpi2_qdisc_dequeue(struct Qdisc *sch) +{ + struct sk_buff *skb; + + skb = __dualpi2_qdisc_dequeue(sch); + + dualpi2_dequeue_drop(sch); + + return skb; +} + +static struct sk_buff *dualpi2_peek(struct Qdisc *sch) +{ + struct sk_buff *skb = skb_peek(&sch->gso_skb); + + if (!skb) { + skb = __dualpi2_qdisc_dequeue(sch); + + if (skb) { + __skb_queue_head(&sch->gso_skb, skb); + /* it's still part of the queue */ + qdisc_qstats_backlog_inc(sch, skb); + sch->q.qlen++; + } + + dualpi2_dequeue_drop(sch); + } + return skb; } @@ -1165,7 +1202,7 @@ static struct Qdisc_ops dualpi2_qdisc_ops __read_mostly = { .priv_size = sizeof(struct dualpi2_sched_data), .enqueue = dualpi2_qdisc_enqueue, .dequeue = dualpi2_qdisc_dequeue, - .peek = qdisc_peek_dequeued, + .peek = dualpi2_peek, .init = dualpi2_init, .destroy = dualpi2_destroy, .reset = dualpi2_reset, diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index 24db54684e8a..9aebf25ddc79 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -280,7 +280,7 @@ static void drop_func(struct sk_buff *skb, void *ctx) qdisc_qstats_drop(sch); } -static struct sk_buff *fq_codel_dequeue(struct Qdisc *sch) +static struct sk_buff *__fq_codel_dequeue(struct Qdisc *sch) { struct fq_codel_sched_data *q = qdisc_priv(sch); struct sk_buff *skb; @@ -317,12 +317,49 @@ begin: qdisc_bstats_update(sch, skb); WRITE_ONCE(flow->deficit, flow->deficit - qdisc_pkt_len(skb)); + return skb; +} + +static void fq_codel_dequeue_drop(struct Qdisc *sch) +{ + struct fq_codel_sched_data *q = qdisc_priv(sch); + if (q->cstats.drop_count) { qdisc_tree_reduce_backlog(sch, q->cstats.drop_count, q->cstats.drop_len); q->cstats.drop_count = 0; q->cstats.drop_len = 0; } +} + +static struct sk_buff *fq_codel_dequeue(struct Qdisc *sch) +{ + struct sk_buff *skb; + + skb = __fq_codel_dequeue(sch); + + fq_codel_dequeue_drop(sch); + + return skb; +} + +static struct sk_buff *fq_codel_peek(struct Qdisc *sch) +{ + struct sk_buff *skb = skb_peek(&sch->gso_skb); + + if (!skb) { + skb = __fq_codel_dequeue(sch); + + if (skb) { + __skb_queue_head(&sch->gso_skb, skb); + /* it's still part of the queue */ + qdisc_qstats_backlog_inc(sch, skb); + sch->q.qlen++; + } + + fq_codel_dequeue_drop(sch); + } + return skb; } @@ -725,7 +762,7 @@ static struct Qdisc_ops fq_codel_qdisc_ops __read_mostly = { .priv_size = sizeof(struct fq_codel_sched_data), .enqueue = fq_codel_enqueue, .dequeue = fq_codel_dequeue, - .peek = qdisc_peek_dequeued, + .peek = fq_codel_peek, .init = fq_codel_init, .reset = fq_codel_reset, .destroy = fq_codel_destroy, diff --git a/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json b/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json index 82c38a13dfbf..e83e31b932dc 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json +++ b/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json @@ -1326,5 +1326,189 @@ "teardown": [ "$TC qdisc del dev $DUMMY handle 1: root" ] + }, + { + "id": "c797", + "name": "Verify fq_codel won't mistakenly deactivate QFQ parent class during peek", + "category": [ + "qdisc", + "qfq", + "fq_codel" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$IP link set dev $DUMMY up || true", + "$IP addr add 10.10.10.10/24 dev $DUMMY || true", + "$TC qdisc add dev $DUMMY root handle 1: qfq", + "$TC class add dev $DUMMY parent 1: classid 1:1 qfq weight 1 maxpkt 1000", + "$TC class add dev $DUMMY parent 1: classid 1:2 qfq weight 1 maxpkt 1000", + "$TC qdisc add dev $DUMMY parent 1:1 handle 2:0 plug limit 1024", + "$IP l set dev $DUMMY mtu 1500", + "$TC qdisc add dev $DUMMY parent 1:2 handle 10: fq_codel target 1 interval 1 flows 1", + "$TC filter add dev $DUMMY parent 1: protocol ip prio 1 u32 match ip dst 10.10.10.1/32 flowid 1:1", + "$TC filter add dev $DUMMY parent 1: protocol ip prio 2 u32 match ip dst 10.10.10.2/32 flowid 1:2", + "$IP l set dev $DUMMY mtu 65336", + "ping -c 1 -I $DUMMY 10.10.10.1 -W0.01 > /dev/null || true", + "ping -c 3 -s 2000 -I $DUMMY 10.10.10.2 -W0.01 > /dev/null || true", + "sleep 0.1" + ], + "cmdUnderTest": "$TC qdisc change dev $DUMMY handle 2:0 plug release_indefinite", + "expExitCode": "0", + "verifyCmd": "$TC -s -j qdisc show dev $DUMMY", + "matchJSON": [ + { + "kind": "qfq", + "handle": "1:", + "packets": 3, + "drops": 1, + "backlog": 0, + "qlen": 0 + }, + { + "kind": "plug", + "handle": "2:", + "packets": 1, + "drops": 0, + "backlog": 0, + "qlen": 0 + }, + { + "kind": "fq_codel", + "handle": "10:", + "packets": 2, + "drops": 1, + "backlog": 0, + "qlen": 0 + } + ], + "teardown": [ + "$TC qdisc del dev $DUMMY root", + "$IP addr del 10.10.10.10/24 dev $DUMMY || true" + ] + }, + { + "id": "82d9", + "name": "Verify codel won't mistakenly deactivate QFQ parent class during peek", + "category": [ + "qdisc", + "qfq", + "codel" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$IP link set dev $DUMMY up || true", + "$IP addr add 10.10.10.10/24 dev $DUMMY || true", + "$TC qdisc add dev $DUMMY root handle 1: qfq", + "$TC class add dev $DUMMY parent 1: classid 1:1 qfq weight 1 maxpkt 1000", + "$TC class add dev $DUMMY parent 1: classid 1:2 qfq weight 1 maxpkt 1000", + "$TC qdisc add dev $DUMMY parent 1:1 handle 2:0 plug limit 1024", + "$IP l set dev $DUMMY mtu 1500", + "$TC qdisc add dev $DUMMY parent 1:2 handle 10: codel target 1ms interval 1ms", + "$TC filter add dev $DUMMY parent 1: protocol ip prio 1 u32 match ip dst 10.10.10.1/32 flowid 1:1", + "$TC filter add dev $DUMMY parent 1: protocol ip prio 2 u32 match ip dst 10.10.10.2/32 flowid 1:2", + "$IP l set dev $DUMMY mtu 65336", + "ping -c 1 -I $DUMMY 10.10.10.1 -W0.01 > /dev/null || true", + "ping -c 3 -s 2000 -I $DUMMY 10.10.10.2 -W0.01 > /dev/null || true", + "sleep 0.1" + ], + "cmdUnderTest": "$TC qdisc change dev $DUMMY handle 2:0 plug release_indefinite", + "expExitCode": "0", + "verifyCmd": "$TC -s -j qdisc show dev $DUMMY", + "matchJSON": [ + { + "kind": "qfq", + "handle": "1:", + "packets": 3, + "drops": 1, + "backlog": 0, + "qlen": 0 + }, + { + "kind": "plug", + "handle": "2:", + "packets": 1, + "drops": 0, + "backlog": 0, + "qlen": 0 + }, + { + "kind": "codel", + "handle": "10:", + "packets": 2, + "drops": 1, + "backlog": 0, + "qlen": 0 + } + ], + "teardown": [ + "$TC qdisc del dev $DUMMY root", + "$IP addr del 10.10.10.10/24 dev $DUMMY || true" + ] + }, + { + "id": "d3da", + "name": "Verify dualpi2 won't mistakenly deactivate QFQ parent class during peek", + "category": [ + "qdisc", + "qfq", + "dualpi2" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$IP link set dev $DUMMY up || true", + "$IP addr add 10.10.10.10/24 dev $DUMMY || true" , + "$TC qdisc add dev $DUMMY root handle 1: qfq", + "$TC class add dev $DUMMY parent 1: classid 1:1 qfq weight 1 maxpkt 1000", + "$TC class add dev $DUMMY parent 1: classid 1:2 qfq weight 1 maxpkt 1000", + "$TC qdisc add dev $DUMMY parent 1:1 handle 2:0 plug limit 1024", + "$TC qdisc add dev $DUMMY parent 1:2 handle 10: dualpi2 step_thresh 500ms", + "$TC filter add dev $DUMMY parent 10: protocol ip prio 1 matchall classid 10:1 action ok", + "$TC filter add dev $DUMMY parent 1: protocol ip prio 1 u32 match ip dst 10.10.10.1/32 flowid 1:1", + "$TC filter add dev $DUMMY parent 1: protocol ip prio 2 u32 match ip dst 10.10.10.2/32 flowid 1:2", + "ping -c 1 -I $DUMMY 10.10.10.1 -W0.01 || true", + "ping -c 3 -i 0.1 -I $DUMMY 10.10.10.2 -W0.01 || true", + "sleep 0.7", + "ping -c 1 -I $DUMMY 10.10.10.2 -W0.01 || true", + "$TC qdisc change dev $DUMMY handle 2:0 plug release_indefinite" + ], + "cmdUnderTest": "ping -c 1 -I $DUMMY 10.10.10.1 -W0.01", + "expExitCode": "1", + "verifyCmd": "$TC -s -j qdisc show dev $DUMMY", + "matchJSON": [ + { + "kind": "qfq", + "handle": "1:", + "packets": 4, + "drops": 2, + "backlog": 0, + "qlen": 0 + }, + { + "kind": "plug", + "handle": "2:", + "packets": 2, + "drops": 0, + "backlog": 0, + "qlen": 0 + }, + { + "kind": "dualpi2", + "handle": "10:", + "packets": 2, + "drops": 2, + "backlog": 0, + "qlen": 0 + } + ], + "teardown": [ + "$TC qdisc del dev $DUMMY root", + "$IP addr del 10.10.10.10/24 dev $DUMMY || true" + ] } ] |
