summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJakub Kicinski <kuba@kernel.org>2026-06-12 17:20:55 -0700
committerJakub Kicinski <kuba@kernel.org>2026-06-12 17:20:56 -0700
commit592b792026eaab89efb84bed71b05994645fa790 (patch)
tree07d421ce66a74971b35299515136e5e573cd148a
parent344873108ca7f342f1a7ffeb81ffca2347fe9535 (diff)
parent101f1047c2f6261d252d68ca3f77e52ed05a8402 (diff)
Merge branch 'avoid-mistaken-parent-class-deactivation-during-peek'
Victor Nogueira says: ==================== Avoid mistaken parent class deactivation during peek Several qdiscs (fq_codel, codel and dualpi2) may drop packets while peeking at their queue. When that happens they call qdisc_tree_reduce_backlog() to notify the parent of the backlog/qlen change. The problem is that they do so *before* reincrementing the qlen that peek had temporarily decremented. If the qlen momentarily drops to zero while peek still has an skb to return, qdisc_tree_reduce_backlog() ends up invoking the parent's qlen_notify() callback even though the child is not actually empty. The parent then deactivates the class, while the child still holds a packet. For parents such as QFQ this desync corrupts the active class list and leads to wild memory accesses and NULL pointer dereferences (see the per-patch splats). For HFSC it might lead to stalls [1]. Fix all three qdiscs the same way: only call qdisc_tree_reduce_backlog() once the qlen has been restored, so the parent never observes a transient empty child during peek. Patch 1 fixes this for fq_codel, patch 2 for codel, patch 3 for dualpi2 and patch 4 adds test cases for these 3 setups. Note: Patch 1 is one of two fixes for the stall reported in [1]; the companion fix is "net/sched: sch_hfsc: Don't make class passive twice", sent separately. Note2: A possible cleaner fix is to create a new helper function for peek that only calls qdisc_tree_reduce_backlog after reincrementing the qlen. This would be called from the 3 vulnerable qdiscs, however we thought this might make it harder for backporting so, if people agree, we can submit this cleaner version to net-next after this one is merged. [1] https://lore.kernel.org/netdev/CAN2cbVe79oj0O9==m4+4x3v+O+qzRagA=2=wkrp9i9=CqYvyZA@mail.gmail.com/ ==================== Link: https://patch.msgid.link/20260610192855.3121513-1-victor@mojatatu.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
-rw-r--r--net/sched/sch_codel.c48
-rw-r--r--net/sched/sch_dualpi2.c41
-rw-r--r--net/sched/sch_fq_codel.c41
-rw-r--r--tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json184
4 files changed, 305 insertions, 9 deletions
diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c
index 317aae0ec7bd..7d5076196aff 100644
--- a/net/sched/sch_codel.c
+++ b/net/sched/sch_codel.c
@@ -56,7 +56,7 @@ static void drop_func(struct sk_buff *skb, void *ctx)
qdisc_qstats_drop(sch);
}
-static struct sk_buff *codel_qdisc_dequeue(struct Qdisc *sch)
+static struct sk_buff *__codel_qdisc_dequeue(struct Qdisc *sch)
{
struct codel_sched_data *q = qdisc_priv(sch);
struct sk_buff *skb;
@@ -65,13 +65,51 @@ static struct sk_buff *codel_qdisc_dequeue(struct Qdisc *sch)
&q->stats, qdisc_pkt_len, codel_get_enqueue_time,
drop_func, dequeue_func);
+ if (skb)
+ qdisc_bstats_update(sch, skb);
+ return skb;
+}
+
+static void codel_dequeue_drop(struct Qdisc *sch)
+{
+ struct codel_sched_data *q = qdisc_priv(sch);
+
if (q->stats.drop_count) {
- qdisc_tree_reduce_backlog(sch, q->stats.drop_count, q->stats.drop_len);
+ qdisc_tree_reduce_backlog(sch, q->stats.drop_count,
+ q->stats.drop_len);
q->stats.drop_count = 0;
q->stats.drop_len = 0;
}
- if (skb)
- qdisc_bstats_update(sch, skb);
+}
+
+static struct sk_buff *codel_qdisc_dequeue(struct Qdisc *sch)
+{
+ struct sk_buff *skb;
+
+ skb = __codel_qdisc_dequeue(sch);
+
+ codel_dequeue_drop(sch);
+
+ return skb;
+}
+
+static struct sk_buff *codel_peek(struct Qdisc *sch)
+{
+ struct sk_buff *skb = skb_peek(&sch->gso_skb);
+
+ if (!skb) {
+ skb = __codel_qdisc_dequeue(sch);
+
+ if (skb) {
+ __skb_queue_head(&sch->gso_skb, skb);
+ /* it's still part of the queue */
+ qdisc_qstats_backlog_inc(sch, skb);
+ sch->q.qlen++;
+ }
+
+ codel_dequeue_drop(sch);
+ }
+
return skb;
}
@@ -257,7 +295,7 @@ static struct Qdisc_ops codel_qdisc_ops __read_mostly = {
.enqueue = codel_qdisc_enqueue,
.dequeue = codel_qdisc_dequeue,
- .peek = qdisc_peek_dequeued,
+ .peek = codel_peek,
.init = codel_init,
.reset = codel_reset,
.change = codel_change,
diff --git a/net/sched/sch_dualpi2.c b/net/sched/sch_dualpi2.c
index a22489c14458..05285775b454 100644
--- a/net/sched/sch_dualpi2.c
+++ b/net/sched/sch_dualpi2.c
@@ -579,7 +579,7 @@ static void drop_and_retry(struct dualpi2_sched_data *q, struct sk_buff *skb,
qdisc_qstats_drop(sch);
}
-static struct sk_buff *dualpi2_qdisc_dequeue(struct Qdisc *sch)
+static struct sk_buff *__dualpi2_qdisc_dequeue(struct Qdisc *sch)
{
struct dualpi2_sched_data *q = qdisc_priv(sch);
struct sk_buff *skb;
@@ -605,12 +605,49 @@ static struct sk_buff *dualpi2_qdisc_dequeue(struct Qdisc *sch)
break;
}
+ return skb;
+}
+
+static void dualpi2_dequeue_drop(struct Qdisc *sch)
+{
+ struct dualpi2_sched_data *q = qdisc_priv(sch);
+
if (q->deferred_drops_cnt) {
qdisc_tree_reduce_backlog(sch, q->deferred_drops_cnt,
q->deferred_drops_len);
q->deferred_drops_cnt = 0;
q->deferred_drops_len = 0;
}
+}
+
+static struct sk_buff *dualpi2_qdisc_dequeue(struct Qdisc *sch)
+{
+ struct sk_buff *skb;
+
+ skb = __dualpi2_qdisc_dequeue(sch);
+
+ dualpi2_dequeue_drop(sch);
+
+ return skb;
+}
+
+static struct sk_buff *dualpi2_peek(struct Qdisc *sch)
+{
+ struct sk_buff *skb = skb_peek(&sch->gso_skb);
+
+ if (!skb) {
+ skb = __dualpi2_qdisc_dequeue(sch);
+
+ if (skb) {
+ __skb_queue_head(&sch->gso_skb, skb);
+ /* it's still part of the queue */
+ qdisc_qstats_backlog_inc(sch, skb);
+ sch->q.qlen++;
+ }
+
+ dualpi2_dequeue_drop(sch);
+ }
+
return skb;
}
@@ -1165,7 +1202,7 @@ static struct Qdisc_ops dualpi2_qdisc_ops __read_mostly = {
.priv_size = sizeof(struct dualpi2_sched_data),
.enqueue = dualpi2_qdisc_enqueue,
.dequeue = dualpi2_qdisc_dequeue,
- .peek = qdisc_peek_dequeued,
+ .peek = dualpi2_peek,
.init = dualpi2_init,
.destroy = dualpi2_destroy,
.reset = dualpi2_reset,
diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c
index 24db54684e8a..9aebf25ddc79 100644
--- a/net/sched/sch_fq_codel.c
+++ b/net/sched/sch_fq_codel.c
@@ -280,7 +280,7 @@ static void drop_func(struct sk_buff *skb, void *ctx)
qdisc_qstats_drop(sch);
}
-static struct sk_buff *fq_codel_dequeue(struct Qdisc *sch)
+static struct sk_buff *__fq_codel_dequeue(struct Qdisc *sch)
{
struct fq_codel_sched_data *q = qdisc_priv(sch);
struct sk_buff *skb;
@@ -317,12 +317,49 @@ begin:
qdisc_bstats_update(sch, skb);
WRITE_ONCE(flow->deficit, flow->deficit - qdisc_pkt_len(skb));
+ return skb;
+}
+
+static void fq_codel_dequeue_drop(struct Qdisc *sch)
+{
+ struct fq_codel_sched_data *q = qdisc_priv(sch);
+
if (q->cstats.drop_count) {
qdisc_tree_reduce_backlog(sch, q->cstats.drop_count,
q->cstats.drop_len);
q->cstats.drop_count = 0;
q->cstats.drop_len = 0;
}
+}
+
+static struct sk_buff *fq_codel_dequeue(struct Qdisc *sch)
+{
+ struct sk_buff *skb;
+
+ skb = __fq_codel_dequeue(sch);
+
+ fq_codel_dequeue_drop(sch);
+
+ return skb;
+}
+
+static struct sk_buff *fq_codel_peek(struct Qdisc *sch)
+{
+ struct sk_buff *skb = skb_peek(&sch->gso_skb);
+
+ if (!skb) {
+ skb = __fq_codel_dequeue(sch);
+
+ if (skb) {
+ __skb_queue_head(&sch->gso_skb, skb);
+ /* it's still part of the queue */
+ qdisc_qstats_backlog_inc(sch, skb);
+ sch->q.qlen++;
+ }
+
+ fq_codel_dequeue_drop(sch);
+ }
+
return skb;
}
@@ -725,7 +762,7 @@ static struct Qdisc_ops fq_codel_qdisc_ops __read_mostly = {
.priv_size = sizeof(struct fq_codel_sched_data),
.enqueue = fq_codel_enqueue,
.dequeue = fq_codel_dequeue,
- .peek = qdisc_peek_dequeued,
+ .peek = fq_codel_peek,
.init = fq_codel_init,
.reset = fq_codel_reset,
.destroy = fq_codel_destroy,
diff --git a/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json b/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json
index 82c38a13dfbf..e83e31b932dc 100644
--- a/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json
+++ b/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json
@@ -1326,5 +1326,189 @@
"teardown": [
"$TC qdisc del dev $DUMMY handle 1: root"
]
+ },
+ {
+ "id": "c797",
+ "name": "Verify fq_codel won't mistakenly deactivate QFQ parent class during peek",
+ "category": [
+ "qdisc",
+ "qfq",
+ "fq_codel"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$IP link set dev $DUMMY up || true",
+ "$IP addr add 10.10.10.10/24 dev $DUMMY || true",
+ "$TC qdisc add dev $DUMMY root handle 1: qfq",
+ "$TC class add dev $DUMMY parent 1: classid 1:1 qfq weight 1 maxpkt 1000",
+ "$TC class add dev $DUMMY parent 1: classid 1:2 qfq weight 1 maxpkt 1000",
+ "$TC qdisc add dev $DUMMY parent 1:1 handle 2:0 plug limit 1024",
+ "$IP l set dev $DUMMY mtu 1500",
+ "$TC qdisc add dev $DUMMY parent 1:2 handle 10: fq_codel target 1 interval 1 flows 1",
+ "$TC filter add dev $DUMMY parent 1: protocol ip prio 1 u32 match ip dst 10.10.10.1/32 flowid 1:1",
+ "$TC filter add dev $DUMMY parent 1: protocol ip prio 2 u32 match ip dst 10.10.10.2/32 flowid 1:2",
+ "$IP l set dev $DUMMY mtu 65336",
+ "ping -c 1 -I $DUMMY 10.10.10.1 -W0.01 > /dev/null || true",
+ "ping -c 3 -s 2000 -I $DUMMY 10.10.10.2 -W0.01 > /dev/null || true",
+ "sleep 0.1"
+ ],
+ "cmdUnderTest": "$TC qdisc change dev $DUMMY handle 2:0 plug release_indefinite",
+ "expExitCode": "0",
+ "verifyCmd": "$TC -s -j qdisc show dev $DUMMY",
+ "matchJSON": [
+ {
+ "kind": "qfq",
+ "handle": "1:",
+ "packets": 3,
+ "drops": 1,
+ "backlog": 0,
+ "qlen": 0
+ },
+ {
+ "kind": "plug",
+ "handle": "2:",
+ "packets": 1,
+ "drops": 0,
+ "backlog": 0,
+ "qlen": 0
+ },
+ {
+ "kind": "fq_codel",
+ "handle": "10:",
+ "packets": 2,
+ "drops": 1,
+ "backlog": 0,
+ "qlen": 0
+ }
+ ],
+ "teardown": [
+ "$TC qdisc del dev $DUMMY root",
+ "$IP addr del 10.10.10.10/24 dev $DUMMY || true"
+ ]
+ },
+ {
+ "id": "82d9",
+ "name": "Verify codel won't mistakenly deactivate QFQ parent class during peek",
+ "category": [
+ "qdisc",
+ "qfq",
+ "codel"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$IP link set dev $DUMMY up || true",
+ "$IP addr add 10.10.10.10/24 dev $DUMMY || true",
+ "$TC qdisc add dev $DUMMY root handle 1: qfq",
+ "$TC class add dev $DUMMY parent 1: classid 1:1 qfq weight 1 maxpkt 1000",
+ "$TC class add dev $DUMMY parent 1: classid 1:2 qfq weight 1 maxpkt 1000",
+ "$TC qdisc add dev $DUMMY parent 1:1 handle 2:0 plug limit 1024",
+ "$IP l set dev $DUMMY mtu 1500",
+ "$TC qdisc add dev $DUMMY parent 1:2 handle 10: codel target 1ms interval 1ms",
+ "$TC filter add dev $DUMMY parent 1: protocol ip prio 1 u32 match ip dst 10.10.10.1/32 flowid 1:1",
+ "$TC filter add dev $DUMMY parent 1: protocol ip prio 2 u32 match ip dst 10.10.10.2/32 flowid 1:2",
+ "$IP l set dev $DUMMY mtu 65336",
+ "ping -c 1 -I $DUMMY 10.10.10.1 -W0.01 > /dev/null || true",
+ "ping -c 3 -s 2000 -I $DUMMY 10.10.10.2 -W0.01 > /dev/null || true",
+ "sleep 0.1"
+ ],
+ "cmdUnderTest": "$TC qdisc change dev $DUMMY handle 2:0 plug release_indefinite",
+ "expExitCode": "0",
+ "verifyCmd": "$TC -s -j qdisc show dev $DUMMY",
+ "matchJSON": [
+ {
+ "kind": "qfq",
+ "handle": "1:",
+ "packets": 3,
+ "drops": 1,
+ "backlog": 0,
+ "qlen": 0
+ },
+ {
+ "kind": "plug",
+ "handle": "2:",
+ "packets": 1,
+ "drops": 0,
+ "backlog": 0,
+ "qlen": 0
+ },
+ {
+ "kind": "codel",
+ "handle": "10:",
+ "packets": 2,
+ "drops": 1,
+ "backlog": 0,
+ "qlen": 0
+ }
+ ],
+ "teardown": [
+ "$TC qdisc del dev $DUMMY root",
+ "$IP addr del 10.10.10.10/24 dev $DUMMY || true"
+ ]
+ },
+ {
+ "id": "d3da",
+ "name": "Verify dualpi2 won't mistakenly deactivate QFQ parent class during peek",
+ "category": [
+ "qdisc",
+ "qfq",
+ "dualpi2"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$IP link set dev $DUMMY up || true",
+ "$IP addr add 10.10.10.10/24 dev $DUMMY || true" ,
+ "$TC qdisc add dev $DUMMY root handle 1: qfq",
+ "$TC class add dev $DUMMY parent 1: classid 1:1 qfq weight 1 maxpkt 1000",
+ "$TC class add dev $DUMMY parent 1: classid 1:2 qfq weight 1 maxpkt 1000",
+ "$TC qdisc add dev $DUMMY parent 1:1 handle 2:0 plug limit 1024",
+ "$TC qdisc add dev $DUMMY parent 1:2 handle 10: dualpi2 step_thresh 500ms",
+ "$TC filter add dev $DUMMY parent 10: protocol ip prio 1 matchall classid 10:1 action ok",
+ "$TC filter add dev $DUMMY parent 1: protocol ip prio 1 u32 match ip dst 10.10.10.1/32 flowid 1:1",
+ "$TC filter add dev $DUMMY parent 1: protocol ip prio 2 u32 match ip dst 10.10.10.2/32 flowid 1:2",
+ "ping -c 1 -I $DUMMY 10.10.10.1 -W0.01 || true",
+ "ping -c 3 -i 0.1 -I $DUMMY 10.10.10.2 -W0.01 || true",
+ "sleep 0.7",
+ "ping -c 1 -I $DUMMY 10.10.10.2 -W0.01 || true",
+ "$TC qdisc change dev $DUMMY handle 2:0 plug release_indefinite"
+ ],
+ "cmdUnderTest": "ping -c 1 -I $DUMMY 10.10.10.1 -W0.01",
+ "expExitCode": "1",
+ "verifyCmd": "$TC -s -j qdisc show dev $DUMMY",
+ "matchJSON": [
+ {
+ "kind": "qfq",
+ "handle": "1:",
+ "packets": 4,
+ "drops": 2,
+ "backlog": 0,
+ "qlen": 0
+ },
+ {
+ "kind": "plug",
+ "handle": "2:",
+ "packets": 2,
+ "drops": 0,
+ "backlog": 0,
+ "qlen": 0
+ },
+ {
+ "kind": "dualpi2",
+ "handle": "10:",
+ "packets": 2,
+ "drops": 2,
+ "backlog": 0,
+ "qlen": 0
+ }
+ ],
+ "teardown": [
+ "$TC qdisc del dev $DUMMY root",
+ "$IP addr del 10.10.10.10/24 dev $DUMMY || true"
+ ]
}
]