summaryrefslogtreecommitdiff
path: root/net
diff options
context:
space:
mode:
Diffstat (limited to 'net')
-rw-r--r--net/6lowpan/iphc.c4
-rw-r--r--net/802/garp.c2
-rw-r--r--net/802/mrp.c9
-rw-r--r--net/appletalk/aarp.c7
-rw-r--r--net/atm/signaling.c1
-rw-r--r--net/batman-adv/bat_iv_ogm.c167
-rw-r--r--net/batman-adv/bat_v_ogm.c59
-rw-r--r--net/batman-adv/bridge_loop_avoidance.c120
-rw-r--r--net/batman-adv/distributed-arp-table.c3
-rw-r--r--net/batman-adv/fragmentation.c58
-rw-r--r--net/batman-adv/gateway_client.c4
-rw-r--r--net/batman-adv/main.c1
-rw-r--r--net/batman-adv/mesh-interface.c1
-rw-r--r--net/batman-adv/originator.c4
-rw-r--r--net/batman-adv/tp_meter.c229
-rw-r--r--net/batman-adv/tp_meter.h1
-rw-r--r--net/batman-adv/translation-table.c55
-rw-r--r--net/batman-adv/tvlv.c28
-rw-r--r--net/batman-adv/tvlv.h2
-rw-r--r--net/batman-adv/types.h63
-rw-r--r--net/bluetooth/6lowpan.c2
-rw-r--r--net/bluetooth/af_bluetooth.c97
-rw-r--r--net/bluetooth/bnep/core.c52
-rw-r--r--net/bluetooth/hci_conn.c128
-rw-r--r--net/bluetooth/hci_core.c43
-rw-r--r--net/bluetooth/hci_event.c29
-rw-r--r--net/bluetooth/hci_sync.c27
-rw-r--r--net/bluetooth/hci_sysfs.c6
-rw-r--r--net/bluetooth/hidp/core.c50
-rw-r--r--net/bluetooth/iso.c145
-rw-r--r--net/bluetooth/l2cap_core.c103
-rw-r--r--net/bluetooth/l2cap_sock.c66
-rw-r--r--net/bluetooth/mgmt.c19
-rw-r--r--net/bluetooth/rfcomm/core.c74
-rw-r--r--net/bluetooth/rfcomm/sock.c35
-rw-r--r--net/bluetooth/sco.c89
-rw-r--r--net/bridge/br_multicast.c22
-rw-r--r--net/bridge/br_netfilter_hooks.c6
-rw-r--r--net/bridge/br_netlink.c27
-rw-r--r--net/bridge/br_switchdev.c1
-rw-r--r--net/bridge/br_sysfs_if.c30
-rw-r--r--net/bridge/netfilter/ebt_snat.c3
-rw-r--r--net/bridge/netfilter/ebtable_broute.c14
-rw-r--r--net/bridge/netfilter/ebtable_filter.c14
-rw-r--r--net/bridge/netfilter/ebtable_nat.c12
-rw-r--r--net/bridge/netfilter/ebtables.c101
-rw-r--r--net/ceph/auth_x.c5
-rw-r--r--net/ceph/crush/crush.c6
-rw-r--r--net/ceph/osdmap.c17
-rw-r--r--net/core/bpf_sk_storage.c14
-rw-r--r--net/core/dev.c23
-rw-r--r--net/core/devmem.c11
-rw-r--r--net/core/failover.c6
-rw-r--r--net/core/filter.c57
-rw-r--r--net/core/gro.c7
-rw-r--r--net/core/netmem_priv.h23
-rw-r--r--net/core/netpoll.c25
-rw-r--r--net/core/page_pool.c24
-rw-r--r--net/core/rtnetlink.c6
-rw-r--r--net/core/skbuff.c56
-rw-r--r--net/core/skmsg.c9
-rw-r--r--net/core/sock.c8
-rw-r--r--net/core/sock_map.c39
-rw-r--r--net/devlink/core.c2
-rw-r--r--net/ethtool/bitset.c8
-rw-r--r--net/ethtool/cmis.h4
-rw-r--r--net/ethtool/cmis_cdb.c9
-rw-r--r--net/ethtool/cmis_fw_update.c44
-rw-r--r--net/ethtool/coalesce.c6
-rw-r--r--net/ethtool/eeprom.c10
-rw-r--r--net/ethtool/linkstate.c6
-rw-r--r--net/ethtool/module.c41
-rw-r--r--net/ethtool/netlink.c4
-rw-r--r--net/ethtool/netlink.h4
-rw-r--r--net/ethtool/phy.c36
-rw-r--r--net/ethtool/pse-pd.c10
-rw-r--r--net/ethtool/rss.c37
-rw-r--r--net/ethtool/strset.c2
-rw-r--r--net/ethtool/tsconfig.c15
-rw-r--r--net/ethtool/tsinfo.c19
-rw-r--r--net/handshake/genl.c3
-rw-r--r--net/handshake/genl.h1
-rw-r--r--net/handshake/handshake-test.c38
-rw-r--r--net/handshake/handshake.h6
-rw-r--r--net/handshake/netlink.c29
-rw-r--r--net/handshake/request.c81
-rw-r--r--net/handshake/tlshd.c6
-rw-r--r--net/hsr/hsr_forward.c4
-rw-r--r--net/hsr/hsr_framereg.c13
-rw-r--r--net/ieee802154/6lowpan/tx.c5
-rw-r--r--net/ipv4/ah4.c16
-rw-r--r--net/ipv4/bpf_tcp_ca.c2
-rw-r--r--net/ipv4/esp4.c7
-rw-r--r--net/ipv4/icmp.c3
-rw-r--r--net/ipv4/igmp.c58
-rw-r--r--net/ipv4/inet_connection_sock.c10
-rw-r--r--net/ipv4/inetpeer.c3
-rw-r--r--net/ipv4/ip_options.c4
-rw-r--r--net/ipv4/ip_output.c2
-rw-r--r--net/ipv4/ip_tunnel_core.c22
-rw-r--r--net/ipv4/ipmr.c10
-rw-r--r--net/ipv4/netfilter/arp_tables.c53
-rw-r--r--net/ipv4/netfilter/arptable_filter.c27
-rw-r--r--net/ipv4/netfilter/ip_tables.c59
-rw-r--r--net/ipv4/netfilter/iptable_filter.c27
-rw-r--r--net/ipv4/netfilter/iptable_mangle.c29
-rw-r--r--net/ipv4/netfilter/iptable_nat.c6
-rw-r--r--net/ipv4/netfilter/iptable_raw.c26
-rw-r--r--net/ipv4/netfilter/iptable_security.c27
-rw-r--r--net/ipv4/netfilter/nf_socket_ipv4.c3
-rw-r--r--net/ipv4/raw.c2
-rw-r--r--net/ipv4/route.c2
-rw-r--r--net/ipv4/sysctl_net_ipv4.c2
-rw-r--r--net/ipv4/tcp.c3
-rw-r--r--net/ipv4/tcp_ao.c3
-rw-r--r--net/ipv4/tcp_input.c15
-rw-r--r--net/ipv4/tcp_ipv4.c17
-rw-r--r--net/ipv4/tcp_minisocks.c2
-rw-r--r--net/ipv4/tcp_output.c1
-rw-r--r--net/ipv4/udp.c8
-rw-r--r--net/ipv4/udp_offload.c22
-rw-r--r--net/ipv6/Kconfig4
-rw-r--r--net/ipv6/addrconf.c2
-rw-r--r--net/ipv6/ah6.c16
-rw-r--r--net/ipv6/anycast.c16
-rw-r--r--net/ipv6/datagram.c54
-rw-r--r--net/ipv6/esp6.c7
-rw-r--r--net/ipv6/exthdrs.c27
-rw-r--r--net/ipv6/exthdrs_core.c7
-rw-r--r--net/ipv6/ip6_flowlabel.c46
-rw-r--r--net/ipv6/ip6_gre.c5
-rw-r--r--net/ipv6/ip6_input.c5
-rw-r--r--net/ipv6/ip6_output.c5
-rw-r--r--net/ipv6/ip6_tunnel.c4
-rw-r--r--net/ipv6/ip6_vti.c23
-rw-r--r--net/ipv6/mcast.c8
-rw-r--r--net/ipv6/netfilter/ip6_tables.c56
-rw-r--r--net/ipv6/netfilter/ip6t_hbh.c4
-rw-r--r--net/ipv6/netfilter/ip6table_filter.c26
-rw-r--r--net/ipv6/netfilter/ip6table_mangle.c27
-rw-r--r--net/ipv6/netfilter/ip6table_nat.c6
-rw-r--r--net/ipv6/netfilter/ip6table_raw.c24
-rw-r--r--net/ipv6/netfilter/ip6table_security.c27
-rw-r--r--net/ipv6/netfilter/nf_socket_ipv6.c5
-rw-r--r--net/ipv6/netfilter/nft_fib_ipv6.c21
-rw-r--r--net/ipv6/route.c16
-rw-r--r--net/ipv6/tcp_ipv6.c20
-rw-r--r--net/ipv6/xfrm6_protocol.c4
-rw-r--r--net/iucv/af_iucv.c20
-rw-r--r--net/key/af_key.c6
-rw-r--r--net/l2tp/l2tp_core.c13
-rw-r--r--net/l2tp/l2tp_ppp.c82
-rw-r--r--net/mac80211/cfg.c5
-rw-r--r--net/mac80211/mlme.c23
-rw-r--r--net/mac80211/parse.c105
-rw-r--r--net/mac80211/rx.c12
-rw-r--r--net/mac80211/tests/chan-mode.c1
-rw-r--r--net/mac80211/tx.c4
-rw-r--r--net/mac80211/util.c4
-rw-r--r--net/mctp/test/route-test.c2
-rw-r--r--net/mctp/test/utils.c2
-rw-r--r--net/mptcp/bpf.c2
-rw-r--r--net/mptcp/fastopen.c4
-rw-r--r--net/mptcp/options.c79
-rw-r--r--net/mptcp/pm.c127
-rw-r--r--net/mptcp/pm_kernel.c13
-rw-r--r--net/mptcp/pm_userspace.c14
-rw-r--r--net/mptcp/protocol.c35
-rw-r--r--net/mptcp/protocol.h7
-rw-r--r--net/mptcp/sockopt.c29
-rw-r--r--net/mptcp/subflow.c4
-rw-r--r--net/netfilter/ipset/ip_set_core.c5
-rw-r--r--net/netfilter/ipset/ip_set_hash_gen.h57
-rw-r--r--net/netfilter/ipset/ip_set_hash_ipmark.c6
-rw-r--r--net/netfilter/ipset/ip_set_hash_ipport.c5
-rw-r--r--net/netfilter/ipset/ip_set_hash_ipportip.c5
-rw-r--r--net/netfilter/ipset/ip_set_hash_ipportnet.c5
-rw-r--r--net/netfilter/ipvs/ip_vs_conn.c76
-rw-r--r--net/netfilter/ipvs/ip_vs_core.c2
-rw-r--r--net/netfilter/ipvs/ip_vs_ctl.c356
-rw-r--r--net/netfilter/ipvs/ip_vs_est.c83
-rw-r--r--net/netfilter/ipvs/ip_vs_sched.c14
-rw-r--r--net/netfilter/nf_conntrack_broadcast.c1
-rw-r--r--net/netfilter/nf_conntrack_core.c15
-rw-r--r--net/netfilter/nf_conntrack_expect.c1
-rw-r--r--net/netfilter/nf_conntrack_h323_main.c12
-rw-r--r--net/netfilter/nf_conntrack_helper.c18
-rw-r--r--net/netfilter/nf_conntrack_irc.c4
-rw-r--r--net/netfilter/nf_conntrack_netlink.c21
-rw-r--r--net/netfilter/nf_conntrack_pptp.c8
-rw-r--r--net/netfilter/nf_conntrack_proto_gre.c106
-rw-r--r--net/netfilter/nf_conntrack_proto_tcp.c3
-rw-r--r--net/netfilter/nf_conntrack_sip.c10
-rw-r--r--net/netfilter/nf_dup_netdev.c16
-rw-r--r--net/netfilter/nf_flow_table_core.c1
-rw-r--r--net/netfilter/nf_flow_table_ip.c151
-rw-r--r--net/netfilter/nf_flow_table_path.c7
-rw-r--r--net/netfilter/nf_queue.c4
-rw-r--r--net/netfilter/nf_synproxy_core.c26
-rw-r--r--net/netfilter/nf_tables_api.c35
-rw-r--r--net/netfilter/nf_tables_core.c2
-rw-r--r--net/netfilter/nfnetlink_queue.c8
-rw-r--r--net/netfilter/nft_bitwise.c18
-rw-r--r--net/netfilter/nft_byteorder.c60
-rw-r--r--net/netfilter/nft_compat.c45
-rw-r--r--net/netfilter/nft_ct.c10
-rw-r--r--net/netfilter/nft_ct_fast.c2
-rw-r--r--net/netfilter/nft_exthdr.c2
-rw-r--r--net/netfilter/nft_fwd_netdev.c29
-rw-r--r--net/netfilter/nft_inner.c3
-rw-r--r--net/netfilter/nft_osf.c2
-rw-r--r--net/netfilter/nft_payload.c3
-rw-r--r--net/netfilter/nft_tproxy.c8
-rw-r--r--net/netfilter/nft_tunnel.c2
-rw-r--r--net/netfilter/x_tables.c256
-rw-r--r--net/netfilter/xt_CT.c8
-rw-r--r--net/netfilter/xt_NFQUEUE.c2
-rw-r--r--net/netfilter/xt_TCPMSS.c33
-rw-r--r--net/netfilter/xt_TPROXY.c11
-rw-r--r--net/netfilter/xt_addrtype.c25
-rw-r--r--net/netfilter/xt_cpu.c2
-rw-r--r--net/netfilter/xt_devgroup.c18
-rw-r--r--net/netfilter/xt_ecn.c4
-rw-r--r--net/netfilter/xt_hashlimit.c4
-rw-r--r--net/netfilter/xt_osf.c3
-rw-r--r--net/netfilter/xt_physdev.c20
-rw-r--r--net/netfilter/xt_policy.c24
-rw-r--r--net/netfilter/xt_set.c39
-rw-r--r--net/netfilter/xt_tcpmss.c4
-rw-r--r--net/netlink/af_netlink.c11
-rw-r--r--net/netlink/genetlink.c8
-rw-r--r--net/nfc/hci/core.c10
-rw-r--r--net/nfc/llcp_core.c11
-rw-r--r--net/nfc/llcp_sock.c2
-rw-r--r--net/nfc/nci/hci.c10
-rw-r--r--net/openvswitch/vport-geneve.c5
-rw-r--r--net/openvswitch/vport-gre.c5
-rw-r--r--net/openvswitch/vport-netdev.c68
-rw-r--r--net/openvswitch/vport-netdev.h2
-rw-r--r--net/openvswitch/vport-vxlan.c5
-rw-r--r--net/phonet/pep.c19
-rw-r--r--net/psp/psp_main.c42
-rw-r--r--net/rds/ib_cm.c1
-rw-r--r--net/rds/message.c21
-rw-r--r--net/rds/tcp.c9
-rw-r--r--net/rxrpc/ar-internal.h14
-rw-r--r--net/rxrpc/call_event.c20
-rw-r--r--net/rxrpc/call_object.c2
-rw-r--r--net/rxrpc/conn_event.c29
-rw-r--r--net/rxrpc/insecure.c8
-rw-r--r--net/rxrpc/recvmsg.c68
-rw-r--r--net/rxrpc/rxgk.c160
-rw-r--r--net/rxrpc/rxgk_app.c46
-rw-r--r--net/rxrpc/rxgk_common.h66
-rw-r--r--net/rxrpc/rxkad.c115
-rw-r--r--net/sched/act_api.c7
-rw-r--r--net/sched/act_mirred.c77
-rw-r--r--net/sched/act_pedit.c77
-rw-r--r--net/sched/sch_cake.c155
-rw-r--r--net/sched/sch_cbs.c16
-rw-r--r--net/sched/sch_dualpi2.c4
-rw-r--r--net/sched/sch_fq_codel.c39
-rw-r--r--net/sched/sch_netem.c47
-rw-r--r--net/sched/sch_pie.c14
-rw-r--r--net/sched/sch_red.c2
-rw-r--r--net/sched/sch_sfb.c2
-rw-r--r--net/sched/sch_sfq.c48
-rw-r--r--net/sctp/diag.c17
-rw-r--r--net/sctp/sm_make_chunk.c5
-rw-r--r--net/sctp/sm_statefuns.c6
-rw-r--r--net/sctp/socket.c11
-rw-r--r--net/shaper/shaper.c224
-rw-r--r--net/shaper/shaper_nl_gen.c7
-rw-r--r--net/shaper/shaper_nl_gen.h2
-rw-r--r--net/smc/af_smc.c32
-rw-r--r--net/smc/smc_tracepoint.h2
-rw-r--r--net/sunrpc/cache.c5
-rw-r--r--net/tls/tls_sw.c52
-rw-r--r--net/unix/af_unix.c25
-rw-r--r--net/unix/garbage.c6
-rw-r--r--net/vmw_vsock/af_vsock.c49
-rw-r--r--net/vmw_vsock/hyperv_transport.c9
-rw-r--r--net/vmw_vsock/virtio_transport_common.c185
-rw-r--r--net/vmw_vsock/vmci_transport.c14
-rw-r--r--net/wireless/nl80211.c36
-rw-r--r--net/wireless/pmsr.c2
-rw-r--r--net/wireless/scan.c12
-rw-r--r--net/wireless/wext-compat.c2
-rw-r--r--net/xdp/xsk.c126
-rw-r--r--net/xdp/xsk_buff_pool.c3
-rw-r--r--net/xdp/xskmap.c4
-rw-r--r--net/xfrm/xfrm_input.c16
-rw-r--r--net/xfrm/xfrm_ipcomp.c12
-rw-r--r--net/xfrm/xfrm_iptfs.c28
-rw-r--r--net/xfrm/xfrm_output.c20
-rw-r--r--net/xfrm/xfrm_policy.c17
-rw-r--r--net/xfrm/xfrm_state.c35
-rw-r--r--net/xfrm/xfrm_user.c6
298 files changed, 5406 insertions, 2830 deletions
diff --git a/net/6lowpan/iphc.c b/net/6lowpan/iphc.c
index e116d308a8df..37eaff3f7b69 100644
--- a/net/6lowpan/iphc.c
+++ b/net/6lowpan/iphc.c
@@ -1086,12 +1086,12 @@ static u8 lowpan_iphc_mcast_ctx_addr_compress(u8 **hc_ptr,
const struct lowpan_iphc_ctx *ctx,
const struct in6_addr *ipaddr)
{
- u8 data[6];
+ u8 data[6] = {};
/* flags/scope, reserved (RIID) */
memcpy(data, &ipaddr->s6_addr[1], 2);
/* group ID */
- memcpy(&data[1], &ipaddr->s6_addr[11], 4);
+ memcpy(&data[2], &ipaddr->s6_addr[12], 4);
lowpan_push_hc_data(hc_ptr, data, 6);
return LOWPAN_IPHC_DAM_00;
diff --git a/net/802/garp.c b/net/802/garp.c
index 6f563b6797d9..c7a39f298ad6 100644
--- a/net/802/garp.c
+++ b/net/802/garp.c
@@ -453,7 +453,7 @@ static int garp_pdu_parse_attr(struct garp_applicant *app, struct sk_buff *skb,
if (!pskb_may_pull(skb, ga->len))
return -1;
skb_pull(skb, ga->len);
- dlen = sizeof(*ga) - ga->len;
+ dlen = ga->len - sizeof(*ga);
if (attrtype > app->app->maxattr)
return 0;
diff --git a/net/802/mrp.c b/net/802/mrp.c
index ff0e80574e6b..160a3b14569c 100644
--- a/net/802/mrp.c
+++ b/net/802/mrp.c
@@ -703,6 +703,12 @@ static int mrp_pdu_parse_vecattr(struct mrp_applicant *app,
valen = be16_to_cpu(get_unaligned(&mrp_cb(skb)->vah->lenflags) &
MRP_VECATTR_HDR_LEN_MASK);
+ /* If valen is 0, only a LeaveAllEvent is present; FirstValue and
+ * Vector fields are absent per IEEE 802.1ak.
+ */
+ if (valen == 0)
+ return 0;
+
/* The VectorAttribute structure in a PDU carries event information
* about one or more attributes having consecutive values. Only the
* value for the first attribute is contained in the structure. So
@@ -753,6 +759,9 @@ static int mrp_pdu_parse_vecattr(struct mrp_applicant *app,
vaevents %= __MRP_VECATTR_EVENT_MAX;
vaevent = vaevents;
mrp_pdu_parse_vecattr_event(app, skb, vaevent);
+ valen--;
+ mrp_attrvalue_inc(mrp_cb(skb)->attrvalue,
+ mrp_cb(skb)->mh->attrlen);
}
return 0;
}
diff --git a/net/appletalk/aarp.c b/net/appletalk/aarp.c
index e7315c01a299..078fb7a6efa5 100644
--- a/net/appletalk/aarp.c
+++ b/net/appletalk/aarp.c
@@ -393,7 +393,7 @@ static void aarp_purge(void)
*/
static struct aarp_entry *aarp_alloc(void)
{
- struct aarp_entry *a = kmalloc_obj(*a, GFP_ATOMIC);
+ struct aarp_entry *a = kzalloc_obj(*a, GFP_ATOMIC);
if (!a)
return NULL;
@@ -542,6 +542,11 @@ int aarp_send_ddp(struct net_device *dev, struct sk_buff *skb,
struct ddpehdr *ddp = (struct ddpehdr *)skb->data;
int ft = 2;
+ if (!at) {
+ kfree_skb(skb);
+ return NET_XMIT_DROP;
+ }
+
/*
* Compressible ?
*
diff --git a/net/atm/signaling.c b/net/atm/signaling.c
index 358fbe5e4d1d..b991d937205a 100644
--- a/net/atm/signaling.c
+++ b/net/atm/signaling.c
@@ -179,6 +179,7 @@ as_indicate_complete:
break;
default:
pr_alert("bad message type %d\n", (int)msg->type);
+ dev_kfree_skb(skb);
/* Paired with find_get_vcc(msg->vcc) above */
sock_put(sk);
return -EINVAL;
diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c
index f28e9cbf8ad5..b8b1b997960a 100644
--- a/net/batman-adv/bat_iv_ogm.c
+++ b/net/batman-adv/bat_iv_ogm.c
@@ -173,19 +173,12 @@ free_orig_node_hash:
static struct batadv_neigh_node *
batadv_iv_ogm_neigh_new(struct batadv_hard_iface *hard_iface,
const u8 *neigh_addr,
- struct batadv_orig_node *orig_node,
- struct batadv_orig_node *orig_neigh)
+ struct batadv_orig_node *orig_node)
{
struct batadv_neigh_node *neigh_node;
neigh_node = batadv_neigh_node_get_or_create(orig_node,
hard_iface, neigh_addr);
- if (!neigh_node)
- goto out;
-
- neigh_node->orig_node = orig_neigh;
-
-out:
return neigh_node;
}
@@ -231,6 +224,8 @@ static void batadv_iv_ogm_iface_disable(struct batadv_hard_iface *hard_iface)
hard_iface->bat_iv.ogm_buff = NULL;
mutex_unlock(&hard_iface->bat_iv.ogm_buff_mutex);
+
+ cancel_delayed_work_sync(&hard_iface->bat_iv.reschedule_work);
}
static void batadv_iv_ogm_iface_update_mac(struct batadv_hard_iface *hard_iface)
@@ -335,7 +330,7 @@ static void batadv_iv_ogm_send_to_if(struct batadv_forw_packet *forw_packet,
struct batadv_priv *bat_priv = netdev_priv(hard_iface->mesh_iface);
const char *fwd_str;
u8 packet_num;
- s16 buff_pos;
+ int buff_pos;
struct batadv_ogm_packet *batadv_ogm_packet;
struct sk_buff *skb;
u8 *packet_pos;
@@ -543,8 +538,10 @@ out:
* @if_incoming: interface where the packet was received
* @if_outgoing: interface for which the retransmission should be considered
* @own_packet: true if it is a self-generated ogm
+ *
+ * Return: whether forward packet was scheduled
*/
-static void batadv_iv_ogm_aggregate_new(const unsigned char *packet_buff,
+static bool batadv_iv_ogm_aggregate_new(const unsigned char *packet_buff,
int packet_len, unsigned long send_time,
bool direct_link,
struct batadv_hard_iface *if_incoming,
@@ -568,13 +565,13 @@ static void batadv_iv_ogm_aggregate_new(const unsigned char *packet_buff,
skb = netdev_alloc_skb_ip_align(NULL, skb_size);
if (!skb)
- return;
+ return false;
forw_packet_aggr = batadv_forw_packet_alloc(if_incoming, if_outgoing,
queue_left, bat_priv, skb);
if (!forw_packet_aggr) {
kfree_skb(skb);
- return;
+ return false;
}
forw_packet_aggr->skb->priority = TC_PRIO_CONTROL;
@@ -597,6 +594,8 @@ static void batadv_iv_ogm_aggregate_new(const unsigned char *packet_buff,
batadv_iv_send_outstanding_bat_ogm_packet);
batadv_forw_packet_ogmv1_queue(bat_priv, forw_packet_aggr, send_time);
+
+ return true;
}
/* aggregate a new packet into the existing ogm packet */
@@ -624,8 +623,10 @@ static void batadv_iv_ogm_aggregate(struct batadv_forw_packet *forw_packet_aggr,
* @if_outgoing: interface for which the retransmission should be considered
* @own_packet: true if it is a self-generated ogm
* @send_time: timestamp (jiffies) when the packet is to be sent
+ *
+ * Return: whether forward packet was scheduled
*/
-static void batadv_iv_ogm_queue_add(struct batadv_priv *bat_priv,
+static bool batadv_iv_ogm_queue_add(struct batadv_priv *bat_priv,
unsigned char *packet_buff,
int packet_len,
struct batadv_hard_iface *if_incoming,
@@ -677,14 +678,16 @@ static void batadv_iv_ogm_queue_add(struct batadv_priv *bat_priv,
if (!own_packet && atomic_read(&bat_priv->aggregated_ogms))
send_time += max_aggregation_jiffies;
- batadv_iv_ogm_aggregate_new(packet_buff, packet_len,
- send_time, direct_link,
- if_incoming, if_outgoing,
- own_packet);
+ return batadv_iv_ogm_aggregate_new(packet_buff, packet_len,
+ send_time, direct_link,
+ if_incoming, if_outgoing,
+ own_packet);
} else {
batadv_iv_ogm_aggregate(forw_packet_aggr, packet_buff,
packet_len, direct_link);
spin_unlock_bh(&bat_priv->forw_bat_list_lock);
+
+ return true;
}
}
@@ -797,6 +800,9 @@ static void batadv_iv_ogm_schedule_buff(struct batadv_hard_iface *hard_iface)
u32 seqno;
u16 tvlv_len = 0;
unsigned long send_time;
+ bool reschedule = false;
+ bool scheduled;
+ int ret;
lockdep_assert_held(&hard_iface->bat_iv.ogm_buff_mutex);
@@ -820,9 +826,15 @@ static void batadv_iv_ogm_schedule_buff(struct batadv_hard_iface *hard_iface)
* appended as it may alter the tt tvlv container
*/
batadv_tt_local_commit_changes(bat_priv);
- tvlv_len = batadv_tvlv_container_ogm_append(bat_priv, ogm_buff,
- ogm_buff_len,
- BATADV_OGM_HLEN);
+ ret = batadv_tvlv_container_ogm_append(bat_priv, ogm_buff,
+ ogm_buff_len,
+ BATADV_OGM_HLEN);
+ if (ret < 0) {
+ reschedule = true;
+ goto out;
+ }
+
+ tvlv_len = ret;
}
batadv_ogm_packet = (struct batadv_ogm_packet *)(*ogm_buff);
@@ -841,8 +853,11 @@ static void batadv_iv_ogm_schedule_buff(struct batadv_hard_iface *hard_iface)
/* OGMs from secondary interfaces are only scheduled on their
* respective interfaces.
*/
- batadv_iv_ogm_queue_add(bat_priv, *ogm_buff, *ogm_buff_len,
- hard_iface, hard_iface, 1, send_time);
+ scheduled = batadv_iv_ogm_queue_add(bat_priv, *ogm_buff, *ogm_buff_len,
+ hard_iface, hard_iface, 1, send_time);
+ if (!scheduled)
+ reschedule = true;
+
goto out;
}
@@ -854,15 +869,28 @@ static void batadv_iv_ogm_schedule_buff(struct batadv_hard_iface *hard_iface)
if (!kref_get_unless_zero(&tmp_hard_iface->refcount))
continue;
- batadv_iv_ogm_queue_add(bat_priv, *ogm_buff,
- *ogm_buff_len, hard_iface,
- tmp_hard_iface, 1, send_time);
-
+ scheduled = batadv_iv_ogm_queue_add(bat_priv, *ogm_buff,
+ *ogm_buff_len, hard_iface,
+ tmp_hard_iface, 1, send_time);
batadv_hardif_put(tmp_hard_iface);
+
+ if (!scheduled && tmp_hard_iface == hard_iface)
+ reschedule = true;
}
rcu_read_unlock();
out:
+ if (reschedule) {
+ /* there was a failure scheduling the own forward packet.
+ * as result, the batadv_iv_send_outstanding_bat_ogm_packet()
+ * work item is no longer scheduled. it is therefore necessary
+ * to reschedule it manually
+ */
+ queue_delayed_work(batadv_event_workqueue,
+ &hard_iface->bat_iv.reschedule_work,
+ msecs_to_jiffies(atomic_read(&bat_priv->orig_interval)));
+ }
+
batadv_hardif_put(primary_if);
}
@@ -877,6 +905,17 @@ static void batadv_iv_ogm_schedule(struct batadv_hard_iface *hard_iface)
mutex_unlock(&hard_iface->bat_iv.ogm_buff_mutex);
}
+static void batadv_iv_ogm_reschedule(struct work_struct *work)
+{
+ struct delayed_work *delayed_work = to_delayed_work(work);
+ struct batadv_hard_iface *hard_iface;
+
+ hard_iface = container_of(delayed_work,
+ struct batadv_hard_iface,
+ bat_iv.reschedule_work);
+ batadv_iv_ogm_schedule(hard_iface);
+}
+
/**
* batadv_iv_orig_ifinfo_sum() - Get bcast_own sum for originator over interface
* @orig_node: originator which reproadcasted the OGMs directly
@@ -907,6 +946,31 @@ static u8 batadv_iv_orig_ifinfo_sum(struct batadv_orig_node *orig_node,
}
/**
+ * batadv_iv_ogm_neigh_ifinfo_sum() - Get bcast_own sum for a last-hop neighbor
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @neigh_node: last-hop neighbor of an originator
+ *
+ * Return: Number of replied (rebroadcasted) OGMs for the originator currently
+ * announced by the neighbor. Returns 0 if the neighbor's originator entry is
+ * not available anymore.
+ */
+static u8 batadv_iv_ogm_neigh_ifinfo_sum(struct batadv_priv *bat_priv,
+ const struct batadv_neigh_node *neigh_node)
+{
+ struct batadv_orig_node *orig_neigh;
+ u8 sum;
+
+ orig_neigh = batadv_orig_hash_find(bat_priv, neigh_node->addr);
+ if (!orig_neigh)
+ return 0;
+
+ sum = batadv_iv_orig_ifinfo_sum(orig_neigh, neigh_node->if_incoming);
+ batadv_orig_node_put(orig_neigh);
+
+ return sum;
+}
+
+/**
* batadv_iv_ogm_orig_update() - use OGM to update corresponding data in an
* originator
* @bat_priv: the bat priv with all the mesh interface information
@@ -975,17 +1039,9 @@ batadv_iv_ogm_orig_update(struct batadv_priv *bat_priv,
}
if (!neigh_node) {
- struct batadv_orig_node *orig_tmp;
-
- orig_tmp = batadv_iv_ogm_orig_get(bat_priv, ethhdr->h_source);
- if (!orig_tmp)
- goto unlock;
-
neigh_node = batadv_iv_ogm_neigh_new(if_incoming,
ethhdr->h_source,
- orig_node, orig_tmp);
-
- batadv_orig_node_put(orig_tmp);
+ orig_node);
if (!neigh_node)
goto unlock;
} else {
@@ -1037,10 +1093,9 @@ batadv_iv_ogm_orig_update(struct batadv_priv *bat_priv,
*/
if (router_ifinfo &&
neigh_ifinfo->bat_iv.tq_avg == router_ifinfo->bat_iv.tq_avg) {
- sum_orig = batadv_iv_orig_ifinfo_sum(router->orig_node,
- router->if_incoming);
- sum_neigh = batadv_iv_orig_ifinfo_sum(neigh_node->orig_node,
- neigh_node->if_incoming);
+ sum_orig = batadv_iv_ogm_neigh_ifinfo_sum(bat_priv, router);
+ sum_neigh = batadv_iv_ogm_neigh_ifinfo_sum(bat_priv,
+ neigh_node);
if (sum_orig >= sum_neigh)
goto out;
}
@@ -1106,7 +1161,6 @@ static bool batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node,
if (!neigh_node)
neigh_node = batadv_iv_ogm_neigh_new(if_incoming,
orig_neigh_node->orig,
- orig_neigh_node,
orig_neigh_node);
if (!neigh_node)
@@ -1303,6 +1357,32 @@ out:
}
/**
+ * batadv_orig_to_direct_router() - get direct next hop neighbor to an orig address
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @orig_addr: the originator MAC address to search the best next hop router for
+ * @if_outgoing: the interface where the OGM should be sent to
+ *
+ * Return: A neighbor node which is the best router towards the given originator
+ * address. Bonding candidates are ignored.
+ */
+static struct batadv_neigh_node *
+batadv_orig_to_direct_router(struct batadv_priv *bat_priv, u8 *orig_addr,
+ struct batadv_hard_iface *if_outgoing)
+{
+ struct batadv_neigh_node *neigh_node;
+ struct batadv_orig_node *orig_node;
+
+ orig_node = batadv_orig_hash_find(bat_priv, orig_addr);
+ if (!orig_node)
+ return NULL;
+
+ neigh_node = batadv_orig_router_get(orig_node, if_outgoing);
+ batadv_orig_node_put(orig_node);
+
+ return neigh_node;
+}
+
+/**
* batadv_iv_ogm_process_per_outif() - process a batman iv OGM for an outgoing
* interface
* @skb: the skb containing the OGM
@@ -1372,8 +1452,9 @@ batadv_iv_ogm_process_per_outif(const struct sk_buff *skb, int ogm_offset,
router = batadv_orig_router_get(orig_node, if_outgoing);
if (router) {
- router_router = batadv_orig_router_get(router->orig_node,
- if_outgoing);
+ router_router = batadv_orig_to_direct_router(bat_priv,
+ router->addr,
+ if_outgoing);
router_ifinfo = batadv_neigh_ifinfo_get(router, if_outgoing);
}
@@ -2227,6 +2308,8 @@ batadv_iv_ogm_neigh_is_sob(struct batadv_neigh_node *neigh1,
static void batadv_iv_iface_enabled(struct batadv_hard_iface *hard_iface)
{
+ INIT_DELAYED_WORK(&hard_iface->bat_iv.reschedule_work, batadv_iv_ogm_reschedule);
+
/* begin scheduling originator messages on that interface */
batadv_iv_ogm_schedule(hard_iface);
}
diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c
index e3870492dab7..d66ca77b1aaa 100644
--- a/net/batman-adv/bat_v_ogm.c
+++ b/net/batman-adv/bat_v_ogm.c
@@ -113,14 +113,14 @@ static void batadv_v_ogm_start_timer(struct batadv_priv *bat_priv)
/**
* batadv_v_ogm_send_to_if() - send a batman ogm using a given interface
+ * @bat_priv: the bat priv with all the mesh interface information
* @skb: the OGM to send
* @hard_iface: the interface to use to send the OGM
*/
-static void batadv_v_ogm_send_to_if(struct sk_buff *skb,
+static void batadv_v_ogm_send_to_if(struct batadv_priv *bat_priv,
+ struct sk_buff *skb,
struct batadv_hard_iface *hard_iface)
{
- struct batadv_priv *bat_priv = netdev_priv(hard_iface->mesh_iface);
-
if (hard_iface->if_status != BATADV_IF_ACTIVE) {
kfree_skb(skb);
return;
@@ -187,6 +187,7 @@ static void batadv_v_ogm_aggr_list_free(struct batadv_hard_iface *hard_iface)
/**
* batadv_v_ogm_aggr_send() - flush & send aggregation queue
+ * @bat_priv: the bat priv with all the mesh interface information
* @hard_iface: the interface with the aggregation queue to flush
*
* Aggregates all OGMv2 packets currently in the aggregation queue into a
@@ -196,7 +197,8 @@ static void batadv_v_ogm_aggr_list_free(struct batadv_hard_iface *hard_iface)
*
* Caller needs to hold the hard_iface->bat_v.aggr_list.lock.
*/
-static void batadv_v_ogm_aggr_send(struct batadv_hard_iface *hard_iface)
+static void batadv_v_ogm_aggr_send(struct batadv_priv *bat_priv,
+ struct batadv_hard_iface *hard_iface)
{
unsigned int aggr_len = hard_iface->bat_v.aggr_len;
struct sk_buff *skb_aggr;
@@ -226,27 +228,32 @@ static void batadv_v_ogm_aggr_send(struct batadv_hard_iface *hard_iface)
consume_skb(skb);
}
- batadv_v_ogm_send_to_if(skb_aggr, hard_iface);
+ batadv_v_ogm_send_to_if(bat_priv, skb_aggr, hard_iface);
}
/**
* batadv_v_ogm_queue_on_if() - queue a batman ogm on a given interface
+ * @bat_priv: the bat priv with all the mesh interface information
* @skb: the OGM to queue
* @hard_iface: the interface to queue the OGM on
*/
-static void batadv_v_ogm_queue_on_if(struct sk_buff *skb,
+static void batadv_v_ogm_queue_on_if(struct batadv_priv *bat_priv,
+ struct sk_buff *skb,
struct batadv_hard_iface *hard_iface)
{
- struct batadv_priv *bat_priv = netdev_priv(hard_iface->mesh_iface);
+ if (hard_iface->mesh_iface != bat_priv->mesh_iface) {
+ kfree_skb(skb);
+ return;
+ }
if (!atomic_read(&bat_priv->aggregated_ogms)) {
- batadv_v_ogm_send_to_if(skb, hard_iface);
+ batadv_v_ogm_send_to_if(bat_priv, skb, hard_iface);
return;
}
spin_lock_bh(&hard_iface->bat_v.aggr_list.lock);
if (!batadv_v_ogm_queue_left(skb, hard_iface))
- batadv_v_ogm_aggr_send(hard_iface);
+ batadv_v_ogm_aggr_send(bat_priv, hard_iface);
hard_iface->bat_v.aggr_len += batadv_v_ogm_len(skb);
__skb_queue_tail(&hard_iface->bat_v.aggr_list, skb);
@@ -262,10 +269,10 @@ static void batadv_v_ogm_send_meshif(struct batadv_priv *bat_priv)
struct batadv_hard_iface *hard_iface;
struct batadv_ogm2_packet *ogm_packet;
struct sk_buff *skb, *skb_tmp;
- unsigned char *ogm_buff;
+ unsigned char **ogm_buff;
struct list_head *iter;
- int ogm_buff_len;
- u16 tvlv_len = 0;
+ int *ogm_buff_len;
+ u16 tvlv_len;
int ret;
lockdep_assert_held(&bat_priv->bat_v.ogm_buff_mutex);
@@ -273,25 +280,27 @@ static void batadv_v_ogm_send_meshif(struct batadv_priv *bat_priv)
if (atomic_read(&bat_priv->mesh_state) == BATADV_MESH_DEACTIVATING)
goto out;
- ogm_buff = bat_priv->bat_v.ogm_buff;
- ogm_buff_len = bat_priv->bat_v.ogm_buff_len;
+ ogm_buff = &bat_priv->bat_v.ogm_buff;
+ ogm_buff_len = &bat_priv->bat_v.ogm_buff_len;
+
/* tt changes have to be committed before the tvlv data is
* appended as it may alter the tt tvlv container
*/
batadv_tt_local_commit_changes(bat_priv);
- tvlv_len = batadv_tvlv_container_ogm_append(bat_priv, &ogm_buff,
- &ogm_buff_len,
- BATADV_OGM2_HLEN);
+ ret = batadv_tvlv_container_ogm_append(bat_priv, ogm_buff,
+ ogm_buff_len,
+ BATADV_OGM2_HLEN);
+ if (ret < 0)
+ goto reschedule;
- bat_priv->bat_v.ogm_buff = ogm_buff;
- bat_priv->bat_v.ogm_buff_len = ogm_buff_len;
+ tvlv_len = ret;
- skb = netdev_alloc_skb_ip_align(NULL, ETH_HLEN + ogm_buff_len);
+ skb = netdev_alloc_skb_ip_align(NULL, ETH_HLEN + *ogm_buff_len);
if (!skb)
goto reschedule;
skb_reserve(skb, ETH_HLEN);
- skb_put_data(skb, ogm_buff, ogm_buff_len);
+ skb_put_data(skb, *ogm_buff, *ogm_buff_len);
ogm_packet = (struct batadv_ogm2_packet *)skb->data;
ogm_packet->seqno = htonl(atomic_read(&bat_priv->bat_v.ogm_seqno));
@@ -343,7 +352,7 @@ static void batadv_v_ogm_send_meshif(struct batadv_priv *bat_priv)
break;
}
- batadv_v_ogm_queue_on_if(skb_tmp, hard_iface);
+ batadv_v_ogm_queue_on_if(bat_priv, skb_tmp, hard_iface);
batadv_hardif_put(hard_iface);
}
rcu_read_unlock();
@@ -383,12 +392,14 @@ void batadv_v_ogm_aggr_work(struct work_struct *work)
{
struct batadv_hard_iface_bat_v *batv;
struct batadv_hard_iface *hard_iface;
+ struct batadv_priv *bat_priv;
batv = container_of(work, struct batadv_hard_iface_bat_v, aggr_wq.work);
hard_iface = container_of(batv, struct batadv_hard_iface, bat_v);
+ bat_priv = netdev_priv(hard_iface->mesh_iface);
spin_lock_bh(&hard_iface->bat_v.aggr_list.lock);
- batadv_v_ogm_aggr_send(hard_iface);
+ batadv_v_ogm_aggr_send(bat_priv, hard_iface);
spin_unlock_bh(&hard_iface->bat_v.aggr_list.lock);
batadv_v_ogm_start_queue_timer(hard_iface);
@@ -578,7 +589,7 @@ static void batadv_v_ogm_forward(struct batadv_priv *bat_priv,
if_outgoing->net_dev->name, ntohl(ogm_forward->throughput),
ogm_forward->ttl, if_incoming->net_dev->name);
- batadv_v_ogm_queue_on_if(skb, if_outgoing);
+ batadv_v_ogm_queue_on_if(bat_priv, skb, if_outgoing);
out:
batadv_orig_ifinfo_put(orig_ifinfo);
diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c
index 51fe028b9088..ffe854018bd3 100644
--- a/net/batman-adv/bridge_loop_avoidance.c
+++ b/net/batman-adv/bridge_loop_avoidance.c
@@ -318,8 +318,8 @@ batadv_bla_del_backbone_claims(struct batadv_bla_backbone_gw *backbone_gw)
if (claim->backbone_gw != backbone_gw)
continue;
- batadv_claim_put(claim);
hlist_del_rcu(&claim->hash_entry);
+ batadv_claim_put(claim);
}
spin_unlock_bh(list_lock);
}
@@ -356,12 +356,14 @@ static void batadv_bla_send_claim(struct batadv_priv *bat_priv, const u8 *mac,
sizeof(local_claim_dest));
local_claim_dest.type = claimtype;
- mesh_iface = primary_if->mesh_iface;
+ mesh_iface = READ_ONCE(primary_if->mesh_iface);
+ if (!mesh_iface)
+ goto out;
skb = arp_create(ARPOP_REPLY, ETH_P_ARP,
/* IP DST: 0.0.0.0 */
zeroip,
- primary_if->mesh_iface,
+ mesh_iface,
/* IP SRC: 0.0.0.0 */
zeroip,
/* Ethernet DST: Broadcast */
@@ -514,8 +516,8 @@ batadv_bla_get_backbone_gw(struct batadv_priv *bat_priv, const u8 *orig,
entry->crc = BATADV_BLA_CRC_INIT;
entry->bat_priv = bat_priv;
spin_lock_init(&entry->crc_lock);
- atomic_set(&entry->request_sent, 0);
- atomic_set(&entry->wait_periods, 0);
+ entry->state = BATADV_BLA_BACKBONE_GW_SYNCED;
+ entry->wait_periods = 0;
ether_addr_copy(entry->orig, orig);
INIT_WORK(&entry->report_work, batadv_bla_loopdetect_report);
kref_init(&entry->refcount);
@@ -544,9 +546,13 @@ batadv_bla_get_backbone_gw(struct batadv_priv *bat_priv, const u8 *orig,
batadv_bla_send_announce(bat_priv, entry);
/* this will be decreased in the worker thread */
- atomic_inc(&entry->request_sent);
- atomic_set(&entry->wait_periods, BATADV_BLA_WAIT_PERIODS);
- atomic_inc(&bat_priv->bla.num_requests);
+ spin_lock_bh(&bat_priv->bla.num_requests_lock);
+ if (entry->state == BATADV_BLA_BACKBONE_GW_SYNCED) {
+ entry->state = BATADV_BLA_BACKBONE_GW_UNSYNCED;
+ entry->wait_periods = BATADV_BLA_WAIT_PERIODS;
+ atomic_inc(&bat_priv->bla.num_requests);
+ }
+ spin_unlock_bh(&bat_priv->bla.num_requests_lock);
}
return entry;
@@ -649,10 +655,12 @@ static void batadv_bla_send_request(struct batadv_bla_backbone_gw *backbone_gw)
backbone_gw->vid, BATADV_CLAIM_TYPE_REQUEST);
/* no local broadcasts should be sent or received, for now. */
- if (!atomic_read(&backbone_gw->request_sent)) {
+ spin_lock_bh(&backbone_gw->bat_priv->bla.num_requests_lock);
+ if (backbone_gw->state == BATADV_BLA_BACKBONE_GW_SYNCED) {
+ backbone_gw->state = BATADV_BLA_BACKBONE_GW_UNSYNCED;
atomic_inc(&backbone_gw->bat_priv->bla.num_requests);
- atomic_set(&backbone_gw->request_sent, 1);
}
+ spin_unlock_bh(&backbone_gw->bat_priv->bla.num_requests_lock);
}
/**
@@ -723,6 +731,7 @@ static void batadv_bla_add_claim(struct batadv_priv *bat_priv,
if (unlikely(hash_added != 0)) {
/* only local changes happened. */
+ batadv_backbone_gw_put(backbone_gw);
kfree(claim);
return;
}
@@ -872,10 +881,12 @@ static bool batadv_handle_announce(struct batadv_priv *bat_priv, u8 *an_addr,
/* if we have sent a request and the crc was OK,
* we can allow traffic again.
*/
- if (atomic_read(&backbone_gw->request_sent)) {
+ spin_lock_bh(&bat_priv->bla.num_requests_lock);
+ if (backbone_gw->state == BATADV_BLA_BACKBONE_GW_UNSYNCED) {
+ backbone_gw->state = BATADV_BLA_BACKBONE_GW_SYNCED;
atomic_dec(&backbone_gw->bat_priv->bla.num_requests);
- atomic_set(&backbone_gw->request_sent, 0);
}
+ spin_unlock_bh(&bat_priv->bla.num_requests_lock);
}
batadv_backbone_gw_put(backbone_gw);
@@ -1223,6 +1234,7 @@ static void batadv_bla_purge_backbone_gw(struct batadv_priv *bat_priv, int now)
struct hlist_head *head;
struct batadv_hashtable *hash;
spinlock_t *list_lock; /* protects write access to the hash lists */
+ bool purged;
int i;
hash = bat_priv->bla.backbone_hash;
@@ -1233,30 +1245,49 @@ static void batadv_bla_purge_backbone_gw(struct batadv_priv *bat_priv, int now)
head = &hash->table[i];
list_lock = &hash->list_locks[i];
- spin_lock_bh(list_lock);
- hlist_for_each_entry_safe(backbone_gw, node_tmp,
- head, hash_entry) {
- if (now)
- goto purge_now;
- if (!batadv_has_timed_out(backbone_gw->lasttime,
- BATADV_BLA_BACKBONE_TIMEOUT))
- continue;
+ do {
+ purged = false;
+
+ spin_lock_bh(list_lock);
+ hlist_for_each_entry_safe(backbone_gw, node_tmp,
+ head, hash_entry) {
+ if (now)
+ goto purge_now;
+ if (!batadv_has_timed_out(backbone_gw->lasttime,
+ BATADV_BLA_BACKBONE_TIMEOUT))
+ continue;
- batadv_dbg(BATADV_DBG_BLA, backbone_gw->bat_priv,
- "%s(): backbone gw %pM timed out\n",
- __func__, backbone_gw->orig);
+ batadv_dbg(BATADV_DBG_BLA, backbone_gw->bat_priv,
+ "%s(): backbone gw %pM timed out\n",
+ __func__, backbone_gw->orig);
purge_now:
- /* don't wait for the pending request anymore */
- if (atomic_read(&backbone_gw->request_sent))
- atomic_dec(&bat_priv->bla.num_requests);
+ purged = true;
- batadv_bla_del_backbone_claims(backbone_gw);
+ /* don't wait for the pending request anymore */
+ spin_lock_bh(&bat_priv->bla.num_requests_lock);
+ if (backbone_gw->state == BATADV_BLA_BACKBONE_GW_UNSYNCED)
+ atomic_dec(&bat_priv->bla.num_requests);
- hlist_del_rcu(&backbone_gw->hash_entry);
- batadv_backbone_gw_put(backbone_gw);
- }
- spin_unlock_bh(list_lock);
+ backbone_gw->state = BATADV_BLA_BACKBONE_GW_STOPPED;
+ spin_unlock_bh(&bat_priv->bla.num_requests_lock);
+
+ batadv_bla_del_backbone_claims(backbone_gw);
+
+ hlist_del_rcu(&backbone_gw->hash_entry);
+ break;
+ }
+ spin_unlock_bh(list_lock);
+
+ if (purged) {
+ /* reference for pending report_work */
+ if (cancel_work_sync(&backbone_gw->report_work))
+ batadv_backbone_gw_put(backbone_gw);
+
+ /* reference for hash_entry */
+ batadv_backbone_gw_put(backbone_gw);
+ }
+ } while (purged);
}
}
@@ -1288,6 +1319,13 @@ static void batadv_bla_purge_claims(struct batadv_priv *bat_priv,
rcu_read_lock();
hlist_for_each_entry_rcu(claim, head, hash_entry) {
+ /* only purge claims not currently in the process of being released.
+ * Such claims could otherwise have a NULL-ptr backbone_gw set because
+ * they already went through batadv_claim_release()
+ */
+ if (!kref_get_unless_zero(&claim->refcount))
+ continue;
+
backbone_gw = batadv_bla_claim_get_backbone_gw(claim);
if (now)
goto purge_now;
@@ -1313,6 +1351,7 @@ purge_now:
claim->addr, claim->vid);
skip:
batadv_backbone_gw_put(backbone_gw);
+ batadv_claim_put(claim);
}
rcu_read_unlock();
}
@@ -1483,7 +1522,7 @@ static void batadv_bla_periodic_work(struct work_struct *work)
batadv_bla_send_loopdetect(bat_priv,
backbone_gw);
- /* request_sent is only set after creation to avoid
+ /* state is only set to unsynced after creation to avoid
* problems when we are not yet known as backbone gw
* in the backbone.
*
@@ -1492,14 +1531,21 @@ static void batadv_bla_periodic_work(struct work_struct *work)
* some grace time.
*/
- if (atomic_read(&backbone_gw->request_sent) == 0)
- continue;
+ spin_lock_bh(&bat_priv->bla.num_requests_lock);
+ if (backbone_gw->state != BATADV_BLA_BACKBONE_GW_UNSYNCED)
+ goto unlock_next;
- if (!atomic_dec_and_test(&backbone_gw->wait_periods))
- continue;
+ if (backbone_gw->wait_periods > 0)
+ backbone_gw->wait_periods--;
+
+ if (backbone_gw->wait_periods > 0)
+ goto unlock_next;
+ backbone_gw->state = BATADV_BLA_BACKBONE_GW_SYNCED;
atomic_dec(&backbone_gw->bat_priv->bla.num_requests);
- atomic_set(&backbone_gw->request_sent, 0);
+
+unlock_next:
+ spin_unlock_bh(&bat_priv->bla.num_requests_lock);
}
rcu_read_unlock();
}
diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c
index 3efc4cf50b46..0a8bd95e2f99 100644
--- a/net/batman-adv/distributed-arp-table.c
+++ b/net/batman-adv/distributed-arp-table.c
@@ -696,6 +696,9 @@ static bool batadv_dat_forward_data(struct batadv_priv *bat_priv,
goto free_orig;
tmp_skb = pskb_copy_for_clone(skb, GFP_ATOMIC);
+ if (!tmp_skb)
+ goto free_neigh;
+
if (!batadv_send_skb_prepare_unicast_4addr(bat_priv, tmp_skb,
cand[i].orig_node,
packet_subtype)) {
diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c
index f4e45cc25816..e9553db42349 100644
--- a/net/batman-adv/fragmentation.c
+++ b/net/batman-adv/fragmentation.c
@@ -17,6 +17,7 @@
#include <linux/lockdep.h>
#include <linux/minmax.h>
#include <linux/netdevice.h>
+#include <linux/overflow.h>
#include <linux/skbuff.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
@@ -80,9 +81,9 @@ void batadv_frag_purge_orig(struct batadv_orig_node *orig_node,
*
* Return: the maximum size of payload that can be fragmented.
*/
-static int batadv_frag_size_limit(void)
+static size_t batadv_frag_size_limit(void)
{
- int limit = BATADV_FRAG_MAX_FRAG_SIZE;
+ size_t limit = BATADV_FRAG_MAX_FRAG_SIZE;
limit -= sizeof(struct batadv_frag_packet);
limit *= BATADV_FRAG_MAX_FRAGMENTS;
@@ -143,7 +144,9 @@ static bool batadv_frag_insert_packet(struct batadv_orig_node *orig_node,
struct batadv_frag_packet *frag_packet;
u8 bucket;
u16 seqno, hdr_size = sizeof(struct batadv_frag_packet);
+ bool overflow = false;
bool ret = false;
+ size_t data_len;
/* Linearize packet to avoid linearizing 16 packets in a row when doing
* the later merge. Non-linear merge should be added to remove this
@@ -153,6 +156,7 @@ static bool batadv_frag_insert_packet(struct batadv_orig_node *orig_node,
goto err;
frag_packet = (struct batadv_frag_packet *)skb->data;
+ data_len = skb->len - hdr_size;
seqno = ntohs(frag_packet->seqno);
bucket = seqno % BATADV_FRAG_BUFFER_COUNT;
@@ -171,7 +175,7 @@ static bool batadv_frag_insert_packet(struct batadv_orig_node *orig_node,
spin_lock_bh(&chain->lock);
if (batadv_frag_init_chain(chain, seqno)) {
hlist_add_head(&frag_entry_new->list, &chain->fragment_list);
- chain->size = skb->len - hdr_size;
+ chain->size = data_len;
chain->timestamp = jiffies;
chain->total_size = ntohs(frag_packet->total_size);
ret = true;
@@ -188,7 +192,11 @@ static bool batadv_frag_insert_packet(struct batadv_orig_node *orig_node,
if (frag_entry_curr->no < frag_entry_new->no) {
hlist_add_before(&frag_entry_new->list,
&frag_entry_curr->list);
- chain->size += skb->len - hdr_size;
+
+ if (check_add_overflow(chain->size, data_len,
+ &chain->size))
+ overflow = true;
+
chain->timestamp = jiffies;
ret = true;
goto out;
@@ -201,13 +209,16 @@ static bool batadv_frag_insert_packet(struct batadv_orig_node *orig_node,
/* Reached the end of the list, so insert after 'frag_entry_last'. */
if (likely(frag_entry_last)) {
hlist_add_behind(&frag_entry_new->list, &frag_entry_last->list);
- chain->size += skb->len - hdr_size;
+
+ if (check_add_overflow(chain->size, data_len, &chain->size))
+ overflow = true;
+
chain->timestamp = jiffies;
ret = true;
}
out:
- if (chain->size > batadv_frag_size_limit() ||
+ if (overflow || chain->size > batadv_frag_size_limit() ||
chain->total_size != ntohs(frag_packet->total_size) ||
chain->total_size > batadv_frag_size_limit()) {
/* Clear chain if total size of either the list or the packet
@@ -294,6 +305,31 @@ free:
}
/**
+ * batadv_skb_is_frag() - check if newly merged skb contains unicast fragment
+ * @skb: newly merged skb
+ *
+ * Return: if newly merged skb is of type BATADV_UNICAST_FRAG
+ */
+static bool batadv_skb_is_frag(struct sk_buff *skb)
+{
+ struct batadv_ogm_packet *batadv_ogm_packet;
+
+ /* packet should hold at least type and version */
+ if (unlikely(!pskb_may_pull(skb, 2)))
+ return false;
+
+ batadv_ogm_packet = (struct batadv_ogm_packet *)skb->data;
+
+ if (batadv_ogm_packet->version != BATADV_COMPAT_VERSION)
+ return false;
+
+ if (batadv_ogm_packet->packet_type != BATADV_UNICAST_FRAG)
+ return false;
+
+ return true;
+}
+
+/**
* batadv_frag_skb_buffer() - buffer fragment for later merge
* @skb: skb to buffer
* @orig_node_src: originator that the skb is received from
@@ -326,6 +362,16 @@ bool batadv_frag_skb_buffer(struct sk_buff **skb,
if (!skb_out)
goto out_err;
+ /* fragment in fragment is not allowed. otherwise it is possible
+ * to exhaust the stack when receiving a matryoshka-style
+ * "fragments in a fragment packet"
+ */
+ if (batadv_skb_is_frag(skb_out)) {
+ kfree_skb(skb_out);
+ skb_out = NULL;
+ goto out_err;
+ }
+
out:
ret = true;
out_err:
diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c
index 51e9c081a2a4..a9d0346e8332 100644
--- a/net/batman-adv/gateway_client.c
+++ b/net/batman-adv/gateway_client.c
@@ -478,10 +478,14 @@ void batadv_gw_node_delete(struct batadv_priv *bat_priv,
*/
void batadv_gw_node_free(struct batadv_priv *bat_priv)
{
+ struct batadv_gw_node *curr_gw;
struct batadv_gw_node *gw_node;
struct hlist_node *node_tmp;
spin_lock_bh(&bat_priv->gw.list_lock);
+ curr_gw = rcu_replace_pointer(bat_priv->gw.curr_gw, NULL, true);
+ batadv_gw_node_put(curr_gw);
+
hlist_for_each_entry_safe(gw_node, node_tmp,
&bat_priv->gw.gateway_list, list) {
hlist_del_init_rcu(&gw_node->list);
diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c
index 3a35aadd8b41..a4d33ee0fda5 100644
--- a/net/batman-adv/main.c
+++ b/net/batman-adv/main.c
@@ -249,6 +249,7 @@ void batadv_mesh_free(struct net_device *mesh_iface)
atomic_set(&bat_priv->mesh_state, BATADV_MESH_DEACTIVATING);
batadv_purge_outstanding_packets(bat_priv, NULL);
+ batadv_tp_stop_all(bat_priv);
batadv_gw_node_free(bat_priv);
diff --git a/net/batman-adv/mesh-interface.c b/net/batman-adv/mesh-interface.c
index 56ca1c1b83f2..e7aa45bc6b7a 100644
--- a/net/batman-adv/mesh-interface.c
+++ b/net/batman-adv/mesh-interface.c
@@ -787,6 +787,7 @@ static int batadv_meshif_init_late(struct net_device *dev)
atomic_set(&bat_priv->tt.ogm_append_cnt, 0);
#ifdef CONFIG_BATMAN_ADV_BLA
atomic_set(&bat_priv->bla.num_requests, 0);
+ spin_lock_init(&bat_priv->bla.num_requests_lock);
#endif
atomic_set(&bat_priv->tp_num, 0);
diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c
index b3468ccab535..ad4921b659d9 100644
--- a/net/batman-adv/originator.c
+++ b/net/batman-adv/originator.c
@@ -835,8 +835,6 @@ static void batadv_orig_node_free_rcu(struct rcu_head *rcu)
orig_node = container_of(rcu, struct batadv_orig_node, rcu);
- batadv_mcast_purge_orig(orig_node);
-
batadv_frag_purge_orig(orig_node, NULL);
kfree(orig_node->tt_buff);
@@ -887,6 +885,8 @@ void batadv_orig_node_release(struct kref *ref)
}
spin_unlock_bh(&orig_node->vlan_list_lock);
+ batadv_mcast_purge_orig(orig_node);
+
call_rcu(&orig_node->rcu, batadv_orig_node_free_rcu);
}
diff --git a/net/batman-adv/tp_meter.c b/net/batman-adv/tp_meter.c
index 2e42f6b348c8..0fc4ca78e84e 100644
--- a/net/batman-adv/tp_meter.c
+++ b/net/batman-adv/tp_meter.c
@@ -8,10 +8,12 @@
#include "main.h"
#include <linux/atomic.h>
+#include <linux/bug.h>
#include <linux/build_bug.h>
#include <linux/byteorder/generic.h>
#include <linux/cache.h>
#include <linux/compiler.h>
+#include <linux/completion.h>
#include <linux/container_of.h>
#include <linux/err.h>
#include <linux/etherdevice.h>
@@ -253,6 +255,7 @@ static void batadv_tp_batctl_error_notify(enum batadv_tp_meter_reason reason,
* batadv_tp_list_find() - find a tp_vars object in the global list
* @bat_priv: the bat priv with all the mesh interface information
* @dst: the other endpoint MAC address to look for
+ * @role: role of the session
*
* Look for a tp_vars object matching dst as end_point and return it after
* having increment the refcounter. Return NULL is not found
@@ -260,7 +263,8 @@ static void batadv_tp_batctl_error_notify(enum batadv_tp_meter_reason reason,
* Return: matching tp_vars or NULL when no tp_vars with @dst was found
*/
static struct batadv_tp_vars *batadv_tp_list_find(struct batadv_priv *bat_priv,
- const u8 *dst)
+ const u8 *dst,
+ enum batadv_tp_meter_role role)
{
struct batadv_tp_vars *pos, *tp_vars = NULL;
@@ -269,6 +273,9 @@ static struct batadv_tp_vars *batadv_tp_list_find(struct batadv_priv *bat_priv,
if (!batadv_compare_eth(pos->other_end, dst))
continue;
+ if (pos->role != role)
+ continue;
+
/* most of the time this function is invoked during the normal
* process..it makes sens to pay more when the session is
* finished and to speed the process up during the measurement
@@ -285,11 +292,32 @@ static struct batadv_tp_vars *batadv_tp_list_find(struct batadv_priv *bat_priv,
}
/**
+ * batadv_tp_list_active() - check if session from/to destination is ongoing
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @dst: the other endpoint MAC address to look for
+ *
+ * Return: if matching session with @dst was found
+ */
+static bool batadv_tp_list_active(struct batadv_priv *bat_priv, const u8 *dst)
+ __must_hold(&bat_priv->tp_list_lock)
+{
+ struct batadv_tp_vars *tp_vars;
+
+ hlist_for_each_entry_rcu(tp_vars, &bat_priv->tp_list, list) {
+ if (batadv_compare_eth(tp_vars->other_end, dst))
+ return true;
+ }
+
+ return false;
+}
+
+/**
* batadv_tp_list_find_session() - find tp_vars session object in the global
* list
* @bat_priv: the bat priv with all the mesh interface information
* @dst: the other endpoint MAC address to look for
* @session: session identifier
+ * @role: role of the session
*
* Look for a tp_vars object matching dst as end_point, session as tp meter
* session and return it after having increment the refcounter. Return NULL
@@ -299,7 +327,7 @@ static struct batadv_tp_vars *batadv_tp_list_find(struct batadv_priv *bat_priv,
*/
static struct batadv_tp_vars *
batadv_tp_list_find_session(struct batadv_priv *bat_priv, const u8 *dst,
- const u8 *session)
+ const u8 *session, enum batadv_tp_meter_role role)
{
struct batadv_tp_vars *pos, *tp_vars = NULL;
@@ -311,6 +339,9 @@ batadv_tp_list_find_session(struct batadv_priv *bat_priv, const u8 *dst,
if (memcmp(pos->session, session, sizeof(pos->session)) != 0)
continue;
+ if (pos->role != role)
+ continue;
+
/* most of the time this function is invoked during the normal
* process..it makes sense to pay more when the session is
* finished and to speed the process up during the measurement
@@ -365,32 +396,41 @@ static void batadv_tp_vars_put(struct batadv_tp_vars *tp_vars)
}
/**
- * batadv_tp_sender_cleanup() - cleanup sender data and drop and timer
- * @bat_priv: the bat priv with all the mesh interface information
- * @tp_vars: the private data of the current TP meter session to cleanup
+ * batadv_tp_list_detach() - remove tp session from mesh session list once
+ * @tp_vars: the private data of the current TP meter session
*/
-static void batadv_tp_sender_cleanup(struct batadv_priv *bat_priv,
- struct batadv_tp_vars *tp_vars)
+static void batadv_tp_list_detach(struct batadv_tp_vars *tp_vars)
{
- cancel_delayed_work(&tp_vars->finish_work);
+ bool detached = false;
spin_lock_bh(&tp_vars->bat_priv->tp_list_lock);
- hlist_del_rcu(&tp_vars->list);
+ if (!hlist_unhashed(&tp_vars->list)) {
+ hlist_del_init_rcu(&tp_vars->list);
+ detached = true;
+ }
spin_unlock_bh(&tp_vars->bat_priv->tp_list_lock);
+ if (!detached)
+ return;
+
+ atomic_dec(&tp_vars->bat_priv->tp_num);
+
/* drop list reference */
batadv_tp_vars_put(tp_vars);
+}
- atomic_dec(&tp_vars->bat_priv->tp_num);
+/**
+ * batadv_tp_sender_cleanup() - cleanup sender data and drop and timer
+ * @tp_vars: the private data of the current TP meter session to cleanup
+ */
+static void batadv_tp_sender_cleanup(struct batadv_tp_vars *tp_vars)
+{
+ cancel_delayed_work_sync(&tp_vars->finish_work);
+
+ batadv_tp_list_detach(tp_vars);
/* kill the timer and remove its reference */
- timer_delete_sync(&tp_vars->timer);
- /* the worker might have rearmed itself therefore we kill it again. Note
- * that if the worker should run again before invoking the following
- * timer_delete(), it would not re-arm itself once again because the status
- * is OFF now
- */
- timer_delete(&tp_vars->timer);
+ timer_shutdown_sync(&tp_vars->timer);
batadv_tp_vars_put(tp_vars);
}
@@ -402,11 +442,14 @@ static void batadv_tp_sender_cleanup(struct batadv_priv *bat_priv,
static void batadv_tp_sender_end(struct batadv_priv *bat_priv,
struct batadv_tp_vars *tp_vars)
{
+ enum batadv_tp_meter_reason reason;
u32 session_cookie;
+ reason = atomic_read(&tp_vars->send_result);
+
batadv_dbg(BATADV_DBG_TP_METER, bat_priv,
"Test towards %pM finished..shutting down (reason=%d)\n",
- tp_vars->other_end, tp_vars->reason);
+ tp_vars->other_end, reason);
batadv_dbg(BATADV_DBG_TP_METER, bat_priv,
"Last timing stats: SRTT=%ums RTTVAR=%ums RTO=%ums\n",
@@ -419,7 +462,7 @@ static void batadv_tp_sender_end(struct batadv_priv *bat_priv,
session_cookie = batadv_tp_session_cookie(tp_vars->session,
tp_vars->icmp_uid);
- batadv_tp_batctl_notify(tp_vars->reason,
+ batadv_tp_batctl_notify(reason,
tp_vars->other_end,
bat_priv,
tp_vars->start_time,
@@ -435,10 +478,18 @@ static void batadv_tp_sender_end(struct batadv_priv *bat_priv,
static void batadv_tp_sender_shutdown(struct batadv_tp_vars *tp_vars,
enum batadv_tp_meter_reason reason)
{
- if (!atomic_dec_and_test(&tp_vars->sending))
- return;
+ atomic_cmpxchg(&tp_vars->send_result, 0, reason);
+}
- tp_vars->reason = reason;
+/**
+ * batadv_tp_sender_stopped() - check if tp session was stopped with reason
+ * @tp_vars: the private data of the current TP meter session
+ *
+ * Return: whether stop reason was found
+ */
+static bool batadv_tp_sender_stopped(struct batadv_tp_vars *tp_vars)
+{
+ return atomic_read(&tp_vars->send_result) != 0;
}
/**
@@ -468,7 +519,7 @@ static void batadv_tp_reset_sender_timer(struct batadv_tp_vars *tp_vars)
/* most of the time this function is invoked while normal packet
* reception...
*/
- if (unlikely(atomic_read(&tp_vars->sending) == 0))
+ if (unlikely(batadv_tp_sender_stopped(tp_vars)))
/* timer ref will be dropped in batadv_tp_sender_cleanup */
return;
@@ -488,7 +539,7 @@ static void batadv_tp_sender_timeout(struct timer_list *t)
struct batadv_tp_vars *tp_vars = timer_container_of(tp_vars, t, timer);
struct batadv_priv *bat_priv = tp_vars->bat_priv;
- if (atomic_read(&tp_vars->sending) == 0)
+ if (batadv_tp_sender_stopped(tp_vars))
return;
/* if the user waited long enough...shutdown the test */
@@ -643,11 +694,11 @@ static void batadv_tp_recv_ack(struct batadv_priv *bat_priv,
/* find the tp_vars */
tp_vars = batadv_tp_list_find_session(bat_priv, icmp->orig,
- icmp->session);
+ icmp->session, BATADV_TP_SENDER);
if (unlikely(!tp_vars))
return;
- if (unlikely(atomic_read(&tp_vars->sending) == 0))
+ if (unlikely(batadv_tp_sender_stopped(tp_vars)))
goto out;
/* old ACK? silently drop it.. */
@@ -813,21 +864,21 @@ static int batadv_tp_send(void *arg)
if (unlikely(tp_vars->role != BATADV_TP_SENDER)) {
err = BATADV_TP_REASON_DST_UNREACHABLE;
- tp_vars->reason = err;
+ batadv_tp_sender_shutdown(tp_vars, err);
goto out;
}
orig_node = batadv_orig_hash_find(bat_priv, tp_vars->other_end);
if (unlikely(!orig_node)) {
err = BATADV_TP_REASON_DST_UNREACHABLE;
- tp_vars->reason = err;
+ batadv_tp_sender_shutdown(tp_vars, err);
goto out;
}
primary_if = batadv_primary_if_get_selected(bat_priv);
if (unlikely(!primary_if)) {
err = BATADV_TP_REASON_DST_UNREACHABLE;
- tp_vars->reason = err;
+ batadv_tp_sender_shutdown(tp_vars, err);
goto out;
}
@@ -846,7 +897,7 @@ static int batadv_tp_send(void *arg)
queue_delayed_work(batadv_event_workqueue, &tp_vars->finish_work,
msecs_to_jiffies(tp_vars->test_length));
- while (atomic_read(&tp_vars->sending) != 0) {
+ while (!batadv_tp_sender_stopped(tp_vars)) {
if (unlikely(!batadv_tp_avail(tp_vars, payload_len))) {
batadv_tp_wait_available(tp_vars, payload_len);
continue;
@@ -869,8 +920,7 @@ static int batadv_tp_send(void *arg)
"Meter: %s() cannot send packets (%d)\n",
__func__, err);
/* ensure nobody else tries to stop the thread now */
- if (atomic_dec_and_test(&tp_vars->sending))
- tp_vars->reason = err;
+ batadv_tp_sender_shutdown(tp_vars, err);
break;
}
@@ -886,7 +936,8 @@ out:
batadv_orig_node_put(orig_node);
batadv_tp_sender_end(bat_priv, tp_vars);
- batadv_tp_sender_cleanup(bat_priv, tp_vars);
+ batadv_tp_sender_cleanup(tp_vars);
+ complete(&tp_vars->finished);
batadv_tp_vars_put(tp_vars);
@@ -918,7 +969,8 @@ static void batadv_tp_start_kthread(struct batadv_tp_vars *tp_vars)
batadv_tp_vars_put(tp_vars);
/* cleanup of failed tp meter variables */
- batadv_tp_sender_cleanup(bat_priv, tp_vars);
+ batadv_tp_sender_cleanup(tp_vars);
+ complete(&tp_vars->finished);
return;
}
@@ -947,10 +999,15 @@ void batadv_tp_start(struct batadv_priv *bat_priv, const u8 *dst,
/* look for an already existing test towards this node */
spin_lock_bh(&bat_priv->tp_list_lock);
- tp_vars = batadv_tp_list_find(bat_priv, dst);
- if (tp_vars) {
+ if (atomic_read(&bat_priv->mesh_state) != BATADV_MESH_ACTIVE) {
+ spin_unlock_bh(&bat_priv->tp_list_lock);
+ batadv_tp_batctl_error_notify(BATADV_TP_REASON_DST_UNREACHABLE,
+ dst, bat_priv, session_cookie);
+ return;
+ }
+
+ if (batadv_tp_list_active(bat_priv, dst)) {
spin_unlock_bh(&bat_priv->tp_list_lock);
- batadv_tp_vars_put(tp_vars);
batadv_dbg(BATADV_DBG_TP_METER, bat_priv,
"Meter: test to or from the same node already ongoing, aborting\n");
batadv_tp_batctl_error_notify(BATADV_TP_REASON_ALREADY_ONGOING,
@@ -969,6 +1026,7 @@ void batadv_tp_start(struct batadv_priv *bat_priv, const u8 *dst,
tp_vars = kmalloc_obj(*tp_vars, GFP_ATOMIC);
if (!tp_vars) {
+ atomic_dec(&bat_priv->tp_num);
spin_unlock_bh(&bat_priv->tp_list_lock);
batadv_dbg(BATADV_DBG_TP_METER, bat_priv,
"Meter: %s cannot allocate list elements\n",
@@ -982,7 +1040,7 @@ void batadv_tp_start(struct batadv_priv *bat_priv, const u8 *dst,
ether_addr_copy(tp_vars->other_end, dst);
kref_init(&tp_vars->refcount);
tp_vars->role = BATADV_TP_SENDER;
- atomic_set(&tp_vars->sending, 1);
+ atomic_set(&tp_vars->send_result, 0);
memcpy(tp_vars->session, session_id, sizeof(session_id));
tp_vars->icmp_uid = icmp_uid;
@@ -1017,6 +1075,7 @@ void batadv_tp_start(struct batadv_priv *bat_priv, const u8 *dst,
tp_vars->start_time = jiffies;
init_waitqueue_head(&tp_vars->more_bytes);
+ init_completion(&tp_vars->finished);
spin_lock_init(&tp_vars->unacked_lock);
INIT_LIST_HEAD(&tp_vars->unacked_list);
@@ -1069,16 +1128,16 @@ void batadv_tp_stop(struct batadv_priv *bat_priv, const u8 *dst,
if (!orig_node)
return;
- tp_vars = batadv_tp_list_find(bat_priv, orig_node->orig);
+ tp_vars = batadv_tp_list_find(bat_priv, orig_node->orig, BATADV_TP_SENDER);
if (!tp_vars) {
batadv_dbg(BATADV_DBG_TP_METER, bat_priv,
"Meter: trying to interrupt an already over connection\n");
- goto out;
+ goto out_put_orig_node;
}
batadv_tp_sender_shutdown(tp_vars, return_value);
batadv_tp_vars_put(tp_vars);
-out:
+out_put_orig_node:
batadv_orig_node_put(orig_node);
}
@@ -1119,14 +1178,7 @@ static void batadv_tp_receiver_shutdown(struct timer_list *t)
"Shutting down for inactivity (more than %dms) from %pM\n",
BATADV_TP_RECV_TIMEOUT, tp_vars->other_end);
- spin_lock_bh(&tp_vars->bat_priv->tp_list_lock);
- hlist_del_rcu(&tp_vars->list);
- spin_unlock_bh(&tp_vars->bat_priv->tp_list_lock);
-
- /* drop list reference */
- batadv_tp_vars_put(tp_vars);
-
- atomic_dec(&bat_priv->tp_num);
+ batadv_tp_list_detach(tp_vars);
spin_lock_bh(&tp_vars->unacked_lock);
list_for_each_entry_safe(un, safe, &tp_vars->unacked_list, list) {
@@ -1136,6 +1188,9 @@ static void batadv_tp_receiver_shutdown(struct timer_list *t)
spin_unlock_bh(&tp_vars->unacked_lock);
/* drop reference of timer */
+ if (WARN_ON(atomic_xchg(&tp_vars->receiving, 0) != 1))
+ return;
+
batadv_tp_vars_put(tp_vars);
}
@@ -1329,11 +1384,14 @@ static struct batadv_tp_vars *
batadv_tp_init_recv(struct batadv_priv *bat_priv,
const struct batadv_icmp_tp_packet *icmp)
{
- struct batadv_tp_vars *tp_vars;
+ struct batadv_tp_vars *tp_vars = NULL;
spin_lock_bh(&bat_priv->tp_list_lock);
+ if (atomic_read(&bat_priv->mesh_state) != BATADV_MESH_ACTIVE)
+ goto out_unlock;
+
tp_vars = batadv_tp_list_find_session(bat_priv, icmp->orig,
- icmp->session);
+ icmp->session, BATADV_TP_RECEIVER);
if (tp_vars)
goto out_unlock;
@@ -1344,11 +1402,14 @@ batadv_tp_init_recv(struct batadv_priv *bat_priv,
}
tp_vars = kmalloc_obj(*tp_vars, GFP_ATOMIC);
- if (!tp_vars)
+ if (!tp_vars) {
+ atomic_dec(&bat_priv->tp_num);
goto out_unlock;
+ }
ether_addr_copy(tp_vars->other_end, icmp->orig);
tp_vars->role = BATADV_TP_RECEIVER;
+ atomic_set(&tp_vars->receiving, 1);
memcpy(tp_vars->session, icmp->session, sizeof(tp_vars->session));
tp_vars->last_recv = BATADV_TP_FIRST_SEQ;
tp_vars->bat_priv = bat_priv;
@@ -1401,7 +1462,7 @@ static void batadv_tp_recv_msg(struct batadv_priv *bat_priv,
}
} else {
tp_vars = batadv_tp_list_find_session(bat_priv, icmp->orig,
- icmp->session);
+ icmp->session, BATADV_TP_RECEIVER);
if (!tp_vars) {
batadv_dbg(BATADV_DBG_TP_METER, bat_priv,
"Unexpected packet from %pM!\n",
@@ -1410,13 +1471,6 @@ static void batadv_tp_recv_msg(struct batadv_priv *bat_priv,
}
}
- if (unlikely(tp_vars->role != BATADV_TP_RECEIVER)) {
- batadv_dbg(BATADV_DBG_TP_METER, bat_priv,
- "Meter: dropping packet: not expected (role=%u)\n",
- tp_vars->role);
- goto out;
- }
-
tp_vars->last_recv_time = jiffies;
/* if the packet is a duplicate, it may be the case that an ACK has been
@@ -1464,6 +1518,9 @@ void batadv_tp_meter_recv(struct batadv_priv *bat_priv, struct sk_buff *skb)
{
struct batadv_icmp_tp_packet *icmp;
+ if (atomic_read(&bat_priv->mesh_state) != BATADV_MESH_ACTIVE)
+ goto out;
+
icmp = (struct batadv_icmp_tp_packet *)skb->data;
switch (icmp->subtype) {
@@ -1478,10 +1535,62 @@ void batadv_tp_meter_recv(struct batadv_priv *bat_priv, struct sk_buff *skb)
"Received unknown TP Metric packet type %u\n",
icmp->subtype);
}
+
+out:
consume_skb(skb);
}
/**
+ * batadv_tp_stop_all() - stop all currently running tp meter sessions
+ * @bat_priv: the bat priv with all the mesh interface information
+ */
+void batadv_tp_stop_all(struct batadv_priv *bat_priv)
+{
+ struct batadv_tp_vars *tp_vars[BATADV_TP_MAX_NUM];
+ struct batadv_tp_vars *tp_var;
+ size_t count = 0;
+ size_t i;
+
+ spin_lock_bh(&bat_priv->tp_list_lock);
+ hlist_for_each_entry(tp_var, &bat_priv->tp_list, list) {
+ if (WARN_ON_ONCE(count >= BATADV_TP_MAX_NUM))
+ break;
+
+ if (!kref_get_unless_zero(&tp_var->refcount))
+ continue;
+
+ tp_vars[count++] = tp_var;
+ }
+ spin_unlock_bh(&bat_priv->tp_list_lock);
+
+ for (i = 0; i < count; i++) {
+ tp_var = tp_vars[i];
+
+ switch (tp_var->role) {
+ case BATADV_TP_SENDER:
+ batadv_tp_sender_shutdown(tp_var,
+ BATADV_TP_REASON_CANCEL);
+ wake_up(&tp_var->more_bytes);
+ wait_for_completion(&tp_var->finished);
+ break;
+ case BATADV_TP_RECEIVER:
+ batadv_tp_list_detach(tp_var);
+ timer_shutdown_sync(&tp_var->timer);
+
+ if (atomic_xchg(&tp_var->receiving, 0) != 1)
+ break;
+
+ batadv_tp_vars_put(tp_var);
+ break;
+ }
+
+ batadv_tp_vars_put(tp_var);
+ }
+
+ synchronize_net();
+}
+
+/**
* batadv_tp_meter_init() - initialize global tp_meter structures
*/
void __init batadv_tp_meter_init(void)
diff --git a/net/batman-adv/tp_meter.h b/net/batman-adv/tp_meter.h
index f0046d366eac..4e97cd10cd02 100644
--- a/net/batman-adv/tp_meter.h
+++ b/net/batman-adv/tp_meter.h
@@ -17,6 +17,7 @@ void batadv_tp_start(struct batadv_priv *bat_priv, const u8 *dst,
u32 test_length, u32 *cookie);
void batadv_tp_stop(struct batadv_priv *bat_priv, const u8 *dst,
u8 return_value);
+void batadv_tp_stop_all(struct batadv_priv *bat_priv);
void batadv_tp_meter_recv(struct batadv_priv *bat_priv, struct sk_buff *skb);
#endif /* _NET_BATMAN_ADV_TP_METER_H_ */
diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c
index 05cddcf994f6..9f6e67771ffa 100644
--- a/net/batman-adv/translation-table.c
+++ b/net/batman-adv/translation-table.c
@@ -797,24 +797,33 @@ batadv_tt_prepare_tvlv_global_data(struct batadv_orig_node *orig_node,
s32 *tt_len)
{
u16 num_vlan = 0;
- u16 num_entries = 0;
u16 tvlv_len = 0;
unsigned int change_offset;
struct batadv_tvlv_tt_vlan_data *tt_vlan;
struct batadv_orig_node_vlan *vlan;
+ u16 total_entries = 0;
u8 *tt_change_ptr;
+ int vlan_entries;
+ u16 sum_entries;
spin_lock_bh(&orig_node->vlan_list_lock);
hlist_for_each_entry(vlan, &orig_node->vlan_list, list) {
+ vlan_entries = atomic_read(&vlan->tt.num_entries);
+
+ if (check_add_overflow(vlan_entries, total_entries, &sum_entries)) {
+ *tt_len = 0;
+ goto out;
+ }
+
+ total_entries = sum_entries;
num_vlan++;
- num_entries += atomic_read(&vlan->tt.num_entries);
}
change_offset = struct_size(*tt_data, vlan_data, num_vlan);
/* if tt_len is negative, allocate the space needed by the full table */
if (*tt_len < 0)
- *tt_len = batadv_tt_len(num_entries);
+ *tt_len = batadv_tt_len(total_entries);
if (change_offset > U16_MAX || *tt_len > U16_MAX - change_offset) {
*tt_len = 0;
@@ -835,14 +844,26 @@ batadv_tt_prepare_tvlv_global_data(struct batadv_orig_node *orig_node,
(*tt_data)->num_vlan = htons(num_vlan);
tt_vlan = (*tt_data)->vlan_data;
+ num_vlan = 0;
hlist_for_each_entry(vlan, &orig_node->vlan_list, list) {
+ vlan_entries = atomic_read(&vlan->tt.num_entries);
+ if (vlan_entries < 1)
+ continue;
+
tt_vlan->vid = htons(vlan->vid);
tt_vlan->crc = htonl(vlan->tt.crc);
tt_vlan->reserved = 0;
tt_vlan++;
+ num_vlan++;
}
+ /* recalculate in case number of VLANs reduced */
+ change_offset = struct_size(*tt_data, vlan_data, num_vlan);
+ tvlv_len = *tt_len + change_offset;
+
+ (*tt_data)->num_vlan = htons(num_vlan);
+
tt_change_ptr = (u8 *)*tt_data + change_offset;
*tt_change = (struct batadv_tvlv_tt_change *)tt_change_ptr;
@@ -877,21 +898,25 @@ batadv_tt_prepare_tvlv_local_data(struct batadv_priv *bat_priv,
{
struct batadv_tvlv_tt_vlan_data *tt_vlan;
struct batadv_meshif_vlan *vlan;
+ size_t change_offset;
u16 num_vlan = 0;
- u16 vlan_entries = 0;
u16 total_entries = 0;
u16 tvlv_len;
u8 *tt_change_ptr;
- int change_offset;
+ int vlan_entries;
+ u16 sum_entries;
spin_lock_bh(&bat_priv->meshif_vlan_list_lock);
hlist_for_each_entry(vlan, &bat_priv->meshif_vlan_list, list) {
vlan_entries = atomic_read(&vlan->tt.num_entries);
- if (vlan_entries < 1)
- continue;
+ if (check_add_overflow(vlan_entries, total_entries, &sum_entries)) {
+ tvlv_len = 0;
+ goto out;
+ }
+
+ total_entries = sum_entries;
num_vlan++;
- total_entries += vlan_entries;
}
change_offset = struct_size(*tt_data, vlan_data, num_vlan);
@@ -900,8 +925,10 @@ batadv_tt_prepare_tvlv_local_data(struct batadv_priv *bat_priv,
if (*tt_len < 0)
*tt_len = batadv_tt_len(total_entries);
- tvlv_len = *tt_len;
- tvlv_len += change_offset;
+ if (check_add_overflow(*tt_len, change_offset, &tvlv_len)) {
+ tvlv_len = 0;
+ goto out;
+ }
*tt_data = kmalloc(tvlv_len, GFP_ATOMIC);
if (!*tt_data) {
@@ -914,6 +941,7 @@ batadv_tt_prepare_tvlv_local_data(struct batadv_priv *bat_priv,
(*tt_data)->num_vlan = htons(num_vlan);
tt_vlan = (*tt_data)->vlan_data;
+ num_vlan = 0;
hlist_for_each_entry(vlan, &bat_priv->meshif_vlan_list, list) {
vlan_entries = atomic_read(&vlan->tt.num_entries);
if (vlan_entries < 1)
@@ -924,8 +952,15 @@ batadv_tt_prepare_tvlv_local_data(struct batadv_priv *bat_priv,
tt_vlan->reserved = 0;
tt_vlan++;
+ num_vlan++;
}
+ /* recalculate in case number of VLANs reduced */
+ change_offset = struct_size(*tt_data, vlan_data, num_vlan);
+ tvlv_len = *tt_len + change_offset;
+
+ (*tt_data)->num_vlan = htons(num_vlan);
+
tt_change_ptr = (u8 *)*tt_data + change_offset;
*tt_change = (struct batadv_tvlv_tt_change *)tt_change_ptr;
diff --git a/net/batman-adv/tvlv.c b/net/batman-adv/tvlv.c
index 8129a3f9c44d..cc6ac580c620 100644
--- a/net/batman-adv/tvlv.c
+++ b/net/batman-adv/tvlv.c
@@ -8,10 +8,12 @@
#include <linux/byteorder/generic.h>
#include <linux/container_of.h>
+#include <linux/errno.h>
#include <linux/etherdevice.h>
#include <linux/gfp.h>
#include <linux/if_ether.h>
#include <linux/kref.h>
+#include <linux/limits.h>
#include <linux/list.h>
#include <linux/lockdep.h>
#include <linux/netdevice.h>
@@ -159,10 +161,10 @@ batadv_tvlv_container_get(struct batadv_priv *bat_priv, u8 type, u8 version)
*
* Return: size of all currently registered tvlv containers in bytes.
*/
-static u16 batadv_tvlv_container_list_size(struct batadv_priv *bat_priv)
+static size_t batadv_tvlv_container_list_size(struct batadv_priv *bat_priv)
{
struct batadv_tvlv_container *tvlv;
- u16 tvlv_len = 0;
+ size_t tvlv_len = 0;
lockdep_assert_held(&bat_priv->tvlv.container_list_lock);
@@ -306,26 +308,35 @@ static bool batadv_tvlv_realloc_packet_buff(unsigned char **packet_buff,
* The ogm packet might be enlarged or shrunk depending on the current size
* and the size of the to-be-appended tvlv containers.
*
- * Return: size of all appended tvlv containers in bytes.
+ * Return: size of all appended tvlv containers in bytes (max U16_MAX), negative
+ * if operation failed
*/
-u16 batadv_tvlv_container_ogm_append(struct batadv_priv *bat_priv,
+int batadv_tvlv_container_ogm_append(struct batadv_priv *bat_priv,
unsigned char **packet_buff,
int *packet_buff_len, int packet_min_len)
{
struct batadv_tvlv_container *tvlv;
struct batadv_tvlv_hdr *tvlv_hdr;
- u16 tvlv_value_len;
+ size_t tvlv_value_len;
void *tvlv_value;
+ int tvlv_len_ret;
bool ret;
spin_lock_bh(&bat_priv->tvlv.container_list_lock);
tvlv_value_len = batadv_tvlv_container_list_size(bat_priv);
+ if (tvlv_value_len > U16_MAX) {
+ tvlv_len_ret = -E2BIG;
+ goto end;
+ }
ret = batadv_tvlv_realloc_packet_buff(packet_buff, packet_buff_len,
packet_min_len, tvlv_value_len);
-
- if (!ret)
+ if (!ret) {
+ tvlv_len_ret = -ENOMEM;
goto end;
+ }
+
+ tvlv_len_ret = tvlv_value_len;
if (!tvlv_value_len)
goto end;
@@ -344,7 +355,8 @@ u16 batadv_tvlv_container_ogm_append(struct batadv_priv *bat_priv,
end:
spin_unlock_bh(&bat_priv->tvlv.container_list_lock);
- return tvlv_value_len;
+
+ return tvlv_len_ret;
}
/**
diff --git a/net/batman-adv/tvlv.h b/net/batman-adv/tvlv.h
index e5697230d991..f96f6b3f44a0 100644
--- a/net/batman-adv/tvlv.h
+++ b/net/batman-adv/tvlv.h
@@ -16,7 +16,7 @@
void batadv_tvlv_container_register(struct batadv_priv *bat_priv,
u8 type, u8 version,
void *tvlv_value, u16 tvlv_value_len);
-u16 batadv_tvlv_container_ogm_append(struct batadv_priv *bat_priv,
+int batadv_tvlv_container_ogm_append(struct batadv_priv *bat_priv,
unsigned char **packet_buff,
int *packet_buff_len, int packet_min_len);
void batadv_tvlv_ogm_receive(struct batadv_priv *bat_priv,
diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h
index 8fc5fe0e9b05..a01ee46d97f3 100644
--- a/net/batman-adv/types.h
+++ b/net/batman-adv/types.h
@@ -14,6 +14,7 @@
#include <linux/average.h>
#include <linux/bitops.h>
#include <linux/compiler.h>
+#include <linux/completion.h>
#include <linux/if.h>
#include <linux/if_ether.h>
#include <linux/kref.h>
@@ -82,6 +83,9 @@ struct batadv_hard_iface_bat_iv {
/** @ogm_seqno: OGM sequence number - used to identify each OGM */
atomic_t ogm_seqno;
+ /** @reschedule_work: recover OGM schedule after schedule error */
+ struct delayed_work reschedule_work;
+
/** @ogm_buff_mutex: lock protecting ogm_buff and ogm_buff_len */
struct mutex ogm_buff_mutex;
};
@@ -300,7 +304,7 @@ struct batadv_frag_table_entry {
u16 seqno;
/** @size: accumulated size of packets in list */
- u16 size;
+ size_t size;
/** @total_size: expected size of the assembled packet */
u16 total_size;
@@ -451,7 +455,7 @@ struct batadv_orig_node {
* @tt_buff_len: length of the last tt changeset this node received
* from the orig node
*/
- s16 tt_buff_len;
+ u16 tt_buff_len;
/** @tt_buff_lock: lock that protects tt_buff and tt_buff_len */
spinlock_t tt_buff_lock;
@@ -992,7 +996,7 @@ struct batadv_priv_tt {
* @last_changeset_len: length of last tt changeset this host has
* generated
*/
- s16 last_changeset_len;
+ u16 last_changeset_len;
/**
* @last_changeset_lock: lock protecting last_changeset &
@@ -1023,6 +1027,12 @@ struct batadv_priv_bla {
atomic_t num_requests;
/**
+ * @num_requests_lock: locks update num_requests +
+ * batadv_backbone_gw::state + batadv_backbone_gw::wait_periods update
+ */
+ spinlock_t num_requests_lock;
+
+ /**
* @claim_hash: hash table containing mesh nodes this host has claimed
*/
struct batadv_hashtable *claim_hash;
@@ -1319,15 +1329,21 @@ struct batadv_tp_vars {
/** @role: receiver/sender modi */
enum batadv_tp_meter_role role;
- /** @sending: sending binary semaphore: 1 if sending, 0 is not */
- atomic_t sending;
+ /**
+ * @send_result: 0 when sending is ongoing and otherwise
+ * enum batadv_tp_meter_reason
+ */
+ atomic_t send_result;
- /** @reason: reason for a stopped session */
- enum batadv_tp_meter_reason reason;
+ /** @receiving: receiving binary semaphore: 1 if receiving, 0 is not */
+ atomic_t receiving;
/** @finish_work: work item for the finishing procedure */
struct delayed_work finish_work;
+ /** @finished: completion signaled when a sender thread exits */
+ struct completion finished;
+
/** @test_length: test length in milliseconds */
u32 test_length;
@@ -1662,6 +1678,27 @@ struct batadv_priv {
#ifdef CONFIG_BATMAN_ADV_BLA
+enum batadv_bla_backbone_gw_state {
+ /**
+ * @BATADV_BLA_BACKBONE_GW_STOPPED: backbone gw is being removed
+ * and it must not longer work on requests
+ */
+ BATADV_BLA_BACKBONE_GW_STOPPED,
+
+ /**
+ * @BATADV_BLA_BACKBONE_GW_UNSYNCED: backbone was detected out
+ * of sync and a request was send. No traffic is forwarded until the
+ * situation is resolved
+ */
+ BATADV_BLA_BACKBONE_GW_UNSYNCED,
+
+ /**
+ * @BATADV_BLA_BACKBONE_GW_SYNCED: backbone is consider to be in
+ * sync. traffic can be forwarded
+ */
+ BATADV_BLA_BACKBONE_GW_SYNCED,
+};
+
/**
* struct batadv_bla_backbone_gw - batman-adv gateway bridged into the LAN
*/
@@ -1687,16 +1724,12 @@ struct batadv_bla_backbone_gw {
/**
* @wait_periods: grace time for bridge forward delays and bla group
* forming at bootup phase - no bcast traffic is formwared until it has
- * elapsed
+ * elapsed. Must only be access with num_requests_lock.
*/
- atomic_t wait_periods;
+ u8 wait_periods;
- /**
- * @request_sent: if this bool is set to true we are out of sync with
- * this backbone gateway - no bcast traffic is formwared until the
- * situation was resolved
- */
- atomic_t request_sent;
+ /** @state: sync state. Must only be access with num_requests_lock. */
+ enum batadv_bla_backbone_gw_state state;
/** @crc: crc16 checksum over all claims */
u16 crc;
diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c
index 2f03b780b40d..960a19b3e26d 100644
--- a/net/bluetooth/6lowpan.c
+++ b/net/bluetooth/6lowpan.c
@@ -486,6 +486,8 @@ static int send_mcast_pkt(struct sk_buff *skb, struct net_device *netdev)
int ret;
local_skb = skb_clone(skb, GFP_ATOMIC);
+ if (!local_skb)
+ continue;
BT_DBG("xmit %s to %pMR type %u IP %pI6c chan %p",
netdev->name,
diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c
index 33d053d63407..1a6aa3f8d4d6 100644
--- a/net/bluetooth/af_bluetooth.c
+++ b/net/bluetooth/af_bluetooth.c
@@ -154,6 +154,7 @@ struct sock *bt_sock_alloc(struct net *net, struct socket *sock,
sock_init_data(sock, sk);
INIT_LIST_HEAD(&bt_sk(sk)->accept_q);
+ spin_lock_init(&bt_sk(sk)->accept_q_lock);
sock_reset_flag(sk, SOCK_ZAPPED);
@@ -214,6 +215,7 @@ void bt_accept_enqueue(struct sock *parent, struct sock *sk, bool bh)
{
const struct cred *old_cred;
struct pid *old_pid;
+ struct bt_sock *par = bt_sk(parent);
BT_DBG("parent %p, sk %p", parent, sk);
@@ -224,9 +226,13 @@ void bt_accept_enqueue(struct sock *parent, struct sock *sk, bool bh)
else
lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
- list_add_tail(&bt_sk(sk)->accept_q, &bt_sk(parent)->accept_q);
bt_sk(sk)->parent = parent;
+ spin_lock_bh(&par->accept_q_lock);
+ list_add_tail(&bt_sk(sk)->accept_q, &par->accept_q);
+ sk_acceptq_added(parent);
+ spin_unlock_bh(&par->accept_q_lock);
+
/* Copy credentials from parent since for incoming connections the
* socket is allocated by the kernel.
*/
@@ -244,8 +250,6 @@ void bt_accept_enqueue(struct sock *parent, struct sock *sk, bool bh)
bh_unlock_sock(sk);
else
release_sock(sk);
-
- sk_acceptq_added(parent);
}
EXPORT_SYMBOL(bt_accept_enqueue);
@@ -254,45 +258,72 @@ EXPORT_SYMBOL(bt_accept_enqueue);
*/
void bt_accept_unlink(struct sock *sk)
{
+ struct sock *parent = bt_sk(sk)->parent;
+
BT_DBG("sk %p state %d", sk, sk->sk_state);
+ spin_lock_bh(&bt_sk(parent)->accept_q_lock);
list_del_init(&bt_sk(sk)->accept_q);
- sk_acceptq_removed(bt_sk(sk)->parent);
+ sk_acceptq_removed(parent);
+ spin_unlock_bh(&bt_sk(parent)->accept_q_lock);
bt_sk(sk)->parent = NULL;
sock_put(sk);
}
EXPORT_SYMBOL(bt_accept_unlink);
+static struct sock *bt_accept_get(struct sock *parent, struct sock *sk)
+{
+ struct bt_sock *bt = bt_sk(parent);
+ struct sock *next = NULL;
+
+ /* accept_q is modified from child teardown paths too, so take a
+ * temporary reference before dropping the queue lock.
+ */
+ spin_lock_bh(&bt->accept_q_lock);
+
+ if (sk) {
+ if (bt_sk(sk)->parent != parent)
+ goto out;
+
+ if (!list_is_last(&bt_sk(sk)->accept_q, &bt->accept_q)) {
+ next = &list_next_entry(bt_sk(sk), accept_q)->sk;
+ sock_hold(next);
+ }
+ } else if (!list_empty(&bt->accept_q)) {
+ next = &list_first_entry(&bt->accept_q,
+ struct bt_sock, accept_q)->sk;
+ sock_hold(next);
+ }
+
+out:
+ spin_unlock_bh(&bt->accept_q_lock);
+ return next;
+}
+
struct sock *bt_accept_dequeue(struct sock *parent, struct socket *newsock)
{
- struct bt_sock *s, *n;
- struct sock *sk;
+ struct sock *sk, *next;
BT_DBG("parent %p", parent);
restart:
- list_for_each_entry_safe(s, n, &bt_sk(parent)->accept_q, accept_q) {
- sk = (struct sock *)s;
-
+ for (sk = bt_accept_get(parent, NULL); sk; sk = next) {
/* Prevent early freeing of sk due to unlink and sock_kill */
- sock_hold(sk);
lock_sock(sk);
/* Check sk has not already been unlinked via
* bt_accept_unlink() due to serialisation caused by sk locking
*/
- if (!bt_sk(sk)->parent) {
+ if (bt_sk(sk)->parent != parent) {
BT_DBG("sk %p, already unlinked", sk);
release_sock(sk);
sock_put(sk);
- /* Restart the loop as sk is no longer in the list
- * and also avoid a potential infinite loop because
- * list_for_each_entry_safe() is not thread safe.
- */
goto restart;
}
+ next = bt_accept_get(parent, sk);
+
/* sk is safely in the parent list so reduce reference count */
sock_put(sk);
@@ -309,7 +340,19 @@ restart:
if (newsock)
sock_graft(sk, newsock);
+ /* Hand the caller a reference taken while sk is
+ * still locked. bt_accept_unlink() just dropped
+ * the accept-queue reference; without this hold a
+ * concurrent teardown (e.g. l2cap_conn_del() ->
+ * l2cap_sock_kill()) could free sk between
+ * release_sock() and the caller using it. Every
+ * caller drops this with sock_put() when done.
+ */
+ sock_hold(sk);
+
release_sock(sk);
+ if (next)
+ sock_put(next);
return sk;
}
@@ -518,18 +561,28 @@ EXPORT_SYMBOL(bt_sock_stream_recvmsg);
static inline __poll_t bt_accept_poll(struct sock *parent)
{
- struct bt_sock *s, *n;
+ struct bt_sock *bt = bt_sk(parent);
+ struct bt_sock *s;
struct sock *sk;
+ __poll_t mask = 0;
+
+ spin_lock_bh(&bt->accept_q_lock);
+ list_for_each_entry(s, &bt->accept_q, accept_q) {
+ int state;
- list_for_each_entry_safe(s, n, &bt_sk(parent)->accept_q, accept_q) {
sk = (struct sock *)s;
- if (sk->sk_state == BT_CONNECTED ||
- (test_bit(BT_SK_DEFER_SETUP, &bt_sk(parent)->flags) &&
- sk->sk_state == BT_CONNECT2))
- return EPOLLIN | EPOLLRDNORM;
+ state = READ_ONCE(sk->sk_state);
+
+ if (state == BT_CONNECTED ||
+ (test_bit(BT_SK_DEFER_SETUP, &bt->flags) &&
+ state == BT_CONNECT2)) {
+ mask = EPOLLIN | EPOLLRDNORM;
+ break;
+ }
}
+ spin_unlock_bh(&bt->accept_q_lock);
- return 0;
+ return mask;
}
__poll_t bt_sock_poll(struct file *file, struct socket *sock,
diff --git a/net/bluetooth/bnep/core.c b/net/bluetooth/bnep/core.c
index d44987d4515c..5c5f53ff30e8 100644
--- a/net/bluetooth/bnep/core.c
+++ b/net/bluetooth/bnep/core.c
@@ -206,14 +206,11 @@ static int bnep_ctrl_set_mcfilter(struct bnep_session *s, u8 *data, int len)
return 0;
}
-static int bnep_rx_control(struct bnep_session *s, void *data, int len)
+static int bnep_rx_control_cmd(struct bnep_session *s, u8 cmd, void *data,
+ int len)
{
- u8 cmd = *(u8 *)data;
int err = 0;
- data++;
- len--;
-
switch (cmd) {
case BNEP_CMD_NOT_UNDERSTOOD:
case BNEP_SETUP_CONN_RSP:
@@ -254,6 +251,14 @@ static int bnep_rx_control(struct bnep_session *s, void *data, int len)
return err;
}
+static int bnep_rx_control(struct bnep_session *s, void *data, int len)
+{
+ if (len < 1)
+ return -EILSEQ;
+
+ return bnep_rx_control_cmd(s, *(u8 *)data, data + 1, len - 1);
+}
+
static int bnep_rx_extension(struct bnep_session *s, struct sk_buff *skb)
{
struct bnep_ext_hdr *h;
@@ -299,19 +304,26 @@ static int bnep_rx_frame(struct bnep_session *s, struct sk_buff *skb)
{
struct net_device *dev = s->dev;
struct sk_buff *nskb;
+ u8 *data;
u8 type, ctrl_type;
dev->stats.rx_bytes += skb->len;
- type = *(u8 *) skb->data;
- skb_pull(skb, 1);
- ctrl_type = *(u8 *)skb->data;
+ data = skb_pull_data(skb, sizeof(type));
+ if (!data)
+ goto badframe;
+ type = *data;
if ((type & BNEP_TYPE_MASK) >= sizeof(__bnep_rx_hlen))
goto badframe;
if ((type & BNEP_TYPE_MASK) == BNEP_CONTROL) {
- if (bnep_rx_control(s, skb->data, skb->len) < 0) {
+ data = skb_pull_data(skb, sizeof(ctrl_type));
+ if (!data)
+ goto badframe;
+ ctrl_type = *data;
+
+ if (bnep_rx_control_cmd(s, ctrl_type, skb->data, skb->len) < 0) {
dev->stats.tx_errors++;
kfree_skb(skb);
return 0;
@@ -324,15 +336,25 @@ static int bnep_rx_frame(struct bnep_session *s, struct sk_buff *skb)
/* Verify and pull ctrl message since it's already processed */
switch (ctrl_type) {
- case BNEP_SETUP_CONN_REQ:
- /* Pull: ctrl type (1 b), len (1 b), data (len bytes) */
- if (!skb_pull(skb, 2 + *(u8 *)(skb->data + 1) * 2))
+ case BNEP_SETUP_CONN_REQ: {
+ u8 uuid_size;
+
+ /* Pull uuid_size and the dst/src service UUIDs. */
+ data = skb_pull_data(skb, sizeof(uuid_size));
+ if (!data)
+ goto badframe;
+ uuid_size = *data;
+ if (!skb_pull(skb, uuid_size + uuid_size))
goto badframe;
break;
+ }
case BNEP_FILTER_MULTI_ADDR_SET:
case BNEP_FILTER_NET_TYPE_SET:
- /* Pull: ctrl type (1 b), len (2 b), data (len bytes) */
- if (!skb_pull(skb, 3 + *(u16 *)(skb->data + 1) * 2))
+ /* Pull: len (2 b), data (len bytes) */
+ data = skb_pull_data(skb, sizeof(u16));
+ if (!data)
+ goto badframe;
+ if (!skb_pull(skb, get_unaligned_be16(data)))
goto badframe;
break;
default:
@@ -638,8 +660,8 @@ int bnep_add_connection(struct bnep_connadd_req *req, struct socket *sock)
goto failed;
}
- up_write(&bnep_session_sem);
strcpy(req->device, dev->name);
+ up_write(&bnep_session_sem);
return 0;
failed:
diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c
index 3a0592599086..54eabaa46960 100644
--- a/net/bluetooth/hci_conn.c
+++ b/net/bluetooth/hci_conn.c
@@ -480,40 +480,107 @@ bool hci_setup_sync(struct hci_conn *conn, __u16 handle)
return hci_setup_sync_conn(conn, handle);
}
-u8 hci_le_conn_update(struct hci_conn *conn, u16 min, u16 max, u16 latency,
- u16 to_multiplier)
+struct le_conn_update_data {
+ struct hci_conn *conn;
+ u16 min;
+ u16 max;
+ u16 latency;
+ u16 to_multiplier;
+};
+
+static int le_conn_update_sync(struct hci_dev *hdev, void *data)
{
- struct hci_dev *hdev = conn->hdev;
+ struct le_conn_update_data *d = data;
+ struct hci_conn *conn = d->conn;
struct hci_conn_params *params;
struct hci_cp_le_conn_update cp;
+ u16 timeout;
+ u8 store_hint;
+ int err;
+ /* Verify connection is still alive and read conn fields under
+ * the same lock to prevent a concurrent disconnect from freeing
+ * or reusing the connection while we build the HCI command.
+ */
hci_dev_lock(hdev);
- params = hci_conn_params_lookup(hdev, &conn->dst, conn->dst_type);
- if (params) {
- params->conn_min_interval = min;
- params->conn_max_interval = max;
- params->conn_latency = latency;
- params->supervision_timeout = to_multiplier;
+ if (!hci_conn_valid(hdev, conn)) {
+ hci_dev_unlock(hdev);
+ return -ECANCELED;
}
- hci_dev_unlock(hdev);
-
memset(&cp, 0, sizeof(cp));
cp.handle = cpu_to_le16(conn->handle);
- cp.conn_interval_min = cpu_to_le16(min);
- cp.conn_interval_max = cpu_to_le16(max);
- cp.conn_latency = cpu_to_le16(latency);
- cp.supervision_timeout = cpu_to_le16(to_multiplier);
+ cp.conn_interval_min = cpu_to_le16(d->min);
+ cp.conn_interval_max = cpu_to_le16(d->max);
+ cp.conn_latency = cpu_to_le16(d->latency);
+ cp.supervision_timeout = cpu_to_le16(d->to_multiplier);
cp.min_ce_len = cpu_to_le16(0x0000);
cp.max_ce_len = cpu_to_le16(0x0000);
+ timeout = conn->conn_timeout;
- hci_send_cmd(hdev, HCI_OP_LE_CONN_UPDATE, sizeof(cp), &cp);
+ hci_dev_unlock(hdev);
- if (params)
- return 0x01;
+ err = __hci_cmd_sync_status_sk(hdev, HCI_OP_LE_CONN_UPDATE,
+ sizeof(cp), &cp,
+ HCI_EV_LE_CONN_UPDATE_COMPLETE,
+ timeout, NULL);
+ if (err)
+ return err;
- return 0x00;
+ /* Update stored connection parameters after the controller has
+ * confirmed the update via the LE Connection Update Complete event.
+ */
+ hci_dev_lock(hdev);
+
+ params = hci_conn_params_lookup(hdev, &conn->dst, conn->dst_type);
+ if (params) {
+ params->conn_min_interval = d->min;
+ params->conn_max_interval = d->max;
+ params->conn_latency = d->latency;
+ params->supervision_timeout = d->to_multiplier;
+ store_hint = 0x01;
+ } else {
+ store_hint = 0x00;
+ }
+
+ hci_dev_unlock(hdev);
+
+ mgmt_new_conn_param(hdev, &conn->dst, conn->dst_type, store_hint,
+ d->min, d->max, d->latency, d->to_multiplier);
+
+ return 0;
+}
+
+static void le_conn_update_complete(struct hci_dev *hdev, void *data, int err)
+{
+ struct le_conn_update_data *d = data;
+
+ hci_conn_put(d->conn);
+ kfree(d);
+}
+
+void hci_le_conn_update(struct hci_conn *conn, u16 min, u16 max, u16 latency,
+ u16 to_multiplier)
+{
+ struct le_conn_update_data *d;
+
+ d = kzalloc_obj(*d);
+ if (!d)
+ return;
+
+ hci_conn_get(conn);
+ d->conn = conn;
+ d->min = min;
+ d->max = max;
+ d->latency = latency;
+ d->to_multiplier = to_multiplier;
+
+ if (hci_cmd_sync_queue(conn->hdev, le_conn_update_sync, d,
+ le_conn_update_complete) < 0) {
+ hci_conn_put(conn);
+ kfree(d);
+ }
}
void hci_le_start_enc(struct hci_conn *conn, __le16 ediv, __le64 rand,
@@ -803,8 +870,10 @@ static int hci_le_big_terminate(struct hci_dev *hdev, struct hci_conn *conn)
d->big_sync_term = true;
}
- if (!d->pa_sync_term && !d->big_sync_term)
+ if (!d->pa_sync_term && !d->big_sync_term) {
+ kfree(d);
return 0;
+ }
ret = hci_cmd_sync_queue(hdev, big_terminate_sync, d,
terminate_big_destroy);
@@ -2130,6 +2199,9 @@ static int create_big_sync(struct hci_dev *hdev, void *data)
u32 flags = 0;
int err;
+ if (!hci_conn_valid(hdev, conn))
+ return -ECANCELED;
+
if (qos->bcast.out.phys == BIT(1))
flags |= MGMT_ADV_FLAG_SEC_2M;
@@ -2204,11 +2276,24 @@ static void create_big_complete(struct hci_dev *hdev, void *data, int err)
bt_dev_dbg(hdev, "conn %p", conn);
+ if (err == -ECANCELED)
+ goto done;
+
+ hci_dev_lock(hdev);
+
+ if (!hci_conn_valid(hdev, conn))
+ goto unlock;
+
if (err) {
bt_dev_err(hdev, "Unable to create BIG: %d", err);
hci_connect_cfm(conn, err);
hci_conn_del(conn);
}
+
+unlock:
+ hci_dev_unlock(hdev);
+done:
+ hci_conn_put(conn);
}
struct hci_conn *hci_bind_bis(struct hci_dev *hdev, bdaddr_t *dst, __u8 sid,
@@ -2336,10 +2421,11 @@ struct hci_conn *hci_connect_bis(struct hci_dev *hdev, bdaddr_t *dst,
BT_BOUND, &data);
/* Queue start periodic advertising and create BIG */
- err = hci_cmd_sync_queue(hdev, create_big_sync, conn,
+ err = hci_cmd_sync_queue(hdev, create_big_sync, hci_conn_get(conn),
create_big_complete);
if (err < 0) {
hci_conn_drop(conn);
+ hci_conn_put(conn);
return ERR_PTR(err);
}
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index c46c1236ebfa..28d7929dc593 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -539,46 +539,9 @@ static int hci_dev_do_reset(struct hci_dev *hdev)
hci_req_sync_lock(hdev);
- /* Drop queues */
- skb_queue_purge(&hdev->rx_q);
- skb_queue_purge(&hdev->cmd_q);
-
- /* Cancel these to avoid queueing non-chained pending work */
- hci_dev_set_flag(hdev, HCI_CMD_DRAIN_WORKQUEUE);
- /* Wait for
- *
- * if (!hci_dev_test_flag(hdev, HCI_CMD_DRAIN_WORKQUEUE))
- * queue_delayed_work(&hdev->{cmd,ncmd}_timer)
- *
- * inside RCU section to see the flag or complete scheduling.
- */
- synchronize_rcu();
- /* Explicitly cancel works in case scheduled after setting the flag. */
- cancel_delayed_work(&hdev->cmd_timer);
- cancel_delayed_work(&hdev->ncmd_timer);
-
- /* Avoid potential lockdep warnings from the *_flush() calls by
- * ensuring the workqueue is empty up front.
- */
- drain_workqueue(hdev->workqueue);
-
- hci_dev_lock(hdev);
- hci_inquiry_cache_flush(hdev);
- hci_conn_hash_flush(hdev);
- hci_dev_unlock(hdev);
-
- if (hdev->flush)
- hdev->flush(hdev);
-
- hci_dev_clear_flag(hdev, HCI_CMD_DRAIN_WORKQUEUE);
-
- atomic_set(&hdev->cmd_cnt, 1);
- hdev->acl_cnt = 0;
- hdev->sco_cnt = 0;
- hdev->le_cnt = 0;
- hdev->iso_cnt = 0;
-
- ret = hci_reset_sync(hdev);
+ ret = hci_dev_close_sync(hdev);
+ if (!ret)
+ ret = hci_dev_open_sync(hdev);
hci_req_sync_unlock(hdev);
return ret;
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index b2ee6b6a0f56..eea2f810aafa 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -7118,9 +7118,29 @@ static void hci_le_create_big_complete_evt(struct hci_dev *hdev, void *data,
continue;
}
+ if (ev->num_bis <= i) {
+ bt_dev_err(hdev,
+ "Not enough BIS handles for BIG 0x%2.2x",
+ ev->handle);
+ ev->status = HCI_ERROR_UNSPECIFIED;
+ hci_connect_cfm(conn, ev->status);
+ hci_conn_del(conn);
+ continue;
+ }
+
if (hci_conn_set_handle(conn,
- __le16_to_cpu(ev->bis_handle[i++])))
+ __le16_to_cpu(ev->bis_handle[i++]))) {
+ bt_dev_err(hdev,
+ "Failed to set BIS handle for BIG 0x%2.2x",
+ ev->handle);
+ /* Force error so BIG gets terminated as not all BIS
+ * could be connected.
+ */
+ ev->status = HCI_ERROR_UNSPECIFIED;
+ hci_connect_cfm(conn, ev->status);
+ hci_conn_del(conn);
continue;
+ }
conn->state = BT_CONNECTED;
set_bit(HCI_CONN_BIG_CREATED, &conn->flags);
@@ -7129,7 +7149,10 @@ static void hci_le_create_big_complete_evt(struct hci_dev *hdev, void *data,
hci_iso_setup_path(conn);
}
- if (!ev->status && !i)
+ /* If there is an unexpected error or if no BISes have been connected
+ * for the BIG, terminate it.
+ */
+ if (ev->status == HCI_ERROR_UNSPECIFIED || (!ev->status && !i))
/* If no BISes have been connected for the BIG,
* terminate. This is in case all bound connections
* have been closed before the BIG creation
@@ -7168,7 +7191,7 @@ static void hci_le_big_sync_established_evt(struct hci_dev *hdev, void *data,
clear_bit(HCI_CONN_CREATE_BIG_SYNC, &conn->flags);
conn->num_bis = 0;
- memset(conn->bis, 0, sizeof(conn->num_bis));
+ memset(conn->bis, 0, sizeof(conn->bis));
for (i = 0; i < ev->num_bis; i++) {
u16 handle = le16_to_cpu(ev->bis[i]);
diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c
index fd3aacdea512..df23245d6ccd 100644
--- a/net/bluetooth/hci_sync.c
+++ b/net/bluetooth/hci_sync.c
@@ -1725,6 +1725,11 @@ static int hci_adv_bcast_annoucement(struct hci_dev *hdev, struct adv_info *adv)
/* Generate Broadcast ID */
get_random_bytes(bid, sizeof(bid));
len = eir_append_service_data(ad, 0, 0x1852, bid, sizeof(bid));
+ if (adv->adv_data_len > sizeof(ad) - len) {
+ bt_dev_err(hdev, "No room for Broadcast Announcement");
+ return -EINVAL;
+ }
+
memcpy(ad + len, adv->adv_data, adv->adv_data_len);
hci_set_adv_instance_data(hdev, adv->instance, len + adv->adv_data_len,
ad, 0, NULL);
@@ -4438,6 +4443,9 @@ static int hci_le_set_event_mask_sync(struct hci_dev *hdev)
events[4] |= 0x02; /* LE BIG Info Advertising Report */
}
+ if (ll_ext_feature_capable(hdev))
+ events[5] |= BIT(2);
+
if (le_cs_capable(hdev)) {
/* Channel Sounding events */
events[5] |= 0x08; /* LE CS Read Remote Supported Cap Complete event */
@@ -5298,6 +5306,12 @@ int hci_dev_close_sync(struct hci_dev *hdev)
bt_dev_dbg(hdev, "");
+ /* Set HCI_DRAIN_WORKQUEUE flag to prevent queuing work during
+ * reset/close. See hci_cmd_work() and handle_cmd_cnt_and_timer().
+ */
+ hci_dev_set_flag(hdev, HCI_CMD_DRAIN_WORKQUEUE);
+ synchronize_rcu();
+
if (hci_dev_test_flag(hdev, HCI_UNREGISTER)) {
disable_delayed_work(&hdev->power_off);
disable_delayed_work(&hdev->ncmd_timer);
@@ -5321,6 +5335,7 @@ int hci_dev_close_sync(struct hci_dev *hdev)
if (!test_and_clear_bit(HCI_UP, &hdev->flags)) {
cancel_delayed_work_sync(&hdev->cmd_timer);
+ hci_dev_clear_flag(hdev, HCI_CMD_DRAIN_WORKQUEUE);
return err;
}
@@ -5383,6 +5398,10 @@ int hci_dev_close_sync(struct hci_dev *hdev)
/* Reset device */
skb_queue_purge(&hdev->cmd_q);
atomic_set(&hdev->cmd_cnt, 1);
+ hdev->acl_cnt = 0;
+ hdev->sco_cnt = 0;
+ hdev->le_cnt = 0;
+ hdev->iso_cnt = 0;
if (hci_test_quirk(hdev, HCI_QUIRK_RESET_ON_CLOSE) &&
!auto_off && !hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) {
set_bit(HCI_INIT, &hdev->flags);
@@ -5420,6 +5439,7 @@ int hci_dev_close_sync(struct hci_dev *hdev)
/* Clear flags */
hdev->flags &= BIT(HCI_RAW);
hci_dev_clear_volatile_flags(hdev);
+ hci_dev_clear_flag(hdev, HCI_CMD_DRAIN_WORKQUEUE);
memset(hdev->eir, 0, sizeof(hdev->eir));
memset(hdev->dev_class, 0, sizeof(hdev->dev_class));
@@ -6696,6 +6716,7 @@ int hci_le_create_cis_sync(struct hci_dev *hdev)
DEFINE_FLEX(struct hci_cp_le_create_cis, cmd, cis, num_cis, 0x1f);
size_t aux_num_cis = 0;
struct hci_conn *conn;
+ u16 timeout = 0;
u8 cig = BT_ISO_QOS_CIG_UNSET;
/* The spec allows only one pending LE Create CIS command at a time. If
@@ -6766,6 +6787,7 @@ int hci_le_create_cis_sync(struct hci_dev *hdev)
set_bit(HCI_CONN_CREATE_CIS, &conn->flags);
cis->acl_handle = cpu_to_le16(conn->parent->handle);
cis->cis_handle = cpu_to_le16(conn->handle);
+ timeout = conn->conn_timeout;
aux_num_cis++;
if (aux_num_cis >= cmd->num_cis)
@@ -6785,7 +6807,7 @@ done:
return __hci_cmd_sync_status_sk(hdev, HCI_OP_LE_CREATE_CIS,
struct_size(cmd, cis, cmd->num_cis),
cmd, HCI_EVT_LE_CIS_ESTABLISHED,
- conn->conn_timeout, NULL);
+ timeout, NULL);
}
int hci_le_remove_cig_sync(struct hci_dev *hdev, u8 handle)
@@ -7413,9 +7435,6 @@ static int hci_le_read_all_remote_features_sync(struct hci_dev *hdev,
sizeof(cp), &cp,
HCI_EVT_LE_ALL_REMOTE_FEATURES_COMPLETE,
HCI_CMD_TIMEOUT, NULL);
-
- return __hci_cmd_sync_status(hdev, HCI_OP_LE_READ_ALL_REMOTE_FEATURES,
- sizeof(cp), &cp, HCI_CMD_TIMEOUT);
}
static int hci_le_read_remote_features_sync(struct hci_dev *hdev, void *data)
diff --git a/net/bluetooth/hci_sysfs.c b/net/bluetooth/hci_sysfs.c
index 041ce9adc378..8957ce7c21b7 100644
--- a/net/bluetooth/hci_sysfs.c
+++ b/net/bluetooth/hci_sysfs.c
@@ -83,10 +83,12 @@ static void bt_host_release(struct device *dev)
{
struct hci_dev *hdev = to_hci_dev(dev);
- if (hci_dev_test_flag(hdev, HCI_UNREGISTER))
+ if (hci_dev_test_flag(hdev, HCI_UNREGISTER)) {
hci_release_dev(hdev);
- else
+ } else {
+ cleanup_srcu_struct(&hdev->srcu);
kfree(hdev);
+ }
module_put(THIS_MODULE);
}
diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c
index 7bcf8c5ceaee..70344bd3248a 100644
--- a/net/bluetooth/hidp/core.c
+++ b/net/bluetooth/hidp/core.c
@@ -179,12 +179,21 @@ static void hidp_input_report(struct hidp_session *session, struct sk_buff *skb)
{
struct input_dev *dev = session->input;
unsigned char *keys = session->keys;
- unsigned char *udata = skb->data + 1;
- signed char *sdata = skb->data + 1;
- int i, size = skb->len - 1;
+ unsigned char *udata;
+ signed char *sdata;
+ u8 *hdr;
+ int i;
+
+ hdr = skb_pull_data(skb, 1);
+ if (!hdr)
+ return;
- switch (skb->data[0]) {
+ switch (*hdr) {
case 0x01: /* Keyboard report */
+ udata = skb_pull_data(skb, 8);
+ if (!udata)
+ break;
+
for (i = 0; i < 8; i++)
input_report_key(dev, hidp_keycode[i + 224], (udata[0] >> i) & 1);
@@ -213,6 +222,10 @@ static void hidp_input_report(struct hidp_session *session, struct sk_buff *skb)
break;
case 0x02: /* Mouse report */
+ sdata = skb_pull_data(skb, 3);
+ if (!sdata)
+ break;
+
input_report_key(dev, BTN_LEFT, sdata[0] & 0x01);
input_report_key(dev, BTN_RIGHT, sdata[0] & 0x02);
input_report_key(dev, BTN_MIDDLE, sdata[0] & 0x04);
@@ -222,7 +235,7 @@ static void hidp_input_report(struct hidp_session *session, struct sk_buff *skb)
input_report_rel(dev, REL_X, sdata[1]);
input_report_rel(dev, REL_Y, sdata[2]);
- if (size > 3)
+ if (skb->len > 0)
input_report_rel(dev, REL_WHEEL, sdata[3]);
break;
}
@@ -1036,6 +1049,28 @@ static struct hidp_session *hidp_session_find(const bdaddr_t *bdaddr)
}
/*
+ * Consume session->conn: clear the member under hidp_session_sem, then
+ * l2cap_unregister_user() and l2cap_conn_put() the snapshot outside the
+ * sem. At most one caller wins; later callers see NULL and skip. The
+ * reference is the one hidp_session_new() took via l2cap_conn_get().
+ */
+static void hidp_session_unregister_conn(struct hidp_session *session)
+{
+ struct l2cap_conn *conn;
+
+ down_write(&hidp_session_sem);
+ conn = session->conn;
+ if (conn)
+ session->conn = NULL;
+ up_write(&hidp_session_sem);
+
+ if (conn) {
+ l2cap_unregister_user(conn, &session->user);
+ l2cap_conn_put(conn);
+ }
+}
+
+/*
* Start session synchronously
* This starts a session thread and waits until initialization
* is done or returns an error if it couldn't be started.
@@ -1311,8 +1346,7 @@ static int hidp_session_thread(void *arg)
* Instead, this call has the same semantics as if user-space tried to
* delete the session.
*/
- if (session->conn)
- l2cap_unregister_user(session->conn, &session->user);
+ hidp_session_unregister_conn(session);
hidp_session_put(session);
@@ -1418,7 +1452,7 @@ int hidp_connection_del(struct hidp_conndel_req *req)
HIDP_CTRL_VIRTUAL_CABLE_UNPLUG,
NULL, 0);
else
- l2cap_unregister_user(session->conn, &session->user);
+ hidp_session_unregister_conn(session);
hidp_session_put(session);
diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c
index be145e2736b7..3abd8111dda8 100644
--- a/net/bluetooth/iso.c
+++ b/net/bluetooth/iso.c
@@ -337,16 +337,25 @@ static int iso_connect_bis(struct sock *sk)
struct iso_conn *conn;
struct hci_conn *hcon;
struct hci_dev *hdev;
+ bdaddr_t src, dst;
+ u8 src_type, bc_sid;
int err;
- BT_DBG("%pMR (SID 0x%2.2x)", &iso_pi(sk)->src, iso_pi(sk)->bc_sid);
+ lock_sock(sk);
+ bacpy(&src, &iso_pi(sk)->src);
+ bacpy(&dst, &iso_pi(sk)->dst);
+ src_type = iso_pi(sk)->src_type;
+ bc_sid = iso_pi(sk)->bc_sid;
+ release_sock(sk);
- hdev = hci_get_route(&iso_pi(sk)->dst, &iso_pi(sk)->src,
- iso_pi(sk)->src_type);
+ BT_DBG("%pMR (SID 0x%2.2x)", &src, bc_sid);
+
+ hdev = hci_get_route(&dst, &src, src_type);
if (!hdev)
return -EHOSTUNREACH;
hci_dev_lock(hdev);
+ lock_sock(sk);
if (!bis_capable(hdev)) {
err = -EOPNOTSUPP;
@@ -399,13 +408,9 @@ static int iso_connect_bis(struct sock *sk)
goto unlock;
}
- lock_sock(sk);
-
err = iso_chan_add(conn, sk, NULL);
- if (err) {
- release_sock(sk);
+ if (err)
goto unlock;
- }
/* Update source addr of the socket */
bacpy(&iso_pi(sk)->src, &hcon->src);
@@ -421,9 +426,8 @@ static int iso_connect_bis(struct sock *sk)
iso_sock_set_timer(sk, READ_ONCE(sk->sk_sndtimeo));
}
- release_sock(sk);
-
unlock:
+ release_sock(sk);
hci_dev_unlock(hdev);
hci_dev_put(hdev);
return err;
@@ -434,16 +438,24 @@ static int iso_connect_cis(struct sock *sk)
struct iso_conn *conn;
struct hci_conn *hcon;
struct hci_dev *hdev;
+ bdaddr_t src, dst;
+ u8 src_type;
int err;
- BT_DBG("%pMR -> %pMR", &iso_pi(sk)->src, &iso_pi(sk)->dst);
+ lock_sock(sk);
+ bacpy(&src, &iso_pi(sk)->src);
+ bacpy(&dst, &iso_pi(sk)->dst);
+ src_type = iso_pi(sk)->src_type;
+ release_sock(sk);
+
+ BT_DBG("%pMR -> %pMR", &src, &dst);
- hdev = hci_get_route(&iso_pi(sk)->dst, &iso_pi(sk)->src,
- iso_pi(sk)->src_type);
+ hdev = hci_get_route(&dst, &src, src_type);
if (!hdev)
return -EHOSTUNREACH;
hci_dev_lock(hdev);
+ lock_sock(sk);
if (!cis_central_capable(hdev)) {
err = -EOPNOTSUPP;
@@ -498,13 +510,9 @@ static int iso_connect_cis(struct sock *sk)
goto unlock;
}
- lock_sock(sk);
-
err = iso_chan_add(conn, sk, NULL);
- if (err) {
- release_sock(sk);
+ if (err)
goto unlock;
- }
/* Update source addr of the socket */
bacpy(&iso_pi(sk)->src, &hcon->src);
@@ -520,9 +528,8 @@ static int iso_connect_cis(struct sock *sk)
iso_sock_set_timer(sk, READ_ONCE(sk->sk_sndtimeo));
}
- release_sock(sk);
-
unlock:
+ release_sock(sk);
hci_dev_unlock(hdev);
hci_dev_put(hdev);
return err;
@@ -572,7 +579,7 @@ static void iso_recv_frame(struct iso_conn *conn, struct sk_buff *skb)
struct sock *sk;
iso_conn_lock(conn);
- sk = conn->sk;
+ sk = iso_sock_hold(conn);
iso_conn_unlock(conn);
if (!sk)
@@ -581,11 +588,15 @@ static void iso_recv_frame(struct iso_conn *conn, struct sk_buff *skb)
BT_DBG("sk %p len %d", sk, skb->len);
if (sk->sk_state != BT_CONNECTED)
- goto drop;
+ goto drop_put;
- if (!sock_queue_rcv_skb(sk, skb))
+ if (!sock_queue_rcv_skb(sk, skb)) {
+ sock_put(sk);
return;
+ }
+drop_put:
+ sock_put(sk);
drop:
kfree_skb(skb);
}
@@ -759,6 +770,8 @@ static void iso_sock_cleanup_listen(struct sock *parent)
while ((sk = bt_accept_dequeue(parent, NULL))) {
iso_sock_close(sk);
iso_sock_kill(sk);
+ /* Drop the reference handed back by bt_accept_dequeue(). */
+ sock_put(sk);
}
/* If listening socket has a hcon, properly disconnect it */
@@ -866,8 +879,8 @@ static void __iso_sock_close(struct sock *sk)
/* Must be called on unlocked socket. */
static void iso_sock_close(struct sock *sk)
{
- iso_sock_clear_timer(sk);
lock_sock(sk);
+ iso_sock_clear_timer(sk);
__iso_sock_close(sk);
release_sock(sk);
iso_sock_kill(sk);
@@ -1084,7 +1097,7 @@ static int iso_sock_rebind_bc(struct sock *sk, struct sockaddr_iso *sa,
* ordering.
*/
release_sock(sk);
- hci_dev_lock(bis->hdev);
+ hci_dev_lock(hdev);
lock_sock(sk);
if (!iso_pi(sk)->conn || iso_pi(sk)->conn->hcon != bis) {
@@ -1193,7 +1206,7 @@ static int iso_sock_connect(struct socket *sock, struct sockaddr_unsized *addr,
release_sock(sk);
- if (bacmp(&iso_pi(sk)->dst, BDADDR_ANY))
+ if (bacmp(&sa->iso_bdaddr, BDADDR_ANY))
err = iso_connect_cis(sk);
else
err = iso_connect_bis(sk);
@@ -1214,18 +1227,25 @@ static int iso_sock_connect(struct socket *sock, struct sockaddr_unsized *addr,
static int iso_listen_bis(struct sock *sk)
{
- struct hci_dev *hdev;
- int err = 0;
struct iso_conn *conn;
struct hci_conn *hcon;
+ struct hci_dev *hdev;
+ bdaddr_t src, dst;
+ u8 src_type, bc_sid;
+ int err = 0;
+
+ lock_sock(sk);
+ bacpy(&src, &iso_pi(sk)->src);
+ bacpy(&dst, &iso_pi(sk)->dst);
+ src_type = iso_pi(sk)->src_type;
+ bc_sid = iso_pi(sk)->bc_sid;
+ release_sock(sk);
- BT_DBG("%pMR -> %pMR (SID 0x%2.2x)", &iso_pi(sk)->src,
- &iso_pi(sk)->dst, iso_pi(sk)->bc_sid);
+ BT_DBG("%pMR -> %pMR (SID 0x%2.2x)", &src, &dst, bc_sid);
write_lock(&iso_sk_list.lock);
- if (__iso_get_sock_listen_by_sid(&iso_pi(sk)->src, &iso_pi(sk)->dst,
- iso_pi(sk)->bc_sid))
+ if (__iso_get_sock_listen_by_sid(&src, &dst, bc_sid))
err = -EADDRINUSE;
write_unlock(&iso_sk_list.lock);
@@ -1233,8 +1253,7 @@ static int iso_listen_bis(struct sock *sk)
if (err)
return err;
- hdev = hci_get_route(&iso_pi(sk)->dst, &iso_pi(sk)->src,
- iso_pi(sk)->src_type);
+ hdev = hci_get_route(&dst, &src, src_type);
if (!hdev)
return -EHOSTUNREACH;
@@ -1364,8 +1383,13 @@ static int iso_sock_accept(struct socket *sock, struct socket *newsock,
}
ch = bt_accept_dequeue(sk, newsock);
- if (ch)
+ if (ch) {
+ /* Drop the bridging ref from bt_accept_dequeue();
+ * the grafted socket keeps ch alive from here.
+ */
+ sock_put(ch);
break;
+ }
if (!timeo) {
err = -EAGAIN;
@@ -1565,9 +1589,16 @@ static void iso_conn_big_sync(struct sock *sk)
{
int err;
struct hci_dev *hdev;
+ bdaddr_t src, dst;
+ u8 src_type;
- hdev = hci_get_route(&iso_pi(sk)->dst, &iso_pi(sk)->src,
- iso_pi(sk)->src_type);
+ lock_sock(sk);
+ bacpy(&src, &iso_pi(sk)->src);
+ bacpy(&dst, &iso_pi(sk)->dst);
+ src_type = iso_pi(sk)->src_type;
+ release_sock(sk);
+
+ hdev = hci_get_route(&dst, &src, src_type);
if (!hdev)
return;
@@ -1592,6 +1623,7 @@ static void iso_conn_big_sync(struct sock *sk)
release_sock(sk);
hci_dev_unlock(hdev);
+ hci_dev_put(hdev);
}
static int iso_sock_recvmsg(struct socket *sock, struct msghdr *msg,
@@ -2256,8 +2288,10 @@ int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags)
sk = iso_get_sock(hdev, &hdev->bdaddr, bdaddr, BT_LISTEN,
iso_match_sid, ev1);
if (sk && !ev1->status) {
+ lock_sock(sk);
iso_pi(sk)->sync_handle = le16_to_cpu(ev1->handle);
iso_pi(sk)->bc_sid = ev1->sid;
+ release_sock(sk);
}
goto done;
@@ -2268,8 +2302,10 @@ int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags)
sk = iso_get_sock(hdev, &hdev->bdaddr, bdaddr, BT_LISTEN,
iso_match_sid_past, ev1a);
if (sk && !ev1a->status) {
+ lock_sock(sk);
iso_pi(sk)->sync_handle = le16_to_cpu(ev1a->sync_handle);
iso_pi(sk)->bc_sid = ev1a->sid;
+ release_sock(sk);
}
goto done;
@@ -2296,27 +2332,35 @@ int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags)
ev2);
if (sk) {
- int err;
- struct hci_conn *hcon = iso_pi(sk)->conn->hcon;
+ int err = 0;
+ bool big_sync;
+ struct hci_conn *hcon;
+ lock_sock(sk);
+
+ hcon = iso_pi(sk)->conn->hcon;
iso_pi(sk)->qos.bcast.encryption = ev2->encryption;
if (ev2->num_bis < iso_pi(sk)->bc_num_bis)
iso_pi(sk)->bc_num_bis = ev2->num_bis;
- if (!test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags) &&
- !test_and_set_bit(BT_SK_BIG_SYNC, &iso_pi(sk)->flags)) {
+ big_sync = !test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags) &&
+ !test_and_set_bit(BT_SK_BIG_SYNC, &iso_pi(sk)->flags);
+
+ if (big_sync)
err = hci_conn_big_create_sync(hdev, hcon,
&iso_pi(sk)->qos,
iso_pi(sk)->sync_handle,
iso_pi(sk)->bc_num_bis,
iso_pi(sk)->bc_bis);
- if (err) {
- bt_dev_err(hdev, "hci_le_big_create_sync: %d",
- err);
- sock_put(sk);
- sk = NULL;
- }
+
+ release_sock(sk);
+
+ if (big_sync && err) {
+ bt_dev_err(hdev, "hci_le_big_create_sync: %d",
+ err);
+ sock_put(sk);
+ sk = NULL;
}
}
@@ -2370,8 +2414,10 @@ int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags)
if (!base || base_len > BASE_MAX_LENGTH)
goto done;
+ lock_sock(sk);
memcpy(iso_pi(sk)->base, base, base_len);
iso_pi(sk)->base_len = base_len;
+ release_sock(sk);
} else {
/* This is a PA data fragment. Keep pa_data_len set to 0
* until all data has been reassembled.
@@ -2587,6 +2633,11 @@ int iso_recv(struct hci_dev *hdev, u16 handle, struct sk_buff *skb, u16 flags)
break;
case ISO_END:
+ if (!conn->rx_len) {
+ BT_ERR("Unexpected end frame (len %d)", skb->len);
+ goto drop;
+ }
+
skb_copy_from_linear_data(skb, skb_put(conn->rx_skb, skb->len),
skb->len);
conn->rx_len -= skb->len;
diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c
index 77dec104a9c3..c4ccfbda9d78 100644
--- a/net/bluetooth/l2cap_core.c
+++ b/net/bluetooth/l2cap_core.c
@@ -411,8 +411,10 @@ static void l2cap_chan_timeout(struct work_struct *work)
BT_DBG("chan %p state %s", chan, state_to_string(chan->state));
- if (!conn)
+ if (!conn) {
+ l2cap_chan_put(chan);
return;
+ }
mutex_lock(&conn->lock);
/* __set_chan_timer() calls l2cap_chan_hold(chan) while scheduling
@@ -4706,16 +4708,8 @@ static inline int l2cap_conn_param_update_req(struct l2cap_conn *conn,
l2cap_send_cmd(conn, cmd->ident, L2CAP_CONN_PARAM_UPDATE_RSP,
sizeof(rsp), &rsp);
- if (!err) {
- u8 store_hint;
-
- store_hint = hci_le_conn_update(hcon, min, max, latency,
- to_multiplier);
- mgmt_new_conn_param(hcon->hdev, &hcon->dst, hcon->dst_type,
- store_hint, min, max, latency,
- to_multiplier);
-
- }
+ if (!err)
+ hci_le_conn_update(hcon, min, max, latency, to_multiplier);
return 0;
}
@@ -5268,6 +5262,7 @@ static inline int l2cap_ecred_conn_rsp(struct l2cap_conn *conn,
cmd_len -= sizeof(*rsp);
list_for_each_entry_safe(chan, tmp, &conn->chan_l, list) {
+ struct l2cap_chan *orig;
u16 dcid;
if (chan->ident != cmd->ident ||
@@ -5289,8 +5284,10 @@ static inline int l2cap_ecred_conn_rsp(struct l2cap_conn *conn,
BT_DBG("dcid[%d] 0x%4.4x", i, dcid);
+ orig = __l2cap_get_chan_by_dcid(conn, dcid);
+
/* Check if dcid is already in use */
- if (dcid && __l2cap_get_chan_by_dcid(conn, dcid)) {
+ if (dcid && orig) {
/* If a device receives a
* L2CAP_CREDIT_BASED_CONNECTION_RSP packet with an
* already-assigned Destination CID, then both the
@@ -5299,10 +5296,24 @@ static inline int l2cap_ecred_conn_rsp(struct l2cap_conn *conn,
*/
l2cap_chan_del(chan, ECONNREFUSED);
l2cap_chan_unlock(chan);
- chan = __l2cap_get_chan_by_dcid(conn, dcid);
- l2cap_chan_lock(chan);
- l2cap_chan_del(chan, ECONNRESET);
- l2cap_chan_unlock(chan);
+
+ /* Check that the dcid channel mode is
+ * L2CAP_MODE_EXT_FLOWCTL since this procedure is only
+ * valid for that mode and shouldn't disconnect a dcid
+ * in other modes.
+ */
+ if (orig->mode == L2CAP_MODE_EXT_FLOWCTL) {
+ l2cap_chan_lock(orig);
+ /* Disconnect the original channel as it may be
+ * considered connected since dcid has already
+ * been assigned; don't call l2cap_chan_close
+ * directly since that could lead to
+ * l2cap_chan_del and then removing the channel
+ * from the list while we're iterating over it.
+ */
+ __set_chan_timer(orig, 0);
+ l2cap_chan_unlock(orig);
+ }
continue;
}
@@ -5428,7 +5439,7 @@ static inline int l2cap_ecred_reconf_req(struct l2cap_conn *conn,
* configured, the MPS field may be less than the current MPS
* of that channel.
*/
- if (chan[i]->remote_mps >= mps && i) {
+ if (chan[i]->remote_mps > mps && num_scid > 1) {
BT_ERR("chan %p decreased MPS %u -> %u", chan[i],
chan[i]->remote_mps, mps);
result = L2CAP_RECONF_INVALID_MPS;
@@ -5466,14 +5477,20 @@ static inline int l2cap_ecred_reconf_rsp(struct l2cap_conn *conn,
BT_DBG("result 0x%4.4x", result);
- if (!result)
+ if (!result) {
+ list_for_each_entry(chan, &conn->chan_l, list) {
+ if (chan->ident == cmd->ident)
+ chan->ident = 0;
+ }
return 0;
+ }
list_for_each_entry_safe(chan, tmp, &conn->chan_l, list) {
if (chan->ident != cmd->ident)
continue;
- l2cap_chan_hold(chan);
+ if (!l2cap_chan_hold_unless_zero(chan))
+ continue;
l2cap_chan_lock(chan);
l2cap_chan_del(chan, ECONNRESET);
@@ -5626,6 +5643,15 @@ static inline void l2cap_sig_send_rej(struct l2cap_conn *conn, u16 ident)
l2cap_send_cmd(conn, ident, L2CAP_COMMAND_REJ, sizeof(rej), &rej);
}
+static inline void l2cap_sig_send_mtu_rej(struct l2cap_conn *conn, u8 ident)
+{
+ struct l2cap_cmd_rej_mtu rej;
+
+ rej.reason = cpu_to_le16(L2CAP_REJ_MTU_EXCEEDED);
+ rej.max_mtu = cpu_to_le16(L2CAP_SIG_MTU);
+ l2cap_send_cmd(conn, ident, L2CAP_COMMAND_REJ, sizeof(rej), &rej);
+}
+
static inline void l2cap_sig_channel(struct l2cap_conn *conn,
struct sk_buff *skb)
{
@@ -5638,6 +5664,43 @@ static inline void l2cap_sig_channel(struct l2cap_conn *conn,
if (hcon->type != ACL_LINK)
goto drop;
+ /*
+ * Bluetooth Core v5.4, Vol 3, Part A, Section 4: the BR/EDR
+ * signaling channel has a fixed signaling MTU (MTUsig) whose
+ * minimum and default is 48 octets. Section 4.1 says that on
+ * an MTUExceeded command reject the identifier "shall match
+ * the first request command in the L2CAP packet" and that
+ * packets containing only response commands "shall be
+ * silently discarded".
+ *
+ * Linux intentionally deviates from that prescription:
+ *
+ * 1. Silently discarding desynchronizes the peer. The
+ * remote stack never learns its responses were dropped,
+ * so any state machine waiting on a paired response
+ * stalls until its own timer fires.
+ *
+ * 2. Locating "the first request command" requires walking
+ * command headers past MTUsig, i.e. processing bytes
+ * from a packet we have already decided is too large to
+ * process.
+ *
+ * Reject every over-MTUsig signaling packet with one
+ * L2CAP_REJ_MTU_EXCEEDED command reject. The reject's
+ * reason field is what tells the peer that the whole packet
+ * was discarded; the identifier value is informational, so
+ * we use the identifier from the first command header, a
+ * single fixed-offset byte read.
+ */
+ if (skb->len > L2CAP_SIG_MTU) {
+ u8 ident = skb->data[1];
+
+ BT_DBG("signaling packet exceeds MTU: %u > %u",
+ skb->len, L2CAP_SIG_MTU);
+ l2cap_sig_send_mtu_rej(conn, ident);
+ goto drop;
+ }
+
while (skb->len >= L2CAP_CMD_HDR_SIZE) {
u16 len;
@@ -7282,7 +7345,7 @@ static void l2cap_ecred_reconfigure(struct l2cap_chan *chan)
chan->ident = l2cap_get_ident(conn);
l2cap_send_cmd(conn, chan->ident, L2CAP_ECRED_RECONF_REQ,
- sizeof(pdu), &pdu);
+ struct_size(pdu, scid, 1), pdu);
}
int l2cap_chan_reconfigure(struct l2cap_chan *chan, __u16 mtu)
diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c
index 71e8c1b45bce..c138aa4ae266 100644
--- a/net/bluetooth/l2cap_sock.c
+++ b/net/bluetooth/l2cap_sock.c
@@ -349,8 +349,13 @@ static int l2cap_sock_accept(struct socket *sock, struct socket *newsock,
}
nsk = bt_accept_dequeue(sk, newsock);
- if (nsk)
+ if (nsk) {
+ /* Drop the bridging ref from bt_accept_dequeue();
+ * the grafted socket keeps nsk alive from here.
+ */
+ sock_put(nsk);
break;
+ }
if (!timeo) {
err = -EAGAIN;
@@ -1475,22 +1480,56 @@ static void l2cap_sock_cleanup_listen(struct sock *parent)
BT_DBG("parent %p state %s", parent,
state_to_string(parent->sk_state));
- /* Close not yet accepted channels */
+ /* Close not yet accepted channels.
+ *
+ * bt_accept_dequeue() now returns sk with an extra reference held
+ * (taken while sk was still locked) so a concurrent l2cap_conn_del()
+ * -> l2cap_sock_kill() cannot free sk under us.
+ *
+ * cleanup_listen() runs under the parent sk lock, so unlike
+ * l2cap_sock_shutdown() we must NOT take conn->lock here: that would
+ * establish sk_lock -> conn->lock and invert the established
+ * conn->lock -> chan->lock -> sk_lock order (lockdep deadlock).
+ *
+ * Instead, briefly take the child sk lock to fetch and pin its chan.
+ * l2cap_conn_del() reaches the chan free only via
+ * l2cap_chan_del() -> l2cap_sock_teardown_cb(), which itself takes
+ * the child sk lock; holding it across l2cap_chan_hold_unless_zero()
+ * therefore guarantees the chan cannot be freed while we read and
+ * pin it (hold_unless_zero() additionally skips a chan already past
+ * its last reference). We then drop the sk lock before taking
+ * chan->lock, so sk and chan locks are never held together.
+ *
+ * Since we cannot call l2cap_chan_close() without conn->lock,
+ * schedule l2cap_chan_timeout to close the channel; it already
+ * acquires conn->lock -> chan->lock in the correct order.
+ */
while ((sk = bt_accept_dequeue(parent, NULL))) {
- struct l2cap_chan *chan = l2cap_pi(sk)->chan;
+ struct l2cap_chan *chan;
+
+ lock_sock_nested(sk, L2CAP_NESTING_NORMAL);
+ chan = l2cap_chan_hold_unless_zero(l2cap_pi(sk)->chan);
+ release_sock(sk);
+ if (!chan) {
+ /* l2cap_conn_del() already tearing this child down */
+ sock_put(sk);
+ continue;
+ }
BT_DBG("child chan %p state %s", chan,
state_to_string(chan->state));
- l2cap_chan_hold(chan);
l2cap_chan_lock(chan);
-
- __clear_chan_timer(chan);
- l2cap_chan_close(chan, ECONNRESET);
- l2cap_sock_kill(sk);
-
+ /* Since we cannot call l2cap_chan_close() without
+ * conn->lock, schedule its timer to trigger the close
+ * and cleanup of this channel.
+ */
+ if (chan->conn)
+ __set_chan_timer(chan, 0);
l2cap_chan_unlock(chan);
+
l2cap_chan_put(chan);
+ sock_put(sk);
}
}
@@ -1498,6 +1537,9 @@ static struct l2cap_chan *l2cap_sock_new_connection_cb(struct l2cap_chan *chan)
{
struct sock *sk, *parent = chan->data;
+ if (!parent)
+ return NULL;
+
lock_sock(parent);
/* Check for backlog size */
@@ -1657,6 +1699,9 @@ static void l2cap_sock_state_change_cb(struct l2cap_chan *chan, int state,
{
struct sock *sk = chan->data;
+ if (!sk)
+ return;
+
sk->sk_state = state;
if (err)
@@ -1758,6 +1803,9 @@ static long l2cap_sock_get_sndtimeo_cb(struct l2cap_chan *chan)
{
struct sock *sk = chan->data;
+ if (!sk)
+ return 0;
+
return READ_ONCE(sk->sk_sndtimeo);
}
diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c
index b05bb380e5f8..f4aa814a0397 100644
--- a/net/bluetooth/mgmt.c
+++ b/net/bluetooth/mgmt.c
@@ -8638,6 +8638,12 @@ static bool tlv_data_is_valid(struct hci_dev *hdev, u32 adv_flags, u8 *data,
if (!cur_len)
continue;
+ /* If the current field length would exceed the total data
+ * length, then it's invalid.
+ */
+ if (i + cur_len >= len)
+ return false;
+
if (data[i + 1] == EIR_FLAGS &&
(!is_adv_data || flags_managed(adv_flags)))
return false;
@@ -8654,12 +8660,6 @@ static bool tlv_data_is_valid(struct hci_dev *hdev, u32 adv_flags, u8 *data,
if (data[i + 1] == EIR_APPEARANCE &&
appearance_managed(adv_flags))
return false;
-
- /* If the current field length would exceed the total data
- * length, then it's invalid.
- */
- if (i + cur_len >= len)
- return false;
}
return true;
@@ -9110,9 +9110,16 @@ static int add_ext_adv_data(struct sock *sk, struct hci_dev *hdev, void *data,
struct adv_info *adv_instance;
int err = 0;
struct mgmt_pending_cmd *cmd;
+ u16 expected_len;
BT_DBG("%s", hdev->name);
+ expected_len = struct_size(cp, data, cp->adv_data_len +
+ cp->scan_rsp_len);
+ if (expected_len > data_len)
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_EXT_ADV_DATA,
+ MGMT_STATUS_INVALID_PARAMS);
+
hci_dev_lock(hdev);
adv_instance = hci_find_adv_instance(hdev, cp->instance);
diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c
index 611a9a94151e..364b9381c2dc 100644
--- a/net/bluetooth/rfcomm/core.c
+++ b/net/bluetooth/rfcomm/core.c
@@ -1431,10 +1431,15 @@ static int rfcomm_apply_pn(struct rfcomm_dlc *d, int cr, struct rfcomm_pn *pn)
static int rfcomm_recv_pn(struct rfcomm_session *s, int cr, struct sk_buff *skb)
{
- struct rfcomm_pn *pn = (void *) skb->data;
+ struct rfcomm_pn *pn;
struct rfcomm_dlc *d;
- u8 dlci = pn->dlci;
+ u8 dlci;
+
+ pn = skb_pull_data(skb, sizeof(*pn));
+ if (!pn)
+ return -EILSEQ;
+ dlci = pn->dlci;
BT_DBG("session %p state %ld dlci %d", s, s->state, dlci);
if (!dlci)
@@ -1483,8 +1488,8 @@ static int rfcomm_recv_pn(struct rfcomm_session *s, int cr, struct sk_buff *skb)
static int rfcomm_recv_rpn(struct rfcomm_session *s, int cr, int len, struct sk_buff *skb)
{
- struct rfcomm_rpn *rpn = (void *) skb->data;
- u8 dlci = __get_dlci(rpn->dlci);
+ struct rfcomm_rpn *rpn;
+ u8 dlci;
u8 bit_rate = 0;
u8 data_bits = 0;
@@ -1495,15 +1500,16 @@ static int rfcomm_recv_rpn(struct rfcomm_session *s, int cr, int len, struct sk_
u8 xoff_char = 0;
u16 rpn_mask = RFCOMM_RPN_PM_ALL;
- BT_DBG("dlci %d cr %d len 0x%x bitr 0x%x line 0x%x flow 0x%x xonc 0x%x xoffc 0x%x pm 0x%x",
- dlci, cr, len, rpn->bit_rate, rpn->line_settings, rpn->flow_ctrl,
- rpn->xon_char, rpn->xoff_char, rpn->param_mask);
+ if (len == 1) {
+ rpn = skb_pull_data(skb, 1);
+ if (!rpn)
+ return -EILSEQ;
- if (!cr)
- return 0;
+ dlci = __get_dlci(rpn->dlci);
+
+ if (!cr)
+ return 0;
- if (len == 1) {
- /* This is a request, return default (according to ETSI TS 07.10) settings */
bit_rate = RFCOMM_RPN_BR_9600;
data_bits = RFCOMM_RPN_DATA_8;
stop_bits = RFCOMM_RPN_STOP_1;
@@ -1514,6 +1520,19 @@ static int rfcomm_recv_rpn(struct rfcomm_session *s, int cr, int len, struct sk_
goto rpn_out;
}
+ rpn = skb_pull_data(skb, sizeof(*rpn));
+ if (!rpn)
+ return -EILSEQ;
+
+ dlci = __get_dlci(rpn->dlci);
+
+ BT_DBG("dlci %d cr %d len 0x%x bitr 0x%x line 0x%x flow 0x%x xonc 0x%x xoffc 0x%x pm 0x%x",
+ dlci, cr, len, rpn->bit_rate, rpn->line_settings, rpn->flow_ctrl,
+ rpn->xon_char, rpn->xoff_char, rpn->param_mask);
+
+ if (!cr)
+ return 0;
+
/* Check for sane values, ignore/accept bit_rate, 8 bits, 1 stop bit,
* no parity, no flow control lines, normal XON/XOFF chars */
@@ -1589,9 +1608,14 @@ rpn_out:
static int rfcomm_recv_rls(struct rfcomm_session *s, int cr, struct sk_buff *skb)
{
- struct rfcomm_rls *rls = (void *) skb->data;
- u8 dlci = __get_dlci(rls->dlci);
+ struct rfcomm_rls *rls;
+ u8 dlci;
+
+ rls = skb_pull_data(skb, sizeof(*rls));
+ if (!rls)
+ return -EILSEQ;
+ dlci = __get_dlci(rls->dlci);
BT_DBG("dlci %d cr %d status 0x%x", dlci, cr, rls->status);
if (!cr)
@@ -1608,10 +1632,15 @@ static int rfcomm_recv_rls(struct rfcomm_session *s, int cr, struct sk_buff *skb
static int rfcomm_recv_msc(struct rfcomm_session *s, int cr, struct sk_buff *skb)
{
- struct rfcomm_msc *msc = (void *) skb->data;
+ struct rfcomm_msc *msc;
struct rfcomm_dlc *d;
- u8 dlci = __get_dlci(msc->dlci);
+ u8 dlci;
+ msc = skb_pull_data(skb, sizeof(*msc));
+ if (!msc)
+ return -EILSEQ;
+
+ dlci = __get_dlci(msc->dlci);
BT_DBG("dlci %d cr %d v24 0x%x", dlci, cr, msc->v24_sig);
d = rfcomm_dlc_get(s, dlci);
@@ -1644,17 +1673,19 @@ static int rfcomm_recv_msc(struct rfcomm_session *s, int cr, struct sk_buff *skb
static int rfcomm_recv_mcc(struct rfcomm_session *s, struct sk_buff *skb)
{
- struct rfcomm_mcc *mcc = (void *) skb->data;
+ struct rfcomm_mcc *mcc;
u8 type, cr, len;
+ mcc = skb_pull_data(skb, sizeof(*mcc));
+ if (!mcc)
+ return -EILSEQ;
+
cr = __test_cr(mcc->type);
type = __get_mcc_type(mcc->type);
len = __get_mcc_len(mcc->len);
BT_DBG("%p type 0x%x cr %d", s, type, cr);
- skb_pull(skb, 2);
-
switch (type) {
case RFCOMM_PN:
rfcomm_recv_pn(s, cr, skb);
@@ -1715,9 +1746,12 @@ static int rfcomm_recv_data(struct rfcomm_session *s, u8 dlci, int pf, struct sk
}
if (pf && d->cfc) {
- u8 credits = *(u8 *) skb->data; skb_pull(skb, 1);
+ u8 *credits = skb_pull_data(skb, 1);
+
+ if (!credits)
+ goto drop;
- d->tx_credits += credits;
+ d->tx_credits += *credits;
if (d->tx_credits)
clear_bit(RFCOMM_TX_THROTTLED, &d->flags);
}
diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c
index be6639cd6f59..805ed5d28ed6 100644
--- a/net/bluetooth/rfcomm/sock.c
+++ b/net/bluetooth/rfcomm/sock.c
@@ -122,7 +122,7 @@ static struct sock *__rfcomm_get_listen_sock_by_addr(u8 channel, bdaddr_t *src)
}
/* Find socket with channel and source bdaddr.
- * Returns closest match.
+ * Returns closest match with an extra reference held.
*/
static struct sock *rfcomm_get_sock_by_channel(int state, u8 channel, bdaddr_t *src)
{
@@ -136,15 +136,25 @@ static struct sock *rfcomm_get_sock_by_channel(int state, u8 channel, bdaddr_t *
if (rfcomm_pi(sk)->channel == channel) {
/* Exact match. */
- if (!bacmp(&rfcomm_pi(sk)->src, src))
+ if (!bacmp(&rfcomm_pi(sk)->src, src)) {
+ sock_hold(sk);
break;
+ }
/* Closest match */
- if (!bacmp(&rfcomm_pi(sk)->src, BDADDR_ANY))
+ if (!bacmp(&rfcomm_pi(sk)->src, BDADDR_ANY)) {
+ if (sk1)
+ sock_put(sk1);
+
sk1 = sk;
+ sock_hold(sk1);
+ }
}
}
+ if (sk && sk1)
+ sock_put(sk1);
+
read_unlock(&rfcomm_sk_list.lock);
return sk ? sk : sk1;
@@ -180,6 +190,8 @@ static void rfcomm_sock_cleanup_listen(struct sock *parent)
while ((sk = bt_accept_dequeue(parent, NULL))) {
rfcomm_sock_close(sk);
rfcomm_sock_kill(sk);
+ /* Drop the reference handed back by bt_accept_dequeue(). */
+ sock_put(sk);
}
parent->sk_state = BT_CLOSED;
@@ -497,8 +509,13 @@ static int rfcomm_sock_accept(struct socket *sock, struct socket *newsock,
}
nsk = bt_accept_dequeue(sk, newsock);
- if (nsk)
+ if (nsk) {
+ /* Drop the bridging ref from bt_accept_dequeue();
+ * the grafted socket keeps nsk alive from here.
+ */
+ sock_put(nsk);
break;
+ }
if (!timeo) {
err = -EAGAIN;
@@ -934,6 +951,7 @@ int rfcomm_connect_ind(struct rfcomm_session *s, u8 channel, struct rfcomm_dlc *
{
struct sock *sk, *parent;
bdaddr_t src, dst;
+ bool defer_setup = false;
int result = 0;
BT_DBG("session %p channel %d", s, channel);
@@ -947,6 +965,11 @@ int rfcomm_connect_ind(struct rfcomm_session *s, u8 channel, struct rfcomm_dlc *
lock_sock(parent);
+ if (parent->sk_state != BT_LISTEN)
+ goto done;
+
+ defer_setup = test_bit(BT_SK_DEFER_SETUP, &bt_sk(parent)->flags);
+
/* Check for backlog size */
if (sk_acceptq_is_full(parent)) {
BT_DBG("backlog full %d", parent->sk_ack_backlog);
@@ -974,9 +997,11 @@ int rfcomm_connect_ind(struct rfcomm_session *s, u8 channel, struct rfcomm_dlc *
done:
release_sock(parent);
- if (test_bit(BT_SK_DEFER_SETUP, &bt_sk(parent)->flags))
+ if (defer_setup)
parent->sk_state_change(parent);
+ sock_put(parent);
+
return result;
}
diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c
index 18826d4b9c0b..140869e5b2df 100644
--- a/net/bluetooth/sco.c
+++ b/net/bluetooth/sco.c
@@ -312,11 +312,21 @@ static int sco_connect(struct sock *sk)
struct sco_conn *conn;
struct hci_conn *hcon;
struct hci_dev *hdev;
+ bdaddr_t src, dst;
+ struct bt_codec codec;
+ __u16 setting;
int err, type;
- BT_DBG("%pMR -> %pMR", &sco_pi(sk)->src, &sco_pi(sk)->dst);
+ lock_sock(sk);
+ bacpy(&src, &sco_pi(sk)->src);
+ bacpy(&dst, &sco_pi(sk)->dst);
+ setting = sco_pi(sk)->setting;
+ codec = sco_pi(sk)->codec;
+ release_sock(sk);
- hdev = hci_get_route(&sco_pi(sk)->dst, &sco_pi(sk)->src, BDADDR_BREDR);
+ BT_DBG("%pMR -> %pMR", &src, &dst);
+
+ hdev = hci_get_route(&dst, &src, BDADDR_BREDR);
if (!hdev)
return -EHOSTUNREACH;
@@ -327,7 +337,7 @@ static int sco_connect(struct sock *sk)
else
type = SCO_LINK;
- switch (sco_pi(sk)->setting & SCO_AIRMODE_MASK) {
+ switch (setting & SCO_AIRMODE_MASK) {
case SCO_AIRMODE_TRANSP:
if (!lmp_transp_capable(hdev) || !lmp_esco_capable(hdev)) {
err = -EOPNOTSUPP;
@@ -336,8 +346,8 @@ static int sco_connect(struct sock *sk)
break;
}
- hcon = hci_connect_sco(hdev, type, &sco_pi(sk)->dst,
- sco_pi(sk)->setting, &sco_pi(sk)->codec,
+ hcon = hci_connect_sco(hdev, type, &dst,
+ setting, &codec,
READ_ONCE(sk->sk_sndtimeo));
if (IS_ERR(hcon)) {
err = PTR_ERR(hcon);
@@ -472,9 +482,13 @@ static struct sock *sco_get_sock_listen(bdaddr_t *src)
sk1 = sk;
}
+ sk = sk ? sk : sk1;
+ if (sk)
+ sock_hold(sk);
+
read_unlock(&sco_sk_list.lock);
- return sk ? sk : sk1;
+ return sk;
}
static void sco_sock_destruct(struct sock *sk)
@@ -498,6 +512,8 @@ static void sco_sock_cleanup_listen(struct sock *parent)
while ((sk = bt_accept_dequeue(parent, NULL))) {
sco_sock_close(sk);
sco_sock_kill(sk);
+ /* Drop the reference handed back by bt_accept_dequeue(). */
+ sock_put(sk);
}
parent->sk_state = BT_CLOSED;
@@ -515,11 +531,13 @@ static void sco_sock_kill(struct sock *sk)
BT_DBG("sk %p state %d", sk, sk->sk_state);
/* Sock is dead, so set conn->sk to NULL to avoid possible UAF */
+ lock_sock(sk);
if (sco_pi(sk)->conn) {
sco_conn_lock(sco_pi(sk)->conn);
sco_pi(sk)->conn->sk = NULL;
sco_conn_unlock(sco_pi(sk)->conn);
}
+ release_sock(sk);
/* Kill poor orphan */
bt_sock_unlink(&sco_sk_list, sk);
@@ -759,8 +777,13 @@ static int sco_sock_accept(struct socket *sock, struct socket *newsock,
}
ch = bt_accept_dequeue(sk, newsock);
- if (ch)
+ if (ch) {
+ /* Drop the bridging ref from bt_accept_dequeue();
+ * the grafted socket keeps ch alive from here.
+ */
+ sock_put(ch);
break;
+ }
if (!timeo) {
err = -EAGAIN;
@@ -1365,40 +1388,51 @@ static int sco_sock_release(struct socket *sock)
static void sco_conn_ready(struct sco_conn *conn)
{
- struct sock *parent;
- struct sock *sk = conn->sk;
+ struct sock *parent, *sk;
+
+ sco_conn_lock(conn);
+ sk = sco_sock_hold(conn);
+ sco_conn_unlock(conn);
BT_DBG("conn %p", conn);
if (sk) {
lock_sock(sk);
- sco_sock_clear_timer(sk);
- sk->sk_state = BT_CONNECTED;
- sk->sk_state_change(sk);
+
+ /* conn->sk may have become NULL if racing with sk close, but
+ * due to held hdev->lock, it can't become different sk.
+ */
+ if (conn->sk) {
+ sco_sock_clear_timer(sk);
+ sk->sk_state = BT_CONNECTED;
+ sk->sk_state_change(sk);
+ }
+
release_sock(sk);
+ sock_put(sk);
} else {
- sco_conn_lock(conn);
-
- if (!conn->hcon) {
- sco_conn_unlock(conn);
+ if (!conn->hcon)
return;
- }
+
+ lockdep_assert_held(&conn->hcon->hdev->lock);
parent = sco_get_sock_listen(&conn->hcon->src);
- if (!parent) {
- sco_conn_unlock(conn);
+ if (!parent)
return;
- }
lock_sock(parent);
+ sco_conn_lock(conn);
+
+ /* hdev->lock guarantees conn->sk == NULL still here */
+
+ if (parent->sk_state != BT_LISTEN)
+ goto release;
+
sk = sco_sock_alloc(sock_net(parent), NULL,
BTPROTO_SCO, GFP_ATOMIC, 0);
- if (!sk) {
- release_sock(parent);
- sco_conn_unlock(conn);
- return;
- }
+ if (!sk)
+ goto release;
sco_sock_init(sk, parent);
@@ -1417,9 +1451,10 @@ static void sco_conn_ready(struct sco_conn *conn)
/* Wake up parent */
parent->sk_data_ready(parent);
- release_sock(parent);
-
+release:
sco_conn_unlock(conn);
+ release_sock(parent);
+ sock_put(parent);
}
}
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index 881d866d687a..2eef4f3345cd 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -4640,10 +4640,24 @@ static void br_multicast_start_querier(struct net_bridge_mcast *brmctx,
rcu_read_unlock();
}
-static void br_multicast_del_grps(struct net_bridge *br)
+static void br_multicast_enable_all_ports(struct net_bridge *br)
{
struct net_bridge_port *port;
+ if (br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED))
+ return;
+
+ list_for_each_entry(port, &br->port_list, list)
+ __br_multicast_enable_port_ctx(&port->multicast_ctx);
+}
+
+static void br_multicast_disable_all_ports(struct net_bridge *br)
+{
+ struct net_bridge_port *port;
+
+ if (br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED))
+ return;
+
list_for_each_entry(port, &br->port_list, list)
__br_multicast_disable_port_ctx(&port->multicast_ctx);
}
@@ -4651,7 +4665,6 @@ static void br_multicast_del_grps(struct net_bridge *br)
int br_multicast_toggle(struct net_bridge *br, unsigned long val,
struct netlink_ext_ack *extack)
{
- struct net_bridge_port *port;
bool change_snoopers = false;
int err = 0;
@@ -4668,7 +4681,7 @@ int br_multicast_toggle(struct net_bridge *br, unsigned long val,
br_opt_toggle(br, BROPT_MULTICAST_ENABLED, !!val);
if (!br_opt_get(br, BROPT_MULTICAST_ENABLED)) {
change_snoopers = true;
- br_multicast_del_grps(br);
+ br_multicast_disable_all_ports(br);
goto unlock;
}
@@ -4676,8 +4689,7 @@ int br_multicast_toggle(struct net_bridge *br, unsigned long val,
goto unlock;
br_multicast_open(br);
- list_for_each_entry(port, &br->port_list, list)
- __br_multicast_enable_port_ctx(&port->multicast_ctx);
+ br_multicast_enable_all_ports(br);
change_snoopers = true;
diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c
index 0ab1c94db4b9..0a394e5f4391 100644
--- a/net/bridge/br_netfilter_hooks.c
+++ b/net/bridge/br_netfilter_hooks.c
@@ -297,7 +297,11 @@ int br_nf_pre_routing_finish_bridge(struct net *net, struct sock *sk, struct sk_
goto free_skb;
}
- neigh_hh_bridge(&neigh->hh, skb);
+ if (neigh_hh_bridge(&neigh->hh, skb)) {
+ neigh_release(neigh);
+ goto free_skb;
+ }
+
skb->dev = br_indev;
ret = br_handle_frame_finish(net, sk, skb);
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index 6fd5386a1d64..b9591dd755f9 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -1000,19 +1000,25 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[],
br_port_flags_change(p, changed_mask);
if (tb[IFLA_BRPORT_COST]) {
+ spin_lock_bh(&p->br->lock);
err = br_stp_set_path_cost(p, nla_get_u32(tb[IFLA_BRPORT_COST]));
+ spin_unlock_bh(&p->br->lock);
if (err)
return err;
}
if (tb[IFLA_BRPORT_PRIORITY]) {
+ spin_lock_bh(&p->br->lock);
err = br_stp_set_port_priority(p, nla_get_u16(tb[IFLA_BRPORT_PRIORITY]));
+ spin_unlock_bh(&p->br->lock);
if (err)
return err;
}
if (tb[IFLA_BRPORT_STATE]) {
+ spin_lock_bh(&p->br->lock);
err = br_set_port_state(p, nla_get_u8(tb[IFLA_BRPORT_STATE]));
+ spin_unlock_bh(&p->br->lock);
if (err)
return err;
}
@@ -1114,9 +1120,7 @@ int br_setlink(struct net_device *dev, struct nlmsghdr *nlh, u16 flags,
if (err)
return err;
- spin_lock_bh(&p->br->lock);
err = br_setport(p, tb, extack);
- spin_unlock_bh(&p->br->lock);
} else {
/* Binary compatibility with old RSTP */
if (nla_len(protinfo) < sizeof(u8))
@@ -1203,17 +1207,10 @@ static int br_port_slave_changelink(struct net_device *brdev,
struct nlattr *data[],
struct netlink_ext_ack *extack)
{
- struct net_bridge *br = netdev_priv(brdev);
- int ret;
-
if (!data)
return 0;
- spin_lock_bh(&br->lock);
- ret = br_setport(br_port_get_rtnl(dev), data, extack);
- spin_unlock_bh(&br->lock);
-
- return ret;
+ return br_setport(br_port_get_rtnl(dev), data, extack);
}
static int br_port_fill_slave_info(struct sk_buff *skb,
@@ -1824,6 +1821,7 @@ static int br_fill_linkxstats(struct sk_buff *skb,
const struct net_device *dev,
int *prividx, int attr)
{
+ unsigned int limit = U16_MAX - nla_total_size(0);
struct nlattr *nla __maybe_unused;
struct net_bridge_port *p = NULL;
struct net_bridge_vlan_group *vg;
@@ -1841,6 +1839,7 @@ static int br_fill_linkxstats(struct sk_buff *skb,
p = br_port_get_rtnl(dev);
if (!p)
return 0;
+ limit -= nla_total_size_64bit(sizeof(p->stp_xstats));
br = p->br;
vg = nbp_vlan_group(p);
break;
@@ -1855,6 +1854,9 @@ static int br_fill_linkxstats(struct sk_buff *skb,
if (vg) {
u16 pvid;
+#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
+ limit -= nla_total_size_64bit(sizeof(struct br_mcast_stats));
+#endif
pvid = br_get_pvid(vg);
list_for_each_entry(v, &vg->vlan_list, vlist) {
struct bridge_vlan_xstats vxi;
@@ -1862,6 +1864,11 @@ static int br_fill_linkxstats(struct sk_buff *skb,
if (++vl_idx < *prividx)
continue;
+
+ if (skb_tail_pointer(skb) - (unsigned char *)nest +
+ nla_total_size(sizeof(vxi)) >= limit)
+ goto nla_put_failure;
+
memset(&vxi, 0, sizeof(vxi));
vxi.vid = v->vid;
vxi.flags = v->flags;
diff --git a/net/bridge/br_switchdev.c b/net/bridge/br_switchdev.c
index 18b558a931ad..ee3ad9dfbab9 100644
--- a/net/bridge/br_switchdev.c
+++ b/net/bridge/br_switchdev.c
@@ -99,7 +99,6 @@ int br_switchdev_set_port_flag(struct net_bridge_port *p,
attr.u.brport_flags.val = flags;
attr.u.brport_flags.mask = mask;
- /* We run from atomic context here */
err = call_switchdev_notifiers(SWITCHDEV_PORT_ATTR_SET, p->dev,
&info.info, extack);
err = notifier_to_errno(err);
diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c
index 1f57c36a7fc0..d6df81fa0d13 100644
--- a/net/bridge/br_sysfs_if.c
+++ b/net/bridge/br_sysfs_if.c
@@ -86,16 +86,34 @@ static ssize_t show_path_cost(struct net_bridge_port *p, char *buf)
return sysfs_emit(buf, "%d\n", p->path_cost);
}
-static BRPORT_ATTR(path_cost, 0644,
- show_path_cost, br_stp_set_path_cost);
+static int store_path_cost(struct net_bridge_port *p, unsigned long v)
+{
+ int ret;
+
+ spin_lock_bh(&p->br->lock);
+ ret = br_stp_set_path_cost(p, v);
+ spin_unlock_bh(&p->br->lock);
+ return ret;
+}
+
+static BRPORT_ATTR(path_cost, 0644, show_path_cost, store_path_cost);
static ssize_t show_priority(struct net_bridge_port *p, char *buf)
{
return sysfs_emit(buf, "%d\n", p->priority);
}
-static BRPORT_ATTR(priority, 0644,
- show_priority, br_stp_set_port_priority);
+static int store_priority(struct net_bridge_port *p, unsigned long v)
+{
+ int ret;
+
+ spin_lock_bh(&p->br->lock);
+ ret = br_stp_set_port_priority(p, v);
+ spin_unlock_bh(&p->br->lock);
+ return ret;
+}
+
+static BRPORT_ATTR(priority, 0644, show_priority, store_priority);
static ssize_t show_designated_root(struct net_bridge_port *p, char *buf)
{
@@ -334,17 +352,13 @@ static ssize_t brport_store(struct kobject *kobj,
ret = -ENOMEM;
goto out_unlock;
}
- spin_lock_bh(&p->br->lock);
ret = brport_attr->store_raw(p, buf_copy);
- spin_unlock_bh(&p->br->lock);
kfree(buf_copy);
} else if (brport_attr->store) {
val = simple_strtoul(buf, &endp, 0);
if (endp == buf)
goto out_unlock;
- spin_lock_bh(&p->br->lock);
ret = brport_attr->store(p, val);
- spin_unlock_bh(&p->br->lock);
}
if (!ret) {
diff --git a/net/bridge/netfilter/ebt_snat.c b/net/bridge/netfilter/ebt_snat.c
index 7dfbcdfc30e5..c9e229af0366 100644
--- a/net/bridge/netfilter/ebt_snat.c
+++ b/net/bridge/netfilter/ebt_snat.c
@@ -31,6 +31,9 @@ ebt_snat_tg(struct sk_buff *skb, const struct xt_action_param *par)
const struct arphdr *ap;
struct arphdr _ah;
+ if (skb_ensure_writable(skb, sizeof(_ah) + ETH_ALEN))
+ return EBT_DROP;
+
ap = skb_header_pointer(skb, 0, sizeof(_ah), &_ah);
if (ap == NULL)
return EBT_DROP;
diff --git a/net/bridge/netfilter/ebtable_broute.c b/net/bridge/netfilter/ebtable_broute.c
index 741360219552..f05c79f215ea 100644
--- a/net/bridge/netfilter/ebtable_broute.c
+++ b/net/bridge/netfilter/ebtable_broute.c
@@ -112,24 +112,22 @@ static struct pernet_operations broute_net_ops = {
static int __init ebtable_broute_init(void)
{
- int ret = ebt_register_template(&broute_table, broute_table_init);
+ int ret = register_pernet_subsys(&broute_net_ops);
if (ret)
return ret;
- ret = register_pernet_subsys(&broute_net_ops);
- if (ret) {
- ebt_unregister_template(&broute_table);
- return ret;
- }
+ ret = ebt_register_template(&broute_table, broute_table_init);
+ if (ret)
+ unregister_pernet_subsys(&broute_net_ops);
- return 0;
+ return ret;
}
static void __exit ebtable_broute_fini(void)
{
- unregister_pernet_subsys(&broute_net_ops);
ebt_unregister_template(&broute_table);
+ unregister_pernet_subsys(&broute_net_ops);
}
module_init(ebtable_broute_init);
diff --git a/net/bridge/netfilter/ebtable_filter.c b/net/bridge/netfilter/ebtable_filter.c
index dacd81b12e62..0fc03b07e62a 100644
--- a/net/bridge/netfilter/ebtable_filter.c
+++ b/net/bridge/netfilter/ebtable_filter.c
@@ -93,24 +93,22 @@ static struct pernet_operations frame_filter_net_ops = {
static int __init ebtable_filter_init(void)
{
- int ret = ebt_register_template(&frame_filter, frame_filter_table_init);
+ int ret = register_pernet_subsys(&frame_filter_net_ops);
if (ret)
return ret;
- ret = register_pernet_subsys(&frame_filter_net_ops);
- if (ret) {
- ebt_unregister_template(&frame_filter);
- return ret;
- }
+ ret = ebt_register_template(&frame_filter, frame_filter_table_init);
+ if (ret)
+ unregister_pernet_subsys(&frame_filter_net_ops);
- return 0;
+ return ret;
}
static void __exit ebtable_filter_fini(void)
{
- unregister_pernet_subsys(&frame_filter_net_ops);
ebt_unregister_template(&frame_filter);
+ unregister_pernet_subsys(&frame_filter_net_ops);
}
module_init(ebtable_filter_init);
diff --git a/net/bridge/netfilter/ebtable_nat.c b/net/bridge/netfilter/ebtable_nat.c
index 0f2a8c6118d4..8a10375d8909 100644
--- a/net/bridge/netfilter/ebtable_nat.c
+++ b/net/bridge/netfilter/ebtable_nat.c
@@ -93,24 +93,22 @@ static struct pernet_operations frame_nat_net_ops = {
static int __init ebtable_nat_init(void)
{
- int ret = ebt_register_template(&frame_nat, frame_nat_table_init);
+ int ret = register_pernet_subsys(&frame_nat_net_ops);
if (ret)
return ret;
- ret = register_pernet_subsys(&frame_nat_net_ops);
- if (ret) {
- ebt_unregister_template(&frame_nat);
- return ret;
- }
+ ret = ebt_register_template(&frame_nat, frame_nat_table_init);
+ if (ret)
+ unregister_pernet_subsys(&frame_nat_net_ops);
return ret;
}
static void __exit ebtable_nat_fini(void)
{
- unregister_pernet_subsys(&frame_nat_net_ops);
ebt_unregister_template(&frame_nat);
+ unregister_pernet_subsys(&frame_nat_net_ops);
}
module_init(ebtable_nat_init);
diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index aea3e19875c6..8a6a069329d2 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -42,6 +42,7 @@
struct ebt_pernet {
struct list_head tables;
+ struct list_head dead_tables;
};
struct ebt_template {
@@ -1162,11 +1163,6 @@ free_newinfo:
static void __ebt_unregister_table(struct net *net, struct ebt_table *table)
{
- mutex_lock(&ebt_mutex);
- list_del(&table->list);
- mutex_unlock(&ebt_mutex);
- audit_log_nfcfg(table->name, AF_BRIDGE, table->private->nentries,
- AUDIT_XT_OP_UNREGISTER, GFP_KERNEL);
EBT_ENTRY_ITERATE(table->private->entries, table->private->entries_size,
ebt_cleanup_entry, net, NULL);
if (table->private->nentries)
@@ -1267,13 +1263,15 @@ int ebt_register_table(struct net *net, const struct ebt_table *input_table,
for (i = 0; i < num_ops; i++)
ops[i].priv = table;
- list_add(&table->list, &ebt_net->tables);
- mutex_unlock(&ebt_mutex);
-
table->ops = ops;
ret = nf_register_net_hooks(net, ops, num_ops);
- if (ret)
+ if (ret) {
+ synchronize_rcu();
__ebt_unregister_table(net, table);
+ } else {
+ list_add(&table->list, &ebt_net->tables);
+ }
+ mutex_unlock(&ebt_mutex);
audit_log_nfcfg(repl->name, AF_BRIDGE, repl->nentries,
AUDIT_XT_OP_REGISTER, GFP_KERNEL);
@@ -1339,7 +1337,7 @@ void ebt_unregister_template(const struct ebt_table *t)
}
EXPORT_SYMBOL(ebt_unregister_template);
-static struct ebt_table *__ebt_find_table(struct net *net, const char *name)
+void ebt_unregister_table_pre_exit(struct net *net, const char *name)
{
struct ebt_pernet *ebt_net = net_generic(net, ebt_pernet_id);
struct ebt_table *t;
@@ -1348,30 +1346,36 @@ static struct ebt_table *__ebt_find_table(struct net *net, const char *name)
list_for_each_entry(t, &ebt_net->tables, list) {
if (strcmp(t->name, name) == 0) {
+ list_move(&t->list, &ebt_net->dead_tables);
mutex_unlock(&ebt_mutex);
- return t;
+ nf_unregister_net_hooks(net, t->ops, hweight32(t->valid_hooks));
+ return;
}
}
mutex_unlock(&ebt_mutex);
- return NULL;
-}
-
-void ebt_unregister_table_pre_exit(struct net *net, const char *name)
-{
- struct ebt_table *table = __ebt_find_table(net, name);
-
- if (table)
- nf_unregister_net_hooks(net, table->ops, hweight32(table->valid_hooks));
}
EXPORT_SYMBOL(ebt_unregister_table_pre_exit);
void ebt_unregister_table(struct net *net, const char *name)
{
- struct ebt_table *table = __ebt_find_table(net, name);
+ struct ebt_pernet *ebt_net = net_generic(net, ebt_pernet_id);
+ struct ebt_table *t;
- if (table)
- __ebt_unregister_table(net, table);
+ mutex_lock(&ebt_mutex);
+
+ list_for_each_entry(t, &ebt_net->dead_tables, list) {
+ if (strcmp(t->name, name) == 0) {
+ list_del(&t->list);
+ audit_log_nfcfg(t->name, AF_BRIDGE, t->private->nentries,
+ AUDIT_XT_OP_UNREGISTER, GFP_KERNEL);
+ __ebt_unregister_table(net, t);
+ mutex_unlock(&ebt_mutex);
+ return;
+ }
+ }
+
+ mutex_unlock(&ebt_mutex);
}
/* userspace just supplied us with counters */
@@ -1952,6 +1956,25 @@ enum compat_mwt {
EBT_COMPAT_TARGET,
};
+static bool match_size_ok(const struct xt_match *match, unsigned int match_size)
+{
+ u16 csize;
+
+ if (match->matchsize == -1) /* cannot validate ebt_among */
+ return true;
+
+ csize = match->compatsize ? : match->matchsize;
+
+ return match_size >= csize;
+}
+
+static bool tgt_size_ok(const struct xt_target *tgt, unsigned int tgt_size)
+{
+ u16 csize = tgt->compatsize ? : tgt->targetsize;
+
+ return tgt_size >= csize;
+}
+
static int compat_mtw_from_user(const struct compat_ebt_entry_mwt *mwt,
enum compat_mwt compat_mwt,
struct ebt_entries_buf_state *state,
@@ -1977,6 +2000,11 @@ static int compat_mtw_from_user(const struct compat_ebt_entry_mwt *mwt,
if (IS_ERR(match))
return PTR_ERR(match);
+ if (!match_size_ok(match, match_size)) {
+ module_put(match->me);
+ return -EINVAL;
+ }
+
off = ebt_compat_match_offset(match, match_size);
if (dst) {
if (match->compat_from_user)
@@ -1996,6 +2024,12 @@ static int compat_mtw_from_user(const struct compat_ebt_entry_mwt *mwt,
mwt->u.revision);
if (IS_ERR(wt))
return PTR_ERR(wt);
+
+ if (!tgt_size_ok(wt, match_size)) {
+ module_put(wt->me);
+ return -EINVAL;
+ }
+
off = xt_compat_target_offset(wt);
if (dst) {
@@ -2556,11 +2590,21 @@ static int __net_init ebt_pernet_init(struct net *net)
struct ebt_pernet *ebt_net = net_generic(net, ebt_pernet_id);
INIT_LIST_HEAD(&ebt_net->tables);
+ INIT_LIST_HEAD(&ebt_net->dead_tables);
return 0;
}
+static void __net_exit ebt_pernet_exit(struct net *net)
+{
+ struct ebt_pernet *ebt_net = net_generic(net, ebt_pernet_id);
+
+ WARN_ON_ONCE(!list_empty(&ebt_net->tables));
+ WARN_ON_ONCE(!list_empty(&ebt_net->dead_tables));
+}
+
static struct pernet_operations ebt_net_ops = {
.init = ebt_pernet_init,
+ .exit = ebt_pernet_exit,
.id = &ebt_pernet_id,
.size = sizeof(struct ebt_pernet),
};
@@ -2569,19 +2613,20 @@ static int __init ebtables_init(void)
{
int ret;
- ret = xt_register_target(&ebt_standard_target);
+ ret = register_pernet_subsys(&ebt_net_ops);
if (ret < 0)
return ret;
- ret = nf_register_sockopt(&ebt_sockopts);
+
+ ret = xt_register_target(&ebt_standard_target);
if (ret < 0) {
- xt_unregister_target(&ebt_standard_target);
+ unregister_pernet_subsys(&ebt_net_ops);
return ret;
}
- ret = register_pernet_subsys(&ebt_net_ops);
+ ret = nf_register_sockopt(&ebt_sockopts);
if (ret < 0) {
- nf_unregister_sockopt(&ebt_sockopts);
xt_unregister_target(&ebt_standard_target);
+ unregister_pernet_subsys(&ebt_net_ops);
return ret;
}
diff --git a/net/ceph/auth_x.c b/net/ceph/auth_x.c
index 692e0b868822..9e64e82d0b63 100644
--- a/net/ceph/auth_x.c
+++ b/net/ceph/auth_x.c
@@ -115,6 +115,11 @@ static int __ceph_x_decrypt(const struct ceph_crypto_key *key, int usage_slot,
if (ret)
return ret;
+ if (plaintext_len < sizeof(*hdr)) {
+ pr_err("%s plaintext too small %d\n", __func__, plaintext_len);
+ return -EINVAL;
+ }
+
hdr = p + ceph_crypt_data_offset(key);
if (le64_to_cpu(hdr->magic) != CEPHX_ENC_MAGIC) {
pr_err("%s bad magic\n", __func__);
diff --git a/net/ceph/crush/crush.c b/net/ceph/crush/crush.c
index 254ded0b05f6..521aec1d5fc0 100644
--- a/net/ceph/crush/crush.c
+++ b/net/ceph/crush/crush.c
@@ -47,7 +47,6 @@ int crush_get_bucket_item_weight(const struct crush_bucket *b, int p)
void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b)
{
kfree(b->h.items);
- kfree(b);
}
void crush_destroy_bucket_list(struct crush_bucket_list *b)
@@ -55,14 +54,12 @@ void crush_destroy_bucket_list(struct crush_bucket_list *b)
kfree(b->item_weights);
kfree(b->sum_weights);
kfree(b->h.items);
- kfree(b);
}
void crush_destroy_bucket_tree(struct crush_bucket_tree *b)
{
kfree(b->h.items);
kfree(b->node_weights);
- kfree(b);
}
void crush_destroy_bucket_straw(struct crush_bucket_straw *b)
@@ -70,14 +67,12 @@ void crush_destroy_bucket_straw(struct crush_bucket_straw *b)
kfree(b->straws);
kfree(b->item_weights);
kfree(b->h.items);
- kfree(b);
}
void crush_destroy_bucket_straw2(struct crush_bucket_straw2 *b)
{
kfree(b->item_weights);
kfree(b->h.items);
- kfree(b);
}
void crush_destroy_bucket(struct crush_bucket *b)
@@ -99,6 +94,7 @@ void crush_destroy_bucket(struct crush_bucket *b)
crush_destroy_bucket_straw2((struct crush_bucket_straw2 *)b);
break;
}
+ kfree(b);
}
/**
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index c89e66d4fcb7..8b5b0587a0cf 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -72,8 +72,7 @@ static int crush_decode_uniform_bucket(void **p, void *end,
struct crush_bucket_uniform *b)
{
dout("crush_decode_uniform_bucket %p to %p\n", *p, end);
- ceph_decode_need(p, end, (1+b->h.size) * sizeof(u32), bad);
- b->item_weight = ceph_decode_32(p);
+ ceph_decode_32_safe(p, end, b->item_weight, bad);
return 0;
bad:
return -EINVAL;
@@ -389,11 +388,15 @@ static int decode_choose_args(void **p, void *end, struct crush_map *c)
goto fail;
if (arg->ids_size &&
- arg->ids_size != c->buckets[bucket_index]->size)
+ (!c->buckets[bucket_index] ||
+ arg->ids_size != c->buckets[bucket_index]->size))
goto e_inval;
}
- insert_choose_arg_map(&c->choose_args, arg_map);
+ if (!__insert_choose_arg_map(&c->choose_args, arg_map)) {
+ ret = -EEXIST;
+ goto fail;
+ }
}
return 0;
@@ -516,6 +519,10 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
b->id = ceph_decode_32(p);
b->type = ceph_decode_16(p);
b->alg = ceph_decode_8(p);
+ if (b->alg != alg) {
+ b->alg = 0;
+ goto bad;
+ }
b->hash = ceph_decode_8(p);
b->weight = ceph_decode_32(p);
b->size = ceph_decode_32(p);
@@ -1702,7 +1709,7 @@ static int osdmap_decode(void **p, void *end, bool msgr2,
ceph_decode_need(p, end, 3*sizeof(u32) +
map->max_osd*(struct_v >= 5 ? sizeof(u32) :
sizeof(u8)) +
- sizeof(*map->osd_weight), e_inval);
+ map->max_osd*sizeof(*map->osd_weight), e_inval);
if (ceph_decode_32(p) != map->max_osd)
goto e_inval;
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
index 14eb7812bda4..ecd659f79fd4 100644
--- a/net/core/bpf_sk_storage.c
+++ b/net/core/bpf_sk_storage.c
@@ -172,7 +172,7 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk)
struct bpf_map *map;
smap = rcu_dereference(SDATA(selem)->smap);
- if (!(smap->map.map_flags & BPF_F_CLONE))
+ if (!smap || !(smap->map.map_flags & BPF_F_CLONE))
continue;
/* Note that for lockless listeners adding new element
@@ -531,10 +531,10 @@ err_free:
}
EXPORT_SYMBOL_GPL(bpf_sk_storage_diag_alloc);
-static int diag_get(struct bpf_local_storage_data *sdata, struct sk_buff *skb)
+static int diag_get(struct bpf_local_storage_map *smap,
+ struct bpf_local_storage_data *sdata, struct sk_buff *skb)
{
struct nlattr *nla_stg, *nla_value;
- struct bpf_local_storage_map *smap;
/* It cannot exceed max nlattr's payload */
BUILD_BUG_ON(U16_MAX - NLA_HDRLEN < BPF_LOCAL_STORAGE_MAX_VALUE_SIZE);
@@ -543,7 +543,6 @@ static int diag_get(struct bpf_local_storage_data *sdata, struct sk_buff *skb)
if (!nla_stg)
return -EMSGSIZE;
- smap = rcu_dereference(sdata->smap);
if (nla_put_u32(skb, SK_DIAG_BPF_STORAGE_MAP_ID, smap->map.id))
goto errout;
@@ -558,6 +557,7 @@ static int diag_get(struct bpf_local_storage_data *sdata, struct sk_buff *skb)
sdata->data, true);
else
copy_map_value(&smap->map, nla_data(nla_value), sdata->data);
+ check_and_init_map_value(&smap->map, nla_data(nla_value));
nla_nest_end(skb, nla_stg);
return 0;
@@ -596,9 +596,11 @@ static int bpf_sk_storage_diag_put_all(struct sock *sk, struct sk_buff *skb,
saved_len = skb->len;
hlist_for_each_entry_rcu(selem, &sk_storage->list, snode) {
smap = rcu_dereference(SDATA(selem)->smap);
+ if (!smap)
+ continue;
diag_size += nla_value_size(smap->map.value_size);
- if (nla_stgs && diag_get(SDATA(selem), skb))
+ if (nla_stgs && diag_get(smap, SDATA(selem), skb))
/* Continue to learn diag_size */
err = -EMSGSIZE;
}
@@ -665,7 +667,7 @@ int bpf_sk_storage_diag_put(struct bpf_sk_storage_diag *diag,
diag_size += nla_value_size(diag->maps[i]->value_size);
- if (nla_stgs && diag_get(sdata, skb))
+ if (nla_stgs && diag_get((struct bpf_local_storage_map *)diag->maps[i], sdata, skb))
/* Continue to learn diag_size */
err = -EMSGSIZE;
}
diff --git a/net/core/dev.c b/net/core/dev.c
index 06c195906231..0c6c270d9f7d 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -371,7 +371,7 @@ static void netdev_name_node_alt_free(struct rcu_head *head)
static void __netdev_name_node_alt_destroy(struct netdev_name_node *name_node)
{
netdev_name_node_del(name_node);
- list_del(&name_node->list);
+ list_del_rcu(&name_node->list);
call_rcu(&name_node->rcu, netdev_name_node_alt_free);
}
@@ -6862,9 +6862,9 @@ static void skb_defer_free_flush(void)
#if defined(CONFIG_NET_RX_BUSY_POLL)
-static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule)
+static void __busy_poll_stop(struct napi_struct *napi, unsigned long timeout)
{
- if (!skip_schedule) {
+ if (!timeout) {
gro_normal_list(&napi->gro);
__napi_schedule(napi);
return;
@@ -6874,6 +6874,8 @@ static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule)
gro_flush_normal(&napi->gro, HZ >= 1000);
clear_bit(NAPI_STATE_SCHED, &napi->state);
+ hrtimer_start(&napi->timer, ns_to_ktime(timeout),
+ HRTIMER_MODE_REL_PINNED);
}
enum {
@@ -6885,8 +6887,7 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock,
unsigned flags, u16 budget)
{
struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
- bool skip_schedule = false;
- unsigned long timeout;
+ unsigned long timeout = 0;
int rc;
/* Busy polling means there is a high chance device driver hard irq
@@ -6906,10 +6907,12 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock,
if (flags & NAPI_F_PREFER_BUSY_POLL) {
napi->defer_hard_irqs_count = napi_get_defer_hard_irqs(napi);
- timeout = napi_get_gro_flush_timeout(napi);
- if (napi->defer_hard_irqs_count && timeout) {
- hrtimer_start(&napi->timer, ns_to_ktime(timeout), HRTIMER_MODE_REL_PINNED);
- skip_schedule = true;
+ if (napi->defer_hard_irqs_count) {
+ /* A short enough gro flush timeout and long enough
+ * poll can result in timer firing too early.
+ * Timer will be armed later if necessary.
+ */
+ timeout = napi_get_gro_flush_timeout(napi);
}
}
@@ -6924,7 +6927,7 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock,
trace_napi_poll(napi, rc, budget);
netpoll_poll_unlock(have_poll_lock);
if (rc == budget)
- __busy_poll_stop(napi, skip_schedule);
+ __busy_poll_stop(napi, timeout);
bpf_net_ctx_clear(bpf_net_ctx);
local_bh_enable();
}
diff --git a/net/core/devmem.c b/net/core/devmem.c
index 468344739db2..4f71de44c0fb 100644
--- a/net/core/devmem.c
+++ b/net/core/devmem.c
@@ -241,6 +241,11 @@ net_devmem_bind_dmabuf(struct net_device *dev,
}
if (direction == DMA_TO_DEVICE) {
+ if (!IS_ALIGNED(dmabuf->size, PAGE_SIZE)) {
+ err = -EINVAL;
+ NL_SET_ERR_MSG(extack, "TX dma-buf size must be a multiple of PAGE_SIZE");
+ goto err_unmap;
+ }
binding->tx_vec = kvmalloc_objs(struct net_iov *,
dmabuf->size / PAGE_SIZE);
if (!binding->tx_vec) {
@@ -267,6 +272,12 @@ net_devmem_bind_dmabuf(struct net_device *dev,
size_t len = sg_dma_len(sg);
struct net_iov *niov;
+ if (!IS_ALIGNED(len, PAGE_SIZE)) {
+ err = -EINVAL;
+ NL_SET_ERR_MSG(extack, "dma-buf SG length must be PAGE_SIZE aligned");
+ goto err_free_chunks;
+ }
+
owner = kzalloc_node(sizeof(*owner), GFP_KERNEL,
dev_to_node(&dev->dev));
if (!owner) {
diff --git a/net/core/failover.c b/net/core/failover.c
index 11bb183c7a1b..e43c59cd6868 100644
--- a/net/core/failover.c
+++ b/net/core/failover.c
@@ -12,6 +12,7 @@
#include <uapi/linux/if_arp.h>
#include <linux/rtnetlink.h>
#include <linux/if_vlan.h>
+#include <net/netdev_lock.h>
#include <net/failover.h>
static LIST_HEAD(failover_list);
@@ -221,8 +222,11 @@ failover_existing_slave_register(struct net_device *failover_dev)
for_each_netdev(net, dev) {
if (netif_is_failover(dev))
continue;
- if (ether_addr_equal(failover_dev->perm_addr, dev->perm_addr))
+ if (ether_addr_equal(failover_dev->perm_addr, dev->perm_addr)) {
+ netdev_lock_ops(dev);
failover_slave_register(dev);
+ netdev_unlock_ops(dev);
+ }
}
rtnl_unlock();
}
diff --git a/net/core/filter.c b/net/core/filter.c
index 80a3b702a2d4..80439767e0ee 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1654,15 +1654,24 @@ err_prog_put:
return err;
}
+static void sk_reuseport_prog_free_rcu(struct rcu_head *rcu)
+{
+ struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu);
+ struct bpf_prog *prog = aux->prog;
+
+ bpf_release_orig_filter(prog);
+ bpf_prog_free(prog);
+}
+
void sk_reuseport_prog_free(struct bpf_prog *prog)
{
if (!prog)
return;
- if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT)
- bpf_prog_put(prog);
+ if (bpf_prog_was_classic(prog))
+ call_rcu(&prog->aux->rcu, sk_reuseport_prog_free_rcu);
else
- bpf_prog_destroy(prog);
+ bpf_prog_put(prog);
}
static inline int __bpf_try_make_writable(struct sk_buff *skb,
@@ -2860,7 +2869,7 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start,
psge->length = start - offset;
rsge.length -= psge->length;
- rsge.offset += start;
+ rsge.offset += start - offset;
sk_msg_iter_var_next(i);
sg_unmark_end(psge);
@@ -5481,7 +5490,7 @@ static int sol_tcp_sockopt(struct sock *sk, int optname,
char *optval, int *optlen,
bool getopt)
{
- if (sk->sk_protocol != IPPROTO_TCP)
+ if (!sk_is_tcp(sk))
return -EINVAL;
switch (optname) {
@@ -5688,6 +5697,30 @@ const struct bpf_func_proto bpf_sk_getsockopt_proto = {
.arg5_type = ARG_CONST_SIZE,
};
+BPF_CALL_5(bpf_sk_setsockopt_nodelay, struct sock *, sk, int, level,
+ int, optname, char *, optval, int, optlen)
+{
+ /*
+ * TCP_NODELAY triggers tcp_push_pending_frames() and re-enters
+ * CA_EVENT_TX_START in bpf_tcp_cc.
+ */
+ if (level == SOL_TCP && optname == TCP_NODELAY)
+ return -EOPNOTSUPP;
+
+ return _bpf_setsockopt(sk, level, optname, optval, optlen);
+}
+
+const struct bpf_func_proto bpf_sk_setsockopt_nodelay_proto = {
+ .func = bpf_sk_setsockopt_nodelay,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg5_type = ARG_CONST_SIZE,
+};
+
BPF_CALL_5(bpf_unlocked_sk_setsockopt, struct sock *, sk, int, level,
int, optname, char *, optval, int, optlen)
{
@@ -5833,6 +5866,12 @@ BPF_CALL_5(bpf_sock_ops_setsockopt, struct bpf_sock_ops_kern *, bpf_sock,
if (!is_locked_tcp_sock_ops(bpf_sock))
return -EOPNOTSUPP;
+ /* TCP_NODELAY triggers tcp_push_pending_frames() and re-enters these callbacks. */
+ if ((bpf_sock->op == BPF_SOCK_OPS_HDR_OPT_LEN_CB ||
+ bpf_sock->op == BPF_SOCK_OPS_WRITE_HDR_OPT_CB) &&
+ level == SOL_TCP && optname == TCP_NODELAY)
+ return -EOPNOTSUPP;
+
return _bpf_setsockopt(bpf_sock->sk, level, optname, optval, optlen);
}
@@ -6443,6 +6482,8 @@ BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb,
* against MTU of FIB lookup resulting net_device
*/
dev = dev_get_by_index_rcu(net, params->ifindex);
+ if (unlikely(!dev))
+ return -ENODEV;
if (!is_skb_forwardable(dev, skb))
rc = BPF_FIB_LKUP_RET_FRAG_NEEDED;
@@ -7443,7 +7484,7 @@ u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type,
BPF_CALL_1(bpf_tcp_sock, struct sock *, sk)
{
- if (sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP)
+ if (sk_fullsock(sk) && sk_is_tcp(sk))
return (unsigned long)sk;
return (unsigned long)NULL;
@@ -11915,7 +11956,7 @@ BPF_CALL_1(bpf_skc_to_tcp6_sock, struct sock *, sk)
*/
BTF_TYPE_EMIT(struct tcp6_sock);
if (sk && sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP &&
- sk->sk_family == AF_INET6)
+ sk->sk_type == SOCK_STREAM && sk->sk_family == AF_INET6)
return (unsigned long)sk;
return (unsigned long)NULL;
@@ -11931,7 +11972,7 @@ const struct bpf_func_proto bpf_skc_to_tcp6_sock_proto = {
BPF_CALL_1(bpf_skc_to_tcp_sock, struct sock *, sk)
{
- if (sk && sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP)
+ if (sk && sk_fullsock(sk) && sk_is_tcp(sk))
return (unsigned long)sk;
return (unsigned long)NULL;
diff --git a/net/core/gro.c b/net/core/gro.c
index 31d21de5b15a..a84753983467 100644
--- a/net/core/gro.c
+++ b/net/core/gro.c
@@ -109,6 +109,9 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb)
if (p->pp_recycle != skb->pp_recycle)
return -ETOOMANYREFS;
+ if (skb_zcopy(p) || skb_zcopy(skb))
+ return -ETOOMANYREFS;
+
if (unlikely(p->len + len >= netif_get_gro_max_size(p->dev, p) ||
NAPI_GRO_CB(skb)->flush))
return -E2BIG;
@@ -213,10 +216,12 @@ done:
p->data_len += len;
p->truesize += delta_truesize;
p->len += len;
+ skb_shinfo(p)->flags |= skbinfo->flags & SKBFL_SHARED_FRAG;
if (lp != p) {
lp->data_len += len;
lp->truesize += delta_truesize;
lp->len += len;
+ skb_shinfo(lp)->flags |= skbinfo->flags & SKBFL_SHARED_FRAG;
}
NAPI_GRO_CB(skb)->same_flow = 1;
return 0;
@@ -244,6 +249,8 @@ int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb)
p->truesize += skb->truesize;
p->len += skb->len;
+ skb_shinfo(p)->flags |= skb_shinfo(skb)->flags & SKBFL_SHARED_FRAG;
+
NAPI_GRO_CB(skb)->same_flow = 1;
return 0;
diff --git a/net/core/netmem_priv.h b/net/core/netmem_priv.h
index 3e6fde8f1726..23175cb2bd86 100644
--- a/net/core/netmem_priv.h
+++ b/net/core/netmem_priv.h
@@ -8,18 +8,21 @@ static inline unsigned long netmem_get_pp_magic(netmem_ref netmem)
return netmem_to_nmdesc(netmem)->pp_magic & ~PP_DMA_INDEX_MASK;
}
-static inline bool netmem_is_pp(netmem_ref netmem)
+static inline void netmem_or_pp_magic(netmem_ref netmem, unsigned long pp_magic)
+{
+ netmem_to_nmdesc(netmem)->pp_magic |= pp_magic;
+}
+
+static inline void netmem_clear_pp_magic(netmem_ref netmem)
{
- struct page *page;
+ WARN_ON_ONCE(netmem_to_nmdesc(netmem)->pp_magic & PP_DMA_INDEX_MASK);
- /* XXX: Now that the offset of page_type is shared between
- * struct page and net_iov, just cast the netmem to struct page
- * unconditionally by clearing NET_IOV if any, no matter whether
- * it comes from struct net_iov or struct page. This should be
- * adjusted once the offset is no longer shared.
- */
- page = (struct page *)((__force unsigned long)netmem & ~NET_IOV);
- return PageNetpp(page);
+ netmem_to_nmdesc(netmem)->pp_magic = 0;
+}
+
+static inline bool netmem_is_pp(netmem_ref netmem)
+{
+ return (netmem_get_pp_magic(netmem) & PP_MAGIC_MASK) == PP_SIGNATURE;
}
static inline void netmem_set_pp(netmem_ref netmem, struct page_pool *pool)
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index 4381e0fc25bf..3f4a17fa5713 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -319,6 +319,8 @@ static netdev_tx_t __netpoll_send_skb(struct netpoll *np, struct sk_buff *skb)
lockdep_assert_irqs_disabled();
dev = np->dev;
+ /* npinfo->txq belongs to np->dev, so retries must stay bound to it. */
+ skb->dev = dev;
rcu_read_lock();
npinfo = rcu_dereference_bh(dev->npinfo);
@@ -608,14 +610,16 @@ EXPORT_SYMBOL_GPL(__netpoll_setup);
/*
* Returns a pointer to a string representation of the identifier used
* to select the egress interface for the given netpoll instance. buf
- * must be a buffer of length at least MAC_ADDR_STR_LEN + 1.
+ * is used to format np->dev_mac when np->dev_name is empty; bufsz must
+ * be at least MAC_ADDR_STR_LEN + 1 to fit the formatted MAC address
+ * and its NUL terminator.
*/
-static char *egress_dev(struct netpoll *np, char *buf)
+static char *egress_dev(struct netpoll *np, char *buf, size_t bufsz)
{
if (np->dev_name[0])
return np->dev_name;
- snprintf(buf, MAC_ADDR_STR_LEN, "%pM", np->dev_mac);
+ snprintf(buf, bufsz, "%pM", np->dev_mac);
return buf;
}
@@ -645,7 +649,7 @@ static int netpoll_take_ipv6(struct netpoll *np, struct net_device *ndev)
if (!IS_ENABLED(CONFIG_IPV6)) {
np_err(np, "IPv6 is not supported %s, aborting\n",
- egress_dev(np, buf));
+ egress_dev(np, buf, sizeof(buf)));
return -EINVAL;
}
@@ -667,7 +671,7 @@ static int netpoll_take_ipv6(struct netpoll *np, struct net_device *ndev)
}
if (err) {
np_err(np, "no IPv6 address for %s, aborting\n",
- egress_dev(np, buf));
+ egress_dev(np, buf, sizeof(buf)));
return err;
}
@@ -687,14 +691,14 @@ static int netpoll_take_ipv4(struct netpoll *np, struct net_device *ndev)
in_dev = __in_dev_get_rtnl(ndev);
if (!in_dev) {
np_err(np, "no IP address for %s, aborting\n",
- egress_dev(np, buf));
+ egress_dev(np, buf, sizeof(buf)));
return -EDESTADDRREQ;
}
ifa = rtnl_dereference(in_dev->ifa_list);
if (!ifa) {
np_err(np, "no IP address for %s, aborting\n",
- egress_dev(np, buf));
+ egress_dev(np, buf, sizeof(buf)));
return -EDESTADDRREQ;
}
@@ -736,7 +740,8 @@ int netpoll_setup(struct netpoll *np)
ndev = dev_getbyhwaddr(net, ARPHRD_ETHER, np->dev_mac);
if (!ndev) {
- np_err(np, "%s doesn't exist, aborting\n", egress_dev(np, buf));
+ np_err(np, "%s doesn't exist, aborting\n",
+ egress_dev(np, buf, sizeof(buf)));
err = -ENODEV;
goto unlock;
}
@@ -744,14 +749,14 @@ int netpoll_setup(struct netpoll *np)
if (netdev_master_upper_dev_get(ndev)) {
np_err(np, "%s is a slave device, aborting\n",
- egress_dev(np, buf));
+ egress_dev(np, buf, sizeof(buf)));
err = -EBUSY;
goto put;
}
if (!netif_running(ndev)) {
np_info(np, "device %s not up yet, forcing it\n",
- egress_dev(np, buf));
+ egress_dev(np, buf, sizeof(buf)));
err = dev_open(ndev, NULL);
if (err) {
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index 6e576dec80db..8171d1173221 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -707,18 +707,8 @@ s32 page_pool_inflight(const struct page_pool *pool, bool strict)
void page_pool_set_pp_info(struct page_pool *pool, netmem_ref netmem)
{
- struct page *page;
-
netmem_set_pp(netmem, pool);
-
- /* XXX: Now that the offset of page_type is shared between
- * struct page and net_iov, just cast the netmem to struct page
- * unconditionally by clearing NET_IOV if any, no matter whether
- * it comes from struct net_iov or struct page. This should be
- * adjusted once the offset is no longer shared.
- */
- page = (struct page *)((__force unsigned long)netmem & ~NET_IOV);
- __SetPageNetpp(page);
+ netmem_or_pp_magic(netmem, PP_SIGNATURE);
/* Ensuring all pages have been split into one fragment initially:
* page_pool_set_pp_info() is only called once for every page when it
@@ -733,17 +723,7 @@ void page_pool_set_pp_info(struct page_pool *pool, netmem_ref netmem)
void page_pool_clear_pp_info(netmem_ref netmem)
{
- struct page *page;
-
- /* XXX: Now that the offset of page_type is shared between
- * struct page and net_iov, just cast the netmem to struct page
- * unconditionally by clearing NET_IOV if any, no matter whether
- * it comes from struct net_iov or struct page. This should be
- * adjusted once the offset is no longer shared.
- */
- page = (struct page *)((__force unsigned long)netmem & ~NET_IOV);
- __ClearPageNetpp(page);
-
+ netmem_clear_pp_magic(netmem);
netmem_set_pp(netmem, NULL);
}
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index b613bb6e07df..511c25bf6f2a 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1572,6 +1572,7 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
port_guid.vf = ivi.vf;
memcpy(vf_mac.mac, ivi.mac, sizeof(ivi.mac));
+ memset(&vf_broadcast, 0, sizeof(vf_broadcast));
memcpy(vf_broadcast.broadcast, dev->broadcast, dev->addr_len);
vf_vlan.vlan = ivi.vlan;
vf_vlan.qos = ivi.qos;
@@ -6327,8 +6328,9 @@ static int rtnl_stats_get(struct sk_buff *skb, struct nlmsghdr *nlh,
NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0,
0, &filters, &idxattr, &prividx, extack);
if (err < 0) {
- /* -EMSGSIZE implies BUG in if_nlmsg_stats_size */
- WARN_ON(err == -EMSGSIZE);
+ /* -EMSGSIZE implies BUG in if_nlmsg_stats_size
+ * or a too big nested attribute.
+ */
kfree_skb(nskb);
} else {
err = rtnl_unicast(nskb, net, NETLINK_CB(skb).portid);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 7dad68e3b518..c02f0a507ba8 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -2248,6 +2248,7 @@ struct sk_buff *__pskb_copy_fclone(struct sk_buff *skb, int headroom,
skb_frag_ref(skb, i);
}
skb_shinfo(n)->nr_frags = i;
+ skb_shinfo(n)->flags |= skb_shinfo(skb)->flags & SKBFL_SHARED_FRAG;
}
if (skb_has_frag_list(skb)) {
@@ -2786,6 +2787,8 @@ done:
skb->data_len = 0;
skb_set_tail_pointer(skb, len);
}
+ if (!skb_shinfo(skb)->nr_frags && !skb_has_frag_list(skb))
+ skb->unreadable = 0;
if (!skb->sk || skb->destructor == sock_edemux)
skb_condense(skb);
@@ -2793,16 +2796,37 @@ done:
}
EXPORT_SYMBOL(___pskb_trim);
+static int pskb_trim_rcsum_complete(struct sk_buff *skb, unsigned int len)
+{
+ int delta = skb->len - len;
+
+ if (skb_frags_readable(skb)) {
+ skb->csum = csum_block_sub(skb->csum,
+ skb_checksum(skb, len, delta, 0),
+ len);
+ return 0;
+ }
+
+ if (len > skb_headlen(skb))
+ return -EFAULT;
+
+ /* The trimmed bytes are unreadable, but the remaining packet can be
+ * checksummed by software after trimming.
+ */
+ skb->ip_summed = CHECKSUM_NONE;
+ return 0;
+}
+
/* Note : use pskb_trim_rcsum() instead of calling this directly
*/
int pskb_trim_rcsum_slow(struct sk_buff *skb, unsigned int len)
{
if (skb->ip_summed == CHECKSUM_COMPLETE) {
- int delta = skb->len - len;
+ int err;
- skb->csum = csum_block_sub(skb->csum,
- skb_checksum(skb, len, delta, 0),
- len);
+ err = pskb_trim_rcsum_complete(skb, len);
+ if (err)
+ return err;
} else if (skb->ip_summed == CHECKSUM_PARTIAL) {
int hdlen = (len > skb_headlen(skb)) ? skb_headlen(skb) : len;
int offset = skb_checksum_start_offset(skb) + skb->csum_offset;
@@ -4349,6 +4373,8 @@ onlymerged:
tgt->ip_summed = CHECKSUM_PARTIAL;
skb->ip_summed = CHECKSUM_PARTIAL;
+ skb_shinfo(tgt)->flags |= skb_shinfo(skb)->flags & SKBFL_SHARED_FRAG;
+
skb_len_add(skb, -shiftlen);
skb_len_add(tgt, shiftlen);
@@ -4959,7 +4985,8 @@ normal:
skb_copy_from_linear_data_offset(head_skb, offset,
skb_put(nskb, hsize), hsize);
- skb_shinfo(nskb)->flags |= skb_shinfo(head_skb)->flags &
+ skb_shinfo(nskb)->flags |= (skb_shinfo(head_skb)->flags |
+ skb_shinfo(frag_skb)->flags) &
SKBFL_SHARED_FRAG;
if (skb_zerocopy_clone(nskb, frag_skb, GFP_ATOMIC))
@@ -4976,6 +5003,9 @@ normal:
nfrags = skb_shinfo(list_skb)->nr_frags;
frag = skb_shinfo(list_skb)->frags;
frag_skb = list_skb;
+
+ skb_shinfo(nskb)->flags |= skb_shinfo(frag_skb)->flags & SKBFL_SHARED_FRAG;
+
if (!skb_headlen(list_skb)) {
BUG_ON(!nfrags);
} else {
@@ -6200,6 +6230,8 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from,
from_shinfo->frags,
from_shinfo->nr_frags * sizeof(skb_frag_t));
to_shinfo->nr_frags += from_shinfo->nr_frags;
+ if (from_shinfo->nr_frags)
+ to_shinfo->flags |= from_shinfo->flags & SKBFL_SHARED_FRAG;
if (!skb_cloned(from))
from_shinfo->nr_frags = 0;
@@ -6791,6 +6823,11 @@ static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off,
skb_copy_from_linear_data_offset(skb, off, data, new_hlen);
skb->len -= off;
+ /* Remove SKBFL_MANAGED_FRAG_REFS instead of trying to honour it
+ * while refcounting frags below.
+ */
+ skb_zcopy_downgrade_managed(skb);
+
memcpy((struct skb_shared_info *)(data + size),
skb_shinfo(skb),
offsetof(struct skb_shared_info,
@@ -6801,6 +6838,8 @@ static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off,
skb_kfree_head(data);
return -ENOMEM;
}
+ if (skb_zcopy(skb))
+ net_zcopy_get(skb_zcopy(skb));
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
skb_frag_ref(skb, i);
if (skb_has_frag_list(skb))
@@ -6902,6 +6941,11 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off,
return -ENOMEM;
size = SKB_WITH_OVERHEAD(size);
+ /* Remove SKBFL_MANAGED_FRAG_REFS instead of trying to honour it
+ * while refcounting frags below.
+ */
+ skb_zcopy_downgrade_managed(skb);
+
memcpy((struct skb_shared_info *)(data + size),
skb_shinfo(skb), offsetof(struct skb_shared_info, frags[0]));
if (skb_orphan_frags(skb, gfp_mask)) {
@@ -6944,6 +6988,8 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off,
skb_kfree_head(data);
return -ENOMEM;
}
+ if (skb_zcopy(skb))
+ net_zcopy_get(skb_zcopy(skb));
skb_release_data(skb, SKB_CONSUMED);
skb->head = data;
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index 6187a83bd741..e1850caf1a71 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -1268,12 +1268,19 @@ out:
static void sk_psock_verdict_data_ready(struct sock *sk)
{
const struct proto_ops *ops = NULL;
+ struct sk_psock *psock;
struct socket *sock;
int copied;
trace_sk_data_ready(sk);
rcu_read_lock();
+ psock = sk_psock(sk);
+ if (psock && tls_sw_has_ctx_rx(sk)) {
+ psock->saved_data_ready(sk);
+ rcu_read_unlock();
+ return;
+ }
sock = READ_ONCE(sk->sk_socket);
if (likely(sock))
ops = READ_ONCE(sock->ops);
@@ -1283,8 +1290,6 @@ static void sk_psock_verdict_data_ready(struct sock *sk)
copied = ops->read_skb(sk, sk_psock_verdict_recv);
if (copied >= 0) {
- struct sk_psock *psock;
-
rcu_read_lock();
psock = sk_psock(sk);
if (psock)
diff --git a/net/core/sock.c b/net/core/sock.c
index b37b664b6eb9..d097025c116a 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2676,8 +2676,12 @@ void sock_wfree(struct sk_buff *skb)
int old;
if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
+ void (*sk_write_space)(struct sock *sk);
+
+ sk_write_space = READ_ONCE(sk->sk_write_space);
+
if (sock_flag(sk, SOCK_RCU_FREE) &&
- sk->sk_write_space == sock_def_write_space) {
+ sk_write_space == sock_def_write_space) {
rcu_read_lock();
free = __refcount_sub_and_test(len, &sk->sk_wmem_alloc,
&old);
@@ -2693,7 +2697,7 @@ void sock_wfree(struct sk_buff *skb)
* after sk_write_space() call
*/
WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
- sk->sk_write_space(sk);
+ sk_write_space(sk);
len = 1;
}
/*
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index 02a68be3002a..99e3789492a0 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -1630,18 +1630,23 @@ void sock_map_unhash(struct sock *sk)
void (*saved_unhash)(struct sock *sk);
struct sk_psock *psock;
+retry:
rcu_read_lock();
psock = sk_psock(sk);
if (unlikely(!psock)) {
rcu_read_unlock();
saved_unhash = READ_ONCE(sk->sk_prot)->unhash;
+ if (unlikely(saved_unhash == sock_map_unhash))
+ goto retry;
} else {
saved_unhash = psock->saved_unhash;
sock_map_remove_links(sk, psock);
rcu_read_unlock();
+
+ if (WARN_ON_ONCE(saved_unhash == sock_map_unhash))
+ return;
}
- if (WARN_ON_ONCE(saved_unhash == sock_map_unhash))
- return;
+
if (saved_unhash)
saved_unhash(sk);
}
@@ -1652,20 +1657,25 @@ void sock_map_destroy(struct sock *sk)
void (*saved_destroy)(struct sock *sk);
struct sk_psock *psock;
+retry:
rcu_read_lock();
psock = sk_psock_get(sk);
if (unlikely(!psock)) {
rcu_read_unlock();
saved_destroy = READ_ONCE(sk->sk_prot)->destroy;
+ if (unlikely(saved_destroy == sock_map_destroy))
+ goto retry;
} else {
saved_destroy = psock->saved_destroy;
sock_map_remove_links(sk, psock);
rcu_read_unlock();
sk_psock_stop(psock);
sk_psock_put(sk, psock);
+
+ if (WARN_ON_ONCE(saved_destroy == sock_map_destroy))
+ return;
}
- if (WARN_ON_ONCE(saved_destroy == sock_map_destroy))
- return;
+
if (saved_destroy)
saved_destroy(sk);
}
@@ -1676,32 +1686,33 @@ void sock_map_close(struct sock *sk, long timeout)
void (*saved_close)(struct sock *sk, long timeout);
struct sk_psock *psock;
+retry:
lock_sock(sk);
rcu_read_lock();
- psock = sk_psock(sk);
+ psock = sk_psock_get(sk);
if (likely(psock)) {
saved_close = psock->saved_close;
sock_map_remove_links(sk, psock);
- psock = sk_psock_get(sk);
- if (unlikely(!psock))
- goto no_psock;
rcu_read_unlock();
sk_psock_stop(psock);
release_sock(sk);
cancel_delayed_work_sync(&psock->work);
sk_psock_put(sk, psock);
+
+ /* Make sure we do not recurse. This is a bug.
+ * Leak the socket instead of crashing on a stack overflow.
+ */
+ if (WARN_ON_ONCE(saved_close == sock_map_close))
+ return;
} else {
saved_close = READ_ONCE(sk->sk_prot)->close;
-no_psock:
rcu_read_unlock();
release_sock(sk);
+
+ if (unlikely(saved_close == sock_map_close))
+ goto retry;
}
- /* Make sure we do not recurse. This is a bug.
- * Leak the socket instead of crashing on a stack overflow.
- */
- if (WARN_ON_ONCE(saved_close == sock_map_close))
- return;
saved_close(sk, timeout);
}
EXPORT_SYMBOL_GPL(sock_map_close);
diff --git a/net/devlink/core.c b/net/devlink/core.c
index eeb6a71f5f56..fe9f6a0a67d5 100644
--- a/net/devlink/core.c
+++ b/net/devlink/core.c
@@ -518,6 +518,8 @@ void devlink_free(struct devlink *devlink)
{
ASSERT_DEVLINK_NOT_REGISTERED(devlink);
+ devlink_rel_put(devlink);
+
WARN_ON(!list_empty(&devlink->trap_policer_list));
WARN_ON(!list_empty(&devlink->trap_group_list));
WARN_ON(!list_empty(&devlink->trap_list));
diff --git a/net/ethtool/bitset.c b/net/ethtool/bitset.c
index 8bb98d3ea3db..a3a2cc6480a0 100644
--- a/net/ethtool/bitset.c
+++ b/net/ethtool/bitset.c
@@ -92,7 +92,7 @@ static bool ethnl_bitmap32_not_zero(const u32 *map, unsigned int start,
u32 mask;
if (end <= start)
- return true;
+ return false;
if (start % 32) {
mask = ethnl_upper_bits(start);
@@ -105,11 +105,11 @@ static bool ethnl_bitmap32_not_zero(const u32 *map, unsigned int start,
start_word++;
}
- if (!memchr_inv(map + start_word, '\0',
- (end_word - start_word) * sizeof(u32)))
+ if (memchr_inv(map + start_word, '\0',
+ (end_word - start_word) * sizeof(u32)))
return true;
if (end % 32 == 0)
- return true;
+ return false;
return map[end_word] & ethnl_lower_bits(end);
}
diff --git a/net/ethtool/cmis.h b/net/ethtool/cmis.h
index 4a9a946cabf0..778783a0f23c 100644
--- a/net/ethtool/cmis.h
+++ b/net/ethtool/cmis.h
@@ -63,9 +63,9 @@ struct ethtool_cmis_cdb_request {
* struct ethtool_cmis_cdb_cmd_args - CDB commands execution arguments
* @req: CDB command fields as described in the CMIS standard.
* @max_duration: Maximum duration time for command completion in msec.
+ * @msleep_pre_rpl: Waiting time before checking reply in msec.
* @read_write_len_ext: Allowable additional number of byte octets to the LPL
* in a READ or a WRITE commands.
- * @msleep_pre_rpl: Waiting time before checking reply in msec.
* @rpl_exp_len: Expected reply length in bytes.
* @flags: Validation flags for CDB commands.
* @err_msg: Error message to be sent to user space.
@@ -73,8 +73,8 @@ struct ethtool_cmis_cdb_request {
struct ethtool_cmis_cdb_cmd_args {
struct ethtool_cmis_cdb_request req;
u16 max_duration;
+ u16 msleep_pre_rpl;
u8 read_write_len_ext;
- u8 msleep_pre_rpl;
u8 rpl_exp_len;
u8 flags;
char *err_msg;
diff --git a/net/ethtool/cmis_cdb.c b/net/ethtool/cmis_cdb.c
index 3670ca42dd40..f3a53a984460 100644
--- a/net/ethtool/cmis_cdb.c
+++ b/net/ethtool/cmis_cdb.c
@@ -513,8 +513,13 @@ static int cmis_cdb_process_reply(struct net_device *dev,
}
rpl = (struct ethtool_cmis_cdb_rpl *)page_data->data;
- if ((args->rpl_exp_len > rpl->hdr.rpl_len + rpl_hdr_len) ||
- !rpl->hdr.rpl_chk_code) {
+ if (rpl->hdr.rpl_len != args->rpl_exp_len) {
+ netdev_warn(dev, "CDB reply length mismatch, expected %u got %u\n",
+ args->rpl_exp_len, rpl->hdr.rpl_len);
+ err = -EIO;
+ goto out;
+ }
+ if (!rpl->hdr.rpl_chk_code) {
err = -EIO;
goto out;
}
diff --git a/net/ethtool/cmis_fw_update.c b/net/ethtool/cmis_fw_update.c
index df5f344209c4..291d04d2776a 100644
--- a/net/ethtool/cmis_fw_update.c
+++ b/net/ethtool/cmis_fw_update.c
@@ -44,6 +44,20 @@ enum cmis_cdb_fw_write_mechanism {
CMIS_CDB_FW_WRITE_MECHANISM_BOTH = 0x11,
};
+/* See section 9.7.2 "CMD 0101h: Start Firmware Download" in CMIS standard
+ * revision 5.2.
+ * struct cmis_cdb_start_fw_download_pl is a structured layout of the
+ * flat array, ethtool_cmis_cdb_request::payload.
+ */
+struct cmis_cdb_start_fw_download_pl {
+ __struct_group(cmis_cdb_start_fw_download_pl_h, head, /* no attrs */,
+ __be32 image_size;
+ __be32 resv1;
+ );
+ u8 vendor_data[ETHTOOL_CMIS_CDB_LPL_MAX_PL_LENGTH -
+ sizeof(struct cmis_cdb_start_fw_download_pl_h)];
+};
+
static int
cmis_fw_update_fw_mng_features_get(struct ethtool_cmis_cdb *cdb,
struct net_device *dev,
@@ -86,6 +100,14 @@ cmis_fw_update_fw_mng_features_get(struct ethtool_cmis_cdb *cdb,
*/
cdb->read_write_len_ext = rpl->read_write_len_ext;
fw_mng->start_cmd_payload_size = rpl->start_cmd_payload_size;
+ if (fw_mng->start_cmd_payload_size >
+ sizeof_field(struct cmis_cdb_start_fw_download_pl, vendor_data)) {
+ ethnl_module_fw_flash_ntf_err(dev, ntf_params,
+ "Start cmd payload size exceeds max LPL payload",
+ NULL);
+ return -EINVAL;
+ }
+
fw_mng->write_mechanism =
rpl->write_mechanism == CMIS_CDB_FW_WRITE_MECHANISM_LPL ?
CMIS_CDB_FW_WRITE_MECHANISM_LPL :
@@ -97,20 +119,6 @@ cmis_fw_update_fw_mng_features_get(struct ethtool_cmis_cdb *cdb,
return 0;
}
-/* See section 9.7.2 "CMD 0101h: Start Firmware Download" in CMIS standard
- * revision 5.2.
- * struct cmis_cdb_start_fw_download_pl is a structured layout of the
- * flat array, ethtool_cmis_cdb_request::payload.
- */
-struct cmis_cdb_start_fw_download_pl {
- __struct_group(cmis_cdb_start_fw_download_pl_h, head, /* no attrs */,
- __be32 image_size;
- __be32 resv1;
- );
- u8 vendor_data[ETHTOOL_CMIS_CDB_LPL_MAX_PL_LENGTH -
- sizeof(struct cmis_cdb_start_fw_download_pl_h)];
-};
-
static int
cmis_fw_update_start_download(struct ethtool_cmis_cdb *cdb,
struct ethtool_cmis_fw_update_params *fw_update,
@@ -122,6 +130,14 @@ cmis_fw_update_start_download(struct ethtool_cmis_cdb *cdb,
u8 lpl_len;
int err;
+ if (fw_update->fw->size < vendor_data_size) {
+ ethnl_module_fw_flash_ntf_err(fw_update->dev,
+ &fw_update->ntf_params,
+ "Firmware image too small for module's start payload",
+ NULL);
+ return -EINVAL;
+ }
+
pl.image_size = cpu_to_be32(fw_update->fw->size);
memcpy(pl.vendor_data, fw_update->fw->data, vendor_data_size);
diff --git a/net/ethtool/coalesce.c b/net/ethtool/coalesce.c
index 1e2c5c7048a8..e73fc3e5a02b 100644
--- a/net/ethtool/coalesce.c
+++ b/net/ethtool/coalesce.c
@@ -472,6 +472,12 @@ static int ethnl_update_profile(struct net_device *dev,
nla_for_each_nested_type(nest, ETHTOOL_A_PROFILE_IRQ_MODERATION,
nests, rem) {
+ if (i >= NET_DIM_PARAMS_NUM_PROFILES) {
+ NL_SET_BAD_ATTR(extack, nest);
+ ret = -E2BIG;
+ goto err_out;
+ }
+
ret = nla_parse_nested(tb, len_irq_moder - 1, nest,
coalesce_irq_moderation_policy,
extack);
diff --git a/net/ethtool/eeprom.c b/net/ethtool/eeprom.c
index a557e3996c85..0b8cfeddb014 100644
--- a/net/ethtool/eeprom.c
+++ b/net/ethtool/eeprom.c
@@ -44,6 +44,9 @@ static int fallback_set_params(struct eeprom_req_info *request,
if (offset >= modinfo->eeprom_len)
return -EINVAL;
+ if (length > modinfo->eeprom_len - offset)
+ return -EINVAL;
+
eeprom->cmd = ETHTOOL_GMODULEEEPROM;
eeprom->len = length;
eeprom->offset = offset;
@@ -69,7 +72,7 @@ static int eeprom_fallback(struct eeprom_req_info *request,
if (err < 0)
return err;
- data = kmalloc(eeprom.len, GFP_KERNEL);
+ data = kzalloc(eeprom.len, GFP_KERNEL);
if (!data)
return -ENOMEM;
err = ethtool_get_module_eeprom_call(dev, &eeprom, data);
@@ -141,12 +144,11 @@ static int eeprom_prepare_data(const struct ethnl_req_info *req_base,
return 0;
err_ops:
+ if (ret == -EOPNOTSUPP)
+ ret = eeprom_fallback(request, reply);
ethnl_ops_complete(dev);
err_free:
kfree(page_data.data);
-
- if (ret == -EOPNOTSUPP)
- return eeprom_fallback(request, reply);
return ret;
}
diff --git a/net/ethtool/linkstate.c b/net/ethtool/linkstate.c
index 8a5985fd7712..24569e92942c 100644
--- a/net/ethtool/linkstate.c
+++ b/net/ethtool/linkstate.c
@@ -106,10 +106,8 @@ static int linkstate_prepare_data(const struct ethnl_req_info *req_base,
phydev = ethnl_req_get_phydev(req_base, tb, ETHTOOL_A_LINKSTATE_HEADER,
info->extack);
- if (IS_ERR(phydev)) {
- ret = PTR_ERR(phydev);
- goto out;
- }
+ if (IS_ERR(phydev))
+ return PTR_ERR(phydev);
ret = ethnl_ops_begin(dev);
if (ret < 0)
diff --git a/net/ethtool/module.c b/net/ethtool/module.c
index cad2eb25b5a4..ea4fb2a76650 100644
--- a/net/ethtool/module.c
+++ b/net/ethtool/module.c
@@ -120,12 +120,6 @@ ethnl_set_module_validate(struct ethnl_req_info *req_info,
if (!tb[ETHTOOL_A_MODULE_POWER_MODE_POLICY])
return 0;
- if (req_info->dev->ethtool->module_fw_flash_in_progress) {
- NL_SET_ERR_MSG(info->extack,
- "Module firmware flashing is in progress");
- return -EBUSY;
- }
-
if (!ops->get_module_power_mode || !ops->set_module_power_mode) {
NL_SET_ERR_MSG_ATTR(info->extack,
tb[ETHTOOL_A_MODULE_POWER_MODE_POLICY],
@@ -148,6 +142,12 @@ ethnl_set_module(struct ethnl_req_info *req_info, struct genl_info *info)
ops = dev->ethtool_ops;
+ if (dev->ethtool->module_fw_flash_in_progress) {
+ NL_SET_ERR_MSG(info->extack,
+ "Module firmware flashing is in progress");
+ return -EBUSY;
+ }
+
power_new.policy = nla_get_u8(tb[ETHTOOL_A_MODULE_POWER_MODE_POLICY]);
ret = ops->get_module_power_mode(dev, &power, info->extack);
if (ret < 0)
@@ -221,14 +221,22 @@ static void module_flash_fw_work_list_del(struct list_head *list)
static void module_flash_fw_work(struct work_struct *work)
{
struct ethtool_module_fw_flash *module_fw;
+ struct net_device *dev;
module_fw = container_of(work, struct ethtool_module_fw_flash, work);
+ dev = module_fw->fw_update.dev;
ethtool_cmis_fw_update(&module_fw->fw_update);
module_flash_fw_work_list_del(&module_fw->list);
- module_fw->fw_update.dev->ethtool->module_fw_flash_in_progress = false;
- netdev_put(module_fw->fw_update.dev, &module_fw->dev_tracker);
+
+ rtnl_lock();
+ netdev_lock_ops(dev);
+ dev->ethtool->module_fw_flash_in_progress = false;
+ netdev_unlock_ops(dev);
+ rtnl_unlock();
+
+ netdev_put(dev, &module_fw->dev_tracker);
release_firmware(module_fw->fw_update.fw);
kfree(module_fw);
}
@@ -283,11 +291,9 @@ void ethnl_module_fw_flash_sock_destroy(struct ethnl_sock_priv *sk_priv)
spin_lock(&module_fw_flash_work_list_lock);
list_for_each_entry(work, &module_fw_flash_work_list, list) {
- if (work->fw_update.dev == sk_priv->dev &&
- work->fw_update.ntf_params.portid == sk_priv->portid) {
+ if (work->fw_update.ntf_params.portid == sk_priv->portid &&
+ dev_net(work->fw_update.dev) == sk_priv->net)
work->fw_update.ntf_params.closed_sock = true;
- break;
- }
}
spin_unlock(&module_fw_flash_work_list_lock);
}
@@ -319,14 +325,13 @@ module_flash_fw_schedule(struct net_device *dev, const char *file_name,
if (err < 0)
goto err_release_firmware;
- dev->ethtool->module_fw_flash_in_progress = true;
- netdev_hold(dev, &module_fw->dev_tracker, GFP_KERNEL);
fw_update->dev = dev;
fw_update->ntf_params.portid = info->snd_portid;
fw_update->ntf_params.seq = info->snd_seq;
fw_update->ntf_params.closed_sock = false;
- err = ethnl_sock_priv_set(skb, dev, fw_update->ntf_params.portid,
+ err = ethnl_sock_priv_set(skb, dev_net(dev),
+ fw_update->ntf_params.portid,
ETHTOOL_SOCK_TYPE_MODULE_FW_FLASH);
if (err < 0)
goto err_release_firmware;
@@ -335,6 +340,9 @@ module_flash_fw_schedule(struct net_device *dev, const char *file_name,
if (err < 0)
goto err_release_firmware;
+ dev->ethtool->module_fw_flash_in_progress = true;
+ netdev_hold(dev, &module_fw->dev_tracker, GFP_KERNEL);
+
schedule_work(&module_fw->work);
return 0;
@@ -427,10 +435,11 @@ int ethnl_act_module_fw_flash(struct sk_buff *skb, struct genl_info *info)
ret = ethnl_module_fw_flash_validate(dev, info->extack);
if (ret < 0)
- goto out_unlock;
+ goto out_complete;
ret = module_flash_fw(dev, tb, skb, info);
+out_complete:
ethnl_ops_complete(dev);
out_unlock:
diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c
index 5046023a30b1..7d45f9a884e5 100644
--- a/net/ethtool/netlink.c
+++ b/net/ethtool/netlink.c
@@ -53,7 +53,7 @@ const struct nla_policy ethnl_header_policy_phy_stats[] = {
[ETHTOOL_A_HEADER_PHY_INDEX] = NLA_POLICY_MIN(NLA_U32, 1),
};
-int ethnl_sock_priv_set(struct sk_buff *skb, struct net_device *dev, u32 portid,
+int ethnl_sock_priv_set(struct sk_buff *skb, struct net *net, u32 portid,
enum ethnl_sock_type type)
{
struct ethnl_sock_priv *sk_priv;
@@ -62,7 +62,7 @@ int ethnl_sock_priv_set(struct sk_buff *skb, struct net_device *dev, u32 portid,
if (IS_ERR(sk_priv))
return PTR_ERR(sk_priv);
- sk_priv->dev = dev;
+ sk_priv->net = net;
sk_priv->portid = portid;
sk_priv->type = type;
diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h
index aaf6f2468768..fd2198e45d2b 100644
--- a/net/ethtool/netlink.h
+++ b/net/ethtool/netlink.h
@@ -318,12 +318,12 @@ enum ethnl_sock_type {
};
struct ethnl_sock_priv {
- struct net_device *dev;
+ struct net *net;
u32 portid;
enum ethnl_sock_type type;
};
-int ethnl_sock_priv_set(struct sk_buff *skb, struct net_device *dev, u32 portid,
+int ethnl_sock_priv_set(struct sk_buff *skb, struct net *net, u32 portid,
enum ethnl_sock_type type);
/**
diff --git a/net/ethtool/phy.c b/net/ethtool/phy.c
index d4e6887055ab..ddc6eab701ed 100644
--- a/net/ethtool/phy.c
+++ b/net/ethtool/phy.c
@@ -76,6 +76,7 @@ static int phy_prepare_data(const struct ethnl_req_info *req_info,
struct nlattr **tb = info->attrs;
struct phy_device_node *pdn;
struct phy_device *phydev;
+ int ret;
/* RTNL is held by the caller */
phydev = ethnl_req_get_phydev(req_info, tb, ETHTOOL_A_PHY_HEADER,
@@ -88,8 +89,19 @@ static int phy_prepare_data(const struct ethnl_req_info *req_info,
return -EOPNOTSUPP;
rep_data->phyindex = phydev->phyindex;
+
rep_data->name = kstrdup(dev_name(&phydev->mdio.dev), GFP_KERNEL);
- rep_data->drvname = kstrdup(phydev->drv->name, GFP_KERNEL);
+ if (!rep_data->name)
+ return -ENOMEM;
+
+ if (phydev->drv) {
+ rep_data->drvname = kstrdup(phydev->drv->name, GFP_KERNEL);
+ if (!rep_data->drvname) {
+ ret = -ENOMEM;
+ goto err_free_name;
+ }
+ }
+
rep_data->upstream_type = pdn->upstream_type;
if (pdn->upstream_type == PHY_UPSTREAM_PHY) {
@@ -97,15 +109,33 @@ static int phy_prepare_data(const struct ethnl_req_info *req_info,
rep_data->upstream_index = upstream->phyindex;
}
- if (pdn->parent_sfp_bus)
+ if (pdn->parent_sfp_bus) {
rep_data->upstream_sfp_name = kstrdup(sfp_get_name(pdn->parent_sfp_bus),
GFP_KERNEL);
+ if (!rep_data->upstream_sfp_name) {
+ ret = -ENOMEM;
+ goto err_free_drvname;
+ }
+ }
- if (phydev->sfp_bus)
+ if (phydev->sfp_bus) {
rep_data->downstream_sfp_name = kstrdup(sfp_get_name(phydev->sfp_bus),
GFP_KERNEL);
+ if (!rep_data->downstream_sfp_name) {
+ ret = -ENOMEM;
+ goto err_free_upstream_sfp;
+ }
+ }
return 0;
+
+err_free_upstream_sfp:
+ kfree(rep_data->upstream_sfp_name);
+err_free_drvname:
+ kfree(rep_data->drvname);
+err_free_name:
+ kfree(rep_data->name);
+ return ret;
}
static int phy_fill_reply(struct sk_buff *skb,
diff --git a/net/ethtool/pse-pd.c b/net/ethtool/pse-pd.c
index 2eb9bdc2dcb9..757c9e0cc856 100644
--- a/net/ethtool/pse-pd.c
+++ b/net/ethtool/pse-pd.c
@@ -62,14 +62,14 @@ static int pse_prepare_data(const struct ethnl_req_info *req_base,
struct phy_device *phydev;
int ret;
- ret = ethnl_ops_begin(dev);
- if (ret < 0)
- return ret;
-
phydev = ethnl_req_get_phydev(req_base, tb, ETHTOOL_A_PSE_HEADER,
info->extack);
if (IS_ERR(phydev))
- return -ENODEV;
+ return PTR_ERR(phydev);
+
+ ret = ethnl_ops_begin(dev);
+ if (ret < 0)
+ return ret;
ret = pse_get_pse_attributes(phydev, info->extack, data);
diff --git a/net/ethtool/rss.c b/net/ethtool/rss.c
index 353110b862ab..53792f53f922 100644
--- a/net/ethtool/rss.c
+++ b/net/ethtool/rss.c
@@ -134,8 +134,7 @@ rss_get_data_alloc(struct net_device *dev, struct rss_reply_data *data)
if (!rss_config)
return -ENOMEM;
- if (data->indir_size)
- data->indir_table = (u32 *)rss_config;
+ data->indir_table = (u32 *)rss_config;
if (data->hkey_size)
data->hkey = rss_config + indir_bytes;
@@ -170,8 +169,10 @@ rss_prepare_get(const struct rss_req_info *request, struct net_device *dev,
rxfh.key = data->hkey;
ret = ops->get_rxfh(dev, &rxfh);
- if (ret)
+ if (ret) {
+ rss_get_data_free(data);
goto out_unlock;
+ }
data->hfunc = rxfh.hfunc;
data->input_xfrm = rxfh.input_xfrm;
@@ -686,7 +687,7 @@ rss_set_prep_indir(struct net_device *dev, struct genl_info *info,
ethtool_rxfh_indir_default(i, num_rx_rings);
}
- *mod |= memcmp(rxfh->indir, data->indir_table, data->indir_size);
+ *mod |= memcmp(rxfh->indir, data->indir_table, alloc_size);
return user_size;
@@ -981,11 +982,17 @@ ethnl_rss_create_validate(struct net_device *dev, struct genl_info *info)
}
static void
-ethnl_rss_create_send_ntf(struct sk_buff *rsp, struct net_device *dev)
+ethnl_rss_create_send_ntf(const struct sk_buff *rsp, struct net_device *dev)
{
- struct nlmsghdr *nlh = (void *)rsp->data;
struct genlmsghdr *genl_hdr;
+ struct nlmsghdr *nlh;
+ struct sk_buff *ntf;
+
+ ntf = skb_copy_expand(rsp, 0, 0, GFP_KERNEL);
+ if (!ntf)
+ return;
+ nlh = nlmsg_hdr(ntf);
/* Convert the reply into a notification */
nlh->nlmsg_pid = 0;
nlh->nlmsg_seq = ethnl_bcast_seq_next();
@@ -993,7 +1000,7 @@ ethnl_rss_create_send_ntf(struct sk_buff *rsp, struct net_device *dev)
genl_hdr = nlmsg_data(nlh);
genl_hdr->cmd = ETHTOOL_MSG_RSS_CREATE_NTF;
- ethnl_multicast(rsp, dev);
+ ethnl_multicast(ntf, dev);
}
int ethnl_rss_create_doit(struct sk_buff *skb, struct genl_info *info)
@@ -1099,17 +1106,13 @@ int ethnl_rss_create_doit(struct sk_buff *skb, struct genl_info *info)
ntf_fail |= rss_fill_reply(rsp, &req.base, &data.base);
if (WARN_ON(!hdr || ntf_fail)) {
ret = -EMSGSIZE;
- goto exit_unlock;
+ goto err_remove_ctx;
}
genlmsg_end(rsp, hdr);
- /* Use the same skb for the response and the notification,
- * genlmsg_reply() will copy the skb if it has elevated user count.
- */
- skb_get(rsp);
- ret = genlmsg_reply(rsp, info);
ethnl_rss_create_send_ntf(rsp, dev);
+ ret = genlmsg_reply(rsp, info);
rsp = NULL;
exit_unlock:
@@ -1131,6 +1134,10 @@ exit_free_rsp:
nlmsg_free(rsp);
return ret;
+err_remove_ctx:
+ if (ops->remove_rxfh_context(dev, ctx, req.rss_context, NULL))
+ /* leave the context on failure, like ethnl_rss_delete_doit() */
+ goto exit_unlock;
err_ctx_id_free:
xa_erase(&dev->ethtool->rss_ctx, req.rss_context);
err_unlock_free_ctx:
@@ -1168,8 +1175,10 @@ int ethnl_rss_delete_doit(struct sk_buff *skb, struct genl_info *info)
dev = req.dev;
ops = dev->ethtool_ops;
- if (!ops->create_rxfh_context)
+ if (!ops->create_rxfh_context) {
+ ret = -EOPNOTSUPP;
goto exit_free_dev;
+ }
rtnl_lock();
netdev_lock_ops(dev);
diff --git a/net/ethtool/strset.c b/net/ethtool/strset.c
index bb1e829ba099..94c4718d31ae 100644
--- a/net/ethtool/strset.c
+++ b/net/ethtool/strset.c
@@ -311,7 +311,7 @@ static int strset_prepare_data(const struct ethnl_req_info *req_base,
return 0;
}
- phydev = ethnl_req_get_phydev(req_base, tb, ETHTOOL_A_HEADER_FLAGS,
+ phydev = ethnl_req_get_phydev(req_base, tb, ETHTOOL_A_STRSET_HEADER,
info->extack);
/* phydev can be NULL, check for errors only */
diff --git a/net/ethtool/tsconfig.c b/net/ethtool/tsconfig.c
index e4f518e49d4c..fc4f93cfa459 100644
--- a/net/ethtool/tsconfig.c
+++ b/net/ethtool/tsconfig.c
@@ -69,8 +69,10 @@ static int tsconfig_prepare_data(const struct ethnl_req_info *req_base,
if (ret)
goto out;
- if (ts_info.phc_index == -1)
- return -ENODEV;
+ if (ts_info.phc_index == -1) {
+ ret = -ENODEV;
+ goto out;
+ }
data->hwprov_desc.index = ts_info.phc_index;
data->hwprov_desc.qualifier = ts_info.phc_qualifier;
@@ -224,16 +226,21 @@ static int tsconfig_send_reply(struct net_device *dev, struct genl_info *info)
reply_len = ret + ethnl_reply_header_size();
rskb = ethnl_reply_init(reply_len, dev, ETHTOOL_MSG_TSCONFIG_SET_REPLY,
ETHTOOL_A_TSCONFIG_HEADER, info, &reply_payload);
- if (!rskb)
+ if (!rskb) {
+ ret = -ENOMEM;
goto err_cleanup;
+ }
ret = tsconfig_fill_reply(rskb, &req_info->base, &reply_data->base);
if (ret < 0)
- goto err_cleanup;
+ goto err_free_msg;
genlmsg_end(rskb, reply_payload);
ret = genlmsg_reply(rskb, info);
+ rskb = NULL;
+err_free_msg:
+ nlmsg_free(rskb);
err_cleanup:
kfree(reply_data);
kfree(req_info);
diff --git a/net/ethtool/tsinfo.c b/net/ethtool/tsinfo.c
index a865f0fdd26b..14bf01e3b55c 100644
--- a/net/ethtool/tsinfo.c
+++ b/net/ethtool/tsinfo.c
@@ -83,6 +83,11 @@ tsinfo_parse_request(struct ethnl_req_info *req_base,
if (!tb[ETHTOOL_A_TSINFO_HWTSTAMP_PROVIDER])
return 0;
+ if (req_base->flags & ETHTOOL_FLAG_STATS) {
+ NL_SET_ERR_MSG(extack, "can't query statistics for a provider");
+ return -EOPNOTSUPP;
+ }
+
return ts_parse_hwtst_provider(tb[ETHTOOL_A_TSINFO_HWTSTAMP_PROVIDER],
&req->hwprov_desc, extack, &mod);
}
@@ -402,10 +407,8 @@ static int ethnl_tsinfo_dump_one_netdev(struct sk_buff *skb,
continue;
ehdr = ethnl_tsinfo_prepare_dump(skb, dev, reply_data, cb);
- if (IS_ERR(ehdr)) {
- ret = PTR_ERR(ehdr);
- goto err;
- }
+ if (IS_ERR(ehdr))
+ return PTR_ERR(ehdr);
reply_data->ts_info.phc_qualifier = ctx->pos_phcqualifier;
ret = ops->get_ts_info(dev, &reply_data->ts_info);
@@ -523,6 +526,12 @@ int ethnl_tsinfo_start(struct netlink_callback *cb)
if (ret < 0)
goto free_reply_data;
+ if (req_info->base.flags & ETHTOOL_FLAG_STATS) {
+ NL_SET_ERR_MSG(cb->extack, "stats not supported in dump");
+ ret = -EOPNOTSUPP;
+ goto err_dev_put;
+ }
+
ctx->req_info = req_info;
ctx->reply_data = reply_data;
ctx->pos_ifindex = 0;
@@ -532,6 +541,8 @@ int ethnl_tsinfo_start(struct netlink_callback *cb)
return 0;
+err_dev_put:
+ ethnl_parse_header_dev_put(&req_info->base);
free_reply_data:
kfree(reply_data);
free_req_info:
diff --git a/net/handshake/genl.c b/net/handshake/genl.c
index 870612609491..4b20cd9cdd0e 100644
--- a/net/handshake/genl.c
+++ b/net/handshake/genl.c
@@ -10,6 +10,7 @@
#include "genl.h"
#include <uapi/linux/handshake.h>
+#include <linux/err.h>
/* HANDSHAKE_CMD_ACCEPT - do */
static const struct nla_policy handshake_accept_nl_policy[HANDSHAKE_A_ACCEPT_HANDLER_CLASS + 1] = {
@@ -18,7 +19,7 @@ static const struct nla_policy handshake_accept_nl_policy[HANDSHAKE_A_ACCEPT_HAN
/* HANDSHAKE_CMD_DONE - do */
static const struct nla_policy handshake_done_nl_policy[HANDSHAKE_A_DONE_REMOTE_AUTH + 1] = {
- [HANDSHAKE_A_DONE_STATUS] = { .type = NLA_U32, },
+ [HANDSHAKE_A_DONE_STATUS] = NLA_POLICY_MAX(NLA_U32, MAX_ERRNO),
[HANDSHAKE_A_DONE_SOCKFD] = { .type = NLA_S32, },
[HANDSHAKE_A_DONE_REMOTE_AUTH] = { .type = NLA_U32, },
};
diff --git a/net/handshake/genl.h b/net/handshake/genl.h
index 8d3e18672daf..46b65f131669 100644
--- a/net/handshake/genl.h
+++ b/net/handshake/genl.h
@@ -11,6 +11,7 @@
#include <net/genetlink.h>
#include <uapi/linux/handshake.h>
+#include <linux/err.h>
int handshake_nl_accept_doit(struct sk_buff *skb, struct genl_info *info);
int handshake_nl_done_doit(struct sk_buff *skb, struct genl_info *info);
diff --git a/net/handshake/handshake-test.c b/net/handshake/handshake-test.c
index 55442b2f518a..3dd507470d5f 100644
--- a/net/handshake/handshake-test.c
+++ b/net/handshake/handshake-test.c
@@ -25,7 +25,7 @@ static int test_accept_func(struct handshake_req *req, struct genl_info *info,
return 0;
}
-static void test_done_func(struct handshake_req *req, unsigned int status,
+static void test_done_func(struct handshake_req *req, int status,
struct genl_info *info)
{
}
@@ -208,6 +208,7 @@ static void handshake_req_submit_test3(struct kunit *test)
static void handshake_req_submit_test4(struct kunit *test)
{
struct handshake_req *req, *result;
+ unsigned long fcount_before;
struct socket *sock;
struct file *filp;
int err;
@@ -224,8 +225,10 @@ static void handshake_req_submit_test4(struct kunit *test)
KUNIT_ASSERT_NOT_NULL(test, sock->sk);
sock->file = filp;
+ fcount_before = file_count(filp);
err = handshake_req_submit(sock, req, GFP_KERNEL);
KUNIT_ASSERT_EQ(test, err, 0);
+ KUNIT_EXPECT_EQ(test, file_count(filp), fcount_before + 1);
/* Act */
result = handshake_req_hash_lookup(sock->sk);
@@ -235,11 +238,13 @@ static void handshake_req_submit_test4(struct kunit *test)
KUNIT_EXPECT_PTR_EQ(test, req, result);
handshake_req_cancel(sock->sk);
+ KUNIT_EXPECT_EQ(test, file_count(filp), fcount_before);
fput(filp);
}
static void handshake_req_submit_test5(struct kunit *test)
{
+ unsigned long fcount_before;
struct handshake_req *req;
struct handshake_net *hn;
struct socket *sock;
@@ -265,12 +270,14 @@ static void handshake_req_submit_test5(struct kunit *test)
saved = hn->hn_pending;
hn->hn_pending = hn->hn_pending_max + 1;
+ fcount_before = file_count(filp);
/* Act */
err = handshake_req_submit(sock, req, GFP_KERNEL);
/* Assert */
KUNIT_EXPECT_EQ(test, err, -EAGAIN);
+ KUNIT_EXPECT_EQ(test, file_count(filp), fcount_before);
fput(filp);
hn->hn_pending = saved;
@@ -279,6 +286,7 @@ static void handshake_req_submit_test5(struct kunit *test)
static void handshake_req_submit_test6(struct kunit *test)
{
struct handshake_req *req1, *req2;
+ unsigned long fcount_before;
struct socket *sock;
struct file *filp;
int err;
@@ -296,21 +304,26 @@ static void handshake_req_submit_test6(struct kunit *test)
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, filp);
KUNIT_ASSERT_NOT_NULL(test, sock->sk);
sock->file = filp;
+ fcount_before = file_count(filp);
/* Act */
err = handshake_req_submit(sock, req1, GFP_KERNEL);
KUNIT_ASSERT_EQ(test, err, 0);
+ KUNIT_EXPECT_EQ(test, file_count(filp), fcount_before + 1);
err = handshake_req_submit(sock, req2, GFP_KERNEL);
/* Assert */
KUNIT_EXPECT_EQ(test, err, -EBUSY);
+ KUNIT_EXPECT_EQ(test, file_count(filp), fcount_before + 1);
handshake_req_cancel(sock->sk);
+ KUNIT_EXPECT_EQ(test, file_count(filp), fcount_before);
fput(filp);
}
static void handshake_req_cancel_test1(struct kunit *test)
{
+ unsigned long fcount_before;
struct handshake_req *req;
struct socket *sock;
struct file *filp;
@@ -329,8 +342,10 @@ static void handshake_req_cancel_test1(struct kunit *test)
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, filp);
sock->file = filp;
+ fcount_before = file_count(filp);
err = handshake_req_submit(sock, req, GFP_KERNEL);
KUNIT_ASSERT_EQ(test, err, 0);
+ KUNIT_EXPECT_EQ(test, file_count(filp), fcount_before + 1);
/* NB: handshake_req hasn't been accepted */
@@ -339,12 +354,14 @@ static void handshake_req_cancel_test1(struct kunit *test)
/* Assert */
KUNIT_EXPECT_TRUE(test, result);
+ KUNIT_EXPECT_EQ(test, file_count(filp), fcount_before);
fput(filp);
}
static void handshake_req_cancel_test2(struct kunit *test)
{
+ unsigned long fcount_before;
struct handshake_req *req, *next;
struct handshake_net *hn;
struct socket *sock;
@@ -365,8 +382,10 @@ static void handshake_req_cancel_test2(struct kunit *test)
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, filp);
sock->file = filp;
+ fcount_before = file_count(filp);
err = handshake_req_submit(sock, req, GFP_KERNEL);
KUNIT_ASSERT_EQ(test, err, 0);
+ KUNIT_EXPECT_EQ(test, file_count(filp), fcount_before + 1);
net = sock_net(sock->sk);
hn = handshake_pernet(net);
@@ -375,18 +394,24 @@ static void handshake_req_cancel_test2(struct kunit *test)
/* Pretend to accept this request */
next = handshake_req_next(hn, HANDSHAKE_HANDLER_CLASS_TLSHD);
KUNIT_ASSERT_PTR_EQ(test, req, next);
+ /* Simulate FD_PREPARE() consuming the file reference handed
+ * off by handshake_req_next(); see handshake_nl_accept_doit().
+ */
+ fput(filp);
/* Act */
result = handshake_req_cancel(sock->sk);
/* Assert */
KUNIT_EXPECT_TRUE(test, result);
+ KUNIT_EXPECT_EQ(test, file_count(filp), fcount_before);
fput(filp);
}
static void handshake_req_cancel_test3(struct kunit *test)
{
+ unsigned long fcount_before;
struct handshake_req *req, *next;
struct handshake_net *hn;
struct socket *sock;
@@ -407,8 +432,10 @@ static void handshake_req_cancel_test3(struct kunit *test)
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, filp);
sock->file = filp;
+ fcount_before = file_count(filp);
err = handshake_req_submit(sock, req, GFP_KERNEL);
KUNIT_ASSERT_EQ(test, err, 0);
+ KUNIT_EXPECT_EQ(test, file_count(filp), fcount_before + 1);
net = sock_net(sock->sk);
hn = handshake_pernet(net);
@@ -417,15 +444,21 @@ static void handshake_req_cancel_test3(struct kunit *test)
/* Pretend to accept this request */
next = handshake_req_next(hn, HANDSHAKE_HANDLER_CLASS_TLSHD);
KUNIT_ASSERT_PTR_EQ(test, req, next);
+ /* Simulate FD_PREPARE() consuming the file reference handed
+ * off by handshake_req_next(); see handshake_nl_accept_doit().
+ */
+ fput(filp);
/* Pretend to complete this request */
handshake_complete(next, -ETIMEDOUT, NULL);
+ KUNIT_EXPECT_EQ(test, file_count(filp), fcount_before);
/* Act */
result = handshake_req_cancel(sock->sk);
/* Assert */
KUNIT_EXPECT_FALSE(test, result);
+ KUNIT_EXPECT_EQ(test, file_count(filp), fcount_before);
fput(filp);
}
@@ -446,6 +479,7 @@ static struct handshake_proto handshake_req_alloc_proto_destroy = {
static void handshake_req_destroy_test1(struct kunit *test)
{
+ unsigned long fcount_before;
struct handshake_req *req;
struct socket *sock;
struct file *filp;
@@ -465,10 +499,12 @@ static void handshake_req_destroy_test1(struct kunit *test)
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, filp);
sock->file = filp;
+ fcount_before = file_count(filp);
err = handshake_req_submit(sock, req, GFP_KERNEL);
KUNIT_ASSERT_EQ(test, err, 0);
handshake_req_cancel(sock->sk);
+ KUNIT_EXPECT_EQ(test, file_count(filp), fcount_before);
/* Act */
/* Ensure the close/release/put process has run to
diff --git a/net/handshake/handshake.h b/net/handshake/handshake.h
index a48163765a7a..da61cadd1ad3 100644
--- a/net/handshake/handshake.h
+++ b/net/handshake/handshake.h
@@ -24,6 +24,7 @@ enum hn_flags_bits {
HANDSHAKE_F_NET_DRAINING,
};
+struct file;
struct handshake_proto;
/* One handshake request */
@@ -32,6 +33,7 @@ struct handshake_req {
struct rhash_head hr_rhash;
unsigned long hr_flags;
const struct handshake_proto *hr_proto;
+ struct file *hr_file;
struct sock *hr_sk;
void (*hr_odestruct)(struct sock *sk);
@@ -57,7 +59,7 @@ struct handshake_proto {
int (*hp_accept)(struct handshake_req *req,
struct genl_info *info, int fd);
void (*hp_done)(struct handshake_req *req,
- unsigned int status,
+ int status,
struct genl_info *info);
void (*hp_destroy)(struct handshake_req *req);
};
@@ -86,7 +88,7 @@ struct handshake_req *handshake_req_hash_lookup(struct sock *sk);
struct handshake_req *handshake_req_next(struct handshake_net *hn, int class);
int handshake_req_submit(struct socket *sock, struct handshake_req *req,
gfp_t flags);
-void handshake_complete(struct handshake_req *req, unsigned int status,
+void handshake_complete(struct handshake_req *req, int status,
struct genl_info *info);
bool handshake_req_cancel(struct sock *sk);
diff --git a/net/handshake/netlink.c b/net/handshake/netlink.c
index b989456fc4c5..3fd4fef9bab1 100644
--- a/net/handshake/netlink.c
+++ b/net/handshake/netlink.c
@@ -92,7 +92,6 @@ int handshake_nl_accept_doit(struct sk_buff *skb, struct genl_info *info)
struct net *net = sock_net(skb->sk);
struct handshake_net *hn = handshake_pernet(net);
struct handshake_req *req = NULL;
- struct socket *sock;
int class, err;
err = -EOPNOTSUPP;
@@ -107,15 +106,13 @@ int handshake_nl_accept_doit(struct sk_buff *skb, struct genl_info *info)
err = -EAGAIN;
req = handshake_req_next(hn, class);
if (req) {
- sock = req->hr_sk->sk_socket;
-
- FD_PREPARE(fdf, O_CLOEXEC, sock->file);
+ FD_PREPARE(fdf, O_CLOEXEC, req->hr_file);
if (fdf.err) {
+ fput(req->hr_file); /* drop ref from handshake_req_next() */
err = fdf.err;
goto out_complete;
}
- get_file(sock->file); /* FD_PREPARE() consumes a reference. */
err = req->hr_proto->hp_accept(req, info, fd_prepare_fd(fdf));
if (err)
goto out_complete; /* Automatic cleanup handles fput */
@@ -160,7 +157,7 @@ int handshake_nl_done_doit(struct sk_buff *skb, struct genl_info *info)
status = -EIO;
if (info->attrs[HANDSHAKE_A_DONE_STATUS])
- status = nla_get_u32(info->attrs[HANDSHAKE_A_DONE_STATUS]);
+ status = -(int)nla_get_u32(info->attrs[HANDSHAKE_A_DONE_STATUS]);
handshake_complete(req, status, info);
sockfd_put(sock);
@@ -202,21 +199,21 @@ static void __net_exit handshake_net_exit(struct net *net)
* accepted and are in progress will be destroyed when
* the socket is closed.
*/
- spin_lock(&hn->hn_lock);
+ spin_lock_bh(&hn->hn_lock);
set_bit(HANDSHAKE_F_NET_DRAINING, &hn->hn_flags);
- list_splice_init(&requests, &hn->hn_requests);
- spin_unlock(&hn->hn_lock);
+ list_splice_init(&hn->hn_requests, &requests);
+ list_for_each_entry(req, &requests, hr_list)
+ get_file(req->hr_file);
+ spin_unlock_bh(&hn->hn_lock);
while (!list_empty(&requests)) {
- req = list_first_entry(&requests, struct handshake_req, hr_list);
- list_del(&req->hr_list);
-
- /*
- * Requests on this list have not yet been
- * accepted, so they do not have an fd to put.
- */
+ struct file *file;
+ req = list_first_entry(&requests, struct handshake_req, hr_list);
+ file = req->hr_file;
+ list_del_init(&req->hr_list);
handshake_complete(req, -ETIMEDOUT, NULL);
+ fput(file);
}
}
diff --git a/net/handshake/request.c b/net/handshake/request.c
index 2829adbeb149..cd30d54d0501 100644
--- a/net/handshake/request.c
+++ b/net/handshake/request.c
@@ -13,6 +13,7 @@
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/inet.h>
+#include <linux/file.h>
#include <linux/rhashtable.h>
#include <net/sock.h>
@@ -162,35 +163,56 @@ static void __remove_pending_locked(struct handshake_net *hn,
* otherwise %false.
*
* If @req was on a pending list, it has not yet been accepted.
+ * Returns %false when the net namespace is draining; the drain
+ * loop has taken ownership of the pending list.
*/
static bool remove_pending(struct handshake_net *hn, struct handshake_req *req)
{
bool ret = false;
- spin_lock(&hn->hn_lock);
- if (!list_empty(&req->hr_list)) {
+ spin_lock_bh(&hn->hn_lock);
+ if (!test_bit(HANDSHAKE_F_NET_DRAINING, &hn->hn_flags) &&
+ !list_empty(&req->hr_list)) {
__remove_pending_locked(hn, req);
ret = true;
}
- spin_unlock(&hn->hn_lock);
+ spin_unlock_bh(&hn->hn_lock);
return ret;
}
+/**
+ * handshake_req_next - Return the next queued handshake request
+ * @hn: per-net handshake state
+ * @class: handler class to match
+ *
+ * On a non-NULL return, the caller owns an extra reference
+ * on @req->hr_file. FD_PREPARE() consumes it on success; on
+ * the FD_PREPARE() failure path the caller must fput() it.
+ *
+ * Return: pointer to a removed handshake_req, or NULL.
+ */
struct handshake_req *handshake_req_next(struct handshake_net *hn, int class)
{
struct handshake_req *req, *pos;
req = NULL;
- spin_lock(&hn->hn_lock);
+ spin_lock_bh(&hn->hn_lock);
list_for_each_entry(pos, &hn->hn_requests, hr_list) {
if (pos->hr_proto->hp_handler_class != class)
continue;
__remove_pending_locked(hn, pos);
+ /* Hand off a file reference to the accept side under
+ * hn_lock. A concurrent handshake_req_cancel() can drop
+ * hr_file before accept reaches FD_PREPARE(); this extra
+ * reference keeps the file alive until FD_PREPARE() takes
+ * ownership.
+ */
+ get_file(pos->hr_file);
req = pos;
break;
}
- spin_unlock(&hn->hn_lock);
+ spin_unlock_bh(&hn->hn_lock);
return req;
}
@@ -215,9 +237,16 @@ EXPORT_SYMBOL_IF_KUNIT(handshake_req_next);
* A zero return value from handshake_req_submit() means that
* exactly one subsequent completion callback is guaranteed.
*
- * A negative return value from handshake_req_submit() means that
- * no completion callback will be done and that @req has been
- * destroyed.
+ * A negative return value from handshake_req_submit() guarantees that
+ * no completion callback will occur and that @req is no longer owned by
+ * the caller. If cancellation wins the completion race after the request
+ * has been published, final destruction is deferred until socket teardown.
+ *
+ * The caller must hold a reference on @sock->file for the duration
+ * of this call. Once the request is published to the accept side, a
+ * concurrent completion or cancellation may release the request's pin on
+ * @sock->file; the caller's reference is what keeps @sock->sk valid until
+ * handshake_req_submit() returns.
*/
int handshake_req_submit(struct socket *sock, struct handshake_req *req,
gfp_t flags)
@@ -236,6 +265,14 @@ int handshake_req_submit(struct socket *sock, struct handshake_req *req,
kfree(req);
return -EINVAL;
}
+
+ /*
+ * Pin sock->file for the lifetime of the request so the
+ * accept side does not race a consumer that releases the
+ * socket while a handshake is pending.
+ */
+ req->hr_file = get_file(sock->file);
+
req->hr_odestruct = req->hr_sk->sk_destruct;
req->hr_sk->sk_destruct = handshake_sk_destruct;
@@ -249,7 +286,7 @@ int handshake_req_submit(struct socket *sock, struct handshake_req *req,
if (READ_ONCE(hn->hn_pending) >= hn->hn_pending_max)
goto out_err;
- spin_lock(&hn->hn_lock);
+ spin_lock_bh(&hn->hn_lock);
ret = -EOPNOTSUPP;
if (test_bit(HANDSHAKE_F_NET_DRAINING, &hn->hn_flags))
goto out_unlock;
@@ -258,7 +295,7 @@ int handshake_req_submit(struct socket *sock, struct handshake_req *req,
goto out_unlock;
if (!__add_pending_locked(hn, req))
goto out_unlock;
- spin_unlock(&hn->hn_lock);
+ spin_unlock_bh(&hn->hn_lock);
ret = handshake_genl_notify(net, req->hr_proto, flags);
if (ret) {
@@ -267,35 +304,36 @@ int handshake_req_submit(struct socket *sock, struct handshake_req *req,
goto out_err;
}
- /* Prevent socket release while a handshake request is pending */
- sock_hold(req->hr_sk);
-
trace_handshake_submit(net, req, req->hr_sk);
return 0;
out_unlock:
- spin_unlock(&hn->hn_lock);
+ spin_unlock_bh(&hn->hn_lock);
out_err:
- /* Restore original destructor so socket teardown still runs on failure */
- req->hr_sk->sk_destruct = req->hr_odestruct;
trace_handshake_submit_err(net, req, req->hr_sk, ret);
- handshake_req_destroy(req);
+ if (!test_and_set_bit(HANDSHAKE_F_REQ_COMPLETED, &req->hr_flags)) {
+ /* Restore original destructor so socket teardown still runs. */
+ req->hr_sk->sk_destruct = req->hr_odestruct;
+ fput(req->hr_file);
+ handshake_req_destroy(req);
+ }
return ret;
}
EXPORT_SYMBOL(handshake_req_submit);
-void handshake_complete(struct handshake_req *req, unsigned int status,
+void handshake_complete(struct handshake_req *req, int status,
struct genl_info *info)
{
struct sock *sk = req->hr_sk;
struct net *net = sock_net(sk);
if (!test_and_set_bit(HANDSHAKE_F_REQ_COMPLETED, &req->hr_flags)) {
+ struct file *file = req->hr_file;
+
trace_handshake_complete(net, req, sk, status);
req->hr_proto->hp_done(req, status, info);
- /* Handshake request is no longer pending */
- sock_put(sk);
+ fput(file);
}
}
EXPORT_SYMBOL_IF_KUNIT(handshake_complete);
@@ -342,8 +380,7 @@ bool handshake_req_cancel(struct sock *sk)
out_true:
trace_handshake_cancel(net, req, sk);
- /* Handshake request is no longer pending */
- sock_put(sk);
+ fput(req->hr_file);
return true;
}
EXPORT_SYMBOL(handshake_req_cancel);
diff --git a/net/handshake/tlshd.c b/net/handshake/tlshd.c
index 8f9532a15f43..7567150c2a4f 100644
--- a/net/handshake/tlshd.c
+++ b/net/handshake/tlshd.c
@@ -93,7 +93,7 @@ static void tls_handshake_remote_peerids(struct tls_handshake_req *treq,
*
*/
static void tls_handshake_done(struct handshake_req *req,
- unsigned int status, struct genl_info *info)
+ int status, struct genl_info *info)
{
struct tls_handshake_req *treq = handshake_req_private(req);
@@ -104,7 +104,7 @@ static void tls_handshake_done(struct handshake_req *req,
if (!status)
set_bit(HANDSHAKE_F_REQ_SESSION, &req->hr_flags);
- treq->th_consumer_done(treq->th_consumer_data, -status,
+ treq->th_consumer_done(treq->th_consumer_data, status,
treq->th_peerid[0]);
}
@@ -425,6 +425,8 @@ EXPORT_SYMBOL(tls_server_hello_psk);
* Request cancellation races with request completion. To determine
* who won, callers examine the return value from this function.
*
+ * Context: May be called from process or softirq context.
+ *
* Return values:
* %true - Uncompleted handshake request was canceled
* %false - Handshake request already completed or not found
diff --git a/net/hsr/hsr_forward.c b/net/hsr/hsr_forward.c
index 0aca859c88cb..f669a226d728 100644
--- a/net/hsr/hsr_forward.c
+++ b/net/hsr/hsr_forward.c
@@ -84,7 +84,7 @@ static bool is_supervision_frame(struct hsr_priv *hsr, struct sk_buff *skb)
/* Get next tlv */
total_length += hsr_sup_tag->tlv.HSR_TLV_length;
- if (!pskb_may_pull(skb, total_length))
+ if (!pskb_may_pull(skb, total_length + sizeof(struct hsr_sup_tlv)))
return false;
skb_pull(skb, total_length);
hsr_sup_tlv = (struct hsr_sup_tlv *)skb->data;
@@ -100,7 +100,7 @@ static bool is_supervision_frame(struct hsr_priv *hsr, struct sk_buff *skb)
/* make sure another tlv follows */
total_length += sizeof(struct hsr_sup_tlv) + hsr_sup_tlv->HSR_TLV_length;
- if (!pskb_may_pull(skb, total_length))
+ if (!pskb_may_pull(skb, total_length + sizeof(struct hsr_sup_tlv)))
return false;
/* get next tlv */
diff --git a/net/hsr/hsr_framereg.c b/net/hsr/hsr_framereg.c
index d09875b33588..a28dfd8490c5 100644
--- a/net/hsr/hsr_framereg.c
+++ b/net/hsr/hsr_framereg.c
@@ -35,10 +35,8 @@ bool hsr_addr_is_self(struct hsr_priv *hsr, unsigned char *addr)
rcu_read_lock();
sn = rcu_dereference(hsr->self_node);
- if (!sn) {
- WARN_ONCE(1, "HSR: No self node\n");
+ if (!sn)
goto out;
- }
if (ether_addr_equal(addr, sn->macaddress_A) ||
ether_addr_equal(addr, sn->macaddress_B))
@@ -163,8 +161,8 @@ void hsr_del_nodes(struct list_head *node_db)
struct hsr_node *tmp;
list_for_each_entry_safe(node, tmp, node_db, mac_list) {
- list_del(&node->mac_list);
- hsr_free_node(node);
+ list_del_rcu(&node->mac_list);
+ call_rcu(&node->rcu_head, hsr_free_node_rcu);
}
}
@@ -889,7 +887,10 @@ int hsr_get_node_data(struct hsr_priv *hsr,
if (node->addr_B_port != HSR_PT_NONE) {
port = hsr_port_get_hsr(hsr, node->addr_B_port);
- *addr_b_ifindex = port->dev->ifindex;
+ if (port)
+ *addr_b_ifindex = port->dev->ifindex;
+ else
+ *addr_b_ifindex = -1;
} else {
*addr_b_ifindex = -1;
}
diff --git a/net/ieee802154/6lowpan/tx.c b/net/ieee802154/6lowpan/tx.c
index 0c07662b44c0..4df76ff50699 100644
--- a/net/ieee802154/6lowpan/tx.c
+++ b/net/ieee802154/6lowpan/tx.c
@@ -255,6 +255,11 @@ netdev_tx_t lowpan_xmit(struct sk_buff *skb, struct net_device *ldev)
pr_debug("package xmit\n");
+ if (skb->protocol != htons(ETH_P_IPV6)) {
+ kfree_skb(skb);
+ return NET_XMIT_DROP;
+ }
+
WARN_ON_ONCE(skb->len > IPV6_MIN_MTU);
/* We must take a copy of the skb before we modify/replace the ipv6
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index 5fb812443a08..6fd642d2278d 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -124,9 +124,14 @@ static void ah_output_done(void *data, int err)
struct iphdr *top_iph = ip_hdr(skb);
struct ip_auth_hdr *ah = ip_auth_hdr(skb);
int ihl = ip_hdrlen(skb);
+ int seqhi_len = 0;
+ __be32 *seqhi;
+ if (x->props.flags & XFRM_STATE_ESN)
+ seqhi_len = sizeof(*seqhi);
iph = AH_SKB_CB(skb)->tmp;
- icv = ah_tmp_icv(iph, ihl);
+ seqhi = (__be32 *)((char *)iph + ihl);
+ icv = ah_tmp_icv(seqhi, seqhi_len);
memcpy(ah->auth_data, icv, ahp->icv_trunc_len);
top_iph->tos = iph->tos;
@@ -138,7 +143,7 @@ static void ah_output_done(void *data, int err)
}
kfree(AH_SKB_CB(skb)->tmp);
- xfrm_output_resume(skb->sk, skb, err);
+ xfrm_output_resume(skb_to_full_sk(skb), skb, err);
}
static int ah_output(struct xfrm_state *x, struct sk_buff *skb)
@@ -270,12 +275,17 @@ static void ah_input_done(void *data, int err)
struct ip_auth_hdr *ah = ip_auth_hdr(skb);
int ihl = ip_hdrlen(skb);
int ah_hlen = (ah->hdrlen + 2) << 2;
+ int seqhi_len = 0;
+ __be32 *seqhi;
if (err)
goto out;
+ if (x->props.flags & XFRM_STATE_ESN)
+ seqhi_len = sizeof(*seqhi);
work_iph = AH_SKB_CB(skb)->tmp;
- auth_data = ah_tmp_auth(work_iph, ihl);
+ seqhi = (__be32 *)((char *)work_iph + ihl);
+ auth_data = ah_tmp_auth(seqhi, seqhi_len);
icv = ah_tmp_icv(auth_data, ahp->icv_trunc_len);
err = crypto_memneq(icv, auth_data, ahp->icv_trunc_len) ? -EBADMSG : 0;
diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c
index 008edc7f6688..791e15063237 100644
--- a/net/ipv4/bpf_tcp_ca.c
+++ b/net/ipv4/bpf_tcp_ca.c
@@ -168,7 +168,7 @@ bpf_tcp_ca_get_func_proto(enum bpf_func_id func_id,
*/
if (prog_ops_moff(prog) !=
offsetof(struct tcp_congestion_ops, release))
- return &bpf_sk_setsockopt_proto;
+ return &bpf_sk_setsockopt_nodelay_proto;
return NULL;
case BPF_FUNC_getsockopt:
/* Since get/setsockopt is usually expected to
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 6dfc0bcdef65..513c8215c947 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -419,8 +419,8 @@ int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *
return err;
}
- if (ALIGN(tailen, L1_CACHE_BYTES) > PAGE_SIZE ||
- ALIGN(skb->data_len, L1_CACHE_BYTES) > PAGE_SIZE)
+ if (ALIGN(skb->data_len + tailen, L1_CACHE_BYTES) >
+ PAGE_SIZE)
goto cow;
if (!skb_cloned(skb)) {
@@ -873,7 +873,8 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
nfrags = 1;
goto skip_cow;
- } else if (!skb_has_frag_list(skb)) {
+ } else if (!skb_has_frag_list(skb) &&
+ !skb_has_shared_frag(skb)) {
nfrags = skb_shinfo(skb)->nr_frags;
nfrags++;
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 7eeff658b467..23e921d313b3 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -961,6 +961,9 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info,
if (IS_ERR(rt))
goto out_unlock;
+ if (rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
+ goto ende;
+
/* peer icmp_ratelimit */
if (!icmpv4_xrlim_allow(net, rt, &fl4, type, code, apply_ratelimit))
goto ende;
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index a674fb44ec25..a9ad39064f3b 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -122,16 +122,29 @@
* contradict to specs provided this delay is small enough.
*/
-#define IGMP_V1_SEEN(in_dev) \
- (IPV4_DEVCONF_ALL_RO(dev_net(in_dev->dev), FORCE_IGMP_VERSION) == 1 || \
- IN_DEV_CONF_GET((in_dev), FORCE_IGMP_VERSION) == 1 || \
- ((in_dev)->mr_v1_seen && \
- time_before(jiffies, (in_dev)->mr_v1_seen)))
-#define IGMP_V2_SEEN(in_dev) \
- (IPV4_DEVCONF_ALL_RO(dev_net(in_dev->dev), FORCE_IGMP_VERSION) == 2 || \
- IN_DEV_CONF_GET((in_dev), FORCE_IGMP_VERSION) == 2 || \
- ((in_dev)->mr_v2_seen && \
- time_before(jiffies, (in_dev)->mr_v2_seen)))
+static bool IGMP_V1_SEEN(const struct in_device *in_dev)
+{
+ unsigned long seen;
+
+ if (IPV4_DEVCONF_ALL_RO(dev_net(in_dev->dev), FORCE_IGMP_VERSION) == 1)
+ return true;
+ if (IN_DEV_CONF_GET((in_dev), FORCE_IGMP_VERSION) == 1)
+ return true;
+ seen = READ_ONCE(in_dev->mr_v1_seen);
+ return seen && time_before(jiffies, seen);
+}
+
+static bool IGMP_V2_SEEN(const struct in_device *in_dev)
+{
+ unsigned long seen;
+
+ if (IPV4_DEVCONF_ALL_RO(dev_net(in_dev->dev), FORCE_IGMP_VERSION) == 2)
+ return true;
+ if (IN_DEV_CONF_GET((in_dev), FORCE_IGMP_VERSION) == 2)
+ return true;
+ seen = READ_ONCE(in_dev->mr_v2_seen);
+ return seen && time_before(jiffies, seen);
+}
static int unsolicited_report_interval(struct in_device *in_dev)
{
@@ -954,23 +967,21 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
int max_delay;
int mark = 0;
struct net *net = dev_net(in_dev->dev);
-
+ unsigned long seen;
if (len == 8) {
+ seen = jiffies + READ_ONCE(in_dev->mr_qrv) * READ_ONCE(in_dev->mr_qi) +
+ READ_ONCE(in_dev->mr_qri);
if (ih->code == 0) {
/* Alas, old v1 router presents here. */
max_delay = IGMP_QUERY_RESPONSE_INTERVAL;
- in_dev->mr_v1_seen = jiffies +
- (in_dev->mr_qrv * in_dev->mr_qi) +
- in_dev->mr_qri;
+ WRITE_ONCE(in_dev->mr_v1_seen, seen);
group = 0;
} else {
/* v2 router present */
max_delay = ih->code*(HZ/IGMP_TIMER_SCALE);
- in_dev->mr_v2_seen = jiffies +
- (in_dev->mr_qrv * in_dev->mr_qi) +
- in_dev->mr_qri;
+ WRITE_ONCE(in_dev->mr_v2_seen, seen);
}
/* cancel the interface change timer */
WRITE_ONCE(in_dev->mr_ifc_count, 0);
@@ -995,6 +1006,8 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
if (!max_delay)
max_delay = 1; /* can't mod w/ 0 */
} else { /* v3 */
+ unsigned long mr_qi;
+
if (!pskb_may_pull(skb, sizeof(struct igmpv3_query)))
return true;
@@ -1015,15 +1028,16 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
* received value was zero, use the default or statically
* configured value.
*/
- in_dev->mr_qrv = ih3->qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv);
- in_dev->mr_qi = IGMPV3_QQIC(ih3->qqic)*HZ ?: IGMP_QUERY_INTERVAL;
-
+ WRITE_ONCE(in_dev->mr_qrv,
+ ih3->qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv));
+ mr_qi = IGMPV3_QQIC(ih3->qqic)*HZ ?: IGMP_QUERY_INTERVAL;
+ WRITE_ONCE(in_dev->mr_qi, mr_qi);
/* RFC3376, 8.3. Query Response Interval:
* The number of seconds represented by the [Query Response
* Interval] must be less than the [Query Interval].
*/
- if (in_dev->mr_qri >= in_dev->mr_qi)
- in_dev->mr_qri = (in_dev->mr_qi/HZ - 1)*HZ;
+ if (READ_ONCE(in_dev->mr_qri) >= mr_qi)
+ WRITE_ONCE(in_dev->mr_qri, (mr_qi/HZ - 1) * HZ);
if (!group) { /* general query */
if (ih3->nsrcs)
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 928654c34156..5b934ce8d98a 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -1108,7 +1108,7 @@ static void reqsk_timer_handler(struct timer_list *t)
if (!inet_ehash_insert(req_to_sk(nreq), req_to_sk(oreq), NULL)) {
/* delete timer */
- __inet_csk_reqsk_queue_drop(sk_listener, nreq, true);
+ __inet_csk_reqsk_queue_drop(sk_listener, nreq, false);
goto no_ownership;
}
@@ -1134,7 +1134,7 @@ no_ownership:
}
drop:
- __inet_csk_reqsk_queue_drop(sk_listener, oreq, true);
+ __inet_csk_reqsk_queue_drop(oreq->rsk_listener, oreq, true);
reqsk_put(oreq);
}
@@ -1148,6 +1148,9 @@ static bool reqsk_queue_hash_req(struct request_sock *req)
/* The timer needs to be setup after a successful insertion. */
req->timeout = tcp_timeout_init((struct sock *)req);
timer_setup(&req->rsk_timer, reqsk_timer_handler, TIMER_PINNED);
+
+ preempt_disable_nested();
+
mod_timer(&req->rsk_timer, jiffies + req->timeout);
/* before letting lookups find us, make sure all req fields
@@ -1155,6 +1158,9 @@ static bool reqsk_queue_hash_req(struct request_sock *req)
*/
smp_wmb();
refcount_set(&req->rsk_refcnt, 2 + 1);
+
+ preempt_enable_nested();
+
return true;
}
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index d8083b9033c2..5b957a831e7c 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -179,7 +179,8 @@ struct inet_peer *inet_getpeer(struct inet_peer_base *base,
seq = read_seqbegin(&base->lock);
p = lookup(daddr, base, seq, NULL, &gc_cnt, &parent, &pp);
- if (p)
+ /* Make sure tree was not modified during our lookup. */
+ if (p && !read_seqretry(&base->lock, seq))
return p;
/* retry an exact lookup, taking the lock before.
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index be8815ce3ac2..09d745112c15 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -530,6 +530,10 @@ int ip_options_get(struct net *net, struct ip_options_rcu **optp,
kfree(opt);
return -EINVAL;
}
+ if (opt->opt.srr && !ns_capable(net->user_ns, CAP_NET_RAW)) {
+ kfree(opt);
+ return -EPERM;
+ }
kfree(*optp);
*optp = opt;
return 0;
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index e4790cc7b5c2..5bcd73cbdb41 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1233,6 +1233,8 @@ alloc_new_skb:
if (err < 0)
goto error;
copy = err;
+ if (!(flags & MSG_NO_SHARED_FRAGS))
+ skb_shinfo(skb)->flags |= SKBFL_SHARED_FRAG;
wmem_alloc_delta += copy;
} else if (!zc) {
int i = skb_shinfo(skb)->nr_frags;
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index 2667f53482bd..d3c677e9bff2 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -212,7 +212,7 @@ EXPORT_SYMBOL_GPL(iptunnel_handle_offloads);
*/
static int iptunnel_pmtud_build_icmp(struct sk_buff *skb, int mtu)
{
- const struct iphdr *iph = ip_hdr(skb);
+ const struct iphdr *iph;
struct icmphdr *icmph;
struct iphdr *niph;
struct ethhdr eh;
@@ -226,7 +226,6 @@ static int iptunnel_pmtud_build_icmp(struct sk_buff *skb, int mtu)
skb_copy_bits(skb, skb_mac_offset(skb), &eh, ETH_HLEN);
pskb_pull(skb, ETH_HLEN);
- skb_reset_network_header(skb);
err = pskb_trim(skb, 576 - sizeof(*niph) - sizeof(*icmph));
if (err)
@@ -236,7 +235,7 @@ static int iptunnel_pmtud_build_icmp(struct sk_buff *skb, int mtu)
err = skb_cow(skb, sizeof(*niph) + sizeof(*icmph) + ETH_HLEN);
if (err)
return err;
-
+ iph = ip_hdr(skb);
icmph = skb_push(skb, sizeof(*icmph));
*icmph = (struct icmphdr) {
.type = ICMP_DEST_UNREACH,
@@ -281,7 +280,6 @@ static int iptunnel_pmtud_build_icmp(struct sk_buff *skb, int mtu)
*/
static int iptunnel_pmtud_check_icmp(struct sk_buff *skb, int mtu)
{
- const struct icmphdr *icmph = icmp_hdr(skb);
const struct iphdr *iph = ip_hdr(skb);
if (mtu < 576 || iph->frag_off != htons(IP_DF))
@@ -292,9 +290,17 @@ static int iptunnel_pmtud_check_icmp(struct sk_buff *skb, int mtu)
ipv4_is_lbcast(iph->saddr) || ipv4_is_multicast(iph->saddr))
return 0;
- if (iph->protocol == IPPROTO_ICMP && icmp_is_err(icmph->type))
- return 0;
+ if (iph->protocol == IPPROTO_ICMP) {
+ const struct icmphdr *icmph;
+ if (!pskb_network_may_pull(skb, iph->ihl * 4 +
+ offsetofend(struct icmphdr, type)))
+ return 0;
+ iph = ip_hdr(skb);
+ icmph = (void *)iph + iph->ihl * 4;
+ if (icmp_is_err(icmph->type))
+ return 0;
+ }
return iptunnel_pmtud_build_icmp(skb, mtu);
}
@@ -308,7 +314,7 @@ static int iptunnel_pmtud_check_icmp(struct sk_buff *skb, int mtu)
*/
static int iptunnel_pmtud_build_icmpv6(struct sk_buff *skb, int mtu)
{
- const struct ipv6hdr *ip6h = ipv6_hdr(skb);
+ const struct ipv6hdr *ip6h;
struct icmp6hdr *icmp6h;
struct ipv6hdr *nip6h;
struct ethhdr eh;
@@ -323,7 +329,6 @@ static int iptunnel_pmtud_build_icmpv6(struct sk_buff *skb, int mtu)
skb_copy_bits(skb, skb_mac_offset(skb), &eh, ETH_HLEN);
pskb_pull(skb, ETH_HLEN);
- skb_reset_network_header(skb);
err = pskb_trim(skb, IPV6_MIN_MTU - sizeof(*nip6h) - sizeof(*icmp6h));
if (err)
@@ -334,6 +339,7 @@ static int iptunnel_pmtud_build_icmpv6(struct sk_buff *skb, int mtu)
if (err)
return err;
+ ip6h = ipv6_hdr(skb);
icmp6h = skb_push(skb, sizeof(*icmp6h));
*icmp6h = (struct icmp6hdr) {
.icmp6_type = ICMPV6_PKT_TOOBIG,
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 2058ca860294..2628cd3a93a6 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -537,15 +537,16 @@ static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
};
int err;
+ rcu_read_lock();
err = ipmr_fib_lookup(net, &fl4, &mrt);
if (err < 0) {
+ rcu_read_unlock();
kfree_skb(skb);
return err;
}
DEV_STATS_ADD(dev, tx_bytes, skb->len);
DEV_STATS_INC(dev, tx_packets);
- rcu_read_lock();
/* Pairs with WRITE_ONCE() in vif_add() and vif_delete() */
ipmr_cache_report(mrt, skb, READ_ONCE(mrt->mroute_reg_vif_num),
@@ -1112,11 +1113,12 @@ static int ipmr_cache_report(const struct mr_table *mrt,
msg->im_vif_hi = vifi >> 8;
ipv4_pktinfo_prepare(mroute_sk, pkt, false);
memcpy(skb->cb, pkt->cb, sizeof(skb->cb));
- /* Add our header */
- igmp = skb_put(skb, sizeof(struct igmphdr));
+ /* Add our header.
+ * Note that code, csum and group fields are cleared.
+ */
+ igmp = skb_put_zero(skb, sizeof(struct igmphdr));
igmp->type = assert;
msg->im_msgtype = assert;
- igmp->code = 0;
ip_hdr(skb)->tot_len = htons(skb->len); /* Fix the length */
skb->transport_header = skb->network_header;
}
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 97ead883e4a1..ad2259678c78 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -1501,13 +1501,11 @@ static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len
static void __arpt_unregister_table(struct net *net, struct xt_table *table)
{
- struct xt_table_info *private;
- void *loc_cpu_entry;
+ struct xt_table_info *private = table->private;
struct module *table_owner = table->me;
+ void *loc_cpu_entry;
struct arpt_entry *iter;
- private = xt_unregister_table(table);
-
/* Decrease module usage counts and free resources */
loc_cpu_entry = private->entries;
xt_entry_foreach(iter, loc_cpu_entry, private->size)
@@ -1515,6 +1513,7 @@ static void __arpt_unregister_table(struct net *net, struct xt_table *table)
if (private->number > private->initial_entries)
module_put(table_owner);
xt_free_table_info(private);
+ kfree(table);
}
int arpt_register_table(struct net *net,
@@ -1522,13 +1521,11 @@ int arpt_register_table(struct net *net,
const struct arpt_replace *repl,
const struct nf_hook_ops *template_ops)
{
- struct nf_hook_ops *ops;
- unsigned int num_ops;
- int ret, i;
- struct xt_table_info *newinfo;
struct xt_table_info bootstrap = {0};
- void *loc_cpu_entry;
+ struct xt_table_info *newinfo;
struct xt_table *new_table;
+ void *loc_cpu_entry;
+ int ret;
newinfo = xt_alloc_table_info(repl->size);
if (!newinfo)
@@ -1543,7 +1540,7 @@ int arpt_register_table(struct net *net,
return ret;
}
- new_table = xt_register_table(net, table, &bootstrap, newinfo);
+ new_table = xt_register_table(net, table, template_ops, &bootstrap, newinfo);
if (IS_ERR(new_table)) {
struct arpt_entry *iter;
@@ -1553,46 +1550,12 @@ int arpt_register_table(struct net *net,
return PTR_ERR(new_table);
}
- num_ops = hweight32(table->valid_hooks);
- if (num_ops == 0) {
- ret = -EINVAL;
- goto out_free;
- }
-
- ops = kmemdup_array(template_ops, num_ops, sizeof(*ops), GFP_KERNEL);
- if (!ops) {
- ret = -ENOMEM;
- goto out_free;
- }
-
- for (i = 0; i < num_ops; i++)
- ops[i].priv = new_table;
-
- new_table->ops = ops;
-
- ret = nf_register_net_hooks(net, ops, num_ops);
- if (ret != 0)
- goto out_free;
-
return ret;
-
-out_free:
- __arpt_unregister_table(net, new_table);
- return ret;
-}
-
-void arpt_unregister_table_pre_exit(struct net *net, const char *name)
-{
- struct xt_table *table = xt_find_table(net, NFPROTO_ARP, name);
-
- if (table)
- nf_unregister_net_hooks(net, table->ops, hweight32(table->valid_hooks));
}
-EXPORT_SYMBOL(arpt_unregister_table_pre_exit);
void arpt_unregister_table(struct net *net, const char *name)
{
- struct xt_table *table = xt_find_table(net, NFPROTO_ARP, name);
+ struct xt_table *table = xt_unregister_table_exit(net, NFPROTO_ARP, name);
if (table)
__arpt_unregister_table(net, table);
diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c
index 78cd5ee24448..370b635e3523 100644
--- a/net/ipv4/netfilter/arptable_filter.c
+++ b/net/ipv4/netfilter/arptable_filter.c
@@ -43,7 +43,7 @@ static int arptable_filter_table_init(struct net *net)
static void __net_exit arptable_filter_net_pre_exit(struct net *net)
{
- arpt_unregister_table_pre_exit(net, "filter");
+ xt_unregister_table_pre_exit(net, NFPROTO_ARP, "filter");
}
static void __net_exit arptable_filter_net_exit(struct net *net)
@@ -58,32 +58,33 @@ static struct pernet_operations arptable_filter_net_ops = {
static int __init arptable_filter_init(void)
{
- int ret = xt_register_template(&packet_filter,
- arptable_filter_table_init);
-
- if (ret < 0)
- return ret;
+ int ret;
arpfilter_ops = xt_hook_ops_alloc(&packet_filter, arpt_do_table);
- if (IS_ERR(arpfilter_ops)) {
- xt_unregister_template(&packet_filter);
+ if (IS_ERR(arpfilter_ops))
return PTR_ERR(arpfilter_ops);
- }
ret = register_pernet_subsys(&arptable_filter_net_ops);
+ if (ret < 0)
+ goto err_free;
+
+ ret = xt_register_template(&packet_filter,
+ arptable_filter_table_init);
if (ret < 0) {
- xt_unregister_template(&packet_filter);
- kfree(arpfilter_ops);
- return ret;
+ unregister_pernet_subsys(&arptable_filter_net_ops);
+ goto err_free;
}
+ return 0;
+err_free:
+ kfree(arpfilter_ops);
return ret;
}
static void __exit arptable_filter_fini(void)
{
- unregister_pernet_subsys(&arptable_filter_net_ops);
xt_unregister_template(&packet_filter);
+ unregister_pernet_subsys(&arptable_filter_net_ops);
kfree(arpfilter_ops);
}
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 23c8deff8095..5cbdb0815857 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -1704,12 +1704,10 @@ do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
static void __ipt_unregister_table(struct net *net, struct xt_table *table)
{
- struct xt_table_info *private;
- void *loc_cpu_entry;
+ struct xt_table_info *private = table->private;
struct module *table_owner = table->me;
struct ipt_entry *iter;
-
- private = xt_unregister_table(table);
+ void *loc_cpu_entry;
/* Decrease module usage counts and free resources */
loc_cpu_entry = private->entries;
@@ -1718,19 +1716,18 @@ static void __ipt_unregister_table(struct net *net, struct xt_table *table)
if (private->number > private->initial_entries)
module_put(table_owner);
xt_free_table_info(private);
+ kfree(table);
}
int ipt_register_table(struct net *net, const struct xt_table *table,
const struct ipt_replace *repl,
const struct nf_hook_ops *template_ops)
{
- struct nf_hook_ops *ops;
- unsigned int num_ops;
- int ret, i;
- struct xt_table_info *newinfo;
struct xt_table_info bootstrap = {0};
- void *loc_cpu_entry;
+ struct xt_table_info *newinfo;
struct xt_table *new_table;
+ void *loc_cpu_entry;
+ int ret;
newinfo = xt_alloc_table_info(repl->size);
if (!newinfo)
@@ -1745,7 +1742,7 @@ int ipt_register_table(struct net *net, const struct xt_table *table,
return ret;
}
- new_table = xt_register_table(net, table, &bootstrap, newinfo);
+ new_table = xt_register_table(net, table, template_ops, &bootstrap, newinfo);
if (IS_ERR(new_table)) {
struct ipt_entry *iter;
@@ -1755,51 +1752,12 @@ int ipt_register_table(struct net *net, const struct xt_table *table,
return PTR_ERR(new_table);
}
- /* No template? No need to do anything. This is used by 'nat' table, it registers
- * with the nat core instead of the netfilter core.
- */
- if (!template_ops)
- return 0;
-
- num_ops = hweight32(table->valid_hooks);
- if (num_ops == 0) {
- ret = -EINVAL;
- goto out_free;
- }
-
- ops = kmemdup_array(template_ops, num_ops, sizeof(*ops), GFP_KERNEL);
- if (!ops) {
- ret = -ENOMEM;
- goto out_free;
- }
-
- for (i = 0; i < num_ops; i++)
- ops[i].priv = new_table;
-
- new_table->ops = ops;
-
- ret = nf_register_net_hooks(net, ops, num_ops);
- if (ret != 0)
- goto out_free;
-
return ret;
-
-out_free:
- __ipt_unregister_table(net, new_table);
- return ret;
-}
-
-void ipt_unregister_table_pre_exit(struct net *net, const char *name)
-{
- struct xt_table *table = xt_find_table(net, NFPROTO_IPV4, name);
-
- if (table)
- nf_unregister_net_hooks(net, table->ops, hweight32(table->valid_hooks));
}
void ipt_unregister_table_exit(struct net *net, const char *name)
{
- struct xt_table *table = xt_find_table(net, NFPROTO_IPV4, name);
+ struct xt_table *table = xt_unregister_table_exit(net, NFPROTO_IPV4, name);
if (table)
__ipt_unregister_table(net, table);
@@ -1887,7 +1845,6 @@ static void __exit ip_tables_fini(void)
}
EXPORT_SYMBOL(ipt_register_table);
-EXPORT_SYMBOL(ipt_unregister_table_pre_exit);
EXPORT_SYMBOL(ipt_unregister_table_exit);
EXPORT_SYMBOL(ipt_do_table);
module_init(ip_tables_init);
diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c
index 3ab908b74795..672d7da1071d 100644
--- a/net/ipv4/netfilter/iptable_filter.c
+++ b/net/ipv4/netfilter/iptable_filter.c
@@ -61,7 +61,7 @@ static int __net_init iptable_filter_net_init(struct net *net)
static void __net_exit iptable_filter_net_pre_exit(struct net *net)
{
- ipt_unregister_table_pre_exit(net, "filter");
+ xt_unregister_table_pre_exit(net, NFPROTO_IPV4, "filter");
}
static void __net_exit iptable_filter_net_exit(struct net *net)
@@ -77,32 +77,33 @@ static struct pernet_operations iptable_filter_net_ops = {
static int __init iptable_filter_init(void)
{
- int ret = xt_register_template(&packet_filter,
- iptable_filter_table_init);
-
- if (ret < 0)
- return ret;
+ int ret;
filter_ops = xt_hook_ops_alloc(&packet_filter, ipt_do_table);
- if (IS_ERR(filter_ops)) {
- xt_unregister_template(&packet_filter);
+ if (IS_ERR(filter_ops))
return PTR_ERR(filter_ops);
- }
ret = register_pernet_subsys(&iptable_filter_net_ops);
+ if (ret < 0)
+ goto err_free;
+
+ ret = xt_register_template(&packet_filter,
+ iptable_filter_table_init);
if (ret < 0) {
- xt_unregister_template(&packet_filter);
- kfree(filter_ops);
- return ret;
+ unregister_pernet_subsys(&iptable_filter_net_ops);
+ goto err_free;
}
return 0;
+err_free:
+ kfree(filter_ops);
+ return ret;
}
static void __exit iptable_filter_fini(void)
{
- unregister_pernet_subsys(&iptable_filter_net_ops);
xt_unregister_template(&packet_filter);
+ unregister_pernet_subsys(&iptable_filter_net_ops);
kfree(filter_ops);
}
diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c
index 385d945d8ebe..13d25d9a4610 100644
--- a/net/ipv4/netfilter/iptable_mangle.c
+++ b/net/ipv4/netfilter/iptable_mangle.c
@@ -96,7 +96,7 @@ static int iptable_mangle_table_init(struct net *net)
static void __net_exit iptable_mangle_net_pre_exit(struct net *net)
{
- ipt_unregister_table_pre_exit(net, "mangle");
+ xt_unregister_table_pre_exit(net, NFPROTO_IPV4, "mangle");
}
static void __net_exit iptable_mangle_net_exit(struct net *net)
@@ -111,32 +111,33 @@ static struct pernet_operations iptable_mangle_net_ops = {
static int __init iptable_mangle_init(void)
{
- int ret = xt_register_template(&packet_mangler,
- iptable_mangle_table_init);
- if (ret < 0)
- return ret;
+ int ret;
mangle_ops = xt_hook_ops_alloc(&packet_mangler, iptable_mangle_hook);
- if (IS_ERR(mangle_ops)) {
- xt_unregister_template(&packet_mangler);
- ret = PTR_ERR(mangle_ops);
- return ret;
- }
+ if (IS_ERR(mangle_ops))
+ return PTR_ERR(mangle_ops);
ret = register_pernet_subsys(&iptable_mangle_net_ops);
+ if (ret < 0)
+ goto err_free;
+
+ ret = xt_register_template(&packet_mangler,
+ iptable_mangle_table_init);
if (ret < 0) {
- xt_unregister_template(&packet_mangler);
- kfree(mangle_ops);
- return ret;
+ unregister_pernet_subsys(&iptable_mangle_net_ops);
+ goto err_free;
}
+ return 0;
+err_free:
+ kfree(mangle_ops);
return ret;
}
static void __exit iptable_mangle_fini(void)
{
- unregister_pernet_subsys(&iptable_mangle_net_ops);
xt_unregister_template(&packet_mangler);
+ unregister_pernet_subsys(&iptable_mangle_net_ops);
kfree(mangle_ops);
}
diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c
index 625a1ca13b1b..a0df72554025 100644
--- a/net/ipv4/netfilter/iptable_nat.c
+++ b/net/ipv4/netfilter/iptable_nat.c
@@ -119,8 +119,11 @@ static int iptable_nat_table_init(struct net *net)
}
ret = ipt_nat_register_lookups(net);
- if (ret < 0)
+ if (ret < 0) {
+ xt_unregister_table_pre_exit(net, NFPROTO_IPV4, "nat");
+ synchronize_rcu();
ipt_unregister_table_exit(net, "nat");
+ }
kfree(repl);
return ret;
@@ -129,6 +132,7 @@ static int iptable_nat_table_init(struct net *net)
static void __net_exit iptable_nat_net_pre_exit(struct net *net)
{
ipt_nat_unregister_lookups(net);
+ xt_unregister_table_pre_exit(net, NFPROTO_IPV4, "nat");
}
static void __net_exit iptable_nat_net_exit(struct net *net)
diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c
index 0e7f53964d0a..2745c22f4034 100644
--- a/net/ipv4/netfilter/iptable_raw.c
+++ b/net/ipv4/netfilter/iptable_raw.c
@@ -53,7 +53,7 @@ static int iptable_raw_table_init(struct net *net)
static void __net_exit iptable_raw_net_pre_exit(struct net *net)
{
- ipt_unregister_table_pre_exit(net, "raw");
+ xt_unregister_table_pre_exit(net, NFPROTO_IPV4, "raw");
}
static void __net_exit iptable_raw_net_exit(struct net *net)
@@ -77,32 +77,32 @@ static int __init iptable_raw_init(void)
pr_info("Enabling raw table before defrag\n");
}
- ret = xt_register_template(table,
- iptable_raw_table_init);
- if (ret < 0)
- return ret;
-
rawtable_ops = xt_hook_ops_alloc(table, ipt_do_table);
- if (IS_ERR(rawtable_ops)) {
- xt_unregister_template(table);
+ if (IS_ERR(rawtable_ops))
return PTR_ERR(rawtable_ops);
- }
ret = register_pernet_subsys(&iptable_raw_net_ops);
+ if (ret < 0)
+ goto err_free;
+
+ ret = xt_register_template(table,
+ iptable_raw_table_init);
if (ret < 0) {
- xt_unregister_template(table);
- kfree(rawtable_ops);
- return ret;
+ unregister_pernet_subsys(&iptable_raw_net_ops);
+ goto err_free;
}
+ return 0;
+err_free:
+ kfree(rawtable_ops);
return ret;
}
static void __exit iptable_raw_fini(void)
{
+ xt_unregister_template(&packet_raw);
unregister_pernet_subsys(&iptable_raw_net_ops);
kfree(rawtable_ops);
- xt_unregister_template(&packet_raw);
}
module_init(iptable_raw_init);
diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c
index d885443cb267..491894511c54 100644
--- a/net/ipv4/netfilter/iptable_security.c
+++ b/net/ipv4/netfilter/iptable_security.c
@@ -50,7 +50,7 @@ static int iptable_security_table_init(struct net *net)
static void __net_exit iptable_security_net_pre_exit(struct net *net)
{
- ipt_unregister_table_pre_exit(net, "security");
+ xt_unregister_table_pre_exit(net, NFPROTO_IPV4, "security");
}
static void __net_exit iptable_security_net_exit(struct net *net)
@@ -65,33 +65,34 @@ static struct pernet_operations iptable_security_net_ops = {
static int __init iptable_security_init(void)
{
- int ret = xt_register_template(&security_table,
- iptable_security_table_init);
-
- if (ret < 0)
- return ret;
+ int ret;
sectbl_ops = xt_hook_ops_alloc(&security_table, ipt_do_table);
- if (IS_ERR(sectbl_ops)) {
- xt_unregister_template(&security_table);
+ if (IS_ERR(sectbl_ops))
return PTR_ERR(sectbl_ops);
- }
ret = register_pernet_subsys(&iptable_security_net_ops);
+ if (ret < 0)
+ goto err_free;
+
+ ret = xt_register_template(&security_table,
+ iptable_security_table_init);
if (ret < 0) {
- xt_unregister_template(&security_table);
- kfree(sectbl_ops);
- return ret;
+ unregister_pernet_subsys(&iptable_security_net_ops);
+ goto err_free;
}
+ return 0;
+err_free:
+ kfree(sectbl_ops);
return ret;
}
static void __exit iptable_security_fini(void)
{
+ xt_unregister_template(&security_table);
unregister_pernet_subsys(&iptable_security_net_ops);
kfree(sectbl_ops);
- xt_unregister_template(&security_table);
}
module_init(iptable_security_init);
diff --git a/net/ipv4/netfilter/nf_socket_ipv4.c b/net/ipv4/netfilter/nf_socket_ipv4.c
index 5080fa5fbf6a..f9c6755f5ec5 100644
--- a/net/ipv4/netfilter/nf_socket_ipv4.c
+++ b/net/ipv4/netfilter/nf_socket_ipv4.c
@@ -94,6 +94,9 @@ struct sock *nf_sk_lookup_slow_v4(struct net *net, const struct sk_buff *skb,
#endif
int doff = 0;
+ if (ntohs(iph->frag_off) & IP_OFFSET)
+ return NULL;
+
if (iph->protocol == IPPROTO_UDP || iph->protocol == IPPROTO_TCP) {
struct tcphdr _hdr;
struct udphdr *hp;
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 5aaf9c62c8e1..68e88cb3e55c 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -391,7 +391,7 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4,
* in, reject the frame as invalid
*/
err = -EINVAL;
- if (iphlen > length)
+ if (iphlen > length || iphlen < sizeof(*iph))
goto error_free;
if (iphlen >= sizeof(*iph)) {
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index bc1296f0ea69..3d62d45d84bd 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1272,7 +1272,7 @@ static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
__func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
skb->dev ? skb->dev->name : "?");
kfree_skb(skb);
- WARN_ON(1);
+ WARN_ON_ONCE(1);
return 0;
}
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index d8bdb1bdbff1..c0e85cc171ae 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -1705,10 +1705,10 @@ static __net_exit void ipv4_sysctl_exit_net(struct net *net)
{
const struct ctl_table *table;
- kfree(net->ipv4.sysctl_local_reserved_ports);
table = net->ipv4.ipv4_hdr->ctl_table_arg;
unregister_net_sysctl_table(net->ipv4.ipv4_hdr);
kfree(table);
+ kfree(net->ipv4.sysctl_local_reserved_ports);
}
static __net_initdata struct pernet_operations ipv4_sysctl_ops = {
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 432fa28e47d4..389a7cc17110 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -299,9 +299,6 @@ enum {
DEFINE_PER_CPU(unsigned int, tcp_orphan_count);
EXPORT_PER_CPU_SYMBOL_GPL(tcp_orphan_count);
-DEFINE_PER_CPU(u32, tcp_tw_isn);
-EXPORT_PER_CPU_SYMBOL_GPL(tcp_tw_isn);
-
long sysctl_tcp_mem[3] __read_mostly;
DEFINE_PER_CPU(int, tcp_memory_per_cpu_fw_alloc);
diff --git a/net/ipv4/tcp_ao.c b/net/ipv4/tcp_ao.c
index a97cdf3e6af4..0a4b38b315fe 100644
--- a/net/ipv4/tcp_ao.c
+++ b/net/ipv4/tcp_ao.c
@@ -116,7 +116,8 @@ struct tcp_ao_key *tcp_ao_established_key(const struct sock *sk,
{
struct tcp_ao_key *key;
- hlist_for_each_entry_rcu(key, &ao->head, node, lockdep_sock_is_held(sk)) {
+ hlist_for_each_entry_rcu(key, &ao->head, node,
+ sk_fullsock(sk) && lockdep_sock_is_held(sk)) {
if ((sndid >= 0 && key->sndid != sndid) ||
(rcvid >= 0 && key->rcvid != rcvid))
continue;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index d5c9e65d9760..de9f68a9c0cf 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -7589,6 +7589,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
struct sock *sk, struct sk_buff *skb)
{
struct tcp_fastopen_cookie foc = { .len = -1 };
+ u32 isn = TCP_SKB_CB(skb)->tcp_tw_isn;
struct tcp_options_received tmp_opt;
const struct tcp_sock *tp = tcp_sk(sk);
struct net *net = sock_net(sk);
@@ -7599,20 +7600,16 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
struct dst_entry *dst;
struct flowi fl;
u8 syncookies;
- u32 isn;
#ifdef CONFIG_TCP_AO
const struct tcp_ao_hdr *aoh;
#endif
- isn = __this_cpu_read(tcp_tw_isn);
- if (isn) {
- /* TW buckets are converted to open requests without
- * limitations, they conserve resources and peer is
- * evidently real one.
- */
- __this_cpu_write(tcp_tw_isn, 0);
- } else {
+ /* If isn is non-zero, this SYN originally matched a TIME_WAIT socket.
+ * TW sockets are converted to open requests without limitations,
+ * we skip the queue limits and syncookie checks in the block below.
+ */
+ if (!isn) {
syncookies = READ_ONCE(net->ipv4.sysctl_tcp_syncookies);
if (syncookies == 2 || inet_csk_reqsk_queue_is_full(sk)) {
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 8fc24c3743c5..fdc81150ff6c 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1827,7 +1827,6 @@ INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
enum skb_drop_reason reason;
- struct sock *rsk;
reason = psp_sk_rx_policy_check(sk, skb);
if (reason)
@@ -1863,24 +1862,21 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
return 0;
if (nsk != sk) {
reason = tcp_child_process(sk, nsk, skb);
- if (reason) {
- rsk = nsk;
+ sock_put(nsk);
+ if (reason)
goto reset;
- }
return 0;
}
} else
sock_rps_save_rxhash(sk, skb);
reason = tcp_rcv_state_process(sk, skb);
- if (reason) {
- rsk = sk;
+ if (reason)
goto reset;
- }
return 0;
reset:
- tcp_v4_send_reset(rsk, skb, sk_rst_convert_drop_reason(reason));
+ tcp_v4_send_reset(sk, skb, sk_rst_convert_drop_reason(reason));
discard:
sk_skb_reason_drop(sk, skb, reason);
/* Be careful here. If this function gets more complicated and
@@ -2193,13 +2189,16 @@ lookup:
rst_reason = sk_rst_convert_drop_reason(drop_reason);
tcp_v4_send_reset(nsk, skb, rst_reason);
+ sock_put(nsk);
goto discard_and_relse;
}
+ sock_put(nsk);
sock_put(sk);
return 0;
}
}
+ isn = 0;
process:
if (static_branch_unlikely(&ip4_min_ttl)) {
/* min_ttl can be changed concurrently from do_ip_setsockopt() */
@@ -2229,6 +2228,7 @@ process:
th = (const struct tcphdr *)skb->data;
iph = ip_hdr(skb);
tcp_v4_fill_cb(skb, iph, th);
+ TCP_SKB_CB(skb)->tcp_tw_isn = isn;
skb->dev = NULL;
@@ -2315,7 +2315,6 @@ do_time_wait:
sk = sk2;
tcp_v4_restore_cb(skb);
refcounted = false;
- __this_cpu_write(tcp_tw_isn, isn);
goto process;
}
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 199f0b579e89..e6092c3ac840 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -1012,6 +1012,6 @@ enum skb_drop_reason tcp_child_process(struct sock *parent, struct sock *child,
}
bh_unlock_sock(child);
- sock_put(child);
+
return reason;
}
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index f9d8755705f7..6e4bb411dc04 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2626,6 +2626,7 @@ static int tcp_clone_payload(struct sock *sk, struct sk_buff *to,
todo = min_t(int, skb_frag_size(fragfrom),
probe_size - len);
len += todo;
+ skb_shinfo(to)->flags |= skb_shinfo(skb)->flags & SKBFL_SHARED_FRAG;
if (lastfrag &&
skb_frag_page(fragfrom) == skb_frag_page(lastfrag) &&
skb_frag_off(fragfrom) == skb_frag_off(lastfrag) +
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 0ac2bf4f8759..70f6cbd4ef73 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -2011,6 +2011,14 @@ try_again:
}
WARN_ON_ONCE(!skb_set_owner_sk_safe(skb, sk));
+
+ /*
+ * skb->dev still aliases the UDP rx dev_scratch (its charge was freed
+ * on dequeue above); a sockmap verdict program may deref it via
+ * bpf_sk_lookup_*(), so clear it -> bpf_skc_lookup() uses skb->sk
+ */
+ skb->dev = NULL;
+
return recv_actor(sk, skb);
}
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index a0813d425b71..29651b1a0bc7 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -482,11 +482,11 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
struct sock *sk = gso_skb->sk;
unsigned int sum_truesize = 0;
struct sk_buff *segs, *seg;
- __be16 newlen, msslen;
struct udphdr *uh;
unsigned int mss;
bool copy_dtor;
__sum16 check;
+ __be16 newlen;
int ret = 0;
mss = skb_shinfo(gso_skb)->gso_size;
@@ -555,15 +555,6 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
return segs;
}
- msslen = htons(sizeof(*uh) + mss);
-
- /* GSO partial and frag_list segmentation only requires splitting
- * the frame into an MSS multiple and possibly a remainder, both
- * cases return a GSO skb. So update the mss now.
- */
- if (skb_is_gso(segs))
- mss *= skb_shinfo(segs)->gso_segs;
-
seg = segs;
uh = udp_hdr(seg);
@@ -586,7 +577,7 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
if (!seg->next)
break;
- uh->len = msslen;
+ uh->len = newlen;
uh->check = check;
if (seg->ip_summed == CHECKSUM_PARTIAL)
@@ -599,9 +590,12 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
uh = udp_hdr(seg);
}
- /* last packet can be partial gso_size, account for that in checksum */
- newlen = htons(skb_tail_pointer(seg) - skb_transport_header(seg) +
- seg->data_len);
+ /* Unless skb fits perfectly as GSO_PARTIAL, the trailing
+ * segment may not be full MSS, account for that in the checksum
+ */
+ if (!skb_is_gso(seg))
+ newlen = htons(skb_tail_pointer(seg) -
+ skb_transport_header(seg) + seg->data_len);
check = csum16_add(csum16_sub(uh->check, uh->len), newlen);
uh->len = newlen;
diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig
index c024aa77f25b..c3806c6ac96f 100644
--- a/net/ipv6/Kconfig
+++ b/net/ipv6/Kconfig
@@ -164,7 +164,7 @@ config IPV6_SIT
select INET_TUNNEL
select NET_IP_TUNNEL
select IPV6_NDISC_NODETYPE
- default y
+ default m
help
Tunneling means encapsulating data of one protocol type within
another protocol and sending it over a channel that understands the
@@ -172,7 +172,7 @@ config IPV6_SIT
into IPv4 packets. This is useful if you want to connect two IPv6
networks over an IPv4-only path.
- Saying M here will produce a module called sit. If unsure, say Y.
+ Saying M here will produce a module called sit. If unsure, say M.
config IPV6_SIT_6RD
bool "IPv6: IPv6 Rapid Deployment (6RD)"
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 5476b6536eb7..bb84a78b80f6 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -1013,7 +1013,7 @@ ipv6_link_dev_addr(struct inet6_dev *idev, struct inet6_ifaddr *ifp)
list_for_each(p, &idev->addr_list) {
struct inet6_ifaddr *ifa
= list_entry(p, struct inet6_ifaddr, if_list);
- if (ifp_scope > ipv6_addr_src_scope(&ifa->addr))
+ if (ifp_scope >= ipv6_addr_src_scope(&ifa->addr))
break;
}
diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c
index cb26beea4398..76f7a2de9108 100644
--- a/net/ipv6/ah6.c
+++ b/net/ipv6/ah6.c
@@ -317,14 +317,19 @@ static void ah6_output_done(void *data, int err)
struct ipv6hdr *top_iph = ipv6_hdr(skb);
struct ip_auth_hdr *ah = ip_auth_hdr(skb);
struct tmp_ext *iph_ext;
+ int seqhi_len = 0;
+ __be32 *seqhi;
extlen = skb_network_header_len(skb) - sizeof(struct ipv6hdr);
if (extlen)
extlen += sizeof(*iph_ext);
+ if (x->props.flags & XFRM_STATE_ESN)
+ seqhi_len = sizeof(*seqhi);
iph_base = AH_SKB_CB(skb)->tmp;
iph_ext = ah_tmp_ext(iph_base);
- icv = ah_tmp_icv(iph_ext, extlen);
+ seqhi = (__be32 *)((char *)iph_ext + extlen);
+ icv = ah_tmp_icv(seqhi, seqhi_len);
memcpy(ah->auth_data, icv, ahp->icv_trunc_len);
memcpy(top_iph, iph_base, IPV6HDR_BASELEN);
@@ -332,7 +337,7 @@ static void ah6_output_done(void *data, int err)
ah6_restore_hdrs(top_iph, iph_ext, extlen);
kfree(AH_SKB_CB(skb)->tmp);
- xfrm_output_resume(skb->sk, skb, err);
+ xfrm_output_resume(skb_to_full_sk(skb), skb, err);
}
static int ah6_output(struct xfrm_state *x, struct sk_buff *skb)
@@ -471,13 +476,18 @@ static void ah6_input_done(void *data, int err)
struct ip_auth_hdr *ah = ip_auth_hdr(skb);
int hdr_len = skb_network_header_len(skb);
int ah_hlen = ipv6_authlen(ah);
+ int seqhi_len = 0;
+ __be32 *seqhi;
if (err)
goto out;
+ if (x->props.flags & XFRM_STATE_ESN)
+ seqhi_len = sizeof(*seqhi);
work_iph = AH_SKB_CB(skb)->tmp;
auth_data = ah_tmp_auth(work_iph, hdr_len);
- icv = ah_tmp_icv(auth_data, ahp->icv_trunc_len);
+ seqhi = (__be32 *)(auth_data + ahp->icv_trunc_len);
+ icv = ah_tmp_icv(seqhi, seqhi_len);
err = crypto_memneq(icv, auth_data, ahp->icv_trunc_len) ? -EBADMSG : 0;
if (err)
diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c
index 67a42e01dfc3..be6dac8a8566 100644
--- a/net/ipv6/anycast.c
+++ b/net/ipv6/anycast.c
@@ -243,16 +243,16 @@ static void ipv6_add_acaddr_hash(struct net *net, struct ifacaddr6 *aca)
{
unsigned int hash = inet6_acaddr_hash(net, &aca->aca_addr);
- spin_lock(&acaddr_hash_lock);
+ spin_lock_bh(&acaddr_hash_lock);
hlist_add_head_rcu(&aca->aca_addr_lst, &inet6_acaddr_lst[hash]);
- spin_unlock(&acaddr_hash_lock);
+ spin_unlock_bh(&acaddr_hash_lock);
}
static void ipv6_del_acaddr_hash(struct ifacaddr6 *aca)
{
- spin_lock(&acaddr_hash_lock);
+ spin_lock_bh(&acaddr_hash_lock);
hlist_del_init_rcu(&aca->aca_addr_lst);
- spin_unlock(&acaddr_hash_lock);
+ spin_unlock_bh(&acaddr_hash_lock);
}
static void aca_get(struct ifacaddr6 *aca)
@@ -371,10 +371,10 @@ int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr)
aca->aca_next = idev->ac_list;
rcu_assign_pointer(idev->ac_list, aca);
- write_unlock_bh(&idev->lock);
-
ipv6_add_acaddr_hash(net, aca);
+ write_unlock_bh(&idev->lock);
+
ip6_ins_rt(net, f6i);
addrconf_join_solict(idev->dev, &aca->aca_addr);
@@ -649,8 +649,8 @@ void ipv6_anycast_cleanup(void)
{
int i;
- spin_lock(&acaddr_hash_lock);
+ spin_lock_bh(&acaddr_hash_lock);
for (i = 0; i < IN6_ADDR_HSIZE; i++)
WARN_ON(!hlist_empty(&inet6_acaddr_lst[i]));
- spin_unlock(&acaddr_hash_lock);
+ spin_unlock_bh(&acaddr_hash_lock);
}
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index ca3605acb433..38d7b4845281 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -617,6 +617,18 @@ void ip6_datagram_recv_common_ctl(struct sock *sk, struct msghdr *msg,
}
}
+static u16 ipv6_get_exthdr_len(const struct sk_buff *skb, const u8 *ptr)
+{
+ u16 len;
+
+ if (ptr + 2 > skb_tail_pointer(skb))
+ return 0;
+
+ len = (ptr[1] + 1) << 3;
+
+ return (len <= skb_tail_pointer(skb) - ptr) ? len : 0;
+}
+
void ip6_datagram_recv_specific_ctl(struct sock *sk, struct msghdr *msg,
struct sk_buff *skb)
{
@@ -643,7 +655,10 @@ void ip6_datagram_recv_specific_ctl(struct sock *sk, struct msghdr *msg,
/* HbH is allowed only once */
if (np->rxopt.bits.hopopts && (opt->flags & IP6SKB_HOPBYHOP)) {
u8 *ptr = nh + sizeof(struct ipv6hdr);
- put_cmsg(msg, SOL_IPV6, IPV6_HOPOPTS, (ptr[1]+1)<<3, ptr);
+ u16 len = ipv6_get_exthdr_len(skb, ptr);
+
+ if (len)
+ put_cmsg(msg, SOL_IPV6, IPV6_HOPOPTS, len, ptr);
}
if (opt->lastopt &&
@@ -664,26 +679,37 @@ void ip6_datagram_recv_specific_ctl(struct sock *sk, struct msghdr *msg,
unsigned int len;
u8 *ptr = nh + off;
+ if (ptr + 2 > skb_tail_pointer(skb))
+ return;
+
switch (nexthdr) {
case IPPROTO_DSTOPTS:
nexthdr = ptr[0];
- len = (ptr[1] + 1) << 3;
+ len = ipv6_get_exthdr_len(skb, ptr);
+ if (!len)
+ return;
if (np->rxopt.bits.dstopts)
put_cmsg(msg, SOL_IPV6, IPV6_DSTOPTS, len, ptr);
break;
case IPPROTO_ROUTING:
nexthdr = ptr[0];
- len = (ptr[1] + 1) << 3;
+ len = ipv6_get_exthdr_len(skb, ptr);
+ if (!len)
+ return;
if (np->rxopt.bits.srcrt)
put_cmsg(msg, SOL_IPV6, IPV6_RTHDR, len, ptr);
break;
case IPPROTO_AH:
nexthdr = ptr[0];
len = (ptr[1] + 2) << 2;
+ if (ptr + len > skb_tail_pointer(skb))
+ return;
break;
default:
nexthdr = ptr[0];
- len = (ptr[1] + 1) << 3;
+ len = ipv6_get_exthdr_len(skb, ptr);
+ if (!len)
+ return;
break;
}
@@ -705,19 +731,31 @@ void ip6_datagram_recv_specific_ctl(struct sock *sk, struct msghdr *msg,
}
if (np->rxopt.bits.ohopopts && (opt->flags & IP6SKB_HOPBYHOP)) {
u8 *ptr = nh + sizeof(struct ipv6hdr);
- put_cmsg(msg, SOL_IPV6, IPV6_2292HOPOPTS, (ptr[1]+1)<<3, ptr);
+ u16 len = ipv6_get_exthdr_len(skb, ptr);
+
+ if (len)
+ put_cmsg(msg, SOL_IPV6, IPV6_2292HOPOPTS, len, ptr);
}
if (np->rxopt.bits.odstopts && opt->dst0) {
u8 *ptr = nh + opt->dst0;
- put_cmsg(msg, SOL_IPV6, IPV6_2292DSTOPTS, (ptr[1]+1)<<3, ptr);
+ u16 len = ipv6_get_exthdr_len(skb, ptr);
+
+ if (len)
+ put_cmsg(msg, SOL_IPV6, IPV6_2292DSTOPTS, len, ptr);
}
if (np->rxopt.bits.osrcrt && opt->srcrt) {
struct ipv6_rt_hdr *rthdr = (struct ipv6_rt_hdr *)(nh + opt->srcrt);
- put_cmsg(msg, SOL_IPV6, IPV6_2292RTHDR, (rthdr->hdrlen+1) << 3, rthdr);
+ u16 len = ipv6_get_exthdr_len(skb, (u8 *)rthdr);
+
+ if (len)
+ put_cmsg(msg, SOL_IPV6, IPV6_2292RTHDR, len, rthdr);
}
if (np->rxopt.bits.odstopts && opt->dst1) {
u8 *ptr = nh + opt->dst1;
- put_cmsg(msg, SOL_IPV6, IPV6_2292DSTOPTS, (ptr[1]+1)<<3, ptr);
+ u16 len = ipv6_get_exthdr_len(skb, ptr);
+
+ if (len)
+ put_cmsg(msg, SOL_IPV6, IPV6_2292DSTOPTS, len, ptr);
}
if (np->rxopt.bits.rxorigdstaddr) {
struct sockaddr_in6 sin6;
diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index 9f75313734f8..57481e423e59 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -448,8 +448,8 @@ int esp6_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info
return err;
}
- if (ALIGN(tailen, L1_CACHE_BYTES) > PAGE_SIZE ||
- ALIGN(skb->data_len, L1_CACHE_BYTES) > PAGE_SIZE)
+ if (ALIGN(skb->data_len + tailen, L1_CACHE_BYTES) >
+ PAGE_SIZE)
goto cow;
if (!skb_cloned(skb)) {
@@ -915,7 +915,8 @@ static int esp6_input(struct xfrm_state *x, struct sk_buff *skb)
nfrags = 1;
goto skip_cow;
- } else if (!skb_has_frag_list(skb)) {
+ } else if (!skb_has_frag_list(skb) &&
+ !skb_has_shared_frag(skb)) {
nfrags = skb_shinfo(skb)->nr_frags;
nfrags++;
diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c
index 03cbce842c1a..43f46ef9c53b 100644
--- a/net/ipv6/exthdrs.c
+++ b/net/ipv6/exthdrs.c
@@ -184,6 +184,8 @@ static bool ip6_parse_tlv(bool hopbyhop,
case IPV6_TLV_JUMBO:
if (!ipv6_hop_jumbo(skb, off))
return false;
+
+ nh = skb_network_header(skb);
break;
case IPV6_TLV_CALIPSO:
if (!ipv6_hop_calipso(skb, off))
@@ -201,6 +203,8 @@ static bool ip6_parse_tlv(bool hopbyhop,
case IPV6_TLV_HAO:
if (!ipv6_dest_hao(skb, off))
return false;
+
+ nh = skb_network_header(skb);
break;
#endif
default:
@@ -544,7 +548,7 @@ looped_back:
* unsigned char which is segments_left field. Should not be
* higher than that.
*/
- if (r || (n + 1) > 255) {
+ if (r || (n + 1) > 127) {
kfree_skb(skb);
return -1;
}
@@ -910,16 +914,27 @@ static bool ipv6_hop_ra(struct sk_buff *skb, int optoff)
static bool ipv6_hop_ioam(struct sk_buff *skb, int optoff)
{
+ enum skb_drop_reason drop_reason;
struct ioam6_trace_hdr *trace;
struct ioam6_namespace *ns;
+ struct inet6_dev *idev;
struct ioam6_hdr *hdr;
+ drop_reason = SKB_DROP_REASON_IP_INHDR;
+
/* Bad alignment (must be 4n-aligned) */
if (optoff & 3)
goto drop;
+ /* Does the device still have IPv6 configuration? */
+ idev = __in6_dev_get(skb->dev);
+ if (!idev) {
+ drop_reason = SKB_DROP_REASON_IPV6DISABLED;
+ goto drop;
+ }
+
/* Ignore if IOAM is not enabled on ingress */
- if (!READ_ONCE(__in6_dev_get(skb->dev)->cnf.ioam6_enabled))
+ if (!READ_ONCE(idev->cnf.ioam6_enabled))
goto ignore;
/* Truncated Option header */
@@ -955,9 +970,9 @@ static bool ipv6_hop_ioam(struct sk_buff *skb, int optoff)
if (skb_ensure_writable(skb, optoff + 2 + hdr->opt_len))
goto drop;
- /* Trace pointer may have changed */
- trace = (struct ioam6_trace_hdr *)(skb_network_header(skb)
- + optoff + sizeof(*hdr));
+ /* Trace and hdr pointers may have changed */
+ hdr = (struct ioam6_hdr *)(skb_network_header(skb) + optoff);
+ trace = (struct ioam6_trace_hdr *)((u8 *)hdr + sizeof(*hdr));
ioam6_fill_trace_data(skb, ns, trace, true);
@@ -972,7 +987,7 @@ ignore:
return true;
drop:
- kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR);
+ kfree_skb_reason(skb, drop_reason);
return false;
}
diff --git a/net/ipv6/exthdrs_core.c b/net/ipv6/exthdrs_core.c
index 49e31e4ae7b7..9d06d487e8b1 100644
--- a/net/ipv6/exthdrs_core.c
+++ b/net/ipv6/exthdrs_core.c
@@ -73,6 +73,7 @@ int ipv6_skip_exthdr(const struct sk_buff *skb, int start, u8 *nexthdrp,
__be16 *frag_offp)
{
u8 nexthdr = *nexthdrp;
+ int exthdr_cnt = 0;
*frag_offp = 0;
@@ -82,6 +83,8 @@ int ipv6_skip_exthdr(const struct sk_buff *skb, int start, u8 *nexthdrp,
if (nexthdr == NEXTHDR_NONE)
return -1;
+ if (unlikely(exthdr_cnt++ >= IP6_MAX_EXT_HDRS_CNT))
+ return -1;
hp = skb_header_pointer(skb, start, sizeof(_hdr), &_hdr);
if (!hp)
return -1;
@@ -190,6 +193,7 @@ int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset,
{
unsigned int start = skb_network_offset(skb) + sizeof(struct ipv6hdr);
u8 nexthdr = ipv6_hdr(skb)->nexthdr;
+ int exthdr_cnt = 0;
bool found;
if (fragoff)
@@ -216,6 +220,9 @@ int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset,
return -ENOENT;
}
+ if (unlikely(exthdr_cnt++ >= IP6_MAX_EXT_HDRS_CNT))
+ return -EBADMSG;
+
hp = skb_header_pointer(skb, start, sizeof(_hdr), &_hdr);
if (!hp)
return -EBADMSG;
diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c
index c92f98c6f6ec..b1ccdf0dc646 100644
--- a/net/ipv6/ip6_flowlabel.c
+++ b/net/ipv6/ip6_flowlabel.c
@@ -36,11 +36,11 @@
/* FL hash table */
#define FL_MAX_PER_SOCK 32
-#define FL_MAX_SIZE 4096
+#define FL_MAX_SIZE 8192
#define FL_HASH_MASK 255
#define FL_HASH(l) (ntohl(l)&FL_HASH_MASK)
-static atomic_t fl_size = ATOMIC_INIT(0);
+static int fl_size;
static struct ip6_flowlabel __rcu *fl_ht[FL_HASH_MASK+1];
static void ip6_fl_gc(struct timer_list *unused);
@@ -162,8 +162,9 @@ static void ip6_fl_gc(struct timer_list *unused)
ttd = fl->expires;
if (time_after_eq(now, ttd)) {
*flp = fl->next;
+ fl_size--;
+ fl->fl_net->ipv6.flowlabel_count--;
fl_free(fl);
- atomic_dec(&fl_size);
continue;
}
if (!sched || time_before(ttd, sched))
@@ -172,7 +173,7 @@ static void ip6_fl_gc(struct timer_list *unused)
flp = &fl->next;
}
}
- if (!sched && atomic_read(&fl_size))
+ if (!sched && fl_size)
sched = now + FL_MAX_LINGER;
if (sched) {
mod_timer(&ip6_fl_gc_timer, sched);
@@ -196,7 +197,8 @@ static void __net_exit ip6_fl_purge(struct net *net)
atomic_read(&fl->users) == 0) {
*flp = fl->next;
fl_free(fl);
- atomic_dec(&fl_size);
+ fl_size--;
+ net->ipv6.flowlabel_count--;
continue;
}
flp = &fl->next;
@@ -210,10 +212,10 @@ static struct ip6_flowlabel *fl_intern(struct net *net,
{
struct ip6_flowlabel *lfl;
+ lockdep_assert_held(&ip6_fl_lock);
+
fl->label = label & IPV6_FLOWLABEL_MASK;
- rcu_read_lock();
- spin_lock_bh(&ip6_fl_lock);
if (label == 0) {
for (;;) {
fl->label = htonl(get_random_u32())&IPV6_FLOWLABEL_MASK;
@@ -235,8 +237,6 @@ static struct ip6_flowlabel *fl_intern(struct net *net,
lfl = __fl_lookup(net, fl->label);
if (lfl) {
atomic_inc(&lfl->users);
- spin_unlock_bh(&ip6_fl_lock);
- rcu_read_unlock();
return lfl;
}
}
@@ -244,9 +244,8 @@ static struct ip6_flowlabel *fl_intern(struct net *net,
fl->lastuse = jiffies;
fl->next = fl_ht[FL_HASH(fl->label)];
rcu_assign_pointer(fl_ht[FL_HASH(fl->label)], fl);
- atomic_inc(&fl_size);
- spin_unlock_bh(&ip6_fl_lock);
- rcu_read_unlock();
+ fl_size++;
+ net->ipv6.flowlabel_count++;
return NULL;
}
@@ -464,10 +463,17 @@ done:
static int mem_check(struct sock *sk)
{
- int room = FL_MAX_SIZE - atomic_read(&fl_size);
+ const int unpriv_total_limit = FL_MAX_SIZE - (FL_MAX_SIZE / 4);
+ const int unpriv_user_limit = unpriv_total_limit / 2;
+ struct net *net = sock_net(sk);
+ int room;
struct ipv6_fl_socklist *sfl;
int count = 0;
+ lockdep_assert_held(&ip6_fl_lock);
+
+ room = FL_MAX_SIZE - fl_size;
+
if (room > FL_MAX_SIZE - FL_MAX_PER_SOCK)
return 0;
@@ -478,7 +484,9 @@ static int mem_check(struct sock *sk)
if (room <= 0 ||
((count >= FL_MAX_PER_SOCK ||
- (count > 0 && room < FL_MAX_SIZE/2) || room < FL_MAX_SIZE/4) &&
+ (count > 0 && room < FL_MAX_SIZE / 2) ||
+ room < FL_MAX_SIZE / 4 ||
+ net->ipv6.flowlabel_count >= unpriv_user_limit) &&
!capable(CAP_NET_ADMIN)))
return -ENOBUFS;
@@ -692,11 +700,19 @@ release:
if (!sfl1)
goto done;
+ rcu_read_lock();
+ spin_lock_bh(&ip6_fl_lock);
err = mem_check(sk);
+ if (err == 0)
+ fl1 = fl_intern(net, fl, freq->flr_label);
+ else
+ fl1 = NULL;
+ spin_unlock_bh(&ip6_fl_lock);
+ rcu_read_unlock();
+
if (err != 0)
goto done;
- fl1 = fl_intern(net, fl, freq->flr_label);
if (fl1)
goto recheck;
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 63fc8556b475..365b4059eb20 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -2262,10 +2262,11 @@ static int ip6erspan_changelink(struct net_device *dev, struct nlattr *tb[],
struct nlattr *data[],
struct netlink_ext_ack *extack)
{
- struct ip6gre_net *ign = net_generic(dev_net(dev), ip6gre_net_id);
+ struct ip6_tnl *t = netdev_priv(dev);
struct __ip6_tnl_parm p;
- struct ip6_tnl *t;
+ struct ip6gre_net *ign;
+ ign = net_generic(t->net, ip6gre_net_id);
t = ip6gre_changelink_common(dev, tb, data, &p, extack);
if (IS_ERR(t))
return PTR_ERR(t);
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index 967b07aeb683..8972863c93ee 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -403,6 +403,7 @@ INDIRECT_CALLABLE_DECLARE(int tcp_v6_rcv(struct sk_buff *));
void ip6_protocol_deliver_rcu(struct net *net, struct sk_buff *skb, int nexthdr,
bool have_final)
{
+ int exthdr_cnt = IP6CB(skb)->flags & IP6SKB_HOPBYHOP ? 1 : 0;
const struct inet6_protocol *ipprot;
struct inet6_dev *idev;
unsigned int nhoff;
@@ -487,6 +488,10 @@ resubmit_final:
nexthdr = ret;
goto resubmit_final;
} else {
+ if (unlikely(exthdr_cnt++ >= IP6_MAX_EXT_HDRS_CNT)) {
+ SKB_DR_SET(reason, IPV6_TOO_MANY_EXTHDRS);
+ goto discard;
+ }
goto resubmit;
}
} else if (ret == 0) {
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 7e92909ab5be..c14adcdd4396 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -468,6 +468,7 @@ static int ip6_forward_proxy_check(struct sk_buff *skb)
default:
break;
}
+ hdr = ipv6_hdr(skb);
}
/*
@@ -582,6 +583,8 @@ int ip6_forward(struct sk_buff *skb)
if (READ_ONCE(net->ipv6.devconf_all->proxy_ndp) &&
pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev)) {
int proxied = ip6_forward_proxy_check(skb);
+
+ hdr = ipv6_hdr(skb);
if (proxied > 0) {
/* It's tempting to decrease the hop limit
* here by 1, as we do at the end of the
@@ -1794,6 +1797,8 @@ alloc_new_skb:
if (err < 0)
goto error;
copy = err;
+ if (!(flags & MSG_NO_SHARED_FRAGS))
+ skb_shinfo(skb)->flags |= SKBFL_SHARED_FRAG;
wmem_alloc_delta += copy;
} else if (!zc) {
int i = skb_shinfo(skb)->nr_frags;
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index c468c83af0f2..9d1037ac082f 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -399,11 +399,15 @@ __u16 ip6_tnl_parse_tlv_enc_lim(struct sk_buff *skb, __u8 *raw)
unsigned int nhoff = raw - skb->data;
unsigned int off = nhoff + sizeof(*ipv6h);
u8 nexthdr = ipv6h->nexthdr;
+ int exthdr_cnt = 0;
while (ipv6_ext_hdr(nexthdr) && nexthdr != NEXTHDR_NONE) {
struct ipv6_opt_hdr *hdr;
u16 optlen;
+ if (unlikely(exthdr_cnt++ >= IP6_MAX_EXT_HDRS_CNT))
+ break;
+
if (!pskb_may_pull(skb, off + sizeof(*hdr)))
break;
diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c
index ad5290be4dd6..df793c8bfffb 100644
--- a/net/ipv6/ip6_vti.c
+++ b/net/ipv6/ip6_vti.c
@@ -722,10 +722,11 @@ vti6_tnl_change(struct ip6_tnl *t, const struct __ip6_tnl_parm *p,
static int vti6_update(struct ip6_tnl *t, struct __ip6_tnl_parm *p,
bool keep_mtu)
{
- struct net *net = dev_net(t->dev);
- struct vti6_net *ip6n = net_generic(net, vti6_net_id);
+ struct net *net = t->net;
+ struct vti6_net *ip6n;
int err;
+ ip6n = net_generic(net, vti6_net_id);
vti6_tnl_unlink(ip6n, t);
synchronize_net();
err = vti6_tnl_change(t, p, keep_mtu);
@@ -834,17 +835,24 @@ vti6_siocdevprivate(struct net_device *dev, struct ifreq *ifr, void __user *data
if (p.proto != IPPROTO_IPV6 && p.proto != 0)
break;
vti6_parm_from_user(&p1, &p);
- t = vti6_locate(net, &p1, cmd == SIOCADDTUNNEL);
if (dev != ip6n->fb_tnl_dev && cmd == SIOCCHGTUNNEL) {
+ struct ip6_tnl *self = netdev_priv(dev);
+
+ err = -EPERM;
+ if (!ns_capable(self->net->user_ns, CAP_NET_ADMIN))
+ break;
+ t = vti6_locate(self->net, &p1, false);
if (t) {
if (t->dev != dev) {
err = -EEXIST;
break;
}
} else
- t = netdev_priv(dev);
+ t = self;
err = vti6_update(t, &p1, false);
+ } else {
+ t = vti6_locate(net, &p1, cmd == SIOCADDTUNNEL);
}
if (t) {
err = 0;
@@ -1031,11 +1039,12 @@ static int vti6_changelink(struct net_device *dev, struct nlattr *tb[],
struct nlattr *data[],
struct netlink_ext_ack *extack)
{
- struct ip6_tnl *t;
+ struct ip6_tnl *t = netdev_priv(dev);
+ struct net *net = t->net;
struct __ip6_tnl_parm p;
- struct net *net = dev_net(dev);
- struct vti6_net *ip6n = net_generic(net, vti6_net_id);
+ struct vti6_net *ip6n;
+ ip6n = net_generic(net, vti6_net_id);
if (dev == ip6n->fb_tnl_dev)
return -EINVAL;
diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index 3330adcf26db..d9b855d5191b 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -1424,9 +1424,9 @@ out:
static void __mld_query_work(struct sk_buff *skb)
{
struct mld2_query *mlh2 = NULL;
- const struct in6_addr *group;
unsigned long max_delay;
struct inet6_dev *idev;
+ struct in6_addr group;
struct ifmcaddr6 *ma;
struct mld_msg *mld;
int group_type;
@@ -1458,8 +1458,8 @@ static void __mld_query_work(struct sk_buff *skb)
goto kfree_skb;
mld = (struct mld_msg *)icmp6_hdr(skb);
- group = &mld->mld_mca;
- group_type = ipv6_addr_type(group);
+ group = mld->mld_mca;
+ group_type = ipv6_addr_type(&group);
if (group_type != IPV6_ADDR_ANY &&
!(group_type&IPV6_ADDR_MULTICAST))
@@ -1509,7 +1509,7 @@ static void __mld_query_work(struct sk_buff *skb)
}
} else {
for_each_mc_mclock(idev, ma) {
- if (!ipv6_addr_equal(group, &ma->mca_addr))
+ if (!ipv6_addr_equal(&group, &ma->mca_addr))
continue;
if (ma->mca_flags & MAF_TIMER_RUNNING) {
/* gsquery <- gsquery && mark */
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index d585ac3c1113..9d9c3763f2f5 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -1713,12 +1713,10 @@ do_ip6t_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
static void __ip6t_unregister_table(struct net *net, struct xt_table *table)
{
- struct xt_table_info *private;
- void *loc_cpu_entry;
+ struct xt_table_info *private = table->private;
struct module *table_owner = table->me;
struct ip6t_entry *iter;
-
- private = xt_unregister_table(table);
+ void *loc_cpu_entry;
/* Decrease module usage counts and free resources */
loc_cpu_entry = private->entries;
@@ -1727,19 +1725,18 @@ static void __ip6t_unregister_table(struct net *net, struct xt_table *table)
if (private->number > private->initial_entries)
module_put(table_owner);
xt_free_table_info(private);
+ kfree(table);
}
int ip6t_register_table(struct net *net, const struct xt_table *table,
const struct ip6t_replace *repl,
const struct nf_hook_ops *template_ops)
{
- struct nf_hook_ops *ops;
- unsigned int num_ops;
- int ret, i;
- struct xt_table_info *newinfo;
struct xt_table_info bootstrap = {0};
- void *loc_cpu_entry;
+ struct xt_table_info *newinfo;
struct xt_table *new_table;
+ void *loc_cpu_entry;
+ int ret;
newinfo = xt_alloc_table_info(repl->size);
if (!newinfo)
@@ -1754,7 +1751,7 @@ int ip6t_register_table(struct net *net, const struct xt_table *table,
return ret;
}
- new_table = xt_register_table(net, table, &bootstrap, newinfo);
+ new_table = xt_register_table(net, table, template_ops, &bootstrap, newinfo);
if (IS_ERR(new_table)) {
struct ip6t_entry *iter;
@@ -1764,48 +1761,12 @@ int ip6t_register_table(struct net *net, const struct xt_table *table,
return PTR_ERR(new_table);
}
- if (!template_ops)
- return 0;
-
- num_ops = hweight32(table->valid_hooks);
- if (num_ops == 0) {
- ret = -EINVAL;
- goto out_free;
- }
-
- ops = kmemdup_array(template_ops, num_ops, sizeof(*ops), GFP_KERNEL);
- if (!ops) {
- ret = -ENOMEM;
- goto out_free;
- }
-
- for (i = 0; i < num_ops; i++)
- ops[i].priv = new_table;
-
- new_table->ops = ops;
-
- ret = nf_register_net_hooks(net, ops, num_ops);
- if (ret != 0)
- goto out_free;
-
return ret;
-
-out_free:
- __ip6t_unregister_table(net, new_table);
- return ret;
-}
-
-void ip6t_unregister_table_pre_exit(struct net *net, const char *name)
-{
- struct xt_table *table = xt_find_table(net, NFPROTO_IPV6, name);
-
- if (table)
- nf_unregister_net_hooks(net, table->ops, hweight32(table->valid_hooks));
}
void ip6t_unregister_table_exit(struct net *net, const char *name)
{
- struct xt_table *table = xt_find_table(net, NFPROTO_IPV6, name);
+ struct xt_table *table = xt_unregister_table_exit(net, NFPROTO_IPV6, name);
if (table)
__ip6t_unregister_table(net, table);
@@ -1894,7 +1855,6 @@ static void __exit ip6_tables_fini(void)
}
EXPORT_SYMBOL(ip6t_register_table);
-EXPORT_SYMBOL(ip6t_unregister_table_pre_exit);
EXPORT_SYMBOL(ip6t_unregister_table_exit);
EXPORT_SYMBOL(ip6t_do_table);
diff --git a/net/ipv6/netfilter/ip6t_hbh.c b/net/ipv6/netfilter/ip6t_hbh.c
index e7a3fb9355ee..450dd53846a2 100644
--- a/net/ipv6/netfilter/ip6t_hbh.c
+++ b/net/ipv6/netfilter/ip6t_hbh.c
@@ -168,6 +168,10 @@ static int hbh_mt6_check(const struct xt_mtchk_param *par)
pr_debug("unknown flags %X\n", optsinfo->invflags);
return -EINVAL;
}
+ if (optsinfo->optsnr > IP6T_OPTS_OPTSNR) {
+ pr_debug("too many supported opts specified\n");
+ return -EINVAL;
+ }
if (optsinfo->flags & IP6T_OPTS_NSTRICT) {
pr_debug("Not strict - not implemented");
diff --git a/net/ipv6/netfilter/ip6table_filter.c b/net/ipv6/netfilter/ip6table_filter.c
index e8992693e14a..b074fc477676 100644
--- a/net/ipv6/netfilter/ip6table_filter.c
+++ b/net/ipv6/netfilter/ip6table_filter.c
@@ -60,7 +60,7 @@ static int __net_init ip6table_filter_net_init(struct net *net)
static void __net_exit ip6table_filter_net_pre_exit(struct net *net)
{
- ip6t_unregister_table_pre_exit(net, "filter");
+ xt_unregister_table_pre_exit(net, NFPROTO_IPV6, "filter");
}
static void __net_exit ip6table_filter_net_exit(struct net *net)
@@ -76,32 +76,32 @@ static struct pernet_operations ip6table_filter_net_ops = {
static int __init ip6table_filter_init(void)
{
- int ret = xt_register_template(&packet_filter,
- ip6table_filter_table_init);
-
- if (ret < 0)
- return ret;
+ int ret;
filter_ops = xt_hook_ops_alloc(&packet_filter, ip6t_do_table);
- if (IS_ERR(filter_ops)) {
- xt_unregister_template(&packet_filter);
+ if (IS_ERR(filter_ops))
return PTR_ERR(filter_ops);
- }
ret = register_pernet_subsys(&ip6table_filter_net_ops);
+ if (ret < 0)
+ goto err_free;
+
+ ret = xt_register_template(&packet_filter, ip6table_filter_table_init);
if (ret < 0) {
- xt_unregister_template(&packet_filter);
- kfree(filter_ops);
- return ret;
+ unregister_pernet_subsys(&ip6table_filter_net_ops);
+ goto err_free;
}
+ return 0;
+err_free:
+ kfree(filter_ops);
return ret;
}
static void __exit ip6table_filter_fini(void)
{
- unregister_pernet_subsys(&ip6table_filter_net_ops);
xt_unregister_template(&packet_filter);
+ unregister_pernet_subsys(&ip6table_filter_net_ops);
kfree(filter_ops);
}
diff --git a/net/ipv6/netfilter/ip6table_mangle.c b/net/ipv6/netfilter/ip6table_mangle.c
index 8dd4cd0c47bd..e6ee036a9b2c 100644
--- a/net/ipv6/netfilter/ip6table_mangle.c
+++ b/net/ipv6/netfilter/ip6table_mangle.c
@@ -89,7 +89,7 @@ static int ip6table_mangle_table_init(struct net *net)
static void __net_exit ip6table_mangle_net_pre_exit(struct net *net)
{
- ip6t_unregister_table_pre_exit(net, "mangle");
+ xt_unregister_table_pre_exit(net, NFPROTO_IPV6, "mangle");
}
static void __net_exit ip6table_mangle_net_exit(struct net *net)
@@ -104,32 +104,33 @@ static struct pernet_operations ip6table_mangle_net_ops = {
static int __init ip6table_mangle_init(void)
{
- int ret = xt_register_template(&packet_mangler,
- ip6table_mangle_table_init);
-
- if (ret < 0)
- return ret;
+ int ret;
mangle_ops = xt_hook_ops_alloc(&packet_mangler, ip6table_mangle_hook);
- if (IS_ERR(mangle_ops)) {
- xt_unregister_template(&packet_mangler);
+ if (IS_ERR(mangle_ops))
return PTR_ERR(mangle_ops);
- }
ret = register_pernet_subsys(&ip6table_mangle_net_ops);
+ if (ret < 0)
+ goto err_free;
+
+ ret = xt_register_template(&packet_mangler,
+ ip6table_mangle_table_init);
if (ret < 0) {
- xt_unregister_template(&packet_mangler);
- kfree(mangle_ops);
- return ret;
+ unregister_pernet_subsys(&ip6table_mangle_net_ops);
+ goto err_free;
}
+ return 0;
+err_free:
+ kfree(mangle_ops);
return ret;
}
static void __exit ip6table_mangle_fini(void)
{
- unregister_pernet_subsys(&ip6table_mangle_net_ops);
xt_unregister_template(&packet_mangler);
+ unregister_pernet_subsys(&ip6table_mangle_net_ops);
kfree(mangle_ops);
}
diff --git a/net/ipv6/netfilter/ip6table_nat.c b/net/ipv6/netfilter/ip6table_nat.c
index 5be723232df8..c2394e2c94b5 100644
--- a/net/ipv6/netfilter/ip6table_nat.c
+++ b/net/ipv6/netfilter/ip6table_nat.c
@@ -121,8 +121,11 @@ static int ip6table_nat_table_init(struct net *net)
}
ret = ip6t_nat_register_lookups(net);
- if (ret < 0)
+ if (ret < 0) {
+ xt_unregister_table_pre_exit(net, NFPROTO_IPV6, "nat");
+ synchronize_rcu();
ip6t_unregister_table_exit(net, "nat");
+ }
kfree(repl);
return ret;
@@ -131,6 +134,7 @@ static int ip6table_nat_table_init(struct net *net)
static void __net_exit ip6table_nat_net_pre_exit(struct net *net)
{
ip6t_nat_unregister_lookups(net);
+ xt_unregister_table_pre_exit(net, NFPROTO_IPV6, "nat");
}
static void __net_exit ip6table_nat_net_exit(struct net *net)
diff --git a/net/ipv6/netfilter/ip6table_raw.c b/net/ipv6/netfilter/ip6table_raw.c
index fc9f6754028f..3b161ee875bc 100644
--- a/net/ipv6/netfilter/ip6table_raw.c
+++ b/net/ipv6/netfilter/ip6table_raw.c
@@ -52,7 +52,7 @@ static int ip6table_raw_table_init(struct net *net)
static void __net_exit ip6table_raw_net_pre_exit(struct net *net)
{
- ip6t_unregister_table_pre_exit(net, "raw");
+ xt_unregister_table_pre_exit(net, NFPROTO_IPV6, "raw");
}
static void __net_exit ip6table_raw_net_exit(struct net *net)
@@ -75,31 +75,31 @@ static int __init ip6table_raw_init(void)
pr_info("Enabling raw table before defrag\n");
}
- ret = xt_register_template(table, ip6table_raw_table_init);
- if (ret < 0)
- return ret;
-
/* Register hooks */
rawtable_ops = xt_hook_ops_alloc(table, ip6t_do_table);
- if (IS_ERR(rawtable_ops)) {
- xt_unregister_template(table);
+ if (IS_ERR(rawtable_ops))
return PTR_ERR(rawtable_ops);
- }
ret = register_pernet_subsys(&ip6table_raw_net_ops);
+ if (ret < 0)
+ goto err_free;
+
+ ret = xt_register_template(table, ip6table_raw_table_init);
if (ret < 0) {
- kfree(rawtable_ops);
- xt_unregister_template(table);
- return ret;
+ unregister_pernet_subsys(&ip6table_raw_net_ops);
+ goto err_free;
}
+ return 0;
+err_free:
+ kfree(rawtable_ops);
return ret;
}
static void __exit ip6table_raw_fini(void)
{
- unregister_pernet_subsys(&ip6table_raw_net_ops);
xt_unregister_template(&packet_raw);
+ unregister_pernet_subsys(&ip6table_raw_net_ops);
kfree(rawtable_ops);
}
diff --git a/net/ipv6/netfilter/ip6table_security.c b/net/ipv6/netfilter/ip6table_security.c
index 4df14a9bae78..4bd5d97b8ab6 100644
--- a/net/ipv6/netfilter/ip6table_security.c
+++ b/net/ipv6/netfilter/ip6table_security.c
@@ -49,7 +49,7 @@ static int ip6table_security_table_init(struct net *net)
static void __net_exit ip6table_security_net_pre_exit(struct net *net)
{
- ip6t_unregister_table_pre_exit(net, "security");
+ xt_unregister_table_pre_exit(net, NFPROTO_IPV6, "security");
}
static void __net_exit ip6table_security_net_exit(struct net *net)
@@ -64,32 +64,33 @@ static struct pernet_operations ip6table_security_net_ops = {
static int __init ip6table_security_init(void)
{
- int ret = xt_register_template(&security_table,
- ip6table_security_table_init);
-
- if (ret < 0)
- return ret;
+ int ret;
sectbl_ops = xt_hook_ops_alloc(&security_table, ip6t_do_table);
- if (IS_ERR(sectbl_ops)) {
- xt_unregister_template(&security_table);
+ if (IS_ERR(sectbl_ops))
return PTR_ERR(sectbl_ops);
- }
ret = register_pernet_subsys(&ip6table_security_net_ops);
+ if (ret < 0)
+ goto err_free;
+
+ ret = xt_register_template(&security_table,
+ ip6table_security_table_init);
if (ret < 0) {
- kfree(sectbl_ops);
- xt_unregister_template(&security_table);
- return ret;
+ unregister_pernet_subsys(&ip6table_security_net_ops);
+ goto err_free;
}
+ return 0;
+err_free:
+ kfree(sectbl_ops);
return ret;
}
static void __exit ip6table_security_fini(void)
{
- unregister_pernet_subsys(&ip6table_security_net_ops);
xt_unregister_template(&security_table);
+ unregister_pernet_subsys(&ip6table_security_net_ops);
kfree(sectbl_ops);
}
diff --git a/net/ipv6/netfilter/nf_socket_ipv6.c b/net/ipv6/netfilter/nf_socket_ipv6.c
index ced8bd44828e..893f2aeb4711 100644
--- a/net/ipv6/netfilter/nf_socket_ipv6.c
+++ b/net/ipv6/netfilter/nf_socket_ipv6.c
@@ -100,6 +100,7 @@ struct sock *nf_sk_lookup_slow_v6(struct net *net, const struct sk_buff *skb,
const struct in6_addr *daddr = NULL, *saddr = NULL;
struct ipv6hdr *iph = ipv6_hdr(skb), ipv6_var;
struct sk_buff *data_skb = NULL;
+ unsigned short fragoff = 0;
int doff = 0;
int thoff = 0, tproto;
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
@@ -107,8 +108,8 @@ struct sock *nf_sk_lookup_slow_v6(struct net *net, const struct sk_buff *skb,
struct nf_conn const *ct;
#endif
- tproto = ipv6_find_hdr(skb, &thoff, -1, NULL, NULL);
- if (tproto < 0) {
+ tproto = ipv6_find_hdr(skb, &thoff, -1, &fragoff, NULL);
+ if (tproto < 0 || fragoff) {
pr_debug("unable to find transport header in IPv6 packet, dropping\n");
return NULL;
}
diff --git a/net/ipv6/netfilter/nft_fib_ipv6.c b/net/ipv6/netfilter/nft_fib_ipv6.c
index 8b2dba88ee96..2dbe44715df3 100644
--- a/net/ipv6/netfilter/nft_fib_ipv6.c
+++ b/net/ipv6/netfilter/nft_fib_ipv6.c
@@ -160,21 +160,40 @@ static bool nft_fib6_info_nh_dev_match(const struct net_device *nh_dev,
l3mdev_master_ifindex_rcu(nh_dev) == dev->ifindex;
}
+static int nft_fib6_nh_match_dev_cb(struct fib6_nh *nh, void *arg)
+{
+ const struct net_device *dev = arg;
+
+ return nft_fib6_info_nh_dev_match(nh->fib_nh_dev, dev);
+}
+
static bool nft_fib6_info_nh_uses_dev(struct fib6_info *rt,
const struct net_device *dev)
{
const struct net_device *nh_dev;
struct fib6_info *iter;
+ /* External nexthop: fib6_siblings slot aliases nh_list, walk via nh. */
+ if (rt->nh)
+ return nexthop_for_each_fib6_nh(rt->nh,
+ nft_fib6_nh_match_dev_cb,
+ (void *)dev);
+
nh_dev = fib6_info_nh_dev(rt);
if (nft_fib6_info_nh_dev_match(nh_dev, dev))
return true;
- list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
+ if (!READ_ONCE(rt->fib6_nsiblings))
+ return false;
+
+ list_for_each_entry_rcu(iter, &rt->fib6_siblings, fib6_siblings) {
nh_dev = fib6_info_nh_dev(iter);
if (nft_fib6_info_nh_dev_match(nh_dev, dev))
return true;
+
+ if (!READ_ONCE(rt->fib6_nsiblings))
+ return false;
}
return false;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 19eb6b702227..636f0120d7e3 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -481,6 +481,9 @@ void fib6_select_path(const struct net *net, struct fib6_result *res,
const struct fib6_nh *nh = sibling->fib6_nh;
int nh_upper_bound;
+ if (!READ_ONCE(first->fib6_nsiblings))
+ break;
+
nh_upper_bound = atomic_read(&nh->fib_nh_upper_bound);
if (hash > nh_upper_bound)
continue;
@@ -1645,6 +1648,10 @@ static unsigned int fib6_mtu(const struct fib6_result *res)
rcu_read_lock();
idev = __in6_dev_get(dev);
+ if (!idev) {
+ rcu_read_unlock();
+ return 0;
+ }
mtu = READ_ONCE(idev->cnf.mtu6);
rcu_read_unlock();
}
@@ -4995,6 +5002,7 @@ static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
break;
rt->fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN;
+ fib6_update_sernum(net, rt);
rt6_multipath_rebalance(rt);
break;
}
@@ -5897,6 +5905,8 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
goto nla_put_failure;
}
+ if (!READ_ONCE(rt->fib6_nsiblings))
+ break;
}
rcu_read_unlock();
@@ -6928,7 +6938,7 @@ int __init ip6_route_init(void)
#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
ret = bpf_iter_register();
if (ret)
- goto out_register_late_subsys;
+ goto out_register_notifier;
#endif
for_each_possible_cpu(cpu) {
@@ -6941,6 +6951,10 @@ int __init ip6_route_init(void)
out:
return ret;
+#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
+out_register_notifier:
+ unregister_netdevice_notifier(&ip6_route_dev_notifier);
+#endif
out_register_late_subsys:
rtnl_unregister_all(PF_INET6);
unregister_pernet_subsys(&ip6_route_net_late_ops);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 2c3f7a739709..36d75fb50a70 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -288,8 +288,10 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr_unsized *uaddr,
saddr = &fl6->saddr;
err = inet_bhash2_update_saddr(sk, saddr, AF_INET6);
- if (err)
+ if (err) {
+ dst_release(dst);
goto failure;
+ }
}
/* set the source address */
@@ -1617,12 +1619,13 @@ int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
if (sk->sk_state == TCP_LISTEN) {
struct sock *nsk = tcp_v6_cookie_check(sk, skb);
+ if (!nsk)
+ return 0;
if (nsk != sk) {
- if (nsk) {
- reason = tcp_child_process(sk, nsk, skb);
- if (reason)
- goto reset;
- }
+ reason = tcp_child_process(sk, nsk, skb);
+ sock_put(nsk);
+ if (reason)
+ goto reset;
return 0;
}
} else
@@ -1827,13 +1830,16 @@ lookup:
rst_reason = sk_rst_convert_drop_reason(drop_reason);
tcp_v6_send_reset(nsk, skb, rst_reason);
+ sock_put(nsk);
goto discard_and_relse;
}
+ sock_put(nsk);
sock_put(sk);
return 0;
}
}
+ isn = 0;
process:
if (static_branch_unlikely(&ip6_min_hopcount)) {
/* min_hopcount can be changed concurrently from do_ipv6_setsockopt() */
@@ -1863,6 +1869,7 @@ process:
th = (const struct tcphdr *)skb->data;
hdr = ipv6_hdr(skb);
tcp_v6_fill_cb(skb, hdr, th);
+ TCP_SKB_CB(skb)->tcp_tw_isn = isn;
skb->dev = NULL;
@@ -1951,7 +1958,6 @@ do_time_wait:
sk = sk2;
tcp_v6_restore_cb(skb);
refcounted = false;
- __this_cpu_write(tcp_tw_isn, isn);
goto process;
}
diff --git a/net/ipv6/xfrm6_protocol.c b/net/ipv6/xfrm6_protocol.c
index ea2f805d3b01..9b586fcec485 100644
--- a/net/ipv6/xfrm6_protocol.c
+++ b/net/ipv6/xfrm6_protocol.c
@@ -88,8 +88,10 @@ int xfrm6_rcv_encap(struct sk_buff *skb, int nexthdr, __be32 spi,
dst = ip6_route_input_lookup(dev_net(skb->dev), skb->dev, &fl6,
skb, flags);
- if (dst->error)
+ if (dst->error) {
+ dst_release(dst);
goto drop;
+ }
skb_dst_set(skb, dst);
}
diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c
index 72dfccd4e3d5..c2dc3338670e 100644
--- a/net/iucv/af_iucv.c
+++ b/net/iucv/af_iucv.c
@@ -1540,7 +1540,7 @@ static int iucv_sock_getsockopt(struct socket *sock, int level, int optname,
struct sock *sk = sock->sk;
struct iucv_sock *iucv = iucv_sk(sk);
unsigned int val;
- int len;
+ int len, rc;
if (level != SOL_IUCV)
return -ENOPROTOOPT;
@@ -1553,26 +1553,34 @@ static int iucv_sock_getsockopt(struct socket *sock, int level, int optname,
len = min_t(unsigned int, len, sizeof(int));
+ rc = 0;
+
+ lock_sock(sk);
switch (optname) {
case SO_IPRMDATA_MSG:
val = (iucv->flags & IUCV_IPRMDATA) ? 1 : 0;
break;
case SO_MSGLIMIT:
- lock_sock(sk);
val = (iucv->path != NULL) ? iucv->path->msglim /* connected */
: iucv->msglimit; /* default */
- release_sock(sk);
break;
case SO_MSGSIZE:
- if (sk->sk_state == IUCV_OPEN)
- return -EBADFD;
+ if (sk->sk_state == IUCV_OPEN) {
+ rc = -EBADFD;
+ break;
+ }
val = (iucv->hs_dev) ? iucv->hs_dev->mtu -
sizeof(struct af_iucv_trans_hdr) - ETH_HLEN :
0x7fffffff;
break;
default:
- return -ENOPROTOOPT;
+ rc = -ENOPROTOOPT;
+ break;
}
+ release_sock(sk);
+
+ if (rc)
+ return rc;
if (put_user(len, optlen))
return -EFAULT;
diff --git a/net/key/af_key.c b/net/key/af_key.c
index a166a88d8788..9cffeef18cd9 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -3564,7 +3564,7 @@ static int set_ipsecrequest(struct sk_buff *skb,
#ifdef CONFIG_NET_KEY_MIGRATE
static int pfkey_send_migrate(const struct xfrm_selector *sel, u8 dir, u8 type,
const struct xfrm_migrate *m, int num_bundles,
- const struct xfrm_kmaddress *k,
+ const struct xfrm_kmaddress *k, struct net *net,
const struct xfrm_encap_tmpl *encap)
{
int i;
@@ -3669,7 +3669,7 @@ static int pfkey_send_migrate(const struct xfrm_selector *sel, u8 dir, u8 type,
}
/* broadcast migrate message to sockets */
- pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_ALL, NULL, &init_net);
+ pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_ALL, NULL, net);
return 0;
@@ -3680,7 +3680,7 @@ err:
#else
static int pfkey_send_migrate(const struct xfrm_selector *sel, u8 dir, u8 type,
const struct xfrm_migrate *m, int num_bundles,
- const struct xfrm_kmaddress *k,
+ const struct xfrm_kmaddress *k, struct net *net,
const struct xfrm_encap_tmpl *encap)
{
return -ENOPROTOOPT;
diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c
index 157fc23ce4e1..9419c8555d22 100644
--- a/net/l2tp/l2tp_core.c
+++ b/net/l2tp/l2tp_core.c
@@ -441,12 +441,13 @@ struct l2tp_session *l2tp_session_get_by_ifname(const struct net *net,
idr_for_each_entry_ul(&pn->l2tp_tunnel_idr, tunnel, tmp, tunnel_id) {
if (tunnel) {
list_for_each_entry_rcu(session, &tunnel->session_list, list) {
- if (!strcmp(session->ifname, ifname)) {
- refcount_inc(&session->ref_count);
- rcu_read_unlock_bh();
+ if (strcmp(session->ifname, ifname))
+ continue;
+ if (!refcount_inc_not_zero(&session->ref_count))
+ continue;
+ rcu_read_unlock_bh();
- return session;
- }
+ return session;
}
}
}
@@ -1360,7 +1361,7 @@ static void l2tp_session_unhash(struct l2tp_session *session)
spin_lock_bh(&pn->l2tp_session_idr_lock);
/* Remove from the per-tunnel list */
- list_del_init(&session->list);
+ list_del_rcu(&session->list);
/* Remove from per-net IDR */
if (tunnel->version == L2TP_HDR_VER_3) {
diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c
index 99d6582f41de..e0b1915be1a6 100644
--- a/net/l2tp/l2tp_ppp.c
+++ b/net/l2tp/l2tp_ppp.c
@@ -1045,64 +1045,76 @@ static int pppol2tp_ioctl(struct socket *sock, unsigned int cmd,
{
struct pppol2tp_ioc_stats stats;
struct l2tp_session *session;
+ int err = 0;
+
+ session = pppol2tp_sock_to_session(sock->sk);
+ /* Validate session presence and magic integrity ONLY for commands
+ * that belong to L2TP and require a valid session.
+ */
switch (cmd) {
case PPPIOCGMRU:
case PPPIOCGFLAGS:
- session = sock->sk->sk_user_data;
+ case PPPIOCSMRU:
+ case PPPIOCSFLAGS:
+ case PPPIOCGL2TPSTATS:
if (!session)
return -ENOTCONN;
- if (WARN_ON(session->magic != L2TP_SESSION_MAGIC))
+ if (session->magic != L2TP_SESSION_MAGIC) {
+ l2tp_session_put(session);
return -EBADF;
+ }
+ break;
+ default:
+ break;
+ }
+ switch (cmd) {
+ case PPPIOCGMRU:
+ case PPPIOCGFLAGS:
/* Not defined for tunnels */
- if (!session->session_id && !session->peer_session_id)
- return -ENOSYS;
+ if (!session->session_id && !session->peer_session_id) {
+ err = -ENOSYS;
+ break;
+ }
- if (put_user(0, (int __user *)arg))
- return -EFAULT;
+ if (put_user(0, (int __user *)arg)) {
+ err = -EFAULT;
+ break;
+ }
break;
case PPPIOCSMRU:
case PPPIOCSFLAGS:
- session = sock->sk->sk_user_data;
- if (!session)
- return -ENOTCONN;
-
- if (WARN_ON(session->magic != L2TP_SESSION_MAGIC))
- return -EBADF;
-
/* Not defined for tunnels */
- if (!session->session_id && !session->peer_session_id)
- return -ENOSYS;
+ if (!session->session_id && !session->peer_session_id) {
+ err = -ENOSYS;
+ break;
+ }
- if (!access_ok((int __user *)arg, sizeof(int)))
- return -EFAULT;
+ if (!access_ok((int __user *)arg, sizeof(int))) {
+ err = -EFAULT;
+ break;
+ }
break;
case PPPIOCGL2TPSTATS:
- session = sock->sk->sk_user_data;
- if (!session)
- return -ENOTCONN;
-
- if (WARN_ON(session->magic != L2TP_SESSION_MAGIC))
- return -EBADF;
-
/* Session 0 represents the parent tunnel */
if (!session->session_id && !session->peer_session_id) {
u32 session_id;
- int err;
if (copy_from_user(&stats, (void __user *)arg,
- sizeof(stats)))
- return -EFAULT;
+ sizeof(stats))) {
+ err = -EFAULT;
+ break;
+ }
session_id = stats.session_id;
err = pppol2tp_tunnel_copy_stats(&stats,
session->tunnel);
if (err < 0)
- return err;
+ break;
stats.session_id = session_id;
} else {
@@ -1112,15 +1124,21 @@ static int pppol2tp_ioctl(struct socket *sock, unsigned int cmd,
stats.tunnel_id = session->tunnel->tunnel_id;
stats.using_ipsec = l2tp_tunnel_uses_xfrm(session->tunnel);
- if (copy_to_user((void __user *)arg, &stats, sizeof(stats)))
- return -EFAULT;
+ if (copy_to_user((void __user *)arg, &stats, sizeof(stats))) {
+ err = -EFAULT;
+ break;
+ }
break;
default:
- return -ENOIOCTLCMD;
+ err = -ENOIOCTLCMD;
+ break;
}
- return 0;
+ if (session)
+ l2tp_session_put(session);
+
+ return err;
}
/*****************************************************************************
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 7b77d57c9f96..f9ee9947a94d 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -2344,8 +2344,9 @@ static int sta_apply_parameters(struct ieee80211_local *local,
sta->sta.max_sp = params->max_sp;
}
- ieee80211_sta_set_max_amsdu_subframes(sta, params->ext_capab,
- params->ext_capab_len);
+ if (params->ext_capab)
+ ieee80211_sta_set_max_amsdu_subframes(sta, params->ext_capab,
+ params->ext_capab_len);
/*
* cfg80211 validates this (1-2007) and allows setting the AID
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 160ae65a5c64..b98ddfa3003e 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -438,6 +438,15 @@ ieee80211_verify_sta_ht_mcs_support(struct ieee80211_sub_if_data *sdata,
ieee80211_apply_htcap_overrides(sdata, &sta_ht_cap);
/*
+ * Some Xfinity XB8 firmware advertises >1 spatial stream MCS indexes in
+ * their basic HT-MCS set. On cards with lower spatial streams, the check
+ * would fail, and we'd be stuck with no HT when it in fact work fine with
+ * its own supported rate. So check it only in strict mode.
+ */
+ if (!ieee80211_hw_check(&sdata->local->hw, STRICT))
+ return true;
+
+ /*
* P802.11REVme/D7.0 - 6.5.4.2.4
* ...
* If the MLME of an HT STA receives an MLME-JOIN.request primitive
@@ -8155,6 +8164,7 @@ ieee80211_parse_neg_ttlm(struct ieee80211_sub_if_data *sdata,
"No active links for TID %d", tid);
return -EINVAL;
}
+ pos += map_size;
} else {
map = 0;
}
@@ -8173,7 +8183,6 @@ ieee80211_parse_neg_ttlm(struct ieee80211_sub_if_data *sdata,
default:
return -EINVAL;
}
- pos += map_size;
}
return 0;
}
@@ -9140,7 +9149,7 @@ static int ieee80211_prep_connection(struct ieee80211_sub_if_data *sdata,
struct ieee80211_bss *bss = (void *)cbss->priv;
struct sta_info *new_sta = NULL;
struct ieee80211_link_data *link;
- bool have_sta = false;
+ struct sta_info *have_sta = NULL;
bool mlo;
int err;
u16 new_links;
@@ -9159,11 +9168,8 @@ static int ieee80211_prep_connection(struct ieee80211_sub_if_data *sdata,
mlo = false;
}
- if (assoc) {
- rcu_read_lock();
+ if (assoc)
have_sta = sta_info_get(sdata, ap_mld_addr);
- rcu_read_unlock();
- }
if (mlo && !have_sta &&
WARN_ON(sdata->vif.valid_links || sdata->vif.active_links))
@@ -9327,6 +9333,8 @@ static int ieee80211_prep_connection(struct ieee80211_sub_if_data *sdata,
out_release_chan:
ieee80211_link_release_channel(link);
out_err:
+ if (mlo && have_sta)
+ WARN_ON(__sta_info_destroy(have_sta));
ieee80211_vif_set_links(sdata, 0, 0);
return err;
}
@@ -11224,6 +11232,9 @@ static void ieee80211_ml_epcs(struct ieee80211_sub_if_data *sdata,
control = get_unaligned_le16(pos);
link_id = control & IEEE80211_MLE_STA_EPCS_CONTROL_LINK_ID;
+ if (link_id >= IEEE80211_MLD_MAX_NUM_LINKS)
+ continue;
+
link = sdata_dereference(sdata->link[link_id], sdata);
if (!link)
continue;
diff --git a/net/mac80211/parse.c b/net/mac80211/parse.c
index 2b3632c6008a..77894d997113 100644
--- a/net/mac80211/parse.c
+++ b/net/mac80211/parse.c
@@ -34,6 +34,22 @@
#include "led.h"
#include "wep.h"
+static const u8 empty_non_inheritance[] = {
+ WLAN_EID_EXTENSION, 1, WLAN_EID_EXT_NON_INHERITANCE,
+ /*
+ * cfg80211_is_element_inherited() hardcodes elements that
+ * cannot be inherited, so we just need an empty one to be
+ * calling it at all.
+ */
+};
+
+struct ieee80211_elem_defrag {
+ const struct element *elem;
+ /* container start/len */
+ const u8 *start;
+ size_t len;
+};
+
struct ieee80211_elems_parse {
/* must be first for kfree to work */
struct ieee802_11_elems elems;
@@ -41,11 +57,7 @@ struct ieee80211_elems_parse {
/* The basic Multi-Link element in the original elements */
const struct element *ml_basic_elem;
- /* The reconfiguration Multi-Link element in the original elements */
- const struct element *ml_reconf_elem;
-
- /* The EPCS Multi-Link element in the original elements */
- const struct element *ml_epcs_elem;
+ struct ieee80211_elem_defrag ml_reconf, ml_epcs;
bool multi_link_inner;
bool skip_vendor;
@@ -162,10 +174,14 @@ ieee80211_parse_extension_element(u32 *crc,
}
break;
case IEEE80211_ML_CONTROL_TYPE_RECONF:
- elems_parse->ml_reconf_elem = elem;
+ elems_parse->ml_reconf.elem = elem;
+ elems_parse->ml_reconf.start = params->start;
+ elems_parse->ml_reconf.len = params->len;
break;
case IEEE80211_ML_CONTROL_TYPE_PRIO_ACCESS:
- elems_parse->ml_epcs_elem = elem;
+ elems_parse->ml_epcs.elem = elem;
+ elems_parse->ml_epcs.start = params->start;
+ elems_parse->ml_epcs.len = params->len;
break;
default:
break;
@@ -916,7 +932,7 @@ ieee80211_prep_mle_link_parse(struct ieee80211_elems_parse *elems_parse,
{
struct ieee802_11_elems *elems = &elems_parse->elems;
struct ieee80211_mle_per_sta_profile *prof;
- const struct element *tmp;
+ const struct element *tmp, *ret;
ssize_t ml_len;
const u8 *end;
@@ -986,50 +1002,40 @@ ieee80211_prep_mle_link_parse(struct ieee80211_elems_parse *elems_parse,
sub->from_ap = params->from_ap;
sub->link_id = -1;
- return cfg80211_find_ext_elem(WLAN_EID_EXT_NON_INHERITANCE,
- sub->start, sub->len);
-}
-
-static void
-ieee80211_mle_defrag_reconf(struct ieee80211_elems_parse *elems_parse)
-{
- struct ieee802_11_elems *elems = &elems_parse->elems;
- ssize_t ml_len;
+ ret = cfg80211_find_ext_elem(WLAN_EID_EXT_NON_INHERITANCE,
+ sub->start, sub->len);
+ if (ret)
+ return ret;
- ml_len = cfg80211_defragment_element(elems_parse->ml_reconf_elem,
- elems->ie_start,
- elems->total_len,
- elems_parse->scratch_pos,
- elems_parse->scratch +
- elems_parse->scratch_len -
- elems_parse->scratch_pos,
- WLAN_EID_FRAGMENT);
- if (ml_len < 0)
- return;
- elems->ml_reconf = (void *)elems_parse->scratch_pos;
- elems->ml_reconf_len = ml_len;
- elems_parse->scratch_pos += ml_len;
+ /*
+ * Since we know we want and found a profile, apply an empty
+ * non-inheritance if the profile didn't have one, so that any
+ * element that shouldn't be inherited by spec isn't.
+ */
+ return (const void *)empty_non_inheritance;
}
-static void
-ieee80211_mle_defrag_epcs(struct ieee80211_elems_parse *elems_parse)
+static const void *
+ieee80211_mle_defrag(struct ieee80211_elems_parse *elems_parse,
+ struct ieee80211_elem_defrag *defrag,
+ size_t *out_len)
{
- struct ieee802_11_elems *elems = &elems_parse->elems;
+ const void *ret;
ssize_t ml_len;
- ml_len = cfg80211_defragment_element(elems_parse->ml_epcs_elem,
- elems->ie_start,
- elems->total_len,
+ ml_len = cfg80211_defragment_element(defrag->elem,
+ defrag->start, defrag->len,
elems_parse->scratch_pos,
elems_parse->scratch +
elems_parse->scratch_len -
elems_parse->scratch_pos,
WLAN_EID_FRAGMENT);
if (ml_len < 0)
- return;
- elems->ml_epcs = (void *)elems_parse->scratch_pos;
- elems->ml_epcs_len = ml_len;
+ return NULL;
+ ret = elems_parse->scratch_pos;
+ *out_len = ml_len;
elems_parse->scratch_pos += ml_len;
+ return ret;
}
struct ieee802_11_elems *
@@ -1042,6 +1048,7 @@ ieee802_11_parse_elems_full(struct ieee80211_elems_parse_params *params)
size_t scratch_len = 3 * params->len;
bool multi_link_inner = false;
+ BUILD_BUG_ON(sizeof(empty_non_inheritance) != empty_non_inheritance[1] + 2);
BUILD_BUG_ON(offsetof(typeof(*elems_parse), elems) != 0);
/* cannot parse for both a specific link and non-transmitted BSS */
@@ -1089,6 +1096,17 @@ ieee802_11_parse_elems_full(struct ieee80211_elems_parse_params *params)
non_inherit = cfg80211_find_ext_elem(WLAN_EID_EXT_NON_INHERITANCE,
sub.start, nontx_len);
+ /*
+ * If it's a non-transmitted BSS, we shouldn't pick
+ * any elements in the outer parsing that shouldn't
+ * be inherited. If the profile has a non-inheritance
+ * element this automatically happens, but if not then
+ * provide an empty one so that the hard-coded elements
+ * in cfg80211_is_element_inherited() are ignored, but
+ * it must be called.
+ */
+ if (params->bss->transmitted_bss && !non_inherit)
+ non_inherit = (const void *)empty_non_inheritance;
} else {
/* must always parse to get elems_parse->ml_basic_elem */
non_inherit = ieee80211_prep_mle_link_parse(elems_parse, params,
@@ -1109,9 +1127,12 @@ ieee802_11_parse_elems_full(struct ieee80211_elems_parse_params *params)
_ieee802_11_parse_elems_full(&sub, elems_parse, NULL);
}
- ieee80211_mle_defrag_reconf(elems_parse);
-
- ieee80211_mle_defrag_epcs(elems_parse);
+ elems->ml_reconf = ieee80211_mle_defrag(elems_parse,
+ &elems_parse->ml_reconf,
+ &elems->ml_reconf_len);
+ elems->ml_epcs = ieee80211_mle_defrag(elems_parse,
+ &elems_parse->ml_epcs,
+ &elems->ml_epcs_len);
if (elems->tim && !elems->parse_error) {
const struct ieee80211_tim_ie *tim_ie = elems->tim;
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 3e5d1c47a5b0..3fb40449c6c5 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -4971,7 +4971,7 @@ static bool ieee80211_invoke_fast_rx(struct ieee80211_rx_data *rx,
struct sk_buff *skb = rx->skb;
struct ieee80211_hdr *hdr = (void *)skb->data;
struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
- static ieee80211_rx_result res;
+ ieee80211_rx_result res;
int orig_len = skb->len;
int hdrlen = ieee80211_hdrlen(hdr->frame_control);
int snap_offs = hdrlen;
@@ -4984,6 +4984,7 @@ static bool ieee80211_invoke_fast_rx(struct ieee80211_rx_data *rx,
u8 sa[ETH_ALEN];
} addrs __aligned(2);
struct ieee80211_sta_rx_stats *stats;
+ u32 encoded_rate;
/* for parallel-rx, we need to have DUP_VALIDATED, otherwise we write
* to a common data structure; drivers can implement that per queue
@@ -5091,11 +5092,14 @@ static bool ieee80211_invoke_fast_rx(struct ieee80211_rx_data *rx,
/* push the addresses in front */
memcpy(skb_push(skb, sizeof(addrs)), &addrs, sizeof(addrs));
+ /* capture before mesh forward may memset or free skb->cb */
+ encoded_rate = sta_stats_encode_rate(status);
+
res = ieee80211_rx_mesh_data(rx->sdata, rx->sta, rx->skb);
switch (res) {
case RX_QUEUED:
stats->last_rx = jiffies;
- stats->last_rate = sta_stats_encode_rate(status);
+ stats->last_rate = encoded_rate;
return true;
case RX_CONTINUE:
break;
@@ -5380,7 +5384,9 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw,
if (!link_sta)
goto out;
- ieee80211_rx_data_set_link(&rx, link_sta->link_id);
+ if (!ieee80211_rx_data_set_link(&rx,
+ link_sta->link_id))
+ goto out;
}
if (ieee80211_prepare_and_rx_handle(&rx, skb, true))
diff --git a/net/mac80211/tests/chan-mode.c b/net/mac80211/tests/chan-mode.c
index adc069065e73..fa370831d617 100644
--- a/net/mac80211/tests/chan-mode.c
+++ b/net/mac80211/tests/chan-mode.c
@@ -65,6 +65,7 @@ static const struct determine_chan_mode_case {
.ht_capa_mask = {
.mcs.rx_mask[0] = 0xf7,
},
+ .strict = true,
}, {
.desc = "Masking out a RX rate in VHT capabilities",
.conn_mode = IEEE80211_CONN_MODE_EHT,
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index b487d2330f25..ea7f63e1fc17 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -2181,7 +2181,9 @@ bool ieee80211_parse_tx_radiotap(struct sk_buff *skb,
case IEEE80211_RADIOTAP_ANTENNA:
/* this can appear multiple times, keep a bitmap */
- info->control.antennas |= BIT(*iterator.this_arg);
+ /* control.antennas is only a 2-bit bitmap */
+ if (*iterator.this_arg < 2)
+ info->control.antennas |= BIT(*iterator.this_arg);
break;
case IEEE80211_RADIOTAP_DATA_RETRIES:
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index b093bc203c81..2529b01e2cd5 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -3700,11 +3700,11 @@ void ieee80211_dfs_radar_detected_work(struct wiphy *wiphy,
struct ieee80211_local *local =
container_of(work, struct ieee80211_local, radar_detected_work);
struct cfg80211_chan_def chandef;
- struct ieee80211_chanctx *ctx;
+ struct ieee80211_chanctx *ctx, *tmp;
lockdep_assert_wiphy(local->hw.wiphy);
- list_for_each_entry(ctx, &local->chanctx_list, list) {
+ list_for_each_entry_safe(ctx, tmp, &local->chanctx_list, list) {
if (ctx->replace_state == IEEE80211_CHANCTX_REPLACES_OTHER)
continue;
diff --git a/net/mctp/test/route-test.c b/net/mctp/test/route-test.c
index e1033643fab0..e4b230ef6099 100644
--- a/net/mctp/test/route-test.c
+++ b/net/mctp/test/route-test.c
@@ -920,9 +920,9 @@ static void mctp_test_route_input_cloned_frag(struct kunit *test)
static void mctp_test_route_input_null_eid(struct kunit *test)
{
struct mctp_hdr hdr = RX_HDR(1, 10, 0, FL_S | FL_E | FL_TO);
+ struct sockaddr_mctp addr = { 0 };
struct sk_buff *skb_pkt, *skb_sk;
struct mctp_test_dev *dev;
- struct sockaddr_mctp addr;
struct socket *sock;
u8 type = 0;
int rc;
diff --git a/net/mctp/test/utils.c b/net/mctp/test/utils.c
index c3987d5ade7a..6eef8d485c25 100644
--- a/net/mctp/test/utils.c
+++ b/net/mctp/test/utils.c
@@ -116,7 +116,7 @@ void mctp_test_destroy_dev(struct mctp_test_dev *dev)
static int mctp_test_dst_output(struct mctp_dst *dst, struct sk_buff *skb)
{
skb->dev = dst->dev->dev;
- dev_queue_xmit(skb);
+ dev_direct_xmit(skb, 0);
return 0;
}
diff --git a/net/mptcp/bpf.c b/net/mptcp/bpf.c
index 8a16672b94e2..4cc16cbeb328 100644
--- a/net/mptcp/bpf.c
+++ b/net/mptcp/bpf.c
@@ -14,7 +14,7 @@
struct mptcp_sock *bpf_mptcp_sock_from_subflow(struct sock *sk)
{
- if (sk && sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP && sk_is_mptcp(sk))
+ if (sk && sk_fullsock(sk) && sk_is_tcp(sk) && sk_is_mptcp(sk))
return mptcp_sk(mptcp_subflow_ctx(sk)->conn);
return NULL;
diff --git a/net/mptcp/fastopen.c b/net/mptcp/fastopen.c
index 82ec15bcfd7f..082c46c0f50e 100644
--- a/net/mptcp/fastopen.c
+++ b/net/mptcp/fastopen.c
@@ -12,6 +12,7 @@ void mptcp_fastopen_subflow_synack_set_params(struct mptcp_subflow_context *subf
struct sock *sk, *ssk;
struct sk_buff *skb;
struct tcp_sock *tp;
+ bool has_rxtstamp;
/* on early fallback the subflow context is deleted by
* subflow_syn_recv_sock()
@@ -40,12 +41,13 @@ void mptcp_fastopen_subflow_synack_set_params(struct mptcp_subflow_context *subf
*/
tp->copied_seq += skb->len;
subflow->ssn_offset += skb->len;
+ has_rxtstamp = TCP_SKB_CB(skb)->has_rxtstamp;
/* Only the sequence delta is relevant */
MPTCP_SKB_CB(skb)->map_seq = -skb->len;
MPTCP_SKB_CB(skb)->end_seq = 0;
MPTCP_SKB_CB(skb)->offset = 0;
- MPTCP_SKB_CB(skb)->has_rxtstamp = TCP_SKB_CB(skb)->has_rxtstamp;
+ MPTCP_SKB_CB(skb)->has_rxtstamp = has_rxtstamp;
MPTCP_SKB_CB(skb)->cant_coalesce = 1;
mptcp_data_lock(sk);
diff --git a/net/mptcp/options.c b/net/mptcp/options.c
index 8a1c5698983c..b3ea7854818f 100644
--- a/net/mptcp/options.c
+++ b/net/mptcp/options.c
@@ -566,12 +566,17 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb,
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
struct mptcp_sock *msk = mptcp_sk(subflow->conn);
+ struct tcp_sock *tp = tcp_sk(sk);
unsigned int dss_size = 0;
struct mptcp_ext *mpext;
unsigned int ack_size;
bool ret = false;
- u64 ack_seq;
+ /* Zero `use_ack` and `use_map` flags with one shot. */
+ BUILD_BUG_ON(sizeof_field(struct mptcp_ext, flags) != sizeof(u16));
+ BUILD_BUG_ON(!IS_ALIGNED(offsetof(struct mptcp_ext, flags),
+ sizeof(u16)));
+ *(u16 *)&opts->ext_copy.flags = 0;
opts->csum_reqd = READ_ONCE(msk->csum_enabled);
mpext = skb ? mptcp_get_ext(skb) : NULL;
@@ -595,20 +600,16 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb,
/* passive sockets msk will set the 'can_ack' after accept(), even
* if the first subflow may have the already the remote key handy
*/
- opts->ext_copy.use_ack = 0;
if (!READ_ONCE(msk->can_ack)) {
*size = ALIGN(dss_size, 4);
return ret;
}
- ack_seq = READ_ONCE(msk->ack_seq);
if (READ_ONCE(msk->use_64bit_ack)) {
ack_size = TCPOLEN_MPTCP_DSS_ACK64;
- opts->ext_copy.data_ack = ack_seq;
opts->ext_copy.ack64 = 1;
} else {
ack_size = TCPOLEN_MPTCP_DSS_ACK32;
- opts->ext_copy.data_ack32 = (uint32_t)ack_seq;
opts->ext_copy.ack64 = 0;
}
opts->ext_copy.use_ack = 1;
@@ -618,6 +619,12 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb,
if (dss_size == 0)
ack_size += TCPOLEN_MPTCP_DSS_BASE;
+ /* The caller is __tcp_transmit_skb(), and will compute the new rcv
+ * wnd soon: ensure that the window can shrink.
+ */
+ if (skb)
+ tp->rcv_wnd = tp->rcv_nxt - tp->rcv_wup;
+
dss_size += ack_size;
*size = ALIGN(dss_size, 4);
@@ -658,7 +665,6 @@ static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff *
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
struct mptcp_sock *msk = mptcp_sk(subflow->conn);
- bool drop_other_suboptions = false;
unsigned int opt_size = *size;
struct mptcp_addr_info addr;
bool echo;
@@ -669,36 +675,20 @@ static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff *
*/
if (!mptcp_pm_should_add_signal(msk) ||
(opts->suboptions & (OPTION_MPTCP_MPJ_ACK | OPTION_MPTCP_MPC_ACK)) ||
- !mptcp_pm_add_addr_signal(msk, skb, opt_size, remaining, &addr,
- &echo, &drop_other_suboptions))
+ !skb || !skb_is_tcp_pure_ack(skb) ||
+ !mptcp_pm_add_addr_signal(msk, opt_size, remaining, &addr, &echo))
return false;
- /*
- * Later on, mptcp_write_options() will enforce mutually exclusion with
- * DSS, bail out if such option is set and we can't drop it.
- */
- if (drop_other_suboptions)
- remaining += opt_size;
- else if (opts->suboptions & OPTION_MPTCP_DSS)
- return false;
+ remaining += opt_size;
len = mptcp_add_addr_len(addr.family, echo, !!addr.port);
if (remaining < len)
return false;
*size = len;
- if (drop_other_suboptions) {
- pr_debug("drop other suboptions\n");
- opts->suboptions = 0;
-
- /* note that e.g. DSS could have written into the memory
- * aliased by ahmac, we must reset the field here
- * to avoid appending the hmac even for ADD_ADDR echo
- * options
- */
- opts->ahmac = 0;
- *size -= opt_size;
- }
+ pr_debug("drop other suboptions\n");
+ opts->suboptions = 0;
+ *size -= opt_size;
opts->addr = addr;
opts->suboptions |= OPTION_MPTCP_ADD_ADDR;
if (!echo) {
@@ -708,6 +698,7 @@ static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff *
&opts->addr);
} else {
MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_ECHOADDTX);
+ opts->ahmac = 0;
}
pr_debug("addr_id=%d, ahmac=%llu, echo=%d, port=%d\n",
opts->addr.id, opts->ahmac, echo, ntohs(opts->addr.port));
@@ -1297,19 +1288,14 @@ bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb)
return true;
}
-static void mptcp_set_rwin(struct tcp_sock *tp, struct tcphdr *th)
+static u64 mptcp_set_rwin(struct mptcp_sock *msk, struct tcp_sock *tp,
+ struct tcphdr *th, u64 ack_seq)
{
const struct sock *ssk = (const struct sock *)tp;
- struct mptcp_subflow_context *subflow;
- u64 ack_seq, rcv_wnd_old, rcv_wnd_new;
- struct mptcp_sock *msk;
+ u64 rcv_wnd_old, rcv_wnd_new;
u32 new_win;
u64 win;
- subflow = mptcp_subflow_ctx(ssk);
- msk = mptcp_sk(subflow->conn);
-
- ack_seq = READ_ONCE(msk->ack_seq);
rcv_wnd_new = ack_seq + tp->rcv_wnd;
rcv_wnd_old = atomic64_read(&msk->rcv_wnd_sent);
@@ -1362,7 +1348,7 @@ raise_win:
update_wspace:
WRITE_ONCE(msk->old_wspace, tp->rcv_wnd);
- subflow->rcv_wnd_sent = rcv_wnd_new;
+ return rcv_wnd_new;
}
static void mptcp_track_rwin(struct tcp_sock *tp)
@@ -1474,13 +1460,25 @@ void mptcp_write_options(struct tcphdr *th, __be32 *ptr, struct tcp_sock *tp,
*ptr++ = mptcp_option(MPTCPOPT_DSS, len, 0, flags);
if (mpext->use_ack) {
+ struct mptcp_sock *msk;
+ u64 ack_seq;
+
+ /* DSS option is set only by mptcp_established_options,
+ * the caller is __tcp_transmit_skb() and ssk is always
+ * not NULL.
+ */
+ subflow = mptcp_subflow_ctx(ssk);
+ msk = mptcp_sk(subflow->conn);
+ ack_seq = READ_ONCE(msk->ack_seq);
if (mpext->ack64) {
- put_unaligned_be64(mpext->data_ack, ptr);
+ put_unaligned_be64(ack_seq, ptr);
ptr += 2;
} else {
- put_unaligned_be32(mpext->data_ack32, ptr);
+ put_unaligned_be32(ack_seq, ptr);
ptr += 1;
}
+ subflow->rcv_wnd_sent = mptcp_set_rwin(msk, tp, th,
+ ack_seq);
}
if (mpext->use_map) {
@@ -1708,9 +1706,6 @@ mp_capable_done:
i += 4;
}
}
-
- if (tp)
- mptcp_set_rwin(tp, th);
}
__be32 mptcp_get_reset_option(const struct sk_buff *skb)
diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c
index 57a456690406..470501470fe5 100644
--- a/net/mptcp/pm.c
+++ b/net/mptcp/pm.c
@@ -16,6 +16,7 @@ struct mptcp_pm_add_entry {
struct list_head list;
struct mptcp_addr_info addr;
u8 retrans_times;
+ bool timer_done;
struct timer_list add_timer;
struct mptcp_sock *sock;
struct rcu_head rcu;
@@ -283,6 +284,9 @@ int mptcp_pm_mp_prio_send_ack(struct mptcp_sock *msk,
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
struct mptcp_addr_info local, remote;
+ if (!__mptcp_subflow_active(subflow))
+ continue;
+
mptcp_local_address((struct sock_common *)ssk, &local);
if (!mptcp_addresses_equal(&local, addr, addr->port))
continue;
@@ -305,18 +309,31 @@ static unsigned int mptcp_adjust_add_addr_timeout(struct mptcp_sock *msk)
const struct net *net = sock_net((struct sock *)msk);
unsigned int rto = mptcp_get_add_addr_timeout(net);
struct mptcp_subflow_context *subflow;
- unsigned int max = 0;
+ unsigned int max = 0, max_stale = 0;
+
+ if (!rto)
+ return 0;
mptcp_for_each_subflow(msk, subflow) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
struct inet_connection_sock *icsk = inet_csk(ssk);
- if (icsk->icsk_rto > max)
+ if (!__mptcp_subflow_active(subflow))
+ continue;
+
+ if (unlikely(subflow->stale)) {
+ if (icsk->icsk_rto > max_stale)
+ max_stale = icsk->icsk_rto;
+ } else if (icsk->icsk_rto > max) {
max = icsk->icsk_rto;
+ }
}
- if (max && max < rto)
- rto = max;
+ if (max)
+ return min(max, rto);
+
+ if (max_stale)
+ return min(max_stale, rto);
return rto;
}
@@ -327,31 +344,33 @@ static void mptcp_pm_add_timer(struct timer_list *timer)
add_timer);
struct mptcp_sock *msk = entry->sock;
struct sock *sk = (struct sock *)msk;
- unsigned int timeout;
+ unsigned int timeout = 0;
pr_debug("msk=%p\n", msk);
- if (!msk)
- return;
-
- if (inet_sk_state_load(sk) == TCP_CLOSE)
- return;
-
- if (!entry->addr.id)
- return;
+ bh_lock_sock(sk);
+ if (unlikely(inet_sk_state_load(sk) == TCP_CLOSE))
+ goto out;
- if (mptcp_pm_should_add_signal_addr(msk)) {
- sk_reset_timer(sk, timer, jiffies + TCP_RTO_MAX / 8);
+ if (sock_owned_by_user(sk)) {
+ /* Try again later. */
+ timeout = HZ / 20;
goto out;
}
timeout = mptcp_adjust_add_addr_timeout(msk);
- if (!timeout)
+ if (!timeout || mptcp_pm_should_add_signal_addr(msk))
goto out;
spin_lock_bh(&msk->pm.lock);
- if (!mptcp_pm_should_add_signal_addr(msk)) {
+ /* The cancel path (mptcp_pm_del_add_timer()) can race with this
+ * callback. Once cancel updates retrans_times to MAX, suppress further
+ * retransmissions here. If this callback acquires pm.lock first, one
+ * final transmit attempt is still possible.
+ */
+ if (entry->retrans_times < ADD_ADDR_RETRANS_MAX &&
+ !mptcp_pm_should_add_signal_addr(msk)) {
pr_debug("retransmit ADD_ADDR id=%d\n", entry->addr.id);
mptcp_pm_announce_addr(msk, &entry->addr, false);
mptcp_pm_add_addr_send_ack(msk);
@@ -359,8 +378,9 @@ static void mptcp_pm_add_timer(struct timer_list *timer)
}
if (entry->retrans_times < ADD_ADDR_RETRANS_MAX)
- sk_reset_timer(sk, timer,
- jiffies + (timeout << entry->retrans_times));
+ timeout <<= entry->retrans_times;
+ else
+ timeout = 0;
spin_unlock_bh(&msk->pm.lock);
@@ -368,7 +388,13 @@ static void mptcp_pm_add_timer(struct timer_list *timer)
mptcp_pm_subflow_established(msk);
out:
- __sock_put(sk);
+ if (timeout)
+ sk_reset_timer(sk, timer, jiffies + timeout);
+ else
+ /* if sock_put calls sk_free: avoid waiting for this timer */
+ entry->timer_done = true;
+ bh_unlock_sock(sk);
+ sock_put(sk);
}
struct mptcp_pm_add_entry *
@@ -394,8 +420,12 @@ mptcp_pm_del_add_timer(struct mptcp_sock *msk,
/* Note: entry might have been removed by another thread.
* We hold rcu_read_lock() to ensure it is not freed under us.
*/
- if (stop_timer)
- sk_stop_timer_sync(sk, &entry->add_timer);
+ if (stop_timer) {
+ if (check_id)
+ sk_stop_timer(sk, &entry->add_timer);
+ else
+ sk_stop_timer_sync(sk, &entry->add_timer);
+ }
rcu_read_unlock();
return entry;
@@ -431,6 +461,7 @@ bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk,
timer_setup(&add_entry->add_timer, mptcp_pm_add_timer, 0);
reset_timer:
+ add_entry->timer_done = false;
timeout = mptcp_adjust_add_addr_timeout(msk);
if (timeout)
sk_reset_timer(sk, &add_entry->add_timer, jiffies + timeout);
@@ -451,7 +482,8 @@ static void mptcp_pm_free_anno_list(struct mptcp_sock *msk)
spin_unlock_bh(&msk->pm.lock);
list_for_each_entry_safe(entry, tmp, &free_list, list) {
- sk_stop_timer_sync(sk, &entry->add_timer);
+ if (!entry->timer_done)
+ sk_stop_timer_sync(sk, &entry->add_timer);
kfree_rcu(entry, rcu);
}
}
@@ -855,11 +887,11 @@ void mptcp_pm_mp_fail_received(struct sock *sk, u64 fail_seq)
}
}
-bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, const struct sk_buff *skb,
- unsigned int opt_size, unsigned int remaining,
- struct mptcp_addr_info *addr, bool *echo,
- bool *drop_other_suboptions)
+bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, unsigned int opt_size,
+ unsigned int remaining,
+ struct mptcp_addr_info *addr, bool *echo)
{
+ bool skip_add_addr = false;
int ret = false;
u8 add_addr;
u8 family;
@@ -875,30 +907,49 @@ bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, const struct sk_buff *skb,
* plain dup-ack from TCP perspective. The other MPTCP-relevant info,
* if any, will be carried by the 'original' TCP ack
*/
- if (skb && skb_is_tcp_pure_ack(skb)) {
- remaining += opt_size;
- *drop_other_suboptions = true;
- }
+ remaining += opt_size;
*echo = mptcp_pm_should_add_signal_echo(msk);
- port = !!(*echo ? msk->pm.remote.port : msk->pm.local.port);
-
- family = *echo ? msk->pm.remote.family : msk->pm.local.family;
- if (remaining < mptcp_add_addr_len(family, *echo, port))
- goto out_unlock;
-
if (*echo) {
*addr = msk->pm.remote;
add_addr = msk->pm.addr_signal & ~BIT(MPTCP_ADD_ADDR_ECHO);
+ port = !!msk->pm.remote.port;
+ family = msk->pm.remote.family;
} else {
*addr = msk->pm.local;
add_addr = msk->pm.addr_signal & ~BIT(MPTCP_ADD_ADDR_SIGNAL);
+ port = !!msk->pm.local.port;
+ family = msk->pm.local.family;
}
- WRITE_ONCE(msk->pm.addr_signal, add_addr);
+
+ if (remaining < mptcp_add_addr_len(family, *echo, port)) {
+ struct net *net = sock_net((struct sock *)msk);
+
+ if (*echo) {
+ MPTCP_INC_STATS(net, MPTCP_MIB_ECHOADDTXDROP);
+ } else {
+ skip_add_addr = true;
+ MPTCP_INC_STATS(net, MPTCP_MIB_ADDADDRTXDROP);
+ }
+ goto drop_signal_mark;
+ }
+
ret = true;
+drop_signal_mark:
+ WRITE_ONCE(msk->pm.addr_signal, add_addr);
+
out_unlock:
spin_unlock_bh(&msk->pm.lock);
+
+ /* On pure-ACK option-space exhaustion, stop retrying this ADD_ADDR:
+ * clear the signal bit, cancel the matching retransmission timer, and
+ * let the PM state machine progress.
+ */
+ if (skip_add_addr) {
+ mptcp_pm_del_add_timer(msk, addr, true);
+ mptcp_pm_subflow_established(msk);
+ }
return ret;
}
diff --git a/net/mptcp/pm_kernel.c b/net/mptcp/pm_kernel.c
index c9f1e5af3cd3..fc818b63752e 100644
--- a/net/mptcp/pm_kernel.c
+++ b/net/mptcp/pm_kernel.c
@@ -347,6 +347,8 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk)
/* check first for announce */
if (msk->pm.add_addr_signaled < endp_signal_max) {
+ u8 endp_id;
+
/* due to racing events on both ends we can reach here while
* previous add address is still running: if we invoke now
* mptcp_pm_announce_addr(), that will fail and the
@@ -360,19 +362,20 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk)
if (!select_signal_address(pernet, msk, &local))
goto subflow;
+ /* Special case for ID0: set the correct ID */
+ endp_id = local.addr.id;
+ if (endp_id == msk->mpc_endpoint_id)
+ local.addr.id = 0;
+
/* If the alloc fails, we are on memory pressure, not worth
* continuing, and trying to create subflows.
*/
if (!mptcp_pm_alloc_anno_list(msk, &local.addr))
return;
- __clear_bit(local.addr.id, msk->pm.id_avail_bitmap);
+ __clear_bit(endp_id, msk->pm.id_avail_bitmap);
msk->pm.add_addr_signaled++;
- /* Special case for ID0: set the correct ID */
- if (local.addr.id == msk->mpc_endpoint_id)
- local.addr.id = 0;
-
mptcp_pm_announce_addr(msk, &local.addr, false);
mptcp_pm_addr_send_ack(msk);
diff --git a/net/mptcp/pm_userspace.c b/net/mptcp/pm_userspace.c
index 8cbc1920afb4..0d3a95e676f1 100644
--- a/net/mptcp/pm_userspace.c
+++ b/net/mptcp/pm_userspace.c
@@ -408,19 +408,21 @@ int mptcp_pm_nl_subflow_create_doit(struct sk_buff *skb, struct genl_info *info)
local.flags = entry.flags;
local.ifindex = entry.ifindex;
+ spin_lock_bh(&msk->pm.lock);
+ msk->pm.extra_subflows++;
+ spin_unlock_bh(&msk->pm.lock);
+
lock_sock(sk);
err = __mptcp_subflow_connect(sk, &local, &addr_r);
release_sock(sk);
- if (err)
+ if (err) {
GENL_SET_ERR_MSG_FMT(info, "connect error: %d", err);
- spin_lock_bh(&msk->pm.lock);
- if (err)
+ spin_lock_bh(&msk->pm.lock);
mptcp_userspace_pm_delete_local_addr(msk, &entry);
- else
- msk->pm.extra_subflows++;
- spin_unlock_bh(&msk->pm.lock);
+ spin_unlock_bh(&msk->pm.lock);
+ }
create_err:
sock_put(sk);
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 4546a8b09884..cb9515f505aa 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -397,12 +397,26 @@ static bool __mptcp_move_skb(struct sock *sk, struct sk_buff *skb)
return false;
}
- /* old data, keep it simple and drop the whole pkt, sender
- * will retransmit as needed, if needed.
+ /* Completely old data? */
+ if (!after64(MPTCP_SKB_CB(skb)->end_seq, msk->ack_seq)) {
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA);
+ mptcp_drop(sk, skb);
+ return false;
+ }
+
+ /* Partial packet: map_seq < ack_seq < end_seq.
+ * Skip the already-acked bytes and enqueue the new data.
*/
- MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA);
- mptcp_drop(sk, skb);
- return false;
+ copy_len = MPTCP_SKB_CB(skb)->end_seq - msk->ack_seq;
+ MPTCP_SKB_CB(skb)->offset += msk->ack_seq - MPTCP_SKB_CB(skb)->map_seq;
+ MPTCP_SKB_CB(skb)->map_seq += msk->ack_seq -
+ MPTCP_SKB_CB(skb)->map_seq;
+ msk->bytes_received += copy_len;
+ WRITE_ONCE(msk->ack_seq, msk->ack_seq + copy_len);
+
+ skb_set_owner_r(skb, sk);
+ __skb_queue_tail(&sk->sk_receive_queue, skb);
+ return true;
}
static void mptcp_stop_rtx_timer(struct sock *sk)
@@ -2262,6 +2276,10 @@ static bool mptcp_move_skbs(struct sock *sk)
mptcp_backlog_spooled(sk, moved, &skbs);
}
mptcp_data_unlock(sk);
+
+ if (enqueued && mptcp_epollin_ready(sk))
+ sk->sk_data_ready(sk);
+
return enqueued;
}
@@ -2851,6 +2869,10 @@ static void __mptcp_retrans(struct sock *sk)
msk->bytes_retrans += len;
dfrag->already_sent = max(dfrag->already_sent, len);
+ /* With csum enabled retransmission can send new data. */
+ if (after64(dfrag->already_sent + dfrag->data_seq, msk->snd_nxt))
+ WRITE_ONCE(msk->snd_nxt, dfrag->already_sent + dfrag->data_seq);
+
reset_timer:
mptcp_check_and_set_pending(sk);
@@ -3473,6 +3495,7 @@ static int mptcp_disconnect(struct sock *sk, int flags)
/* for fallback's sake */
WRITE_ONCE(msk->ack_seq, 0);
+ atomic64_set(&msk->rcv_wnd_sent, 0);
WRITE_ONCE(sk->sk_shutdown, 0);
sk_error_report(sk);
@@ -4405,6 +4428,8 @@ static int __mptcp_read_sock(struct sock *sk, read_descriptor_t *desc,
}
mptcp_eat_recv_skb(sk, skb);
+ if (!desc->count)
+ break;
}
if (noack)
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index e4f5aba24da7..b93b878478d2 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -1229,10 +1229,9 @@ static inline int mptcp_rm_addr_len(const struct mptcp_rm_list *rm_list)
return TCPOLEN_MPTCP_RM_ADDR_BASE + roundup(rm_list->nr - 1, 4) + 1;
}
-bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, const struct sk_buff *skb,
- unsigned int opt_size, unsigned int remaining,
- struct mptcp_addr_info *addr, bool *echo,
- bool *drop_other_suboptions);
+bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, unsigned int opt_size,
+ unsigned int remaining,
+ struct mptcp_addr_info *addr, bool *echo);
bool mptcp_pm_rm_addr_signal(struct mptcp_sock *msk, unsigned int remaining,
struct mptcp_rm_list *rm_list);
int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc);
diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c
index 0efe40be2fde..fcf6feb2a9eb 100644
--- a/net/mptcp/sockopt.c
+++ b/net/mptcp/sockopt.c
@@ -67,6 +67,12 @@ static int mptcp_get_int_option(struct mptcp_sock *msk, sockptr_t optval,
return 0;
}
+static void __mptcp_subflow_set_rcvbuf(struct sock *ssk, int val)
+{
+ WRITE_ONCE(ssk->sk_rcvbuf, val);
+ tcp_set_rcvbuf(ssk, val);
+}
+
static void mptcp_sol_socket_sync_intval(struct mptcp_sock *msk, int optname, int val)
{
struct mptcp_subflow_context *subflow;
@@ -100,7 +106,7 @@ static void mptcp_sol_socket_sync_intval(struct mptcp_sock *msk, int optname, in
case SO_RCVBUF:
case SO_RCVBUFFORCE:
ssk->sk_userlocks |= SOCK_RCVBUF_LOCK;
- WRITE_ONCE(ssk->sk_rcvbuf, sk->sk_rcvbuf);
+ __mptcp_subflow_set_rcvbuf(ssk, sk->sk_rcvbuf);
break;
case SO_MARK:
if (READ_ONCE(ssk->sk_mark) != sk->sk_mark) {
@@ -235,15 +241,19 @@ static int mptcp_setsockopt_sol_socket_timestamping(struct mptcp_sock *msk,
mptcp_for_each_subflow(msk, subflow) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+ int err;
lock_sock(ssk);
- sock_set_timestamping(ssk, optname, timestamping);
+ err = sock_set_timestamping(ssk, optname, timestamping);
release_sock(ssk);
+
+ if (err < 0 && ret == 0)
+ ret = err;
}
release_sock(sk);
- return 0;
+ return ret;
}
static int mptcp_setsockopt_sol_socket_linger(struct mptcp_sock *msk, sockptr_t optval,
@@ -807,11 +817,16 @@ static int mptcp_setsockopt_all_sf(struct mptcp_sock *msk, int level,
mptcp_for_each_subflow(msk, subflow) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+ int err;
- ret = tcp_setsockopt(ssk, level, optname, optval, optlen);
- if (ret)
- break;
+ err = tcp_setsockopt(ssk, level, optname, optval, optlen);
+ if (err < 0 && ret == 0)
+ ret = err;
}
+
+ if (!ret)
+ sockopt_seq_inc(msk);
+
return ret;
}
@@ -1556,7 +1571,7 @@ static void sync_socket_options(struct mptcp_sock *msk, struct sock *ssk)
mptcp_subflow_ctx(ssk)->cached_sndbuf = sk->sk_sndbuf;
}
if (sk->sk_userlocks & SOCK_RCVBUF_LOCK)
- WRITE_ONCE(ssk->sk_rcvbuf, sk->sk_rcvbuf);
+ __mptcp_subflow_set_rcvbuf(ssk, sk->sk_rcvbuf);
}
if (sock_flag(sk, SOCK_LINGER)) {
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index e2cb9d23e4a0..d562e149606f 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -581,7 +581,7 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
subflow->backup);
if (!subflow_thmac_valid(subflow)) {
- MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINACKMAC);
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKMAC);
subflow->reset_reason = MPTCP_RST_EMPTCP;
goto do_reset;
}
@@ -908,7 +908,7 @@ create_child:
if (!subflow_hmac_valid(subflow_req, &mp_opt)) {
SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKMAC);
- subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT);
+ subflow_add_reset_reason(skb, MPTCP_RST_EMPTCP);
goto dispose_child;
}
diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
index c5a26236a0bb..3706b4a85a0f 100644
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -1613,6 +1613,7 @@ dump_last:
((dump_type == DUMP_ALL) ==
!!(set->type->features & IPSET_DUMP_LAST))) {
write_unlock_bh(&ip_set_ref_lock);
+ set = NULL;
continue;
}
pr_debug("List set: %s\n", set->name);
@@ -1648,13 +1649,13 @@ dump_last:
if (cb->args[IPSET_CB_PROTO] > IPSET_PROTOCOL_MIN &&
nla_put_net16(skb, IPSET_ATTR_INDEX, htons(index)))
goto nla_put_failure;
+ if (set->variant->uref)
+ set->variant->uref(set, cb, true);
ret = set->variant->head(set, skb);
if (ret < 0)
goto release_refcount;
if (dump_flags & IPSET_FLAG_LIST_HEADER)
goto next_set;
- if (set->variant->uref)
- set->variant->uref(set, cb, true);
fallthrough;
default:
ret = set->variant->list(set, skb, cb);
diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h
index b79e5dd2af03..04e4627ddfc1 100644
--- a/net/netfilter/ipset/ip_set_hash_gen.h
+++ b/net/netfilter/ipset/ip_set_hash_gen.h
@@ -386,8 +386,9 @@ static void
mtype_ext_cleanup(struct ip_set *set, struct hbucket *n)
{
int i;
+ u8 pos = smp_load_acquire(&n->pos);
- for (i = 0; i < n->pos; i++)
+ for (i = 0; i < pos; i++)
if (test_bit(i, n->used))
ip_set_ext_destroy(set, ahash_data(n, i, set->dsize));
}
@@ -490,7 +491,7 @@ mtype_gc_do(struct ip_set *set, struct htype *h, struct htable *t, u32 r)
#ifdef IP_SET_HASH_WITH_NETS
u8 k;
#endif
- u8 htable_bits = t->htable_bits;
+ u8 pos, htable_bits = t->htable_bits;
spin_lock_bh(&t->hregion[r].lock);
for (i = ahash_bucket_start(r, htable_bits);
@@ -498,7 +499,8 @@ mtype_gc_do(struct ip_set *set, struct htype *h, struct htable *t, u32 r)
n = __ipset_dereference(hbucket(t, i));
if (!n)
continue;
- for (j = 0, d = 0; j < n->pos; j++) {
+ pos = smp_load_acquire(&n->pos);
+ for (j = 0, d = 0; j < pos; j++) {
if (!test_bit(j, n->used)) {
d++;
continue;
@@ -534,7 +536,7 @@ mtype_gc_do(struct ip_set *set, struct htype *h, struct htable *t, u32 r)
/* Still try to delete expired elements. */
continue;
tmp->size = n->size - AHASH_INIT_SIZE;
- for (j = 0, d = 0; j < n->pos; j++) {
+ for (j = 0, d = 0; j < pos; j++) {
if (!test_bit(j, n->used))
continue;
data = ahash_data(n, j, dsize);
@@ -623,7 +625,7 @@ mtype_resize(struct ip_set *set, bool retried)
{
struct htype *h = set->data;
struct htable *t, *orig;
- u8 htable_bits;
+ u8 pos, htable_bits;
size_t hsize, dsize = set->dsize;
#ifdef IP_SET_HASH_WITH_NETS
u8 flags;
@@ -685,7 +687,8 @@ retry:
n = __ipset_dereference(hbucket(orig, i));
if (!n)
continue;
- for (j = 0; j < n->pos; j++) {
+ pos = smp_load_acquire(&n->pos);
+ for (j = 0; j < pos; j++) {
if (!test_bit(j, n->used))
continue;
data = ahash_data(n, j, dsize);
@@ -809,9 +812,10 @@ mtype_ext_size(struct ip_set *set, u32 *elements, size_t *ext_size)
{
struct htype *h = set->data;
const struct htable *t;
- u32 i, j, r;
struct hbucket *n;
struct mtype_elem *data;
+ u32 i, j, r;
+ u8 pos;
t = rcu_dereference_bh(h->table);
for (r = 0; r < ahash_numof_locks(t->htable_bits); r++) {
@@ -820,7 +824,8 @@ mtype_ext_size(struct ip_set *set, u32 *elements, size_t *ext_size)
n = rcu_dereference_bh(hbucket(t, i));
if (!n)
continue;
- for (j = 0; j < n->pos; j++) {
+ pos = smp_load_acquire(&n->pos);
+ for (j = 0; j < pos; j++) {
if (!test_bit(j, n->used))
continue;
data = ahash_data(n, j, set->dsize);
@@ -848,6 +853,7 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
bool flag_exist = flags & IPSET_FLAG_EXIST;
bool deleted = false, forceadd = false, reuse = false;
u32 r, key, multi = 0, elements, maxelem;
+ u8 npos = 0;
rcu_read_lock_bh();
t = rcu_dereference_bh(h->table);
@@ -889,7 +895,8 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
ext_size(AHASH_INIT_SIZE, set->dsize);
goto copy_elem;
}
- for (i = 0; i < n->pos; i++) {
+ npos = smp_load_acquire(&n->pos);
+ for (i = 0; i < npos; i++) {
if (!test_bit(i, n->used)) {
/* Reuse first deleted entry */
if (j == -1) {
@@ -933,7 +940,7 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
if (elements >= maxelem)
goto set_full;
/* Create a new slot */
- if (n->pos >= n->size) {
+ if (npos >= n->size) {
#ifdef IP_SET_HASH_WITH_MULTI
if (h->bucketsize >= AHASH_MAX_TUNED)
goto set_full;
@@ -962,7 +969,7 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
}
copy_elem:
- j = n->pos++;
+ j = npos++;
data = ahash_data(n, j, set->dsize);
copy_data:
t->hregion[r].elements++;
@@ -985,6 +992,8 @@ overwrite_extensions:
if (SET_WITH_TIMEOUT(set))
ip_set_timeout_set(ext_timeout(data, set), ext->timeout);
smp_mb__before_atomic();
+ /* Ensure all data writes are visible before updating position */
+ smp_store_release(&n->pos, npos);
set_bit(j, n->used);
if (old != ERR_PTR(-ENOENT)) {
rcu_assign_pointer(hbucket(t, key), n);
@@ -1043,6 +1052,7 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext,
int i, j, k, r, ret = -IPSET_ERR_EXIST;
u32 key, multi = 0;
size_t dsize = set->dsize;
+ u8 pos;
/* Userspace add and resize is excluded by the mutex.
* Kernespace add does not trigger resize.
@@ -1058,7 +1068,8 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext,
n = rcu_dereference_bh(hbucket(t, key));
if (!n)
goto out;
- for (i = 0, k = 0; i < n->pos; i++) {
+ pos = smp_load_acquire(&n->pos);
+ for (i = 0, k = 0; i < pos; i++) {
if (!test_bit(i, n->used)) {
k++;
continue;
@@ -1072,8 +1083,8 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext,
ret = 0;
clear_bit(i, n->used);
smp_mb__after_atomic();
- if (i + 1 == n->pos)
- n->pos--;
+ if (i + 1 == pos)
+ smp_store_release(&n->pos, --pos);
t->hregion[r].elements--;
#ifdef IP_SET_HASH_WITH_NETS
for (j = 0; j < IPSET_NET_COUNT; j++)
@@ -1094,11 +1105,11 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext,
x->flags = flags;
}
}
- for (; i < n->pos; i++) {
+ for (; i < pos; i++) {
if (!test_bit(i, n->used))
k++;
}
- if (k == n->pos) {
+ if (k == pos) {
t->hregion[r].ext_size -= ext_size(n->size, dsize);
rcu_assign_pointer(hbucket(t, key), NULL);
kfree_rcu(n, rcu);
@@ -1109,7 +1120,7 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext,
if (!tmp)
goto out;
tmp->size = n->size - AHASH_INIT_SIZE;
- for (j = 0, k = 0; j < n->pos; j++) {
+ for (j = 0, k = 0; j < pos; j++) {
if (!test_bit(j, n->used))
continue;
data = ahash_data(n, j, dsize);
@@ -1170,6 +1181,7 @@ mtype_test_cidrs(struct ip_set *set, struct mtype_elem *d,
int ret, i, j = 0;
#endif
u32 key, multi = 0;
+ u8 pos;
pr_debug("test by nets\n");
for (; j < NLEN && h->nets[j].cidr[0] && !multi; j++) {
@@ -1187,7 +1199,8 @@ mtype_test_cidrs(struct ip_set *set, struct mtype_elem *d,
n = rcu_dereference_bh(hbucket(t, key));
if (!n)
continue;
- for (i = 0; i < n->pos; i++) {
+ pos = smp_load_acquire(&n->pos);
+ for (i = 0; i < pos; i++) {
if (!test_bit(i, n->used))
continue;
data = ahash_data(n, i, set->dsize);
@@ -1221,6 +1234,7 @@ mtype_test(struct ip_set *set, void *value, const struct ip_set_ext *ext,
struct mtype_elem *data;
int i, ret = 0;
u32 key, multi = 0;
+ u8 pos;
rcu_read_lock_bh();
t = rcu_dereference_bh(h->table);
@@ -1243,7 +1257,8 @@ mtype_test(struct ip_set *set, void *value, const struct ip_set_ext *ext,
ret = 0;
goto out;
}
- for (i = 0; i < n->pos; i++) {
+ pos = smp_load_acquire(&n->pos);
+ for (i = 0; i < pos; i++) {
if (!test_bit(i, n->used))
continue;
data = ahash_data(n, i, set->dsize);
@@ -1360,6 +1375,7 @@ mtype_list(const struct ip_set *set,
/* We assume that one hash bucket fills into one page */
void *incomplete;
int i, ret = 0;
+ u8 pos;
atd = nla_nest_start(skb, IPSET_ATTR_ADT);
if (!atd)
@@ -1378,7 +1394,8 @@ mtype_list(const struct ip_set *set,
cb->args[IPSET_CB_ARG0], t, n);
if (!n)
continue;
- for (i = 0; i < n->pos; i++) {
+ pos = smp_load_acquire(&n->pos);
+ for (i = 0; i < pos; i++) {
if (!test_bit(i, n->used))
continue;
e = ahash_data(n, i, set->dsize);
diff --git a/net/netfilter/ipset/ip_set_hash_ipmark.c b/net/netfilter/ipset/ip_set_hash_ipmark.c
index a22ec1a6f6ec..e26ca2a370e3 100644
--- a/net/netfilter/ipset/ip_set_hash_ipmark.c
+++ b/net/netfilter/ipset/ip_set_hash_ipmark.c
@@ -150,7 +150,7 @@ hash_ipmark4_uadt(struct ip_set *set, struct nlattr *tb[],
if (retried)
ip = ntohl(h->next.ip);
- for (; ip <= ip_to; ip++, i++) {
+ for (; ip <= ip_to; i++) {
e.ip = htonl(ip);
if (i > IPSET_MAX_RANGE) {
hash_ipmark4_data_next(&h->next, &e);
@@ -162,6 +162,10 @@ hash_ipmark4_uadt(struct ip_set *set, struct nlattr *tb[],
return ret;
ret = 0;
+
+ if (ip == ip_to)
+ break;
+ ip++;
}
return ret;
}
diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c
index e977b5a9c48d..41ca24a22a02 100644
--- a/net/netfilter/ipset/ip_set_hash_ipport.c
+++ b/net/netfilter/ipset/ip_set_hash_ipport.c
@@ -186,7 +186,7 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[],
if (retried)
ip = ntohl(h->next.ip);
- for (; ip <= ip_to; ip++) {
+ for (; ip <= ip_to;) {
p = retried && ip == ntohl(h->next.ip) ? ntohs(h->next.port)
: port;
for (; p <= port_to; p++, i++) {
@@ -203,6 +203,9 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[],
ret = 0;
}
+ if (ip == ip_to)
+ break;
+ ip++;
}
return ret;
}
diff --git a/net/netfilter/ipset/ip_set_hash_ipportip.c b/net/netfilter/ipset/ip_set_hash_ipportip.c
index 39a01934b153..b9ac2efaa15c 100644
--- a/net/netfilter/ipset/ip_set_hash_ipportip.c
+++ b/net/netfilter/ipset/ip_set_hash_ipportip.c
@@ -182,7 +182,7 @@ hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[],
if (retried)
ip = ntohl(h->next.ip);
- for (; ip <= ip_to; ip++) {
+ for (; ip <= ip_to;) {
p = retried && ip == ntohl(h->next.ip) ? ntohs(h->next.port)
: port;
for (; p <= port_to; p++, i++) {
@@ -199,6 +199,9 @@ hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[],
ret = 0;
}
+ if (ip == ip_to)
+ break;
+ ip++;
}
return ret;
}
diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c
index 5c6de605a9fb..2d6652d43199 100644
--- a/net/netfilter/ipset/ip_set_hash_ipportnet.c
+++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c
@@ -274,7 +274,7 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
p = port;
ip2 = ip2_from;
}
- for (; ip <= ip_to; ip++) {
+ for (; ip <= ip_to;) {
e.ip = htonl(ip);
for (; p <= port_to; p++) {
e.port = htons(p);
@@ -298,6 +298,9 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
ip2 = ip2_from;
}
p = port;
+ if (ip == ip_to)
+ break;
+ ip++;
}
return ret;
}
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index 2082bfb2d93c..9ea6b4fa78bf 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -267,27 +267,20 @@ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)
hash_key2 = hash_key;
use2 = false;
}
+
conn_tab_lock(t, cp, hash_key, hash_key2, use2, true /* new_hash */,
&head, &head2);
- spin_lock(&cp->lock);
-
- if (!(cp->flags & IP_VS_CONN_F_HASHED)) {
- cp->flags |= IP_VS_CONN_F_HASHED;
- WRITE_ONCE(cp->hn0.hash_key, hash_key);
- WRITE_ONCE(cp->hn1.hash_key, hash_key2);
- refcount_inc(&cp->refcnt);
- hlist_bl_add_head_rcu(&cp->hn0.node, head);
- if (use2)
- hlist_bl_add_head_rcu(&cp->hn1.node, head2);
- ret = 1;
- } else {
- pr_err("%s(): request for already hashed, called from %pS\n",
- __func__, __builtin_return_address(0));
- ret = 0;
- }
- spin_unlock(&cp->lock);
+ cp->flags |= IP_VS_CONN_F_HASHED;
+ WRITE_ONCE(cp->hn0.hash_key, hash_key);
+ WRITE_ONCE(cp->hn1.hash_key, hash_key2);
+ refcount_inc(&cp->refcnt);
+ hlist_bl_add_head_rcu(&cp->hn0.node, head);
+ if (use2)
+ hlist_bl_add_head_rcu(&cp->hn1.node, head2);
+
conn_tab_unlock(head, head2);
+ ret = 1;
/* Schedule resizing if load increases */
if (atomic_read(&ipvs->conn_count) > t->u_thresh &&
@@ -321,7 +314,6 @@ static inline bool ip_vs_conn_unlink(struct ip_vs_conn *cp)
conn_tab_lock(t, cp, hash_key, hash_key2, use2, false /* new_hash */,
&head, &head2);
- spin_lock(&cp->lock);
if (cp->flags & IP_VS_CONN_F_HASHED) {
/* Decrease refcnt and unlink conn only if we are last user */
@@ -334,7 +326,6 @@ static inline bool ip_vs_conn_unlink(struct ip_vs_conn *cp)
}
}
- spin_unlock(&cp->lock);
conn_tab_unlock(head, head2);
rcu_read_unlock();
@@ -637,6 +628,7 @@ void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport)
struct ip_vs_conn_hnode *hn;
u32 hash_key, hash_key_new;
struct ip_vs_conn_param p;
+ bool by_me = false;
int ntbl;
int dir;
@@ -664,8 +656,16 @@ retry:
t = rcu_dereference(t->new_tbl);
ntbl++;
/* We are lost? */
- if (ntbl >= 2)
+ if (ntbl >= 2) {
+ spin_lock_bh(&cp->lock);
+ if (cp->flags & IP_VS_CONN_F_NO_CPORT && by_me)
+ cp->cport = 0;
+ /* hn1 will be rehashed on next packet */
+ spin_unlock_bh(&cp->lock);
+ IP_VS_ERR_RL("%s(): Too many ht changes for dir %d\n",
+ __func__, dir);
return;
+ }
}
/* Rehashing during resize? Use the recent table for adds */
@@ -683,10 +683,13 @@ retry:
if (head > head2 && t == t2)
swap(head, head2);
+ /* Protect the cp->flags modification */
+ spin_lock_bh(&cp->lock);
+
/* Lock seqcount only for the old bucket, even if we are on new table
* because it affects the del operation, not the adding.
*/
- spin_lock_bh(&t->lock[hash_key & t->lock_mask].l);
+ spin_lock(&t->lock[hash_key & t->lock_mask].l);
preempt_disable_nested();
write_seqcount_begin(&t->seqc[hash_key & t->seqc_mask]);
@@ -704,14 +707,23 @@ retry:
hlist_bl_unlock(head);
write_seqcount_end(&t->seqc[hash_key & t->seqc_mask]);
preempt_enable_nested();
- spin_unlock_bh(&t->lock[hash_key & t->lock_mask].l);
+ spin_unlock(&t->lock[hash_key & t->lock_mask].l);
+ spin_unlock_bh(&cp->lock);
hash_key = hash_key_new;
goto retry;
}
- spin_lock(&cp->lock);
- if ((cp->flags & IP_VS_CONN_F_NO_CPORT) &&
- (cp->flags & IP_VS_CONN_F_HASHED)) {
+ /* Fill cport once, even if multiple packets try to do it */
+ if (cp->flags & IP_VS_CONN_F_NO_CPORT && (!cp->cport || by_me)) {
+ /* If we race with resizing make sure cport is set for dir 1 */
+ if (!cp->cport) {
+ cp->cport = cport;
+ by_me = true;
+ }
+ if (!dir) {
+ atomic_dec(&ipvs->no_cport_conns[af_id]);
+ cp->flags &= ~IP_VS_CONN_F_NO_CPORT;
+ }
/* We do not recalc hash_key_r under lock, we assume the
* parameters in cp do not change, i.e. cport is
* the only possible change.
@@ -726,21 +738,17 @@ retry:
hlist_bl_del_rcu(&hn->node);
hlist_bl_add_head_rcu(&hn->node, head_new);
}
- if (!dir) {
- atomic_dec(&ipvs->no_cport_conns[af_id]);
- cp->flags &= ~IP_VS_CONN_F_NO_CPORT;
- cp->cport = cport;
- }
}
- spin_unlock(&cp->lock);
if (head != head2)
hlist_bl_unlock(head2);
hlist_bl_unlock(head);
write_seqcount_end(&t->seqc[hash_key & t->seqc_mask]);
preempt_enable_nested();
- spin_unlock_bh(&t->lock[hash_key & t->lock_mask].l);
- if (dir--)
+ spin_unlock(&t->lock[hash_key & t->lock_mask].l);
+
+ spin_unlock_bh(&cp->lock);
+ if (dir-- && by_me)
goto next_dir;
}
@@ -1835,7 +1843,7 @@ static void ip_vs_conn_flush(struct netns_ipvs *ipvs)
if (!rcu_dereference_protected(ipvs->conn_tab, 1))
return;
- cancel_delayed_work_sync(&ipvs->conn_resize_work);
+ disable_delayed_work_sync(&ipvs->conn_resize_work);
if (!atomic_read(&ipvs->conn_count))
goto unreg;
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index f5b7a2047291..d40b404c1bf6 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -237,7 +237,7 @@ int ip_vs_rht_desired_size(struct netns_ipvs *ipvs, struct ip_vs_rht *t, int n,
{
if (!t)
return 1 << min_bits;
- n = roundup_pow_of_two(n);
+ n = n > 0 ? roundup_pow_of_two(n) : 1;
if (lfactor < 0) {
int factor = min(-lfactor, max_bits);
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 6632daa87ded..16daba8cac83 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -261,12 +261,28 @@ static void est_reload_work_handler(struct work_struct *work)
if (!kd)
continue;
/* New config ? Stop kthread tasks */
- if (genid != genid_done)
- ip_vs_est_kthread_stop(kd);
+ if (genid != genid_done) {
+ if (!id) {
+ /* Only we can stop kt 0 but not under mutex */
+ mutex_unlock(&ipvs->est_mutex);
+ ip_vs_est_kthread_stop(kd);
+ mutex_lock(&ipvs->est_mutex);
+ if (!READ_ONCE(ipvs->enable))
+ goto unlock;
+ /* kd for kt 0 is never destroyed */
+ } else {
+ ip_vs_est_kthread_stop(kd);
+ }
+ }
if (!kd->task && !ip_vs_est_stopped(ipvs)) {
+ bool start;
+
/* Do not start kthreads above 0 in calc phase */
- if ((!id || !ipvs->est_calc_phase) &&
- ip_vs_est_kthread_start(ipvs, kd) < 0)
+ if (id)
+ start = !ipvs->est_calc_phase;
+ else
+ start = kd->needed;
+ if (start && ip_vs_est_kthread_start(ipvs, kd) < 0)
repeat = true;
}
}
@@ -311,18 +327,22 @@ ip_vs_use_count_dec(void)
/* Service hashing:
* Operation Locking order
* ---------------------------------------------------------------------------
- * add table service_mutex, svc_resize_sem(W)
- * del table service_mutex
- * move between tables svc_resize_sem(W), seqcount_t(W), bit lock
- * add/del service service_mutex, bit lock
+ * add first table service_mutex
+ * attach new table service_mutex
+ * add/del service service_mutex, RCU, bit lock
+ * move between tables (rehash) svc_resize_sem(W), seqcount_t(W), bit lock
+ * replace old with attached svc_resize_sem(W), svc_replace_sem(W)
* find service RCU, seqcount_t(R)
* walk services(blocking) service_mutex, svc_resize_sem(R)
* walk services(non-blocking) RCU, seqcount_t(R)
+ * walk services(non-blocking) svc_resize_sem(R), RCU, seqcount_t(R)
+ * walk services(non-blocking) svc_replace_sem(R), RCU, seqcount_t(R)
+ * del table service_mutex after stopped work
*
- * - new tables are linked/unlinked under service_mutex and svc_resize_sem
- * - new table is linked on resizing and all operations can run in parallel
- * in 2 tables until the new table is registered as current one
- * - two contexts can modify buckets: config and table resize, both in
+ * - new table is attached on resizing under service_mutex and all operations
+ * can run in parallel in 2 tables until the new table is registered as current
+ * one
+ * - two contexts can modify buckets: config and table resize (work), both in
* process context
* - only table resizer can move entries, so we do not protect t->seqc[]
* items with t->lock[]
@@ -330,9 +350,13 @@ ip_vs_use_count_dec(void)
* services are moved to new table
* - move operations may disturb readers: find operation will not miss entries
* but walkers may see same entry twice if they are forced to retry chains
- * - walkers using cond_resched_rcu() on !PREEMPT_RCU may need to hold
- * service_mutex to disallow new tables to be installed or to check
+ * or to walk the newly attached second table
+ * - walkers using cond_resched_rcu() on !PREEMPT_RCU may need to check
* svc_table_changes and repeat the RCU read section if new table is installed
+ * - walkers may serialize with the whole resizing process (svc_resize_sem)
+ * to prevent seeing same service twice or just with the svc_table
+ * replace (svc_replace_sem) when we can see entries twice but we
+ * prefer to run concurrently with the rehashing.
*/
/*
@@ -371,9 +395,16 @@ static int ip_vs_svc_hash(struct ip_vs_service *svc)
/* increase its refcnt because it is referenced by the svc table */
atomic_inc(&svc->refcnt);
+ /* We know if new table is attached under service_mutex but rely on
+ * RCU to hold the old table to be freed in resizer
+ */
+ rcu_read_lock();
+
+ /* This can be the old or the new table */
+ t = rcu_dereference(ipvs->svc_table);
+
/* New entries go into recent table */
- t = rcu_dereference_protected(ipvs->svc_table, 1);
- t = rcu_dereference_protected(t->new_tbl, 1);
+ t = rcu_dereference(t->new_tbl);
if (svc->fwmark == 0) {
/*
@@ -394,6 +425,8 @@ static int ip_vs_svc_hash(struct ip_vs_service *svc)
hlist_bl_add_head_rcu(&svc->s_list, head);
hlist_bl_unlock(head);
+ rcu_read_unlock();
+
return 1;
}
@@ -416,7 +449,13 @@ static int ip_vs_svc_unhash(struct ip_vs_service *svc)
return 0;
}
- t = rcu_dereference_protected(ipvs->svc_table, 1);
+ /* We know if new table is attached under service_mutex but rely on
+ * RCU to hold the old table to be freed in resizer
+ */
+ rcu_read_lock();
+
+ /* This can be the old or the new table */
+ t = rcu_dereference(ipvs->svc_table);
hash_key = READ_ONCE(svc->hash_key);
/* We need to lock the bucket in the right table */
if (ip_vs_rht_same_table(t, hash_key)) {
@@ -427,13 +466,13 @@ static int ip_vs_svc_unhash(struct ip_vs_service *svc)
/* Moved to new table ? */
if (hash_key != hash_key2) {
hlist_bl_unlock(head);
- t = rcu_dereference_protected(t->new_tbl, 1);
+ t = rcu_dereference(t->new_tbl);
head = t->buckets + (hash_key2 & t->mask);
hlist_bl_lock(head);
}
} else {
/* It is already moved to new table */
- t = rcu_dereference_protected(t->new_tbl, 1);
+ t = rcu_dereference(t->new_tbl);
head = t->buckets + (hash_key & t->mask);
hlist_bl_lock(head);
}
@@ -443,6 +482,8 @@ static int ip_vs_svc_unhash(struct ip_vs_service *svc)
svc->flags &= ~IP_VS_SVC_F_HASHED;
atomic_dec(&svc->refcnt);
hlist_bl_unlock(head);
+
+ rcu_read_unlock();
return 1;
}
@@ -650,15 +691,14 @@ static void svc_resize_work_handler(struct work_struct *work)
goto unlock_sem;
more_work = false;
clear_bit(IP_VS_WORK_SVC_RESIZE, &ipvs->work_flags);
- if (!READ_ONCE(ipvs->enable) ||
- test_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags))
+ if (!READ_ONCE(ipvs->enable))
goto unlock_m;
t = rcu_dereference_protected(ipvs->svc_table, 1);
/* Do nothing if table is removed */
if (!t)
goto unlock_m;
- /* New table needs to be registered? BUG! */
- if (t != rcu_dereference_protected(t->new_tbl, 1))
+ /* New table already attached? BUG! */
+ if (t != rcu_access_pointer(t->new_tbl))
goto unlock_m;
lfactor = sysctl_svc_lfactor(ipvs);
@@ -675,6 +715,7 @@ static void svc_resize_work_handler(struct work_struct *work)
/* Flip the table_id */
t_new->table_id = t->table_id ^ IP_VS_RHT_TABLE_ID_MASK;
+ /* Attach new table */
rcu_assign_pointer(t->new_tbl, t_new);
/* Allow add/del to new_tbl while moving from old table */
mutex_unlock(&ipvs->service_mutex);
@@ -682,8 +723,8 @@ static void svc_resize_work_handler(struct work_struct *work)
ip_vs_rht_for_each_bucket(t, bucket, head) {
same_bucket:
if (++limit >= 16) {
- if (!READ_ONCE(ipvs->enable) ||
- test_bit(IP_VS_WORK_SVC_NORESIZE,
+ /* Check if work is stopped */
+ if (test_bit(IP_VS_WORK_SVC_NORESIZE,
&ipvs->work_flags))
goto unlock_sem;
if (resched_score >= 100) {
@@ -748,16 +789,12 @@ same_bucket:
goto same_bucket;
}
- /* Tables can be switched only under service_mutex */
- while (!mutex_trylock(&ipvs->service_mutex)) {
- cond_resched();
- if (!READ_ONCE(ipvs->enable) ||
- test_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags))
- goto unlock_sem;
- }
- if (!READ_ONCE(ipvs->enable) ||
- test_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags))
- goto unlock_m;
+ /* Serialize with readers that don't like svc_table changes */
+ down_write(&ipvs->svc_replace_sem);
+
+ /* Check if work is stopped to avoid synchronize_rcu() */
+ if (test_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags))
+ goto unlock_repl;
rcu_assign_pointer(ipvs->svc_table, t_new);
/* Inform readers that new table is installed */
@@ -765,8 +802,8 @@ same_bucket:
atomic_inc(&ipvs->svc_table_changes);
t_free = t;
-unlock_m:
- mutex_unlock(&ipvs->service_mutex);
+unlock_repl:
+ up_write(&ipvs->svc_replace_sem);
unlock_sem:
up_write(&ipvs->svc_resize_sem);
@@ -785,6 +822,11 @@ out:
test_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags))
return;
queue_delayed_work(system_unbound_wq, &ipvs->svc_resize_work, 1);
+ return;
+
+unlock_m:
+ mutex_unlock(&ipvs->service_mutex);
+ goto unlock_sem;
}
static inline void
@@ -1102,6 +1144,24 @@ out:
return dest;
}
+/* Put destination in trash */
+static void ip_vs_trash_put_dest(struct netns_ipvs *ipvs,
+ struct ip_vs_dest *dest, unsigned long istart,
+ bool cleanup)
+{
+ spin_lock_bh(&ipvs->dest_trash_lock);
+ IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, dest->refcnt=%d\n",
+ IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port),
+ refcount_read(&dest->refcnt));
+ if (list_empty(&ipvs->dest_trash) && !cleanup)
+ mod_timer(&ipvs->dest_trash_timer,
+ jiffies + (IP_VS_DEST_TRASH_PERIOD >> 1));
+ /* dest lives in trash with reference */
+ list_add(&dest->t_list, &ipvs->dest_trash);
+ dest->idle_start = istart;
+ spin_unlock_bh(&ipvs->dest_trash_lock);
+}
+
static void ip_vs_dest_rcu_free(struct rcu_head *head)
{
struct ip_vs_dest *dest;
@@ -1461,9 +1521,12 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
ntohs(dest->vport));
ret = ip_vs_start_estimator(svc->ipvs, &dest->stats);
+ /* On error put back dest into the trash */
if (ret < 0)
- return ret;
- __ip_vs_update_dest(svc, dest, udest, 1);
+ ip_vs_trash_put_dest(svc->ipvs, dest, dest->idle_start,
+ false);
+ else
+ __ip_vs_update_dest(svc, dest, udest, 1);
} else {
/*
* Allocate and initialize the dest structure
@@ -1533,17 +1596,7 @@ static void __ip_vs_del_dest(struct netns_ipvs *ipvs, struct ip_vs_dest *dest,
*/
ip_vs_rs_unhash(dest);
- spin_lock_bh(&ipvs->dest_trash_lock);
- IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, dest->refcnt=%d\n",
- IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port),
- refcount_read(&dest->refcnt));
- if (list_empty(&ipvs->dest_trash) && !cleanup)
- mod_timer(&ipvs->dest_trash_timer,
- jiffies + (IP_VS_DEST_TRASH_PERIOD >> 1));
- /* dest lives in trash with reference */
- list_add(&dest->t_list, &ipvs->dest_trash);
- dest->idle_start = 0;
- spin_unlock_bh(&ipvs->dest_trash_lock);
+ ip_vs_trash_put_dest(ipvs, dest, 0, cleanup);
/* Queue up delayed work to expire all no destination connections.
* No-op when CONFIG_SYSCTL is disabled.
@@ -1664,6 +1717,7 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u,
struct ip_vs_pe *pe = NULL;
int ret_hooks = -1;
int ret = 0;
+ bool grow;
/* increase the module use count */
if (!ip_vs_use_count_inc())
@@ -1705,16 +1759,25 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u,
}
#endif
- t = rcu_dereference_protected(ipvs->svc_table, 1);
+ /* The old table can be freed, protect it with RCU */
+ rcu_read_lock();
+ t = rcu_dereference(ipvs->svc_table);
if (!t) {
int lfactor = sysctl_svc_lfactor(ipvs);
int new_size = ip_vs_svc_desired_size(ipvs, NULL, lfactor);
+ rcu_read_unlock();
t_new = ip_vs_svc_table_alloc(ipvs, new_size, lfactor);
if (!t_new) {
ret = -ENOMEM;
goto out_err;
}
+ grow = false;
+ } else {
+ /* Even the currently attached new table may need to grow */
+ t = rcu_dereference(t->new_tbl);
+ grow = ip_vs_get_num_services(ipvs) + 1 > t->u_thresh;
+ rcu_read_unlock();
}
if (!rcu_dereference_protected(ipvs->conn_tab, 1)) {
@@ -1773,6 +1836,7 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u,
goto out_err;
if (t_new) {
+ /* Add table for first time */
clear_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags);
rcu_assign_pointer(ipvs->svc_table, t_new);
t_new = NULL;
@@ -1804,19 +1868,23 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u,
ip_vs_svc_hash(svc);
/* Schedule resize work */
- if (t && ip_vs_get_num_services(ipvs) > t->u_thresh &&
- !test_and_set_bit(IP_VS_WORK_SVC_RESIZE, &ipvs->work_flags))
+ if (grow && !test_and_set_bit(IP_VS_WORK_SVC_RESIZE, &ipvs->work_flags))
queue_delayed_work(system_unbound_wq, &ipvs->svc_resize_work,
1);
*svc_p = svc;
if (!READ_ONCE(ipvs->enable)) {
+ mutex_lock(&ipvs->est_mutex);
+
/* Now there is a service - full throttle */
WRITE_ONCE(ipvs->enable, 1);
+ ipvs->est_max_threads = ip_vs_est_max_threads(ipvs);
+
/* Start estimation for first time */
- ip_vs_est_reload_start(ipvs);
+ ip_vs_est_reload_start(ipvs, true);
+ mutex_unlock(&ipvs->est_mutex);
}
return 0;
@@ -1830,7 +1898,7 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u,
if (ret_hooks >= 0)
ip_vs_unregister_hooks(ipvs, u->af);
if (svc != NULL) {
- ip_vs_unbind_scheduler(svc, sched);
+ ip_vs_unbind_scheduler(svc);
ip_vs_service_free(svc);
}
ip_vs_scheduler_put(sched);
@@ -1894,9 +1962,8 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
old_sched = rcu_dereference_protected(svc->scheduler, 1);
if (sched != old_sched) {
if (old_sched) {
- ip_vs_unbind_scheduler(svc, old_sched);
- RCU_INIT_POINTER(svc->scheduler, NULL);
- /* Wait all svc->sched_data users */
+ ip_vs_unbind_scheduler(svc);
+ /* Wait all svc->scheduler/sched_data users */
synchronize_rcu();
}
/* Bind the new scheduler */
@@ -1904,6 +1971,10 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
ret = ip_vs_bind_scheduler(svc, sched);
if (ret) {
ip_vs_scheduler_put(sched);
+ /* Try to restore the old_sched */
+ if (old_sched &&
+ !ip_vs_bind_scheduler(svc, old_sched))
+ old_sched = NULL;
goto out;
}
}
@@ -1959,7 +2030,7 @@ static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup)
/* Unbind scheduler */
old_sched = rcu_dereference_protected(svc->scheduler, 1);
- ip_vs_unbind_scheduler(svc, old_sched);
+ ip_vs_unbind_scheduler(svc);
ip_vs_scheduler_put(old_sched);
/* Unbind persistence engine, keep svc->pe */
@@ -2022,7 +2093,6 @@ static int ip_vs_del_service(struct ip_vs_service *svc)
return -EEXIST;
ipvs = svc->ipvs;
ip_vs_unlink_service(svc, false);
- t = rcu_dereference_protected(ipvs->svc_table, 1);
/* Drop the table if no more services */
ns = ip_vs_get_num_services(ipvs);
@@ -2030,8 +2100,12 @@ static int ip_vs_del_service(struct ip_vs_service *svc)
/* Stop the resizer and drop the tables */
set_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags);
cancel_delayed_work_sync(&ipvs->svc_resize_work);
+ t = rcu_dereference_protected(ipvs->svc_table, 1);
if (t) {
rcu_assign_pointer(ipvs->svc_table, NULL);
+ /* Inform readers that table is removed */
+ smp_mb__before_atomic();
+ atomic_inc(&ipvs->svc_table_changes);
while (1) {
p = rcu_dereference_protected(t->new_tbl, 1);
call_rcu(&t->rcu_head, ip_vs_rht_rcu_free);
@@ -2040,11 +2114,19 @@ static int ip_vs_del_service(struct ip_vs_service *svc)
t = p;
}
}
- } else if (ns <= t->l_thresh &&
- !test_and_set_bit(IP_VS_WORK_SVC_RESIZE,
- &ipvs->work_flags)) {
- queue_delayed_work(system_unbound_wq, &ipvs->svc_resize_work,
- 1);
+ } else {
+ bool shrink;
+
+ rcu_read_lock();
+ t = rcu_dereference(ipvs->svc_table);
+ /* Even the currently attached new table may need to shrink */
+ t = rcu_dereference(t->new_tbl);
+ shrink = ns <= t->l_thresh;
+ rcu_read_unlock();
+ if (shrink && !test_and_set_bit(IP_VS_WORK_SVC_RESIZE,
+ &ipvs->work_flags))
+ queue_delayed_work(system_unbound_wq,
+ &ipvs->svc_resize_work, 1);
}
return 0;
}
@@ -2078,6 +2160,9 @@ static int ip_vs_flush(struct netns_ipvs *ipvs, bool cleanup)
t = rcu_dereference_protected(ipvs->svc_table, 1);
if (t) {
rcu_assign_pointer(ipvs->svc_table, NULL);
+ /* Inform readers that table is removed */
+ smp_mb__before_atomic();
+ atomic_inc(&ipvs->svc_table_changes);
while (1) {
p = rcu_dereference_protected(t->new_tbl, 1);
call_rcu(&t->rcu_head, ip_vs_rht_rcu_free);
@@ -2086,6 +2171,11 @@ static int ip_vs_flush(struct netns_ipvs *ipvs, bool cleanup)
t = p;
}
}
+ /* Stop the tot_stats estimator early under service_mutex
+ * to avoid locking it again later.
+ */
+ if (cleanup)
+ ip_vs_stop_estimator_tot_stats(ipvs);
return 0;
}
@@ -2141,17 +2231,21 @@ static int ip_vs_dst_event(struct notifier_block *this, unsigned long event,
struct ip_vs_service *svc;
struct hlist_bl_node *e;
struct ip_vs_dest *dest;
- int old_gen, new_gen;
+ int old_gen;
if (event != NETDEV_DOWN || !ipvs)
return NOTIFY_DONE;
IP_VS_DBG(3, "%s() dev=%s\n", __func__, dev->name);
+ /* Allow concurrent rehashing on resize but to avoid loop
+ * serialize with installing the new table.
+ */
+ down_read(&ipvs->svc_replace_sem);
+
old_gen = atomic_read(&ipvs->svc_table_changes);
rcu_read_lock();
-repeat:
smp_rmb(); /* ipvs->svc_table and svc_table_changes */
ip_vs_rht_walk_buckets_rcu(ipvs->svc_table, head) {
hlist_bl_for_each_entry_rcu(svc, e, head, s_list) {
@@ -2164,17 +2258,17 @@ repeat:
}
resched_score++;
if (resched_score >= 100) {
- resched_score = 0;
cond_resched_rcu();
- new_gen = atomic_read(&ipvs->svc_table_changes);
- /* New table installed ? */
- if (old_gen != new_gen) {
- old_gen = new_gen;
- goto repeat;
- }
+ /* Flushed? So no more dev refs */
+ if (atomic_read(&ipvs->svc_table_changes) != old_gen)
+ goto done;
+ resched_score = 0;
}
}
+
+done:
rcu_read_unlock();
+ up_read(&ipvs->svc_replace_sem);
return NOTIFY_DONE;
}
@@ -2201,6 +2295,10 @@ static int ip_vs_zero_all(struct netns_ipvs *ipvs)
struct ip_vs_service *svc;
struct hlist_bl_node *e;
+ /* svc_table can not be replaced (svc_replace_sem) or
+ * removed (service_mutex)
+ */
+ down_read(&ipvs->svc_replace_sem);
rcu_read_lock();
ip_vs_rht_walk_buckets_rcu(ipvs->svc_table, head) {
@@ -2216,6 +2314,7 @@ static int ip_vs_zero_all(struct netns_ipvs *ipvs)
}
rcu_read_unlock();
+ up_read(&ipvs->svc_replace_sem);
ip_vs_zero_stats(&ipvs->tot_stats->s);
return 0;
@@ -2331,7 +2430,7 @@ static int ipvs_proc_est_cpumask_set(const struct ctl_table *table,
/* est_max_threads may depend on cpulist size */
ipvs->est_max_threads = ip_vs_est_max_threads(ipvs);
ipvs->est_calc_phase = 1;
- ip_vs_est_reload_start(ipvs);
+ ip_vs_est_reload_start(ipvs, true);
unlock:
mutex_unlock(&ipvs->est_mutex);
@@ -2351,11 +2450,14 @@ static int ipvs_proc_est_cpumask_get(const struct ctl_table *table,
mutex_lock(&ipvs->est_mutex);
- if (ipvs->est_cpulist_valid)
- mask = *valp;
- else
- mask = (struct cpumask *)housekeeping_cpumask(HK_TYPE_KTHREAD);
- ret = scnprintf(buffer, size, "%*pbl\n", cpumask_pr_args(mask));
+ /* HK_TYPE_KTHREAD cpumask needs RCU protection */
+ scoped_guard(rcu) {
+ if (ipvs->est_cpulist_valid)
+ mask = *valp;
+ else
+ mask = (struct cpumask *)housekeeping_cpumask(HK_TYPE_KTHREAD);
+ ret = scnprintf(buffer, size, "%*pbl\n", cpumask_pr_args(mask));
+ }
mutex_unlock(&ipvs->est_mutex);
@@ -2411,7 +2513,7 @@ static int ipvs_proc_est_nice(const struct ctl_table *table, int write,
mutex_lock(&ipvs->est_mutex);
if (*valp != val) {
*valp = val;
- ip_vs_est_reload_start(ipvs);
+ ip_vs_est_reload_start(ipvs, true);
}
mutex_unlock(&ipvs->est_mutex);
}
@@ -2438,7 +2540,7 @@ static int ipvs_proc_run_estimation(const struct ctl_table *table, int write,
mutex_lock(&ipvs->est_mutex);
if (*valp != val) {
*valp = val;
- ip_vs_est_reload_start(ipvs);
+ ip_vs_est_reload_start(ipvs, true);
}
mutex_unlock(&ipvs->est_mutex);
}
@@ -2463,7 +2565,7 @@ static int ipvs_proc_conn_lfactor(const struct ctl_table *table, int write,
if (val < -8 || val > 8) {
ret = -EINVAL;
} else {
- *valp = val;
+ WRITE_ONCE(*valp, val);
if (rcu_access_pointer(ipvs->conn_tab))
mod_delayed_work(system_unbound_wq,
&ipvs->conn_resize_work, 0);
@@ -2490,10 +2592,16 @@ static int ipvs_proc_svc_lfactor(const struct ctl_table *table, int write,
if (val < -8 || val > 8) {
ret = -EINVAL;
} else {
- *valp = val;
- if (rcu_access_pointer(ipvs->svc_table))
+ mutex_lock(&ipvs->service_mutex);
+ WRITE_ONCE(*valp, val);
+ /* Make sure the services are present */
+ if (rcu_access_pointer(ipvs->svc_table) &&
+ READ_ONCE(ipvs->enable) &&
+ !test_bit(IP_VS_WORK_SVC_NORESIZE,
+ &ipvs->work_flags))
mod_delayed_work(system_unbound_wq,
&ipvs->svc_resize_work, 0);
+ mutex_unlock(&ipvs->service_mutex);
}
}
return ret;
@@ -3004,11 +3112,13 @@ static int ip_vs_status_show(struct seq_file *seq, void *v)
int old_gen, new_gen;
u32 counts[8];
u32 bucket;
- int count;
+ u32 count;
+ int loops;
u32 sum1;
u32 sum;
int i;
+ /* Info for conns */
rcu_read_lock();
t = rcu_dereference(ipvs->conn_tab);
@@ -3020,6 +3130,7 @@ static int ip_vs_status_show(struct seq_file *seq, void *v)
if (!atomic_read(&ipvs->conn_count))
goto after_conns;
old_gen = atomic_read(&ipvs->conn_tab_changes);
+ loops = 0;
repeat_conn:
smp_rmb(); /* ipvs->conn_tab and conn_tab_changes */
@@ -3032,8 +3143,11 @@ repeat_conn:
resched_score++;
ip_vs_rht_walk_bucket_rcu(t, bucket, head) {
count = 0;
- hlist_bl_for_each_entry_rcu(hn, e, head, node)
+ hlist_bl_for_each_entry_rcu(hn, e, head, node) {
count++;
+ if (count >= ARRAY_SIZE(counts) - 1)
+ break;
+ }
}
resched_score += count;
if (resched_score >= 100) {
@@ -3042,31 +3156,40 @@ repeat_conn:
new_gen = atomic_read(&ipvs->conn_tab_changes);
/* New table installed ? */
if (old_gen != new_gen) {
+ /* Too many changes? */
+ if (++loops >= 5)
+ goto after_conns;
old_gen = new_gen;
goto repeat_conn;
}
}
- counts[min(count, (int)ARRAY_SIZE(counts) - 1)]++;
+ counts[count]++;
}
}
for (sum = 0, i = 0; i < ARRAY_SIZE(counts); i++)
sum += counts[i];
sum1 = sum - counts[0];
- seq_printf(seq, "Conn buckets empty:\t%u (%lu%%)\n",
- counts[0], (unsigned long)counts[0] * 100 / max(sum, 1U));
+ seq_printf(seq, "Conn buckets empty:\t%u (%llu%%)\n",
+ counts[0], div_u64((u64)counts[0] * 100U, max(sum, 1U)));
for (i = 1; i < ARRAY_SIZE(counts); i++) {
if (!counts[i])
continue;
- seq_printf(seq, "Conn buckets len-%d:\t%u (%lu%%)\n",
+ seq_printf(seq, "Conn buckets len-%d:\t%u (%llu%%)\n",
i, counts[i],
- (unsigned long)counts[i] * 100 / max(sum1, 1U));
+ div_u64((u64)counts[i] * 100U, max(sum1, 1U)));
}
after_conns:
+ rcu_read_unlock();
+
+ /* Info for services */
+ down_read(&ipvs->svc_replace_sem);
+ rcu_read_lock();
+
t = rcu_dereference(ipvs->svc_table);
count = ip_vs_get_num_services(ipvs);
- seq_printf(seq, "Services:\t%d\n", count);
+ seq_printf(seq, "Services:\t%u\n", count);
seq_printf(seq, "Service buckets:\t%d (%d bits, lfactor %d)\n",
t ? t->size : 0, t ? t->bits : 0, t ? t->lfactor : 0);
@@ -3074,7 +3197,6 @@ after_conns:
goto after_svc;
old_gen = atomic_read(&ipvs->svc_table_changes);
-repeat_svc:
smp_rmb(); /* ipvs->svc_table and svc_table_changes */
memset(counts, 0, sizeof(counts));
ip_vs_rht_for_each_table_rcu(ipvs->svc_table, t, pt) {
@@ -3086,37 +3208,41 @@ repeat_svc:
ip_vs_rht_walk_bucket_rcu(t, bucket, head) {
count = 0;
hlist_bl_for_each_entry_rcu(svc, e, head,
- s_list)
+ s_list) {
count++;
+ if (count >= ARRAY_SIZE(counts) - 1)
+ break;
+ }
}
resched_score += count;
if (resched_score >= 100) {
resched_score = 0;
cond_resched_rcu();
- new_gen = atomic_read(&ipvs->svc_table_changes);
- /* New table installed ? */
- if (old_gen != new_gen) {
- old_gen = new_gen;
- goto repeat_svc;
- }
+ /* Flushed? */
+ if (atomic_read(&ipvs->svc_table_changes) !=
+ old_gen)
+ goto after_svc;
}
- counts[min(count, (int)ARRAY_SIZE(counts) - 1)]++;
+ counts[count]++;
}
}
for (sum = 0, i = 0; i < ARRAY_SIZE(counts); i++)
sum += counts[i];
sum1 = sum - counts[0];
- seq_printf(seq, "Service buckets empty:\t%u (%lu%%)\n",
- counts[0], (unsigned long)counts[0] * 100 / max(sum, 1U));
+ seq_printf(seq, "Service buckets empty:\t%u (%llu%%)\n",
+ counts[0], div_u64((u64)counts[0] * 100U, max(sum, 1U)));
for (i = 1; i < ARRAY_SIZE(counts); i++) {
if (!counts[i])
continue;
- seq_printf(seq, "Service buckets len-%d:\t%u (%lu%%)\n",
+ seq_printf(seq, "Service buckets len-%d:\t%u (%llu%%)\n",
i, counts[i],
- (unsigned long)counts[i] * 100 / max(sum1, 1U));
+ div_u64((u64)counts[i] * 100U, max(sum1, 1U)));
}
after_svc:
+ rcu_read_unlock();
+ up_read(&ipvs->svc_replace_sem);
+
seq_printf(seq, "Stats thread slots:\t%d (max %lu)\n",
ipvs->est_kt_count, ipvs->est_max_threads);
seq_printf(seq, "Stats chain max len:\t%d\n", ipvs->est_chain_max);
@@ -3124,7 +3250,6 @@ after_svc:
ipvs->est_chain_max * IPVS_EST_CHAIN_FACTOR *
IPVS_EST_NTICKS);
- rcu_read_unlock();
return 0;
}
@@ -3436,7 +3561,7 @@ __ip_vs_get_service_entries(struct netns_ipvs *ipvs,
int ret = 0;
lockdep_assert_held(&ipvs->svc_resize_sem);
- /* All service modifications are disabled, go ahead */
+ /* All svc_table modifications are disabled, go ahead */
ip_vs_rht_walk_buckets(ipvs->svc_table, head) {
hlist_bl_for_each_entry(svc, e, head, s_list) {
/* Only expose IPv4 entries to old interface */
@@ -3620,7 +3745,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
pr_err("length: %u != %zu\n", *len, size);
return -EINVAL;
}
- /* Protect against table resizer moving the entries.
+ /* Prevent modifications to the list with services.
* Try reverse locking, so that we do not hold the mutex
* while waiting for semaphore.
*/
@@ -3962,6 +4087,7 @@ static int ip_vs_genl_dump_services(struct sk_buff *skb,
int start = cb->args[0];
int idx = 0;
+ /* Make sure we do not see same service twice during resize */
down_read(&ipvs->svc_resize_sem);
rcu_read_lock();
ip_vs_rht_walk_buckets_safe_rcu(ipvs->svc_table, head) {
@@ -4967,7 +5093,14 @@ static void __net_exit ip_vs_control_net_cleanup_sysctl(struct netns_ipvs *ipvs)
cancel_delayed_work_sync(&ipvs->defense_work);
cancel_work_sync(&ipvs->defense_work.work);
unregister_net_sysctl_table(ipvs->sysctl_hdr);
- ip_vs_stop_estimator(ipvs, &ipvs->tot_stats->s);
+ if (ipvs->tot_stats->s.est.ktid != -2) {
+ /* Not stopped yet? This happens only on netns init error and
+ * we even do not need to lock the service_mutex for this case.
+ */
+ mutex_lock(&ipvs->service_mutex);
+ ip_vs_stop_estimator(ipvs, &ipvs->tot_stats->s);
+ mutex_unlock(&ipvs->service_mutex);
+ }
if (ipvs->est_cpulist_valid)
free_cpumask_var(ipvs->sysctl_est_cpulist);
@@ -4998,6 +5131,7 @@ int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs)
/* Initialize service_mutex, svc_table per netns */
__mutex_init(&ipvs->service_mutex, "ipvs->service_mutex", &__ipvs_service_key);
init_rwsem(&ipvs->svc_resize_sem);
+ init_rwsem(&ipvs->svc_replace_sem);
INIT_DELAYED_WORK(&ipvs->svc_resize_work, svc_resize_work_handler);
atomic_set(&ipvs->svc_table_changes, 0);
RCU_INIT_POINTER(ipvs->svc_table, NULL);
@@ -5039,7 +5173,7 @@ int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs)
ipvs->net->proc_net,
ip_vs_stats_percpu_show, NULL))
goto err_percpu;
- if (!proc_create_net_single("ip_vs_status", 0, ipvs->net->proc_net,
+ if (!proc_create_net_single("ip_vs_status", 0440, ipvs->net->proc_net,
ip_vs_status_show, NULL))
goto err_status;
#endif
diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c
index 433ba3cab58c..ab09f5182951 100644
--- a/net/netfilter/ipvs/ip_vs_est.c
+++ b/net/netfilter/ipvs/ip_vs_est.c
@@ -68,6 +68,11 @@
and the limit of estimators per kthread
- est_add_ktid: ktid where to add new ests, can point to empty slot where
we should add kt data
+ - data protected by service_mutex: est_temp_list, est_add_ktid,
+ est_kt_count(R/W), est_kt_arr(R/W), est_genid_done, kd->needed(R/W)
+ - data protected by est_mutex: est_genid, est_max_threads, sysctl_est_cpulist,
+ est_cpulist_valid, sysctl_est_nice, est_stopped, sysctl_run_estimation,
+ est_kt_count(R), est_kt_arr(R), kd->needed(R), kd->task (id > 0)
*/
static struct lock_class_key __ipvs_est_key;
@@ -227,14 +232,17 @@ static int ip_vs_estimation_kthread(void *data)
}
/* Schedule stop/start for kthread tasks */
-void ip_vs_est_reload_start(struct netns_ipvs *ipvs)
+void ip_vs_est_reload_start(struct netns_ipvs *ipvs, bool restart)
{
+ lockdep_assert_held(&ipvs->est_mutex);
+
/* Ignore reloads before first service is added */
if (!READ_ONCE(ipvs->enable))
return;
ip_vs_est_stopped_recalc(ipvs);
- /* Bump the kthread configuration genid */
- atomic_inc(&ipvs->est_genid);
+ /* Bump the kthread configuration genid if stopping is requested */
+ if (restart)
+ atomic_inc(&ipvs->est_genid);
queue_delayed_work(system_long_wq, &ipvs->est_reload_work, 0);
}
@@ -304,12 +312,17 @@ static int ip_vs_est_add_kthread(struct netns_ipvs *ipvs)
void *arr = NULL;
int i;
- if ((unsigned long)ipvs->est_kt_count >= ipvs->est_max_threads &&
- READ_ONCE(ipvs->enable) && ipvs->est_max_threads)
- return -EINVAL;
-
mutex_lock(&ipvs->est_mutex);
+ /* Allow kt 0 data to be created before the services are added
+ * and limit the kthreads when services are present.
+ */
+ if ((unsigned long)ipvs->est_kt_count >= ipvs->est_max_threads &&
+ READ_ONCE(ipvs->enable) && ipvs->est_max_threads) {
+ ret = -EINVAL;
+ goto out;
+ }
+
for (i = 0; i < id; i++) {
if (!ipvs->est_kt_arr[i])
break;
@@ -333,6 +346,7 @@ static int ip_vs_est_add_kthread(struct netns_ipvs *ipvs)
kd->est_timer = jiffies;
kd->id = id;
ip_vs_est_set_params(ipvs, kd);
+ kd->needed = 1;
/* Pre-allocate stats used in calc phase */
if (!id && !kd->calc_stats) {
@@ -341,12 +355,8 @@ static int ip_vs_est_add_kthread(struct netns_ipvs *ipvs)
goto out;
}
- /* Start kthread tasks only when services are present */
- if (READ_ONCE(ipvs->enable) && !ip_vs_est_stopped(ipvs)) {
- ret = ip_vs_est_kthread_start(ipvs, kd);
- if (ret < 0)
- goto out;
- }
+ /* Request kthread to be started */
+ ip_vs_est_reload_start(ipvs, false);
if (arr)
ipvs->est_kt_count++;
@@ -482,12 +492,11 @@ out:
/* Start estimation for stats */
int ip_vs_start_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats)
{
+ struct ip_vs_est_kt_data *kd = ipvs->est_kt_count > 0 ?
+ ipvs->est_kt_arr[0] : NULL;
struct ip_vs_estimator *est = &stats->est;
int ret;
- if (!ipvs->est_max_threads && READ_ONCE(ipvs->enable))
- ipvs->est_max_threads = ip_vs_est_max_threads(ipvs);
-
est->ktid = -1;
est->ktrow = IPVS_EST_NTICKS - 1; /* Initial delay */
@@ -496,8 +505,15 @@ int ip_vs_start_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats)
* will not allocate much memory, just for kt 0.
*/
ret = 0;
- if (!ipvs->est_kt_count || !ipvs->est_kt_arr[0])
+ if (!kd) {
ret = ip_vs_est_add_kthread(ipvs);
+ } else if (!kd->needed) {
+ mutex_lock(&ipvs->est_mutex);
+ /* We have job for the kt 0 task */
+ kd->needed = 1;
+ ip_vs_est_reload_start(ipvs, true);
+ mutex_unlock(&ipvs->est_mutex);
+ }
if (ret >= 0)
hlist_add_head(&est->list, &ipvs->est_temp_list);
else
@@ -578,16 +594,14 @@ void ip_vs_stop_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats)
}
end_kt0:
- /* kt 0 is freed after all other kthreads and chains are empty */
+ /* kt 0 task is stopped after all other kt slots and chains are empty */
if (ipvs->est_kt_count == 1 && hlist_empty(&ipvs->est_temp_list)) {
kd = ipvs->est_kt_arr[0];
- if (!kd || !kd->est_count) {
+ if (kd && !kd->est_count) {
mutex_lock(&ipvs->est_mutex);
- if (kd) {
- ip_vs_est_kthread_destroy(kd);
- ipvs->est_kt_arr[0] = NULL;
- }
- ipvs->est_kt_count--;
+ /* Keep the kt0 data but request kthread_stop */
+ kd->needed = 0;
+ ip_vs_est_reload_start(ipvs, true);
mutex_unlock(&ipvs->est_mutex);
ipvs->est_add_ktid = 0;
}
@@ -647,9 +661,9 @@ static int ip_vs_est_calc_limits(struct netns_ipvs *ipvs, int *chain_max)
u64 val;
INIT_HLIST_HEAD(&chain);
- mutex_lock(&ipvs->service_mutex);
+ mutex_lock(&ipvs->est_mutex);
kd = ipvs->est_kt_arr[0];
- mutex_unlock(&ipvs->service_mutex);
+ mutex_unlock(&ipvs->est_mutex);
s = kd ? kd->calc_stats : NULL;
if (!s)
goto out;
@@ -748,16 +762,16 @@ static void ip_vs_est_calc_phase(struct netns_ipvs *ipvs)
if (!ip_vs_est_calc_limits(ipvs, &chain_max))
return;
- mutex_lock(&ipvs->service_mutex);
-
/* Stop all other tasks, so that we can immediately move the
* estimators to est_temp_list without RCU grace period
*/
mutex_lock(&ipvs->est_mutex);
for (id = 1; id < ipvs->est_kt_count; id++) {
/* netns clean up started, abort */
- if (!READ_ONCE(ipvs->enable))
- goto unlock2;
+ if (kthread_should_stop() || !READ_ONCE(ipvs->enable)) {
+ mutex_unlock(&ipvs->est_mutex);
+ return;
+ }
kd = ipvs->est_kt_arr[id];
if (!kd)
continue;
@@ -765,9 +779,11 @@ static void ip_vs_est_calc_phase(struct netns_ipvs *ipvs)
}
mutex_unlock(&ipvs->est_mutex);
+ mutex_lock(&ipvs->service_mutex);
+
/* Move all estimators to est_temp_list but carefully,
* all estimators and kthread data can be released while
- * we reschedule. Even for kthread 0.
+ * we reschedule.
*/
step = 0;
@@ -849,9 +865,7 @@ walk_chain:
ip_vs_stop_estimator(ipvs, stats);
/* Tasks are stopped, move without RCU grace period */
est->ktid = -1;
- est->ktrow = row - kd->est_row;
- if (est->ktrow < 0)
- est->ktrow += IPVS_EST_NTICKS;
+ est->ktrow = delay;
hlist_add_head(&est->list, &ipvs->est_temp_list);
/* kd freed ? */
if (last)
@@ -889,7 +903,6 @@ end_dequeue:
if (genid == atomic_read(&ipvs->est_genid))
ipvs->est_calc_phase = 0;
-unlock2:
mutex_unlock(&ipvs->est_mutex);
unlock:
diff --git a/net/netfilter/ipvs/ip_vs_sched.c b/net/netfilter/ipvs/ip_vs_sched.c
index c6e421c4e299..24adc38942a0 100644
--- a/net/netfilter/ipvs/ip_vs_sched.c
+++ b/net/netfilter/ipvs/ip_vs_sched.c
@@ -56,19 +56,19 @@ int ip_vs_bind_scheduler(struct ip_vs_service *svc,
/*
* Unbind a service with its scheduler
*/
-void ip_vs_unbind_scheduler(struct ip_vs_service *svc,
- struct ip_vs_scheduler *sched)
+void ip_vs_unbind_scheduler(struct ip_vs_service *svc)
{
- struct ip_vs_scheduler *cur_sched;
+ struct ip_vs_scheduler *sched;
- cur_sched = rcu_dereference_protected(svc->scheduler, 1);
- /* This check proves that old 'sched' was installed */
- if (!cur_sched)
+ sched = rcu_dereference_protected(svc->scheduler, 1);
+ if (!sched)
return;
+ /* Reset the scheduler before initiating any RCU callbacks */
+ rcu_assign_pointer(svc->scheduler, NULL);
+ smp_wmb(); /* paired with smp_rmb() in ip_vs_schedule() */
if (sched->done_service)
sched->done_service(svc);
- /* svc->scheduler can be set to NULL only by caller */
}
diff --git a/net/netfilter/nf_conntrack_broadcast.c b/net/netfilter/nf_conntrack_broadcast.c
index 4f39bf7c843f..75e53fde6b29 100644
--- a/net/netfilter/nf_conntrack_broadcast.c
+++ b/net/netfilter/nf_conntrack_broadcast.c
@@ -72,6 +72,7 @@ int nf_conntrack_broadcast_help(struct sk_buff *skb,
exp->flags = NF_CT_EXPECT_PERMANENT;
exp->class = NF_CT_EXPECT_CLASS_DEFAULT;
rcu_assign_pointer(exp->helper, helper);
+ rcu_assign_pointer(exp->assign_helper, NULL);
write_pnet(&exp->net, net);
#ifdef CONFIG_NF_CONNTRACK_ZONES
exp->zone = ct->zone;
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index b08189226320..b521b5ebd664 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -568,6 +568,13 @@ static void destroy_gre_conntrack(struct nf_conn *ct)
#endif
}
+static void warn_on_keymap_list_leak(const struct net *net)
+{
+#ifdef CONFIG_NF_CT_PROTO_GRE
+ WARN_ON_ONCE(!list_empty(&net->ct.nf_ct_proto.gre.keymap_list));
+#endif
+}
+
void nf_ct_destroy(struct nf_conntrack *nfct)
{
struct nf_conn *ct = (struct nf_conn *)nfct;
@@ -1811,14 +1818,17 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
spin_lock_bh(&nf_conntrack_expect_lock);
exp = nf_ct_find_expectation(net, zone, tuple, !tmpl || nf_ct_is_confirmed(tmpl));
if (exp) {
+ struct nf_conntrack_helper *assign_helper;
+
/* Welcome, Mr. Bond. We've been expecting you... */
__set_bit(IPS_EXPECTED_BIT, &ct->status);
/* exp->master safe, refcnt bumped in nf_ct_find_expectation */
ct->master = exp->master;
- if (exp->helper) {
+ assign_helper = rcu_dereference(exp->assign_helper);
+ if (assign_helper) {
help = nf_ct_helper_ext_add(ct, GFP_ATOMIC);
if (help)
- rcu_assign_pointer(help->helper, exp->helper);
+ rcu_assign_pointer(help->helper, assign_helper);
}
#ifdef CONFIG_NF_CONNTRACK_MARK
@@ -2507,6 +2517,7 @@ i_see_dead_people:
}
list_for_each_entry(net, net_exit_list, exit_list) {
+ warn_on_keymap_list_leak(net);
nf_conntrack_ecache_pernet_fini(net);
nf_conntrack_expect_pernet_fini(net);
free_percpu(net->ct.stat);
diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c
index 24d0576d84b7..8e943efbdf0a 100644
--- a/net/netfilter/nf_conntrack_expect.c
+++ b/net/netfilter/nf_conntrack_expect.c
@@ -344,6 +344,7 @@ void nf_ct_expect_init(struct nf_conntrack_expect *exp, unsigned int class,
helper = rcu_dereference(help->helper);
rcu_assign_pointer(exp->helper, helper);
+ rcu_assign_pointer(exp->assign_helper, NULL);
write_pnet(&exp->net, net);
#ifdef CONFIG_NF_CONNTRACK_ZONES
exp->zone = ct->zone;
diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c
index 3f5c50455b71..b2fe6554b9cf 100644
--- a/net/netfilter/nf_conntrack_h323_main.c
+++ b/net/netfilter/nf_conntrack_h323_main.c
@@ -643,7 +643,7 @@ static int expect_h245(struct sk_buff *skb, struct nf_conn *ct,
&ct->tuplehash[!dir].tuple.src.u3,
&ct->tuplehash[!dir].tuple.dst.u3,
IPPROTO_TCP, NULL, &port);
- rcu_assign_pointer(exp->helper, &nf_conntrack_helper_h245);
+ rcu_assign_pointer(exp->assign_helper, &nf_conntrack_helper_h245);
nathook = rcu_dereference(nfct_h323_nat_hook);
if (memcmp(&ct->tuplehash[dir].tuple.src.u3,
@@ -767,7 +767,7 @@ static int expect_callforwarding(struct sk_buff *skb,
nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct),
&ct->tuplehash[!dir].tuple.src.u3, &addr,
IPPROTO_TCP, NULL, &port);
- rcu_assign_pointer(exp->helper, nf_conntrack_helper_q931);
+ rcu_assign_pointer(exp->assign_helper, nf_conntrack_helper_q931);
nathook = rcu_dereference(nfct_h323_nat_hook);
if (memcmp(&ct->tuplehash[dir].tuple.src.u3,
@@ -1234,7 +1234,7 @@ static int expect_q931(struct sk_buff *skb, struct nf_conn *ct,
&ct->tuplehash[!dir].tuple.src.u3 : NULL,
&ct->tuplehash[!dir].tuple.dst.u3,
IPPROTO_TCP, NULL, &port);
- rcu_assign_pointer(exp->helper, nf_conntrack_helper_q931);
+ rcu_assign_pointer(exp->assign_helper, nf_conntrack_helper_q931);
exp->flags = NF_CT_EXPECT_PERMANENT; /* Accept multiple calls */
nathook = rcu_dereference(nfct_h323_nat_hook);
@@ -1306,7 +1306,7 @@ static int process_gcf(struct sk_buff *skb, struct nf_conn *ct,
nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct),
&ct->tuplehash[!dir].tuple.src.u3, &addr,
IPPROTO_UDP, NULL, &port);
- rcu_assign_pointer(exp->helper, nf_conntrack_helper_ras);
+ rcu_assign_pointer(exp->assign_helper, nf_conntrack_helper_ras);
if (nf_ct_expect_related(exp, 0) == 0) {
pr_debug("nf_ct_ras: expect RAS ");
@@ -1523,7 +1523,7 @@ static int process_acf(struct sk_buff *skb, struct nf_conn *ct,
&ct->tuplehash[!dir].tuple.src.u3, &addr,
IPPROTO_TCP, NULL, &port);
exp->flags = NF_CT_EXPECT_PERMANENT;
- rcu_assign_pointer(exp->helper, nf_conntrack_helper_q931);
+ rcu_assign_pointer(exp->assign_helper, nf_conntrack_helper_q931);
if (nf_ct_expect_related(exp, 0) == 0) {
pr_debug("nf_ct_ras: expect Q.931 ");
@@ -1577,7 +1577,7 @@ static int process_lcf(struct sk_buff *skb, struct nf_conn *ct,
&ct->tuplehash[!dir].tuple.src.u3, &addr,
IPPROTO_TCP, NULL, &port);
exp->flags = NF_CT_EXPECT_PERMANENT;
- rcu_assign_pointer(exp->helper, nf_conntrack_helper_q931);
+ rcu_assign_pointer(exp->assign_helper, nf_conntrack_helper_q931);
if (nf_ct_expect_related(exp, 0) == 0) {
pr_debug("nf_ct_ras: expect Q.931 ");
diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c
index a715304a53d8..17e971bd4c74 100644
--- a/net/netfilter/nf_conntrack_helper.c
+++ b/net/netfilter/nf_conntrack_helper.c
@@ -321,8 +321,8 @@ __printf(3, 4)
void nf_ct_helper_log(struct sk_buff *skb, const struct nf_conn *ct,
const char *fmt, ...)
{
+ const char *helper_name = "(null)";
const struct nf_conn_help *help;
- const struct nf_conntrack_helper *helper;
struct va_format vaf;
va_list args;
@@ -331,14 +331,17 @@ void nf_ct_helper_log(struct sk_buff *skb, const struct nf_conn *ct,
vaf.fmt = fmt;
vaf.va = &args;
- /* Called from the helper function, this call never fails */
help = nfct_help(ct);
+ if (help) {
+ const struct nf_conntrack_helper *helper;
- /* rcu_read_lock()ed by nf_hook_thresh */
- helper = rcu_dereference(help->helper);
+ helper = rcu_dereference(help->helper);
+ if (helper)
+ helper_name = helper->name;
+ }
nf_log_packet(nf_ct_net(ct), nf_ct_l3num(ct), 0, skb, NULL, NULL, NULL,
- "nf_ct_%s: dropping packet: %pV ", helper->name, &vaf);
+ "helper %s dropping packet: %pV ", helper_name, &vaf);
va_end(args);
}
@@ -400,6 +403,11 @@ static bool expect_iter_me(struct nf_conntrack_expect *exp, void *data)
this = rcu_dereference_protected(exp->helper,
lockdep_is_held(&nf_conntrack_expect_lock));
+ if (this == me)
+ return true;
+
+ this = rcu_dereference_protected(exp->assign_helper,
+ lockdep_is_held(&nf_conntrack_expect_lock));
return this == me;
}
diff --git a/net/netfilter/nf_conntrack_irc.c b/net/netfilter/nf_conntrack_irc.c
index 522183b9a604..2ebe4cb47cf6 100644
--- a/net/netfilter/nf_conntrack_irc.c
+++ b/net/netfilter/nf_conntrack_irc.c
@@ -203,7 +203,7 @@ static int help(struct sk_buff *skb, unsigned int protoff,
if (parse_dcc(data, data_limit, &dcc_ip,
&dcc_port, &addr_beg_p, &addr_end_p)) {
pr_debug("unable to parse dcc command\n");
- continue;
+ goto out;
}
pr_debug("DCC bound ip/port: %pI4:%u\n",
@@ -217,7 +217,7 @@ static int help(struct sk_buff *skb, unsigned int protoff,
net_warn_ratelimited("Forged DCC command from %pI4: %pI4:%u\n",
&tuple->src.u3.ip,
&dcc_ip, dcc_port);
- continue;
+ goto out;
}
exp = nf_ct_expect_alloc(ct);
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index eda5fe4a75c8..befa7e83ee49 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -2634,6 +2634,7 @@ static const struct nla_policy exp_nla_policy[CTA_EXPECT_MAX+1] = {
static struct nf_conntrack_expect *
ctnetlink_alloc_expect(const struct nlattr *const cda[], struct nf_conn *ct,
+ const struct nf_conntrack_helper *assign_helper,
struct nf_conntrack_tuple *tuple,
struct nf_conntrack_tuple *mask);
@@ -2860,6 +2861,7 @@ static int
ctnetlink_glue_attach_expect(const struct nlattr *attr, struct nf_conn *ct,
u32 portid, u32 report)
{
+ struct nf_conntrack_helper *assign_helper = NULL;
struct nlattr *cda[CTA_EXPECT_MAX+1];
struct nf_conntrack_tuple tuple, mask;
struct nf_conntrack_expect *exp;
@@ -2870,13 +2872,26 @@ ctnetlink_glue_attach_expect(const struct nlattr *attr, struct nf_conn *ct,
if (err < 0)
return err;
+ if (!cda[CTA_EXPECT_TUPLE] || !cda[CTA_EXPECT_MASK])
+ return -EINVAL;
+
err = ctnetlink_glue_exp_parse((const struct nlattr * const *)cda,
ct, &tuple, &mask);
if (err < 0)
return err;
+ if (cda[CTA_EXPECT_HELP_NAME]) {
+ const char *helpname = nla_data(cda[CTA_EXPECT_HELP_NAME]);
+
+ assign_helper = __nf_conntrack_helper_find(helpname,
+ nf_ct_l3num(ct),
+ tuple.dst.protonum);
+ if (!assign_helper)
+ return -EOPNOTSUPP;
+ }
+
exp = ctnetlink_alloc_expect((const struct nlattr * const *)cda, ct,
- &tuple, &mask);
+ assign_helper, &tuple, &mask);
if (IS_ERR(exp))
return PTR_ERR(exp);
@@ -3515,6 +3530,7 @@ ctnetlink_parse_expect_nat(const struct nlattr *attr,
static struct nf_conntrack_expect *
ctnetlink_alloc_expect(const struct nlattr * const cda[], struct nf_conn *ct,
+ const struct nf_conntrack_helper *assign_helper,
struct nf_conntrack_tuple *tuple,
struct nf_conntrack_tuple *mask)
{
@@ -3568,6 +3584,7 @@ ctnetlink_alloc_expect(const struct nlattr * const cda[], struct nf_conn *ct,
exp->zone = ct->zone;
#endif
rcu_assign_pointer(exp->helper, helper);
+ rcu_assign_pointer(exp->assign_helper, assign_helper);
exp->tuple = *tuple;
exp->mask.src.u3 = mask->src.u3;
exp->mask.src.u.all = mask->src.u.all;
@@ -3623,7 +3640,7 @@ ctnetlink_create_expect(struct net *net,
ct = nf_ct_tuplehash_to_ctrack(h);
rcu_read_lock();
- exp = ctnetlink_alloc_expect(cda, ct, &tuple, &mask);
+ exp = ctnetlink_alloc_expect(cda, ct, NULL, &tuple, &mask);
if (IS_ERR(exp)) {
err = PTR_ERR(exp);
goto err_rcu;
diff --git a/net/netfilter/nf_conntrack_pptp.c b/net/netfilter/nf_conntrack_pptp.c
index 4c679638df06..dc23e4181618 100644
--- a/net/netfilter/nf_conntrack_pptp.c
+++ b/net/netfilter/nf_conntrack_pptp.c
@@ -225,13 +225,9 @@ static int exp_gre(struct nf_conn *ct, __be16 callid, __be16 peer_callid)
if (nf_ct_expect_related(exp_reply, 0) != 0)
goto out_unexpect_orig;
- /* Add GRE keymap entries */
- if (nf_ct_gre_keymap_add(ct, IP_CT_DIR_ORIGINAL, &exp_orig->tuple) != 0)
+ if (!nf_ct_gre_keymap_add(ct, &exp_orig->tuple,
+ &exp_reply->tuple))
goto out_unexpect_both;
- if (nf_ct_gre_keymap_add(ct, IP_CT_DIR_REPLY, &exp_reply->tuple) != 0) {
- nf_ct_gre_keymap_destroy(ct);
- goto out_unexpect_both;
- }
ret = 0;
out_put_both:
diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c
index 94c19bc4edc5..35e22082d65a 100644
--- a/net/netfilter/nf_conntrack_proto_gre.c
+++ b/net/netfilter/nf_conntrack_proto_gre.c
@@ -87,41 +87,97 @@ static __be16 gre_keymap_lookup(struct net *net, struct nf_conntrack_tuple *t)
return key;
}
-/* add a single keymap entry, associate with specified master ct */
-int nf_ct_gre_keymap_add(struct nf_conn *ct, enum ip_conntrack_dir dir,
- struct nf_conntrack_tuple *t)
+enum nf_ct_gre_km_act {
+ NF_CT_GRE_KM_NEW,
+ NF_CT_GRE_KM_BAD,
+ NF_CT_GRE_KM_DUP
+};
+
+static enum nf_ct_gre_km_act
+nf_ct_gre_km_acceptable(const struct nf_ct_pptp_master *ct_pptp_info,
+ const struct nf_conntrack_tuple *orig,
+ const struct nf_conntrack_tuple *repl)
+{
+ struct nf_ct_gre_keymap *km_orig, *km_repl;
+
+ lockdep_assert_held(&keymap_lock);
+
+ km_orig = ct_pptp_info->keymap[IP_CT_DIR_ORIGINAL];
+ km_repl = ct_pptp_info->keymap[IP_CT_DIR_REPLY];
+
+ if (km_orig && km_repl) {
+ if (!gre_key_cmpfn(km_orig, orig))
+ return NF_CT_GRE_KM_BAD;
+
+ if (!gre_key_cmpfn(km_repl, repl))
+ return NF_CT_GRE_KM_BAD;
+
+ return NF_CT_GRE_KM_DUP;
+ }
+
+ DEBUG_NET_WARN_ON_ONCE(km_orig);
+ DEBUG_NET_WARN_ON_ONCE(km_repl);
+ return NF_CT_GRE_KM_NEW;
+}
+
+/* add keymap entries, associate with specified master ct */
+bool nf_ct_gre_keymap_add(struct nf_conn *ct,
+ const struct nf_conntrack_tuple *orig,
+ const struct nf_conntrack_tuple *repl)
{
struct net *net = nf_ct_net(ct);
struct nf_gre_net *net_gre = gre_pernet(net);
struct nf_ct_pptp_master *ct_pptp_info = nfct_help_data(ct);
- struct nf_ct_gre_keymap **kmp, *km;
-
- kmp = &ct_pptp_info->keymap[dir];
- if (*kmp) {
- /* check whether it's a retransmission */
- list_for_each_entry_rcu(km, &net_gre->keymap_list, list) {
- if (gre_key_cmpfn(km, t) && km == *kmp)
- return 0;
- }
- pr_debug("trying to override keymap_%s for ct %p\n",
- dir == IP_CT_DIR_REPLY ? "reply" : "orig", ct);
- return -EEXIST;
- }
+ struct nf_ct_gre_keymap *km_orig, *km_repl;
+ bool ret = false;
- km = kmalloc_obj(*km, GFP_ATOMIC);
- if (!km)
- return -ENOMEM;
- memcpy(&km->tuple, t, sizeof(*t));
- *kmp = km;
+ km_orig = kmalloc_obj(*km_orig, GFP_ATOMIC);
+ if (!km_orig)
+ return false;
+ km_repl = kmalloc_obj(*km_repl, GFP_ATOMIC);
+ if (!km_repl)
+ goto km_free;
- pr_debug("adding new entry %p: ", km);
- nf_ct_dump_tuple(&km->tuple);
+ memcpy(&km_orig->tuple, orig, sizeof(*orig));
+ memcpy(&km_repl->tuple, repl, sizeof(*repl));
spin_lock_bh(&keymap_lock);
- list_add_tail(&km->list, &net_gre->keymap_list);
+ if (nf_ct_is_dying(ct))
+ goto unlock_free;
+
+ switch (nf_ct_gre_km_acceptable(ct_pptp_info, orig, repl)) {
+ case NF_CT_GRE_KM_NEW:
+ break;
+ case NF_CT_GRE_KM_DUP:
+ ret = true;
+ goto unlock_free;
+ case NF_CT_GRE_KM_BAD:
+ pr_debug("trying to override keymap for ct %p\n", ct);
+ goto unlock_free;
+ }
+
+ if (ct_pptp_info->keymap[IP_CT_DIR_ORIGINAL] ||
+ ct_pptp_info->keymap[IP_CT_DIR_REPLY])
+ goto unlock_free;
+
+ pr_debug("adding new entries %p,%p: ", km_orig, km_repl);
+ nf_ct_dump_tuple(&km_orig->tuple);
+ nf_ct_dump_tuple(&km_repl->tuple);
+
+ list_add_tail_rcu(&km_orig->list, &net_gre->keymap_list);
+ list_add_tail_rcu(&km_repl->list, &net_gre->keymap_list);
+ ct_pptp_info->keymap[IP_CT_DIR_ORIGINAL] = km_orig;
+ ct_pptp_info->keymap[IP_CT_DIR_REPLY] = km_repl;
spin_unlock_bh(&keymap_lock);
- return 0;
+ return true;
+
+unlock_free:
+ spin_unlock_bh(&keymap_lock);
+km_free:
+ kfree(km_orig);
+ kfree(km_repl);
+ return ret;
}
EXPORT_SYMBOL_GPL(nf_ct_gre_keymap_add);
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index b67426c2189b..e99ab1e88e9f 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -1221,7 +1221,8 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct,
new_state = old_state;
}
if (((test_bit(IPS_SEEN_REPLY_BIT, &ct->status)
- && ct->proto.tcp.last_index == TCP_SYN_SET)
+ && ct->proto.tcp.last_index == TCP_SYN_SET
+ && ct->proto.tcp.last_dir != dir)
|| (!test_bit(IPS_ASSURED_BIT, &ct->status)
&& ct->proto.tcp.last_index == TCP_ACK_SET))
&& ntohl(th->ack_seq) == ct->proto.tcp.last_end) {
diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c
index 1eb55907d470..e69941f1a101 100644
--- a/net/netfilter/nf_conntrack_sip.c
+++ b/net/netfilter/nf_conntrack_sip.c
@@ -1366,6 +1366,10 @@ static int process_register_request(struct sk_buff *skb, unsigned int protoff,
goto store_cseq;
}
+ helper = rcu_dereference(nfct_help(ct)->helper);
+ if (!helper)
+ return NF_DROP;
+
exp = nf_ct_expect_alloc(ct);
if (!exp) {
nf_ct_helper_log(skb, ct, "cannot alloc expectation");
@@ -1376,14 +1380,10 @@ static int process_register_request(struct sk_buff *skb, unsigned int protoff,
if (sip_direct_signalling)
saddr = &ct->tuplehash[!dir].tuple.src.u3;
- helper = rcu_dereference(nfct_help(ct)->helper);
- if (!helper)
- return NF_DROP;
-
nf_ct_expect_init(exp, SIP_EXPECT_SIGNALLING, nf_ct_l3num(ct),
saddr, &daddr, proto, NULL, &port);
exp->timeout.expires = sip_timeout * HZ;
- rcu_assign_pointer(exp->helper, helper);
+ rcu_assign_pointer(exp->assign_helper, helper);
exp->flags = NF_CT_EXPECT_PERMANENT | NF_CT_EXPECT_INACTIVE;
hooks = rcu_dereference(nf_nat_sip_hooks);
diff --git a/net/netfilter/nf_dup_netdev.c b/net/netfilter/nf_dup_netdev.c
index e348fb90b8dc..3b0a70e154cd 100644
--- a/net/netfilter/nf_dup_netdev.c
+++ b/net/netfilter/nf_dup_netdev.c
@@ -13,22 +13,6 @@
#include <net/netfilter/nf_tables_offload.h>
#include <net/netfilter/nf_dup_netdev.h>
-#define NF_RECURSION_LIMIT 2
-
-#ifndef CONFIG_PREEMPT_RT
-static u8 *nf_get_nf_dup_skb_recursion(void)
-{
- return this_cpu_ptr(&softnet_data.xmit.nf_dup_skb_recursion);
-}
-#else
-
-static u8 *nf_get_nf_dup_skb_recursion(void)
-{
- return &current->net_xmit.nf_dup_skb_recursion;
-}
-
-#endif
-
static void nf_do_netdev_egress(struct sk_buff *skb, struct net_device *dev,
enum nf_dev_hooks hook)
{
diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c
index 2c4140e6f53c..785d8c244a77 100644
--- a/net/netfilter/nf_flow_table_core.c
+++ b/net/netfilter/nf_flow_table_core.c
@@ -122,6 +122,7 @@ static int flow_offload_fill_route(struct flow_offload *flow,
flow_tuple->tun = route->tuple[dir].in.tun;
flow_tuple->encap_num = route->tuple[dir].in.num_encaps;
+ flow_tuple->needs_gso_segment = route->tuple[dir].out.needs_gso_segment;
flow_tuple->tun_num = route->tuple[dir].in.num_tuns;
switch (route->tuple[dir].xmit_type) {
diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c
index fd56d663cb5b..9c05a50d6013 100644
--- a/net/netfilter/nf_flow_table_ip.c
+++ b/net/netfilter/nf_flow_table_ip.c
@@ -445,13 +445,13 @@ static void nf_flow_encap_pop(struct nf_flowtable_ctx *ctx,
switch (skb->protocol) {
case htons(ETH_P_8021Q):
vlan_hdr = (struct vlan_hdr *)skb->data;
- __skb_pull(skb, VLAN_HLEN);
+ skb_pull_rcsum(skb, VLAN_HLEN);
vlan_set_encap_proto(skb, vlan_hdr);
skb_reset_network_header(skb);
break;
case htons(ETH_P_PPP_SES):
skb->protocol = __nf_flow_pppoe_proto(skb);
- skb_pull(skb, PPPOE_SES_HLEN);
+ skb_pull_rcsum(skb, PPPOE_SES_HLEN);
skb_reset_network_header(skb);
break;
}
@@ -462,23 +462,6 @@ static void nf_flow_encap_pop(struct nf_flowtable_ctx *ctx,
nf_flow_ip_tunnel_pop(ctx, skb);
}
-struct nf_flow_xmit {
- const void *dest;
- const void *source;
- struct net_device *outdev;
-};
-
-static unsigned int nf_flow_queue_xmit(struct net *net, struct sk_buff *skb,
- struct nf_flow_xmit *xmit)
-{
- skb->dev = xmit->outdev;
- dev_hard_header(skb, skb->dev, ntohs(skb->protocol),
- xmit->dest, xmit->source, skb->len);
- dev_queue_xmit(skb);
-
- return NF_STOLEN;
-}
-
static struct flow_offload_tuple_rhash *
nf_flow_offload_lookup(struct nf_flowtable_ctx *ctx,
struct nf_flowtable *flow_table, struct sk_buff *skb)
@@ -524,7 +507,7 @@ static int nf_flow_offload_forward(struct nf_flowtable_ctx *ctx,
return 0;
}
- if (skb_try_make_writable(skb, thoff + ctx->hdrsize))
+ if (skb_ensure_writable(skb, thoff + ctx->hdrsize))
return -1;
flow_offload_refresh(flow_table, flow, false);
@@ -544,7 +527,34 @@ static int nf_flow_offload_forward(struct nf_flowtable_ctx *ctx,
return 1;
}
-static int nf_flow_pppoe_push(struct sk_buff *skb, u16 id)
+/* Similar to skb_vlan_push. */
+static int nf_flow_vlan_push(struct sk_buff *skb, __be16 proto, u16 id,
+ u32 needed_headroom)
+{
+ if (skb_vlan_tag_present(skb)) {
+ struct vlan_hdr *vhdr;
+
+ if (skb_cow_head(skb, needed_headroom + VLAN_HLEN))
+ return -1;
+
+ __skb_push(skb, VLAN_HLEN);
+ if (skb_mac_header_was_set(skb))
+ skb->mac_header -= VLAN_HLEN;
+
+ vhdr = (struct vlan_hdr *)skb->data;
+ skb->network_header -= VLAN_HLEN;
+ vhdr->h_vlan_TCI = htons(skb_vlan_tag_get(skb));
+ vhdr->h_vlan_encapsulated_proto = skb->protocol;
+ skb->protocol = skb->vlan_proto;
+ skb_postpush_rcsum(skb, skb->data, VLAN_HLEN);
+ }
+ __vlan_hwaccel_put_tag(skb, proto, id);
+
+ return 0;
+}
+
+static int nf_flow_pppoe_push(struct sk_buff *skb, u16 id,
+ u32 needed_headroom)
{
int data_len = skb->len + sizeof(__be16);
struct ppp_hdr {
@@ -553,7 +563,7 @@ static int nf_flow_pppoe_push(struct sk_buff *skb, u16 id)
} *ph;
__be16 proto;
- if (skb_cow_head(skb, PPPOE_SES_HLEN))
+ if (skb_cow_head(skb, needed_headroom + PPPOE_SES_HLEN))
return -1;
switch (skb->protocol) {
@@ -730,21 +740,24 @@ static int nf_flow_tunnel_v6_push(struct net *net, struct sk_buff *skb,
}
static int nf_flow_encap_push(struct sk_buff *skb,
- struct flow_offload_tuple *tuple)
+ struct flow_offload_tuple *tuple,
+ struct net_device *outdev)
{
+ u32 needed_headroom = LL_RESERVED_SPACE(outdev);
int i;
- for (i = 0; i < tuple->encap_num; i++) {
+ for (i = tuple->encap_num - 1; i >= 0; i--) {
switch (tuple->encap[i].proto) {
case htons(ETH_P_8021Q):
case htons(ETH_P_8021AD):
- skb_reset_mac_header(skb);
- if (skb_vlan_push(skb, tuple->encap[i].proto,
- tuple->encap[i].id) < 0)
+ if (nf_flow_vlan_push(skb, tuple->encap[i].proto,
+ tuple->encap[i].id,
+ needed_headroom) < 0)
return -1;
break;
case htons(ETH_P_PPP_SES):
- if (nf_flow_pppoe_push(skb, tuple->encap[i].id) < 0)
+ if (nf_flow_pppoe_push(skb, tuple->encap[i].id,
+ needed_headroom) < 0)
return -1;
break;
}
@@ -753,6 +766,76 @@ static int nf_flow_encap_push(struct sk_buff *skb,
return 0;
}
+struct nf_flow_xmit {
+ const void *dest;
+ const void *source;
+ struct net_device *outdev;
+ struct flow_offload_tuple *tuple;
+ bool needs_gso_segment;
+};
+
+static void __nf_flow_queue_xmit(struct net *net, struct sk_buff *skb,
+ struct nf_flow_xmit *xmit)
+{
+ struct net_device *dev = xmit->outdev;
+ unsigned int hh_len = LL_RESERVED_SPACE(dev);
+
+ if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
+ skb = skb_expand_head(skb, hh_len);
+ if (!skb)
+ return;
+ }
+
+ skb->dev = dev;
+ dev_hard_header(skb, dev, ntohs(skb->protocol),
+ xmit->dest, xmit->source, skb->len);
+ dev_queue_xmit(skb);
+}
+
+static unsigned int nf_flow_encap_gso_xmit(struct net *net, struct sk_buff *skb,
+ struct nf_flow_xmit *xmit)
+{
+ struct sk_buff *segs, *nskb;
+
+ segs = skb_gso_segment(skb, 0);
+ if (IS_ERR(segs))
+ return NF_DROP;
+
+ if (segs)
+ consume_skb(skb);
+ else
+ segs = skb;
+
+ skb_list_walk_safe(segs, segs, nskb) {
+ skb_mark_not_on_list(segs);
+
+ if (nf_flow_encap_push(segs, xmit->tuple, xmit->outdev) < 0) {
+ kfree_skb(segs);
+ kfree_skb_list(nskb);
+ return NF_STOLEN;
+ }
+ __nf_flow_queue_xmit(net, segs, xmit);
+ }
+
+ return NF_STOLEN;
+}
+
+static unsigned int nf_flow_queue_xmit(struct net *net, struct sk_buff *skb,
+ struct nf_flow_xmit *xmit)
+{
+ if (xmit->tuple->encap_num) {
+ if (skb_is_gso(skb) && xmit->needs_gso_segment)
+ return nf_flow_encap_gso_xmit(net, skb, xmit);
+
+ if (nf_flow_encap_push(skb, xmit->tuple, xmit->outdev) < 0)
+ return NF_DROP;
+ }
+
+ __nf_flow_queue_xmit(net, skb, xmit);
+
+ return NF_STOLEN;
+}
+
unsigned int
nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state)
@@ -797,9 +880,6 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
if (nf_flow_tunnel_v4_push(state->net, skb, other_tuple, &ip_daddr) < 0)
return NF_DROP;
- if (nf_flow_encap_push(skb, other_tuple) < 0)
- return NF_DROP;
-
switch (tuplehash->tuple.xmit_type) {
case FLOW_OFFLOAD_XMIT_NEIGH:
rt = dst_rtable(tuplehash->tuple.dst_cache);
@@ -829,6 +909,8 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
WARN_ON_ONCE(1);
return NF_DROP;
}
+ xmit.tuple = other_tuple;
+ xmit.needs_gso_segment = tuplehash->tuple.needs_gso_segment;
return nf_flow_queue_xmit(state->net, skb, &xmit);
}
@@ -1037,7 +1119,7 @@ static int nf_flow_offload_ipv6_forward(struct nf_flowtable_ctx *ctx,
return 0;
}
- if (skb_try_make_writable(skb, thoff + ctx->hdrsize))
+ if (skb_ensure_writable(skb, thoff + ctx->hdrsize))
return -1;
flow_offload_refresh(flow_table, flow, false);
@@ -1119,9 +1201,6 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb,
&ip6_daddr, encap_limit) < 0)
return NF_DROP;
- if (nf_flow_encap_push(skb, other_tuple) < 0)
- return NF_DROP;
-
switch (tuplehash->tuple.xmit_type) {
case FLOW_OFFLOAD_XMIT_NEIGH:
rt = dst_rt6_info(tuplehash->tuple.dst_cache);
@@ -1151,6 +1230,8 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb,
WARN_ON_ONCE(1);
return NF_DROP;
}
+ xmit.tuple = other_tuple;
+ xmit.needs_gso_segment = tuplehash->tuple.needs_gso_segment;
return nf_flow_queue_xmit(state->net, skb, &xmit);
}
diff --git a/net/netfilter/nf_flow_table_path.c b/net/netfilter/nf_flow_table_path.c
index 6bb9579dcc2a..9e88ea6a2eef 100644
--- a/net/netfilter/nf_flow_table_path.c
+++ b/net/netfilter/nf_flow_table_path.c
@@ -86,6 +86,7 @@ struct nft_forward_info {
u8 ingress_vlans;
u8 h_source[ETH_ALEN];
u8 h_dest[ETH_ALEN];
+ bool needs_gso_segment;
enum flow_offload_xmit_type xmit_type;
};
@@ -138,8 +139,11 @@ static void nft_dev_path_info(const struct net_device_path_stack *stack,
path->encap.proto;
info->num_encaps++;
}
- if (path->type == DEV_PATH_PPPOE)
+ if (path->type == DEV_PATH_PPPOE) {
memcpy(info->h_dest, path->encap.h_dest, ETH_ALEN);
+ info->xmit_type = FLOW_OFFLOAD_XMIT_DIRECT;
+ info->needs_gso_segment = 1;
+ }
break;
case DEV_PATH_BRIDGE:
if (is_zero_ether_addr(info->h_source))
@@ -279,6 +283,7 @@ static void nft_dev_forward_path(const struct nft_pktinfo *pkt,
memcpy(route->tuple[dir].out.h_dest, info.h_dest, ETH_ALEN);
route->tuple[dir].xmit_type = info.xmit_type;
}
+ route->tuple[dir].out.needs_gso_segment = info.needs_gso_segment;
}
int nft_flow_route(const struct nft_pktinfo *pkt, const struct nf_conn *ct,
diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c
index a6c81c04b3a5..57b450024a99 100644
--- a/net/netfilter/nf_queue.c
+++ b/net/netfilter/nf_queue.c
@@ -61,6 +61,7 @@ static void nf_queue_entry_release_refs(struct nf_queue_entry *entry)
struct nf_hook_state *state = &entry->state;
/* Release those devices we held, or Alexey will kill me. */
+ dev_put(entry->skb_dev);
dev_put(state->in);
dev_put(state->out);
if (state->sk)
@@ -102,6 +103,7 @@ bool nf_queue_entry_get_refs(struct nf_queue_entry *entry)
if (state->sk && !refcount_inc_not_zero(&state->sk->sk_refcnt))
return false;
+ dev_hold(entry->skb_dev);
dev_hold(state->in);
dev_hold(state->out);
@@ -202,11 +204,11 @@ static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state,
*entry = (struct nf_queue_entry) {
.skb = skb,
+ .skb_dev = skb->dev,
.state = *state,
.hook_index = index,
.size = sizeof(*entry) + route_key_size,
};
-
__nf_queue_entry_init_physdevs(entry);
if (!nf_queue_entry_get_refs(entry)) {
diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c
index 57f57e2fc80a..ed00114f65f3 100644
--- a/net/netfilter/nf_synproxy_core.c
+++ b/net/netfilter/nf_synproxy_core.c
@@ -22,6 +22,8 @@
#include <net/netfilter/nf_conntrack_zones.h>
#include <net/netfilter/nf_synproxy.h>
+static DEFINE_MUTEX(synproxy_mutex);
+
unsigned int synproxy_net_id;
EXPORT_SYMBOL_GPL(synproxy_net_id);
@@ -200,6 +202,8 @@ synproxy_tstamp_adjust(struct sk_buff *skb, unsigned int protoff,
if (skb_ensure_writable(skb, optend))
return 0;
+ th = (struct tcphdr *)(skb->data + protoff);
+
while (optoff < optend) {
unsigned char *op = skb->data + optoff;
@@ -767,26 +771,31 @@ static const struct nf_hook_ops ipv4_synproxy_ops[] = {
int nf_synproxy_ipv4_init(struct synproxy_net *snet, struct net *net)
{
- int err;
+ int err = 0;
+ mutex_lock(&synproxy_mutex);
if (snet->hook_ref4 == 0) {
err = nf_register_net_hooks(net, ipv4_synproxy_ops,
ARRAY_SIZE(ipv4_synproxy_ops));
if (err)
- return err;
+ goto out;
}
snet->hook_ref4++;
- return 0;
+out:
+ mutex_unlock(&synproxy_mutex);
+ return err;
}
EXPORT_SYMBOL_GPL(nf_synproxy_ipv4_init);
void nf_synproxy_ipv4_fini(struct synproxy_net *snet, struct net *net)
{
+ mutex_lock(&synproxy_mutex);
snet->hook_ref4--;
if (snet->hook_ref4 == 0)
nf_unregister_net_hooks(net, ipv4_synproxy_ops,
ARRAY_SIZE(ipv4_synproxy_ops));
+ mutex_unlock(&synproxy_mutex);
}
EXPORT_SYMBOL_GPL(nf_synproxy_ipv4_fini);
@@ -1191,27 +1200,32 @@ static const struct nf_hook_ops ipv6_synproxy_ops[] = {
int
nf_synproxy_ipv6_init(struct synproxy_net *snet, struct net *net)
{
- int err;
+ int err = 0;
+ mutex_lock(&synproxy_mutex);
if (snet->hook_ref6 == 0) {
err = nf_register_net_hooks(net, ipv6_synproxy_ops,
ARRAY_SIZE(ipv6_synproxy_ops));
if (err)
- return err;
+ goto out;
}
snet->hook_ref6++;
- return 0;
+out:
+ mutex_unlock(&synproxy_mutex);
+ return err;
}
EXPORT_SYMBOL_GPL(nf_synproxy_ipv6_init);
void
nf_synproxy_ipv6_fini(struct synproxy_net *snet, struct net *net)
{
+ mutex_lock(&synproxy_mutex);
snet->hook_ref6--;
if (snet->hook_ref6 == 0)
nf_unregister_net_hooks(net, ipv6_synproxy_ops,
ARRAY_SIZE(ipv6_synproxy_ops));
+ mutex_unlock(&synproxy_mutex);
}
EXPORT_SYMBOL_GPL(nf_synproxy_ipv6_fini);
#endif /* CONFIG_IPV6 */
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index d20ce5c36d31..87387adbca65 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -407,6 +407,7 @@ static void nft_netdev_unregister_trans_hook(struct net *net,
}
static void nft_netdev_unregister_hooks(struct net *net,
+ const struct nft_table *table,
struct list_head *hook_list,
bool release_netdev)
{
@@ -414,8 +415,10 @@ static void nft_netdev_unregister_hooks(struct net *net,
struct nf_hook_ops *ops;
list_for_each_entry_safe(hook, next, hook_list, list) {
- list_for_each_entry(ops, &hook->ops_list, list)
- nf_unregister_net_hook(net, ops);
+ if (!(table->flags & NFT_TABLE_F_DORMANT)) {
+ list_for_each_entry(ops, &hook->ops_list, list)
+ nf_unregister_net_hook(net, ops);
+ }
if (release_netdev)
nft_netdev_hook_unlink_free_rcu(hook);
}
@@ -452,20 +455,25 @@ static void __nf_tables_unregister_hook(struct net *net,
struct nft_base_chain *basechain;
const struct nf_hook_ops *ops;
- if (table->flags & NFT_TABLE_F_DORMANT ||
- !nft_is_base_chain(chain))
+ if (!nft_is_base_chain(chain))
return;
basechain = nft_base_chain(chain);
ops = &basechain->ops;
+ /* must also be called for dormant tables */
+ if (nft_base_chain_netdev(table->family, basechain->ops.hooknum)) {
+ nft_netdev_unregister_hooks(net, table, &basechain->hook_list,
+ release_netdev);
+ return;
+ }
+
+ if (table->flags & NFT_TABLE_F_DORMANT)
+ return;
+
if (basechain->type->ops_unregister)
return basechain->type->ops_unregister(net, ops);
- if (nft_base_chain_netdev(table->family, basechain->ops.hooknum))
- nft_netdev_unregister_hooks(net, &basechain->hook_list,
- release_netdev);
- else
- nf_unregister_net_hook(net, &basechain->ops);
+ nf_unregister_net_hook(net, &basechain->ops);
}
static void nf_tables_unregister_hook(struct net *net,
@@ -4205,6 +4213,7 @@ static int nft_table_validate(struct net *net, const struct nft_table *table)
struct nft_chain *chain;
struct nft_ctx ctx = {
.net = net,
+ .table = (struct nft_table *)table,
.family = table->family,
};
int err = 0;
@@ -11281,11 +11290,9 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action)
break;
case NFT_MSG_NEWCHAIN:
if (nft_trans_chain_update(trans)) {
- if (!(table->flags & NFT_TABLE_F_DORMANT)) {
- nft_netdev_unregister_hooks(net,
- &nft_trans_chain_hooks(trans),
- true);
- }
+ nft_netdev_unregister_hooks(net, table,
+ &nft_trans_chain_hooks(trans),
+ true);
free_percpu(nft_trans_chain_stats(trans));
kfree(nft_trans_chain_name(trans));
nft_trans_destroy(trans);
diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c
index 5ddd5b6e135f..8ab186f86dd4 100644
--- a/net/netfilter/nf_tables_core.c
+++ b/net/netfilter/nf_tables_core.c
@@ -153,7 +153,7 @@ static bool nft_payload_fast_eval(const struct nft_expr *expr,
if (priv->base == NFT_PAYLOAD_NETWORK_HEADER)
ptr = skb_network_header(skb) + pkt->nhoff;
else {
- if (!(pkt->flags & NFT_PKTINFO_L4PROTO))
+ if (!(pkt->flags & NFT_PKTINFO_L4PROTO) || pkt->fragoff)
return false;
ptr = skb->data + nft_thoff(pkt);
}
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index 58304fd1f70f..60ab88d45096 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -1141,6 +1141,9 @@ nfqnl_mangle(void *data, unsigned int data_len, struct nf_queue_entry *e, int di
{
struct sk_buff *nskb;
+ if (e->state.net->user_ns != &init_user_ns)
+ return -EPERM;
+
if (diff < 0) {
unsigned int min_len = skb_transport_offset(e->skb);
@@ -1212,6 +1215,8 @@ dev_cmp(struct nf_queue_entry *entry, unsigned long ifindex)
if (physinif == ifindex || physoutif == ifindex)
return 1;
#endif
+ if (entry->skb_dev && entry->skb_dev->ifindex == ifindex)
+ return 1;
if (entry->state.in)
if (entry->state.in->ifindex == ifindex)
return 1;
@@ -1535,8 +1540,7 @@ static int nfqnl_recv_verdict(struct sk_buff *skb, const struct nfnl_info *info,
if (nfqnl_mangle(nla_data(nfqa[NFQA_PAYLOAD]),
payload_len, entry, diff) < 0)
verdict = NF_DROP;
-
- if (ct && diff)
+ else if (ct && diff)
nfnl_ct->seq_adjust(entry->skb, ct, ctinfo, diff);
}
diff --git a/net/netfilter/nft_bitwise.c b/net/netfilter/nft_bitwise.c
index 94dccdcfa06b..785b8e9731d1 100644
--- a/net/netfilter/nft_bitwise.c
+++ b/net/netfilter/nft_bitwise.c
@@ -43,8 +43,10 @@ static void nft_bitwise_eval_lshift(u32 *dst, const u32 *src,
u32 carry = 0;
for (i = DIV_ROUND_UP(priv->len, sizeof(u32)); i > 0; i--) {
- dst[i - 1] = (src[i - 1] << shift) | carry;
- carry = src[i - 1] >> (BITS_PER_TYPE(u32) - shift);
+ u32 tmp_src = src[i - 1];
+
+ dst[i - 1] = (tmp_src << shift) | carry;
+ carry = tmp_src >> (BITS_PER_TYPE(u32) - shift);
}
}
@@ -56,8 +58,10 @@ static void nft_bitwise_eval_rshift(u32 *dst, const u32 *src,
u32 carry = 0;
for (i = 0; i < DIV_ROUND_UP(priv->len, sizeof(u32)); i++) {
- dst[i] = carry | (src[i] >> shift);
- carry = src[i] << (BITS_PER_TYPE(u32) - shift);
+ u32 tmp_src = src[i];
+
+ dst[i] = carry | (tmp_src >> shift);
+ carry = tmp_src << (BITS_PER_TYPE(u32) - shift);
}
}
@@ -235,6 +239,9 @@ static int nft_bitwise_init_bool(const struct nft_ctx *ctx,
&priv->sreg2, priv->len);
if (err < 0)
return err;
+
+ if (nft_reg_overlap(priv->sreg2, priv->dreg, priv->len))
+ return -EINVAL;
}
return 0;
@@ -265,6 +272,9 @@ static int nft_bitwise_init(const struct nft_ctx *ctx,
if (err < 0)
return err;
+ if (nft_reg_overlap(priv->sreg, priv->dreg, priv->len))
+ return -EINVAL;
+
if (tb[NFTA_BITWISE_OP]) {
priv->op = ntohl(nla_get_be32(tb[NFTA_BITWISE_OP]));
switch (priv->op) {
diff --git a/net/netfilter/nft_byteorder.c b/net/netfilter/nft_byteorder.c
index e00dddfa2fc0..dfd41fc8d9b8 100644
--- a/net/netfilter/nft_byteorder.c
+++ b/net/netfilter/nft_byteorder.c
@@ -19,7 +19,6 @@ struct nft_byteorder {
u8 sreg;
u8 dreg;
enum nft_byteorder_ops op:8;
- u8 len;
u8 size;
};
@@ -28,13 +27,8 @@ void nft_byteorder_eval(const struct nft_expr *expr,
const struct nft_pktinfo *pkt)
{
const struct nft_byteorder *priv = nft_expr_priv(expr);
- u32 *src = &regs->data[priv->sreg];
+ const u32 *src = &regs->data[priv->sreg];
u32 *dst = &regs->data[priv->dreg];
- u16 *s16, *d16;
- unsigned int i;
-
- s16 = (void *)src;
- d16 = (void *)dst;
switch (priv->size) {
case 8: {
@@ -43,18 +37,14 @@ void nft_byteorder_eval(const struct nft_expr *expr,
switch (priv->op) {
case NFT_BYTEORDER_NTOH:
- for (i = 0; i < priv->len / 8; i++) {
- src64 = nft_reg_load64(&src[i]);
- nft_reg_store64(&dst64[i],
- be64_to_cpu((__force __be64)src64));
- }
+ src64 = nft_reg_load64(src);
+
+ nft_reg_store64(dst64, be64_to_cpu((__force __be64)src64));
break;
case NFT_BYTEORDER_HTON:
- for (i = 0; i < priv->len / 8; i++) {
- src64 = (__force __u64)
- cpu_to_be64(nft_reg_load64(&src[i]));
- nft_reg_store64(&dst64[i], src64);
- }
+ src64 = (__force __u64)cpu_to_be64(nft_reg_load64(src));
+
+ nft_reg_store64(dst64, src64);
break;
}
break;
@@ -62,24 +52,20 @@ void nft_byteorder_eval(const struct nft_expr *expr,
case 4:
switch (priv->op) {
case NFT_BYTEORDER_NTOH:
- for (i = 0; i < priv->len / 4; i++)
- dst[i] = ntohl((__force __be32)src[i]);
+ *dst = ntohl((__force __be32)*src);
break;
case NFT_BYTEORDER_HTON:
- for (i = 0; i < priv->len / 4; i++)
- dst[i] = (__force __u32)htonl(src[i]);
+ *dst = (__force __u32)htonl(*src);
break;
}
break;
case 2:
switch (priv->op) {
case NFT_BYTEORDER_NTOH:
- for (i = 0; i < priv->len / 2; i++)
- d16[i] = ntohs((__force __be16)s16[i]);
+ nft_reg_store16(dst, ntohs(nft_reg_load_be16(src)));
break;
case NFT_BYTEORDER_HTON:
- for (i = 0; i < priv->len / 2; i++)
- d16[i] = (__force __u16)htons(s16[i]);
+ nft_reg_store_be16(dst, htons(nft_reg_load16(src)));
break;
}
break;
@@ -137,16 +123,25 @@ static int nft_byteorder_init(const struct nft_ctx *ctx,
if (err < 0)
return err;
- priv->len = len;
+ /* no longer support multi-reg conversions */
+ if (len != size)
+ return -EOPNOTSUPP;
err = nft_parse_register_load(ctx, tb[NFTA_BYTEORDER_SREG], &priv->sreg,
- priv->len);
+ len);
+ if (err < 0)
+ return err;
+
+ err = nft_parse_register_store(ctx, tb[NFTA_BYTEORDER_DREG],
+ &priv->dreg, NULL, NFT_DATA_VALUE,
+ len);
if (err < 0)
return err;
- return nft_parse_register_store(ctx, tb[NFTA_BYTEORDER_DREG],
- &priv->dreg, NULL, NFT_DATA_VALUE,
- priv->len);
+ if (nft_reg_overlap(priv->sreg, priv->dreg, len))
+ return -EINVAL;
+
+ return 0;
}
static int nft_byteorder_dump(struct sk_buff *skb,
@@ -160,10 +155,11 @@ static int nft_byteorder_dump(struct sk_buff *skb,
goto nla_put_failure;
if (nla_put_be32(skb, NFTA_BYTEORDER_OP, htonl(priv->op)))
goto nla_put_failure;
- if (nla_put_be32(skb, NFTA_BYTEORDER_LEN, htonl(priv->len)))
- goto nla_put_failure;
if (nla_put_be32(skb, NFTA_BYTEORDER_SIZE, htonl(priv->size)))
goto nla_put_failure;
+ /* compatibility for old userspace which permitted size != len */
+ if (nla_put_be32(skb, NFTA_BYTEORDER_LEN, htonl(priv->size)))
+ goto nla_put_failure;
return 0;
nla_put_failure:
diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c
index decc725a33c2..0caa9304d2d0 100644
--- a/net/netfilter/nft_compat.c
+++ b/net/netfilter/nft_compat.c
@@ -261,10 +261,10 @@ nft_target_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
return ret;
}
- nft_target_set_tgchk_param(&par, ctx, target, info, &e, proto, inv);
-
nft_compat_wait_for_destructors(ctx->net);
+ nft_target_set_tgchk_param(&par, ctx, target, info, &e, proto, inv);
+
ret = xt_check_target(&par, size, proto, inv);
if (ret < 0) {
if (ret == -ENOENT) {
@@ -353,8 +353,6 @@ nla_put_failure:
static int nft_target_validate(const struct nft_ctx *ctx,
const struct nft_expr *expr)
{
- struct xt_target *target = expr->ops->data;
- unsigned int hook_mask = 0;
int ret;
if (ctx->family != NFPROTO_IPV4 &&
@@ -377,11 +375,21 @@ static int nft_target_validate(const struct nft_ctx *ctx,
const struct nft_base_chain *basechain =
nft_base_chain(ctx->chain);
const struct nf_hook_ops *ops = &basechain->ops;
+ unsigned int hook_mask = 1 << ops->hooknum;
+ struct xt_target *target = expr->ops->data;
+ void *info = nft_expr_priv(expr);
+ struct xt_tgchk_param par;
+ union nft_entry e = {};
- hook_mask = 1 << ops->hooknum;
if (target->hooks && !(hook_mask & target->hooks))
return -EINVAL;
+ nft_target_set_tgchk_param(&par, ctx, target, info, &e, 0, false);
+
+ ret = xt_check_hooks_target(&par);
+ if (ret < 0)
+ return ret;
+
ret = nft_compat_chain_validate_dependency(ctx, target->table);
if (ret < 0)
return ret;
@@ -515,10 +523,10 @@ __nft_match_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
return ret;
}
- nft_match_set_mtchk_param(&par, ctx, match, info, &e, proto, inv);
-
nft_compat_wait_for_destructors(ctx->net);
+ nft_match_set_mtchk_param(&par, ctx, match, info, &e, proto, inv);
+
return xt_check_match(&par, size, proto, inv);
}
@@ -614,8 +622,6 @@ static int nft_match_large_dump(struct sk_buff *skb,
static int nft_match_validate(const struct nft_ctx *ctx,
const struct nft_expr *expr)
{
- struct xt_match *match = expr->ops->data;
- unsigned int hook_mask = 0;
int ret;
if (ctx->family != NFPROTO_IPV4 &&
@@ -638,11 +644,30 @@ static int nft_match_validate(const struct nft_ctx *ctx,
const struct nft_base_chain *basechain =
nft_base_chain(ctx->chain);
const struct nf_hook_ops *ops = &basechain->ops;
+ unsigned int hook_mask = 1 << ops->hooknum;
+ struct xt_match *match = expr->ops->data;
+ size_t size = XT_ALIGN(match->matchsize);
+ struct xt_mtchk_param par;
+ union nft_entry e = {};
+ void *info;
- hook_mask = 1 << ops->hooknum;
if (match->hooks && !(hook_mask & match->hooks))
return -EINVAL;
+ if (NFT_EXPR_SIZE(size) > NFT_MATCH_LARGE_THRESH) {
+ struct nft_xt_match_priv *priv = nft_expr_priv(expr);
+
+ info = priv->info;
+ } else {
+ info = nft_expr_priv(expr);
+ }
+
+ nft_match_set_mtchk_param(&par, ctx, match, info, &e, 0, false);
+
+ ret = xt_check_hooks_match(&par);
+ if (ret < 0)
+ return ret;
+
ret = nft_compat_chain_validate_dependency(ctx, match->table);
if (ret < 0)
return ret;
diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c
index 60ee8d932fcb..357513c6dcea 100644
--- a/net/netfilter/nft_ct.c
+++ b/net/netfilter/nft_ct.c
@@ -78,7 +78,7 @@ static void nft_ct_get_eval(const struct nft_expr *expr,
break;
}
- if (ct == NULL)
+ if (!ct || nf_ct_is_template(ct))
goto err;
switch (priv->key) {
@@ -180,12 +180,10 @@ static void nft_ct_get_eval(const struct nft_expr *expr,
tuple = &ct->tuplehash[priv->dir].tuple;
switch (priv->key) {
case NFT_CT_SRC:
- memcpy(dest, tuple->src.u3.all,
- nf_ct_l3num(ct) == NFPROTO_IPV4 ? 4 : 16);
+ memcpy(dest, tuple->src.u3.all, priv->len);
return;
case NFT_CT_DST:
- memcpy(dest, tuple->dst.u3.all,
- nf_ct_l3num(ct) == NFPROTO_IPV4 ? 4 : 16);
+ memcpy(dest, tuple->dst.u3.all, priv->len);
return;
case NFT_CT_PROTO_SRC:
nft_reg_store16(dest, (__force u16)tuple->src.u.all);
@@ -1334,6 +1332,8 @@ static void nft_ct_expect_obj_eval(struct nft_object *obj,
if (nf_ct_expect_related(exp, 0) != 0)
regs->verdict.code = NF_DROP;
+
+ nf_ct_expect_put(exp);
}
static const struct nla_policy nft_ct_expect_policy[NFTA_CT_EXPECT_MAX + 1] = {
diff --git a/net/netfilter/nft_ct_fast.c b/net/netfilter/nft_ct_fast.c
index e684c8a91848..ecf7b3a404be 100644
--- a/net/netfilter/nft_ct_fast.c
+++ b/net/netfilter/nft_ct_fast.c
@@ -30,7 +30,7 @@ void nft_ct_get_fast_eval(const struct nft_expr *expr,
break;
}
- if (!ct) {
+ if (!ct || nf_ct_is_template(ct)) {
regs->verdict.code = NFT_BREAK;
return;
}
diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c
index 0407d6f708ae..e6a07c0df207 100644
--- a/net/netfilter/nft_exthdr.c
+++ b/net/netfilter/nft_exthdr.c
@@ -376,7 +376,7 @@ static void nft_exthdr_sctp_eval(const struct nft_expr *expr,
const struct sctp_chunkhdr *sch;
struct sctp_chunkhdr _sch;
- if (pkt->tprot != IPPROTO_SCTP)
+ if (pkt->tprot != IPPROTO_SCTP || pkt->fragoff)
goto err;
do {
diff --git a/net/netfilter/nft_fwd_netdev.c b/net/netfilter/nft_fwd_netdev.c
index 4bce36c3a6a0..b9e88d7cf308 100644
--- a/net/netfilter/nft_fwd_netdev.c
+++ b/net/netfilter/nft_fwd_netdev.c
@@ -95,12 +95,15 @@ static void nft_fwd_neigh_eval(const struct nft_expr *expr,
struct nft_regs *regs,
const struct nft_pktinfo *pkt)
{
+ u8 *nf_dup_skb_recursion = nf_get_nf_dup_skb_recursion();
struct nft_fwd_neigh *priv = nft_expr_priv(expr);
void *addr = &regs->data[priv->sreg_addr];
int oif = regs->data[priv->sreg_dev];
unsigned int verdict = NF_STOLEN;
struct sk_buff *skb = pkt->skb;
+ int nhoff = skb_network_offset(skb);
struct net_device *dev;
+ unsigned int hh_len;
int neigh_table;
switch (priv->nfproto) {
@@ -111,7 +114,7 @@ static void nft_fwd_neigh_eval(const struct nft_expr *expr,
verdict = NFT_BREAK;
goto out;
}
- if (skb_try_make_writable(skb, sizeof(*iph))) {
+ if (skb_ensure_writable(skb, nhoff + sizeof(*iph))) {
verdict = NF_DROP;
goto out;
}
@@ -132,7 +135,7 @@ static void nft_fwd_neigh_eval(const struct nft_expr *expr,
verdict = NFT_BREAK;
goto out;
}
- if (skb_try_make_writable(skb, sizeof(*ip6h))) {
+ if (skb_ensure_writable(skb, nhoff + sizeof(*ip6h))) {
verdict = NF_DROP;
goto out;
}
@@ -151,13 +154,31 @@ static void nft_fwd_neigh_eval(const struct nft_expr *expr,
goto out;
}
+ if (*nf_dup_skb_recursion > NF_RECURSION_LIMIT) {
+ verdict = NF_DROP;
+ goto out;
+ }
+
dev = dev_get_by_index_rcu(nft_net(pkt), oif);
- if (dev == NULL)
- return;
+ if (dev == NULL) {
+ verdict = NF_DROP;
+ goto out;
+ }
+
+ hh_len = LL_RESERVED_SPACE(dev);
+ if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
+ skb = skb_expand_head(skb, hh_len);
+ if (!skb) {
+ verdict = NF_STOLEN;
+ goto out;
+ }
+ }
skb->dev = dev;
skb_clear_tstamp(skb);
+ (*nf_dup_skb_recursion)++;
neigh_xmit(neigh_table, dev, addr, skb);
+ (*nf_dup_skb_recursion)--;
out:
regs->verdict.code = verdict;
}
diff --git a/net/netfilter/nft_inner.c b/net/netfilter/nft_inner.c
index 03ffb1159fc1..d14ca157910b 100644
--- a/net/netfilter/nft_inner.c
+++ b/net/netfilter/nft_inner.c
@@ -163,7 +163,6 @@ static int nft_inner_parse_l2l3(const struct nft_inner *priv,
return -1;
if (fragoff == 0) {
- thoff = nhoff + sizeof(_ip6h);
ctx->flags |= NFT_PAYLOAD_CTX_INNER_TH;
ctx->inner_thoff = thoff;
ctx->l4proto = l4proto;
@@ -247,8 +246,8 @@ static bool nft_inner_restore_tun_ctx(const struct nft_pktinfo *pkt,
local_lock_nested_bh(&nft_pcpu_tun_ctx.bh_lock);
this_cpu_tun_ctx = this_cpu_ptr(&nft_pcpu_tun_ctx.ctx);
if (this_cpu_tun_ctx->cookie != (unsigned long)pkt->skb) {
- local_bh_enable();
local_unlock_nested_bh(&nft_pcpu_tun_ctx.bh_lock);
+ local_bh_enable();
return false;
}
*tun_ctx = *this_cpu_tun_ctx;
diff --git a/net/netfilter/nft_osf.c b/net/netfilter/nft_osf.c
index c02d5cb52143..45fe56da5044 100644
--- a/net/netfilter/nft_osf.c
+++ b/net/netfilter/nft_osf.c
@@ -33,7 +33,7 @@ static void nft_osf_eval(const struct nft_expr *expr, struct nft_regs *regs,
return;
}
- if (pkt->tprot != IPPROTO_TCP) {
+ if (pkt->tprot != IPPROTO_TCP || pkt->fragoff) {
regs->verdict.code = NFT_BREAK;
return;
}
diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c
index 01e13e5255a9..484a5490832e 100644
--- a/net/netfilter/nft_payload.c
+++ b/net/netfilter/nft_payload.c
@@ -917,6 +917,9 @@ static int nft_payload_set_init(const struct nft_ctx *ctx,
struct nft_payload_set *priv = nft_expr_priv(expr);
int err;
+ if (ctx->net->user_ns != &init_user_ns)
+ return -EPERM;
+
priv->base = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_BASE]));
priv->len = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_LEN]));
diff --git a/net/netfilter/nft_tproxy.c b/net/netfilter/nft_tproxy.c
index f2101af8c867..89be443734f6 100644
--- a/net/netfilter/nft_tproxy.c
+++ b/net/netfilter/nft_tproxy.c
@@ -30,8 +30,8 @@ static void nft_tproxy_eval_v4(const struct nft_expr *expr,
__be16 tport = 0;
struct sock *sk;
- if (pkt->tprot != IPPROTO_TCP &&
- pkt->tprot != IPPROTO_UDP) {
+ if ((pkt->tprot != IPPROTO_TCP &&
+ pkt->tprot != IPPROTO_UDP) || pkt->fragoff) {
regs->verdict.code = NFT_BREAK;
return;
}
@@ -97,8 +97,8 @@ static void nft_tproxy_eval_v6(const struct nft_expr *expr,
memset(&taddr, 0, sizeof(taddr));
- if (pkt->tprot != IPPROTO_TCP &&
- pkt->tprot != IPPROTO_UDP) {
+ if ((pkt->tprot != IPPROTO_TCP &&
+ pkt->tprot != IPPROTO_UDP) || pkt->fragoff) {
regs->verdict.code = NFT_BREAK;
return;
}
diff --git a/net/netfilter/nft_tunnel.c b/net/netfilter/nft_tunnel.c
index 0b987bc2132a..68f7cfbbee06 100644
--- a/net/netfilter/nft_tunnel.c
+++ b/net/netfilter/nft_tunnel.c
@@ -676,7 +676,7 @@ static void nft_tunnel_obj_destroy(const struct nft_ctx *ctx,
{
struct nft_tunnel_obj *priv = nft_obj_data(obj);
- metadata_dst_free(priv->md);
+ dst_release(&priv->md->dst);
}
static struct nft_object_type nft_tunnel_obj_type;
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index 9f837fb5ceb4..4e6708c23922 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -55,6 +55,9 @@ static struct list_head xt_templates[NFPROTO_NUMPROTO];
struct xt_pernet {
struct list_head tables[NFPROTO_NUMPROTO];
+
+ /* stash area used during netns exit */
+ struct list_head dead_tables[NFPROTO_NUMPROTO];
};
struct compat_delta {
@@ -477,11 +480,9 @@ int xt_check_proc_name(const char *name, unsigned int size)
}
EXPORT_SYMBOL(xt_check_proc_name);
-int xt_check_match(struct xt_mtchk_param *par,
- unsigned int size, u16 proto, bool inv_proto)
+static int xt_check_match_common(struct xt_mtchk_param *par,
+ unsigned int size, u16 proto, bool inv_proto)
{
- int ret;
-
if (XT_ALIGN(par->match->matchsize) != size &&
par->match->matchsize != -1) {
/*
@@ -530,6 +531,14 @@ int xt_check_match(struct xt_mtchk_param *par,
par->match->proto);
return -EINVAL;
}
+
+ return 0;
+}
+
+static int xt_checkentry_match(struct xt_mtchk_param *par)
+{
+ int ret;
+
if (par->match->checkentry != NULL) {
ret = par->match->checkentry(par);
if (ret < 0)
@@ -538,8 +547,34 @@ int xt_check_match(struct xt_mtchk_param *par,
/* Flag up potential errors. */
return -EIO;
}
+
+ return 0;
+}
+
+int xt_check_hooks_match(struct xt_mtchk_param *par)
+{
+ if (par->match->check_hooks != NULL)
+ return par->match->check_hooks(par);
+
return 0;
}
+EXPORT_SYMBOL_GPL(xt_check_hooks_match);
+
+int xt_check_match(struct xt_mtchk_param *par,
+ unsigned int size, u16 proto, bool inv_proto)
+{
+ int ret;
+
+ ret = xt_check_match_common(par, size, proto, inv_proto);
+ if (ret < 0)
+ return ret;
+
+ ret = xt_check_hooks_match(par);
+ if (ret < 0)
+ return ret;
+
+ return xt_checkentry_match(par);
+}
EXPORT_SYMBOL_GPL(xt_check_match);
/** xt_check_entry_match - check that matches end before start of target
@@ -1012,11 +1047,9 @@ bool xt_find_jump_offset(const unsigned int *offsets,
}
EXPORT_SYMBOL(xt_find_jump_offset);
-int xt_check_target(struct xt_tgchk_param *par,
- unsigned int size, u16 proto, bool inv_proto)
+static int xt_check_target_common(struct xt_tgchk_param *par,
+ unsigned int size, u16 proto, bool inv_proto)
{
- int ret;
-
if (XT_ALIGN(par->target->targetsize) != size) {
pr_err_ratelimited("%s_tables: %s.%u target: invalid size %u (kernel) != (user) %u\n",
xt_prefix[par->family], par->target->name,
@@ -1061,6 +1094,23 @@ int xt_check_target(struct xt_tgchk_param *par,
par->target->proto);
return -EINVAL;
}
+
+ return 0;
+}
+
+int xt_check_hooks_target(struct xt_tgchk_param *par)
+{
+ if (par->target->check_hooks != NULL)
+ return par->target->check_hooks(par);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(xt_check_hooks_target);
+
+static int xt_checkentry_target(struct xt_tgchk_param *par)
+{
+ int ret;
+
if (par->target->checkentry != NULL) {
ret = par->target->checkentry(par);
if (ret < 0)
@@ -1071,6 +1121,22 @@ int xt_check_target(struct xt_tgchk_param *par,
}
return 0;
}
+
+int xt_check_target(struct xt_tgchk_param *par,
+ unsigned int size, u16 proto, bool inv_proto)
+{
+ int ret;
+
+ ret = xt_check_target_common(par, size, proto, inv_proto);
+ if (ret < 0)
+ return ret;
+
+ ret = xt_check_hooks_target(par);
+ if (ret < 0)
+ return ret;
+
+ return xt_checkentry_target(par);
+}
EXPORT_SYMBOL_GPL(xt_check_target);
/**
@@ -1409,11 +1475,9 @@ struct xt_counters *xt_counters_alloc(unsigned int counters)
}
EXPORT_SYMBOL(xt_counters_alloc);
-struct xt_table_info *
-xt_replace_table(struct xt_table *table,
- unsigned int num_counters,
- struct xt_table_info *newinfo,
- int *error)
+static struct xt_table_info *
+do_replace_table(struct xt_table *table, unsigned int num_counters,
+ struct xt_table_info *newinfo, int *error)
{
struct xt_table_info *private;
unsigned int cpu;
@@ -1468,30 +1532,54 @@ xt_replace_table(struct xt_table *table,
}
}
- audit_log_nfcfg(table->name, table->af, private->number,
- !private->number ? AUDIT_XT_OP_REGISTER :
- AUDIT_XT_OP_REPLACE,
- GFP_KERNEL);
+ return private;
+}
+
+struct xt_table_info *
+xt_replace_table(struct xt_table *table, unsigned int num_counters,
+ struct xt_table_info *newinfo,
+ int *error)
+{
+ struct xt_table_info *private;
+
+ private = do_replace_table(table, num_counters, newinfo, error);
+ if (private)
+ audit_log_nfcfg(table->name, table->af, private->number,
+ AUDIT_XT_OP_REPLACE,
+ GFP_KERNEL);
+
return private;
}
EXPORT_SYMBOL_GPL(xt_replace_table);
struct xt_table *xt_register_table(struct net *net,
const struct xt_table *input_table,
+ const struct nf_hook_ops *template_ops,
struct xt_table_info *bootstrap,
struct xt_table_info *newinfo)
{
struct xt_pernet *xt_net = net_generic(net, xt_pernet_id);
+ struct xt_table *t, *table = NULL;
+ struct nf_hook_ops *ops = NULL;
struct xt_table_info *private;
- struct xt_table *t, *table;
- int ret;
+ unsigned int num_ops;
+ int ret = -EINVAL;
+
+ num_ops = hweight32(input_table->valid_hooks);
+ if (num_ops == 0)
+ goto out;
+
+ ret = -ENOMEM;
+ if (template_ops) {
+ ops = kmemdup_array(template_ops, num_ops, sizeof(*ops), GFP_KERNEL);
+ if (!ops)
+ goto out;
+ }
/* Don't add one object to multiple lists. */
table = kmemdup(input_table, sizeof(struct xt_table), GFP_KERNEL);
- if (!table) {
- ret = -ENOMEM;
+ if (!table)
goto out;
- }
mutex_lock(&xt[table->af].mutex);
/* Don't autoload: we'd eat our tail... */
@@ -1505,7 +1593,7 @@ struct xt_table *xt_register_table(struct net *net,
/* Simplifies replace_table code. */
table->private = bootstrap;
- if (!xt_replace_table(table, 0, newinfo, &ret))
+ if (!do_replace_table(table, 0, newinfo, &ret))
goto unlock;
private = table->private;
@@ -1514,34 +1602,122 @@ struct xt_table *xt_register_table(struct net *net,
/* save number of initial entries */
private->initial_entries = private->number;
+ if (ops) {
+ int i;
+
+ for (i = 0; i < num_ops; i++)
+ ops[i].priv = table;
+
+ ret = nf_register_net_hooks(net, ops, num_ops);
+ if (ret != 0) {
+ mutex_unlock(&xt[table->af].mutex);
+ /* nf_register_net_hooks() might have published a
+ * base chain before internal error unwind.
+ */
+ synchronize_rcu();
+ goto out;
+ }
+
+ table->ops = ops;
+ }
+
+ audit_log_nfcfg(table->name, table->af, private->number,
+ AUDIT_XT_OP_REGISTER, GFP_KERNEL);
+
list_add(&table->list, &xt_net->tables[table->af]);
mutex_unlock(&xt[table->af].mutex);
return table;
unlock:
mutex_unlock(&xt[table->af].mutex);
- kfree(table);
out:
+ kfree(table);
+ kfree(ops);
return ERR_PTR(ret);
}
EXPORT_SYMBOL_GPL(xt_register_table);
-void *xt_unregister_table(struct xt_table *table)
+/**
+ * xt_unregister_table_pre_exit - pre-shutdown unregister of a table
+ * @net: network namespace
+ * @af: address family (e.g., NFPROTO_IPV4, NFPROTO_IPV6)
+ * @name: name of the table to unregister
+ *
+ * Unregisters the specified netfilter table from the given network namespace
+ * and also unregisters the hooks from netfilter core: no new packets will be
+ * processed.
+ *
+ * This must be called prior to xt_unregister_table_exit() from the pernet
+ * .pre_exit callback. After this call, the table is no longer visible to
+ * the get/setsockopt path. In case of rmmod, module exit path must have
+ * called xt_unregister_template() prior to unregistering pernet ops to
+ * prevent re-instantiation of the table.
+ *
+ * See also: xt_unregister_table_exit()
+ */
+void xt_unregister_table_pre_exit(struct net *net, u8 af, const char *name)
{
- struct xt_table_info *private;
+ struct xt_pernet *xt_net = net_generic(net, xt_pernet_id);
+ struct xt_table *t;
- mutex_lock(&xt[table->af].mutex);
- private = table->private;
- list_del(&table->list);
- mutex_unlock(&xt[table->af].mutex);
- audit_log_nfcfg(table->name, table->af, private->number,
- AUDIT_XT_OP_UNREGISTER, GFP_KERNEL);
- kfree(table->ops);
- kfree(table);
+ mutex_lock(&xt[af].mutex);
+ list_for_each_entry(t, &xt_net->tables[af], list) {
+ if (strcmp(t->name, name) == 0) {
+ list_move(&t->list, &xt_net->dead_tables[af]);
+ mutex_unlock(&xt[af].mutex);
- return private;
+ if (t->ops) /* nat table registers with nat core, t->ops is NULL. */
+ nf_unregister_net_hooks(net, t->ops, hweight32(t->valid_hooks));
+ return;
+ }
+ }
+ mutex_unlock(&xt[af].mutex);
+}
+EXPORT_SYMBOL(xt_unregister_table_pre_exit);
+
+/**
+ * xt_unregister_table_exit - remove a table during namespace teardown
+ * @net: the network namespace from which to unregister the table
+ * @af: address family (e.g., NFPROTO_IPV4, NFPROTO_IPV6)
+ * @name: name of the table to unregister
+ *
+ * Completes the unregister process for a table. This must be called from
+ * the pernet ops .exit callback. This is the second stage after
+ * xt_unregister_table_pre_exit().
+ *
+ * pair with xt_unregister_table_pre_exit() during namespace shutdown.
+ *
+ * Return: the unregistered table or NULL if the table was never
+ * instantiated. The caller needs to kfree() the table after it
+ * has removed the family specific matches/targets.
+ */
+struct xt_table *xt_unregister_table_exit(struct net *net, u8 af, const char *name)
+{
+ struct xt_pernet *xt_net = net_generic(net, xt_pernet_id);
+ struct xt_table *table;
+
+ mutex_lock(&xt[af].mutex);
+ list_for_each_entry(table, &xt_net->dead_tables[af], list) {
+ struct nf_hook_ops *ops = NULL;
+
+ if (strcmp(table->name, name) != 0)
+ continue;
+
+ list_del(&table->list);
+
+ audit_log_nfcfg(table->name, table->af, table->private->number,
+ AUDIT_XT_OP_UNREGISTER, GFP_KERNEL);
+ swap(table->ops, ops);
+ mutex_unlock(&xt[af].mutex);
+
+ kfree(ops);
+ return table;
+ }
+ mutex_unlock(&xt[af].mutex);
+
+ return NULL;
}
-EXPORT_SYMBOL_GPL(xt_unregister_table);
+EXPORT_SYMBOL_GPL(xt_unregister_table_exit);
#endif
#ifdef CONFIG_PROC_FS
@@ -1988,8 +2164,10 @@ static int __net_init xt_net_init(struct net *net)
struct xt_pernet *xt_net = net_generic(net, xt_pernet_id);
int i;
- for (i = 0; i < NFPROTO_NUMPROTO; i++)
+ for (i = 0; i < NFPROTO_NUMPROTO; i++) {
INIT_LIST_HEAD(&xt_net->tables[i]);
+ INIT_LIST_HEAD(&xt_net->dead_tables[i]);
+ }
return 0;
}
@@ -1998,8 +2176,10 @@ static void __net_exit xt_net_exit(struct net *net)
struct xt_pernet *xt_net = net_generic(net, xt_pernet_id);
int i;
- for (i = 0; i < NFPROTO_NUMPROTO; i++)
+ for (i = 0; i < NFPROTO_NUMPROTO; i++) {
WARN_ON_ONCE(!list_empty(&xt_net->tables[i]));
+ WARN_ON_ONCE(!list_empty(&xt_net->dead_tables[i]));
+ }
}
static struct pernet_operations xt_net_ops = {
diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c
index 498f5871c84a..d2aeacf94230 100644
--- a/net/netfilter/xt_CT.c
+++ b/net/netfilter/xt_CT.c
@@ -354,7 +354,7 @@ static struct xt_target xt_ct_tg_reg[] __read_mostly = {
.family = NFPROTO_IPV4,
.revision = 1,
.targetsize = sizeof(struct xt_ct_target_info_v1),
- .usersize = offsetof(struct xt_ct_target_info, ct),
+ .usersize = offsetof(struct xt_ct_target_info_v1, ct),
.checkentry = xt_ct_tg_check_v1,
.destroy = xt_ct_tg_destroy_v1,
.target = xt_ct_target_v1,
@@ -366,7 +366,7 @@ static struct xt_target xt_ct_tg_reg[] __read_mostly = {
.family = NFPROTO_IPV4,
.revision = 2,
.targetsize = sizeof(struct xt_ct_target_info_v1),
- .usersize = offsetof(struct xt_ct_target_info, ct),
+ .usersize = offsetof(struct xt_ct_target_info_v1, ct),
.checkentry = xt_ct_tg_check_v2,
.destroy = xt_ct_tg_destroy_v1,
.target = xt_ct_target_v1,
@@ -398,7 +398,7 @@ static struct xt_target xt_ct_tg_reg[] __read_mostly = {
.family = NFPROTO_IPV6,
.revision = 1,
.targetsize = sizeof(struct xt_ct_target_info_v1),
- .usersize = offsetof(struct xt_ct_target_info, ct),
+ .usersize = offsetof(struct xt_ct_target_info_v1, ct),
.checkentry = xt_ct_tg_check_v1,
.destroy = xt_ct_tg_destroy_v1,
.target = xt_ct_target_v1,
@@ -410,7 +410,7 @@ static struct xt_target xt_ct_tg_reg[] __read_mostly = {
.family = NFPROTO_IPV6,
.revision = 2,
.targetsize = sizeof(struct xt_ct_target_info_v1),
- .usersize = offsetof(struct xt_ct_target_info, ct),
+ .usersize = offsetof(struct xt_ct_target_info_v1, ct),
.checkentry = xt_ct_tg_check_v2,
.destroy = xt_ct_tg_destroy_v1,
.target = xt_ct_target_v1,
diff --git a/net/netfilter/xt_NFQUEUE.c b/net/netfilter/xt_NFQUEUE.c
index 466da23e36ff..b32d153e3a18 100644
--- a/net/netfilter/xt_NFQUEUE.c
+++ b/net/netfilter/xt_NFQUEUE.c
@@ -91,7 +91,7 @@ nfqueue_tg_v3(struct sk_buff *skb, const struct xt_action_param *par)
if (info->queues_total > 1) {
if (info->flags & NFQ_FLAG_CPU_FANOUT) {
- int cpu = smp_processor_id();
+ int cpu = raw_smp_processor_id();
queue = info->queuenum + cpu % info->queues_total;
} else {
diff --git a/net/netfilter/xt_TCPMSS.c b/net/netfilter/xt_TCPMSS.c
index 116a885adb3c..80e1634bc51f 100644
--- a/net/netfilter/xt_TCPMSS.c
+++ b/net/netfilter/xt_TCPMSS.c
@@ -247,6 +247,21 @@ tcpmss_tg6(struct sk_buff *skb, const struct xt_action_param *par)
}
#endif
+static int tcpmss_tg4_check_hooks(const struct xt_tgchk_param *par)
+{
+ const struct xt_tcpmss_info *info = par->targinfo;
+
+ if (info->mss == XT_TCPMSS_CLAMP_PMTU &&
+ (par->hook_mask & ~((1 << NF_INET_FORWARD) |
+ (1 << NF_INET_LOCAL_OUT) |
+ (1 << NF_INET_POST_ROUTING))) != 0) {
+ pr_info_ratelimited("path-MTU clamping only supported in FORWARD, OUTPUT and POSTROUTING hooks\n");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
/* Must specify -p tcp --syn */
static inline bool find_syn_match(const struct xt_entry_match *m)
{
@@ -262,17 +277,9 @@ static inline bool find_syn_match(const struct xt_entry_match *m)
static int tcpmss_tg4_check(const struct xt_tgchk_param *par)
{
- const struct xt_tcpmss_info *info = par->targinfo;
const struct ipt_entry *e = par->entryinfo;
const struct xt_entry_match *ematch;
- if (info->mss == XT_TCPMSS_CLAMP_PMTU &&
- (par->hook_mask & ~((1 << NF_INET_FORWARD) |
- (1 << NF_INET_LOCAL_OUT) |
- (1 << NF_INET_POST_ROUTING))) != 0) {
- pr_info_ratelimited("path-MTU clamping only supported in FORWARD, OUTPUT and POSTROUTING hooks\n");
- return -EINVAL;
- }
if (par->nft_compat)
return 0;
@@ -286,17 +293,9 @@ static int tcpmss_tg4_check(const struct xt_tgchk_param *par)
#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
static int tcpmss_tg6_check(const struct xt_tgchk_param *par)
{
- const struct xt_tcpmss_info *info = par->targinfo;
const struct ip6t_entry *e = par->entryinfo;
const struct xt_entry_match *ematch;
- if (info->mss == XT_TCPMSS_CLAMP_PMTU &&
- (par->hook_mask & ~((1 << NF_INET_FORWARD) |
- (1 << NF_INET_LOCAL_OUT) |
- (1 << NF_INET_POST_ROUTING))) != 0) {
- pr_info_ratelimited("path-MTU clamping only supported in FORWARD, OUTPUT and POSTROUTING hooks\n");
- return -EINVAL;
- }
if (par->nft_compat)
return 0;
@@ -312,6 +311,7 @@ static struct xt_target tcpmss_tg_reg[] __read_mostly = {
{
.family = NFPROTO_IPV4,
.name = "TCPMSS",
+ .check_hooks = tcpmss_tg4_check_hooks,
.checkentry = tcpmss_tg4_check,
.target = tcpmss_tg4,
.targetsize = sizeof(struct xt_tcpmss_info),
@@ -322,6 +322,7 @@ static struct xt_target tcpmss_tg_reg[] __read_mostly = {
{
.family = NFPROTO_IPV6,
.name = "TCPMSS",
+ .check_hooks = tcpmss_tg4_check_hooks,
.checkentry = tcpmss_tg6_check,
.target = tcpmss_tg6,
.targetsize = sizeof(struct xt_tcpmss_info),
diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c
index e4bea1d346cf..5f60e7298a1e 100644
--- a/net/netfilter/xt_TPROXY.c
+++ b/net/netfilter/xt_TPROXY.c
@@ -86,6 +86,9 @@ tproxy_tg4_v0(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct xt_tproxy_target_info *tgi = par->targinfo;
+ if (par->fragoff)
+ return NF_DROP;
+
return tproxy_tg4(xt_net(par), skb, tgi->laddr, tgi->lport,
tgi->mark_mask, tgi->mark_value);
}
@@ -95,6 +98,9 @@ tproxy_tg4_v1(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct xt_tproxy_target_info_v1 *tgi = par->targinfo;
+ if (par->fragoff)
+ return NF_DROP;
+
return tproxy_tg4(xt_net(par), skb, tgi->laddr.ip, tgi->lport,
tgi->mark_mask, tgi->mark_value);
}
@@ -106,6 +112,7 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct ipv6hdr *iph = ipv6_hdr(skb);
const struct xt_tproxy_target_info_v1 *tgi = par->targinfo;
+ unsigned short fragoff = 0;
struct udphdr _hdr, *hp;
struct sock *sk;
const struct in6_addr *laddr;
@@ -113,8 +120,8 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par)
int thoff = 0;
int tproto;
- tproto = ipv6_find_hdr(skb, &thoff, -1, NULL, NULL);
- if (tproto < 0)
+ tproto = ipv6_find_hdr(skb, &thoff, -1, &fragoff, NULL);
+ if (tproto < 0 || fragoff)
return NF_DROP;
hp = skb_header_pointer(skb, thoff, sizeof(_hdr), &_hdr);
diff --git a/net/netfilter/xt_addrtype.c b/net/netfilter/xt_addrtype.c
index a77088943107..913dbe3aa5e2 100644
--- a/net/netfilter/xt_addrtype.c
+++ b/net/netfilter/xt_addrtype.c
@@ -153,14 +153,10 @@ addrtype_mt_v1(const struct sk_buff *skb, struct xt_action_param *par)
return ret;
}
-static int addrtype_mt_checkentry_v1(const struct xt_mtchk_param *par)
+static int addrtype_mt_check_hooks(const struct xt_mtchk_param *par)
{
- const char *errmsg = "both incoming and outgoing interface limitation cannot be selected";
struct xt_addrtype_info_v1 *info = par->matchinfo;
-
- if (info->flags & XT_ADDRTYPE_LIMIT_IFACE_IN &&
- info->flags & XT_ADDRTYPE_LIMIT_IFACE_OUT)
- goto err;
+ const char *errmsg;
if (par->hook_mask & ((1 << NF_INET_PRE_ROUTING) |
(1 << NF_INET_LOCAL_IN)) &&
@@ -176,6 +172,21 @@ static int addrtype_mt_checkentry_v1(const struct xt_mtchk_param *par)
goto err;
}
+ return 0;
+err:
+ pr_info_ratelimited("%s\n", errmsg);
+ return -EINVAL;
+}
+
+static int addrtype_mt_checkentry_v1(const struct xt_mtchk_param *par)
+{
+ const char *errmsg = "both incoming and outgoing interface limitation cannot be selected";
+ struct xt_addrtype_info_v1 *info = par->matchinfo;
+
+ if (info->flags & XT_ADDRTYPE_LIMIT_IFACE_IN &&
+ info->flags & XT_ADDRTYPE_LIMIT_IFACE_OUT)
+ goto err;
+
#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
if (par->family == NFPROTO_IPV6) {
if ((info->source | info->dest) & XT_ADDRTYPE_BLACKHOLE) {
@@ -211,6 +222,7 @@ static struct xt_match addrtype_mt_reg[] __read_mostly = {
.family = NFPROTO_IPV4,
.revision = 1,
.match = addrtype_mt_v1,
+ .check_hooks = addrtype_mt_check_hooks,
.checkentry = addrtype_mt_checkentry_v1,
.matchsize = sizeof(struct xt_addrtype_info_v1),
.me = THIS_MODULE
@@ -221,6 +233,7 @@ static struct xt_match addrtype_mt_reg[] __read_mostly = {
.family = NFPROTO_IPV6,
.revision = 1,
.match = addrtype_mt_v1,
+ .check_hooks = addrtype_mt_check_hooks,
.checkentry = addrtype_mt_checkentry_v1,
.matchsize = sizeof(struct xt_addrtype_info_v1),
.me = THIS_MODULE
diff --git a/net/netfilter/xt_cpu.c b/net/netfilter/xt_cpu.c
index 3bdc302a0f91..9cb259902a58 100644
--- a/net/netfilter/xt_cpu.c
+++ b/net/netfilter/xt_cpu.c
@@ -34,7 +34,7 @@ static bool cpu_mt(const struct sk_buff *skb, struct xt_action_param *par)
{
const struct xt_cpu_info *info = par->matchinfo;
- return (info->cpu == smp_processor_id()) ^ info->invert;
+ return (info->cpu == raw_smp_processor_id()) ^ info->invert;
}
static struct xt_match cpu_mt_reg __read_mostly = {
diff --git a/net/netfilter/xt_devgroup.c b/net/netfilter/xt_devgroup.c
index 9520dd00070b..6d1a44ab5eee 100644
--- a/net/netfilter/xt_devgroup.c
+++ b/net/netfilter/xt_devgroup.c
@@ -33,14 +33,10 @@ static bool devgroup_mt(const struct sk_buff *skb, struct xt_action_param *par)
return true;
}
-static int devgroup_mt_checkentry(const struct xt_mtchk_param *par)
+static int devgroup_mt_check_hooks(const struct xt_mtchk_param *par)
{
const struct xt_devgroup_info *info = par->matchinfo;
- if (info->flags & ~(XT_DEVGROUP_MATCH_SRC | XT_DEVGROUP_INVERT_SRC |
- XT_DEVGROUP_MATCH_DST | XT_DEVGROUP_INVERT_DST))
- return -EINVAL;
-
if (info->flags & XT_DEVGROUP_MATCH_SRC &&
par->hook_mask & ~((1 << NF_INET_PRE_ROUTING) |
(1 << NF_INET_LOCAL_IN) |
@@ -56,9 +52,21 @@ static int devgroup_mt_checkentry(const struct xt_mtchk_param *par)
return 0;
}
+static int devgroup_mt_checkentry(const struct xt_mtchk_param *par)
+{
+ const struct xt_devgroup_info *info = par->matchinfo;
+
+ if (info->flags & ~(XT_DEVGROUP_MATCH_SRC | XT_DEVGROUP_INVERT_SRC |
+ XT_DEVGROUP_MATCH_DST | XT_DEVGROUP_INVERT_DST))
+ return -EINVAL;
+
+ return 0;
+}
+
static struct xt_match devgroup_mt_reg __read_mostly = {
.name = "devgroup",
.match = devgroup_mt,
+ .check_hooks = devgroup_mt_check_hooks,
.checkentry = devgroup_mt_checkentry,
.matchsize = sizeof(struct xt_devgroup_info),
.family = NFPROTO_UNSPEC,
diff --git a/net/netfilter/xt_ecn.c b/net/netfilter/xt_ecn.c
index b96e8203ac54..a8503f5d26bf 100644
--- a/net/netfilter/xt_ecn.c
+++ b/net/netfilter/xt_ecn.c
@@ -30,6 +30,10 @@ static bool match_tcp(const struct sk_buff *skb, struct xt_action_param *par)
struct tcphdr _tcph;
const struct tcphdr *th;
+ /* this is fine for IPv6 as ecn_mt_check6() enforces -p tcp */
+ if (par->fragoff)
+ return false;
+
/* In practice, TCP match does this, so can't fail. But let's
* be good citizens.
*/
diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index 3bd127bfc114..2704b4b60d1e 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -658,6 +658,8 @@ hashlimit_init_dst(const struct xt_hashlimit_htable *hinfo,
if (!(hinfo->cfg.mode &
(XT_HASHLIMIT_HASH_DPT | XT_HASHLIMIT_HASH_SPT)))
return 0;
+ if (ntohs(ip_hdr(skb)->frag_off) & IP_OFFSET)
+ return -1;
nexthdr = ip_hdr(skb)->protocol;
break;
#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
@@ -681,7 +683,7 @@ hashlimit_init_dst(const struct xt_hashlimit_htable *hinfo,
return 0;
nexthdr = ipv6_hdr(skb)->nexthdr;
protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr, &frag_off);
- if ((int)protoff < 0)
+ if ((int)protoff < 0 || ntohs(frag_off) & IP6_OFFSET)
return -1;
break;
}
diff --git a/net/netfilter/xt_osf.c b/net/netfilter/xt_osf.c
index dc9485854002..e8807caede68 100644
--- a/net/netfilter/xt_osf.c
+++ b/net/netfilter/xt_osf.c
@@ -27,6 +27,9 @@
static bool
xt_osf_match_packet(const struct sk_buff *skb, struct xt_action_param *p)
{
+ if (p->fragoff)
+ return false;
+
return nf_osf_match(skb, xt_family(p), xt_hooknum(p), xt_in(p),
xt_out(p), p->matchinfo, xt_net(p), nf_osf_fingers);
}
diff --git a/net/netfilter/xt_physdev.c b/net/netfilter/xt_physdev.c
index d2b0b52434fa..dd98f758176c 100644
--- a/net/netfilter/xt_physdev.c
+++ b/net/netfilter/xt_physdev.c
@@ -91,14 +91,10 @@ match_outdev:
return (!!ret ^ !(info->invert & XT_PHYSDEV_OP_OUT));
}
-static int physdev_mt_check(const struct xt_mtchk_param *par)
+static int physdev_mt_check_hooks(const struct xt_mtchk_param *par)
{
const struct xt_physdev_info *info = par->matchinfo;
- static bool brnf_probed __read_mostly;
- if (!(info->bitmask & XT_PHYSDEV_OP_MASK) ||
- info->bitmask & ~XT_PHYSDEV_OP_MASK)
- return -EINVAL;
if (info->bitmask & (XT_PHYSDEV_OP_OUT | XT_PHYSDEV_OP_ISOUT) &&
(!(info->bitmask & XT_PHYSDEV_OP_BRIDGED) ||
info->invert & XT_PHYSDEV_OP_BRIDGED) &&
@@ -107,6 +103,18 @@ static int physdev_mt_check(const struct xt_mtchk_param *par)
return -EINVAL;
}
+ return 0;
+}
+
+static int physdev_mt_check(const struct xt_mtchk_param *par)
+{
+ const struct xt_physdev_info *info = par->matchinfo;
+ static bool brnf_probed __read_mostly;
+
+ if (!(info->bitmask & XT_PHYSDEV_OP_MASK) ||
+ info->bitmask & ~XT_PHYSDEV_OP_MASK)
+ return -EINVAL;
+
#define X(memb) strnlen(info->memb, sizeof(info->memb)) >= sizeof(info->memb)
if (info->bitmask & XT_PHYSDEV_OP_IN) {
if (info->physindev[0] == '\0')
@@ -141,6 +149,7 @@ static struct xt_match physdev_mt_reg[] __read_mostly = {
{
.name = "physdev",
.family = NFPROTO_IPV4,
+ .check_hooks = physdev_mt_check_hooks,
.checkentry = physdev_mt_check,
.match = physdev_mt,
.matchsize = sizeof(struct xt_physdev_info),
@@ -149,6 +158,7 @@ static struct xt_match physdev_mt_reg[] __read_mostly = {
{
.name = "physdev",
.family = NFPROTO_IPV6,
+ .check_hooks = physdev_mt_check_hooks,
.checkentry = physdev_mt_check,
.match = physdev_mt,
.matchsize = sizeof(struct xt_physdev_info),
diff --git a/net/netfilter/xt_policy.c b/net/netfilter/xt_policy.c
index b5fa65558318..ff54e3a8581e 100644
--- a/net/netfilter/xt_policy.c
+++ b/net/netfilter/xt_policy.c
@@ -126,13 +126,10 @@ policy_mt(const struct sk_buff *skb, struct xt_action_param *par)
return ret;
}
-static int policy_mt_check(const struct xt_mtchk_param *par)
+static int policy_mt_check_hooks(const struct xt_mtchk_param *par)
{
const struct xt_policy_info *info = par->matchinfo;
- const char *errmsg = "neither incoming nor outgoing policy selected";
-
- if (!(info->flags & (XT_POLICY_MATCH_IN|XT_POLICY_MATCH_OUT)))
- goto err;
+ const char *errmsg;
if (par->hook_mask & ((1 << NF_INET_PRE_ROUTING) |
(1 << NF_INET_LOCAL_IN)) && info->flags & XT_POLICY_MATCH_OUT) {
@@ -144,6 +141,21 @@ static int policy_mt_check(const struct xt_mtchk_param *par)
errmsg = "input policy not valid in POSTROUTING and OUTPUT";
goto err;
}
+
+ return 0;
+err:
+ pr_info_ratelimited("%s\n", errmsg);
+ return -EINVAL;
+}
+
+static int policy_mt_check(const struct xt_mtchk_param *par)
+{
+ const struct xt_policy_info *info = par->matchinfo;
+ const char *errmsg = "neither incoming nor outgoing policy selected";
+
+ if (!(info->flags & (XT_POLICY_MATCH_IN|XT_POLICY_MATCH_OUT)))
+ goto err;
+
if (info->len > XT_POLICY_MAX_ELEM) {
errmsg = "too many policy elements";
goto err;
@@ -158,6 +170,7 @@ static struct xt_match policy_mt_reg[] __read_mostly = {
{
.name = "policy",
.family = NFPROTO_IPV4,
+ .check_hooks = policy_mt_check_hooks,
.checkentry = policy_mt_check,
.match = policy_mt,
.matchsize = sizeof(struct xt_policy_info),
@@ -166,6 +179,7 @@ static struct xt_match policy_mt_reg[] __read_mostly = {
{
.name = "policy",
.family = NFPROTO_IPV6,
+ .check_hooks = policy_mt_check_hooks,
.checkentry = policy_mt_check,
.match = policy_mt,
.matchsize = sizeof(struct xt_policy_info),
diff --git a/net/netfilter/xt_set.c b/net/netfilter/xt_set.c
index 731bc2cafae4..4ae04bba9358 100644
--- a/net/netfilter/xt_set.c
+++ b/net/netfilter/xt_set.c
@@ -431,6 +431,29 @@ set_target_v3(struct sk_buff *skb, const struct xt_action_param *par)
}
static int
+set_target_v3_check_hooks(const struct xt_tgchk_param *par)
+{
+ const struct xt_set_info_target_v3 *info = par->targinfo;
+
+ if (info->map_set.index != IPSET_INVALID_ID) {
+ if (strncmp(par->table, "mangle", 7)) {
+ pr_info_ratelimited("--map-set only usable from mangle table\n");
+ return -EINVAL;
+ }
+ if (((info->flags & IPSET_FLAG_MAP_SKBPRIO) |
+ (info->flags & IPSET_FLAG_MAP_SKBQUEUE)) &&
+ (par->hook_mask & ~(1 << NF_INET_FORWARD |
+ 1 << NF_INET_LOCAL_OUT |
+ 1 << NF_INET_POST_ROUTING))) {
+ pr_info_ratelimited("mapping of prio or/and queue is allowed only from OUTPUT/FORWARD/POSTROUTING chains\n");
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+static int
set_target_v3_checkentry(const struct xt_tgchk_param *par)
{
const struct xt_set_info_target_v3 *info = par->targinfo;
@@ -459,20 +482,6 @@ set_target_v3_checkentry(const struct xt_tgchk_param *par)
}
if (info->map_set.index != IPSET_INVALID_ID) {
- if (strncmp(par->table, "mangle", 7)) {
- pr_info_ratelimited("--map-set only usable from mangle table\n");
- ret = -EINVAL;
- goto cleanup_del;
- }
- if (((info->flags & IPSET_FLAG_MAP_SKBPRIO) |
- (info->flags & IPSET_FLAG_MAP_SKBQUEUE)) &&
- (par->hook_mask & ~(1 << NF_INET_FORWARD |
- 1 << NF_INET_LOCAL_OUT |
- 1 << NF_INET_POST_ROUTING))) {
- pr_info_ratelimited("mapping of prio or/and queue is allowed only from OUTPUT/FORWARD/POSTROUTING chains\n");
- ret = -EINVAL;
- goto cleanup_del;
- }
index = ip_set_nfnl_get_byindex(par->net,
info->map_set.index);
if (index == IPSET_INVALID_ID) {
@@ -672,6 +681,7 @@ static struct xt_target set_targets[] __read_mostly = {
.family = NFPROTO_IPV4,
.target = set_target_v3,
.targetsize = sizeof(struct xt_set_info_target_v3),
+ .check_hooks = set_target_v3_check_hooks,
.checkentry = set_target_v3_checkentry,
.destroy = set_target_v3_destroy,
.me = THIS_MODULE
@@ -682,6 +692,7 @@ static struct xt_target set_targets[] __read_mostly = {
.family = NFPROTO_IPV6,
.target = set_target_v3,
.targetsize = sizeof(struct xt_set_info_target_v3),
+ .check_hooks = set_target_v3_check_hooks,
.checkentry = set_target_v3_checkentry,
.destroy = set_target_v3_destroy,
.me = THIS_MODULE
diff --git a/net/netfilter/xt_tcpmss.c b/net/netfilter/xt_tcpmss.c
index 0d32d4841cb3..b9da8269161d 100644
--- a/net/netfilter/xt_tcpmss.c
+++ b/net/netfilter/xt_tcpmss.c
@@ -32,6 +32,10 @@ tcpmss_mt(const struct sk_buff *skb, struct xt_action_param *par)
u8 _opt[15 * 4 - sizeof(_tcph)];
unsigned int i, optlen;
+ /* this is fine for IPv6 as xt_tcpmss enforces -p tcp */
+ if (par->fragoff)
+ return false;
+
/* If we don't have the whole header, drop packet. */
th = skb_header_pointer(skb, par->thoff, sizeof(_tcph), &_tcph);
if (th == NULL)
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 2aeb0680807d..7269e23b578d 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -1482,9 +1482,14 @@ static void do_one_broadcast(struct sock *sk,
p->skb2 = NULL;
goto out;
}
- NETLINK_CB(p->skb2).nsid = peernet2id(sock_net(sk), p->net);
- if (NETLINK_CB(p->skb2).nsid != NETNSA_NSID_NOT_ASSIGNED)
- NETLINK_CB(p->skb2).nsid_is_set = true;
+
+ NETLINK_CB(p->skb2).nsid_is_set = false;
+ if (!net_eq(sock_net(sk), p->net)) {
+ NETLINK_CB(p->skb2).nsid = peernet2id(sock_net(sk), p->net);
+ if (NETLINK_CB(p->skb2).nsid != NETNSA_NSID_NOT_ASSIGNED)
+ NETLINK_CB(p->skb2).nsid_is_set = true;
+ }
+
val = netlink_broadcast_deliver(sk, p->skb2);
if (val < 0) {
netlink_overrun(sk);
diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c
index d251d894afd4..0da39eaed255 100644
--- a/net/netlink/genetlink.c
+++ b/net/netlink/genetlink.c
@@ -1972,8 +1972,10 @@ int genlmsg_multicast_allns(const struct genl_family *family,
struct sk_buff *skb, u32 portid,
unsigned int group)
{
- if (WARN_ON_ONCE(group >= family->n_mcgrps))
+ if (WARN_ON_ONCE(group >= family->n_mcgrps)) {
+ kfree_skb(skb);
return -EINVAL;
+ }
group = family->mcgrp_offset + group;
return genlmsg_mcast(skb, portid, group);
@@ -1986,8 +1988,10 @@ void genl_notify(const struct genl_family *family, struct sk_buff *skb,
struct net *net = genl_info_net(info);
struct sock *sk = net->genl_sock;
- if (WARN_ON_ONCE(group >= family->n_mcgrps))
+ if (WARN_ON_ONCE(group >= family->n_mcgrps)) {
+ kfree_skb(skb);
return;
+ }
group = family->mcgrp_offset + group;
nlmsg_notify(sk, skb, info->snd_portid, group,
diff --git a/net/nfc/hci/core.c b/net/nfc/hci/core.c
index 0d33c81a15fe..ba6f0310ffd7 100644
--- a/net/nfc/hci/core.c
+++ b/net/nfc/hci/core.c
@@ -861,6 +861,11 @@ static void nfc_hci_recv_from_llc(struct nfc_hci_dev *hdev, struct sk_buff *skb)
struct sk_buff *frag_skb;
int msg_len;
+ if (!pskb_may_pull(skb, NFC_HCI_HCP_PACKET_HEADER_LEN)) {
+ kfree_skb(skb);
+ return;
+ }
+
packet = (struct hcp_packet *)skb->data;
if ((packet->header & ~NFC_HCI_FRAGMENT) == 0) {
skb_queue_tail(&hdev->rx_hcp_frags, skb);
@@ -904,6 +909,11 @@ static void nfc_hci_recv_from_llc(struct nfc_hci_dev *hdev, struct sk_buff *skb)
* unblock waiting cmd context. Otherwise, enqueue to dispatch
* in separate context where handler can also execute command.
*/
+ if (!pskb_may_pull(hcp_skb, NFC_HCI_HCP_HEADER_LEN)) {
+ kfree_skb(hcp_skb);
+ return;
+ }
+
packet = (struct hcp_packet *)hcp_skb->data;
type = HCP_MSG_GET_TYPE(packet->message.header);
if (type == NFC_HCI_HCP_RESPONSE) {
diff --git a/net/nfc/llcp_core.c b/net/nfc/llcp_core.c
index db5bc6a878dd..dc65c719f35f 100644
--- a/net/nfc/llcp_core.c
+++ b/net/nfc/llcp_core.c
@@ -1218,6 +1218,15 @@ static void nfc_llcp_recv_cc(struct nfc_llcp_local *local,
sk = &llcp_sock->sk;
+ lock_sock(sk);
+
+ /* Check if socket was destroyed whilst waiting for the lock */
+ if (!sk_hashed(sk)) {
+ release_sock(sk);
+ nfc_llcp_sock_put(llcp_sock);
+ return;
+ }
+
/* Unlink from connecting and link to the client array */
nfc_llcp_sock_unlink(&local->connecting_sockets, sk);
nfc_llcp_sock_link(&local->sockets, sk);
@@ -1229,6 +1238,8 @@ static void nfc_llcp_recv_cc(struct nfc_llcp_local *local,
sk->sk_state = LLCP_CONNECTED;
sk->sk_state_change(sk);
+ release_sock(sk);
+
nfc_llcp_sock_put(llcp_sock);
}
diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c
index f1be1e84f665..feab29fc62f4 100644
--- a/net/nfc/llcp_sock.c
+++ b/net/nfc/llcp_sock.c
@@ -633,6 +633,8 @@ static int llcp_sock_release(struct socket *sock)
if (sock->type == SOCK_RAW)
nfc_llcp_sock_unlink(&local->raw_sockets, sk);
+ else if (sk->sk_state == LLCP_CONNECTING)
+ nfc_llcp_sock_unlink(&local->connecting_sockets, sk);
else
nfc_llcp_sock_unlink(&local->sockets, sk);
diff --git a/net/nfc/nci/hci.c b/net/nfc/nci/hci.c
index 40ae8e5a7ec7..c03e8a0bd3bd 100644
--- a/net/nfc/nci/hci.c
+++ b/net/nfc/nci/hci.c
@@ -439,6 +439,11 @@ void nci_hci_data_received_cb(void *context,
return;
}
+ if (!pskb_may_pull(skb, NCI_HCI_HCP_PACKET_HEADER_LEN)) {
+ kfree_skb(skb);
+ return;
+ }
+
packet = (struct nci_hcp_packet *)skb->data;
if ((packet->header & ~NCI_HCI_FRAGMENT) == 0) {
skb_queue_tail(&ndev->hci_dev->rx_hcp_frags, skb);
@@ -482,6 +487,11 @@ void nci_hci_data_received_cb(void *context,
* unblock waiting cmd context. Otherwise, enqueue to dispatch
* in separate context where handler can also execute command.
*/
+ if (!pskb_may_pull(hcp_skb, NCI_HCI_HCP_HEADER_LEN)) {
+ kfree_skb(hcp_skb);
+ return;
+ }
+
packet = (struct nci_hcp_packet *)hcp_skb->data;
type = NCI_HCP_MSG_GET_TYPE(packet->message.header);
if (type == NCI_HCI_HCP_RESPONSE) {
diff --git a/net/openvswitch/vport-geneve.c b/net/openvswitch/vport-geneve.c
index b10e1602c6b1..cb5ea4424ffc 100644
--- a/net/openvswitch/vport-geneve.c
+++ b/net/openvswitch/vport-geneve.c
@@ -97,6 +97,9 @@ static struct vport *geneve_tnl_create(const struct vport_parms *parms)
goto error;
}
+ vport->dev = dev;
+ netdev_hold(vport->dev, &vport->dev_tracker, GFP_KERNEL);
+
rtnl_unlock();
return vport;
error:
@@ -111,7 +114,7 @@ static struct vport *geneve_create(const struct vport_parms *parms)
if (IS_ERR(vport))
return vport;
- return ovs_netdev_link(vport, parms->name);
+ return ovs_netdev_link(vport, true);
}
static struct vport_ops ovs_geneve_vport_ops = {
diff --git a/net/openvswitch/vport-gre.c b/net/openvswitch/vport-gre.c
index 4014c9b5eb79..6cb5a697b396 100644
--- a/net/openvswitch/vport-gre.c
+++ b/net/openvswitch/vport-gre.c
@@ -63,6 +63,9 @@ static struct vport *gre_tnl_create(const struct vport_parms *parms)
return ERR_PTR(err);
}
+ vport->dev = dev;
+ netdev_hold(vport->dev, &vport->dev_tracker, GFP_KERNEL);
+
rtnl_unlock();
return vport;
}
@@ -75,7 +78,7 @@ static struct vport *gre_create(const struct vport_parms *parms)
if (IS_ERR(vport))
return vport;
- return ovs_netdev_link(vport, parms->name);
+ return ovs_netdev_link(vport, true);
}
static struct vport_ops ovs_gre_vport_ops = {
diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c
index 12055af832dc..e7e8490a53d8 100644
--- a/net/openvswitch/vport-netdev.c
+++ b/net/openvswitch/vport-netdev.c
@@ -73,37 +73,29 @@ static struct net_device *get_dpdev(const struct datapath *dp)
return local->dev;
}
-struct vport *ovs_netdev_link(struct vport *vport, const char *name)
+struct vport *ovs_netdev_link(struct vport *vport, bool tunnel)
{
int err;
- vport->dev = dev_get_by_name(ovs_dp_get_net(vport->dp), name);
- if (!vport->dev) {
+ if (WARN_ON_ONCE(!vport->dev)) {
err = -ENODEV;
goto error_free_vport;
}
- /* Ensure that the device exists and that the provided
- * name is not one of its aliases.
+
+ rtnl_lock();
+ /* Do not link devices that are not registered to avoid a potential
+ * race with the NETDEV_UNREGISTER notification in dp_device_event().
*/
- if (strcmp(name, ovs_vport_name(vport))) {
+ if (vport->dev->reg_state != NETREG_REGISTERED) {
err = -ENODEV;
- goto error_put;
- }
- netdev_tracker_alloc(vport->dev, &vport->dev_tracker, GFP_KERNEL);
- if (vport->dev->flags & IFF_LOOPBACK ||
- (vport->dev->type != ARPHRD_ETHER &&
- vport->dev->type != ARPHRD_NONE) ||
- ovs_is_internal_dev(vport->dev)) {
- err = -EINVAL;
- goto error_put;
+ goto error_put_unlock;
}
- rtnl_lock();
err = netdev_master_upper_dev_link(vport->dev,
get_dpdev(vport->dp),
NULL, NULL, NULL);
if (err)
- goto error_unlock;
+ goto error_put_unlock;
err = netdev_rx_handler_register(vport->dev, netdev_frame_hook,
vport);
@@ -119,10 +111,11 @@ struct vport *ovs_netdev_link(struct vport *vport, const char *name)
error_master_upper_dev_unlink:
netdev_upper_dev_unlink(vport->dev, get_dpdev(vport->dp));
-error_unlock:
- rtnl_unlock();
-error_put:
+error_put_unlock:
+ if (tunnel && vport->dev->reg_state == NETREG_REGISTERED)
+ rtnl_delete_link(vport->dev, 0, NULL);
netdev_put(vport->dev, &vport->dev_tracker);
+ rtnl_unlock();
error_free_vport:
ovs_vport_free(vport);
return ERR_PTR(err);
@@ -132,12 +125,39 @@ EXPORT_SYMBOL_GPL(ovs_netdev_link);
static struct vport *netdev_create(const struct vport_parms *parms)
{
struct vport *vport;
+ int err;
vport = ovs_vport_alloc(0, &ovs_netdev_vport_ops, parms);
if (IS_ERR(vport))
return vport;
- return ovs_netdev_link(vport, parms->name);
+ vport->dev = dev_get_by_name(ovs_dp_get_net(vport->dp), parms->name);
+ if (!vport->dev) {
+ err = -ENODEV;
+ goto error_free_vport;
+ }
+ netdev_tracker_alloc(vport->dev, &vport->dev_tracker, GFP_KERNEL);
+
+ /* Ensure that the provided name is not an alias. */
+ if (strcmp(parms->name, ovs_vport_name(vport))) {
+ err = -ENODEV;
+ goto error_put;
+ }
+
+ if (vport->dev->flags & IFF_LOOPBACK ||
+ (vport->dev->type != ARPHRD_ETHER &&
+ vport->dev->type != ARPHRD_NONE) ||
+ ovs_is_internal_dev(vport->dev)) {
+ err = -EINVAL;
+ goto error_put;
+ }
+
+ return ovs_netdev_link(vport, false);
+error_put:
+ netdev_put(vport->dev, &vport->dev_tracker);
+error_free_vport:
+ ovs_vport_free(vport);
+ return ERR_PTR(err);
}
static void vport_netdev_free(struct rcu_head *rcu)
@@ -196,9 +216,13 @@ void ovs_netdev_tunnel_destroy(struct vport *vport)
*/
if (vport->dev->reg_state == NETREG_REGISTERED)
rtnl_delete_link(vport->dev, 0, NULL);
- rtnl_unlock();
+ /* We can't put the device reference yet, since it can still be in
+ * use, but rtnl_unlock()->netdev_run_todo() will block until all
+ * the references are released, so the RCU call must be before it.
+ */
call_rcu(&vport->rcu, vport_netdev_free);
+ rtnl_unlock();
}
EXPORT_SYMBOL_GPL(ovs_netdev_tunnel_destroy);
diff --git a/net/openvswitch/vport-netdev.h b/net/openvswitch/vport-netdev.h
index c5d83a43bfc4..6c0d7366f986 100644
--- a/net/openvswitch/vport-netdev.h
+++ b/net/openvswitch/vport-netdev.h
@@ -13,7 +13,7 @@
struct vport *ovs_netdev_get_vport(struct net_device *dev);
-struct vport *ovs_netdev_link(struct vport *vport, const char *name);
+struct vport *ovs_netdev_link(struct vport *vport, bool tunnel);
void ovs_netdev_detach_dev(struct vport *);
int __init ovs_netdev_init(void);
diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c
index 0b881b043bcf..c1b37b50d29e 100644
--- a/net/openvswitch/vport-vxlan.c
+++ b/net/openvswitch/vport-vxlan.c
@@ -126,6 +126,9 @@ static struct vport *vxlan_tnl_create(const struct vport_parms *parms)
goto error;
}
+ vport->dev = dev;
+ netdev_hold(vport->dev, &vport->dev_tracker, GFP_KERNEL);
+
rtnl_unlock();
return vport;
error:
@@ -140,7 +143,7 @@ static struct vport *vxlan_create(const struct vport_parms *parms)
if (IS_ERR(vport))
return vport;
- return ovs_netdev_link(vport, parms->name);
+ return ovs_netdev_link(vport, true);
}
static struct vport_ops ovs_vxlan_netdev_vport_ops = {
diff --git a/net/phonet/pep.c b/net/phonet/pep.c
index 4dbf0914df7d..706927139393 100644
--- a/net/phonet/pep.c
+++ b/net/phonet/pep.c
@@ -671,8 +671,23 @@ static int pep_do_rcv(struct sock *sk, struct sk_buff *skb)
/* Look for an existing pipe handle */
sknode = pep_find_pipe(&pn->hlist, &dst, pipe_handle);
- if (sknode)
- return sk_receive_skb(sknode, skb, 1);
+ if (sknode) {
+ int rc;
+
+ /* pep_do_rcv() runs from two contexts: from softirq via
+ * phonet_rcv() -> __sk_receive_skb() with BH disabled,
+ * and from process context via
+ * release_sock() -> __release_sock(), which drops
+ * the listener slock with spin_unlock_bh() before draining
+ * the backlog. The child pipe slock is taken below via
+ * bh_lock_sock_nested(), which does not itself disable BH, so
+ * disable BH here to keep both acquire contexts consistent.
+ */
+ local_bh_disable();
+ rc = sk_receive_skb(sknode, skb, 1);
+ local_bh_enable();
+ return rc;
+ }
switch (hdr->message_id) {
case PNS_PEP_CONNECT_REQ:
diff --git a/net/psp/psp_main.c b/net/psp/psp_main.c
index 9508b6c38003..e45549f08eef 100644
--- a/net/psp/psp_main.c
+++ b/net/psp/psp_main.c
@@ -263,15 +263,16 @@ EXPORT_SYMBOL(psp_dev_encapsulate);
/* Receive handler for PSP packets.
*
- * Presently it accepts only already-authenticated packets and does not
- * support optional fields, such as virtualization cookies. The caller should
- * ensure that skb->data is pointing to the mac header, and that skb->mac_len
- * is set. This function does not currently adjust skb->csum (CHECKSUM_COMPLETE
- * is not supported).
+ * Accepts only already-authenticated packets. The full PSP header is
+ * stripped according to psph->hdrlen; any optional fields it advertises
+ * (virtualization cookies, etc.) are ignored and discarded along with the
+ * rest of the header. The caller should ensure that skb->data is pointing
+ * to the mac header, and that skb->mac_len is set. This function does not
+ * currently adjust skb->csum (CHECKSUM_COMPLETE is not supported).
*/
int psp_dev_rcv(struct sk_buff *skb, u16 dev_id, u8 generation, bool strip_icv)
{
- int l2_hlen = 0, l3_hlen, encap;
+ int l2_hlen = 0, l3_hlen, encap, psp_hlen;
struct psp_skb_ext *pse;
struct psphdr *psph;
struct ethhdr *eth;
@@ -312,18 +313,36 @@ int psp_dev_rcv(struct sk_buff *skb, u16 dev_id, u8 generation, bool strip_icv)
if (unlikely(uh->dest != htons(PSP_DEFAULT_UDP_PORT)))
return -EINVAL;
- pse = skb_ext_add(skb, SKB_EXT_PSP);
- if (!pse)
+ psph = (struct psphdr *)(skb->data + l2_hlen + l3_hlen +
+ sizeof(struct udphdr));
+
+ /* Strip the full PSP header per psph->hdrlen; VC/options are pulled
+ * into the linear region only so they can be discarded with the
+ * rest of the header.
+ */
+ psp_hlen = (psph->hdrlen + 1) * 8;
+
+ if (unlikely(psp_hlen < sizeof(struct psphdr)))
+ return -EINVAL;
+
+ if (psp_hlen > sizeof(struct psphdr) &&
+ !pskb_may_pull(skb, l2_hlen + l3_hlen +
+ sizeof(struct udphdr) + psp_hlen))
return -EINVAL;
psph = (struct psphdr *)(skb->data + l2_hlen + l3_hlen +
sizeof(struct udphdr));
+
+ pse = skb_ext_add(skb, SKB_EXT_PSP);
+ if (!pse)
+ return -EINVAL;
+
pse->spi = psph->spi;
pse->dev_id = dev_id;
pse->generation = generation;
pse->version = FIELD_GET(PSPHDR_VERFL_VERSION, psph->verfl);
- encap = PSP_ENCAP_HLEN;
+ encap = sizeof(struct udphdr) + psp_hlen;
encap += strip_icv ? PSP_TRL_SIZE : 0;
if (proto == htons(ETH_P_IP)) {
@@ -340,8 +359,9 @@ int psp_dev_rcv(struct sk_buff *skb, u16 dev_id, u8 generation, bool strip_icv)
ipv6h->payload_len = htons(ntohs(ipv6h->payload_len) - encap);
}
- memmove(skb->data + PSP_ENCAP_HLEN, skb->data, l2_hlen + l3_hlen);
- skb_pull(skb, PSP_ENCAP_HLEN);
+ memmove(skb->data + sizeof(struct udphdr) + psp_hlen,
+ skb->data, l2_hlen + l3_hlen);
+ skb_pull(skb, sizeof(struct udphdr) + psp_hlen);
if (strip_icv)
pskb_trim(skb, skb->len - PSP_TRL_SIZE);
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index 0c64c504f79d..4001de0c4959 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -656,6 +656,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
sends_out:
vfree(ic->i_sends);
+ ic->i_sends = NULL;
ack_dma_out:
rds_dma_hdr_free(rds_ibdev->dev, ic->i_ack, ic->i_ack_dma,
diff --git a/net/rds/message.c b/net/rds/message.c
index eaa6f22601a4..7feb0eb6537d 100644
--- a/net/rds/message.c
+++ b/net/rds/message.c
@@ -131,24 +131,34 @@ static void rds_rm_zerocopy_callback(struct rds_sock *rs,
*/
static void rds_message_purge(struct rds_message *rm)
{
+ struct rds_znotifier *znotifier;
unsigned long i, flags;
- bool zcopy = false;
+ bool zcopy;
if (unlikely(test_bit(RDS_MSG_PAGEVEC, &rm->m_flags)))
return;
spin_lock_irqsave(&rm->m_rs_lock, flags);
+ znotifier = rm->data.op_mmp_znotifier;
+ rm->data.op_mmp_znotifier = NULL;
+ zcopy = !!znotifier;
+
if (rm->m_rs) {
struct rds_sock *rs = rm->m_rs;
- if (rm->data.op_mmp_znotifier) {
- zcopy = true;
- rds_rm_zerocopy_callback(rs, rm->data.op_mmp_znotifier);
+ if (znotifier) {
+ rds_rm_zerocopy_callback(rs, znotifier);
rds_wake_sk_sleep(rs);
- rm->data.op_mmp_znotifier = NULL;
}
sock_put(rds_rs_to_sk(rs));
rm->m_rs = NULL;
+ } else if (znotifier) {
+ /*
+ * Zerocopy can fail before the message is queued on the
+ * socket, so there is no rs to carry the notification.
+ */
+ mm_unaccount_pinned_pages(&znotifier->z_mmp);
+ kfree(rds_info_from_znotifier(znotifier));
}
spin_unlock_irqrestore(&rm->m_rs_lock, flags);
@@ -438,6 +448,7 @@ static int rds_message_zcopy_from_user(struct rds_message *rm, struct iov_iter *
for (i = 0; i < rm->data.op_nents; i++)
put_page(sg_page(&rm->data.op_sg[i]));
+ rm->data.op_nents = 0;
mmp = &rm->data.op_mmp_znotifier->z_mmp;
mm_unaccount_pinned_pages(mmp);
ret = -EFAULT;
diff --git a/net/rds/tcp.c b/net/rds/tcp.c
index 654e23d13e3d..5830b31a1f37 100644
--- a/net/rds/tcp.c
+++ b/net/rds/tcp.c
@@ -198,8 +198,13 @@ void rds_tcp_set_callbacks(struct socket *sock, struct rds_conn_path *cp)
rdsdebug("setting sock %p callbacks to tc %p\n", sock, tc);
write_lock_bh(&sock->sk->sk_callback_lock);
- /* done under the callback_lock to serialize with write_space */
+ /* done under the callback_lock to serialize with write_space.
+ * Set t_sock inside rds_tcp_tc_list_lock so readers walking
+ * rds_tcp_tc_list under the same lock cannot observe an
+ * entry whose t_sock is NULL.
+ */
spin_lock(&rds_tcp_tc_list_lock);
+ tc->t_sock = sock;
list_add_tail(&tc->t_list_item, &rds_tcp_tc_list);
#if IS_ENABLED(CONFIG_IPV6)
rds6_tcp_tc_count++;
@@ -211,8 +216,6 @@ void rds_tcp_set_callbacks(struct socket *sock, struct rds_conn_path *cp)
/* accepted sockets need our listen data ready undone */
if (sock->sk->sk_data_ready == rds_tcp_listen_data_ready)
sock->sk->sk_data_ready = sock->sk->sk_user_data;
-
- tc->t_sock = sock;
if (!tc->t_rtn)
tc->t_rtn = net_generic(sock_net(sock->sk), rds_tcp_netid);
tc->t_cpath = cp;
diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
index 27c2aa2dd023..98f2165159d7 100644
--- a/net/rxrpc/ar-internal.h
+++ b/net/rxrpc/ar-internal.h
@@ -213,8 +213,6 @@ struct rxrpc_skb_priv {
struct {
u16 offset; /* Offset of data */
u16 len; /* Length of data */
- u8 flags;
-#define RXRPC_RX_VERIFIED 0x01
};
struct {
rxrpc_seq_t first_ack; /* First packet in acks table */
@@ -309,15 +307,16 @@ struct rxrpc_security {
struct sk_buff *challenge);
/* verify a response */
- int (*verify_response)(struct rxrpc_connection *,
- struct sk_buff *);
+ int (*verify_response)(struct rxrpc_connection *conn,
+ struct sk_buff *response_skb,
+ void *response, unsigned int len);
/* clear connection security */
void (*clear)(struct rxrpc_connection *);
/* Default ticket -> key decoder */
int (*default_decode_ticket)(struct rxrpc_connection *conn, struct sk_buff *skb,
- unsigned int ticket_offset, unsigned int ticket_len,
+ void *ticket, unsigned int ticket_len,
struct key **_key);
};
@@ -774,6 +773,11 @@ struct rxrpc_call {
struct sk_buff_head recvmsg_queue; /* Queue of packets ready for recvmsg() */
struct sk_buff_head rx_queue; /* Queue of packets for this call to receive */
struct sk_buff_head rx_oos_queue; /* Queue of out of sequence packets */
+ void *rx_dec_buffer; /* Decryption buffer */
+ unsigned short rx_dec_bsize; /* rx_dec_buffer size */
+ unsigned short rx_dec_offset; /* Decrypted packet data offset */
+ unsigned short rx_dec_len; /* Decrypted packet data len */
+ rxrpc_seq_t rx_dec_seq; /* Packet in decryption buffer */
rxrpc_seq_t rx_highest_seq; /* Higest sequence number received */
rxrpc_seq_t rx_consumed; /* Highest packet consumed */
diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c
index fdd683261226..fec59d9338b9 100644
--- a/net/rxrpc/call_event.c
+++ b/net/rxrpc/call_event.c
@@ -332,25 +332,7 @@ bool rxrpc_input_call_event(struct rxrpc_call *call)
saw_ack |= sp->hdr.type == RXRPC_PACKET_TYPE_ACK;
- if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA &&
- sp->hdr.securityIndex != 0 &&
- skb_cloned(skb)) {
- /* Unshare the packet so that it can be
- * modified by in-place decryption.
- */
- struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC);
-
- if (nskb) {
- rxrpc_new_skb(nskb, rxrpc_skb_new_unshared);
- rxrpc_input_call_packet(call, nskb);
- rxrpc_free_skb(nskb, rxrpc_skb_put_call_rx);
- } else {
- /* OOM - Drop the packet. */
- rxrpc_see_skb(skb, rxrpc_skb_see_unshare_nomem);
- }
- } else {
- rxrpc_input_call_packet(call, skb);
- }
+ rxrpc_input_call_packet(call, skb);
rxrpc_free_skb(skb, rxrpc_skb_put_call_rx);
did_receive = true;
}
diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c
index f035f486c139..fcb9d38bb521 100644
--- a/net/rxrpc/call_object.c
+++ b/net/rxrpc/call_object.c
@@ -152,6 +152,7 @@ struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *rx, gfp_t gfp,
spin_lock_init(&call->notify_lock);
refcount_set(&call->ref, 1);
call->debug_id = debug_id;
+ call->rx_pkt_offset = USHRT_MAX;
call->tx_total_len = -1;
call->tx_jumbo_max = 1;
call->next_rx_timo = 20 * HZ;
@@ -553,6 +554,7 @@ static void rxrpc_cleanup_rx_buffers(struct rxrpc_call *call)
rxrpc_purge_queue(&call->recvmsg_queue);
rxrpc_purge_queue(&call->rx_queue);
rxrpc_purge_queue(&call->rx_oos_queue);
+ kfree(call->rx_dec_buffer);
}
/*
diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c
index a2130d25aaa9..c96ca615b787 100644
--- a/net/rxrpc/conn_event.c
+++ b/net/rxrpc/conn_event.c
@@ -243,27 +243,22 @@ static void rxrpc_call_is_secure(struct rxrpc_call *call)
static int rxrpc_verify_response(struct rxrpc_connection *conn,
struct sk_buff *skb)
{
+ unsigned int len = skb->len - sizeof(struct rxrpc_wire_header);
+ void *buffer;
int ret;
- if (skb_cloned(skb)) {
- /* Copy the packet if shared so that we can do in-place
- * decryption.
- */
- struct sk_buff *nskb = skb_copy(skb, GFP_NOFS);
+ buffer = kmalloc(len, GFP_NOFS);
+ if (!buffer)
+ return -ENOMEM;
- if (nskb) {
- rxrpc_new_skb(nskb, rxrpc_skb_new_unshared);
- ret = conn->security->verify_response(conn, nskb);
- rxrpc_free_skb(nskb, rxrpc_skb_put_response_copy);
- } else {
- /* OOM - Drop the packet. */
- rxrpc_see_skb(skb, rxrpc_skb_see_unshare_nomem);
- ret = -ENOMEM;
- }
- } else {
- ret = conn->security->verify_response(conn, skb);
- }
+ ret = skb_copy_bits(skb, sizeof(struct rxrpc_wire_header), buffer, len);
+ if (ret < 0)
+ goto out;
+
+ ret = conn->security->verify_response(conn, skb, buffer, len);
+out:
+ kfree(buffer);
return ret;
}
diff --git a/net/rxrpc/insecure.c b/net/rxrpc/insecure.c
index 0a260df45d25..0b39046bdc61 100644
--- a/net/rxrpc/insecure.c
+++ b/net/rxrpc/insecure.c
@@ -32,9 +32,6 @@ static int none_secure_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb)
static int none_verify_packet(struct rxrpc_call *call, struct sk_buff *skb)
{
- struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
-
- sp->flags |= RXRPC_RX_VERIFIED;
return 0;
}
@@ -57,9 +54,10 @@ static int none_sendmsg_respond_to_challenge(struct sk_buff *challenge,
}
static int none_verify_response(struct rxrpc_connection *conn,
- struct sk_buff *skb)
+ struct sk_buff *response_skb,
+ void *response, unsigned int len)
{
- return rxrpc_abort_conn(conn, skb, RX_PROTOCOL_ERROR, -EPROTO,
+ return rxrpc_abort_conn(conn, response_skb, RX_PROTOCOL_ERROR, -EPROTO,
rxrpc_eproto_rxnull_response);
}
diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c
index e1f7513a46db..c940600117a4 100644
--- a/net/rxrpc/recvmsg.c
+++ b/net/rxrpc/recvmsg.c
@@ -147,15 +147,52 @@ static void rxrpc_rotate_rx_window(struct rxrpc_call *call)
}
/*
- * Decrypt and verify a DATA packet.
+ * Decrypt and verify a DATA packet. The content of the packet is pulled out
+ * into a flat buffer rather than decrypting in place in the skbuff. This also
+ * has the advantage of aligning the buffer correctly for the crypto routines.
+ *
+ * We keep track of the sequence number of the packet currently decrypted into
+ * the buffer in ->rx_dec_seq. If MSG_PEEK is used and steps onto a new
+ * packet, subsequent recvmsg() calls will have to go back and re-decrypt the
+ * current packet.
*/
static int rxrpc_verify_data(struct rxrpc_call *call, struct sk_buff *skb)
{
struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+ int ret;
- if (sp->flags & RXRPC_RX_VERIFIED)
- return 0;
- return call->security->verify_packet(call, skb);
+ if (sp->len > call->rx_dec_bsize) {
+ /* Make sure we can hold a 1412-byte jumbo subpacket and make
+ * sure that the buffer size is aligned to a crypto blocksize.
+ */
+ size_t size = clamp(round_up(sp->len, 32), 2048, 65535);
+ void *buffer = krealloc(call->rx_dec_buffer, size, GFP_NOFS);
+
+ if (!buffer)
+ return -ENOMEM;
+ call->rx_dec_buffer = buffer;
+ call->rx_dec_bsize = size;
+ }
+
+ ret = -EFAULT;
+ if (skb_copy_bits(skb, sp->offset, call->rx_dec_buffer, sp->len) < 0)
+ goto err;
+
+ call->rx_dec_offset = 0;
+ call->rx_dec_len = sp->len;
+ call->rx_dec_seq = sp->hdr.seq;
+ ret = call->security->verify_packet(call, skb);
+ if (ret < 0)
+ goto err;
+ return 0;
+
+err:
+ kfree(call->rx_dec_buffer);
+ call->rx_dec_buffer = NULL;
+ call->rx_dec_bsize = 0;
+ call->rx_dec_offset = 0;
+ call->rx_dec_len = 0;
+ return ret;
}
/*
@@ -283,16 +320,21 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call,
if (msg)
sock_recv_timestamp(msg, sock->sk, skb);
- if (rx_pkt_offset == 0) {
+ if (call->rx_dec_seq != sp->hdr.seq ||
+ !call->rx_dec_buffer) {
ret2 = rxrpc_verify_data(call, skb);
trace_rxrpc_recvdata(call, rxrpc_recvmsg_next, seq,
- sp->offset, sp->len, ret2);
+ call->rx_dec_offset,
+ call->rx_dec_len, ret2);
if (ret2 < 0) {
ret = ret2;
goto out;
}
- rx_pkt_offset = sp->offset;
- rx_pkt_len = sp->len;
+ }
+
+ if (rx_pkt_offset == USHRT_MAX) {
+ rx_pkt_offset = call->rx_dec_offset;
+ rx_pkt_len = call->rx_dec_len;
} else {
trace_rxrpc_recvdata(call, rxrpc_recvmsg_cont, seq,
rx_pkt_offset, rx_pkt_len, 0);
@@ -304,10 +346,10 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call,
if (copy > remain)
copy = remain;
if (copy > 0) {
- ret2 = skb_copy_datagram_iter(skb, rx_pkt_offset, iter,
- copy);
- if (ret2 < 0) {
- ret = ret2;
+ ret2 = copy_to_iter(call->rx_dec_buffer + rx_pkt_offset,
+ copy, iter);
+ if (ret2 != copy) {
+ ret = -EFAULT;
goto out;
}
@@ -328,7 +370,7 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call,
/* The whole packet has been transferred. */
if (sp->hdr.flags & RXRPC_LAST_PACKET)
ret = 1;
- rx_pkt_offset = 0;
+ rx_pkt_offset = USHRT_MAX;
rx_pkt_len = 0;
skb = skb_peek_next(skb, &call->recvmsg_queue);
diff --git a/net/rxrpc/rxgk.c b/net/rxrpc/rxgk.c
index 0d5e654da918..a1ee102abae1 100644
--- a/net/rxrpc/rxgk.c
+++ b/net/rxrpc/rxgk.c
@@ -473,15 +473,20 @@ static int rxgk_verify_packet_integrity(struct rxrpc_call *call,
struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
struct rxgk_header *hdr;
struct krb5_buffer metadata;
- unsigned int offset = sp->offset, len = sp->len;
+ unsigned int len = call->rx_dec_len;
size_t data_offset = 0, data_len = len;
+ void *data = call->rx_dec_buffer, *p = data;
u32 ac = 0;
int ret = -ENOMEM;
_enter("");
- crypto_krb5_where_is_the_data(gk->krb5, KRB5_CHECKSUM_MODE,
- &data_offset, &data_len);
+ if (crypto_krb5_where_is_the_data(gk->krb5, KRB5_CHECKSUM_MODE,
+ &data_offset, &data_len) < 0) {
+ ret = rxrpc_abort_eproto(call, skb, RXGK_PACKETSHORT,
+ rxgk_abort_1_short_header);
+ goto put_gk;
+ }
hdr = kzalloc_obj(*hdr, GFP_NOFS);
if (!hdr)
@@ -496,16 +501,15 @@ static int rxgk_verify_packet_integrity(struct rxrpc_call *call,
metadata.len = sizeof(*hdr);
metadata.data = hdr;
- ret = rxgk_verify_mic_skb(gk->krb5, gk->rx_Kc, &metadata,
- skb, &offset, &len, &ac);
+ ret = rxgk_verify_mic(gk->krb5, gk->rx_Kc, &metadata, &p, &len, &ac);
kfree(hdr);
if (ret < 0) {
if (ret != -ENOMEM)
rxrpc_abort_eproto(call, skb, ac,
rxgk_abort_1_verify_mic_eproto);
} else {
- sp->offset = offset;
- sp->len = len;
+ call->rx_dec_offset = p - data;
+ call->rx_dec_len = len;
}
put_gk:
@@ -522,49 +526,53 @@ static int rxgk_verify_packet_encrypted(struct rxrpc_call *call,
struct sk_buff *skb)
{
struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
- struct rxgk_header hdr;
- unsigned int offset = sp->offset, len = sp->len;
+ struct rxgk_header *hdr;
+ unsigned int offset = 0, len = call->rx_dec_len;
+ void *data = call->rx_dec_buffer, *p = data;
int ret;
u32 ac = 0;
_enter("");
- ret = rxgk_decrypt_skb(gk->krb5, gk->rx_enc, skb, &offset, &len, &ac);
+ if (crypto_krb5_check_data_len(gk->krb5, KRB5_ENCRYPT_MODE,
+ len, sizeof(*hdr)) < 0) {
+ ret = rxrpc_abort_eproto(call, skb, RXGK_PACKETSHORT,
+ rxgk_abort_2_short_header);
+ goto error;
+ }
+
+ ret = rxgk_decrypt(gk->krb5, gk->rx_enc, &p, &len, &ac);
if (ret < 0) {
if (ret != -ENOMEM)
rxrpc_abort_eproto(call, skb, ac, rxgk_abort_2_decrypt_eproto);
goto error;
}
+ offset = p - data;
- if (len < sizeof(hdr)) {
+ if (len < sizeof(*hdr)) {
ret = rxrpc_abort_eproto(call, skb, RXGK_PACKETSHORT,
rxgk_abort_2_short_header);
goto error;
}
/* Extract the header from the skb */
- ret = skb_copy_bits(skb, offset, &hdr, sizeof(hdr));
- if (ret < 0) {
- ret = rxrpc_abort_eproto(call, skb, RXGK_PACKETSHORT,
- rxgk_abort_2_short_encdata);
- goto error;
- }
- offset += sizeof(hdr);
- len -= sizeof(hdr);
-
- if (ntohl(hdr.epoch) != call->conn->proto.epoch ||
- ntohl(hdr.cid) != call->cid ||
- ntohl(hdr.call_number) != call->call_id ||
- ntohl(hdr.seq) != sp->hdr.seq ||
- ntohl(hdr.sec_index) != call->security_ix ||
- ntohl(hdr.data_len) > len) {
+ hdr = data + offset;
+ offset += sizeof(*hdr);
+ len -= sizeof(*hdr);
+
+ if (ntohl(hdr->epoch) != call->conn->proto.epoch ||
+ ntohl(hdr->cid) != call->cid ||
+ ntohl(hdr->call_number) != call->call_id ||
+ ntohl(hdr->seq) != sp->hdr.seq ||
+ ntohl(hdr->sec_index) != call->security_ix ||
+ ntohl(hdr->data_len) > len) {
ret = rxrpc_abort_eproto(call, skb, RXGK_SEALEDINCON,
rxgk_abort_2_short_data);
goto error;
}
- sp->offset = offset;
- sp->len = ntohl(hdr.data_len);
+ call->rx_dec_offset = offset;
+ call->rx_dec_len = ntohl(hdr->data_len);
ret = 0;
error:
rxgk_put(gk);
@@ -1076,11 +1084,12 @@ static int rxgk_sendmsg_respond_to_challenge(struct sk_buff *challenge,
* unsigned int call_numbers<>;
* };
*/
-static int rxgk_do_verify_authenticator(struct rxrpc_connection *conn,
- const struct krb5_enctype *krb5,
- struct sk_buff *skb,
- __be32 *p, __be32 *end)
+static int rxgk_verify_authenticator(struct rxrpc_connection *conn,
+ const struct krb5_enctype *krb5,
+ struct sk_buff *skb,
+ void *auth, unsigned int auth_len)
{
+ __be32 *p = auth, *end = auth + auth_len;
u32 app_len, call_count, level, epoch, cid, i;
_enter("");
@@ -1144,37 +1153,6 @@ static int rxgk_do_verify_authenticator(struct rxrpc_connection *conn,
}
/*
- * Extract the authenticator and verify it.
- */
-static int rxgk_verify_authenticator(struct rxrpc_connection *conn,
- const struct krb5_enctype *krb5,
- struct sk_buff *skb,
- unsigned int auth_offset, unsigned int auth_len)
-{
- void *auth;
- __be32 *p;
- int ret;
-
- auth = kmalloc(auth_len, GFP_NOFS);
- if (!auth)
- return -ENOMEM;
-
- ret = skb_copy_bits(skb, auth_offset, auth, auth_len);
- if (ret < 0) {
- ret = rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EPROTO,
- rxgk_abort_resp_short_auth);
- goto error;
- }
-
- p = auth;
- ret = rxgk_do_verify_authenticator(conn, krb5, skb, p,
- p + auth_len / sizeof(*p));
-error:
- kfree(auth);
- return ret;
-}
-
-/*
* Verify a response.
*
* struct RXGK_Response {
@@ -1184,49 +1162,45 @@ error:
* };
*/
static int rxgk_verify_response(struct rxrpc_connection *conn,
- struct sk_buff *skb)
+ struct sk_buff *skb,
+ void *buffer, unsigned int len)
{
const struct krb5_enctype *krb5;
struct rxrpc_key_token *token;
struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
- struct rxgk_response rhdr;
+ struct rxgk_response *rhdr;
struct rxgk_context *gk;
struct key *key = NULL;
- unsigned int offset = sizeof(struct rxrpc_wire_header);
- unsigned int len = skb->len - sizeof(struct rxrpc_wire_header);
- unsigned int token_offset, token_len;
- unsigned int auth_offset, auth_len;
+ unsigned int resp_token_len, auth_len;
+ void *resp_token, *auth;
__be32 xauth_len;
int ret, ec;
_enter("{%d}", conn->debug_id);
/* Parse the RXGK_Response object */
- if (sizeof(rhdr) + sizeof(__be32) > len)
+ if (len < sizeof(*rhdr) + sizeof(__be32))
goto short_packet;
-
- if (skb_copy_bits(skb, offset, &rhdr, sizeof(rhdr)) < 0)
- goto short_packet;
- offset += sizeof(rhdr);
- len -= sizeof(rhdr);
-
- token_offset = offset;
- token_len = ntohl(rhdr.token_len);
- if (token_len > len ||
- xdr_round_up(token_len) + sizeof(__be32) > len)
+ rhdr = buffer;
+ buffer += sizeof(*rhdr);
+ len -= sizeof(*rhdr);
+
+ resp_token = buffer;
+ resp_token_len = ntohl(rhdr->token_len);
+ if (resp_token_len > len ||
+ xdr_round_up(resp_token_len) + sizeof(__be32) > len)
goto short_packet;
- trace_rxrpc_rx_response(conn, sp->hdr.serial, 0, sp->hdr.cksum, token_len);
+ trace_rxrpc_rx_response(conn, sp->hdr.serial, 0, sp->hdr.cksum, resp_token_len);
- offset += xdr_round_up(token_len);
- len -= xdr_round_up(token_len);
+ buffer += xdr_round_up(resp_token_len);
+ len -= xdr_round_up(resp_token_len);
- if (skb_copy_bits(skb, offset, &xauth_len, sizeof(xauth_len)) < 0)
- goto short_packet;
- offset += sizeof(xauth_len);
+ xauth_len = *(__be32 *)buffer;
+ buffer += sizeof(xauth_len);
len -= sizeof(xauth_len);
- auth_offset = offset;
+ auth = buffer;
auth_len = ntohl(xauth_len);
if (auth_len > len)
goto short_packet;
@@ -1241,7 +1215,7 @@ static int rxgk_verify_response(struct rxrpc_connection *conn,
* to the app to deal with - which might mean a round trip to
* userspace.
*/
- ret = rxgk_extract_token(conn, skb, token_offset, token_len, &key);
+ ret = rxgk_extract_token(conn, skb, resp_token, resp_token_len, &key);
if (ret < 0)
goto out;
@@ -1255,7 +1229,7 @@ static int rxgk_verify_response(struct rxrpc_connection *conn,
*/
token = key->payload.data[0];
conn->security_level = token->rxgk->level;
- conn->rxgk.start_time = __be64_to_cpu(rhdr.start_time);
+ conn->rxgk.start_time = __be64_to_cpu(rhdr->start_time);
gk = rxgk_generate_transport_key(conn, token->rxgk, sp->hdr.cksum, GFP_NOFS);
if (IS_ERR(gk)) {
@@ -1265,18 +1239,18 @@ static int rxgk_verify_response(struct rxrpc_connection *conn,
krb5 = gk->krb5;
- trace_rxrpc_rx_response(conn, sp->hdr.serial, krb5->etype, sp->hdr.cksum, token_len);
+ trace_rxrpc_rx_response(conn, sp->hdr.serial, krb5->etype, sp->hdr.cksum,
+ resp_token_len);
/* Decrypt, parse and verify the authenticator. */
- ret = rxgk_decrypt_skb(krb5, gk->resp_enc, skb,
- &auth_offset, &auth_len, &ec);
+ ret = rxgk_decrypt(krb5, gk->resp_enc, &auth, &auth_len, &ec);
if (ret < 0) {
rxrpc_abort_conn(conn, skb, RXGK_SEALEDINCON, ret,
rxgk_abort_resp_auth_dec);
goto out_gk;
}
- ret = rxgk_verify_authenticator(conn, krb5, skb, auth_offset, auth_len);
+ ret = rxgk_verify_authenticator(conn, krb5, skb, auth, auth_len);
if (ret < 0)
goto out_gk;
diff --git a/net/rxrpc/rxgk_app.c b/net/rxrpc/rxgk_app.c
index 0ef2a29eb695..200a30064fae 100644
--- a/net/rxrpc/rxgk_app.c
+++ b/net/rxrpc/rxgk_app.c
@@ -40,7 +40,7 @@
* };
*/
int rxgk_yfs_decode_ticket(struct rxrpc_connection *conn, struct sk_buff *skb,
- unsigned int ticket_offset, unsigned int ticket_len,
+ void *buffer, unsigned int ticket_len,
struct key **_key)
{
struct rxrpc_key_token *token;
@@ -49,7 +49,7 @@ int rxgk_yfs_decode_ticket(struct rxrpc_connection *conn, struct sk_buff *skb,
size_t pre_ticket_len, payload_len;
unsigned int klen, enctype;
void *payload, *ticket;
- __be32 *t, *p, *q, tmp[2];
+ __be32 *t, *p, *q, *tmp;
int ret;
_enter("");
@@ -59,10 +59,7 @@ int rxgk_yfs_decode_ticket(struct rxrpc_connection *conn, struct sk_buff *skb,
rxgk_abort_resp_short_yfs_tkt);
/* Get the session key length */
- ret = skb_copy_bits(skb, ticket_offset, tmp, sizeof(tmp));
- if (ret < 0)
- return rxrpc_abort_conn(conn, skb, RXGK_INCONSISTENCY, -EPROTO,
- rxgk_abort_resp_short_yfs_klen);
+ tmp = buffer;
enctype = ntohl(tmp[0]);
klen = ntohl(tmp[1]);
@@ -84,12 +81,7 @@ int rxgk_yfs_decode_ticket(struct rxrpc_connection *conn, struct sk_buff *skb,
* it.
*/
ticket = payload + pre_ticket_len;
- ret = skb_copy_bits(skb, ticket_offset, ticket, ticket_len);
- if (ret < 0) {
- ret = rxrpc_abort_conn(conn, skb, RXGK_INCONSISTENCY, -EPROTO,
- rxgk_abort_resp_short_yfs_tkt);
- goto error;
- }
+ memcpy(ticket, buffer, ticket_len);
/* Fill out the form header. */
p = payload;
@@ -131,7 +123,7 @@ int rxgk_yfs_decode_ticket(struct rxrpc_connection *conn, struct sk_buff *skb,
goto error;
}
- /* Ticket read in with skb_copy_bits above */
+ /* Ticket appended above. */
q += xdr_round_up(ticket_len) / 4;
if (WARN_ON((unsigned long)q - (unsigned long)payload != payload_len)) {
ret = -EIO;
@@ -182,14 +174,15 @@ error:
* [tools.ietf.org/html/draft-wilkinson-afs3-rxgk-afs-08 sec 6.1]
*/
int rxgk_extract_token(struct rxrpc_connection *conn, struct sk_buff *skb,
- unsigned int token_offset, unsigned int token_len,
+ void *token, unsigned int token_len,
struct key **_key)
{
const struct krb5_enctype *krb5;
const struct krb5_buffer *server_secret;
struct crypto_aead *token_enc = NULL;
struct key *server_key;
- unsigned int ticket_offset, ticket_len;
+ unsigned int ticket_len;
+ void *ticket;
u32 kvno, enctype;
int ret, ec = 0;
@@ -197,24 +190,23 @@ int rxgk_extract_token(struct rxrpc_connection *conn, struct sk_buff *skb,
__be32 kvno;
__be32 enctype;
__be32 token_len;
- } container;
+ } *container;
- if (token_len < sizeof(container))
+ if (token_len < sizeof(*container))
goto short_packet;
/* Decode the RXGK_TokenContainer object. This tells us which server
* key we should be using. We can then fetch the key, get the secret
* and set up the crypto to extract the token.
*/
- if (skb_copy_bits(skb, token_offset, &container, sizeof(container)) < 0)
- goto short_packet;
+ container = token;
+ token += sizeof(*container);
- kvno = ntohl(container.kvno);
- enctype = ntohl(container.enctype);
- ticket_len = ntohl(container.token_len);
- ticket_offset = token_offset + sizeof(container);
+ kvno = ntohl(container->kvno);
+ enctype = ntohl(container->enctype);
+ ticket_len = ntohl(container->token_len);
- if (ticket_len > xdr_round_down(token_len - sizeof(container)))
+ if (ticket_len > xdr_round_down(token_len - sizeof(*container)))
goto short_packet;
_debug("KVNO %u", kvno);
@@ -237,8 +229,8 @@ int rxgk_extract_token(struct rxrpc_connection *conn, struct sk_buff *skb,
* gain access to K0, from which we can derive the transport key and
* thence decode the authenticator.
*/
- ret = rxgk_decrypt_skb(krb5, token_enc, skb,
- &ticket_offset, &ticket_len, &ec);
+ ticket = token;
+ ret = rxgk_decrypt(krb5, token_enc, &ticket, &ticket_len, &ec);
crypto_free_aead(token_enc);
token_enc = NULL;
if (ret < 0) {
@@ -248,7 +240,7 @@ int rxgk_extract_token(struct rxrpc_connection *conn, struct sk_buff *skb,
return ret;
}
- ret = conn->security->default_decode_ticket(conn, skb, ticket_offset,
+ ret = conn->security->default_decode_ticket(conn, skb, ticket,
ticket_len, _key);
if (ret < 0)
goto cant_get_token;
diff --git a/net/rxrpc/rxgk_common.h b/net/rxrpc/rxgk_common.h
index 1e257d7ab8ec..3deed5863f5a 100644
--- a/net/rxrpc/rxgk_common.h
+++ b/net/rxrpc/rxgk_common.h
@@ -41,10 +41,10 @@ struct rxgk_context {
* rxgk_app.c
*/
int rxgk_yfs_decode_ticket(struct rxrpc_connection *conn, struct sk_buff *skb,
- unsigned int ticket_offset, unsigned int ticket_len,
+ void *ticket, unsigned int ticket_len,
struct key **_key);
int rxgk_extract_token(struct rxrpc_connection *conn, struct sk_buff *skb,
- unsigned int token_offset, unsigned int token_len,
+ void *token, unsigned int token_len,
struct key **_key);
/*
@@ -62,31 +62,30 @@ int rxgk_set_up_token_cipher(const struct krb5_buffer *server_key,
gfp_t gfp);
/*
- * Apply decryption and checksumming functions to part of an skbuff. The
- * offset and length are updated to reflect the actual content of the encrypted
+ * Apply decryption and checksumming functions a flat data buffer. The data
+ * point and length are updated to reflect the actual content of the encrypted
* region.
*/
-static inline
-int rxgk_decrypt_skb(const struct krb5_enctype *krb5,
- struct crypto_aead *aead,
- struct sk_buff *skb,
- unsigned int *_offset, unsigned int *_len,
- int *_error_code)
+static inline int rxgk_decrypt(const struct krb5_enctype *krb5,
+ struct crypto_aead *aead,
+ void **_data, unsigned int *_len,
+ int *_error_code)
{
- struct scatterlist sg[16];
+ struct scatterlist sg[1];
size_t offset = 0, len = *_len;
- int nr_sg, ret;
+ int ret;
- sg_init_table(sg, ARRAY_SIZE(sg));
- nr_sg = skb_to_sgvec(skb, sg, *_offset, len);
- if (unlikely(nr_sg < 0))
- return nr_sg;
+ sg_init_one(sg, *_data, len);
- ret = crypto_krb5_decrypt(krb5, aead, sg, nr_sg,
- &offset, &len);
+ ret = crypto_krb5_decrypt(krb5, aead, sg, 1, &offset, &len);
switch (ret) {
case 0:
- *_offset += offset;
+ if (offset & 3) {
+ *_error_code = RXGK_INCONSISTENCY;
+ ret = -EPROTO;
+ break;
+ }
+ *_data += offset;
*_len = len;
break;
case -EBADMSG: /* Checksum mismatch. */
@@ -106,31 +105,26 @@ int rxgk_decrypt_skb(const struct krb5_enctype *krb5,
}
/*
- * Check the MIC on a region of an skbuff. The offset and length are updated
- * to reflect the actual content of the secure region.
+ * Check the MIC on a flat buffer. The data pointer and length are updated to
+ * reflect the actual content of the secure region.
*/
static inline
-int rxgk_verify_mic_skb(const struct krb5_enctype *krb5,
- struct crypto_shash *shash,
- const struct krb5_buffer *metadata,
- struct sk_buff *skb,
- unsigned int *_offset, unsigned int *_len,
- u32 *_error_code)
+int rxgk_verify_mic(const struct krb5_enctype *krb5,
+ struct crypto_shash *shash,
+ const struct krb5_buffer *metadata,
+ void **_data, unsigned int *_len,
+ u32 *_error_code)
{
- struct scatterlist sg[16];
+ struct scatterlist sg[1];
size_t offset = 0, len = *_len;
- int nr_sg, ret;
+ int ret;
- sg_init_table(sg, ARRAY_SIZE(sg));
- nr_sg = skb_to_sgvec(skb, sg, *_offset, len);
- if (unlikely(nr_sg < 0))
- return nr_sg;
+ sg_init_one(sg, *_data, len);
- ret = crypto_krb5_verify_mic(krb5, shash, metadata, sg, nr_sg,
- &offset, &len);
+ ret = crypto_krb5_verify_mic(krb5, shash, metadata, sg, 1, &offset, &len);
switch (ret) {
case 0:
- *_offset += offset;
+ *_data += offset;
*_len = len;
break;
case -EBADMSG: /* Checksum mismatch */
diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c
index cba7935977f0..6fbd883401ac 100644
--- a/net/rxrpc/rxkad.c
+++ b/net/rxrpc/rxkad.c
@@ -430,27 +430,25 @@ static int rxkad_verify_packet_1(struct rxrpc_call *call, struct sk_buff *skb,
rxrpc_seq_t seq,
struct skcipher_request *req)
{
- struct rxkad_level1_hdr sechdr;
+ struct rxkad_level1_hdr *sechdr;
struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
struct rxrpc_crypt iv;
- struct scatterlist sg[16];
- u32 data_size, buf;
+ struct scatterlist sg[1];
+ void *data = call->rx_dec_buffer;
+ u32 len = sp->len, data_size, buf;
u16 check;
int ret;
_enter("");
- if (sp->len < 8)
+ if (len < 8)
return rxrpc_abort_eproto(call, skb, RXKADSEALEDINCON,
rxkad_abort_1_short_header);
/* Decrypt the skbuff in-place. TODO: We really want to decrypt
* directly into the target buffer.
*/
- sg_init_table(sg, ARRAY_SIZE(sg));
- ret = skb_to_sgvec(skb, sg, sp->offset, 8);
- if (unlikely(ret < 0))
- return ret;
+ sg_init_one(sg, data, len);
/* start the decryption afresh */
memset(&iv, 0, sizeof(iv));
@@ -464,13 +462,11 @@ static int rxkad_verify_packet_1(struct rxrpc_call *call, struct sk_buff *skb,
return ret;
/* Extract the decrypted packet length */
- if (skb_copy_bits(skb, sp->offset, &sechdr, sizeof(sechdr)) < 0)
- return rxrpc_abort_eproto(call, skb, RXKADDATALEN,
- rxkad_abort_1_short_encdata);
- sp->offset += sizeof(sechdr);
- sp->len -= sizeof(sechdr);
+ sechdr = data;
+ call->rx_dec_offset = sizeof(*sechdr);
+ len -= sizeof(*sechdr);
- buf = ntohl(sechdr.data_size);
+ buf = ntohl(sechdr->data_size);
data_size = buf & 0xffff;
check = buf >> 16;
@@ -479,10 +475,10 @@ static int rxkad_verify_packet_1(struct rxrpc_call *call, struct sk_buff *skb,
if (check != 0)
return rxrpc_abort_eproto(call, skb, RXKADSEALEDINCON,
rxkad_abort_1_short_check);
- if (data_size > sp->len)
+ if (data_size > len)
return rxrpc_abort_eproto(call, skb, RXKADDATALEN,
rxkad_abort_1_short_data);
- sp->len = data_size;
+ call->rx_dec_len = data_size;
_leave(" = 0 [dlen=%x]", data_size);
return 0;
@@ -496,43 +492,28 @@ static int rxkad_verify_packet_2(struct rxrpc_call *call, struct sk_buff *skb,
struct skcipher_request *req)
{
const struct rxrpc_key_token *token;
- struct rxkad_level2_hdr sechdr;
+ struct rxkad_level2_hdr *sechdr;
struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
struct rxrpc_crypt iv;
- struct scatterlist _sg[4], *sg;
- u32 data_size, buf;
+ struct scatterlist sg[1];
+ void *data = call->rx_dec_buffer;
+ u32 len = sp->len, data_size, buf;
u16 check;
- int nsg, ret;
+ int ret;
- _enter(",{%d}", sp->len);
+ _enter(",{%d}", len);
- if (sp->len < 8)
+ if (len < 8)
return rxrpc_abort_eproto(call, skb, RXKADSEALEDINCON,
rxkad_abort_2_short_header);
/* Don't let the crypto algo see a misaligned length. */
- sp->len = round_down(sp->len, 8);
+ len = round_down(len, 8);
- /* Decrypt the skbuff in-place. TODO: We really want to decrypt
- * directly into the target buffer.
+ /* Decrypt in place in the call's decryption buffer. TODO: We really
+ * want to decrypt directly into the target buffer.
*/
- sg = _sg;
- nsg = skb_shinfo(skb)->nr_frags + 1;
- if (nsg <= 4) {
- nsg = 4;
- } else {
- sg = kmalloc_objs(*sg, nsg, GFP_NOIO);
- if (!sg)
- return -ENOMEM;
- }
-
- sg_init_table(sg, nsg);
- ret = skb_to_sgvec(skb, sg, sp->offset, sp->len);
- if (unlikely(ret < 0)) {
- if (sg != _sg)
- kfree(sg);
- return ret;
- }
+ sg_init_one(sg, data, len);
/* decrypt from the session key */
token = call->conn->key->payload.data[0];
@@ -540,11 +521,9 @@ static int rxkad_verify_packet_2(struct rxrpc_call *call, struct sk_buff *skb,
skcipher_request_set_sync_tfm(req, call->conn->rxkad.cipher);
skcipher_request_set_callback(req, 0, NULL, NULL);
- skcipher_request_set_crypt(req, sg, sg, sp->len, iv.x);
+ skcipher_request_set_crypt(req, sg, sg, len, iv.x);
ret = crypto_skcipher_decrypt(req);
skcipher_request_zero(req);
- if (sg != _sg)
- kfree(sg);
if (ret < 0) {
if (ret == -ENOMEM)
return ret;
@@ -553,13 +532,11 @@ static int rxkad_verify_packet_2(struct rxrpc_call *call, struct sk_buff *skb,
}
/* Extract the decrypted packet length */
- if (skb_copy_bits(skb, sp->offset, &sechdr, sizeof(sechdr)) < 0)
- return rxrpc_abort_eproto(call, skb, RXKADDATALEN,
- rxkad_abort_2_short_len);
- sp->offset += sizeof(sechdr);
- sp->len -= sizeof(sechdr);
+ sechdr = data;
+ call->rx_dec_offset = sizeof(*sechdr);
+ len -= sizeof(*sechdr);
- buf = ntohl(sechdr.data_size);
+ buf = ntohl(sechdr->data_size);
data_size = buf & 0xffff;
check = buf >> 16;
@@ -569,17 +546,18 @@ static int rxkad_verify_packet_2(struct rxrpc_call *call, struct sk_buff *skb,
return rxrpc_abort_eproto(call, skb, RXKADSEALEDINCON,
rxkad_abort_2_short_check);
- if (data_size > sp->len)
+ if (data_size > len)
return rxrpc_abort_eproto(call, skb, RXKADDATALEN,
rxkad_abort_2_short_data);
- sp->len = data_size;
+ call->rx_dec_len = data_size;
_leave(" = 0 [dlen=%x]", data_size);
return 0;
}
/*
- * Verify the security on a received packet and the subpackets therein.
+ * Verify the security on a received (sub)packet. If the packet needs
+ * modifying (e.g. decrypting), it must be copied.
*/
static int rxkad_verify_packet(struct rxrpc_call *call, struct sk_buff *skb)
{
@@ -985,7 +963,6 @@ static int rxkad_decrypt_ticket(struct rxrpc_connection *conn,
*_expiry = 0;
ASSERT(server_key->payload.data[0] != NULL);
- ASSERTCMP((unsigned long) ticket & 7UL, ==, 0);
memcpy(&iv, &server_key->payload.data[2], sizeof(iv));
@@ -1134,14 +1111,15 @@ unlock:
* verify a response
*/
static int rxkad_verify_response(struct rxrpc_connection *conn,
- struct sk_buff *skb)
+ struct sk_buff *skb,
+ void *buffer, unsigned int len)
{
struct rxkad_response *response;
struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
struct rxrpc_crypt session_key;
struct key *server_key;
time64_t expiry;
- void *ticket = NULL;
+ void *ticket;
u32 version, kvno, ticket_len, level;
__be32 csum;
int ret, i;
@@ -1164,13 +1142,8 @@ static int rxkad_verify_response(struct rxrpc_connection *conn,
}
}
- ret = -ENOMEM;
- response = kzalloc_obj(struct rxkad_response, GFP_NOFS);
- if (!response)
- goto error;
-
- if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header),
- response, sizeof(*response)) < 0) {
+ response = buffer;
+ if (len < sizeof(*response)) {
ret = rxrpc_abort_conn(conn, skb, RXKADPACKETSHORT, -EPROTO,
rxkad_abort_resp_short);
goto error;
@@ -1182,6 +1155,9 @@ static int rxkad_verify_response(struct rxrpc_connection *conn,
trace_rxrpc_rx_response(conn, sp->hdr.serial, version, kvno, ticket_len);
+ buffer += sizeof(*response);
+ len -= sizeof(*response);
+
if (version != RXKAD_VERSION) {
ret = rxrpc_abort_conn(conn, skb, RXKADINCONSISTENCY, -EPROTO,
rxkad_abort_resp_version);
@@ -1201,13 +1177,8 @@ static int rxkad_verify_response(struct rxrpc_connection *conn,
}
/* extract the kerberos ticket and decrypt and decode it */
- ret = -ENOMEM;
- ticket = kmalloc(ticket_len, GFP_NOFS);
- if (!ticket)
- goto error;
-
- if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header) + sizeof(*response),
- ticket, ticket_len) < 0) {
+ ticket = buffer;
+ if (ticket_len > len) {
ret = rxrpc_abort_conn(conn, skb, RXKADPACKETSHORT, -EPROTO,
rxkad_abort_resp_short_tkt);
goto error;
@@ -1287,8 +1258,6 @@ static int rxkad_verify_response(struct rxrpc_connection *conn,
ret = rxrpc_get_server_data_key(conn, &session_key, expiry, kvno);
error:
- kfree(ticket);
- kfree(response);
key_put(server_key);
_leave(" = %d", ret);
return ret;
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 332fd9695e54..04ea11c90e03 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -112,11 +112,6 @@ struct tcf_chain *tcf_action_set_ctrlact(struct tc_action *a, int action,
}
EXPORT_SYMBOL(tcf_action_set_ctrlact);
-/* XXX: For standalone actions, we don't need a RCU grace period either, because
- * actions are always connected to filters and filters are already destroyed in
- * RCU callbacks, so after a RCU grace period actions are already disconnected
- * from filters. Readers later can not find us.
- */
static void free_tcf(struct tc_action *p)
{
struct tcf_chain *chain = rcu_dereference_protected(p->goto_chain, 1);
@@ -129,7 +124,7 @@ static void free_tcf(struct tc_action *p)
if (chain)
tcf_chain_put_by_act(chain);
- kfree(p);
+ kfree_rcu(p, tcfa_rcu);
}
static void offload_action_hw_count_set(struct tc_action *act,
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index 2c5a7a321a94..553342c55cf7 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -26,6 +26,10 @@
#include <net/tc_act/tc_mirred.h>
#include <net/tc_wrapper.h>
+#define MIRRED_DEFER_LIMIT 3
+_Static_assert(MIRRED_DEFER_LIMIT <= 3,
+ "MIRRED_DEFER_LIMIT exceeds tc_depth bitfield width");
+
static LIST_HEAD(mirred_list);
static DEFINE_SPINLOCK(mirred_list_lock);
@@ -234,12 +238,15 @@ tcf_mirred_forward(bool at_ingress, bool want_ingress, struct sk_buff *skb)
{
int err;
- if (!want_ingress)
+ if (!want_ingress) {
err = tcf_dev_queue_xmit(skb, dev_queue_xmit);
- else if (!at_ingress)
- err = netif_rx(skb);
- else
- err = netif_receive_skb(skb);
+ } else {
+ skb->tc_depth++;
+ if (!at_ingress)
+ err = netif_rx(skb);
+ else
+ err = netif_receive_skb(skb);
+ }
return err;
}
@@ -365,7 +372,8 @@ assign_prev:
dev_is_mac_header_xmit(dev_prev),
m_eaction, retval);
- return retval;
+ /* If the packet wasn't redirected, we have to register as a drop */
+ return TC_ACT_SHOT;
}
static int tcf_blockcast_mirror(struct sk_buff *skb, struct tcf_mirred *m,
@@ -389,14 +397,12 @@ static int tcf_blockcast_mirror(struct sk_buff *skb, struct tcf_mirred *m,
static int tcf_blockcast(struct sk_buff *skb, struct tcf_mirred *m,
const u32 blockid, struct tcf_result *res,
- int retval)
+ int m_eaction, int retval)
{
const u32 exception_ifindex = skb->dev->ifindex;
struct tcf_block *block;
bool is_redirect;
- int m_eaction;
- m_eaction = READ_ONCE(m->tcfm_eaction);
is_redirect = tcf_mirred_is_act_redirect(m_eaction);
/* we are already under rcu protection, so can call block lookup
@@ -405,7 +411,7 @@ static int tcf_blockcast(struct sk_buff *skb, struct tcf_mirred *m,
block = tcf_block_lookup(dev_net(skb->dev), blockid);
if (!block || xa_empty(&block->ports)) {
tcf_action_inc_overlimit_qstats(&m->common);
- return retval;
+ return is_redirect ? TC_ACT_SHOT : retval;
}
if (is_redirect)
@@ -423,9 +429,10 @@ TC_INDIRECT_SCOPE int tcf_mirred_act(struct sk_buff *skb,
{
struct tcf_mirred *m = to_mirred(a);
int retval = READ_ONCE(m->tcf_action);
+ bool m_mac_header_xmit, is_redirect;
struct netdev_xmit *xmit;
- bool m_mac_header_xmit;
struct net_device *dev;
+ bool want_ingress;
int i, m_eaction;
u32 blockid;
@@ -434,7 +441,8 @@ TC_INDIRECT_SCOPE int tcf_mirred_act(struct sk_buff *skb,
#else
xmit = this_cpu_ptr(&softnet_data.xmit);
#endif
- if (unlikely(xmit->sched_mirred_nest >= MIRRED_NEST_LIMIT)) {
+ if (unlikely(xmit->sched_mirred_nest >= MIRRED_NEST_LIMIT ||
+ skb->tc_depth >= MIRRED_DEFER_LIMIT)) {
net_warn_ratelimited("Packet exceeded mirred recursion limit on dev %s\n",
netdev_name(skb->dev));
return TC_ACT_SHOT;
@@ -444,34 +452,51 @@ TC_INDIRECT_SCOPE int tcf_mirred_act(struct sk_buff *skb,
tcf_action_update_bstats(&m->common, skb);
blockid = READ_ONCE(m->tcfm_blockid);
- if (blockid)
- return tcf_blockcast(skb, m, blockid, res, retval);
+ m_eaction = READ_ONCE(m->tcfm_eaction);
+ want_ingress = tcf_mirred_act_wants_ingress(m_eaction);
+ if (blockid) {
+ if (!want_ingress)
+ xmit->sched_mirred_dev[xmit->sched_mirred_nest++] = NULL;
+ retval = tcf_blockcast(skb, m, blockid, res, m_eaction, retval);
+ if (!want_ingress)
+ xmit->sched_mirred_nest--;
+ return retval;
+ }
+
+ is_redirect = tcf_mirred_is_act_redirect(m_eaction);
dev = rcu_dereference_bh(m->tcfm_dev);
if (unlikely(!dev)) {
pr_notice_once("tc mirred: target device is gone\n");
tcf_action_inc_overlimit_qstats(&m->common);
- return retval;
- }
- for (i = 0; i < xmit->sched_mirred_nest; i++) {
- if (xmit->sched_mirred_dev[i] != dev)
- continue;
- pr_notice_once("tc mirred: loop on device %s\n",
- netdev_name(dev));
- tcf_action_inc_overlimit_qstats(&m->common);
- return retval;
+ goto err_out;
}
- xmit->sched_mirred_dev[xmit->sched_mirred_nest++] = dev;
+ if (!want_ingress) {
+ for (i = 0; i < xmit->sched_mirred_nest; i++) {
+ if (xmit->sched_mirred_dev[i] != dev)
+ continue;
+ pr_notice_once("tc mirred: loop on device %s\n",
+ netdev_name(dev));
+ tcf_action_inc_overlimit_qstats(&m->common);
+ goto err_out;
+ }
+ xmit->sched_mirred_dev[xmit->sched_mirred_nest++] = dev;
+ }
m_mac_header_xmit = READ_ONCE(m->tcfm_mac_header_xmit);
- m_eaction = READ_ONCE(m->tcfm_eaction);
retval = tcf_mirred_to_dev(skb, m, dev, m_mac_header_xmit, m_eaction,
retval);
- xmit->sched_mirred_nest--;
+ if (!want_ingress)
+ xmit->sched_mirred_nest--;
return retval;
+
+err_out:
+ if (is_redirect)
+ retval = TC_ACT_SHOT;
+ return retval;
}
static void tcf_stats_update(struct tc_action *a, u64 bytes, u64 packets,
diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
index bc20f08a2789..bd3b1da3cd63 100644
--- a/net/sched/act_pedit.c
+++ b/net/sched/act_pedit.c
@@ -16,6 +16,8 @@
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/slab.h>
+#include <linux/overflow.h>
+#include <linux/unaligned.h>
#include <net/ipv6.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
@@ -242,7 +244,6 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla,
goto out_free_ex;
}
- nparms->tcfp_off_max_hint = 0;
nparms->tcfp_flags = parm->flags;
nparms->tcfp_nkeys = parm->nkeys;
@@ -268,14 +269,6 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla,
BITS_PER_TYPE(int) - 1,
nparms->tcfp_keys[i].shift);
- /* The AT option can read a single byte, we can bound the actual
- * value with uchar max.
- */
- cur += (0xff & offmask) >> nparms->tcfp_keys[i].shift;
-
- /* Each key touches 4 bytes starting from the computed offset */
- nparms->tcfp_off_max_hint =
- max(nparms->tcfp_off_max_hint, cur + 4);
}
p = to_pedit(*a);
@@ -318,15 +311,12 @@ static void tcf_pedit_cleanup(struct tc_action *a)
call_rcu(&parms->rcu, tcf_pedit_cleanup_rcu);
}
-static bool offset_valid(struct sk_buff *skb, int offset)
+static bool offset_valid(struct sk_buff *skb, int offset, int len)
{
- if (offset > 0 && offset > skb->len)
- return false;
-
- if (offset < 0 && -offset > skb_headroom(skb))
+ if (offset < -(int)skb_headroom(skb))
return false;
- return true;
+ return offset <= (int)skb->len - len;
}
static int pedit_l4_skb_offset(struct sk_buff *skb, int *hoffset, const int header_type)
@@ -393,18 +383,10 @@ TC_INDIRECT_SCOPE int tcf_pedit_act(struct sk_buff *skb,
struct tcf_pedit_key_ex *tkey_ex;
struct tcf_pedit_parms *parms;
struct tc_pedit_key *tkey;
- u32 max_offset;
int i;
parms = rcu_dereference_bh(p->parms);
- max_offset = (skb_transport_header_was_set(skb) ?
- skb_transport_offset(skb) :
- skb_network_offset(skb)) +
- parms->tcfp_off_max_hint;
- if (skb_ensure_writable(skb, min(skb->len, max_offset)))
- goto done;
-
tcf_lastuse_update(&p->tcf_tm);
tcf_action_update_bstats(&p->common, skb);
@@ -412,10 +394,11 @@ TC_INDIRECT_SCOPE int tcf_pedit_act(struct sk_buff *skb,
tkey_ex = parms->tcfp_keys_ex;
for (i = parms->tcfp_nkeys; i > 0; i--, tkey++) {
+ int write_offset, write_len;
int offset = tkey->off;
int hoffset = 0;
- u32 *ptr, hdata;
- u32 val;
+ u32 cur_val, val;
+ u32 *ptr;
int rc;
if (tkey_ex) {
@@ -433,13 +416,15 @@ TC_INDIRECT_SCOPE int tcf_pedit_act(struct sk_buff *skb,
if (tkey->offmask) {
u8 *d, _d;
+ int at_offset;
- if (!offset_valid(skb, hoffset + tkey->at)) {
+ if (check_add_overflow(hoffset, (int)tkey->at, &at_offset) ||
+ !offset_valid(skb, at_offset, sizeof(_d))) {
pr_info_ratelimited("tc action pedit 'at' offset %d out of bounds\n",
hoffset + tkey->at);
goto bad;
}
- d = skb_header_pointer(skb, hoffset + tkey->at,
+ d = skb_header_pointer(skb, at_offset,
sizeof(_d), &_d);
if (!d)
goto bad;
@@ -451,31 +436,51 @@ TC_INDIRECT_SCOPE int tcf_pedit_act(struct sk_buff *skb,
}
}
- if (!offset_valid(skb, hoffset + offset)) {
- pr_info_ratelimited("tc action pedit offset %d out of bounds\n", hoffset + offset);
+ if (check_add_overflow(hoffset, offset, &write_offset)) {
+ pr_info_ratelimited("tc action pedit offset overflow\n");
goto bad;
}
- ptr = skb_header_pointer(skb, hoffset + offset,
- sizeof(hdata), &hdata);
- if (!ptr)
+ if (!offset_valid(skb, write_offset, sizeof(*ptr))) {
+ pr_info_ratelimited("tc action pedit offset %d out of bounds\n",
+ write_offset);
goto bad;
+ }
+
+ if (write_offset < 0) {
+ if (skb_cow(skb, -write_offset))
+ goto bad;
+ if (write_offset + (int)sizeof(*ptr) > 0) {
+ if (skb_ensure_writable(skb,
+ min_t(int, skb->len,
+ write_offset + (int)sizeof(*ptr))))
+ goto bad;
+ }
+ } else {
+ if (check_add_overflow(write_offset, (int)sizeof(*ptr),
+ &write_len))
+ goto bad;
+ if (skb_ensure_writable(skb, min_t(int, skb->len,
+ write_len)))
+ goto bad;
+ }
+
+ ptr = (u32 *)(skb->data + write_offset);
+ cur_val = get_unaligned(ptr);
/* just do it, baby */
switch (cmd) {
case TCA_PEDIT_KEY_EX_CMD_SET:
val = tkey->val;
break;
case TCA_PEDIT_KEY_EX_CMD_ADD:
- val = (*ptr + tkey->val) & ~tkey->mask;
+ val = (cur_val + tkey->val) & ~tkey->mask;
break;
default:
pr_info_ratelimited("tc action pedit bad command (%d)\n", cmd);
goto bad;
}
- *ptr = ((*ptr & tkey->mask) ^ val);
- if (ptr == &hdata)
- skb_store_bits(skb, hoffset + offset, ptr, 4);
+ put_unaligned((cur_val & tkey->mask) ^ val, ptr);
}
goto done;
diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c
index 13c6d1869a14..5862933be8d7 100644
--- a/net/sched/sch_cake.c
+++ b/net/sched/sch_cake.c
@@ -399,14 +399,14 @@ static void cake_configure_rates(struct Qdisc *sch, u64 rate, bool rate_adjust);
* Here, invsqrt is a fixed point number (< 1.0), 32bit mantissa, aka Q0.32
*/
-static void cobalt_newton_step(struct cobalt_vars *vars)
+static void cobalt_newton_step(struct cobalt_vars *vars, u32 count)
{
u32 invsqrt, invsqrt2;
u64 val;
invsqrt = vars->rec_inv_sqrt;
invsqrt2 = ((u64)invsqrt * invsqrt) >> 32;
- val = (3LL << 32) - ((u64)vars->count * invsqrt2);
+ val = (3LL << 32) - ((u64)count * invsqrt2);
val >>= 2; /* avoid overflow in following multiply */
val = (val * invsqrt) >> (32 - 2 + 1);
@@ -414,12 +414,12 @@ static void cobalt_newton_step(struct cobalt_vars *vars)
vars->rec_inv_sqrt = val;
}
-static void cobalt_invsqrt(struct cobalt_vars *vars)
+static void cobalt_invsqrt(struct cobalt_vars *vars, u32 count)
{
- if (vars->count < REC_INV_SQRT_CACHE)
- vars->rec_inv_sqrt = inv_sqrt_cache[vars->count];
+ if (count < REC_INV_SQRT_CACHE)
+ vars->rec_inv_sqrt = inv_sqrt_cache[count];
else
- cobalt_newton_step(vars);
+ cobalt_newton_step(vars, count);
}
static void cobalt_vars_init(struct cobalt_vars *vars)
@@ -449,16 +449,19 @@ static bool cobalt_queue_full(struct cobalt_vars *vars,
bool up = false;
if (ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) {
- up = !vars->p_drop;
- vars->p_drop += p->p_inc;
- if (vars->p_drop < p->p_inc)
- vars->p_drop = ~0;
- vars->blue_timer = now;
- }
- vars->dropping = true;
- vars->drop_next = now;
+ u32 p_drop = vars->p_drop;
+
+ up = !p_drop;
+ p_drop += p->p_inc;
+ if (p_drop < p->p_inc)
+ p_drop = ~0;
+ WRITE_ONCE(vars->p_drop, p_drop);
+ WRITE_ONCE(vars->blue_timer, now);
+ }
+ WRITE_ONCE(vars->dropping, true);
+ WRITE_ONCE(vars->drop_next, now);
if (!vars->count)
- vars->count = 1;
+ WRITE_ONCE(vars->count, 1);
return up;
}
@@ -475,20 +478,20 @@ static bool cobalt_queue_empty(struct cobalt_vars *vars,
if (vars->p_drop &&
ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) {
if (vars->p_drop < p->p_dec)
- vars->p_drop = 0;
+ WRITE_ONCE(vars->p_drop, 0);
else
- vars->p_drop -= p->p_dec;
- vars->blue_timer = now;
+ WRITE_ONCE(vars->p_drop, vars->p_drop - p->p_dec);
+ WRITE_ONCE(vars->blue_timer, now);
down = !vars->p_drop;
}
- vars->dropping = false;
+ WRITE_ONCE(vars->dropping, false);
if (vars->count && ktime_to_ns(ktime_sub(now, vars->drop_next)) >= 0) {
- vars->count--;
- cobalt_invsqrt(vars);
- vars->drop_next = cobalt_control(vars->drop_next,
- p->interval,
- vars->rec_inv_sqrt);
+ WRITE_ONCE(vars->count, vars->count - 1);
+ cobalt_invsqrt(vars, vars->count);
+ WRITE_ONCE(vars->drop_next,
+ cobalt_control(vars->drop_next, p->interval,
+ vars->rec_inv_sqrt));
}
return down;
@@ -507,6 +510,7 @@ static enum qdisc_drop_reason cobalt_should_drop(struct cobalt_vars *vars,
bool next_due, over_target;
ktime_t schedule;
u64 sojourn;
+ u32 count;
/* The 'schedule' variable records, in its sign, whether 'now' is before or
* after 'drop_next'. This allows 'drop_next' to be updated before the next
@@ -528,21 +532,22 @@ static enum qdisc_drop_reason cobalt_should_drop(struct cobalt_vars *vars,
over_target = sojourn > p->target &&
sojourn > p->mtu_time * bulk_flows * 2 &&
sojourn > p->mtu_time * 4;
- next_due = vars->count && ktime_to_ns(schedule) >= 0;
+ count = vars->count;
+ next_due = count && ktime_to_ns(schedule) >= 0;
vars->ecn_marked = false;
if (over_target) {
if (!vars->dropping) {
- vars->dropping = true;
- vars->drop_next = cobalt_control(now,
- p->interval,
- vars->rec_inv_sqrt);
+ WRITE_ONCE(vars->dropping, true);
+ WRITE_ONCE(vars->drop_next,
+ cobalt_control(now, p->interval,
+ vars->rec_inv_sqrt));
}
- if (!vars->count)
- vars->count = 1;
+ if (!count)
+ count = 1;
} else if (vars->dropping) {
- vars->dropping = false;
+ WRITE_ONCE(vars->dropping, false);
}
if (next_due && vars->dropping) {
@@ -550,23 +555,23 @@ static enum qdisc_drop_reason cobalt_should_drop(struct cobalt_vars *vars,
if (!(vars->ecn_marked = INET_ECN_set_ce(skb)))
reason = QDISC_DROP_CONGESTED;
- vars->count++;
- if (!vars->count)
- vars->count--;
- cobalt_invsqrt(vars);
- vars->drop_next = cobalt_control(vars->drop_next,
- p->interval,
- vars->rec_inv_sqrt);
+ count++;
+ if (!count)
+ count--;
+ cobalt_invsqrt(vars, count);
+ WRITE_ONCE(vars->drop_next,
+ cobalt_control(vars->drop_next, p->interval,
+ vars->rec_inv_sqrt));
schedule = ktime_sub(now, vars->drop_next);
} else {
while (next_due) {
- vars->count--;
- cobalt_invsqrt(vars);
- vars->drop_next = cobalt_control(vars->drop_next,
- p->interval,
- vars->rec_inv_sqrt);
+ count--;
+ cobalt_invsqrt(vars, count);
+ WRITE_ONCE(vars->drop_next,
+ cobalt_control(vars->drop_next, p->interval,
+ vars->rec_inv_sqrt));
schedule = ktime_sub(now, vars->drop_next);
- next_due = vars->count && ktime_to_ns(schedule) >= 0;
+ next_due = count && ktime_to_ns(schedule) >= 0;
}
}
@@ -575,11 +580,12 @@ static enum qdisc_drop_reason cobalt_should_drop(struct cobalt_vars *vars,
get_random_u32() < vars->p_drop)
reason = QDISC_DROP_FLOOD_PROTECTION;
+ WRITE_ONCE(vars->count, count);
/* Overload the drop_next field as an activity timeout */
- if (!vars->count)
- vars->drop_next = ktime_add_ns(now, p->interval);
+ if (!count)
+ WRITE_ONCE(vars->drop_next, ktime_add_ns(now, p->interval));
else if (ktime_to_ns(schedule) > 0 && reason == QDISC_DROP_UNSPEC)
- vars->drop_next = now;
+ WRITE_ONCE(vars->drop_next, now);
return reason;
}
@@ -914,7 +920,7 @@ static struct sk_buff *dequeue_head(struct cake_flow *flow)
struct sk_buff *skb = flow->head;
if (skb) {
- flow->head = skb->next;
+ WRITE_ONCE(flow->head, skb->next);
skb_mark_not_on_list(skb);
}
@@ -926,7 +932,7 @@ static struct sk_buff *dequeue_head(struct cake_flow *flow)
static void flow_queue_add(struct cake_flow *flow, struct sk_buff *skb)
{
if (!flow->head)
- flow->head = skb;
+ WRITE_ONCE(flow->head, skb);
else
flow->tail->next = skb;
flow->tail = skb;
@@ -1357,7 +1363,7 @@ found:
if (elig_ack_prev)
elig_ack_prev->next = elig_ack->next;
else
- flow->head = elig_ack->next;
+ WRITE_ONCE(flow->head, elig_ack->next);
skb_mark_not_on_list(elig_ack);
@@ -1595,11 +1601,11 @@ static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free)
len = qdisc_pkt_len(skb);
q->buffer_used -= skb->truesize;
- b->backlogs[idx] -= len;
WRITE_ONCE(b->tin_backlog, b->tin_backlog - len);
+ WRITE_ONCE(b->backlogs[idx], b->backlogs[idx] - len);
sch->qstats.backlog -= len;
- flow->dropped++;
+ WRITE_ONCE(flow->dropped, flow->dropped + 1);
WRITE_ONCE(b->tin_dropped, b->tin_dropped + 1);
if (q->config->rate_flags & CAKE_FLAG_INGRESS)
@@ -1824,11 +1830,11 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
}
/* stats */
- b->backlogs[idx] += slen;
sch->qstats.backlog += slen;
q->avg_window_bytes += slen;
WRITE_ONCE(b->bytes, b->bytes + slen);
WRITE_ONCE(b->tin_backlog, b->tin_backlog + slen);
+ WRITE_ONCE(b->backlogs[idx], b->backlogs[idx] + slen);
qdisc_tree_reduce_backlog(sch, 1-numsegs, len-slen);
consume_skb(skb);
@@ -1861,11 +1867,11 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
/* stats */
WRITE_ONCE(b->packets, b->packets + 1);
- b->backlogs[idx] += len - ack_pkt_len;
sch->qstats.backlog += len - ack_pkt_len;
q->avg_window_bytes += len - ack_pkt_len;
WRITE_ONCE(b->bytes, b->bytes + len - ack_pkt_len);
WRITE_ONCE(b->tin_backlog, b->tin_backlog + len - ack_pkt_len);
+ WRITE_ONCE(b->backlogs[idx], b->backlogs[idx] + len - ack_pkt_len);
}
if (q->overflow_timeout)
@@ -1924,7 +1930,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
flow->set = CAKE_SET_SPARSE;
WRITE_ONCE(b->sparse_flow_count, b->sparse_flow_count + 1);
- flow->deficit = cake_get_flow_quantum(b, flow, q->config->flow_mode);
+ WRITE_ONCE(flow->deficit, cake_get_flow_quantum(b, flow, q->config->flow_mode));
} else if (flow->set == CAKE_SET_SPARSE_WAIT) {
/* this flow was empty, accounted as a sparse flow, but actually
* in the bulk rotation.
@@ -1977,7 +1983,7 @@ static struct sk_buff *cake_dequeue_one(struct Qdisc *sch)
if (flow->head) {
skb = dequeue_head(flow);
len = qdisc_pkt_len(skb);
- b->backlogs[q->cur_flow] -= len;
+ WRITE_ONCE(b->backlogs[q->cur_flow], b->backlogs[q->cur_flow] - len);
WRITE_ONCE(b->tin_backlog, b->tin_backlog - len);
sch->qstats.backlog -= len;
q->buffer_used -= skb->truesize;
@@ -2166,7 +2172,8 @@ retry:
}
}
- flow->deficit += cake_get_flow_quantum(b, flow, q->config->flow_mode);
+ WRITE_ONCE(flow->deficit,
+ flow->deficit + cake_get_flow_quantum(b, flow, q->config->flow_mode));
list_move_tail(&flow->flowchain, &b->old_flows);
goto retry;
@@ -2232,10 +2239,10 @@ retry:
if (q->config->rate_flags & CAKE_FLAG_INGRESS) {
len = cake_advance_shaper(q, b, skb,
now, true);
- flow->deficit -= len;
+ WRITE_ONCE(flow->deficit, flow->deficit - len);
b->tin_deficit -= len;
}
- flow->dropped++;
+ WRITE_ONCE(flow->dropped, flow->dropped + 1);
WRITE_ONCE(b->tin_dropped, b->tin_dropped + 1);
qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb));
qdisc_qstats_drop(sch);
@@ -2259,7 +2266,7 @@ retry:
delay < b->base_delay ? 2 : 8));
len = cake_advance_shaper(q, b, skb, now, false);
- flow->deficit -= len;
+ WRITE_ONCE(flow->deficit, flow->deficit - len);
b->tin_deficit -= len;
if (ktime_after(q->time_next_packet, now) && sch->q.qlen) {
@@ -3137,7 +3144,7 @@ static int cake_dump_class_stats(struct Qdisc *sch, unsigned long cl,
flow = &b->flows[idx % CAKE_QUEUES];
- if (flow->head) {
+ if (READ_ONCE(flow->head)) {
sch_tree_lock(sch);
skb = flow->head;
while (skb) {
@@ -3146,13 +3153,15 @@ static int cake_dump_class_stats(struct Qdisc *sch, unsigned long cl,
}
sch_tree_unlock(sch);
}
- qs.backlog = b->backlogs[idx % CAKE_QUEUES];
- qs.drops = flow->dropped;
+ qs.backlog = READ_ONCE(b->backlogs[idx % CAKE_QUEUES]);
+ qs.drops = READ_ONCE(flow->dropped);
}
if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
return -1;
if (flow) {
ktime_t now = ktime_get();
+ bool dropping;
+ u32 p_drop;
stats = nla_nest_start_noflag(d->skb, TCA_STATS_APP);
if (!stats)
@@ -3167,21 +3176,23 @@ static int cake_dump_class_stats(struct Qdisc *sch, unsigned long cl,
goto nla_put_failure; \
} while (0)
- PUT_STAT_S32(DEFICIT, flow->deficit);
- PUT_STAT_U32(DROPPING, flow->cvars.dropping);
- PUT_STAT_U32(COBALT_COUNT, flow->cvars.count);
- PUT_STAT_U32(P_DROP, flow->cvars.p_drop);
- if (flow->cvars.p_drop) {
+ PUT_STAT_S32(DEFICIT, READ_ONCE(flow->deficit));
+ dropping = READ_ONCE(flow->cvars.dropping);
+ PUT_STAT_U32(DROPPING, dropping);
+ PUT_STAT_U32(COBALT_COUNT, READ_ONCE(flow->cvars.count));
+ p_drop = READ_ONCE(flow->cvars.p_drop);
+ PUT_STAT_U32(P_DROP, p_drop);
+ if (p_drop) {
PUT_STAT_S32(BLUE_TIMER_US,
ktime_to_us(
ktime_sub(now,
- flow->cvars.blue_timer)));
+ READ_ONCE(flow->cvars.blue_timer))));
}
- if (flow->cvars.dropping) {
+ if (dropping) {
PUT_STAT_S32(DROP_NEXT_US,
ktime_to_us(
ktime_sub(now,
- flow->cvars.drop_next)));
+ READ_ONCE(flow->cvars.drop_next))));
}
if (nla_nest_end(d->skb, stats) < 0)
diff --git a/net/sched/sch_cbs.c b/net/sched/sch_cbs.c
index 8c9a0400c862..0f953bd46b58 100644
--- a/net/sched/sch_cbs.c
+++ b/net/sched/sch_cbs.c
@@ -243,6 +243,20 @@ static struct sk_buff *cbs_dequeue(struct Qdisc *sch)
return q->dequeue(sch);
}
+static void cbs_reset(struct Qdisc *sch)
+{
+ struct cbs_sched_data *q = qdisc_priv(sch);
+
+ /* Nothing to do if we couldn't create the underlying qdisc */
+ if (!q->qdisc)
+ return;
+
+ qdisc_reset(q->qdisc);
+ qdisc_watchdog_cancel(&q->watchdog);
+ q->credits = 0;
+ q->last = 0;
+}
+
static const struct nla_policy cbs_policy[TCA_CBS_MAX + 1] = {
[TCA_CBS_PARMS] = { .len = sizeof(struct tc_cbs_qopt) },
};
@@ -540,7 +554,7 @@ static struct Qdisc_ops cbs_qdisc_ops __read_mostly = {
.dequeue = cbs_dequeue,
.peek = qdisc_peek_dequeued,
.init = cbs_init,
- .reset = qdisc_reset_queue,
+ .reset = cbs_reset,
.destroy = cbs_destroy,
.change = cbs_change,
.dump = cbs_dump,
diff --git a/net/sched/sch_dualpi2.c b/net/sched/sch_dualpi2.c
index 241e6a46bd00..a22489c14458 100644
--- a/net/sched/sch_dualpi2.c
+++ b/net/sched/sch_dualpi2.c
@@ -938,6 +938,8 @@ static int dualpi2_init(struct Qdisc *sch, struct nlattr *opt,
int err;
sch->flags |= TCQ_F_DEQUEUE_DROPS;
+ hrtimer_setup(&q->pi2_timer, dualpi2_timer, CLOCK_MONOTONIC,
+ HRTIMER_MODE_ABS_PINNED_SOFT);
q->l_queue = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
TC_H_MAKE(sch->handle, 1), extack);
@@ -950,8 +952,6 @@ static int dualpi2_init(struct Qdisc *sch, struct nlattr *opt,
q->sch = sch;
dualpi2_reset_default(sch);
- hrtimer_setup(&q->pi2_timer, dualpi2_timer, CLOCK_MONOTONIC,
- HRTIMER_MODE_ABS_PINNED_SOFT);
if (opt && nla_len(opt)) {
err = dualpi2_change(sch, opt, extack);
diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c
index 0664b2f2d6f2..24db54684e8a 100644
--- a/net/sched/sch_fq_codel.c
+++ b/net/sched/sch_fq_codel.c
@@ -117,7 +117,7 @@ static inline struct sk_buff *dequeue_head(struct fq_codel_flow *flow)
{
struct sk_buff *skb = flow->head;
- flow->head = skb->next;
+ WRITE_ONCE(flow->head, skb->next);
skb_mark_not_on_list(skb);
return skb;
}
@@ -127,7 +127,7 @@ static inline void flow_queue_add(struct fq_codel_flow *flow,
struct sk_buff *skb)
{
if (flow->head == NULL)
- flow->head = skb;
+ WRITE_ONCE(flow->head, skb);
else
flow->tail->next = skb;
flow->tail = skb;
@@ -173,8 +173,8 @@ static unsigned int fq_codel_drop(struct Qdisc *sch, unsigned int max_packets,
} while (++i < max_packets && len < threshold);
/* Tell codel to increase its signal strength also */
- flow->cvars.count += i;
- q->backlogs[idx] -= len;
+ WRITE_ONCE(flow->cvars.count, flow->cvars.count + i);
+ WRITE_ONCE(q->backlogs[idx], q->backlogs[idx] - len);
q->memory_usage -= mem;
sch->qstats.drops += i;
sch->qstats.backlog -= len;
@@ -204,13 +204,13 @@ static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch,
codel_set_enqueue_time(skb);
flow = &q->flows[idx];
flow_queue_add(flow, skb);
- q->backlogs[idx] += qdisc_pkt_len(skb);
+ WRITE_ONCE(q->backlogs[idx], q->backlogs[idx] + qdisc_pkt_len(skb));
qdisc_qstats_backlog_inc(sch, skb);
if (list_empty(&flow->flowchain)) {
list_add_tail(&flow->flowchain, &q->new_flows);
q->new_flow_count++;
- flow->deficit = q->quantum;
+ WRITE_ONCE(flow->deficit, q->quantum);
}
get_codel_cb(skb)->mem_usage = skb->truesize;
q->memory_usage += get_codel_cb(skb)->mem_usage;
@@ -263,7 +263,8 @@ static struct sk_buff *dequeue_func(struct codel_vars *vars, void *ctx)
flow = container_of(vars, struct fq_codel_flow, cvars);
if (flow->head) {
skb = dequeue_head(flow);
- q->backlogs[flow - q->flows] -= qdisc_pkt_len(skb);
+ WRITE_ONCE(q->backlogs[flow - q->flows],
+ q->backlogs[flow - q->flows] - qdisc_pkt_len(skb));
q->memory_usage -= get_codel_cb(skb)->mem_usage;
sch->q.qlen--;
sch->qstats.backlog -= qdisc_pkt_len(skb);
@@ -296,7 +297,7 @@ begin:
flow = list_first_entry(head, struct fq_codel_flow, flowchain);
if (flow->deficit <= 0) {
- flow->deficit += q->quantum;
+ WRITE_ONCE(flow->deficit, flow->deficit + q->quantum);
list_move_tail(&flow->flowchain, &q->old_flows);
goto begin;
}
@@ -314,7 +315,7 @@ begin:
goto begin;
}
qdisc_bstats_update(sch, skb);
- flow->deficit -= qdisc_pkt_len(skb);
+ WRITE_ONCE(flow->deficit, flow->deficit - qdisc_pkt_len(skb));
if (q->cstats.drop_count) {
qdisc_tree_reduce_backlog(sch, q->cstats.drop_count,
@@ -328,7 +329,7 @@ begin:
static void fq_codel_flow_purge(struct fq_codel_flow *flow)
{
rtnl_kfree_skbs(flow->head, flow->tail);
- flow->head = NULL;
+ WRITE_ONCE(flow->head, NULL);
}
static void fq_codel_reset(struct Qdisc *sch)
@@ -656,21 +657,21 @@ static int fq_codel_dump_class_stats(struct Qdisc *sch, unsigned long cl,
memset(&xstats, 0, sizeof(xstats));
xstats.type = TCA_FQ_CODEL_XSTATS_CLASS;
- xstats.class_stats.deficit = flow->deficit;
+ xstats.class_stats.deficit = READ_ONCE(flow->deficit);
xstats.class_stats.ldelay =
- codel_time_to_us(flow->cvars.ldelay);
- xstats.class_stats.count = flow->cvars.count;
- xstats.class_stats.lastcount = flow->cvars.lastcount;
- xstats.class_stats.dropping = flow->cvars.dropping;
- if (flow->cvars.dropping) {
- codel_tdiff_t delta = flow->cvars.drop_next -
+ codel_time_to_us(READ_ONCE(flow->cvars.ldelay));
+ xstats.class_stats.count = READ_ONCE(flow->cvars.count);
+ xstats.class_stats.lastcount = READ_ONCE(flow->cvars.lastcount);
+ xstats.class_stats.dropping = READ_ONCE(flow->cvars.dropping);
+ if (xstats.class_stats.dropping) {
+ codel_tdiff_t delta = READ_ONCE(flow->cvars.drop_next) -
codel_get_time();
xstats.class_stats.drop_next = (delta >= 0) ?
codel_time_to_us(delta) :
-codel_time_to_us(-delta);
}
- if (flow->head) {
+ if (READ_ONCE(flow->head)) {
sch_tree_lock(sch);
skb = flow->head;
while (skb) {
@@ -679,7 +680,7 @@ static int fq_codel_dump_class_stats(struct Qdisc *sch, unsigned long cl,
}
sch_tree_unlock(sch);
}
- qs.backlog = q->backlogs[idx];
+ qs.backlog = READ_ONCE(q->backlogs[idx]);
qs.drops = 0;
}
if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index bc18e1976b6e..17a79fe2f091 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -461,7 +461,8 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
skb->prev = NULL;
/* Random duplication */
- if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor, &q->prng))
+ if (q->duplicate && skb->tc_depth == 0 &&
+ q->duplicate >= get_crandom(&q->dup_cor, &q->prng))
++count;
/* Drop packet? */
@@ -540,11 +541,9 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
*/
if (skb2) {
struct Qdisc *rootq = qdisc_root_bh(sch);
- u32 dupsave = q->duplicate; /* prevent duplicating a dup... */
- q->duplicate = 0;
+ skb2->tc_depth++; /* prevent duplicating a dup... */
rootq->enqueue(skb2, rootq, to_free);
- q->duplicate = dupsave;
skb2 = NULL;
}
@@ -1007,41 +1006,6 @@ static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla,
return 0;
}
-static const struct Qdisc_class_ops netem_class_ops;
-
-static int check_netem_in_tree(struct Qdisc *sch, bool duplicates,
- struct netlink_ext_ack *extack)
-{
- struct Qdisc *root, *q;
- unsigned int i;
-
- root = qdisc_root_sleeping(sch);
-
- if (sch != root && root->ops->cl_ops == &netem_class_ops) {
- if (duplicates ||
- ((struct netem_sched_data *)qdisc_priv(root))->duplicate)
- goto err;
- }
-
- if (!qdisc_dev(root))
- return 0;
-
- hash_for_each(qdisc_dev(root)->qdisc_hash, i, q, hash) {
- if (sch != q && q->ops->cl_ops == &netem_class_ops) {
- if (duplicates ||
- ((struct netem_sched_data *)qdisc_priv(q))->duplicate)
- goto err;
- }
- }
-
- return 0;
-
-err:
- NL_SET_ERR_MSG(extack,
- "netem: cannot mix duplicating netems with other netems in tree");
- return -EINVAL;
-}
-
/* Parse netlink message to set options */
static int netem_change(struct Qdisc *sch, struct nlattr *opt,
struct netlink_ext_ack *extack)
@@ -1118,11 +1082,6 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt,
q->gap = qopt->gap;
q->counter = 0;
q->loss = qopt->loss;
-
- ret = check_netem_in_tree(sch, qopt->duplicate, extack);
- if (ret)
- goto unlock;
-
q->duplicate = qopt->duplicate;
/* for compatibility with earlier versions.
diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c
index fb53fbf0e328..b41f2def2e2c 100644
--- a/net/sched/sch_pie.c
+++ b/net/sched/sch_pie.c
@@ -219,16 +219,14 @@ void pie_process_dequeue(struct sk_buff *skb, struct pie_params *params,
* packet timestamp.
*/
if (!params->dq_rate_estimator) {
- vars->qdelay = now - pie_get_enqueue_time(skb);
+ WRITE_ONCE(vars->qdelay,
+ backlog ? now - pie_get_enqueue_time(skb) : 0);
if (vars->dq_tstamp != DTIME_INVALID)
dtime = now - vars->dq_tstamp;
vars->dq_tstamp = now;
- if (backlog == 0)
- vars->qdelay = 0;
-
if (dtime == 0)
return;
@@ -376,7 +374,7 @@ void pie_calculate_probability(struct pie_params *params, struct pie_vars *vars,
if (qdelay > (PSCHED_NS2TICKS(250 * NSEC_PER_MSEC)))
delta += MAX_PROB / (100 / 2);
- vars->prob += delta;
+ WRITE_ONCE(vars->prob, vars->prob + delta);
if (delta > 0) {
/* prevent overflow */
@@ -401,7 +399,7 @@ void pie_calculate_probability(struct pie_params *params, struct pie_vars *vars,
if (qdelay == 0 && qdelay_old == 0 && update_prob)
/* Reduce drop probability to 98.4% */
- vars->prob -= vars->prob / 64;
+ WRITE_ONCE(vars->prob, vars->prob - vars->prob / 64);
WRITE_ONCE(vars->qdelay, qdelay);
vars->backlog_old = backlog;
@@ -501,7 +499,7 @@ static int pie_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
{
struct pie_sched_data *q = qdisc_priv(sch);
struct tc_pie_xstats st = {
- .prob = q->vars.prob << BITS_PER_BYTE,
+ .prob = READ_ONCE(q->vars.prob) << BITS_PER_BYTE,
.delay = ((u32)PSCHED_TICKS2NS(READ_ONCE(q->vars.qdelay))) /
NSEC_PER_USEC,
.packets_in = READ_ONCE(q->stats.packets_in),
@@ -512,7 +510,7 @@ static int pie_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
};
/* avg_dq_rate is only valid if dq_rate_estimator is enabled */
- st.dq_rate_estimating = q->params.dq_rate_estimator;
+ st.dq_rate_estimating = READ_ONCE(q->params.dq_rate_estimator);
/* unscale and return dq_rate in bytes per sec */
if (st.dq_rate_estimating)
diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
index 432b8a3000a5..4d0e44a2e7c6 100644
--- a/net/sched/sch_red.c
+++ b/net/sched/sch_red.c
@@ -162,7 +162,7 @@ static struct sk_buff *red_dequeue(struct Qdisc *sch)
struct red_sched_data *q = qdisc_priv(sch);
struct Qdisc *child = q->qdisc;
- skb = child->dequeue(child);
+ skb = qdisc_dequeue_peeked(child);
if (skb) {
qdisc_bstats_update(sch, skb);
qdisc_qstats_backlog_dec(sch, skb);
diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c
index bd5ef561030f..d3ee8e5479b3 100644
--- a/net/sched/sch_sfb.c
+++ b/net/sched/sch_sfb.c
@@ -441,7 +441,7 @@ static struct sk_buff *sfb_dequeue(struct Qdisc *sch)
struct Qdisc *child = q->qdisc;
struct sk_buff *skb;
- skb = child->dequeue(q->qdisc);
+ skb = qdisc_dequeue_peeked(child);
if (skb) {
qdisc_bstats_update(sch, skb);
diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
index c3f3181dba54..f39822babf88 100644
--- a/net/sched/sch_sfq.c
+++ b/net/sched/sch_sfq.c
@@ -225,7 +225,8 @@ static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x)
sfq_unlink(q, x, n, p);
- d = q->slots[x].qlen--;
+ d = q->slots[x].qlen;
+ WRITE_ONCE(q->slots[x].qlen, d - 1);
if (n == p && q->cur_depth == d)
q->cur_depth--;
sfq_link(q, x);
@@ -238,7 +239,8 @@ static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x)
sfq_unlink(q, x, n, p);
- d = ++q->slots[x].qlen;
+ d = q->slots[x].qlen + 1;
+ WRITE_ONCE(q->slots[x].qlen, d);
if (q->cur_depth < d)
q->cur_depth = d;
sfq_link(q, x);
@@ -298,7 +300,7 @@ static unsigned int sfq_drop(struct Qdisc *sch, struct sk_buff **to_free)
drop:
skb = q->headdrop ? slot_dequeue_head(slot) : slot_dequeue_tail(slot);
len = qdisc_pkt_len(skb);
- slot->backlog -= len;
+ WRITE_ONCE(slot->backlog, slot->backlog - len);
sfq_dec(q, x);
sch->q.qlen--;
qdisc_qstats_backlog_dec(sch, skb);
@@ -314,7 +316,7 @@ drop:
q->tail = NULL; /* no more active slots */
else
q->tail->next = slot->next;
- q->ht[slot->hash] = SFQ_EMPTY_SLOT;
+ WRITE_ONCE(q->ht[slot->hash], SFQ_EMPTY_SLOT);
goto drop;
}
@@ -364,10 +366,10 @@ sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free)
x = q->dep[0].next; /* get a free slot */
if (x >= SFQ_MAX_FLOWS)
return qdisc_drop_reason(skb, sch, to_free, QDISC_DROP_MAXFLOWS);
- q->ht[hash] = x;
+ WRITE_ONCE(q->ht[hash], x);
slot = &q->slots[x];
slot->hash = hash;
- slot->backlog = 0; /* should already be 0 anyway... */
+ WRITE_ONCE(slot->backlog, 0); /* should already be 0 anyway... */
red_set_vars(&slot->vars);
goto enqueue;
}
@@ -426,7 +428,7 @@ congestion_drop:
head = slot_dequeue_head(slot);
delta = qdisc_pkt_len(head) - qdisc_pkt_len(skb);
sch->qstats.backlog -= delta;
- slot->backlog -= delta;
+ WRITE_ONCE(slot->backlog, slot->backlog - delta);
qdisc_drop_reason(head, sch, to_free, QDISC_DROP_FLOW_LIMIT);
slot_queue_add(slot, skb);
@@ -436,7 +438,7 @@ congestion_drop:
enqueue:
qdisc_qstats_backlog_inc(sch, skb);
- slot->backlog += qdisc_pkt_len(skb);
+ WRITE_ONCE(slot->backlog, slot->backlog + qdisc_pkt_len(skb));
slot_queue_add(slot, skb);
sfq_inc(q, x);
if (slot->qlen == 1) { /* The flow is new */
@@ -452,7 +454,7 @@ enqueue:
*/
q->tail = slot;
/* We could use a bigger initial quantum for new flows */
- slot->allot = q->quantum;
+ WRITE_ONCE(slot->allot, q->quantum);
}
if (++sch->q.qlen <= q->limit)
return NET_XMIT_SUCCESS;
@@ -489,7 +491,7 @@ next_slot:
slot = &q->slots[a];
if (slot->allot <= 0) {
q->tail = slot;
- slot->allot += q->quantum;
+ WRITE_ONCE(slot->allot, slot->allot + q->quantum);
goto next_slot;
}
skb = slot_dequeue_head(slot);
@@ -497,10 +499,10 @@ next_slot:
qdisc_bstats_update(sch, skb);
sch->q.qlen--;
qdisc_qstats_backlog_dec(sch, skb);
- slot->backlog -= qdisc_pkt_len(skb);
+ WRITE_ONCE(slot->backlog, slot->backlog - qdisc_pkt_len(skb));
/* Is the slot empty? */
if (slot->qlen == 0) {
- q->ht[slot->hash] = SFQ_EMPTY_SLOT;
+ WRITE_ONCE(q->ht[slot->hash], SFQ_EMPTY_SLOT);
next_a = slot->next;
if (a == next_a) {
q->tail = NULL; /* no more active slots */
@@ -508,7 +510,7 @@ next_slot:
}
q->tail->next = next_a;
} else {
- slot->allot -= qdisc_pkt_len(skb);
+ WRITE_ONCE(slot->allot, slot->allot - qdisc_pkt_len(skb));
}
return skb;
}
@@ -549,9 +551,9 @@ static void sfq_rehash(struct Qdisc *sch)
sfq_dec(q, i);
__skb_queue_tail(&list, skb);
}
- slot->backlog = 0;
+ WRITE_ONCE(slot->backlog, 0);
red_set_vars(&slot->vars);
- q->ht[slot->hash] = SFQ_EMPTY_SLOT;
+ WRITE_ONCE(q->ht[slot->hash], SFQ_EMPTY_SLOT);
}
q->tail = NULL;
@@ -570,7 +572,7 @@ drop:
dropped++;
continue;
}
- q->ht[hash] = x;
+ WRITE_ONCE(q->ht[hash], x);
slot = &q->slots[x];
slot->hash = hash;
}
@@ -581,7 +583,7 @@ drop:
slot->vars.qavg = red_calc_qavg(q->red_parms,
&slot->vars,
slot->backlog);
- slot->backlog += qdisc_pkt_len(skb);
+ WRITE_ONCE(slot->backlog, slot->backlog + qdisc_pkt_len(skb));
sfq_inc(q, x);
if (slot->qlen == 1) { /* The flow is new */
if (q->tail == NULL) { /* It is the first flow */
@@ -591,7 +593,7 @@ drop:
q->tail->next = x;
}
q->tail = slot;
- slot->allot = q->quantum;
+ WRITE_ONCE(slot->allot, q->quantum);
}
}
sch->q.qlen -= dropped;
@@ -905,16 +907,16 @@ static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
struct gnet_dump *d)
{
struct sfq_sched_data *q = qdisc_priv(sch);
- sfq_index idx = q->ht[cl - 1];
+ sfq_index idx = READ_ONCE(q->ht[cl - 1]);
struct gnet_stats_queue qs = { 0 };
struct tc_sfq_xstats xstats = { 0 };
if (idx != SFQ_EMPTY_SLOT) {
const struct sfq_slot *slot = &q->slots[idx];
- xstats.allot = slot->allot;
- qs.qlen = slot->qlen;
- qs.backlog = slot->backlog;
+ xstats.allot = READ_ONCE(slot->allot);
+ qs.qlen = READ_ONCE(slot->qlen);
+ qs.backlog = READ_ONCE(slot->backlog);
}
if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
return -1;
@@ -930,7 +932,7 @@ static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
return;
for (i = 0; i < q->divisor; i++) {
- if (q->ht[i] == SFQ_EMPTY_SLOT) {
+ if (READ_ONCE(q->ht[i]) == SFQ_EMPTY_SLOT) {
arg->count++;
continue;
}
diff --git a/net/sctp/diag.c b/net/sctp/diag.c
index 2afb376299fe..d758f5c3e06e 100644
--- a/net/sctp/diag.c
+++ b/net/sctp/diag.c
@@ -266,15 +266,15 @@ static int sctp_sock_dump_one(struct sctp_endpoint *ep, struct sctp_transport *t
lock_sock(sk);
- rep = nlmsg_new(inet_assoc_attr_size(sk, assoc), GFP_KERNEL);
- if (!rep) {
- release_sock(sk);
- return -ENOMEM;
+ if (ep != assoc->ep || assoc->base.dead) {
+ err = -ESTALE;
+ goto out_unlock;
}
- if (ep != assoc->ep) {
- err = -EAGAIN;
- goto out;
+ rep = nlmsg_new(inet_assoc_attr_size(sk, assoc), GFP_KERNEL);
+ if (!rep) {
+ err = -ENOMEM;
+ goto out_unlock;
}
err = inet_sctp_diag_fill(sk, assoc, rep, req, sk_user_ns(NETLINK_CB(skb).sk),
@@ -289,8 +289,9 @@ static int sctp_sock_dump_one(struct sctp_endpoint *ep, struct sctp_transport *t
return nlmsg_unicast(sock_net(skb->sk)->diag_nlsk, rep, NETLINK_CB(skb).portid);
out:
- release_sock(sk);
kfree_skb(rep);
+out_unlock:
+ release_sock(sk);
return err;
}
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index de86ac088289..85264862fb6b 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -1730,6 +1730,7 @@ struct sctp_association *sctp_unpack_cookie(
struct sctp_signed_cookie *cookie;
struct sk_buff *skb = chunk->skb;
struct sctp_cookie *bear_cookie;
+ struct sctp_chunkhdr *ch;
enum sctp_scope scope;
unsigned int len;
ktime_t kt;
@@ -1759,6 +1760,10 @@ struct sctp_association *sctp_unpack_cookie(
cookie = chunk->subh.cookie_hdr;
bear_cookie = &cookie->c;
+ ch = (struct sctp_chunkhdr *)(bear_cookie + 1);
+ if (ntohs(ch->length) > len - fixed_size)
+ goto malformed;
+
/* Verify the cookie's MAC, if cookie authentication is enabled. */
if (sctp_sk(ep->base.sk)->cookie_auth_enable) {
u8 mac[SHA256_DIGEST_SIZE];
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index 8e89a870780c..9b23c11cbb9e 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -2598,11 +2598,7 @@ static enum sctp_disposition sctp_sf_do_5_2_6_stale(
*/
sctp_add_cmd_sf(commands, SCTP_CMD_DEL_NON_PRIMARY, SCTP_NULL());
- /* If we've sent any data bundled with COOKIE-ECHO we will need to
- * resend
- */
- sctp_add_cmd_sf(commands, SCTP_CMD_T1_RETRAN,
- SCTP_TRANSPORT(asoc->peer.primary_path));
+ sctp_add_cmd_sf(commands, SCTP_CMD_PURGE_OUTQUEUE, SCTP_NULL());
/* Cast away the const modifier, as we want to just
* rerun it through as a sideffect.
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 58d0d9747f0b..66e12fb0c646 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -1986,6 +1986,15 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len)
goto out_unlock;
iov_iter_revert(&msg->msg_iter, err);
+
+ /* sctp_sendmsg_to_asoc() may have released the socket
+ * lock (sctp_wait_for_sndbuf), during which other
+ * associations on ep->asocs could have been peeled
+ * off or freed. @asoc itself is revalidated by the
+ * base.dead and base.sk checks in sctp_wait_for_sndbuf,
+ * so re-derive the cached cursor from it.
+ */
+ tmp = list_next_entry(asoc, asocs);
}
goto out_unlock;
@@ -9394,6 +9403,8 @@ static int sctp_wait_for_connect(struct sctp_association *asoc, long *timeo_p)
release_sock(sk);
current_timeo = schedule_timeout(current_timeo);
lock_sock(sk);
+ if (sk != asoc->base.sk)
+ goto do_error;
*timeo_p = current_timeo;
}
diff --git a/net/shaper/shaper.c b/net/shaper/shaper.c
index 94bc9c7382ea..dea9270f3e57 100644
--- a/net/shaper/shaper.c
+++ b/net/shaper/shaper.c
@@ -21,6 +21,8 @@
#define NET_SHAPER_ID_UNSPEC NET_SHAPER_ID_MASK
+static_assert(NET_SHAPER_ID_UNSPEC == NET_SHAPER_MAX_HANDLE_ID + 1);
+
struct net_shaper_hierarchy {
struct xarray shapers;
};
@@ -90,6 +92,12 @@ static int net_shaper_handle_size(void)
nla_total_size(sizeof(u32)));
}
+static int net_shaper_group_reply_size(void)
+{
+ return nla_total_size(sizeof(u32)) + /* NET_SHAPER_A_IFINDEX */
+ net_shaper_handle_size(); /* NET_SHAPER_A_HANDLE */
+}
+
static int net_shaper_fill_binding(struct sk_buff *msg,
const struct net_shaper_binding *binding,
u32 type)
@@ -130,35 +138,58 @@ handle_nest_cancel:
return -EMSGSIZE;
}
+static void net_shaper_copy(struct net_shaper *dst,
+ const struct net_shaper *src)
+{
+ WRITE_ONCE(dst->parent.scope, READ_ONCE(src->parent.scope));
+ WRITE_ONCE(dst->parent.id, READ_ONCE(src->parent.id));
+ WRITE_ONCE(dst->handle.scope, READ_ONCE(src->handle.scope));
+ WRITE_ONCE(dst->handle.id, READ_ONCE(src->handle.id));
+
+ WRITE_ONCE(dst->metric, READ_ONCE(src->metric));
+ WRITE_ONCE(dst->bw_min, READ_ONCE(src->bw_min));
+ WRITE_ONCE(dst->bw_max, READ_ONCE(src->bw_max));
+ WRITE_ONCE(dst->burst, READ_ONCE(src->burst));
+ WRITE_ONCE(dst->priority, READ_ONCE(src->priority));
+ WRITE_ONCE(dst->weight, READ_ONCE(src->weight));
+
+ /* private fields are only used on the write path under the lock */
+ data_race(dst->leaves = src->leaves);
+}
+
static int
net_shaper_fill_one(struct sk_buff *msg,
const struct net_shaper_binding *binding,
const struct net_shaper *shaper,
const struct genl_info *info)
{
+ struct net_shaper cur;
void *hdr;
hdr = genlmsg_iput(msg, info);
if (!hdr)
return -EMSGSIZE;
+ /* Make a copy to avoid data races */
+ net_shaper_copy(&cur, shaper);
+
if (net_shaper_fill_binding(msg, binding, NET_SHAPER_A_IFINDEX) ||
- net_shaper_fill_handle(msg, &shaper->parent,
+ net_shaper_fill_handle(msg, &cur.parent,
NET_SHAPER_A_PARENT) ||
- net_shaper_fill_handle(msg, &shaper->handle,
+ net_shaper_fill_handle(msg, &cur.handle,
NET_SHAPER_A_HANDLE) ||
- ((shaper->bw_min || shaper->bw_max || shaper->burst) &&
- nla_put_u32(msg, NET_SHAPER_A_METRIC, shaper->metric)) ||
- (shaper->bw_min &&
- nla_put_uint(msg, NET_SHAPER_A_BW_MIN, shaper->bw_min)) ||
- (shaper->bw_max &&
- nla_put_uint(msg, NET_SHAPER_A_BW_MAX, shaper->bw_max)) ||
- (shaper->burst &&
- nla_put_uint(msg, NET_SHAPER_A_BURST, shaper->burst)) ||
- (shaper->priority &&
- nla_put_u32(msg, NET_SHAPER_A_PRIORITY, shaper->priority)) ||
- (shaper->weight &&
- nla_put_u32(msg, NET_SHAPER_A_WEIGHT, shaper->weight)))
+ ((cur.bw_min || cur.bw_max || cur.burst) &&
+ nla_put_u32(msg, NET_SHAPER_A_METRIC, cur.metric)) ||
+ (cur.bw_min &&
+ nla_put_uint(msg, NET_SHAPER_A_BW_MIN, cur.bw_min)) ||
+ (cur.bw_max &&
+ nla_put_uint(msg, NET_SHAPER_A_BW_MAX, cur.bw_max)) ||
+ (cur.burst &&
+ nla_put_uint(msg, NET_SHAPER_A_BURST, cur.burst)) ||
+ (cur.priority &&
+ nla_put_u32(msg, NET_SHAPER_A_PRIORITY, cur.priority)) ||
+ (cur.weight &&
+ nla_put_u32(msg, NET_SHAPER_A_WEIGHT, cur.weight)))
goto nla_put_failure;
genlmsg_end(msg, hdr);
@@ -275,25 +306,24 @@ static void net_shaper_default_parent(const struct net_shaper_handle *handle,
parent->id = 0;
}
-/*
- * MARK_0 is already in use due to XA_FLAGS_ALLOC, can't reuse such flag as
- * it's cleared by xa_store().
- */
-#define NET_SHAPER_NOT_VALID XA_MARK_1
-
static struct net_shaper *
net_shaper_lookup(struct net_shaper_binding *binding,
const struct net_shaper_handle *handle)
{
u32 index = net_shaper_handle_to_index(handle);
struct net_shaper_hierarchy *hierarchy;
+ struct net_shaper *cur;
hierarchy = net_shaper_hierarchy_rcu(binding);
- if (!hierarchy || xa_get_mark(&hierarchy->shapers, index,
- NET_SHAPER_NOT_VALID))
+ if (!hierarchy)
+ return NULL;
+
+ cur = xa_load(&hierarchy->shapers, index);
+ /* Check valid before reading fields */
+ if (!cur || !smp_load_acquire(&cur->valid))
return NULL;
- return xa_load(&hierarchy->shapers, index);
+ return cur;
}
/* Allocate on demand the per device shaper's hierarchy container.
@@ -348,7 +378,7 @@ static int net_shaper_pre_insert(struct net_shaper_binding *binding,
handle->id == NET_SHAPER_ID_UNSPEC) {
u32 min, max;
- handle->id = NET_SHAPER_ID_MASK - 1;
+ handle->id = NET_SHAPER_MAX_HANDLE_ID;
max = net_shaper_handle_to_index(handle);
handle->id = 0;
min = net_shaper_handle_to_index(handle);
@@ -370,13 +400,10 @@ static int net_shaper_pre_insert(struct net_shaper_binding *binding,
goto free_id;
}
- /* Mark 'tentative' shaper inside the hierarchy container.
- * xa_set_mark is a no-op if the previous store fails.
+ /* Insert as 'tentative' (no VALID mark). The mark will be set by
+ * net_shaper_commit() once the driver-side configuration succeeds.
*/
- xa_lock(&hierarchy->shapers);
- prev = __xa_store(&hierarchy->shapers, index, cur, GFP_KERNEL);
- __xa_set_mark(&hierarchy->shapers, index, NET_SHAPER_NOT_VALID);
- xa_unlock(&hierarchy->shapers);
+ prev = xa_store(&hierarchy->shapers, index, cur, GFP_KERNEL);
if (xa_err(prev)) {
NL_SET_ERR_MSG(extack, "Can't insert shaper into device store");
kfree_rcu(cur, rcu);
@@ -410,12 +437,10 @@ static void net_shaper_commit(struct net_shaper_binding *binding,
if (WARN_ON_ONCE(!cur))
continue;
- /* Successful update: drop the tentative mark
- * and update the hierarchy container.
- */
- __xa_clear_mark(&hierarchy->shapers, index,
- NET_SHAPER_NOT_VALID);
- *cur = shapers[i];
+ /* Successful update: update the hierarchy container... */
+ net_shaper_copy(cur, &shapers[i]);
+ /* ... publish to lockless readers. */
+ smp_store_release(&cur->valid, true);
}
xa_unlock(&hierarchy->shapers);
}
@@ -431,10 +456,11 @@ static void net_shaper_rollback(struct net_shaper_binding *binding)
return;
xa_lock(&hierarchy->shapers);
- xa_for_each_marked(&hierarchy->shapers, index, cur,
- NET_SHAPER_NOT_VALID) {
+ xa_for_each(&hierarchy->shapers, index, cur) {
+ if (cur->valid)
+ continue;
__xa_erase(&hierarchy->shapers, index);
- kfree(cur);
+ kfree_rcu(cur, rcu);
}
xa_unlock(&hierarchy->shapers);
}
@@ -465,10 +491,21 @@ static int net_shaper_parse_handle(const struct nlattr *attr,
* shaper (any other value).
*/
id_attr = tb[NET_SHAPER_A_HANDLE_ID];
- if (id_attr)
+ if (id_attr) {
id = nla_get_u32(id_attr);
- else if (handle->scope == NET_SHAPER_SCOPE_NODE)
+ } else if (handle->scope == NET_SHAPER_SCOPE_NODE) {
id = NET_SHAPER_ID_UNSPEC;
+ } else if (handle->scope == NET_SHAPER_SCOPE_QUEUE) {
+ NL_SET_ERR_ATTR_MISS(info->extack, attr,
+ NET_SHAPER_A_HANDLE_ID);
+ return -EINVAL;
+ }
+
+ if (id && handle->scope == NET_SHAPER_SCOPE_NETDEV) {
+ NL_SET_ERR_MSG_ATTR(info->extack, id_attr,
+ "Netdev scope is a singleton, must use ID 0");
+ return -EINVAL;
+ }
handle->id = id;
return 0;
@@ -836,7 +873,12 @@ int net_shaper_nl_get_dumpit(struct sk_buff *skb,
goto out_unlock;
for (; (shaper = xa_find(&hierarchy->shapers, &ctx->start_index,
- U32_MAX, XA_PRESENT)); ctx->start_index++) {
+ U32_MAX, XA_PRESENT));
+ ctx->start_index++) {
+ /* Check valid before reading fields */
+ if (!smp_load_acquire(&shaper->valid))
+ continue;
+
ret = net_shaper_fill_one(skb, binding, shaper, info);
if (ret)
break;
@@ -932,6 +974,46 @@ static int net_shaper_handle_cmp(const struct net_shaper_handle *a,
return memcmp(a, b, sizeof(*a));
}
+static int net_shaper_parse_leaves(struct net_shaper_binding *binding,
+ struct genl_info *info,
+ const struct net_shaper *node,
+ struct net_shaper *leaves,
+ int leaves_count)
+{
+ struct nlattr *attr;
+ int i, j, ret, rem;
+
+ i = 0;
+ nla_for_each_attr_type(attr, NET_SHAPER_A_LEAVES,
+ genlmsg_data(info->genlhdr),
+ genlmsg_len(info->genlhdr), rem) {
+ if (WARN_ON_ONCE(i >= leaves_count))
+ return -EINVAL;
+
+ ret = net_shaper_parse_leaf(binding, attr, info,
+ node, &leaves[i]);
+ if (ret)
+ return ret;
+
+ /* Reject duplicates */
+ for (j = 0; j < i; j++) {
+ if (net_shaper_handle_cmp(&leaves[i].handle,
+ &leaves[j].handle))
+ continue;
+
+ NL_SET_ERR_MSG_ATTR_FMT(info->extack, attr,
+ "Duplicate leaf shaper %d:%d",
+ leaves[i].handle.scope,
+ leaves[i].handle.id);
+ return -EINVAL;
+ }
+
+ i++;
+ }
+
+ return 0;
+}
+
static int net_shaper_parent_from_leaves(int leaves_count,
const struct net_shaper *leaves,
struct net_shaper *node,
@@ -964,15 +1046,22 @@ static int __net_shaper_group(struct net_shaper_binding *binding,
int i, ret;
if (node->handle.scope == NET_SHAPER_SCOPE_NODE) {
+ struct net_shaper *cur = NULL;
+
new_node = node->handle.id == NET_SHAPER_ID_UNSPEC;
- if (!new_node && !net_shaper_lookup(binding, &node->handle)) {
- /* The related attribute is not available when
- * reaching here from the delete() op.
- */
- NL_SET_ERR_MSG_FMT(extack, "Node shaper %d:%d does not exists",
- node->handle.scope, node->handle.id);
- return -ENOENT;
+ if (!new_node) {
+ cur = net_shaper_lookup(binding, &node->handle);
+ if (!cur) {
+ /* The related attribute is not available
+ * when reaching here from the delete() op.
+ */
+ NL_SET_ERR_MSG_FMT(extack,
+ "Node shaper %d:%d does not exist",
+ node->handle.scope,
+ node->handle.id);
+ return -ENOENT;
+ }
}
/* When unspecified, the node parent scope is inherited from
@@ -986,6 +1075,15 @@ static int __net_shaper_group(struct net_shaper_binding *binding,
return ret;
}
+ if (cur && net_shaper_handle_cmp(&cur->parent,
+ &node->parent)) {
+ NL_SET_ERR_MSG_FMT(extack,
+ "Cannot reparent node shaper %d:%d",
+ node->handle.scope,
+ node->handle.id);
+ return -EOPNOTSUPP;
+ }
+
} else {
net_shaper_default_parent(&node->handle, &node->parent);
}
@@ -1162,7 +1260,7 @@ static int net_shaper_group_send_reply(struct net_shaper_binding *binding,
free_msg:
/* Should never happen as msg is pre-allocated with enough space. */
WARN_ONCE(true, "calculated message payload length (%d)",
- net_shaper_handle_size());
+ net_shaper_group_reply_size());
nlmsg_free(msg);
return -EMSGSIZE;
}
@@ -1172,10 +1270,9 @@ int net_shaper_nl_group_doit(struct sk_buff *skb, struct genl_info *info)
struct net_shaper **old_nodes, *leaves, node = {};
struct net_shaper_hierarchy *hierarchy;
struct net_shaper_binding *binding;
- int i, ret, rem, leaves_count;
+ int i, ret, leaves_count;
int old_nodes_count = 0;
struct sk_buff *msg;
- struct nlattr *attr;
if (GENL_REQ_ATTR_CHECK(info, NET_SHAPER_A_LEAVES))
return -EINVAL;
@@ -1203,26 +1300,19 @@ int net_shaper_nl_group_doit(struct sk_buff *skb, struct genl_info *info)
if (ret)
goto free_leaves;
- i = 0;
- nla_for_each_attr_type(attr, NET_SHAPER_A_LEAVES,
- genlmsg_data(info->genlhdr),
- genlmsg_len(info->genlhdr), rem) {
- if (WARN_ON_ONCE(i >= leaves_count))
- goto free_leaves;
-
- ret = net_shaper_parse_leaf(binding, attr, info,
- &node, &leaves[i]);
- if (ret)
- goto free_leaves;
- i++;
- }
+ ret = net_shaper_parse_leaves(binding, info, &node,
+ leaves, leaves_count);
+ if (ret)
+ goto free_leaves;
/* Prepare the msg reply in advance, to avoid device operation
* rollback on allocation failure.
*/
- msg = genlmsg_new(net_shaper_handle_size(), GFP_KERNEL);
- if (!msg)
+ msg = genlmsg_new(net_shaper_group_reply_size(), GFP_KERNEL);
+ if (!msg) {
+ ret = -ENOMEM;
goto free_leaves;
+ }
hierarchy = net_shaper_hierarchy_setup(binding);
if (!hierarchy) {
diff --git a/net/shaper/shaper_nl_gen.c b/net/shaper/shaper_nl_gen.c
index 9b29be3ef19a..76eff85ec66d 100644
--- a/net/shaper/shaper_nl_gen.c
+++ b/net/shaper/shaper_nl_gen.c
@@ -11,10 +11,15 @@
#include <uapi/linux/net_shaper.h>
+/* Integer value ranges */
+static const struct netlink_range_validation net_shaper_a_handle_id_range = {
+ .max = NET_SHAPER_MAX_HANDLE_ID,
+};
+
/* Common nested types */
const struct nla_policy net_shaper_handle_nl_policy[NET_SHAPER_A_HANDLE_ID + 1] = {
[NET_SHAPER_A_HANDLE_SCOPE] = NLA_POLICY_MAX(NLA_U32, 3),
- [NET_SHAPER_A_HANDLE_ID] = { .type = NLA_U32, },
+ [NET_SHAPER_A_HANDLE_ID] = NLA_POLICY_FULL_RANGE(NLA_U32, &net_shaper_a_handle_id_range),
};
const struct nla_policy net_shaper_leaf_info_nl_policy[NET_SHAPER_A_WEIGHT + 1] = {
diff --git a/net/shaper/shaper_nl_gen.h b/net/shaper/shaper_nl_gen.h
index 42c46c52c775..2406652a9014 100644
--- a/net/shaper/shaper_nl_gen.h
+++ b/net/shaper/shaper_nl_gen.h
@@ -12,6 +12,8 @@
#include <uapi/linux/net_shaper.h>
+#define NET_SHAPER_MAX_HANDLE_ID 67108862
+
/* Common nested types */
extern const struct nla_policy net_shaper_handle_nl_policy[NET_SHAPER_A_HANDLE_ID + 1];
extern const struct nla_policy net_shaper_leaf_info_nl_policy[NET_SHAPER_A_WEIGHT + 1];
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 1a565095376a..b5db69073e20 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -188,10 +188,12 @@ static bool smc_hs_congested(const struct sock *sk)
struct smc_hashinfo smc_v4_hashinfo = {
.lock = __RW_LOCK_UNLOCKED(smc_v4_hashinfo.lock),
+ .ht = HLIST_HEAD_INIT,
};
struct smc_hashinfo smc_v6_hashinfo = {
.lock = __RW_LOCK_UNLOCKED(smc_v6_hashinfo.lock),
+ .ht = HLIST_HEAD_INIT,
};
int smc_hash_sk(struct sock *sk)
@@ -1400,7 +1402,8 @@ smc_v2_determine_accepted_chid(struct smc_clc_msg_accept_confirm *aclc,
int i;
for (i = 0; i < ini->ism_offered_cnt + 1; i++) {
- if (ini->ism_chid[i] == ntohs(aclc->d1.chid)) {
+ if (ini->ism_dev[i] &&
+ ini->ism_chid[i] == ntohs(aclc->d1.chid)) {
ini->ism_selected = i;
return 0;
}
@@ -1628,12 +1631,8 @@ static void smc_connect_work(struct work_struct *work)
lock_sock(&smc->sk);
if (rc != 0 || smc->sk.sk_err) {
smc->sk.sk_state = SMC_CLOSED;
- if (rc == -EPIPE || rc == -EAGAIN)
- smc->sk.sk_err = EPIPE;
- else if (rc == -ECONNREFUSED)
- smc->sk.sk_err = ECONNREFUSED;
- else if (signal_pending(current))
- smc->sk.sk_err = -sock_intr_errno(timeo);
+ if (!smc->sk.sk_err)
+ smc->sk.sk_err = (rc == -EAGAIN) ? EPIPE : -rc;
sock_put(&smc->sk); /* passive closing */
goto out;
}
@@ -3058,18 +3057,17 @@ static int __smc_setsockopt(struct socket *sock, int level, int optname,
smc = smc_sk(sk);
+ /* pre-fetch user data outside the lock */
+ if (optname == SMC_LIMIT_HS) {
+ if (optlen < sizeof(int))
+ return -EINVAL;
+ if (copy_from_sockptr(&val, optval, sizeof(int)))
+ return -EFAULT;
+ }
+
lock_sock(sk);
switch (optname) {
case SMC_LIMIT_HS:
- if (optlen < sizeof(int)) {
- rc = -EINVAL;
- break;
- }
- if (copy_from_sockptr(&val, optval, sizeof(int))) {
- rc = -EFAULT;
- break;
- }
-
smc->limit_smc_hs = !!val;
rc = 0;
break;
@@ -3521,8 +3519,6 @@ static int __init smc_init(void)
pr_err("%s: sock_register fails with %d\n", __func__, rc);
goto out_proto6;
}
- INIT_HLIST_HEAD(&smc_v4_hashinfo.ht);
- INIT_HLIST_HEAD(&smc_v6_hashinfo.ht);
rc = smc_ib_register_client();
if (rc) {
diff --git a/net/smc/smc_tracepoint.h b/net/smc/smc_tracepoint.h
index a9a6e3c1113a..53da84f57fd6 100644
--- a/net/smc/smc_tracepoint.h
+++ b/net/smc/smc_tracepoint.h
@@ -51,7 +51,7 @@ DECLARE_EVENT_CLASS(smc_msg_event,
__field(const void *, smc)
__field(u64, net_cookie)
__field(size_t, len)
- __string(name, smc->conn.lnk->ibname)
+ __string(name, smc->conn.lnk ? smc->conn.lnk->ibname : "")
),
TP_fast_assign(
diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
index 7081c1214e6c..27dd6b58b8ff 100644
--- a/net/sunrpc/cache.c
+++ b/net/sunrpc/cache.c
@@ -403,7 +403,7 @@ void sunrpc_init_cache_detail(struct cache_detail *cd)
INIT_LIST_HEAD(&cd->readers);
spin_lock_init(&cd->queue_lock);
init_waitqueue_head(&cd->queue_wait);
- cd->next_seqno = 0;
+ cd->next_seqno = 1;
spin_lock(&cache_list_lock);
cd->nextcheck = 0;
cd->entries = 0;
@@ -1348,6 +1348,9 @@ static void *__cache_seq_start(struct seq_file *m, loff_t *pos)
hash = n >> 32;
entry = n & ((1LL<<32) - 1);
+ if (hash >= cd->hash_size)
+ return NULL;
+
hlist_for_each_entry_rcu(ch, &cd->hash_table[hash], cache_list)
if (!entry--)
return ch;
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index 798243eabb1f..964ebc268ee4 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -789,23 +789,33 @@ static int tls_push_record(struct sock *sk, int flags,
i = msg_pl->sg.end;
sk_msg_iter_var_prev(i);
+ /* msg_pl->sg.data is a ring; data[MAX+1] is reserved for the wrap
+ * link (frags won't use it). 'i' is now the last filled entry:
+ *
+ * i end start
+ * v v v [ rsv ]
+ * [ d ][ d ][ ][ ]...[ ][ d ][ d ][ d ][chain]
+ * ^ END v
+ * `-----------------------------------------'
+ *
+ * Note that SGL does not allow chain-after-chain, so for TLS 1.3,
+ * we must make sure we don't create the wrap entry and then chain
+ * link to content_type immediately at index 0.
+ */
+ if (i < msg_pl->sg.start)
+ sg_chain(msg_pl->sg.data, ARRAY_SIZE(msg_pl->sg.data),
+ msg_pl->sg.data);
+
rec->content_type = record_type;
if (prot->version == TLS_1_3_VERSION) {
/* Add content type to end of message. No padding added */
sg_set_buf(&rec->sg_content_type, &rec->content_type, 1);
sg_mark_end(&rec->sg_content_type);
- sg_chain(msg_pl->sg.data, msg_pl->sg.end + 1,
- &rec->sg_content_type);
+ sg_chain(msg_pl->sg.data, i + 2, &rec->sg_content_type);
} else {
sg_mark_end(sk_msg_elem(msg_pl, i));
}
- if (msg_pl->sg.end < msg_pl->sg.start) {
- sg_chain(&msg_pl->sg.data[msg_pl->sg.start],
- MAX_SKB_FRAGS - msg_pl->sg.start + 1,
- msg_pl->sg.data);
- }
-
i = msg_pl->sg.start;
sg_chain(rec->sg_aead_in, 2, &msg_pl->sg.data[i]);
@@ -1356,9 +1366,14 @@ unlock:
mutex_unlock(&tls_ctx->tx_lock);
}
+/* When has_copied is true the caller has already moved bytes to
+ * userspace. Report sk_err but leave it set so the next read
+ * surfaces it instead of a spurious EOF, otherwise sk_err is
+ * consumed via sock_error().
+ */
static int
tls_rx_rec_wait(struct sock *sk, struct sk_psock *psock, bool nonblock,
- bool released)
+ bool released, bool has_copied)
{
struct tls_context *tls_ctx = tls_get_ctx(sk);
struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
@@ -1376,8 +1391,11 @@ tls_rx_rec_wait(struct sock *sk, struct sk_psock *psock, bool nonblock,
if (!sk_psock_queue_empty(psock))
return 0;
- if (sk->sk_err)
+ if (sk->sk_err) {
+ if (has_copied)
+ return -READ_ONCE(sk->sk_err);
return sock_error(sk);
+ }
if (ret < 0)
return ret;
@@ -1413,7 +1431,7 @@ tls_rx_rec_wait(struct sock *sk, struct sk_psock *psock, bool nonblock,
}
if (unlikely(!tls_strp_msg_load(&ctx->strp, released)))
- return tls_rx_rec_wait(sk, psock, nonblock, false);
+ return tls_rx_rec_wait(sk, psock, nonblock, false, has_copied);
return 1;
}
@@ -2100,7 +2118,7 @@ int tls_sw_recvmsg(struct sock *sk,
int to_decrypt, chunk;
err = tls_rx_rec_wait(sk, psock, flags & MSG_DONTWAIT,
- released);
+ released, !!(decrypted + copied));
if (err <= 0) {
if (psock) {
chunk = sk_msg_recvmsg(sk, psock, msg, len,
@@ -2287,7 +2305,7 @@ ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos,
struct tls_decrypt_arg darg;
err = tls_rx_rec_wait(sk, NULL, flags & SPLICE_F_NONBLOCK,
- true);
+ true, false);
if (err <= 0)
goto splice_read_end;
@@ -2317,9 +2335,9 @@ ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos,
if (copied < 0)
goto splice_requeue;
- if (chunk < rxm->full_len) {
- rxm->offset += len;
- rxm->full_len -= len;
+ if (copied < rxm->full_len) {
+ rxm->offset += copied;
+ rxm->full_len -= copied;
goto splice_requeue;
}
@@ -2373,7 +2391,7 @@ int tls_sw_read_sock(struct sock *sk, read_descriptor_t *desc,
} else {
struct tls_decrypt_arg darg;
- err = tls_rx_rec_wait(sk, NULL, true, released);
+ err = tls_rx_rec_wait(sk, NULL, true, released, !!copied);
if (err <= 0)
goto read_sock_end;
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index e2d787ca3e74..0d9cd977c7b7 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -2711,8 +2711,7 @@ static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
* Sleep until more data has arrived. But check for races..
*/
static long unix_stream_data_wait(struct sock *sk, long timeo,
- struct sk_buff *last, unsigned int last_len,
- bool freezable)
+ struct sk_buff *last, bool freezable)
{
unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE;
struct sk_buff *tail;
@@ -2725,7 +2724,6 @@ static long unix_stream_data_wait(struct sock *sk, long timeo,
tail = skb_peek_tail(&sk->sk_receive_queue);
if (tail != last ||
- (tail && tail->len != last_len) ||
sk->sk_err ||
(sk->sk_shutdown & RCV_SHUTDOWN) ||
signal_pending(current) ||
@@ -2888,7 +2886,7 @@ static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
return -EAGAIN;
}
- WRITE_ONCE(u->inq_len, u->inq_len - skb->len);
+ WRITE_ONCE(u->inq_len, u->inq_len - unix_skb_len(skb));
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
if (skb == u->oob_skb) {
@@ -2921,7 +2919,6 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state,
int flags = state->flags;
bool check_creds = false;
struct scm_cookie scm;
- unsigned int last_len;
struct unix_sock *u;
int copied = 0;
int err = 0;
@@ -2967,7 +2964,6 @@ redo:
goto unlock;
}
last = skb = skb_peek(&sk->sk_receive_queue);
- last_len = last ? last->len : 0;
again:
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
@@ -3001,8 +2997,7 @@ again:
mutex_unlock(&u->iolock);
- timeo = unix_stream_data_wait(sk, timeo, last,
- last_len, freezable);
+ timeo = unix_stream_data_wait(sk, timeo, last, freezable);
if (signal_pending(current)) {
err = sock_intr_errno(timeo);
@@ -3019,7 +3014,6 @@ unlock:
while (skip >= unix_skb_len(skb)) {
skip -= unix_skb_len(skb);
last = skb;
- last_len = skb->len;
skb = skb_peek_next(skb, &sk->sk_receive_queue);
if (!skb)
goto again;
@@ -3069,11 +3063,12 @@ unlock:
unix_detach_fds(&scm, skb);
}
- if (unix_skb_len(skb))
- break;
-
spin_lock(&sk->sk_receive_queue.lock);
- WRITE_ONCE(u->inq_len, u->inq_len - skb->len);
+ WRITE_ONCE(u->inq_len, u->inq_len - chunk);
+ if (unix_skb_len(skb)) {
+ spin_unlock(&sk->sk_receive_queue.lock);
+ break;
+ }
__skb_unlink(skb, &sk->sk_receive_queue);
spin_unlock(&sk->sk_receive_queue.lock);
@@ -3094,7 +3089,6 @@ unlock:
skip = 0;
last = skb;
- last_len = skb->len;
unix_state_lock(sk);
skb = skb_peek_next(skb, &sk->sk_receive_queue);
if (skb)
@@ -3323,6 +3317,9 @@ static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
struct sk_buff *skb;
int answ = 0;
+ if (sk->sk_type != SOCK_STREAM)
+ return -EOPNOTSUPP;
+
mutex_lock(&u->iolock);
skb = skb_peek(&sk->sk_receive_queue);
diff --git a/net/unix/garbage.c b/net/unix/garbage.c
index a7967a345827..0783555e2526 100644
--- a/net/unix/garbage.c
+++ b/net/unix/garbage.c
@@ -607,6 +607,8 @@ static void unix_gc(struct work_struct *work)
struct sk_buff_head hitlist;
struct sk_buff *skb;
+ WRITE_ONCE(gc_in_progress, true);
+
spin_lock(&unix_gc_lock);
if (unix_graph_state == UNIX_GRAPH_NOT_CYCLIC) {
@@ -649,10 +651,8 @@ void unix_schedule_gc(struct user_struct *user)
READ_ONCE(user->unix_inflight) < UNIX_INFLIGHT_SANE_USER)
return;
- if (!READ_ONCE(gc_in_progress)) {
- WRITE_ONCE(gc_in_progress, true);
+ if (!READ_ONCE(gc_in_progress))
queue_work(system_dfl_wq, &unix_gc_work);
- }
if (user && READ_ONCE(unix_graph_cyclic_sccs))
flush_work(&unix_gc_work);
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index 44037b066a5f..2ce1063d4a67 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -642,7 +642,7 @@ int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk)
*/
sock_reset_flag(sk, SOCK_DONE);
sk->sk_state = TCP_CLOSE;
- vsk->peer_shutdown = 0;
+ WRITE_ONCE(vsk->peer_shutdown, 0);
}
if (sk->sk_type == SOCK_SEQPACKET) {
@@ -933,7 +933,7 @@ static struct sock *__vsock_create(struct net *net,
vsk->rejected = false;
vsk->sent_request = false;
vsk->ignore_connecting_rst = false;
- vsk->peer_shutdown = 0;
+ WRITE_ONCE(vsk->peer_shutdown, 0);
INIT_DELAYED_WORK(&vsk->connect_work, vsock_connect_timeout);
INIT_DELAYED_WORK(&vsk->pending_work, vsock_pending_work);
@@ -1241,6 +1241,25 @@ out:
return err;
}
+static __poll_t vsock_poll_shutdown(struct sock *sk, u32 peer_shutdown)
+{
+ __poll_t mask = 0;
+
+ /* INET sockets treat local write shutdown and peer write shutdown as a
+ * case of EPOLLHUP set.
+ */
+ if (sk->sk_shutdown == SHUTDOWN_MASK ||
+ ((sk->sk_shutdown & SEND_SHUTDOWN) &&
+ (peer_shutdown & SEND_SHUTDOWN)))
+ mask |= EPOLLHUP;
+
+ if (sk->sk_shutdown & RCV_SHUTDOWN ||
+ peer_shutdown & SEND_SHUTDOWN)
+ mask |= EPOLLRDHUP;
+
+ return mask;
+}
+
static __poll_t vsock_poll(struct file *file, struct socket *sock,
poll_table *wait)
{
@@ -1258,24 +1277,17 @@ static __poll_t vsock_poll(struct file *file, struct socket *sock,
/* Signify that there has been an error on this socket. */
mask |= EPOLLERR;
- /* INET sockets treat local write shutdown and peer write shutdown as a
- * case of EPOLLHUP set.
- */
- if ((sk->sk_shutdown == SHUTDOWN_MASK) ||
- ((sk->sk_shutdown & SEND_SHUTDOWN) &&
- (vsk->peer_shutdown & SEND_SHUTDOWN))) {
- mask |= EPOLLHUP;
- }
-
- if (sk->sk_shutdown & RCV_SHUTDOWN ||
- vsk->peer_shutdown & SEND_SHUTDOWN) {
- mask |= EPOLLRDHUP;
- }
-
if (sk_is_readable(sk))
mask |= EPOLLIN | EPOLLRDNORM;
if (sock->type == SOCK_DGRAM) {
+ u32 peer_shutdown = READ_ONCE(vsk->peer_shutdown);
+
+ /* DGRAM sockets do not take lock_sock() in poll(), so use one
+ * lockless snapshot for all shutdown-derived mask bits.
+ */
+ mask |= vsock_poll_shutdown(sk, peer_shutdown);
+
/* For datagram sockets we can read if there is something in
* the queue and write as long as the socket isn't shutdown for
* sending.
@@ -1290,6 +1302,7 @@ static __poll_t vsock_poll(struct file *file, struct socket *sock,
} else if (sock_type_connectible(sk->sk_type)) {
const struct vsock_transport *transport;
+ u32 peer_shutdown;
lock_sock(sk);
@@ -1322,8 +1335,10 @@ static __poll_t vsock_poll(struct file *file, struct socket *sock,
* terminated should also be considered read, and we check the
* shutdown flag for that.
*/
+ peer_shutdown = READ_ONCE(vsk->peer_shutdown);
+ mask |= vsock_poll_shutdown(sk, peer_shutdown);
if (sk->sk_shutdown & RCV_SHUTDOWN ||
- vsk->peer_shutdown & SEND_SHUTDOWN) {
+ peer_shutdown & SEND_SHUTDOWN) {
mask |= EPOLLIN | EPOLLRDNORM;
}
diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c
index 7a8963595bf9..b3394946b2ed 100644
--- a/net/vmw_vsock/hyperv_transport.c
+++ b/net/vmw_vsock/hyperv_transport.c
@@ -264,7 +264,7 @@ static void hvs_do_close_lock_held(struct vsock_sock *vsk,
struct sock *sk = sk_vsock(vsk);
sock_set_flag(sk, SOCK_DONE);
- vsk->peer_shutdown = SHUTDOWN_MASK;
+ WRITE_ONCE(vsk->peer_shutdown, SHUTDOWN_MASK);
if (vsock_stream_has_data(vsk) <= 0)
sk->sk_state = TCP_CLOSING;
sk->sk_state_change(sk);
@@ -593,7 +593,9 @@ static int hvs_update_recv_data(struct hvsock *hvs)
return -EIO;
if (payload_len == 0)
- hvs->vsk->peer_shutdown |= SEND_SHUTDOWN;
+ WRITE_ONCE(hvs->vsk->peer_shutdown,
+ READ_ONCE(hvs->vsk->peer_shutdown) |
+ SEND_SHUTDOWN);
hvs->recv_data_len = payload_len;
hvs->recv_data_off = 0;
@@ -736,7 +738,8 @@ static s64 hvs_stream_has_data(struct vsock_sock *vsk)
return ret;
return hvs->recv_data_len;
case 0:
- vsk->peer_shutdown |= SEND_SHUTDOWN;
+ WRITE_ONCE(vsk->peer_shutdown,
+ READ_ONCE(vsk->peer_shutdown) | SEND_SHUTDOWN);
ret = 0;
break;
default: /* -1 */
diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
index 416d533f493d..b10666937c49 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -70,34 +70,6 @@ static bool virtio_transport_can_zcopy(const struct virtio_transport *t_ops,
return true;
}
-static int virtio_transport_init_zcopy_skb(struct vsock_sock *vsk,
- struct sk_buff *skb,
- struct msghdr *msg,
- size_t pkt_len,
- bool zerocopy)
-{
- struct ubuf_info *uarg;
-
- if (msg->msg_ubuf) {
- uarg = msg->msg_ubuf;
- net_zcopy_get(uarg);
- } else {
- struct ubuf_info_msgzc *uarg_zc;
-
- uarg = msg_zerocopy_realloc(sk_vsock(vsk),
- pkt_len, NULL, false);
- if (!uarg)
- return -1;
-
- uarg_zc = uarg_to_msgzc(uarg);
- uarg_zc->zerocopy = zerocopy ? 1 : 0;
- }
-
- skb_zcopy_init(skb, uarg);
-
- return 0;
-}
-
static int virtio_transport_fill_skb(struct sk_buff *skb,
struct virtio_vsock_pkt_info *info,
size_t len,
@@ -136,27 +108,6 @@ static void virtio_transport_init_hdr(struct sk_buff *skb,
hdr->fwd_cnt = cpu_to_le32(0);
}
-static void virtio_transport_copy_nonlinear_skb(const struct sk_buff *skb,
- void *dst,
- size_t len)
-{
- struct iov_iter iov_iter = { 0 };
- struct kvec kvec;
- size_t to_copy;
-
- kvec.iov_base = dst;
- kvec.iov_len = len;
-
- iov_iter.iter_type = ITER_KVEC;
- iov_iter.kvec = &kvec;
- iov_iter.nr_segs = 1;
-
- to_copy = min_t(size_t, len, skb->len);
-
- skb_copy_datagram_iter(skb, VIRTIO_VSOCK_SKB_CB(skb)->offset,
- &iov_iter, to_copy);
-}
-
/* Packet capture */
static struct sk_buff *virtio_transport_build_skb(void *opaque)
{
@@ -166,12 +117,12 @@ static struct sk_buff *virtio_transport_build_skb(void *opaque)
struct sk_buff *skb;
size_t payload_len;
- /* A packet could be split to fit the RX buffer, so we can retrieve
- * the payload length from the header and the buffer pointer taking
- * care of the offset in the original packet.
+ /* A packet could be split to fit the RX buffer, so we use
+ * the payload length from the header, which has been updated
+ * by the sender to reflect the fragment size.
*/
pkt_hdr = virtio_vsock_hdr(pkt);
- payload_len = pkt->len;
+ payload_len = le32_to_cpu(pkt_hdr->len);
skb = alloc_skb(sizeof(*hdr) + sizeof(*pkt_hdr) + payload_len,
GFP_ATOMIC);
@@ -214,12 +165,18 @@ static struct sk_buff *virtio_transport_build_skb(void *opaque)
skb_put_data(skb, pkt_hdr, sizeof(*pkt_hdr));
if (payload_len) {
- if (skb_is_nonlinear(pkt)) {
- void *data = skb_put(skb, payload_len);
-
- virtio_transport_copy_nonlinear_skb(pkt, data, payload_len);
- } else {
- skb_put_data(skb, pkt->data, payload_len);
+ struct iov_iter iov_iter;
+ struct kvec kvec;
+ void *data = skb_put(skb, payload_len);
+
+ kvec.iov_base = data;
+ kvec.iov_len = payload_len;
+ iov_iter_kvec(&iov_iter, ITER_DEST, &kvec, 1, payload_len);
+
+ if (skb_copy_datagram_iter(pkt, VIRTIO_VSOCK_SKB_CB(pkt)->offset,
+ &iov_iter, payload_len)) {
+ kfree_skb(skb);
+ return NULL;
}
}
@@ -248,6 +205,7 @@ static u16 virtio_transport_get_type(struct sock *sk)
static struct sk_buff *virtio_transport_alloc_skb(struct virtio_vsock_pkt_info *info,
size_t payload_len,
bool zcopy,
+ struct ubuf_info *uarg,
u32 src_cid,
u32 src_port,
u32 dst_cid,
@@ -288,6 +246,12 @@ static struct sk_buff *virtio_transport_alloc_skb(struct virtio_vsock_pkt_info *
if (info->msg && payload_len > 0) {
int err;
+ /* Bind the zerocopy lifetime before filling frags so error
+ * rollback frees managed fixed-buffer pages through
+ * the uarg-aware path.
+ */
+ skb_zcopy_set(skb, uarg, NULL);
+
err = virtio_transport_fill_skb(skb, info, payload_len, zcopy);
if (err)
goto out;
@@ -332,8 +296,10 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk,
u32 src_cid, src_port, dst_cid, dst_port;
const struct virtio_transport *t_ops;
struct virtio_vsock_sock *vvs;
+ struct ubuf_info *uarg = NULL;
u32 pkt_len = info->pkt_len;
bool can_zcopy = false;
+ bool have_uref = false;
u32 rest_len;
int ret;
@@ -375,6 +341,25 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk,
if (can_zcopy)
max_skb_len = min_t(u32, VIRTIO_VSOCK_MAX_PKT_BUF_SIZE,
(MAX_SKB_FRAGS * PAGE_SIZE));
+
+ if (info->msg->msg_flags & MSG_ZEROCOPY &&
+ info->op == VIRTIO_VSOCK_OP_RW) {
+ uarg = info->msg->msg_ubuf;
+
+ if (!uarg) {
+ uarg = msg_zerocopy_realloc(sk_vsock(vsk),
+ pkt_len, NULL, false);
+ if (!uarg) {
+ virtio_transport_put_credit(vvs, pkt_len);
+ return -ENOMEM;
+ }
+
+ if (!can_zcopy)
+ uarg_to_msgzc(uarg)->zerocopy = 0;
+
+ have_uref = true;
+ }
+ }
}
rest_len = pkt_len;
@@ -386,6 +371,7 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk,
skb_len = min(max_skb_len, rest_len);
skb = virtio_transport_alloc_skb(info, skb_len, can_zcopy,
+ uarg,
src_cid, src_port,
dst_cid, dst_port);
if (!skb) {
@@ -393,28 +379,6 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk,
break;
}
- /* We process buffer part by part, allocating skb on
- * each iteration. If this is last skb for this buffer
- * and MSG_ZEROCOPY mode is in use - we must allocate
- * completion for the current syscall.
- *
- * Pass pkt_len because msg iter is already consumed
- * by virtio_transport_fill_skb(), so iter->count
- * can not be used for RLIMIT_MEMLOCK pinned-pages
- * accounting done by msg_zerocopy_realloc().
- */
- if (info->msg && info->msg->msg_flags & MSG_ZEROCOPY &&
- skb_len == rest_len && info->op == VIRTIO_VSOCK_OP_RW) {
- if (virtio_transport_init_zcopy_skb(vsk, skb,
- info->msg,
- pkt_len,
- can_zcopy)) {
- kfree_skb(skb);
- ret = -ENOMEM;
- break;
- }
- }
-
virtio_transport_inc_tx_pkt(vvs, skb);
ret = t_ops->send_pkt(skb, info->net);
@@ -437,6 +401,18 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk,
virtio_transport_put_credit(vvs, rest_len);
+ /* msg_zerocopy_realloc() initializes the ubuf_info refcnt to 1.
+ * skb_zcopy_set() increases it for each skb, so we can drop that
+ * initial reference to keep it balanced.
+ */
+ if (have_uref) {
+ if (rest_len == pkt_len)
+ /* No data sent, abort the notification. */
+ net_zcopy_put_abort(uarg, true);
+ else
+ net_zcopy_put(uarg);
+ }
+
/* Return number of bytes, if any data has been sent. */
if (rest_len != pkt_len)
ret = pkt_len - rest_len;
@@ -447,7 +423,16 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk,
static bool virtio_transport_inc_rx_pkt(struct virtio_vsock_sock *vvs,
u32 len)
{
- if (vvs->buf_used + len > vvs->buf_alloc)
+ u64 skb_overhead = ((u64)skb_queue_len(&vvs->rx_queue) + 1) * SKB_TRUESIZE(0);
+
+ /* Allow at most buf_alloc * 2 total budget (payload + overhead),
+ * similar to how SO_RCVBUF is doubled to reserve space for sk_buff
+ * metadata. Check payload against buf_alloc to be sure the other
+ * peer is respecting the credit, and sk_buff overhead to bound
+ * queue growth.
+ */
+ if ((u64)vvs->buf_used + len > vvs->buf_alloc ||
+ skb_overhead > vvs->buf_alloc)
return false;
vvs->rx_bytes += len;
@@ -1204,7 +1189,7 @@ static int virtio_transport_reset_no_sock(const struct virtio_transport *t,
if (!t)
return -ENOTCONN;
- reply = virtio_transport_alloc_skb(&info, 0, false,
+ reply = virtio_transport_alloc_skb(&info, 0, false, NULL,
le64_to_cpu(hdr->dst_cid),
le32_to_cpu(hdr->dst_port),
le64_to_cpu(hdr->src_cid),
@@ -1249,7 +1234,7 @@ static void virtio_transport_do_close(struct vsock_sock *vsk,
struct sock *sk = sk_vsock(vsk);
sock_set_flag(sk, SOCK_DONE);
- vsk->peer_shutdown = SHUTDOWN_MASK;
+ WRITE_ONCE(vsk->peer_shutdown, SHUTDOWN_MASK);
if (vsock_stream_has_data(vsk) <= 0)
sk->sk_state = TCP_CLOSING;
sk->sk_state_change(sk);
@@ -1363,7 +1348,7 @@ destroy:
return err;
}
-static void
+static bool
virtio_transport_recv_enqueue(struct vsock_sock *vsk,
struct sk_buff *skb)
{
@@ -1378,10 +1363,8 @@ virtio_transport_recv_enqueue(struct vsock_sock *vsk,
spin_lock_bh(&vvs->rx_lock);
can_enqueue = virtio_transport_inc_rx_pkt(vvs, len);
- if (!can_enqueue) {
- free_pkt = true;
+ if (!can_enqueue)
goto out;
- }
if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SEQ_EOM)
vvs->msg_count++;
@@ -1421,6 +1404,8 @@ out:
spin_unlock_bh(&vvs->rx_lock);
if (free_pkt)
kfree_skb(skb);
+
+ return can_enqueue;
}
static int
@@ -1433,7 +1418,17 @@ virtio_transport_recv_connected(struct sock *sk,
switch (le16_to_cpu(hdr->op)) {
case VIRTIO_VSOCK_OP_RW:
- virtio_transport_recv_enqueue(vsk, skb);
+ if (!virtio_transport_recv_enqueue(vsk, skb)) {
+ /* There is no more space to queue the packet, so let's
+ * close the connection; otherwise, we'll lose data.
+ */
+ (void)virtio_transport_reset(vsk, skb);
+ virtio_transport_do_close(vsk, true);
+ sk->sk_err = ENOBUFS;
+ sk_error_report(sk);
+ vsock_remove_sock(vsk);
+ break;
+ }
vsock_data_ready(sk);
return err;
case VIRTIO_VSOCK_OP_CREDIT_REQUEST:
@@ -1442,12 +1437,15 @@ virtio_transport_recv_connected(struct sock *sk,
case VIRTIO_VSOCK_OP_CREDIT_UPDATE:
sk->sk_write_space(sk);
break;
- case VIRTIO_VSOCK_OP_SHUTDOWN:
+ case VIRTIO_VSOCK_OP_SHUTDOWN: {
+ u32 peer_shutdown = READ_ONCE(vsk->peer_shutdown);
+
if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SHUTDOWN_RCV)
- vsk->peer_shutdown |= RCV_SHUTDOWN;
+ peer_shutdown |= RCV_SHUTDOWN;
if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SHUTDOWN_SEND)
- vsk->peer_shutdown |= SEND_SHUTDOWN;
- if (vsk->peer_shutdown == SHUTDOWN_MASK) {
+ peer_shutdown |= SEND_SHUTDOWN;
+ WRITE_ONCE(vsk->peer_shutdown, peer_shutdown);
+ if (peer_shutdown == SHUTDOWN_MASK) {
if (vsock_stream_has_data(vsk) <= 0 && !sock_flag(sk, SOCK_DONE)) {
(void)virtio_transport_reset(vsk, NULL);
virtio_transport_do_close(vsk, true);
@@ -1462,6 +1460,7 @@ virtio_transport_recv_connected(struct sock *sk,
if (le32_to_cpu(virtio_vsock_hdr(skb)->flags))
sk->sk_state_change(sk);
break;
+ }
case VIRTIO_VSOCK_OP_RST:
virtio_transport_do_close(vsk, true);
break;
diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c
index 4296ca1183f1..91516488a742 100644
--- a/net/vmw_vsock/vmci_transport.c
+++ b/net/vmw_vsock/vmci_transport.c
@@ -819,7 +819,7 @@ static void vmci_transport_handle_detach(struct sock *sk)
/* On a detach the peer will not be sending or receiving
* anymore.
*/
- vsk->peer_shutdown = SHUTDOWN_MASK;
+ WRITE_ONCE(vsk->peer_shutdown, SHUTDOWN_MASK);
/* We should not be sending anymore since the peer won't be
* there to receive, but we can still receive if there is data
@@ -980,8 +980,10 @@ static int vmci_transport_recv_listen(struct sock *sk,
err = -EINVAL;
}
- if (err < 0)
+ if (err < 0) {
vsock_remove_pending(sk, pending);
+ sk_acceptq_removed(sk);
+ }
release_sock(pending);
vmci_transport_release_pending(pending);
@@ -1164,7 +1166,7 @@ vmci_transport_recv_connecting_server(struct sock *listener,
/* Close and cleanup the connection. */
vmci_transport_send_reset(pending, pkt);
skerr = EPROTO;
- err = pkt->type == VMCI_TRANSPORT_PACKET_TYPE_RST ? 0 : -EINVAL;
+ err = -EINVAL;
goto destroy;
}
@@ -1542,7 +1544,9 @@ static int vmci_transport_recv_connected(struct sock *sk,
if (pkt->u.mode) {
vsk = vsock_sk(sk);
- vsk->peer_shutdown |= pkt->u.mode;
+ WRITE_ONCE(vsk->peer_shutdown,
+ READ_ONCE(vsk->peer_shutdown) |
+ pkt->u.mode);
sk->sk_state_change(sk);
}
break;
@@ -1559,7 +1563,7 @@ static int vmci_transport_recv_connected(struct sock *sk,
* a clean shutdown.
*/
sock_set_flag(sk, SOCK_DONE);
- vsk->peer_shutdown = SHUTDOWN_MASK;
+ WRITE_ONCE(vsk->peer_shutdown, SHUTDOWN_MASK);
if (vsock_stream_has_data(vsk) <= 0)
sk->sk_state = TCP_CLOSING;
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index f334cdef8958..76c537a6e8b5 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -1276,6 +1276,18 @@ static int nl80211_prepare_wdev_dump(struct netlink_callback *cb,
rtnl_unlock();
return -ENODEV;
}
+
+ /*
+ * The first invocation validated the wdev's netns against
+ * the caller via __cfg80211_wdev_from_attrs(). The wiphy
+ * may have moved netns between dumpit invocations (via
+ * NL80211_CMD_SET_WIPHY_NETNS), so re-check here.
+ */
+ if (!net_eq(wiphy_net(wiphy), sock_net(cb->skb->sk))) {
+ rtnl_unlock();
+ return -ENODEV;
+ }
+
*rdev = wiphy_to_rdev(wiphy);
*wdev = NULL;
@@ -6354,6 +6366,9 @@ nl80211_parse_rnr_elems(struct wiphy *wiphy, struct nlattr *attrs,
if (ret)
return ERR_PTR(ret);
+ if (num_elems >= 255)
+ return ERR_PTR(-EINVAL);
+
num_elems++;
}
@@ -6699,6 +6714,12 @@ static int nl80211_calculate_ap_params(struct cfg80211_ap_settings *params)
return -EINVAL;
}
+ if (!!params->he_cap != !!params->he_oper)
+ return -EINVAL;
+
+ if (!!params->eht_cap != !!params->eht_oper)
+ return -EINVAL;
+
return 0;
}
@@ -13867,6 +13888,19 @@ static int nl80211_wiphy_netns(struct sk_buff *skb, struct genl_info *info)
if (IS_ERR(net))
return PTR_ERR(net);
+ /*
+ * The caller already has CAP_NET_ADMIN over the source netns
+ * (enforced by GENL_UNS_ADMIN_PERM on the genl op). Mirror the
+ * convention used by net/core/rtnetlink.c::rtnl_get_net_ns_capable()
+ * and require CAP_NET_ADMIN over the target netns as well, so that
+ * a caller that is privileged in their own user namespace cannot
+ * push a wiphy into a netns where they have no privilege.
+ */
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) {
+ put_net(net);
+ return -EPERM;
+ }
+
err = 0;
/* check if anything to do */
@@ -19828,6 +19862,7 @@ static const struct genl_small_ops nl80211_small_ops[] = {
.cmd = NL80211_CMD_SET_PMK,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = nl80211_set_pmk,
+ .flags = GENL_UNS_ADMIN_PERM,
.internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP |
NL80211_FLAG_CLEAR_SKB),
},
@@ -19835,6 +19870,7 @@ static const struct genl_small_ops nl80211_small_ops[] = {
.cmd = NL80211_CMD_DEL_PMK,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = nl80211_del_pmk,
+ .flags = GENL_UNS_ADMIN_PERM,
.internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP),
},
{
diff --git a/net/wireless/pmsr.c b/net/wireless/pmsr.c
index 4c8ea0583f94..d6cd0de64d1f 100644
--- a/net/wireless/pmsr.c
+++ b/net/wireless/pmsr.c
@@ -88,7 +88,7 @@ static int pmsr_parse_ftm(struct cfg80211_registered_device *rdev,
out->ftm.ftms_per_burst = 0;
if (tb[NL80211_PMSR_FTM_REQ_ATTR_FTMS_PER_BURST])
out->ftm.ftms_per_burst =
- nla_get_u32(tb[NL80211_PMSR_FTM_REQ_ATTR_FTMS_PER_BURST]);
+ nla_get_u8(tb[NL80211_PMSR_FTM_REQ_ATTR_FTMS_PER_BURST]);
if (capa->ftm.max_ftms_per_burst &&
(out->ftm.ftms_per_burst > capa->ftm.max_ftms_per_burst ||
diff --git a/net/wireless/scan.c b/net/wireless/scan.c
index 328af43ef832..27a56ee2e8f0 100644
--- a/net/wireless/scan.c
+++ b/net/wireless/scan.c
@@ -1071,6 +1071,7 @@ int cfg80211_scan(struct cfg80211_registered_device *rdev)
struct cfg80211_scan_request_int *request;
struct cfg80211_scan_request_int *rdev_req = rdev->scan_req;
u32 n_channels = 0, idx, i;
+ int err;
if (!(rdev->wiphy.flags & WIPHY_FLAG_SPLIT_SCAN_6GHZ)) {
rdev_req->req.first_part = true;
@@ -1100,8 +1101,14 @@ int cfg80211_scan(struct cfg80211_registered_device *rdev)
rdev_req->req.scan_6ghz = false;
rdev_req->req.first_part = true;
+ err = rdev_scan(rdev, request);
+ if (err) {
+ kfree(request);
+ return err;
+ }
+
rdev->int_scan_req = request;
- return rdev_scan(rdev, request);
+ return 0;
}
void ___cfg80211_scan_done(struct cfg80211_registered_device *rdev,
@@ -2462,6 +2469,9 @@ size_t cfg80211_merge_profile(const u8 *ie, size_t ielen,
memcpy(merged_ie + copied_len, next_sub->data,
next_sub->datalen);
copied_len += next_sub->datalen;
+
+ mbssid_elem = next_mbssid;
+ sub_elem = next_sub;
}
return copied_len;
diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c
index 22d9d9bae8f5..63d145b524c9 100644
--- a/net/wireless/wext-compat.c
+++ b/net/wireless/wext-compat.c
@@ -789,6 +789,8 @@ static int cfg80211_wext_siwfreq(struct net_device *dev,
chandef.chan = ieee80211_get_channel(&rdev->wiphy, freq);
if (!chandef.chan)
return -EINVAL;
+ if (!cfg80211_chandef_valid(&chandef))
+ return -EINVAL;
return cfg80211_set_monitor_channel(rdev, dev, &chandef);
case NL80211_IFTYPE_MESH_POINT:
freq = cfg80211_wext_freq(wextfreq);
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 887abed25466..f8c8a8c9dfba 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -646,9 +646,42 @@ static u64 xsk_skb_destructor_get_addr(struct sk_buff *skb)
return (u64)((uintptr_t)skb_shinfo(skb)->destructor_arg & ~0x1UL);
}
-static void xsk_skb_destructor_set_addr(struct sk_buff *skb, u64 addr)
+static struct xsk_addrs *__xsk_addrs_alloc(struct sk_buff *skb, u64 addr)
{
- skb_shinfo(skb)->destructor_arg = (void *)((uintptr_t)addr | 0x1UL);
+ struct xsk_addrs *xsk_addr;
+
+ xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL);
+ if (unlikely(!xsk_addr))
+ return NULL;
+
+ xsk_addr->addrs[0] = addr;
+ skb_shinfo(skb)->destructor_arg = (void *)xsk_addr;
+ return xsk_addr;
+}
+
+static struct xsk_addrs *xsk_addrs_alloc(struct sk_buff *skb)
+{
+ struct xsk_addrs *xsk_addr;
+
+ if (!xsk_skb_destructor_is_addr(skb))
+ return (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
+
+ xsk_addr = __xsk_addrs_alloc(skb, xsk_skb_destructor_get_addr(skb));
+ if (likely(xsk_addr))
+ xsk_addr->num_descs = 1;
+ return xsk_addr;
+}
+
+static int xsk_skb_destructor_set_addr(struct sk_buff *skb, u64 addr)
+{
+ if (IS_ENABLED(CONFIG_64BIT)) {
+ skb_shinfo(skb)->destructor_arg = (void *)((uintptr_t)addr | 0x1UL);
+ return 0;
+ }
+
+ if (unlikely(!__xsk_addrs_alloc(skb, addr)))
+ return -ENOMEM;
+ return 0;
}
static void xsk_inc_num_desc(struct sk_buff *skb)
@@ -685,7 +718,7 @@ static void xsk_cq_submit_addr_locked(struct xsk_buff_pool *pool,
spin_lock_irqsave(&pool->cq_prod_lock, flags);
idx = xskq_get_prod(pool->cq);
- if (unlikely(num_descs > 1)) {
+ if (unlikely(!xsk_skb_destructor_is_addr(skb))) {
xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
for (i = 0; i < num_descs; i++) {
@@ -724,14 +757,20 @@ void xsk_destruct_skb(struct sk_buff *skb)
sock_wfree(skb);
}
-static void xsk_skb_init_misc(struct sk_buff *skb, struct xdp_sock *xs,
- u64 addr)
+static int xsk_skb_init_misc(struct sk_buff *skb, struct xdp_sock *xs,
+ u64 addr)
{
+ int err;
+
+ err = xsk_skb_destructor_set_addr(skb, addr);
+ if (unlikely(err))
+ return err;
+
skb->dev = xs->dev;
skb->priority = READ_ONCE(xs->sk.sk_priority);
skb->mark = READ_ONCE(xs->sk.sk_mark);
skb->destructor = xsk_destruct_skb;
- xsk_skb_destructor_set_addr(skb, addr);
+ return 0;
}
static void xsk_consume_skb(struct sk_buff *skb)
@@ -740,7 +779,7 @@ static void xsk_consume_skb(struct sk_buff *skb)
u32 num_descs = xsk_get_num_desc(skb);
struct xsk_addrs *xsk_addr;
- if (unlikely(num_descs > 1)) {
+ if (unlikely(!xsk_skb_destructor_is_addr(skb))) {
xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
kmem_cache_free(xsk_tx_generic_cache, xsk_addr);
}
@@ -763,6 +802,7 @@ static int xsk_skb_metadata(struct sk_buff *skb, void *buffer,
u32 hr)
{
struct xsk_tx_metadata *meta = NULL;
+ u16 csum_start, csum_offset;
if (unlikely(pool->tx_metadata_len == 0))
return -EINVAL;
@@ -772,13 +812,15 @@ static int xsk_skb_metadata(struct sk_buff *skb, void *buffer,
return -EINVAL;
if (meta->flags & XDP_TXMD_FLAGS_CHECKSUM) {
- if (unlikely(meta->request.csum_start +
- meta->request.csum_offset +
+ csum_start = READ_ONCE(meta->request.csum_start);
+ csum_offset = READ_ONCE(meta->request.csum_offset);
+
+ if (unlikely(csum_start + csum_offset +
sizeof(__sum16) > desc->len))
return -EINVAL;
- skb->csum_start = hr + meta->request.csum_start;
- skb->csum_offset = meta->request.csum_offset;
+ skb->csum_start = hr + csum_start;
+ skb->csum_offset = csum_offset;
skb->ip_summed = CHECKSUM_PARTIAL;
if (unlikely(pool->tx_sw_csum)) {
@@ -819,28 +861,19 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
return ERR_PTR(err);
skb_reserve(skb, hr);
-
- xsk_skb_init_misc(skb, xs, desc->addr);
if (desc->options & XDP_TX_METADATA) {
err = xsk_skb_metadata(skb, buffer, desc, pool, hr);
- if (unlikely(err))
+ if (unlikely(err)) {
+ kfree_skb(skb);
return ERR_PTR(err);
+ }
}
} else {
struct xsk_addrs *xsk_addr;
- if (xsk_skb_destructor_is_addr(skb)) {
- xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache,
- GFP_KERNEL);
- if (!xsk_addr)
- return ERR_PTR(-ENOMEM);
-
- xsk_addr->num_descs = 1;
- xsk_addr->addrs[0] = xsk_skb_destructor_get_addr(skb);
- skb_shinfo(skb)->destructor_arg = (void *)xsk_addr;
- } else {
- xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
- }
+ xsk_addr = xsk_addrs_alloc(skb);
+ if (!xsk_addr)
+ return ERR_PTR(-ENOMEM);
/* in case of -EOVERFLOW that could happen below,
* xsk_consume_skb() will release this node as whole skb
@@ -856,8 +889,11 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
addr = buffer - pool->addrs;
for (copied = 0, i = skb_shinfo(skb)->nr_frags; copied < len; i++) {
- if (unlikely(i >= MAX_SKB_FRAGS))
+ if (unlikely(i >= MAX_SKB_FRAGS)) {
+ if (!xs->skb)
+ kfree_skb(skb);
return ERR_PTR(-EOVERFLOW);
+ }
page = pool->umem->pgs[addr >> PAGE_SHIFT];
get_page(page);
@@ -914,7 +950,6 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
if (unlikely(err))
goto free_err;
- xsk_skb_init_misc(skb, xs, desc->addr);
if (desc->options & XDP_TX_METADATA) {
err = xsk_skb_metadata(skb, buffer, desc,
xs->pool, hr);
@@ -927,19 +962,10 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
struct page *page;
u8 *vaddr;
- if (xsk_skb_destructor_is_addr(skb)) {
- xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache,
- GFP_KERNEL);
- if (!xsk_addr) {
- err = -ENOMEM;
- goto free_err;
- }
-
- xsk_addr->num_descs = 1;
- xsk_addr->addrs[0] = xsk_skb_destructor_get_addr(skb);
- skb_shinfo(skb)->destructor_arg = (void *)xsk_addr;
- } else {
- xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
+ xsk_addr = xsk_addrs_alloc(skb);
+ if (!xsk_addr) {
+ err = -ENOMEM;
+ goto free_err;
}
if (unlikely(nr_frags == (MAX_SKB_FRAGS - 1) && xp_mb_desc(desc))) {
@@ -964,18 +990,28 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
}
}
+ if (!xs->skb) {
+ err = xsk_skb_init_misc(skb, xs, desc->addr);
+ if (unlikely(err))
+ goto free_err;
+ }
xsk_inc_num_desc(skb);
return skb;
free_err:
- if (skb && !skb_shinfo(skb)->nr_frags)
+ if (skb && !xs->skb)
kfree_skb(skb);
if (err == -EOVERFLOW) {
- /* Drop the packet */
- xsk_inc_num_desc(xs->skb);
- xsk_drop_skb(xs->skb);
+ if (xs->skb) {
+ /* Drop the packet */
+ xsk_inc_num_desc(xs->skb);
+ xsk_drop_skb(xs->skb);
+ } else {
+ xsk_cq_cancel_locked(xs->pool, 1);
+ xs->tx->invalid_descs++;
+ }
xskq_cons_release(xs->tx);
} else {
/* Let application retry */
diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c
index cd7bc50872f6..d981cfdd8535 100644
--- a/net/xdp/xsk_buff_pool.c
+++ b/net/xdp/xsk_buff_pool.c
@@ -175,6 +175,9 @@ int xp_assign_dev(struct xsk_buff_pool *pool,
if (force_zc && force_copy)
return -EINVAL;
+ if (pool->tx_sw_csum && (netdev->priv_flags & IFF_TX_SKB_NO_LINEAR))
+ return -EOPNOTSUPP;
+
if (xsk_get_pool_from_qid(netdev, queue_id))
return -EBUSY;
diff --git a/net/xdp/xskmap.c b/net/xdp/xskmap.c
index afa457506274..3bff346308d0 100644
--- a/net/xdp/xskmap.c
+++ b/net/xdp/xskmap.c
@@ -184,6 +184,10 @@ static long xsk_map_update_elem(struct bpf_map *map, void *key, void *value,
}
xs = (struct xdp_sock *)sock->sk;
+ if (!READ_ONCE(xs->rx)) {
+ sockfd_put(sock);
+ return -ENOBUFS;
+ }
map_entry = &m->xsk_map[i];
node = xsk_map_node_alloc(m, map_entry);
diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c
index f65291eba1f6..e4c2cd24936d 100644
--- a/net/xfrm/xfrm_input.c
+++ b/net/xfrm/xfrm_input.c
@@ -797,9 +797,12 @@ static void xfrm_trans_reinject(struct work_struct *work)
spin_unlock_bh(&trans->queue_lock);
local_bh_disable();
- while ((skb = __skb_dequeue(&queue)))
- XFRM_TRANS_SKB_CB(skb)->finish(XFRM_TRANS_SKB_CB(skb)->net,
- NULL, skb);
+ while ((skb = __skb_dequeue(&queue))) {
+ struct net *net = XFRM_TRANS_SKB_CB(skb)->net;
+
+ XFRM_TRANS_SKB_CB(skb)->finish(net, NULL, skb);
+ put_net(net);
+ }
local_bh_enable();
}
@@ -808,6 +811,7 @@ int xfrm_trans_queue_net(struct net *net, struct sk_buff *skb,
struct sk_buff *))
{
struct xfrm_trans_tasklet *trans;
+ struct net *hold_net;
trans = this_cpu_ptr(&xfrm_trans_tasklet);
@@ -816,8 +820,12 @@ int xfrm_trans_queue_net(struct net *net, struct sk_buff *skb,
BUILD_BUG_ON(sizeof(struct xfrm_trans_cb) > sizeof(skb->cb));
+ hold_net = maybe_get_net(net);
+ if (!hold_net)
+ return -ENODEV;
+
XFRM_TRANS_SKB_CB(skb)->finish = finish;
- XFRM_TRANS_SKB_CB(skb)->net = net;
+ XFRM_TRANS_SKB_CB(skb)->net = hold_net;
spin_lock_bh(&trans->queue_lock);
__skb_queue_tail(&trans->queue, skb);
spin_unlock_bh(&trans->queue_lock);
diff --git a/net/xfrm/xfrm_ipcomp.c b/net/xfrm/xfrm_ipcomp.c
index 5f38dff16177..671d48f8c937 100644
--- a/net/xfrm/xfrm_ipcomp.c
+++ b/net/xfrm/xfrm_ipcomp.c
@@ -51,11 +51,15 @@ static int ipcomp_post_acomp(struct sk_buff *skb, int err, int hlen)
struct scatterlist *dsg;
int len, dlen;
- if (unlikely(err))
- goto out_free_req;
+ if (unlikely(!req))
+ return err;
extra = acomp_request_extra(req);
dsg = extra->sg;
+
+ if (unlikely(err))
+ goto out_free_req;
+
dlen = req->dlen;
pskb_trim_unique(skb, 0);
@@ -84,10 +88,10 @@ static int ipcomp_post_acomp(struct sk_buff *skb, int err, int hlen)
skb_shinfo(skb)->nr_frags++;
} while ((dlen -= len));
- for (; dsg; dsg = sg_next(dsg))
+out_free_req:
+ for (; dsg && sg_page(dsg); dsg = sg_next(dsg))
__free_page(sg_page(dsg));
-out_free_req:
acomp_request_free(req);
return err;
}
diff --git a/net/xfrm/xfrm_iptfs.c b/net/xfrm/xfrm_iptfs.c
index 97bc979e55ba..6c6bbc040517 100644
--- a/net/xfrm/xfrm_iptfs.c
+++ b/net/xfrm/xfrm_iptfs.c
@@ -2650,7 +2650,8 @@ static void __iptfs_init_state(struct xfrm_state *x,
x->props.enc_hdr_len = sizeof(struct ip_iptfs_hdr);
/* Always keep a module reference when x->mode_data is set */
- __module_get(x->mode_cbs->owner);
+ if (x->mode_data != xtfs)
+ __module_get(x->mode_cbs->owner);
x->mode_data = xtfs;
xtfs->x = x;
@@ -2658,22 +2659,39 @@ static void __iptfs_init_state(struct xfrm_state *x,
static int iptfs_clone_state(struct xfrm_state *x, struct xfrm_state *orig)
{
+ struct skb_wseq *w_saved = NULL;
struct xfrm_iptfs_data *xtfs;
xtfs = kmemdup(orig->mode_data, sizeof(*xtfs), GFP_KERNEL);
if (!xtfs)
return -ENOMEM;
- xtfs->ra_newskb = NULL;
if (xtfs->cfg.reorder_win_size) {
- xtfs->w_saved = kzalloc_objs(*xtfs->w_saved,
- xtfs->cfg.reorder_win_size);
- if (!xtfs->w_saved) {
+ w_saved = kzalloc_objs(*w_saved, xtfs->cfg.reorder_win_size);
+ if (!w_saved) {
kfree_sensitive(xtfs);
return -ENOMEM;
}
}
+ xtfs->w_saved = w_saved;
+
+ __skb_queue_head_init(&xtfs->queue);
+ xtfs->queue_size = 0;
+ hrtimer_setup(&xtfs->iptfs_timer, iptfs_delay_timer, CLOCK_MONOTONIC,
+ IPTFS_HRTIMER_MODE);
+
+ spin_lock_init(&xtfs->drop_lock);
+ hrtimer_setup(&xtfs->drop_timer, iptfs_drop_timer, CLOCK_MONOTONIC,
+ IPTFS_HRTIMER_MODE);
+ xtfs->w_seq_set = false;
+ xtfs->w_wantseq = 0;
+ xtfs->w_savedlen = 0;
+ xtfs->ra_newskb = NULL;
+ xtfs->ra_wantseq = 0;
+ xtfs->ra_runtlen = 0;
+
+ __module_get(x->mode_cbs->owner);
x->mode_data = xtfs;
xtfs->x = x;
diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c
index a9652b422f51..cc35c2fcbbe0 100644
--- a/net/xfrm/xfrm_output.c
+++ b/net/xfrm/xfrm_output.c
@@ -66,7 +66,9 @@ static int xfrm4_transport_output(struct xfrm_state *x, struct sk_buff *skb)
struct iphdr *iph = ip_hdr(skb);
int ihl = iph->ihl * 4;
- skb_set_inner_transport_header(skb, skb_transport_offset(skb));
+ if (!skb->inner_protocol)
+ skb_set_inner_transport_header(skb,
+ skb_transport_offset(skb));
skb_set_network_header(skb, -x->props.header_len);
skb->mac_header = skb->network_header +
@@ -167,7 +169,9 @@ static int xfrm6_transport_output(struct xfrm_state *x, struct sk_buff *skb)
int hdr_len;
iph = ipv6_hdr(skb);
- skb_set_inner_transport_header(skb, skb_transport_offset(skb));
+ if (!skb->inner_protocol)
+ skb_set_inner_transport_header(skb,
+ skb_transport_offset(skb));
hdr_len = xfrm6_hdr_offset(x, skb, &prevhdr);
if (hdr_len < 0)
@@ -276,8 +280,10 @@ static int xfrm4_tunnel_encap_add(struct xfrm_state *x, struct sk_buff *skb)
struct iphdr *top_iph;
int flags;
- skb_set_inner_network_header(skb, skb_network_offset(skb));
- skb_set_inner_transport_header(skb, skb_transport_offset(skb));
+ if (!skb->inner_protocol) {
+ skb_set_inner_network_header(skb, skb_network_offset(skb));
+ skb_set_inner_transport_header(skb, skb_transport_offset(skb));
+ }
skb_set_network_header(skb, -x->props.header_len);
skb->mac_header = skb->network_header +
@@ -321,8 +327,10 @@ static int xfrm6_tunnel_encap_add(struct xfrm_state *x, struct sk_buff *skb)
struct ipv6hdr *top_iph;
int dsfield;
- skb_set_inner_network_header(skb, skb_network_offset(skb));
- skb_set_inner_transport_header(skb, skb_transport_offset(skb));
+ if (!skb->inner_protocol) {
+ skb_set_inner_network_header(skb, skb_network_offset(skb));
+ skb_set_inner_transport_header(skb, skb_transport_offset(skb));
+ }
skb_set_network_header(skb, -x->props.header_len);
skb->mac_header = skb->network_header +
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index c944327ce66c..dd09d2063da2 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -4276,21 +4276,21 @@ out_byidx:
return -ENOMEM;
}
-static void xfrm_policy_fini(struct net *net)
+static void __net_exit xfrm_net_pre_exit(struct net *net)
{
- struct xfrm_pol_inexact_bin *b, *t;
- unsigned int sz;
- int dir;
-
disable_work_sync(&net->xfrm.policy_hthresh.work);
-
flush_work(&net->xfrm.policy_hash_work);
#ifdef CONFIG_XFRM_SUB_POLICY
xfrm_policy_flush(net, XFRM_POLICY_TYPE_SUB, false);
#endif
xfrm_policy_flush(net, XFRM_POLICY_TYPE_MAIN, false);
+}
- synchronize_rcu();
+static void xfrm_policy_fini(struct net *net)
+{
+ struct xfrm_pol_inexact_bin *b, *t;
+ unsigned int sz;
+ int dir;
WARN_ON(!list_empty(&net->xfrm.policy_all));
@@ -4368,6 +4368,7 @@ static void __net_exit xfrm_net_exit(struct net *net)
static struct pernet_operations __net_initdata xfrm_net_ops = {
.init = xfrm_net_init,
+ .pre_exit = xfrm_net_pre_exit,
.exit = xfrm_net_exit,
};
@@ -4703,7 +4704,7 @@ int xfrm_migrate(const struct xfrm_selector *sel, u8 dir, u8 type,
}
/* Stage 5 - announce */
- km_migrate(sel, dir, type, m, num_migrate, k, encap);
+ km_migrate(sel, dir, type, m, num_migrate, k, net, encap);
xfrm_pol_put(pol);
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 1748d374abca..589c3b6e4679 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -818,17 +818,17 @@ int __xfrm_state_delete(struct xfrm_state *x)
spin_lock(&net->xfrm.xfrm_state_lock);
list_del(&x->km.all);
- hlist_del_rcu(&x->bydst);
- hlist_del_rcu(&x->bysrc);
- if (x->km.seq)
- hlist_del_rcu(&x->byseq);
+ hlist_del_init_rcu(&x->bydst);
+ hlist_del_init_rcu(&x->bysrc);
+ if (!hlist_unhashed(&x->byseq))
+ hlist_del_init_rcu(&x->byseq);
if (!hlist_unhashed(&x->state_cache))
hlist_del_rcu(&x->state_cache);
if (!hlist_unhashed(&x->state_cache_input))
hlist_del_rcu(&x->state_cache_input);
- if (x->id.spi)
- hlist_del_rcu(&x->byspi);
+ if (!hlist_unhashed(&x->byspi))
+ hlist_del_init_rcu(&x->byspi);
net->xfrm.state_num--;
xfrm_nat_keepalive_state_updated(x);
spin_unlock(&net->xfrm.xfrm_state_lock);
@@ -2837,7 +2837,7 @@ EXPORT_SYMBOL(km_policy_expired);
#ifdef CONFIG_XFRM_MIGRATE
int km_migrate(const struct xfrm_selector *sel, u8 dir, u8 type,
const struct xfrm_migrate *m, int num_migrate,
- const struct xfrm_kmaddress *k,
+ const struct xfrm_kmaddress *k, struct net *net,
const struct xfrm_encap_tmpl *encap)
{
int err = -EINVAL;
@@ -2848,7 +2848,7 @@ int km_migrate(const struct xfrm_selector *sel, u8 dir, u8 type,
list_for_each_entry_rcu(km, &xfrm_km_list, list) {
if (km->migrate) {
ret = km->migrate(sel, dir, type, m, num_migrate, k,
- encap);
+ net, encap);
if (!ret)
err = ret;
}
@@ -3114,10 +3114,14 @@ u32 xfrm_state_mtu(struct xfrm_state *x, int mtu)
const struct xfrm_type *type = READ_ONCE(x->type);
struct crypto_aead *aead;
u32 blksize, net_adj = 0;
+ u32 overhead, payload_mtu;
if (x->km.state != XFRM_STATE_VALID ||
- !type || type->proto != IPPROTO_ESP)
+ !type || type->proto != IPPROTO_ESP) {
+ if (mtu <= x->props.header_len)
+ return 1;
return mtu - x->props.header_len;
+ }
aead = x->data;
blksize = ALIGN(crypto_aead_blocksize(aead), 4);
@@ -3140,8 +3144,17 @@ u32 xfrm_state_mtu(struct xfrm_state *x, int mtu)
break;
}
- return ((mtu - x->props.header_len - crypto_aead_authsize(aead) -
- net_adj) & ~(blksize - 1)) + net_adj - 2;
+ overhead = x->props.header_len + crypto_aead_authsize(aead) + net_adj;
+ if (mtu <= overhead)
+ return 1;
+
+ payload_mtu = mtu - overhead;
+ payload_mtu &= ~(blksize - 1);
+ if (payload_mtu <= 2)
+ return 1;
+
+ return payload_mtu + net_adj - 2;
+
}
EXPORT_SYMBOL_GPL(xfrm_state_mtu);
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index d56450f61669..71a4b7278eba 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -3271,10 +3271,9 @@ out_cancel:
static int xfrm_send_migrate(const struct xfrm_selector *sel, u8 dir, u8 type,
const struct xfrm_migrate *m, int num_migrate,
- const struct xfrm_kmaddress *k,
+ const struct xfrm_kmaddress *k, struct net *net,
const struct xfrm_encap_tmpl *encap)
{
- struct net *net = &init_net;
struct sk_buff *skb;
int err;
@@ -3292,7 +3291,7 @@ static int xfrm_send_migrate(const struct xfrm_selector *sel, u8 dir, u8 type,
#else
static int xfrm_send_migrate(const struct xfrm_selector *sel, u8 dir, u8 type,
const struct xfrm_migrate *m, int num_migrate,
- const struct xfrm_kmaddress *k,
+ const struct xfrm_kmaddress *k, struct net *net,
const struct xfrm_encap_tmpl *encap)
{
return -ENOPROTOOPT;
@@ -3323,6 +3322,7 @@ const int xfrm_msg_min[XFRM_NR_MSGTYPES] = {
[XFRM_MSG_GETSADINFO - XFRM_MSG_BASE] = sizeof(u32),
[XFRM_MSG_NEWSPDINFO - XFRM_MSG_BASE] = sizeof(u32),
[XFRM_MSG_GETSPDINFO - XFRM_MSG_BASE] = sizeof(u32),
+ [XFRM_MSG_MAPPING - XFRM_MSG_BASE] = XMSGSIZE(xfrm_user_mapping),
[XFRM_MSG_SETDEFAULT - XFRM_MSG_BASE] = XMSGSIZE(xfrm_userpolicy_default),
[XFRM_MSG_GETDEFAULT - XFRM_MSG_BASE] = XMSGSIZE(xfrm_userpolicy_default),
};